From f5f02d1316e814db3c5c1e15f5b48b14918dde4f Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sat, 9 Mar 2019 21:12:20 -0500
Subject: [PATCH 001/163] [src] Lots of changes: first stab at kaldi10
 (non-compatible version of Kaldi)

---
 src/Makefile                                  |   34 +-
 src/bin/Makefile                              |    2 +-
 src/bin/acc-lda.cc                            |    4 +-
 src/bin/acc-tree-stats.cc                     |    4 +-
 src/bin/add-self-loops.cc                     |    4 +-
 src/bin/ali-to-pdf.cc                         |    4 +-
 src/bin/ali-to-phones.cc                      |    4 +-
 src/bin/ali-to-post.cc                        |    2 +-
 src/bin/align-compiled-mapped.cc              |    4 +-
 src/bin/align-equal-compiled.cc               |    2 +-
 src/bin/align-equal.cc                        |    4 +-
 src/bin/align-mapped.cc                       |    4 +-
 src/bin/am-info.cc                            |    4 +-
 src/bin/build-pfile-from-ali.cc               |    4 +-
 src/bin/build-tree-two-level.cc               |    4 +-
 src/bin/build-tree.cc                         |    4 +-
 src/bin/compile-graph.cc                      |    4 +-
 src/bin/compile-questions.cc                  |    6 +-
 src/bin/compile-train-graphs-fsts.cc          |    4 +-
 src/bin/compile-train-graphs.cc               |    4 +-
 src/bin/convert-ali.cc                        |    8 +-
 src/bin/copy-gselect.cc                       |    2 +-
 src/bin/copy-transition-model.cc              |    4 +-
 src/bin/copy-tree.cc                          |    2 +-
 src/{nnetbin => bin}/cuda-gpu-available.cc    |    0
 src/bin/decode-faster-mapped.cc               |    4 +-
 src/bin/decode-faster.cc                      |    2 +-
 src/bin/est-mllt.cc                           |    2 +-
 src/bin/get-post-on-ali.cc                    |    2 +-
 src/bin/hmm-info.cc                           |    4 +-
 src/bin/latgen-faster-mapped-parallel.cc      |    4 +-
 src/bin/latgen-faster-mapped.cc               |    4 +-
 src/bin/logprob-to-post.cc                    |    2 +-
 src/bin/make-h-transducer.cc                  |    4 +-
 src/bin/make-ilabel-transducer.cc             |    4 +-
 src/bin/make-pdf-to-tid-transducer.cc         |    4 +-
 src/bin/phones-to-prons.cc                    |    2 +-
 src/bin/post-to-pdf-post.cc                   |    4 +-
 src/bin/post-to-phone-post.cc                 |    4 +-
 src/bin/post-to-tacc.cc                       |    4 +-
 src/bin/prob-to-post.cc                       |    2 +-
 src/bin/prons-to-wordali.cc                   |    2 +-
 src/bin/show-alignments.cc                    |    4 +-
 src/bin/show-transitions.cc                   |    4 +-
 src/bin/tree-info.cc                          |    2 +-
 src/bin/weight-silence-post.cc                |    4 +-
 src/chain/chain-den-graph.cc                  |    4 +-
 src/chain/chain-den-graph.h                   |    6 +-
 src/chain/chain-denominator.h                 |    2 +-
 src/chain/chain-generic-numerator.h           |    2 +-
 src/chain/chain-numerator.h                   |    2 +-
 src/chain/chain-supervision-test.cc           |   10 +-
 src/chain/chain-supervision.cc                |    8 +-
 src/chain/chain-supervision.h                 |   18 +-
 src/chain/chain-training.h                    |    2 +-
 src/chainbin/chain-get-supervision.cc         |    4 +-
 src/chainbin/chain-make-den-fst.cc            |    2 +-
 src/chainbin/nnet3-chain-acc-lda-stats.cc     |    2 +-
 src/chainbin/nnet3-chain-copy-egs.cc          |    2 +-
 src/chainbin/nnet3-chain-e2e-get-egs.cc       |    6 +-
 src/chainbin/nnet3-chain-get-egs.cc           |    8 +-
 src/chainbin/nnet3-chain-merge-egs.cc         |    2 +-
 src/chainbin/nnet3-chain-normalize-egs.cc     |    2 +-
 src/chainbin/nnet3-chain-shuffle-egs.cc       |    2 +-
 src/decoder/decodable-matrix.cc               |    8 +-
 src/decoder/decodable-matrix.h                |   22 +-
 src/decoder/decoder-wrappers.cc               |   10 +-
 src/decoder/decoder-wrappers.h                |    8 +-
 src/decoder/training-graph-compiler.cc        |    2 +-
 src/decoder/training-graph-compiler.h         |    6 +-
 src/feat/Makefile                             |    8 +-
 src/feat/feature-common-inl.h                 |    7 +-
 src/feat/feature-common.h                     |    8 +-
 src/feat/feature-fbank.cc                     |   36 +-
 src/feat/feature-fbank.h                      |   48 +-
 src/feat/feature-mfcc-test.cc                 |   40 +-
 src/feat/feature-mfcc.cc                      |   43 +-
 src/feat/feature-mfcc.h                       |   45 +-
 src/feat/feature-plp-test.cc                  |  177 -
 src/feat/feature-plp.cc                       |  191 -
 src/feat/feature-plp.h                        |  176 -
 src/feat/feature-window.cc                    |  147 +-
 src/feat/feature-window.h                     |   61 +-
 src/feat/mel-computations.cc                  |   95 +-
 src/feat/mel-computations.h                   |   43 +-
 src/feat/online-feature-test.cc               |   81 +-
 src/feat/online-feature.cc                    |   18 +-
 src/feat/online-feature.h                     |   12 +-
 src/feat/pitch-functions-test.cc              |    1 -
 src/feat/wave-reader.cc                       |   14 +-
 src/feat/wave-reader.h                        |    8 +-
 src/featbin/Makefile                          |    4 +-
 src/featbin/compute-plp-feats.cc              |  184 -
 src/featbin/compute-spectrogram-feats.cc      |  158 -
 src/fgmmbin/fgmm-global-info.cc               |    2 +-
 src/fgmmbin/fgmm-gselect.cc                   |    2 +-
 src/gmm/Makefile                              |    6 +-
 src/gmm/decodable-am-diag-gmm.h               |   36 +-
 src/gmmbin/Makefile                           |   17 +-
 src/gmmbin/gmm-acc-mllt-global.cc             |    2 +-
 src/gmmbin/gmm-acc-mllt.cc                    |    4 +-
 src/gmmbin/gmm-acc-stats-ali.cc               |    4 +-
 src/gmmbin/gmm-acc-stats-twofeats.cc          |    4 +-
 src/gmmbin/gmm-acc-stats.cc                   |    4 +-
 src/gmmbin/gmm-acc-stats2.cc                  |    4 +-
 src/gmmbin/gmm-adapt-map.cc                   |    4 +-
 src/gmmbin/gmm-align-compiled.cc              |    4 +-
 src/gmmbin/gmm-align.cc                       |    4 +-
 src/gmmbin/gmm-basis-fmllr-accs-gpost.cc      |    6 +-
 src/gmmbin/gmm-basis-fmllr-accs.cc            |    6 +-
 src/gmmbin/gmm-basis-fmllr-training.cc        |    4 +-
 src/gmmbin/gmm-boost-silence.cc               |    4 +-
 src/gmmbin/gmm-compute-likes.cc               |    4 +-
 src/gmmbin/gmm-copy.cc                        |    4 +-
 src/gmmbin/gmm-decode-biglm-faster.cc         |    4 +-
 src/gmmbin/gmm-decode-faster-regtree-fmllr.cc |  290 --
 src/gmmbin/gmm-decode-faster-regtree-mllr.cc  |  267 -
 src/gmmbin/gmm-decode-faster.cc               |    4 +-
 src/gmmbin/gmm-decode-simple.cc               |    4 +-
 src/gmmbin/gmm-est-basis-fmllr-gpost.cc       |    6 +-
 src/gmmbin/gmm-est-basis-fmllr.cc             |    6 +-
 src/gmmbin/gmm-est-fmllr-global.cc            |    2 +-
 src/gmmbin/gmm-est-fmllr-gpost.cc             |    6 +-
 src/gmmbin/gmm-est-fmllr-raw-gpost.cc         |  198 -
 src/gmmbin/gmm-est-fmllr-raw.cc               |  199 -
 src/gmmbin/gmm-est-fmllr.cc                   |    6 +-
 src/gmmbin/gmm-est-gaussians-ebw.cc           |    4 +-
 src/gmmbin/gmm-est-lvtln-trans.cc             |    4 +-
 src/gmmbin/gmm-est-map.cc                     |    4 +-
 src/gmmbin/gmm-est-regtree-fmllr-ali.cc       |  202 -
 src/gmmbin/gmm-est-regtree-fmllr.cc           |  216 -
 src/gmmbin/gmm-est-regtree-mllr.cc            |  215 -
 src/gmmbin/gmm-est-rescale.cc                 |    4 +-
 src/gmmbin/gmm-est-weights-ebw.cc             |    4 +-
 src/gmmbin/gmm-est.cc                         |    4 +-
 src/gmmbin/gmm-fmpe-acc-stats.cc              |    4 +-
 src/gmmbin/gmm-get-stats-deriv.cc             |    4 +-
 src/gmmbin/gmm-global-est-fmllr.cc            |    2 +-
 src/gmmbin/gmm-global-est-lvtln-trans.cc      |    2 +-
 src/gmmbin/gmm-global-info.cc                 |    2 +-
 src/gmmbin/gmm-gselect.cc                     |    2 +-
 src/gmmbin/gmm-info.cc                        |    4 +-
 src/gmmbin/gmm-init-biphone.cc                |    8 +-
 src/gmmbin/gmm-init-model-flat.cc             |    6 +-
 src/gmmbin/gmm-init-model.cc                  |   10 +-
 src/gmmbin/gmm-init-mono.cc                   |    8 +-
 src/gmmbin/gmm-ismooth-stats.cc               |    4 +-
 src/gmmbin/gmm-latgen-biglm-faster.cc         |    6 +-
 src/gmmbin/gmm-latgen-faster-parallel.cc      |    4 +-
 src/gmmbin/gmm-latgen-faster-regtree-fmllr.cc |  218 -
 src/gmmbin/gmm-latgen-faster.cc               |    4 +-
 src/gmmbin/gmm-latgen-map.cc                  |    4 +-
 src/gmmbin/gmm-latgen-simple.cc               |    4 +-
 src/gmmbin/gmm-make-regtree.cc                |  107 -
 src/gmmbin/gmm-mixup.cc                       |    4 +-
 src/gmmbin/gmm-post-to-gpost.cc               |    4 +-
 src/gmmbin/gmm-rescore-lattice.cc             |    4 +-
 src/gmmbin/gmm-sum-accs.cc                    |    2 +-
 src/gmmbin/gmm-transform-means-global.cc      |    2 +-
 src/gmmbin/gmm-transform-means.cc             |    4 +-
 .../gst-online-gmm-decode-faster.cc           |    2 +-
 src/gst-plugin/gst-online-gmm-decode-faster.h |    2 +-
 src/hmm/Makefile                              |    7 +-
 src/hmm/hmm-test-utils.cc                     |   26 +-
 src/hmm/hmm-test-utils.h                      |   24 +-
 src/hmm/hmm-topology-test.cc                  |   12 +-
 src/hmm/hmm-topology.h                        |  194 -
 src/hmm/hmm-utils-test.cc                     |    6 +-
 src/hmm/hmm-utils.cc                          |   60 +-
 src/hmm/hmm-utils.h                           |   34 +-
 src/hmm/posterior.cc                          |   20 +-
 src/hmm/posterior.h                           |   16 +-
 src/hmm/{hmm-topology.cc => topology.cc}      |   50 +-
 src/hmm/topology.h                            |  138 +
 src/hmm/transition-model.h                    |  371 --
 ...tion-model-test.cc => transitions-test.cc} |   10 +-
 .../{transition-model.cc => transitions.cc}   |  139 +-
 src/hmm/transitions.h                         |  263 +
 src/hmm/tree-accu.cc                          |    2 +-
 src/hmm/tree-accu.h                           |    4 +-
 src/itf/context-dep-itf.h                     |    4 +-
 src/lat/determinize-lattice-pruned.cc         |   56 +-
 src/lat/determinize-lattice-pruned.h          |   10 +-
 src/lat/lattice-functions.cc                  |   68 +-
 src/lat/lattice-functions.h                   |   24 +-
 src/lat/minimize-lattice.cc                   |    2 +-
 src/lat/phone-align-lattice.cc                |  122 +-
 src/lat/phone-align-lattice.h                 |    4 +-
 src/lat/push-lattice.cc                       |    2 +-
 src/lat/word-align-lattice-lexicon-test.cc    |    2 +-
 src/lat/word-align-lattice-lexicon.cc         |   29 +-
 src/lat/word-align-lattice-lexicon.h          |    6 +-
 src/lat/word-align-lattice.cc                 |  563 +--
 src/lat/word-align-lattice.h                  |   33 +-
 src/latbin/lattice-add-trans-probs.cc         |    4 +-
 src/latbin/lattice-align-phones.cc            |    2 +-
 src/latbin/lattice-align-words-lexicon.cc     |    2 +-
 src/latbin/lattice-align-words.cc             |    2 +-
 src/latbin/lattice-arc-post.cc                |    6 +-
 src/latbin/lattice-boost-ali.cc               |    2 +-
 ...ttice-determinize-phone-pruned-parallel.cc |    8 +-
 .../lattice-determinize-phone-pruned.cc       |    4 +-
 src/latbin/lattice-rescore-mapped.cc          |    6 +-
 src/latbin/lattice-to-mpe-post.cc             |    4 +-
 src/latbin/lattice-to-phone-lattice.cc        |    4 +-
 src/latbin/lattice-to-smbr-post.cc            |    4 +-
 src/latbin/nbest-to-prons.cc                  |    2 +-
 src/nnet/Makefile                             |   22 -
 src/nnet/nnet-activation.h                    |  373 --
 src/nnet/nnet-affine-transform.h              |  247 -
 src/nnet/nnet-average-pooling-2d-component.h  |  209 -
 src/nnet/nnet-average-pooling-component.h     |  169 -
 src/nnet/nnet-blstm-projected.h               | 1206 -----
 src/nnet/nnet-component-test.cc               |  451 --
 src/nnet/nnet-component.cc                    |  288 --
 src/nnet/nnet-component.h                     |  358 --
 src/nnet/nnet-convolutional-2d-component.h    |  495 --
 src/nnet/nnet-convolutional-component.h       |  482 --
 src/nnet/nnet-frame-pooling-component.h       |  290 --
 src/nnet/nnet-kl-hmm.h                        |  155 -
 src/nnet/nnet-linear-transform.h              |  212 -
 src/nnet/nnet-loss.cc                         |  460 --
 src/nnet/nnet-loss.h                          |  251 -
 src/nnet/nnet-lstm-projected.h                |  737 ---
 src/nnet/nnet-matrix-buffer.h                 |  233 -
 src/nnet/nnet-max-pooling-2d-component.h      |  225 -
 src/nnet/nnet-max-pooling-component.h         |  176 -
 src/nnet/nnet-multibasis-component.h          |  456 --
 src/nnet/nnet-nnet.cc                         |  520 --
 src/nnet/nnet-nnet.h                          |  186 -
 src/nnet/nnet-parallel-component.h            |  361 --
 src/nnet/nnet-parametric-relu.h               |  213 -
 src/nnet/nnet-pdf-prior.cc                    |   90 -
 src/nnet/nnet-pdf-prior.h                     |   77 -
 src/nnet/nnet-randomizer-test.cc              |  240 -
 src/nnet/nnet-randomizer.cc                   |  234 -
 src/nnet/nnet-randomizer.h                    |  274 -
 src/nnet/nnet-rbm.h                           |  433 --
 src/nnet/nnet-recurrent.h                     |  346 --
 src/nnet/nnet-sentence-averaging-component.h  |  314 --
 src/nnet/nnet-trnopts.h                       |  118 -
 src/nnet/nnet-utils.h                         |  317 --
 src/nnet/nnet-various.h                       |  518 --
 src/nnet2/Makefile                            |   33 -
 src/nnet2/am-nnet-test.cc                     |   88 -
 src/nnet2/am-nnet.cc                          |   83 -
 src/nnet2/am-nnet.h                           |   86 -
 src/nnet2/combine-nnet-a.cc                   |  230 -
 src/nnet2/combine-nnet-a.h                    |   85 -
 src/nnet2/combine-nnet-fast.cc                |  443 --
 src/nnet2/combine-nnet-fast.h                 |  112 -
 src/nnet2/combine-nnet.cc                     |  253 -
 src/nnet2/combine-nnet.h                      |   74 -
 src/nnet2/decodable-am-nnet.h                 |  187 -
 src/nnet2/get-feature-transform.cc            |  203 -
 src/nnet2/get-feature-transform.h             |  180 -
 src/nnet2/mixup-nnet.cc                       |  222 -
 src/nnet2/mixup-nnet.h                        |   69 -
 src/nnet2/nnet-component-test.cc              |  915 ----
 src/nnet2/nnet-component.cc                   | 4390 -----------------
 src/nnet2/nnet-component.h                    | 1816 -------
 .../nnet-compute-discriminative-parallel.cc   |  222 -
 .../nnet-compute-discriminative-parallel.h    |   49 -
 src/nnet2/nnet-compute-discriminative.cc      |  416 --
 src/nnet2/nnet-compute-discriminative.h       |  115 -
 src/nnet2/nnet-compute-online.cc              |  215 -
 src/nnet2/nnet-compute-online.h               |  110 -
 src/nnet2/nnet-compute-test.cc                |  134 -
 src/nnet2/nnet-compute.cc                     |  224 -
 src/nnet2/nnet-compute.h                      |   85 -
 src/nnet2/nnet-example-functions-test.cc      |   69 -
 src/nnet2/nnet-example-functions.cc           |  997 ----
 src/nnet2/nnet-example-functions.h            |  300 --
 src/nnet2/nnet-example.cc                     |  309 --
 src/nnet2/nnet-example.h                      |  191 -
 src/nnet2/nnet-fix.cc                         |  111 -
 src/nnet2/nnet-fix.h                          |   74 -
 src/nnet2/nnet-functions.cc                   |   78 -
 src/nnet2/nnet-functions.h                    |   70 -
 src/nnet2/nnet-limit-rank.cc                  |  112 -
 src/nnet2/nnet-limit-rank.h                   |   62 -
 src/nnet2/nnet-nnet-test.cc                   |   57 -
 src/nnet2/nnet-nnet.cc                        |  846 ----
 src/nnet2/nnet-nnet.h                         |  306 --
 src/nnet2/nnet-precondition-online-test.cc    |  342 --
 src/nnet2/nnet-precondition-online.cc         |  641 ---
 src/nnet2/nnet-precondition-online.h          |  574 ---
 src/nnet2/nnet-precondition-test.cc           |   67 -
 src/nnet2/nnet-precondition.cc                |  352 --
 src/nnet2/nnet-precondition.h                 |   88 -
 src/nnet2/nnet-stats.cc                       |  122 -
 src/nnet2/nnet-stats.h                        |   97 -
 src/nnet2/nnet-update-parallel.cc             |  271 -
 src/nnet2/nnet-update-parallel.h              |   88 -
 src/nnet2/nnet-update.cc                      |  361 --
 src/nnet2/nnet-update.h                       |  191 -
 src/nnet2/online-nnet2-decodable-test.cc      |  114 -
 src/nnet2/online-nnet2-decodable.cc           |  145 -
 src/nnet2/online-nnet2-decodable.h            |  122 -
 src/nnet2/rescale-nnet.cc                     |  227 -
 src/nnet2/rescale-nnet.h                      |   80 -
 src/nnet2/shrink-nnet.cc                      |  112 -
 src/nnet2/shrink-nnet.h                       |   59 -
 src/nnet2/train-nnet-ensemble.cc              |  141 -
 src/nnet2/train-nnet-ensemble.h               |  105 -
 src/nnet2/train-nnet.cc                       |  206 -
 src/nnet2/train-nnet.h                        |   64 -
 src/nnet2/widen-nnet.cc                       |  100 -
 src/nnet2/widen-nnet.h                        |   65 -
 src/nnet2bin/Makefile                         |   44 -
 src/nnet2bin/cuda-compiled.cc                 |   36 -
 src/nnet2bin/nnet-adjust-priors.cc            |  144 -
 src/nnet2bin/nnet-align-compiled.cc           |  159 -
 src/nnet2bin/nnet-am-average.cc               |  259 -
 src/nnet2bin/nnet-am-compute.cc               |  163 -
 src/nnet2bin/nnet-am-copy.cc                  |  214 -
 src/nnet2bin/nnet-am-fix.cc                   |   88 -
 src/nnet2bin/nnet-am-info.cc                  |   87 -
 src/nnet2bin/nnet-am-init.cc                  |  110 -
 src/nnet2bin/nnet-am-mixup.cc                 |   81 -
 src/nnet2bin/nnet-am-reinitialize.cc          |   87 -
 .../nnet-am-switch-preconditioning.cc         |   97 -
 src/nnet2bin/nnet-am-widen.cc                 |   83 -
 .../nnet-combine-egs-discriminative.cc        |  115 -
 src/nnet2bin/nnet-combine-fast.cc             |  133 -
 src/nnet2bin/nnet-combine.cc                  |  124 -
 .../nnet-compare-hash-discriminative.cc       |  138 -
 src/nnet2bin/nnet-compute-from-egs.cc         |   99 -
 src/nnet2bin/nnet-compute-prob.cc             |  104 -
 src/nnet2bin/nnet-compute.cc                  |  105 -
 src/nnet2bin/nnet-copy-egs-discriminative.cc  |  158 -
 src/nnet2bin/nnet-copy-egs.cc                 |  179 -
 src/nnet2bin/nnet-get-egs-discriminative.cc   |  151 -
 src/nnet2bin/nnet-get-egs.cc                  |  184 -
 .../nnet-get-feature-transform-multi.cc       |   94 -
 src/nnet2bin/nnet-get-feature-transform.cc    |   87 -
 src/nnet2bin/nnet-get-weighted-egs.cc         |  232 -
 src/nnet2bin/nnet-init.cc                     |   76 -
 src/nnet2bin/nnet-insert.cc                   |  138 -
 src/nnet2bin/nnet-latgen-faster-parallel.cc   |  207 -
 src/nnet2bin/nnet-latgen-faster.cc            |  196 -
 src/nnet2bin/nnet-modify-learning-rates.cc    |  211 -
 src/nnet2bin/nnet-normalize-stddev.cc         |  174 -
 src/nnet2bin/nnet-relabel-egs.cc              |  168 -
 src/nnet2bin/nnet-replace-last-layers.cc      |   97 -
 src/nnet2bin/nnet-show-progress.cc            |  164 -
 .../nnet-shuffle-egs-discriminative.cc        |  114 -
 src/nnet2bin/nnet-shuffle-egs.cc              |  113 -
 src/nnet2bin/nnet-subset-egs.cc               |  102 -
 src/nnet2bin/nnet-to-raw-nnet.cc              |   83 -
 .../nnet-train-discriminative-parallel.cc     |   95 -
 .../nnet-train-discriminative-simple.cc       |  116 -
 src/nnet2bin/nnet-train-ensemble.cc           |  145 -
 src/nnet2bin/nnet-train-parallel.cc           |  112 -
 src/nnet2bin/nnet-train-simple.cc             |  117 -
 src/nnet2bin/nnet-train-transitions.cc        |  147 -
 src/nnet2bin/nnet1-to-raw-nnet.cc             |  222 -
 src/nnet2bin/raw-nnet-concat.cc               |   75 -
 src/nnet2bin/raw-nnet-copy.cc                 |  107 -
 src/nnet2bin/raw-nnet-info.cc                 |   63 -
 src/nnet2bin/raw-nnet-init                    |    1 -
 src/nnet3/decodable-online-looped.h           |    8 +-
 src/nnet3/decodable-simple-looped.cc          |    2 +-
 src/nnet3/decodable-simple-looped.h           |    6 +-
 src/nnet3/discriminative-supervision.cc       |    2 +-
 src/nnet3/discriminative-supervision.h        |    6 +-
 src/nnet3/discriminative-training.cc          |    8 +-
 src/nnet3/discriminative-training.h           |    4 +-
 src/nnet3/nnet-am-decodable-simple.cc         |    4 +-
 src/nnet3/nnet-am-decodable-simple.h          |   10 +-
 src/nnet3/nnet-batch-compute.cc               |    2 +-
 src/nnet3/nnet-batch-compute.h                |    6 +-
 src/nnet3/nnet-discriminative-diagnostics.cc  |    2 +-
 src/nnet3/nnet-discriminative-diagnostics.h   |    4 +-
 src/nnet3/nnet-discriminative-example.h       |    2 +-
 src/nnet3/nnet-discriminative-training.cc     |    2 +-
 src/nnet3/nnet-discriminative-training.h      |    4 +-
 src/nnet3/nnet-nnet.cc                        |    6 +-
 src/nnet3bin/Makefile                         |    4 +-
 src/nnet3bin/nnet3-acc-lda-stats.cc           |    2 +-
 src/nnet3bin/nnet3-align-compiled.cc          |    4 +-
 src/nnet3bin/nnet3-am-adjust-priors.cc        |    4 +-
 src/nnet3bin/nnet3-am-copy.cc                 |    4 +-
 src/nnet3bin/nnet3-am-info.cc                 |    4 +-
 src/nnet3bin/nnet3-am-init.cc                 |   10 +-
 src/nnet3bin/nnet3-am-train-transitions.cc    |  147 -
 src/nnet3bin/nnet3-average.cc                 |    2 +-
 src/nnet3bin/nnet3-compute-batch.cc           |    2 +-
 src/nnet3bin/nnet3-compute-from-egs.cc        |    2 +-
 src/nnet3bin/nnet3-compute.cc                 |    2 +-
 src/nnet3bin/nnet3-copy-egs.cc                |    2 +-
 src/nnet3bin/nnet3-copy.cc                    |    2 +-
 .../nnet3-discriminative-compute-from-egs.cc  |    2 +-
 .../nnet3-discriminative-compute-objf.cc      |    2 +-
 src/nnet3bin/nnet3-discriminative-copy-egs.cc |    2 +-
 src/nnet3bin/nnet3-discriminative-get-egs.cc  |    6 +-
 .../nnet3-discriminative-merge-egs.cc         |    2 +-
 .../nnet3-discriminative-shuffle-egs.cc       |    2 +-
 src/nnet3bin/nnet3-discriminative-train.cc    |    2 +-
 src/nnet3bin/nnet3-egs-augment-image.cc       |    2 +-
 src/nnet3bin/nnet3-get-egs-dense-targets.cc   |    2 +-
 src/nnet3bin/nnet3-get-egs-simple.cc          |    2 +-
 src/nnet3bin/nnet3-get-egs.cc                 |    2 +-
 src/nnet3bin/nnet3-init.cc                    |    2 +-
 src/nnet3bin/nnet3-latgen-faster-batch.cc     |    4 +-
 src/nnet3bin/nnet3-latgen-faster-looped.cc    |    4 +-
 src/nnet3bin/nnet3-latgen-faster-parallel.cc  |    4 +-
 src/nnet3bin/nnet3-latgen-faster.cc           |    4 +-
 src/nnet3bin/nnet3-latgen-grammar.cc          |    4 +-
 src/nnet3bin/nnet3-merge-egs.cc               |    2 +-
 src/nnet3bin/nnet3-show-progress.cc           |    2 +-
 src/nnet3bin/nnet3-shuffle-egs.cc             |    2 +-
 src/nnetbin/Makefile                          |   30 -
 src/nnetbin/cmvn-to-nnet.cc                   |  121 -
 src/nnetbin/feat-to-post.cc                   |   80 -
 src/nnetbin/nnet-concat.cc                    |   90 -
 src/nnetbin/nnet-copy.cc                      |  151 -
 src/nnetbin/nnet-forward.cc                   |  208 -
 src/nnetbin/nnet-info.cc                      |   65 -
 src/nnetbin/nnet-initialize.cc                |   71 -
 src/nnetbin/nnet-set-learnrate.cc             |  104 -
 src/nnetbin/nnet-train-frmshuff.cc            |  424 --
 src/nnetbin/nnet-train-mmi-sequential.cc      |  481 --
 src/nnetbin/nnet-train-mpe-sequential.cc      |  412 --
 src/nnetbin/nnet-train-multistream-perutt.cc  |  363 --
 src/nnetbin/nnet-train-multistream.cc         |  460 --
 src/nnetbin/nnet-train-perutt.cc              |  310 --
 src/nnetbin/paste-post.cc                     |  168 -
 src/nnetbin/rbm-convert-to-nnet.cc            |   77 -
 src/nnetbin/rbm-train-cd1-frmshuff.cc         |  287 --
 src/nnetbin/train-transitions.cc              |  101 -
 src/nnetbin/transf-to-nnet.cc                 |   79 -
 src/online/online-decodable.cc                |    2 +-
 src/online/online-decodable.h                 |    4 +-
 src/online/online-faster-decoder.h            |    6 +-
 src/online2/Makefile                          |    6 +-
 src/online2/online-endpoint.cc                |    8 +-
 src/online2/online-endpoint.h                 |    6 +-
 src/online2/online-gmm-decodable.cc           |    2 +-
 src/online2/online-gmm-decodable.h            |    6 +-
 src/online2/online-gmm-decoding.cc            |   16 +-
 src/online2/online-gmm-decoding.h             |    6 +-
 src/online2/online-ivector-feature.cc         |    2 +-
 src/online2/online-ivector-feature.h          |    4 +-
 src/online2/online-nnet2-decoding-threaded.cc |  652 ---
 src/online2/online-nnet2-decoding-threaded.h  |    6 +-
 src/online2/online-nnet2-decoding.cc          |   81 -
 src/online2/online-nnet2-decoding.h           |    6 +-
 src/online2/online-nnet3-decoding.cc          |    2 +-
 src/online2/online-nnet3-decoding.h           |    6 +-
 ...ipeline.cc => online2-feature-pipeline.cc} |    0
 .../online2-wav-nnet2-am-compute.cc           |    2 +-
 .../online2-wav-nnet2-latgen-faster.cc        |    2 +-
 .../online2-wav-nnet2-latgen-threaded.cc      |    2 +-
 .../online2-wav-nnet3-latgen-faster.cc        |    2 +-
 .../online2-wav-nnet3-latgen-grammar.cc       |    2 +-
 .../online-audio-server-decode-faster.cc      |    2 +-
 src/onlinebin/online-gmm-decode-faster.cc     |    2 +-
 .../online-server-gmm-decode-faster.cc        |    2 +-
 src/onlinebin/online-wav-gmm-decode-faster.cc |    2 +-
 src/sgmm2/Makefile                            |   19 -
 src/sgmm2/am-sgmm2-project.cc                 |  265 -
 src/sgmm2/am-sgmm2-project.h                  |   86 -
 src/sgmm2/am-sgmm2-test.cc                    |  285 --
 src/sgmm2/am-sgmm2.cc                         | 1493 ------
 src/sgmm2/am-sgmm2.h                          |  586 ---
 src/sgmm2/decodable-am-sgmm2.cc               |   54 -
 src/sgmm2/decodable-am-sgmm2.h                |  138 -
 src/sgmm2/estimate-am-sgmm2-ebw.cc            |  736 ---
 src/sgmm2/estimate-am-sgmm2-ebw.h             |  242 -
 src/sgmm2/estimate-am-sgmm2-test.cc           |  167 -
 src/sgmm2/estimate-am-sgmm2.cc                | 1952 --------
 src/sgmm2/estimate-am-sgmm2.h                 |  478 --
 src/sgmm2/fmllr-sgmm2-test.cc                 |  243 -
 src/sgmm2/fmllr-sgmm2.cc                      |  555 ---
 src/sgmm2/fmllr-sgmm2.h                       |  193 -
 src/sgmm2bin/Makefile                         |   26 -
 src/sgmm2bin/init-ubm.cc                      |   95 -
 src/sgmm2bin/sgmm2-acc-stats-gpost.cc         |  181 -
 src/sgmm2bin/sgmm2-acc-stats.cc               |  223 -
 src/sgmm2bin/sgmm2-acc-stats2.cc              |  240 -
 src/sgmm2bin/sgmm2-align-compiled.cc          |  183 -
 src/sgmm2bin/sgmm2-comp-prexform.cc           |   84 -
 src/sgmm2bin/sgmm2-copy.cc                    |   74 -
 src/sgmm2bin/sgmm2-est-ebw.cc                 |  118 -
 src/sgmm2bin/sgmm2-est-fmllr.cc               |  302 --
 src/sgmm2bin/sgmm2-est-spkvecs-gpost.cc       |  218 -
 src/sgmm2bin/sgmm2-est-spkvecs.cc             |  259 -
 src/sgmm2bin/sgmm2-est.cc                     |  166 -
 src/sgmm2bin/sgmm2-gselect.cc                 |  110 -
 src/sgmm2bin/sgmm2-info.cc                    |  115 -
 src/sgmm2bin/sgmm2-init.cc                    |  132 -
 src/sgmm2bin/sgmm2-latgen-faster-parallel.cc  |  291 --
 src/sgmm2bin/sgmm2-latgen-faster.cc           |  268 -
 src/sgmm2bin/sgmm2-post-to-gpost.cc           |  186 -
 src/sgmm2bin/sgmm2-project.cc                 |  116 -
 src/sgmm2bin/sgmm2-rescore-lattice.cc         |  166 -
 src/sgmm2bin/sgmm2-sum-accs.cc                |   94 -
 src/transform/Makefile                        |   14 +-
 .../decodable-am-diag-gmm-regtree.cc          |  234 -
 src/transform/decodable-am-diag-gmm-regtree.h |  141 -
 src/transform/fmllr-raw-test.cc               |  123 -
 src/transform/fmllr-raw.cc                    |  546 --
 src/transform/fmllr-raw.h                     |  206 -
 src/transform/fmpe-test.cc                    |  177 -
 src/transform/fmpe.cc                         |  691 ---
 src/transform/fmpe.h                          |  271 -
 src/transform/regtree-fmllr-diag-gmm-test.cc  |  320 --
 src/transform/regtree-fmllr-diag-gmm.cc       |  407 --
 src/transform/regtree-fmllr-diag-gmm.h        |  204 -
 src/transform/regtree-mllr-diag-gmm-test.cc   |  194 -
 src/transform/regtree-mllr-diag-gmm.cc        |  398 --
 src/transform/regtree-mllr-diag-gmm.h         |  164 -
 src/tree/build-tree.h                         |    4 +-
 src/tree/context-dep.h                        |    6 +-
 515 files changed, 1652 insertions(+), 63007 deletions(-)
 rename src/{nnetbin => bin}/cuda-gpu-available.cc (100%)
 delete mode 100644 src/feat/feature-plp-test.cc
 delete mode 100644 src/feat/feature-plp.cc
 delete mode 100644 src/feat/feature-plp.h
 delete mode 100644 src/featbin/compute-plp-feats.cc
 delete mode 100644 src/featbin/compute-spectrogram-feats.cc
 delete mode 100644 src/gmmbin/gmm-decode-faster-regtree-fmllr.cc
 delete mode 100644 src/gmmbin/gmm-decode-faster-regtree-mllr.cc
 delete mode 100644 src/gmmbin/gmm-est-fmllr-raw-gpost.cc
 delete mode 100644 src/gmmbin/gmm-est-fmllr-raw.cc
 delete mode 100644 src/gmmbin/gmm-est-regtree-fmllr-ali.cc
 delete mode 100644 src/gmmbin/gmm-est-regtree-fmllr.cc
 delete mode 100644 src/gmmbin/gmm-est-regtree-mllr.cc
 delete mode 100644 src/gmmbin/gmm-latgen-faster-regtree-fmllr.cc
 delete mode 100644 src/gmmbin/gmm-make-regtree.cc
 delete mode 100644 src/hmm/hmm-topology.h
 rename src/hmm/{hmm-topology.cc => topology.cc} (89%)
 create mode 100644 src/hmm/topology.h
 delete mode 100644 src/hmm/transition-model.h
 rename src/hmm/{transition-model-test.cc => transitions-test.cc} (87%)
 rename src/hmm/{transition-model.cc => transitions.cc} (88%)
 create mode 100644 src/hmm/transitions.h
 delete mode 100644 src/nnet/Makefile
 delete mode 100644 src/nnet/nnet-activation.h
 delete mode 100644 src/nnet/nnet-affine-transform.h
 delete mode 100644 src/nnet/nnet-average-pooling-2d-component.h
 delete mode 100644 src/nnet/nnet-average-pooling-component.h
 delete mode 100644 src/nnet/nnet-blstm-projected.h
 delete mode 100644 src/nnet/nnet-component-test.cc
 delete mode 100644 src/nnet/nnet-component.cc
 delete mode 100644 src/nnet/nnet-component.h
 delete mode 100644 src/nnet/nnet-convolutional-2d-component.h
 delete mode 100644 src/nnet/nnet-convolutional-component.h
 delete mode 100644 src/nnet/nnet-frame-pooling-component.h
 delete mode 100644 src/nnet/nnet-kl-hmm.h
 delete mode 100644 src/nnet/nnet-linear-transform.h
 delete mode 100644 src/nnet/nnet-loss.cc
 delete mode 100644 src/nnet/nnet-loss.h
 delete mode 100644 src/nnet/nnet-lstm-projected.h
 delete mode 100644 src/nnet/nnet-matrix-buffer.h
 delete mode 100644 src/nnet/nnet-max-pooling-2d-component.h
 delete mode 100644 src/nnet/nnet-max-pooling-component.h
 delete mode 100644 src/nnet/nnet-multibasis-component.h
 delete mode 100644 src/nnet/nnet-nnet.cc
 delete mode 100644 src/nnet/nnet-nnet.h
 delete mode 100644 src/nnet/nnet-parallel-component.h
 delete mode 100644 src/nnet/nnet-parametric-relu.h
 delete mode 100644 src/nnet/nnet-pdf-prior.cc
 delete mode 100644 src/nnet/nnet-pdf-prior.h
 delete mode 100644 src/nnet/nnet-randomizer-test.cc
 delete mode 100644 src/nnet/nnet-randomizer.cc
 delete mode 100644 src/nnet/nnet-randomizer.h
 delete mode 100644 src/nnet/nnet-rbm.h
 delete mode 100644 src/nnet/nnet-recurrent.h
 delete mode 100644 src/nnet/nnet-sentence-averaging-component.h
 delete mode 100644 src/nnet/nnet-trnopts.h
 delete mode 100644 src/nnet/nnet-utils.h
 delete mode 100644 src/nnet/nnet-various.h
 delete mode 100644 src/nnet2/Makefile
 delete mode 100644 src/nnet2/am-nnet-test.cc
 delete mode 100644 src/nnet2/am-nnet.cc
 delete mode 100644 src/nnet2/am-nnet.h
 delete mode 100644 src/nnet2/combine-nnet-a.cc
 delete mode 100644 src/nnet2/combine-nnet-a.h
 delete mode 100644 src/nnet2/combine-nnet-fast.cc
 delete mode 100644 src/nnet2/combine-nnet-fast.h
 delete mode 100644 src/nnet2/combine-nnet.cc
 delete mode 100644 src/nnet2/combine-nnet.h
 delete mode 100644 src/nnet2/decodable-am-nnet.h
 delete mode 100644 src/nnet2/get-feature-transform.cc
 delete mode 100644 src/nnet2/get-feature-transform.h
 delete mode 100644 src/nnet2/mixup-nnet.cc
 delete mode 100644 src/nnet2/mixup-nnet.h
 delete mode 100644 src/nnet2/nnet-component-test.cc
 delete mode 100644 src/nnet2/nnet-component.cc
 delete mode 100644 src/nnet2/nnet-component.h
 delete mode 100644 src/nnet2/nnet-compute-discriminative-parallel.cc
 delete mode 100644 src/nnet2/nnet-compute-discriminative-parallel.h
 delete mode 100644 src/nnet2/nnet-compute-discriminative.cc
 delete mode 100644 src/nnet2/nnet-compute-discriminative.h
 delete mode 100644 src/nnet2/nnet-compute-online.cc
 delete mode 100644 src/nnet2/nnet-compute-online.h
 delete mode 100644 src/nnet2/nnet-compute-test.cc
 delete mode 100644 src/nnet2/nnet-compute.cc
 delete mode 100644 src/nnet2/nnet-compute.h
 delete mode 100644 src/nnet2/nnet-example-functions-test.cc
 delete mode 100644 src/nnet2/nnet-example-functions.cc
 delete mode 100644 src/nnet2/nnet-example-functions.h
 delete mode 100644 src/nnet2/nnet-example.cc
 delete mode 100644 src/nnet2/nnet-example.h
 delete mode 100644 src/nnet2/nnet-fix.cc
 delete mode 100644 src/nnet2/nnet-fix.h
 delete mode 100644 src/nnet2/nnet-functions.cc
 delete mode 100644 src/nnet2/nnet-functions.h
 delete mode 100644 src/nnet2/nnet-limit-rank.cc
 delete mode 100644 src/nnet2/nnet-limit-rank.h
 delete mode 100644 src/nnet2/nnet-nnet-test.cc
 delete mode 100644 src/nnet2/nnet-nnet.cc
 delete mode 100644 src/nnet2/nnet-nnet.h
 delete mode 100644 src/nnet2/nnet-precondition-online-test.cc
 delete mode 100644 src/nnet2/nnet-precondition-online.cc
 delete mode 100644 src/nnet2/nnet-precondition-online.h
 delete mode 100644 src/nnet2/nnet-precondition-test.cc
 delete mode 100644 src/nnet2/nnet-precondition.cc
 delete mode 100644 src/nnet2/nnet-precondition.h
 delete mode 100644 src/nnet2/nnet-stats.cc
 delete mode 100644 src/nnet2/nnet-stats.h
 delete mode 100644 src/nnet2/nnet-update-parallel.cc
 delete mode 100644 src/nnet2/nnet-update-parallel.h
 delete mode 100644 src/nnet2/nnet-update.cc
 delete mode 100644 src/nnet2/nnet-update.h
 delete mode 100644 src/nnet2/online-nnet2-decodable-test.cc
 delete mode 100644 src/nnet2/online-nnet2-decodable.cc
 delete mode 100644 src/nnet2/online-nnet2-decodable.h
 delete mode 100644 src/nnet2/rescale-nnet.cc
 delete mode 100644 src/nnet2/rescale-nnet.h
 delete mode 100644 src/nnet2/shrink-nnet.cc
 delete mode 100644 src/nnet2/shrink-nnet.h
 delete mode 100644 src/nnet2/train-nnet-ensemble.cc
 delete mode 100644 src/nnet2/train-nnet-ensemble.h
 delete mode 100644 src/nnet2/train-nnet.cc
 delete mode 100644 src/nnet2/train-nnet.h
 delete mode 100644 src/nnet2/widen-nnet.cc
 delete mode 100644 src/nnet2/widen-nnet.h
 delete mode 100644 src/nnet2bin/Makefile
 delete mode 100644 src/nnet2bin/cuda-compiled.cc
 delete mode 100644 src/nnet2bin/nnet-adjust-priors.cc
 delete mode 100644 src/nnet2bin/nnet-align-compiled.cc
 delete mode 100644 src/nnet2bin/nnet-am-average.cc
 delete mode 100644 src/nnet2bin/nnet-am-compute.cc
 delete mode 100644 src/nnet2bin/nnet-am-copy.cc
 delete mode 100644 src/nnet2bin/nnet-am-fix.cc
 delete mode 100644 src/nnet2bin/nnet-am-info.cc
 delete mode 100644 src/nnet2bin/nnet-am-init.cc
 delete mode 100644 src/nnet2bin/nnet-am-mixup.cc
 delete mode 100644 src/nnet2bin/nnet-am-reinitialize.cc
 delete mode 100644 src/nnet2bin/nnet-am-switch-preconditioning.cc
 delete mode 100644 src/nnet2bin/nnet-am-widen.cc
 delete mode 100644 src/nnet2bin/nnet-combine-egs-discriminative.cc
 delete mode 100644 src/nnet2bin/nnet-combine-fast.cc
 delete mode 100644 src/nnet2bin/nnet-combine.cc
 delete mode 100644 src/nnet2bin/nnet-compare-hash-discriminative.cc
 delete mode 100644 src/nnet2bin/nnet-compute-from-egs.cc
 delete mode 100644 src/nnet2bin/nnet-compute-prob.cc
 delete mode 100644 src/nnet2bin/nnet-compute.cc
 delete mode 100644 src/nnet2bin/nnet-copy-egs-discriminative.cc
 delete mode 100644 src/nnet2bin/nnet-copy-egs.cc
 delete mode 100644 src/nnet2bin/nnet-get-egs-discriminative.cc
 delete mode 100644 src/nnet2bin/nnet-get-egs.cc
 delete mode 100644 src/nnet2bin/nnet-get-feature-transform-multi.cc
 delete mode 100644 src/nnet2bin/nnet-get-feature-transform.cc
 delete mode 100644 src/nnet2bin/nnet-get-weighted-egs.cc
 delete mode 100644 src/nnet2bin/nnet-init.cc
 delete mode 100644 src/nnet2bin/nnet-insert.cc
 delete mode 100644 src/nnet2bin/nnet-latgen-faster-parallel.cc
 delete mode 100644 src/nnet2bin/nnet-latgen-faster.cc
 delete mode 100644 src/nnet2bin/nnet-modify-learning-rates.cc
 delete mode 100644 src/nnet2bin/nnet-normalize-stddev.cc
 delete mode 100644 src/nnet2bin/nnet-relabel-egs.cc
 delete mode 100644 src/nnet2bin/nnet-replace-last-layers.cc
 delete mode 100644 src/nnet2bin/nnet-show-progress.cc
 delete mode 100644 src/nnet2bin/nnet-shuffle-egs-discriminative.cc
 delete mode 100644 src/nnet2bin/nnet-shuffle-egs.cc
 delete mode 100644 src/nnet2bin/nnet-subset-egs.cc
 delete mode 100644 src/nnet2bin/nnet-to-raw-nnet.cc
 delete mode 100644 src/nnet2bin/nnet-train-discriminative-parallel.cc
 delete mode 100644 src/nnet2bin/nnet-train-discriminative-simple.cc
 delete mode 100644 src/nnet2bin/nnet-train-ensemble.cc
 delete mode 100644 src/nnet2bin/nnet-train-parallel.cc
 delete mode 100644 src/nnet2bin/nnet-train-simple.cc
 delete mode 100644 src/nnet2bin/nnet-train-transitions.cc
 delete mode 100644 src/nnet2bin/nnet1-to-raw-nnet.cc
 delete mode 100644 src/nnet2bin/raw-nnet-concat.cc
 delete mode 100644 src/nnet2bin/raw-nnet-copy.cc
 delete mode 100644 src/nnet2bin/raw-nnet-info.cc
 delete mode 120000 src/nnet2bin/raw-nnet-init
 delete mode 100644 src/nnet3bin/nnet3-am-train-transitions.cc
 delete mode 100644 src/nnetbin/Makefile
 delete mode 100644 src/nnetbin/cmvn-to-nnet.cc
 delete mode 100644 src/nnetbin/feat-to-post.cc
 delete mode 100644 src/nnetbin/nnet-concat.cc
 delete mode 100644 src/nnetbin/nnet-copy.cc
 delete mode 100644 src/nnetbin/nnet-forward.cc
 delete mode 100644 src/nnetbin/nnet-info.cc
 delete mode 100644 src/nnetbin/nnet-initialize.cc
 delete mode 100644 src/nnetbin/nnet-set-learnrate.cc
 delete mode 100644 src/nnetbin/nnet-train-frmshuff.cc
 delete mode 100644 src/nnetbin/nnet-train-mmi-sequential.cc
 delete mode 100644 src/nnetbin/nnet-train-mpe-sequential.cc
 delete mode 100644 src/nnetbin/nnet-train-multistream-perutt.cc
 delete mode 100644 src/nnetbin/nnet-train-multistream.cc
 delete mode 100644 src/nnetbin/nnet-train-perutt.cc
 delete mode 100644 src/nnetbin/paste-post.cc
 delete mode 100644 src/nnetbin/rbm-convert-to-nnet.cc
 delete mode 100644 src/nnetbin/rbm-train-cd1-frmshuff.cc
 delete mode 100644 src/nnetbin/train-transitions.cc
 delete mode 100644 src/nnetbin/transf-to-nnet.cc
 delete mode 100644 src/online2/online-nnet2-decoding-threaded.cc
 delete mode 100644 src/online2/online-nnet2-decoding.cc
 rename src/online2/{online-nnet2-feature-pipeline.cc => online2-feature-pipeline.cc} (100%)
 delete mode 100644 src/sgmm2/Makefile
 delete mode 100644 src/sgmm2/am-sgmm2-project.cc
 delete mode 100644 src/sgmm2/am-sgmm2-project.h
 delete mode 100644 src/sgmm2/am-sgmm2-test.cc
 delete mode 100644 src/sgmm2/am-sgmm2.cc
 delete mode 100644 src/sgmm2/am-sgmm2.h
 delete mode 100644 src/sgmm2/decodable-am-sgmm2.cc
 delete mode 100644 src/sgmm2/decodable-am-sgmm2.h
 delete mode 100644 src/sgmm2/estimate-am-sgmm2-ebw.cc
 delete mode 100644 src/sgmm2/estimate-am-sgmm2-ebw.h
 delete mode 100644 src/sgmm2/estimate-am-sgmm2-test.cc
 delete mode 100644 src/sgmm2/estimate-am-sgmm2.cc
 delete mode 100644 src/sgmm2/estimate-am-sgmm2.h
 delete mode 100644 src/sgmm2/fmllr-sgmm2-test.cc
 delete mode 100644 src/sgmm2/fmllr-sgmm2.cc
 delete mode 100644 src/sgmm2/fmllr-sgmm2.h
 delete mode 100644 src/sgmm2bin/Makefile
 delete mode 100644 src/sgmm2bin/init-ubm.cc
 delete mode 100644 src/sgmm2bin/sgmm2-acc-stats-gpost.cc
 delete mode 100644 src/sgmm2bin/sgmm2-acc-stats.cc
 delete mode 100644 src/sgmm2bin/sgmm2-acc-stats2.cc
 delete mode 100644 src/sgmm2bin/sgmm2-align-compiled.cc
 delete mode 100644 src/sgmm2bin/sgmm2-comp-prexform.cc
 delete mode 100644 src/sgmm2bin/sgmm2-copy.cc
 delete mode 100644 src/sgmm2bin/sgmm2-est-ebw.cc
 delete mode 100644 src/sgmm2bin/sgmm2-est-fmllr.cc
 delete mode 100644 src/sgmm2bin/sgmm2-est-spkvecs-gpost.cc
 delete mode 100644 src/sgmm2bin/sgmm2-est-spkvecs.cc
 delete mode 100644 src/sgmm2bin/sgmm2-est.cc
 delete mode 100644 src/sgmm2bin/sgmm2-gselect.cc
 delete mode 100644 src/sgmm2bin/sgmm2-info.cc
 delete mode 100644 src/sgmm2bin/sgmm2-init.cc
 delete mode 100644 src/sgmm2bin/sgmm2-latgen-faster-parallel.cc
 delete mode 100644 src/sgmm2bin/sgmm2-latgen-faster.cc
 delete mode 100644 src/sgmm2bin/sgmm2-post-to-gpost.cc
 delete mode 100644 src/sgmm2bin/sgmm2-project.cc
 delete mode 100644 src/sgmm2bin/sgmm2-rescore-lattice.cc
 delete mode 100644 src/sgmm2bin/sgmm2-sum-accs.cc
 delete mode 100644 src/transform/decodable-am-diag-gmm-regtree.cc
 delete mode 100644 src/transform/decodable-am-diag-gmm-regtree.h
 delete mode 100644 src/transform/fmllr-raw-test.cc
 delete mode 100644 src/transform/fmllr-raw.cc
 delete mode 100644 src/transform/fmllr-raw.h
 delete mode 100644 src/transform/fmpe-test.cc
 delete mode 100644 src/transform/fmpe.cc
 delete mode 100644 src/transform/fmpe.h
 delete mode 100644 src/transform/regtree-fmllr-diag-gmm-test.cc
 delete mode 100644 src/transform/regtree-fmllr-diag-gmm.cc
 delete mode 100644 src/transform/regtree-fmllr-diag-gmm.h
 delete mode 100644 src/transform/regtree-mllr-diag-gmm-test.cc
 delete mode 100644 src/transform/regtree-mllr-diag-gmm.cc
 delete mode 100644 src/transform/regtree-mllr-diag-gmm.h

diff --git a/src/Makefile b/src/Makefile
index a49c912c6ed..88da5ed1e55 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -6,15 +6,15 @@ SHELL := /bin/bash
 
 
 SUBDIRS = base matrix util feat tree gmm transform \
-          fstext hmm lm decoder lat kws cudamatrix nnet \
+          fstext hmm lm decoder lat kws cudamatrix \
           bin fstbin gmmbin fgmmbin featbin \
-          nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 rnnlm chain nnet3bin nnet2bin kwsbin \
+          latbin nnet3 rnnlm chain nnet3bin kwsbin \
           ivector ivectorbin online2 online2bin lmbin chainbin rnnlmbin
 
 MEMTESTDIRS = base matrix util feat tree gmm transform \
-          fstext hmm lm decoder lat nnet kws chain \
+          fstext hmm lm decoder lat kws chain \
           bin fstbin gmmbin fgmmbin featbin \
-          nnetbin latbin sgmm2 nnet2 nnet3 rnnlm nnet2bin nnet3bin sgmm2bin kwsbin \
+          latbin nnet3 rnnlm nnet3bin kwsbin \
           ivector ivectorbin online2 online2bin lmbin
 
 CUDAMEMTESTDIR = cudamatrix
@@ -23,9 +23,6 @@ SUBDIRS_LIB = $(filter-out %bin, $(SUBDIRS))
 
 KALDI_SONAME ?= libkaldi.so
 
-# Optional subdirectories
-EXT_SUBDIRS = online onlinebin  # python-kaldi-decoding
-EXT_SUBDIRS_LIB = $(filter-out %bin, $(EXT_SUBDIRS))
 
 include kaldi.mk
 
@@ -72,19 +69,6 @@ endif
 endif
 endif
 
-biglibext: $(EXT_SUBDIRS_LIB)
-ifeq ($(KALDI_FLAVOR), dynamic)
-ifeq ($(shell uname), Darwin)
-	$(CXX) -dynamiclib -o $(KALDILIBDIR)/libkaldi_ext.dylib -install_name @rpath/libkaldi_ext.dylib -framework Accelerate $(LDFLAGS) $(EXT_SUBDIRS_LIB:=/*.dylib)
-else
-ifeq ($(shell uname), Linux)
-	#$(warning The following command will probably fail, in that case add -fPIC to your CXXFLAGS and remake all.)
-	$(CXX) -shared -o $(KALDILIBDIR)/libkaldi_ext.so -Wl,-soname=libkaldi_ext.so,--whole-archive  $(EXT_SUBDIRS_LIB:=/kaldi-*.a) -Wl,--no-whole-archive
-else
-	$(error Dynamic libraries not supported on this platform. Run configure with --static flag. )
-endif
-endif
-endif
 
 kaldi.mk:
 	@[ -f kaldi.mk ] || { echo "kaldi.mk does not exist; you have to run ./configure"; exit 1; }
@@ -143,9 +127,9 @@ $(EXT_SUBDIRS) : checkversion kaldi.mk mklibdir ext_depend
 ### Dependency list ###
 # this is necessary for correct parallel compilation
 #1)The tools depend on all the libraries
-bin fstbin gmmbin fgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin: \
+bin fstbin gmmbin fgmmbin sgmm2bin featbin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin: \
  base matrix util feat tree gmm transform sgmm2 fstext hmm \
- lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain kws online2 rnnlm
+ lm decoder lat cudamatrix nnet3 ivector chain kws online2 rnnlm
 
 #2)The libraries have inter-dependencies
 base: base/.depend.mk
@@ -162,15 +146,13 @@ lm: base util matrix fstext
 decoder: base util matrix gmm hmm tree transform lat
 lat: base util hmm tree matrix
 cudamatrix: base util matrix
-nnet: base util hmm tree matrix cudamatrix
-nnet2: base util matrix lat gmm hmm tree transform cudamatrix
 nnet3: base util matrix lat gmm hmm tree transform cudamatrix chain fstext
 rnnlm: base util matrix cudamatrix nnet3 lm hmm
 chain: lat hmm tree fstext matrix cudamatrix util base
 ivector: base util matrix transform tree gmm
 #3)Dependencies for optional parts of Kaldi
-onlinebin: base matrix util feat tree gmm transform sgmm2 fstext hmm lm decoder lat cudamatrix nnet nnet2 online
+onlinebin: base matrix util feat tree gmm transform sgmm2 fstext hmm lm decoder lat cudamatrix online
 # python-kaldi-decoding: base matrix util feat tree gmm transform sgmm2 fstext hmm decoder lat online
 online: decoder gmm transform feat matrix util base lat hmm tree
-online2: decoder gmm transform feat matrix util base lat hmm tree ivector cudamatrix nnet2 nnet3 chain
+online2: decoder gmm transform feat matrix util base lat hmm tree ivector cudamatrix nnet3 chain
 kws: base util hmm tree matrix lat
diff --git a/src/bin/Makefile b/src/bin/Makefile
index 7cb01b50120..f8f0564743c 100644
--- a/src/bin/Makefile
+++ b/src/bin/Makefile
@@ -22,7 +22,7 @@ BINFILES = align-equal align-equal-compiled acc-tree-stats \
         matrix-sum build-pfile-from-ali get-post-on-ali tree-info am-info \
         vector-sum matrix-sum-rows est-pca sum-lda-accs sum-mllt-accs \
         transform-vec align-text matrix-dim post-to-smat compile-graph \
-        compare-int-vector
+        compare-int-vector cuda-gpu-available
 
 
 OBJFILES =
diff --git a/src/bin/acc-lda.cc b/src/bin/acc-lda.cc
index b664135bdc7..a0451218513 100644
--- a/src/bin/acc-lda.cc
+++ b/src/bin/acc-lda.cc
@@ -21,7 +21,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/posterior.h"
 #include "transform/lda-estimate.h"
 
@@ -57,7 +57,7 @@ int main(int argc, char *argv[]) {
     std::string posteriors_rspecifier = po.GetArg(3);
     std::string acc_wxfilename = po.GetArg(4);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary_read;
       Input ki(model_rxfilename, &binary_read);
diff --git a/src/bin/acc-tree-stats.cc b/src/bin/acc-tree-stats.cc
index 8b9ce9065b4..c0eb31f6064 100644
--- a/src/bin/acc-tree-stats.cc
+++ b/src/bin/acc-tree-stats.cc
@@ -22,7 +22,7 @@
 #include "util/common-utils.h"
 #include "tree/context-dep.h"
 #include "tree/build-tree-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/tree-accu.h"
 
 /** @brief Accumulate tree statistics for decision tree training. The
@@ -62,7 +62,7 @@ int main(int argc, char *argv[]) {
 
     AccumulateTreeStatsInfo acc_tree_stats_info(opts);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary;
       Input ki(model_filename, &binary);
diff --git a/src/bin/add-self-loops.cc b/src/bin/add-self-loops.cc
index b223dfe317d..562b0977a69 100644
--- a/src/bin/add-self-loops.cc
+++ b/src/bin/add-self-loops.cc
@@ -18,7 +18,7 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "tree/context-dep.h"
 #include "util/common-utils.h"
@@ -88,7 +88,7 @@ int main(int argc, char *argv[]) {
                       "standard input" : disambig_in_filename);
     }
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_in_filename, &trans_model);
 
 
diff --git a/src/bin/ali-to-pdf.cc b/src/bin/ali-to-pdf.cc
index 61b5138cf31..1706f5aa371 100644
--- a/src/bin/ali-to-pdf.cc
+++ b/src/bin/ali-to-pdf.cc
@@ -21,7 +21,7 @@
 */
 #include "base/kaldi-common.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "util/common-utils.h"
 #include "fst/fstlib.h"
@@ -48,7 +48,7 @@ int main(int argc, char *argv[]) {
         alignments_rspecifier = po.GetArg(2),
         pdfs_wspecifier = po.GetArg(3);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_filename, &trans_model);
 
     SequentialInt32VectorReader reader(alignments_rspecifier);
diff --git a/src/bin/ali-to-phones.cc b/src/bin/ali-to-phones.cc
index 602e32e9768..5def11ffc79 100644
--- a/src/bin/ali-to-phones.cc
+++ b/src/bin/ali-to-phones.cc
@@ -20,7 +20,7 @@
 
 
 #include "base/kaldi-common.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "util/common-utils.h"
 #include "fst/fstlib.h"
@@ -68,7 +68,7 @@ int main(int argc, char *argv[]) {
     std::string model_filename = po.GetArg(1),
         alignments_rspecifier = po.GetArg(2);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_filename, &trans_model);
 
     SequentialInt32VectorReader reader(alignments_rspecifier);
diff --git a/src/bin/ali-to-post.cc b/src/bin/ali-to-post.cc
index ac87d676c06..00c026c0692 100644
--- a/src/bin/ali-to-post.cc
+++ b/src/bin/ali-to-post.cc
@@ -22,7 +22,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "hmm/posterior.h"
 
diff --git a/src/bin/align-compiled-mapped.cc b/src/bin/align-compiled-mapped.cc
index 98ffebd6eaa..ab7425c1a32 100644
--- a/src/bin/align-compiled-mapped.cc
+++ b/src/bin/align-compiled-mapped.cc
@@ -20,7 +20,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/decoder-wrappers.h"
@@ -74,7 +74,7 @@ int main(int argc, char *argv[]) {
     std::string alignment_wspecifier = po.GetArg(4);
     std::string scores_wspecifier = po.GetOptArg(5);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_in_filename, &trans_model);
 
     SequentialBaseFloatMatrixReader loglikes_reader(feature_rspecifier);
diff --git a/src/bin/align-equal-compiled.cc b/src/bin/align-equal-compiled.cc
index c4ab9d4205a..f5900727aef 100644
--- a/src/bin/align-equal-compiled.cc
+++ b/src/bin/align-equal-compiled.cc
@@ -21,7 +21,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/training-graph-compiler.h"
 
diff --git a/src/bin/align-equal.cc b/src/bin/align-equal.cc
index a3bc40dc236..671c515f33e 100644
--- a/src/bin/align-equal.cc
+++ b/src/bin/align-equal.cc
@@ -21,7 +21,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/training-graph-compiler.h"
 
@@ -65,7 +65,7 @@ int main(int argc, char *argv[]) {
     ContextDependency ctx_dep;
     ReadKaldiObject(tree_in_filename, &ctx_dep);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_in_filename, &trans_model);
 
     // need VectorFst because we will change it by adding subseq symbol.
diff --git a/src/bin/align-mapped.cc b/src/bin/align-mapped.cc
index c78401fffdd..e8249c4a123 100644
--- a/src/bin/align-mapped.cc
+++ b/src/bin/align-mapped.cc
@@ -20,7 +20,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/decoder-wrappers.h"
 #include "decoder/training-graph-compiler.h"
@@ -72,7 +72,7 @@ int main(int argc, char *argv[]) {
     ContextDependency ctx_dep;
     ReadKaldiObject(tree_in_filename, &ctx_dep);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_in_filename, &trans_model);
 
     VectorFst<StdArc> *lex_fst = fst::ReadFstKaldi(lex_in_filename);
diff --git a/src/bin/am-info.cc b/src/bin/am-info.cc
index 6afb0c5014e..dd59047c35c 100644
--- a/src/bin/am-info.cc
+++ b/src/bin/am-info.cc
@@ -20,7 +20,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 int main(int argc, char *argv[]) {
   try {
@@ -45,7 +45,7 @@ int main(int argc, char *argv[]) {
 
     std::string model_in_filename = po.GetArg(1);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary_read;
       Input ki(model_in_filename, &binary_read);
diff --git a/src/bin/build-pfile-from-ali.cc b/src/bin/build-pfile-from-ali.cc
index fadb873825f..fb82fe27eaa 100644
--- a/src/bin/build-pfile-from-ali.cc
+++ b/src/bin/build-pfile-from-ali.cc
@@ -25,7 +25,7 @@ using std::vector;
 
 #include "base/kaldi-common.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "util/common-utils.h"
 
@@ -64,7 +64,7 @@ int main(int argc, char *argv[]) {
         feature_rspecifier = po.GetArg(3),
         pfile_wspecifier = po.GetArg(4);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
       bool binary;
diff --git a/src/bin/build-tree-two-level.cc b/src/bin/build-tree-two-level.cc
index c7cd553484e..005c5d80532 100644
--- a/src/bin/build-tree-two-level.cc
+++ b/src/bin/build-tree-two-level.cc
@@ -20,7 +20,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/hmm-topology.h"
+#include "hmm/topology.h"
 #include "tree/context-dep.h"
 #include "tree/build-tree.h"
 #include "tree/build-tree-utils.h"
@@ -112,7 +112,7 @@ int main(int argc, char *argv[]) {
       ReadRootsFile(ki.Stream(), &phone_sets, &is_shared_root, &is_split_root);
     }
 
-    HmmTopology topo;
+    Topology topo;
     ReadKaldiObject(topo_filename, &topo);
 
     BuildTreeStatsType stats;
diff --git a/src/bin/build-tree.cc b/src/bin/build-tree.cc
index 72774900d61..b37c9c7d184 100644
--- a/src/bin/build-tree.cc
+++ b/src/bin/build-tree.cc
@@ -20,7 +20,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/hmm-topology.h"
+#include "hmm/topology.h"
 #include "tree/context-dep.h"
 #include "tree/build-tree.h"
 #include "tree/build-tree-utils.h"
@@ -91,7 +91,7 @@ int main(int argc, char *argv[]) {
       ReadRootsFile(ki.Stream(), &phone_sets, &is_shared_root, &is_split_root);
     }
 
-    HmmTopology topo;
+    Topology topo;
     ReadKaldiObject(topo_filename, &topo);
 
     BuildTreeStatsType stats;
diff --git a/src/bin/compile-graph.cc b/src/bin/compile-graph.cc
index 7174fdf8113..2dae81fa702 100644
--- a/src/bin/compile-graph.cc
+++ b/src/bin/compile-graph.cc
@@ -20,7 +20,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "fstext/fstext-lib.h"
 #include "fstext/push-special.h"
@@ -81,7 +81,7 @@ int main(int argc, char *argv[]) {
     ContextDependency ctx_dep;  // the tree.
     ReadKaldiObject(tree_rxfilename, &ctx_dep);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_rxfilename, &trans_model);
 
     VectorFst<StdArc> *lex_fst = fst::ReadFstKaldi(lex_rxfilename),
diff --git a/src/bin/compile-questions.cc b/src/bin/compile-questions.cc
index f9694140ae8..1c8565e032d 100644
--- a/src/bin/compile-questions.cc
+++ b/src/bin/compile-questions.cc
@@ -19,12 +19,12 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/hmm-topology.h"
+#include "hmm/topology.h"
 #include "tree/build-tree-questions.h"
 
 
 namespace kaldi {
-int32 ProcessTopo(const HmmTopology &topo, const std::vector<std::vector<int32> > &questions) {
+int32 ProcessTopo(const Topology &topo, const std::vector<std::vector<int32> > &questions) {
   std::vector<int32> seen_phones;  // ids of phones seen in questions.
   for (size_t i = 0; i < questions.size(); i++)
     for (size_t j= 0; j < questions[i].size(); j++) seen_phones.push_back(questions[i][j]);
@@ -93,7 +93,7 @@ int main(int argc, char *argv[]) {
         questions_rxfilename = po.GetArg(2),
         questions_out_filename = po.GetArg(3);
 
-    HmmTopology topo;  // just needed for checking, and to get the
+    Topology topo;  // just needed for checking, and to get the
     // largest number of pdf-classes for any phone.
     ReadKaldiObject(topo_filename, &topo);
 
diff --git a/src/bin/compile-train-graphs-fsts.cc b/src/bin/compile-train-graphs-fsts.cc
index 00ec1038943..473887538ae 100644
--- a/src/bin/compile-train-graphs-fsts.cc
+++ b/src/bin/compile-train-graphs-fsts.cc
@@ -21,7 +21,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/training-graph-compiler.h"
 
@@ -80,7 +80,7 @@ int main(int argc, char *argv[]) {
     ContextDependency ctx_dep;  // the tree.
     ReadKaldiObject(tree_rxfilename, &ctx_dep);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_rxfilename, &trans_model);
 
     // need VectorFst because we will change it by adding subseq symbol.
diff --git a/src/bin/compile-train-graphs.cc b/src/bin/compile-train-graphs.cc
index 874d079376e..a0722c920b4 100644
--- a/src/bin/compile-train-graphs.cc
+++ b/src/bin/compile-train-graphs.cc
@@ -21,7 +21,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/training-graph-compiler.h"
 
@@ -74,7 +74,7 @@ int main(int argc, char *argv[]) {
     ContextDependency ctx_dep;  // the tree.
     ReadKaldiObject(tree_rxfilename, &ctx_dep);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_rxfilename, &trans_model);
 
     // need VectorFst because we will change it by adding subseq symbol.
diff --git a/src/bin/convert-ali.cc b/src/bin/convert-ali.cc
index 89fe838638c..7daeb40ca53 100644
--- a/src/bin/convert-ali.cc
+++ b/src/bin/convert-ali.cc
@@ -22,7 +22,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "hmm/tree-accu.h" // for ReadPhoneMap
 
@@ -48,7 +48,7 @@ int main(int argc, char *argv[]) {
                 "old-integer-id new-integer-id)");
     po.Register("reorder", &reorder,
                 "True if you want the converted alignments to be 'reordered' "
-                "versus the way they appear in the HmmTopology object");
+                "versus the way they appear in the Topology object");
     po.Register("repeat-frames", &repeat_frames,
                 "Only relevant when frame-subsampling-factor != 1.  If true, "
                 "repeat frames of alignment by 'frame-subsampling-factor' "
@@ -79,10 +79,10 @@ int main(int argc, char *argv[]) {
     SequentialInt32VectorReader alignment_reader(old_alignments_rspecifier);
     Int32VectorWriter alignment_writer(new_alignments_wspecifier);
 
-    TransitionModel old_trans_model;
+    Transitions old_trans_model;
     ReadKaldiObject(old_model_filename, &old_trans_model);
 
-    TransitionModel new_trans_model;
+    Transitions new_trans_model;
     ReadKaldiObject(new_model_filename, &new_trans_model);
 
     if (!(old_trans_model.GetTopo() == new_trans_model.GetTopo()))
diff --git a/src/bin/copy-gselect.cc b/src/bin/copy-gselect.cc
index e6c92013b58..ee427d59b8e 100644
--- a/src/bin/copy-gselect.cc
+++ b/src/bin/copy-gselect.cc
@@ -21,7 +21,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 int main(int argc, char *argv[]) {
   try {
diff --git a/src/bin/copy-transition-model.cc b/src/bin/copy-transition-model.cc
index 62a5d0c51dd..b05c64d28bf 100644
--- a/src/bin/copy-transition-model.cc
+++ b/src/bin/copy-transition-model.cc
@@ -17,7 +17,7 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fst/fstlib.h"
 #include "util/common-utils.h"
 
@@ -54,7 +54,7 @@ int main(int argc, char *argv[]) {
         transition_model_wxfilename = po.GetArg(2);
 
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(transition_model_rxfilename, &trans_model);
 
     WriteKaldiObject(trans_model, transition_model_wxfilename, binary);
diff --git a/src/bin/copy-tree.cc b/src/bin/copy-tree.cc
index c412366b151..69ab0c309ad 100644
--- a/src/bin/copy-tree.cc
+++ b/src/bin/copy-tree.cc
@@ -20,7 +20,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/hmm-topology.h"
+#include "hmm/topology.h"
 #include "tree/context-dep.h"
 #include "tree/clusterable-classes.h"
 #include "util/text-utils.h"
diff --git a/src/nnetbin/cuda-gpu-available.cc b/src/bin/cuda-gpu-available.cc
similarity index 100%
rename from src/nnetbin/cuda-gpu-available.cc
rename to src/bin/cuda-gpu-available.cc
diff --git a/src/bin/decode-faster-mapped.cc b/src/bin/decode-faster-mapped.cc
index c7411592504..4606933411f 100644
--- a/src/bin/decode-faster-mapped.cc
+++ b/src/bin/decode-faster-mapped.cc
@@ -22,7 +22,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/faster-decoder.h"
 #include "decoder/decodable-matrix.h"
@@ -67,7 +67,7 @@ int main(int argc, char *argv[]) {
         words_wspecifier = po.GetArg(4),
         alignment_wspecifier = po.GetOptArg(5);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_in_filename, &trans_model);
 
     Int32VectorWriter words_writer(words_wspecifier);
diff --git a/src/bin/decode-faster.cc b/src/bin/decode-faster.cc
index cbcdb771d56..a1e112b129f 100644
--- a/src/bin/decode-faster.cc
+++ b/src/bin/decode-faster.cc
@@ -22,7 +22,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/faster-decoder.h"
 #include "decoder/decodable-matrix.h"
diff --git a/src/bin/est-mllt.cc b/src/bin/est-mllt.cc
index 48021304b80..2a01f0dbb78 100644
--- a/src/bin/est-mllt.cc
+++ b/src/bin/est-mllt.cc
@@ -20,7 +20,7 @@
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "transform/mllt.h"
 
 int main(int argc, char *argv[]) {
diff --git a/src/bin/get-post-on-ali.cc b/src/bin/get-post-on-ali.cc
index 6d6dfd0d3df..471bbfbfff2 100644
--- a/src/bin/get-post-on-ali.cc
+++ b/src/bin/get-post-on-ali.cc
@@ -22,7 +22,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "hmm/posterior.h"
 
diff --git a/src/bin/hmm-info.cc b/src/bin/hmm-info.cc
index 4ece5e88171..30d6f999c8e 100644
--- a/src/bin/hmm-info.cc
+++ b/src/bin/hmm-info.cc
@@ -19,7 +19,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 int main(int argc, char *argv[]) {
   try {
@@ -43,7 +43,7 @@ int main(int argc, char *argv[]) {
 
     std::string model_in_filename = po.GetArg(1);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary_read;
       Input ki(model_in_filename, &binary_read);
diff --git a/src/bin/latgen-faster-mapped-parallel.cc b/src/bin/latgen-faster-mapped-parallel.cc
index 4479ec8b73e..415fd1a3584 100644
--- a/src/bin/latgen-faster-mapped-parallel.cc
+++ b/src/bin/latgen-faster-mapped-parallel.cc
@@ -24,7 +24,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/decoder-wrappers.h"
 #include "decoder/decodable-matrix.h"
@@ -74,7 +74,7 @@ int main(int argc, char *argv[]) {
         words_wspecifier = po.GetOptArg(5),
         alignment_wspecifier = po.GetOptArg(6);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_in_filename, &trans_model);
 
     bool determinize = config.determinize_lattice;
diff --git a/src/bin/latgen-faster-mapped.cc b/src/bin/latgen-faster-mapped.cc
index 610d9aa6d7d..3a65d78be04 100644
--- a/src/bin/latgen-faster-mapped.cc
+++ b/src/bin/latgen-faster-mapped.cc
@@ -23,7 +23,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/decoder-wrappers.h"
 #include "decoder/decodable-matrix.h"
@@ -70,7 +70,7 @@ int main(int argc, char *argv[]) {
         words_wspecifier = po.GetOptArg(5),
         alignment_wspecifier = po.GetOptArg(6);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_in_filename, &trans_model);
 
     bool determinize = config.determinize_lattice;
diff --git a/src/bin/logprob-to-post.cc b/src/bin/logprob-to-post.cc
index f221580a484..0edfba0189d 100644
--- a/src/bin/logprob-to-post.cc
+++ b/src/bin/logprob-to-post.cc
@@ -21,7 +21,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "hmm/posterior.h"
 
diff --git a/src/bin/make-h-transducer.cc b/src/bin/make-h-transducer.cc
index c54b9250cf7..777cab0f94d 100644
--- a/src/bin/make-h-transducer.cc
+++ b/src/bin/make-h-transducer.cc
@@ -16,7 +16,7 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "tree/context-dep.h"
 #include "util/common-utils.h"
@@ -71,7 +71,7 @@ int main(int argc, char *argv[]) {
     ContextDependency ctx_dep;
     ReadKaldiObject(tree_filename, &ctx_dep);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_filename, &trans_model);
 
     std::vector<int32> disambig_syms_out;
diff --git a/src/bin/make-ilabel-transducer.cc b/src/bin/make-ilabel-transducer.cc
index a78cefafd3a..70a5d6d4e18 100644
--- a/src/bin/make-ilabel-transducer.cc
+++ b/src/bin/make-ilabel-transducer.cc
@@ -16,7 +16,7 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "tree/context-dep.h"
 #include "util/common-utils.h"
@@ -71,7 +71,7 @@ int main(int argc, char *argv[]) {
     ContextDependency ctx_dep;
     ReadKaldiObject(tree_filename, &ctx_dep);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_filename, &trans_model);
 
 
diff --git a/src/bin/make-pdf-to-tid-transducer.cc b/src/bin/make-pdf-to-tid-transducer.cc
index 907380a974d..ad9c627e558 100644
--- a/src/bin/make-pdf-to-tid-transducer.cc
+++ b/src/bin/make-pdf-to-tid-transducer.cc
@@ -16,7 +16,7 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "util/common-utils.h"
 #include "fst/fstlib.h"
@@ -47,7 +47,7 @@ int main(int argc, char *argv[]) {
     std::string trans_model_filename = po.GetArg(1);
     std::string fst_out_filename = po.GetOptArg(2);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(trans_model_filename, &trans_model);
 
     fst::VectorFst<fst::StdArc> *fst = GetPdfToTransitionIdTransducer(trans_model);
diff --git a/src/bin/phones-to-prons.cc b/src/bin/phones-to-prons.cc
index 0d7ab12c232..23c17a58385 100644
--- a/src/bin/phones-to-prons.cc
+++ b/src/bin/phones-to-prons.cc
@@ -20,7 +20,7 @@
 
 
 #include "base/kaldi-common.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "util/common-utils.h"
 #include "fst/fstlib.h"
diff --git a/src/bin/post-to-pdf-post.cc b/src/bin/post-to-pdf-post.cc
index 99aa5770aa5..6c2227806b4 100644
--- a/src/bin/post-to-pdf-post.cc
+++ b/src/bin/post-to-pdf-post.cc
@@ -21,7 +21,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "hmm/posterior.h"
 
@@ -50,7 +50,7 @@ int main(int argc, char *argv[]) {
         posteriors_rspecifier = po.GetArg(2),
         posteriors_wspecifier = po.GetArg(3);
 
-    TransitionModel trans_model;    
+    Transitions trans_model;    
     {
       bool binary_in;
       Input ki(model_rxfilename, &binary_in);
diff --git a/src/bin/post-to-phone-post.cc b/src/bin/post-to-phone-post.cc
index 92f67514a0f..d6ba0991924 100644
--- a/src/bin/post-to-phone-post.cc
+++ b/src/bin/post-to-phone-post.cc
@@ -20,7 +20,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/posterior.h"
 
 int main(int argc, char *argv[]) {
@@ -51,7 +51,7 @@ int main(int argc, char *argv[]) {
     kaldi::SequentialPosteriorReader posterior_reader(post_rspecifier);
     kaldi::PosteriorWriter posterior_writer(phone_post_wspecifier);
 
-    TransitionModel trans_model;    
+    Transitions trans_model;    
     {
       bool binary_in;
       Input ki(model_rxfilename, &binary_in);
diff --git a/src/bin/post-to-tacc.cc b/src/bin/post-to-tacc.cc
index afa5315d6b4..7867e9f5697 100644
--- a/src/bin/post-to-tacc.cc
+++ b/src/bin/post-to-tacc.cc
@@ -21,7 +21,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/posterior.h"
 
 int main(int argc, char *argv[]) {
@@ -61,7 +61,7 @@ int main(int argc, char *argv[]) {
 
       bool binary_in;
       Input ki(model_rxfilename, &binary_in);
-      TransitionModel trans_model;
+      Transitions trans_model;
       trans_model.Read(ki.Stream(), binary_in);
       num_transition_ids = trans_model.NumTransitionIds();
 
diff --git a/src/bin/prob-to-post.cc b/src/bin/prob-to-post.cc
index 4266d34ca47..7bdff6f1e78 100644
--- a/src/bin/prob-to-post.cc
+++ b/src/bin/prob-to-post.cc
@@ -21,7 +21,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "hmm/posterior.h"
 
diff --git a/src/bin/prons-to-wordali.cc b/src/bin/prons-to-wordali.cc
index a6331043500..8579c79ea02 100644
--- a/src/bin/prons-to-wordali.cc
+++ b/src/bin/prons-to-wordali.cc
@@ -19,7 +19,7 @@
 
 
 #include "base/kaldi-common.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "util/common-utils.h"
 #include "fst/fstlib.h"
diff --git a/src/bin/show-alignments.cc b/src/bin/show-alignments.cc
index 06bc907005f..beadf1b590c 100644
--- a/src/bin/show-alignments.cc
+++ b/src/bin/show-alignments.cc
@@ -19,7 +19,7 @@
 
 
 #include "base/kaldi-common.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "util/common-utils.h"
 #include "fst/fstlib.h"
@@ -47,7 +47,7 @@ int main(int argc, char *argv[]) {
         model_filename = po.GetArg(2),
         alignments_rspecifier = po.GetArg(3);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_filename, &trans_model);
 
     fst::SymbolTable *phones_symtab = NULL;
diff --git a/src/bin/show-transitions.cc b/src/bin/show-transitions.cc
index bdc780b060a..db72d47f988 100644
--- a/src/bin/show-transitions.cc
+++ b/src/bin/show-transitions.cc
@@ -18,7 +18,7 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fst/fstlib.h"
 #include "util/common-utils.h"
 
@@ -59,7 +59,7 @@ int main(int argc, char *argv[]) {
     for (size_t i = 0; i < syms->NumSymbols(); i++)
       names[i] = syms->Find(i);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(transition_model_filename, &trans_model);
 
     Vector<double> occs;
diff --git a/src/bin/tree-info.cc b/src/bin/tree-info.cc
index ce3c5c9cfc1..a1f4f21e983 100644
--- a/src/bin/tree-info.cc
+++ b/src/bin/tree-info.cc
@@ -20,7 +20,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/hmm-topology.h"
+#include "hmm/topology.h"
 #include "tree/context-dep.h"
 
 int main(int argc, char *argv[]) {
diff --git a/src/bin/weight-silence-post.cc b/src/bin/weight-silence-post.cc
index dba935d1cd3..3c8478752c8 100644
--- a/src/bin/weight-silence-post.cc
+++ b/src/bin/weight-silence-post.cc
@@ -22,7 +22,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "hmm/posterior.h"
 
@@ -71,7 +71,7 @@ int main(int argc, char *argv[]) {
       KALDI_WARN <<"No silence phones, this will have no effect";
     ConstIntegerSet<int32> silence_set(silence_phones);  // faster lookup.
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_rxfilename, &trans_model);
 
     int32 num_posteriors = 0;
diff --git a/src/chain/chain-den-graph.cc b/src/chain/chain-den-graph.cc
index 11c851091bd..920ade49348 100644
--- a/src/chain/chain-den-graph.cc
+++ b/src/chain/chain-den-graph.cc
@@ -162,7 +162,7 @@ void DenominatorGraph::GetNormalizationFst(const fst::StdVectorFst &ifst,
 }
 
 
-void MapFstToPdfIdsPlusOne(const TransitionModel &trans_model,
+void MapFstToPdfIdsPlusOne(const Transitions &trans_model,
                            fst::StdVectorFst *fst) {
   int32 num_states = fst->NumStates();
   for (int32 s = 0; s < num_states; s++) {
@@ -295,7 +295,7 @@ static void CheckDenominatorFst(int32 num_pdfs,
 }
 
 void CreateDenominatorFst(const ContextDependency &ctx_dep,
-                          const TransitionModel &trans_model,
+                          const Transitions &trans_model,
                           const fst::StdVectorFst &phone_lm_in,
                           fst::StdVectorFst *den_fst) {
   using fst::StdVectorFst;
diff --git a/src/chain/chain-den-graph.h b/src/chain/chain-den-graph.h
index b2510651f39..baf5ac2c6f1 100644
--- a/src/chain/chain-den-graph.h
+++ b/src/chain/chain-den-graph.h
@@ -32,7 +32,7 @@
 #include "lat/kaldi-lattice.h"
 #include "matrix/kaldi-matrix.h"
 #include "chain/chain-datastruct.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "cudamatrix/cu-matrix.h"
 #include "cudamatrix/cu-vector.h"
 #include "cudamatrix/cu-array.h"
@@ -149,7 +149,7 @@ void MinimizeAcceptorNoPush(fst::StdVectorFst *fst);
 // transition-ids to pdf-ids plus one.  Assumes 'fst'
 // is an acceptor, but does not check this (only looks at its
 // ilabels).
-void MapFstToPdfIdsPlusOne(const TransitionModel &trans_model,
+void MapFstToPdfIdsPlusOne(const Transitions &trans_model,
                            fst::StdVectorFst *fst);
 
 // Starting from an acceptor on phones that represents some kind of compiled
@@ -157,7 +157,7 @@ void MapFstToPdfIdsPlusOne(const TransitionModel &trans_model,
 // denominator-graph.  Note: there is similar code in chain-supervision.cc, when
 // creating the supervision graph.
 void CreateDenominatorFst(const ContextDependency &ctx_dep,
-                          const TransitionModel &trans_model,
+                          const Transitions &trans_model,
                           const fst::StdVectorFst &phone_lm,
                           fst::StdVectorFst *den_graph);
 
diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h
index d76e4244ae2..9960dfede0b 100644
--- a/src/chain/chain-denominator.h
+++ b/src/chain/chain-denominator.h
@@ -31,7 +31,7 @@
 #include "tree/context-dep.h"
 #include "lat/kaldi-lattice.h"
 #include "matrix/kaldi-matrix.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "cudamatrix/cu-matrix.h"
 #include "cudamatrix/cu-array.h"
 #include "chain/chain-den-graph.h"
diff --git a/src/chain/chain-generic-numerator.h b/src/chain/chain-generic-numerator.h
index fc5e00b2c63..8c542d6049c 100644
--- a/src/chain/chain-generic-numerator.h
+++ b/src/chain/chain-generic-numerator.h
@@ -32,7 +32,7 @@
 #include "tree/context-dep.h"
 #include "lat/kaldi-lattice.h"
 #include "matrix/kaldi-matrix.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "chain/chain-supervision.h"
 #include "cudamatrix/cu-matrix.h"
 #include "cudamatrix/cu-array.h"
diff --git a/src/chain/chain-numerator.h b/src/chain/chain-numerator.h
index 15cb31e0571..c4ea4774b53 100644
--- a/src/chain/chain-numerator.h
+++ b/src/chain/chain-numerator.h
@@ -31,7 +31,7 @@
 #include "tree/context-dep.h"
 #include "lat/kaldi-lattice.h"
 #include "matrix/kaldi-matrix.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "chain/chain-supervision.h"
 #include "cudamatrix/cu-matrix.h"
 #include "cudamatrix/cu-array.h"
diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc
index 7ee5ee117b0..10385e2c4f2 100644
--- a/src/chain/chain-supervision-test.cc
+++ b/src/chain/chain-supervision-test.cc
@@ -57,7 +57,7 @@ void ComputeExamplePhoneLanguageModel(const std::vector<int32> &phones,
 
 
 void ComputeExampleDenFst(const ContextDependency &ctx_dep,
-                          const TransitionModel &trans_model,
+                          const Transitions &trans_model,
                           fst::StdVectorFst *den_graph) {
   using fst::StdVectorFst;
   using fst::StdArc;
@@ -151,7 +151,7 @@ void TestSupervisionNumerator(const Supervision &supervision) {
 
 }
 
-void TestSupervisionAppend(const TransitionModel &trans_model,
+void TestSupervisionAppend(const Transitions &trans_model,
                            const Supervision &supervision) {
   int32 num_append = RandInt(1,5);
   std::vector<const Supervision*> input(num_append);
@@ -180,7 +180,7 @@ void TestSupervisionAppend(const TransitionModel &trans_model,
   output.Check(trans_model);
 }
 
-void TestSupervisionReattached(const TransitionModel &trans_model,
+void TestSupervisionReattached(const Transitions &trans_model,
                                const Supervision &supervision,
                                const Supervision &reattached_supervision) {
   using namespace fst;
@@ -333,7 +333,7 @@ void ChainTrainingTest(const DenominatorGraph &den_graph,
 }
 
 void TestSupervisionSplitting(const ContextDependency &ctx_dep,
-                              const TransitionModel &trans_model,
+                              const Transitions &trans_model,
                               const Supervision &supervision) {
   fst::StdVectorFst den_fst, normalization_fst;
   ComputeExampleDenFst(ctx_dep, trans_model, &den_fst);
@@ -456,7 +456,7 @@ void ChainDenominatorTest(const DenominatorGraph &den_graph) {
 
 void ChainSupervisionTest() {
   ContextDependency *ctx_dep;
-  TransitionModel *trans_model = GenRandTransitionModel(&ctx_dep);
+  Transitions *trans_model = GenRandTransitionModel(&ctx_dep);
   const std::vector<int32> &phones = trans_model->GetPhones();
 
   int32 subsample_factor = RandInt(1, 3);
diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc
index f8a2c1d11cc..af28ef85a33 100644
--- a/src/chain/chain-supervision.cc
+++ b/src/chain/chain-supervision.cc
@@ -255,7 +255,7 @@ bool TimeEnforcerFst::GetArc(StateId s, Label ilabel, fst::StdArc* oarc) {
 
 bool TrainingGraphToSupervisionE2e(
     const fst::StdVectorFst &training_graph,
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     int32 num_frames,
     Supervision *supervision) {
   using fst::VectorFst;
@@ -292,7 +292,7 @@ bool TrainingGraphToSupervisionE2e(
 
 bool ProtoSupervisionToSupervision(
     const ContextDependencyInterface &ctx_dep,
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     const ProtoSupervision &proto_supervision,
     bool convert_to_pdfs,
     Supervision *supervision) {
@@ -906,7 +906,7 @@ bool Supervision::operator == (const Supervision &other) const {
       label_dim == other.label_dim && fst::Equal(fst, other.fst);
 }
 
-void Supervision::Check(const TransitionModel &trans_mdl) const {
+void Supervision::Check(const Transitions &trans_mdl) const {
   if (weight <= 0.0)
     KALDI_ERR << "Weight should be positive.";
   if (frames_per_sequence <= 0)
@@ -970,7 +970,7 @@ void GetWeightsForRanges(int32 range_length,
 }
 
 bool ConvertSupervisionToUnconstrained(
-    const TransitionModel &trans_mdl,
+    const Transitions &trans_mdl,
     Supervision *supervision) {
   KALDI_ASSERT(supervision->label_dim == trans_mdl.NumTransitionIds() &&
                supervision->fst.NumStates() > 0 &&
diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h
index f1a796dc2f8..0b8a760f1e6 100644
--- a/src/chain/chain-supervision.h
+++ b/src/chain/chain-supervision.h
@@ -29,7 +29,7 @@
 #include "util/common-utils.h"
 #include "lat/kaldi-lattice.h"
 #include "fstext/deterministic-fst.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 namespace kaldi {
 namespace chain {
@@ -181,7 +181,7 @@ class TimeEnforcerFst:
   typedef fst::StdArc::StateId StateId;
   typedef fst::StdArc::Label Label;
 
-  TimeEnforcerFst(const TransitionModel &trans_model,
+  TimeEnforcerFst(const Transitions &trans_model,
                   bool convert_to_pdfs,
                   const std::vector<std::vector<int32> > &allowed_phones):
       trans_model_(trans_model),
@@ -204,7 +204,7 @@ class TimeEnforcerFst:
   virtual bool GetArc(StateId s, Label ilabel, fst::StdArc* oarc);
 
  private:
-  const TransitionModel &trans_model_;
+  const Transitions &trans_model_;
   // if convert_to_pdfs_ is true, this FST will map from transition-id (on the
   // input side) to pdf-id plus one (on the output side); if false, both sides'
   // labels will be transition-id.
@@ -234,10 +234,10 @@ struct Supervision {
 
   // the maximum possible value of the labels in 'fst' (which go from 1 to
   // label_dim).  For fully-processed examples this will equal the NumPdfs() in the
-  // TransitionModel object, but for newer-style "unconstrained" examples
+  // Transitions object, but for newer-style "unconstrained" examples
   // that have been output by chain-get-supervision but not yet processed
   // by nnet3-chain-get-egs, it will be the NumTransitionIds() of the
-  // TransitionModel object.
+  // Transitions object.
   int32 label_dim;
 
   // This is an epsilon-free unweighted acceptor that is sorted in increasing
@@ -297,7 +297,7 @@ struct Supervision {
 
   // This function checks that this supervision object satifsies some
   // of the properties we expect of it, and calls KALDI_ERR if not.
-  void Check(const TransitionModel &trans_model) const;
+  void Check(const Transitions &trans_model) const;
 
   void Write(std::ostream &os, bool binary) const;
   void Read(std::istream &is, bool binary);
@@ -317,7 +317,7 @@ struct Supervision {
 */
 bool ProtoSupervisionToSupervision(
     const ContextDependencyInterface &ctx_dep,
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     const ProtoSupervision &proto_supervision,
     bool convert_to_pdfs,
     Supervision *supervision);
@@ -333,7 +333,7 @@ bool ProtoSupervisionToSupervision(
  */
 bool TrainingGraphToSupervisionE2e(
     const fst::StdVectorFst& training_graph,
-    const TransitionModel& trans_model,
+    const Transitions& trans_model,
     int32 num_frames,
     Supervision *supervision);
 
@@ -484,7 +484,7 @@ void GetWeightsForRanges(int32 range_length,
 /// It returns true on success, and false if some kind of error happened
 /// (this is not expected).
 bool ConvertSupervisionToUnconstrained(
-    const TransitionModel &trans_mdl,
+    const Transitions &trans_mdl,
     Supervision *supervision);
 
 
diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h
index 6ea70b5ca41..7dbc1a058c2 100644
--- a/src/chain/chain-training.h
+++ b/src/chain/chain-training.h
@@ -31,7 +31,7 @@
 #include "tree/context-dep.h"
 #include "lat/kaldi-lattice.h"
 #include "matrix/kaldi-matrix.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "chain/chain-den-graph.h"
 #include "chain/chain-supervision.h"
 
diff --git a/src/chainbin/chain-get-supervision.cc b/src/chainbin/chain-get-supervision.cc
index 1ac89d4630b..8a4904843be 100644
--- a/src/chainbin/chain-get-supervision.cc
+++ b/src/chainbin/chain-get-supervision.cc
@@ -30,7 +30,7 @@ namespace chain {
 
 // This wrapper function does all the job of processing the features and
 // lattice into ChainSupervision objects, and writing them out.
-static bool ProcessSupervision(const TransitionModel &trans_model,
+static bool ProcessSupervision(const Transitions &trans_model,
                                const ContextDependencyInterface &ctx_dep,
                                const ProtoSupervision &proto_sup,
                                const std::string &key,
@@ -97,7 +97,7 @@ int main(int argc, char *argv[]) {
         phone_durs_or_lat_rspecifier = po.GetArg(3),
         supervision_wspecifier = po.GetArg(4);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(trans_model_rxfilename, &trans_model);
 
     ContextDependency ctx_dep;
diff --git a/src/chainbin/chain-make-den-fst.cc b/src/chainbin/chain-make-den-fst.cc
index 0d8d249242b..dc2b41a369d 100644
--- a/src/chainbin/chain-make-den-fst.cc
+++ b/src/chainbin/chain-make-den-fst.cc
@@ -56,7 +56,7 @@ int main(int argc, char *argv[]) {
 
 
     ContextDependency ctx_dep;
-    TransitionModel trans_model;
+    Transitions trans_model;
     fst::StdVectorFst phone_lm;
 
     ReadKaldiObject(tree_rxfilename, &ctx_dep);
diff --git a/src/chainbin/nnet3-chain-acc-lda-stats.cc b/src/chainbin/nnet3-chain-acc-lda-stats.cc
index 693eb2dad86..0cf2d449d76 100644
--- a/src/chainbin/nnet3-chain-acc-lda-stats.cc
+++ b/src/chainbin/nnet3-chain-acc-lda-stats.cc
@@ -19,7 +19,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "lat/lattice-functions.h"
 #include "nnet3/nnet-nnet.h"
 #include "nnet3/nnet-chain-example.h"
diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc
index 0117fe2200f..46744b239d0 100644
--- a/src/chainbin/nnet3-chain-copy-egs.cc
+++ b/src/chainbin/nnet3-chain-copy-egs.cc
@@ -21,7 +21,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/nnet-chain-example.h"
 
 namespace kaldi {
diff --git a/src/chainbin/nnet3-chain-e2e-get-egs.cc b/src/chainbin/nnet3-chain-e2e-get-egs.cc
index 8cdda8deb32..31b14cb7b0f 100644
--- a/src/chainbin/nnet3-chain-e2e-get-egs.cc
+++ b/src/chainbin/nnet3-chain-e2e-get-egs.cc
@@ -22,7 +22,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "hmm/posterior.h"
 #include "nnet3/nnet-example.h"
@@ -74,7 +74,7 @@ static int32 FindMinimumLengthPath(
 */
 
 static bool ProcessFile(const ExampleGenerationConfig &opts,
-                        const TransitionModel &trans_model,
+                        const Transitions &trans_model,
                         const fst::StdVectorFst &normalization_fst,
                         const MatrixBase<BaseFloat> &feats,
                         const MatrixBase<BaseFloat> *ivector_feats,
@@ -285,7 +285,7 @@ int main(int argc, char *argv[]) {
       KALDI_ASSERT(normalization_fst.NumStates() > 0);
     }
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(trans_model_rxfilename, &trans_model);
 
     RandomAccessBaseFloatMatrixReader feat_reader(feature_rspecifier);
diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc
index 1032b7e2125..2c506c5b460 100644
--- a/src/chainbin/nnet3-chain-get-egs.cc
+++ b/src/chainbin/nnet3-chain-get-egs.cc
@@ -21,7 +21,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/posterior.h"
 #include "nnet3/nnet-example.h"
 #include "nnet3/nnet-chain-example.h"
@@ -86,7 +86,7 @@ namespace nnet3 {
 
 **/
 
-static bool ProcessFile(const TransitionModel *trans_mdl,
+static bool ProcessFile(const Transitions *trans_mdl,
                         const fst::StdVectorFst &normalization_fst,
                         const GeneralMatrix &feats,
                         const MatrixBase<BaseFloat> *ivector_feats,
@@ -345,8 +345,8 @@ int main(int argc, char *argv[]) {
     UtteranceSplitter utt_splitter(eg_config);
 
 
-    const TransitionModel *trans_mdl_ptr = NULL;
-    TransitionModel trans_mdl;
+    const Transitions *trans_mdl_ptr = NULL;
+    Transitions trans_mdl;
     if (!trans_mdl_rxfilename.empty()) {
       ReadKaldiObject(trans_mdl_rxfilename,
                       &trans_mdl);
diff --git a/src/chainbin/nnet3-chain-merge-egs.cc b/src/chainbin/nnet3-chain-merge-egs.cc
index a3686d2fc30..14bdbe55115 100644
--- a/src/chainbin/nnet3-chain-merge-egs.cc
+++ b/src/chainbin/nnet3-chain-merge-egs.cc
@@ -20,7 +20,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/nnet-chain-example.h"
 
 
diff --git a/src/chainbin/nnet3-chain-normalize-egs.cc b/src/chainbin/nnet3-chain-normalize-egs.cc
index a97797e3246..70f6852e963 100644
--- a/src/chainbin/nnet3-chain-normalize-egs.cc
+++ b/src/chainbin/nnet3-chain-normalize-egs.cc
@@ -19,7 +19,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/nnet-chain-example.h"
 #include "chain/chain-supervision.h"
 
diff --git a/src/chainbin/nnet3-chain-shuffle-egs.cc b/src/chainbin/nnet3-chain-shuffle-egs.cc
index 7ab6e28f607..94ba30799b0 100644
--- a/src/chainbin/nnet3-chain-shuffle-egs.cc
+++ b/src/chainbin/nnet3-chain-shuffle-egs.cc
@@ -20,7 +20,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/nnet-chain-example.h"
 
 int main(int argc, char *argv[]) {
diff --git a/src/decoder/decodable-matrix.cc b/src/decoder/decodable-matrix.cc
index 3cc7b87f2d7..98cd75d1ede 100644
--- a/src/decoder/decodable-matrix.cc
+++ b/src/decoder/decodable-matrix.cc
@@ -22,7 +22,7 @@
 namespace kaldi {
 
 DecodableMatrixMapped::DecodableMatrixMapped(
-    const TransitionModel &tm,
+    const Transitions &tm,
     const MatrixBase<BaseFloat> &likes,
     int32 frame_offset):
     trans_model_(tm), likes_(&likes), likes_to_delete_(NULL),
@@ -32,12 +32,12 @@ DecodableMatrixMapped::DecodableMatrixMapped(
 
   if (likes.NumCols() != tm.NumPdfs())
     KALDI_ERR << "Mismatch, matrix has "
-              << likes.NumCols() << " rows but transition-model has "
+              << likes.NumCols() << " rows but Transitions object has "
               << tm.NumPdfs() << " pdf-ids.";
 }
 
 DecodableMatrixMapped::DecodableMatrixMapped(
-    const TransitionModel &tm, const Matrix<BaseFloat> *likes,
+    const Transitions &tm, const Matrix<BaseFloat> *likes,
     int32 frame_offset):
     trans_model_(tm), likes_(likes), likes_to_delete_(likes),
     frame_offset_(frame_offset) {
@@ -45,7 +45,7 @@ DecodableMatrixMapped::DecodableMatrixMapped(
   raw_data_ = likes->Data() - (stride_ * frame_offset_);
   if (likes->NumCols() != tm.NumPdfs())
     KALDI_ERR << "Mismatch, matrix has "
-              << likes->NumCols() << " rows but transition-model has "
+              << likes->NumCols() << " rows but Transitions object has "
               << tm.NumPdfs() << " pdf-ids.";
 }
 
diff --git a/src/decoder/decodable-matrix.h b/src/decoder/decodable-matrix.h
index 475638a35af..5e9642ee6b9 100644
--- a/src/decoder/decodable-matrix.h
+++ b/src/decoder/decodable-matrix.h
@@ -24,7 +24,7 @@
 #include <vector>
 
 #include "base/kaldi-common.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "itf/decodable-itf.h"
 #include "matrix/kaldi-matrix.h"
 
@@ -34,26 +34,26 @@ namespace kaldi {
 class DecodableMatrixScaledMapped: public DecodableInterface {
  public:
   // This constructor creates an object that will not delete "likes" when done.
-  DecodableMatrixScaledMapped(const TransitionModel &tm,
+  DecodableMatrixScaledMapped(const Transitions &tm,
                               const Matrix<BaseFloat> &likes,
                               BaseFloat scale): trans_model_(tm), likes_(&likes),
                                                 scale_(scale), delete_likes_(false) {
     if (likes.NumCols() != tm.NumPdfs())
       KALDI_ERR << "DecodableMatrixScaledMapped: mismatch, matrix has "
-                << likes.NumCols() << " rows but transition-model has "
+                << likes.NumCols() << " rows but transitions object has "
                 << tm.NumPdfs() << " pdf-ids.";
   }
 
   // This constructor creates an object that will delete "likes"
   // when done.
-  DecodableMatrixScaledMapped(const TransitionModel &tm,
+  DecodableMatrixScaledMapped(const Transitions &tm,
                               BaseFloat scale,
                               const Matrix<BaseFloat> *likes):
       trans_model_(tm), likes_(likes),
       scale_(scale), delete_likes_(true) {
     if (likes->NumCols() != tm.NumPdfs())
       KALDI_ERR << "DecodableMatrixScaledMapped: mismatch, matrix has "
-                << likes->NumCols() << " rows but transition-model has "
+                << likes->NumCols() << " rows but transitions object has "
                 << tm.NumPdfs() << " pdf-ids.";
   }
 
@@ -76,7 +76,7 @@ class DecodableMatrixScaledMapped: public DecodableInterface {
     if (delete_likes_) delete likes_;
   }
  private:
-  const TransitionModel &trans_model_;  // for tid to pdf mapping
+  const Transitions &trans_model_;  // for tid to pdf mapping
   const Matrix<BaseFloat> *likes_;
   BaseFloat scale_;
   bool delete_likes_;
@@ -100,13 +100,13 @@ class DecodableMatrixMapped: public DecodableInterface {
   // This constructor creates an object that will not delete "likes" when done.
   // the frame_offset is the frame the row 0 of 'likes' corresponds to, would be
   // greater than one if this is not the first chunk of likelihoods.
-  DecodableMatrixMapped(const TransitionModel &tm,
+  DecodableMatrixMapped(const Transitions &tm,
                         const MatrixBase<BaseFloat> &likes,
                         int32 frame_offset = 0);
 
   // This constructor creates an object that will delete "likes"
   // when done.
-  DecodableMatrixMapped(const TransitionModel &tm,
+  DecodableMatrixMapped(const Transitions &tm,
                         const Matrix<BaseFloat> *likes,
                         int32 frame_offset = 0);
 
@@ -122,7 +122,7 @@ class DecodableMatrixMapped: public DecodableInterface {
   virtual ~DecodableMatrixMapped();
 
  private:
-  const TransitionModel &trans_model_;  // for tid to pdf mapping
+  const Transitions &trans_model_;  // for tid to pdf mapping
   const MatrixBase<BaseFloat> *likes_;
   const Matrix<BaseFloat> *likes_to_delete_;
   int32 frame_offset_;
@@ -151,7 +151,7 @@ class DecodableMatrixMapped: public DecodableInterface {
 */
 class DecodableMatrixMappedOffset: public DecodableInterface {
  public:
-  DecodableMatrixMappedOffset(const TransitionModel &tm):
+  DecodableMatrixMappedOffset(const Transitions &tm):
       trans_model_(tm), frame_offset_(0), input_is_finished_(false) { }
 
   virtual int32 NumFramesReady() { return frame_offset_ + loglikes_.NumRows(); }
@@ -194,7 +194,7 @@ class DecodableMatrixMappedOffset: public DecodableInterface {
   // nothing special to do in destructor.
   virtual ~DecodableMatrixMappedOffset() { }
  private:
-  const TransitionModel &trans_model_;  // for tid to pdf mapping
+  const Transitions &trans_model_;  // for tid to pdf mapping
   Matrix<BaseFloat> loglikes_;
   int32 frame_offset_;
   bool input_is_finished_;
diff --git a/src/decoder/decoder-wrappers.cc b/src/decoder/decoder-wrappers.cc
index ff573c74d15..71799a5b700 100644
--- a/src/decoder/decoder-wrappers.cc
+++ b/src/decoder/decoder-wrappers.cc
@@ -32,7 +32,7 @@ namespace kaldi {
 DecodeUtteranceLatticeFasterClass::DecodeUtteranceLatticeFasterClass(
     LatticeFasterDecoder *decoder,
     DecodableInterface *decodable,
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     const fst::SymbolTable *word_syms,
     std::string utt,
     BaseFloat acoustic_scale,
@@ -201,7 +201,7 @@ template <typename FST>
 bool DecodeUtteranceLatticeFaster(
     LatticeFasterDecoderTpl<FST> &decoder, // not const but is really an input.
     DecodableInterface &decodable, // not const but is really an input.
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     const fst::SymbolTable *word_syms,
     std::string utt,
     double acoustic_scale,
@@ -299,7 +299,7 @@ bool DecodeUtteranceLatticeFaster(
 template bool DecodeUtteranceLatticeFaster(
     LatticeFasterDecoderTpl<fst::Fst<fst::StdArc> > &decoder,
     DecodableInterface &decodable,
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     const fst::SymbolTable *word_syms,
     std::string utt,
     double acoustic_scale,
@@ -314,7 +314,7 @@ template bool DecodeUtteranceLatticeFaster(
 template bool DecodeUtteranceLatticeFaster(
     LatticeFasterDecoderTpl<fst::GrammarFst> &decoder,
     DecodableInterface &decodable,
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     const fst::SymbolTable *word_syms,
     std::string utt,
     double acoustic_scale,
@@ -331,7 +331,7 @@ template bool DecodeUtteranceLatticeFaster(
 bool DecodeUtteranceLatticeSimple(
     LatticeSimpleDecoder &decoder, // not const but is really an input.
     DecodableInterface &decodable, // not const but is really an input.
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     const fst::SymbolTable *word_syms,
     std::string utt,
     double acoustic_scale,
diff --git a/src/decoder/decoder-wrappers.h b/src/decoder/decoder-wrappers.h
index fc81137f356..c2c357c2629 100644
--- a/src/decoder/decoder-wrappers.h
+++ b/src/decoder/decoder-wrappers.h
@@ -103,7 +103,7 @@ template <typename FST>
 bool DecodeUtteranceLatticeFaster(
     LatticeFasterDecoderTpl<FST> &decoder, // not const but is really an input.
     DecodableInterface &decodable, // not const but is really an input.
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     const fst::SymbolTable *word_syms,
     std::string utt,
     double acoustic_scale,
@@ -129,7 +129,7 @@ class DecodeUtteranceLatticeFasterClass {
   DecodeUtteranceLatticeFasterClass(
       LatticeFasterDecoder *decoder,
       DecodableInterface *decodable,
-      const TransitionModel &trans_model,
+      const Transitions &trans_model,
       const fst::SymbolTable *word_syms,
       std::string utt,
       BaseFloat acoustic_scale,
@@ -150,7 +150,7 @@ class DecodeUtteranceLatticeFasterClass {
   // The following variables correspond to inputs:
   LatticeFasterDecoder *decoder_;
   DecodableInterface *decodable_;
-  const TransitionModel *trans_model_;
+  const Transitions *trans_model_;
   const fst::SymbolTable *word_syms_;
   std::string utt_;
   BaseFloat acoustic_scale_;
@@ -183,7 +183,7 @@ class DecodeUtteranceLatticeFasterClass {
 bool DecodeUtteranceLatticeSimple(
     LatticeSimpleDecoder &decoder, // not const but is really an input.
     DecodableInterface &decodable, // not const but is really an input.
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     const fst::SymbolTable *word_syms,
     std::string utt,
     double acoustic_scale,
diff --git a/src/decoder/training-graph-compiler.cc b/src/decoder/training-graph-compiler.cc
index 191d02f1720..db1a75f7a25 100644
--- a/src/decoder/training-graph-compiler.cc
+++ b/src/decoder/training-graph-compiler.cc
@@ -23,7 +23,7 @@
 namespace kaldi {
 
 
-TrainingGraphCompiler::TrainingGraphCompiler(const TransitionModel &trans_model,
+TrainingGraphCompiler::TrainingGraphCompiler(const Transitions &trans_model,
                                              const ContextDependency &ctx_dep,  // Does not maintain reference to this.
                                              fst::VectorFst<fst::StdArc> *lex_fst,
                                              const std::vector<int32> &disambig_syms,
diff --git a/src/decoder/training-graph-compiler.h b/src/decoder/training-graph-compiler.h
index ee56c6dfb3d..600844b8b8a 100644
--- a/src/decoder/training-graph-compiler.h
+++ b/src/decoder/training-graph-compiler.h
@@ -21,7 +21,7 @@
 #define KALDI_DECODER_TRAINING_GRAPH_COMPILER_H_
 
 #include "base/kaldi-common.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fst/fstlib.h"
 #include "fstext/fstext-lib.h"
 #include "tree/context-dep.h"
@@ -58,7 +58,7 @@ struct TrainingGraphCompilerOptions {
 
 class TrainingGraphCompiler {
  public:
-  TrainingGraphCompiler(const TransitionModel &trans_model,  // Maintains reference to this object.
+  TrainingGraphCompiler(const Transitions &trans_model,  // Maintains reference to this object.
                         const ContextDependency &ctx_dep,  // And this.
                         fst::VectorFst<fst::StdArc> *lex_fst,  // Takes ownership of this object.
                         // It should not contain disambiguation symbols or subsequential symbol,
@@ -93,7 +93,7 @@ class TrainingGraphCompiler {
 
   ~TrainingGraphCompiler() { delete lex_fst_; }
  private:
-  const TransitionModel &trans_model_;
+  const Transitions &trans_model_;
   const ContextDependency &ctx_dep_;
   fst::VectorFst<fst::StdArc> *lex_fst_; // lexicon FST (an input; we take
   // ownership as we need to modify it).
diff --git a/src/feat/Makefile b/src/feat/Makefile
index dcd029f7f94..9850e578d9a 100644
--- a/src/feat/Makefile
+++ b/src/feat/Makefile
@@ -4,12 +4,12 @@ all:
 
 include ../kaldi.mk
 
-TESTFILES = feature-mfcc-test feature-plp-test feature-fbank-test \
+TESTFILES = feature-mfcc-test feature-fbank-test \
          feature-functions-test pitch-functions-test feature-sdc-test \
          resample-test online-feature-test signal-test wave-reader-test
 
-OBJFILES = feature-functions.o feature-mfcc.o feature-plp.o feature-fbank.o \
-           feature-spectrogram.o mel-computations.o wave-reader.o \
+OBJFILES = feature-functions.o feature-mfcc.o feature-fbank.o \
+           mel-computations.o wave-reader.o \
            pitch-functions.o resample.o online-feature.o signal.o \
            feature-window.o
 
@@ -17,6 +17,6 @@ LIBNAME = kaldi-feat
 
 ADDLIBS = ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
           ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
-          ../base/kaldi-base.a 
+          ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
diff --git a/src/feat/feature-common-inl.h b/src/feat/feature-common-inl.h
index 26127a4dc4d..10bfe5cdfd1 100644
--- a/src/feat/feature-common-inl.h
+++ b/src/feat/feature-common-inl.h
@@ -70,15 +70,12 @@ void OfflineFeatureTpl<F>::Compute(
   }
   output->Resize(rows_out, cols_out);
   Vector<BaseFloat> window;  // windowed waveform.
-  bool use_raw_log_energy = computer_.NeedRawLogEnergy();
   for (int32 r = 0; r < rows_out; r++) {  // r is frame index.
-    BaseFloat raw_log_energy = 0.0;
     ExtractWindow(0, wave, r, computer_.GetFrameOptions(),
-                  feature_window_function_, &window,
-                  (use_raw_log_energy ? &raw_log_energy : NULL));
+                  feature_window_function_, &window);
 
     SubVector<BaseFloat> output_row(*output, r);
-    computer_.Compute(raw_log_energy, vtln_warp, &window, &output_row);
+    computer_.Compute(vtln_warp, &window, &output_row);
   }
 }
 
diff --git a/src/feat/feature-common.h b/src/feat/feature-common.h
index 45911cef585..664806beb49 100644
--- a/src/feat/feature-common.h
+++ b/src/feat/feature-common.h
@@ -115,8 +115,10 @@ class OfflineFeatureTpl {
   // Note: feature_window_function_ is the windowing function, which initialized
   // using the options class, that we cache at this level.
   OfflineFeatureTpl(const Options &opts):
-      computer_(opts),
-      feature_window_function_(computer_.GetFrameOptions()) { }
+      computer_(opts) {
+    InitFeatureWindowFunction(computer_.GetFrameOptions(),
+                              &feature_window_function_);
+  }
 
   // Internal (and back-compatibility) interface for computing features, which
   // requires that the user has already checked that the sampling frequency
@@ -164,7 +166,7 @@ class OfflineFeatureTpl {
   OfflineFeatureTpl<F> &operator =(const OfflineFeatureTpl<F> &other);
 
   F computer_;
-  FeatureWindowFunction feature_window_function_;
+  Vector<BaseFloat> feature_window_function_;
 };
 
 /// @} End of "addtogroup feat"
diff --git a/src/feat/feature-fbank.cc b/src/feat/feature-fbank.cc
index 10f7e67d607..8becf6a8141 100644
--- a/src/feat/feature-fbank.cc
+++ b/src/feat/feature-fbank.cc
@@ -25,8 +25,7 @@ namespace kaldi {
 
 FbankComputer::FbankComputer(const FbankOptions &opts):
     opts_(opts), srfft_(NULL) {
-  if (opts.energy_floor > 0.0)
-    log_energy_floor_ = Log(opts.energy_floor);
+  KALDI_ASSERT(opts.energy_floor > 0.0 && "Nonzero energy floor is required.");
 
   int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
   if ((padded_window_size & (padded_window_size-1)) == 0)  // Is a power of two...
@@ -38,7 +37,7 @@ FbankComputer::FbankComputer(const FbankOptions &opts):
 }
 
 FbankComputer::FbankComputer(const FbankComputer &other):
-    opts_(other.opts_), log_energy_floor_(other.log_energy_floor_),
+    opts_(other.opts_),
     mel_banks_(other.mel_banks_), srfft_(NULL) {
   for (std::map<BaseFloat, MelBanks*>::iterator iter = mel_banks_.begin();
       iter != mel_banks_.end();
@@ -69,8 +68,7 @@ const MelBanks* FbankComputer::GetMelBanks(BaseFloat vtln_warp) {
   return this_mel_banks;
 }
 
-void FbankComputer::Compute(BaseFloat signal_log_energy,
-                            BaseFloat vtln_warp,
+void FbankComputer::Compute(BaseFloat vtln_warp,
                             VectorBase<BaseFloat> *signal_frame,
                             VectorBase<BaseFloat> *feature) {
 
@@ -80,10 +78,10 @@ void FbankComputer::Compute(BaseFloat signal_log_energy,
                feature->Dim() == this->Dim());
 
 
-  // Compute energy after window function (not the raw one).
-  if (opts_.use_energy && !opts_.raw_energy)
+  BaseFloat signal_log_energy;
+  if (opts_.use_energy)
     signal_log_energy = Log(std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame),
-                                     std::numeric_limits<float>::min()));
+                                                opts_.energy_floor));
 
   if (srfft_ != NULL)  // Compute FFT using split-radix algorithm.
     srfft_->Compute(signal_frame->Data(), true);
@@ -95,30 +93,20 @@ void FbankComputer::Compute(BaseFloat signal_log_energy,
   SubVector<BaseFloat> power_spectrum(*signal_frame, 0,
                                       signal_frame->Dim() / 2 + 1);
 
-  // Use magnitude instead of power if requested.
-  if (!opts_.use_power)
-    power_spectrum.ApplyPow(0.5);
-
-  int32 mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
+  int32 mel_offset = (opts_.use_energy ? 1 : 0);
   SubVector<BaseFloat> mel_energies(*feature,
                                     mel_offset,
                                     opts_.mel_opts.num_bins);
 
   // Sum with mel fiterbanks over the power spectrum
   mel_banks.Compute(power_spectrum, &mel_energies);
-  if (opts_.use_log_fbank) {
-    // Avoid log of zero (which should be prevented anyway by dithering).
-    mel_energies.ApplyFloor(std::numeric_limits<float>::epsilon());
-    mel_energies.ApplyLog();  // take the log.
-  }
 
-  // Copy energy as first value (or the last, if htk_compat == true).
+  mel_energies.ApplyFloor(opts_.energy_floor);
+  mel_energies.ApplyLog();  // take the log.
+
+  // Copy energy as first value
   if (opts_.use_energy) {
-    if (opts_.energy_floor > 0.0 && signal_log_energy < log_energy_floor_) {
-      signal_log_energy = log_energy_floor_;
-    }
-    int32 energy_index = opts_.htk_compat ? opts_.mel_opts.num_bins : 0;
-    (*feature)(energy_index) = signal_log_energy;
+    (*feature)(0) = signal_log_energy;
   }
 }
 
diff --git a/src/feat/feature-fbank.h b/src/feat/feature-fbank.h
index 724d7d148dc..04421c506b6 100644
--- a/src/feat/feature-fbank.h
+++ b/src/feat/feature-fbank.h
@@ -42,41 +42,24 @@ struct FbankOptions {
   FrameExtractionOptions frame_opts;
   MelBanksOptions mel_opts;
   bool use_energy;  // append an extra dimension with energy to the filter banks
-  BaseFloat energy_floor;
-  bool raw_energy;  // If true, compute energy before preemphasis and windowing
-  bool htk_compat;  // If true, put energy last (if using energy)
-  bool use_log_fbank;  // if true (default), produce log-filterbank, else linear
-  bool use_power;  // if true (default), use power in filterbank analysis, else magnitude.
+  BaseFloat energy_floor;  // Floor on energy, to avoid log(0.0).  The floor of
+                           // 1e-10 may be interpreted as (approximately)
+                           // 0.1 * 2**-30.  The smallest nonzero value in a 16-bit
+                           // waveform would be 2**-15, and 2**-30 is its square.
 
   FbankOptions(): mel_opts(23),
-                 // defaults the #mel-banks to 23 for the FBANK computations.
-                 // this seems to be common for 16khz-sampled data,
-                 // but for 8khz-sampled data, 15 may be better.
-                 use_energy(false),
-                 energy_floor(0.0),
-                 raw_energy(true),
-                 htk_compat(false),
-                 use_log_fbank(true),
-                 use_power(true) {}
+                  use_energy(false),
+                  energy_floor(1.0e-10) { }
 
   void Register(OptionsItf *opts) {
     frame_opts.Register(opts);
     mel_opts.Register(opts);
     opts->Register("use-energy", &use_energy,
-                   "Add an extra dimension with energy to the FBANK output.");
+                   "Add an extra dimension with energy to the filterbank "
+                   "output.");
     opts->Register("energy-floor", &energy_floor,
-                   "Floor on energy (absolute, not relative) in FBANK computation. "
-                   "Only makes a difference if --use-energy=true; only necessary if "
-                   "--dither=0.0.  Suggested values: 0.1 or 1.0");
-    opts->Register("raw-energy", &raw_energy,
-                   "If true, compute energy before preemphasis and windowing");
-    opts->Register("htk-compat", &htk_compat, "If true, put energy last.  "
-                   "Warning: not sufficient to get HTK compatible features (need "
-                   "to change other parameters).");
-    opts->Register("use-log-fbank", &use_log_fbank,
-                   "If true, produce log-filterbank, else produce linear.");
-    opts->Register("use-power", &use_power,
-                   "If true, use power, else use magnitude.");
+                   "Floor on energy (absolute, not relative) in filterbank "
+                   "computation.");
   }
 };
 
@@ -94,8 +77,6 @@ class FbankComputer {
     return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
   }
 
-  bool NeedRawLogEnergy() { return opts_.use_energy && opts_.raw_energy; }
-
   const FrameExtractionOptions &GetFrameOptions() const {
     return opts_.frame_opts;
   }
@@ -104,11 +85,6 @@ class FbankComputer {
      Function that computes one frame of features from
      one frame of signal.
 
-     @param [in] signal_raw_log_energy The log-energy of the frame of the signal
-         prior to windowing and pre-emphasis, or
-         log(numeric_limits<float>::min()), whichever is greater.  Must be
-         ignored by this function if this class returns false from
-         this->NeedsRawLogEnergy().
      @param [in] vtln_warp  The VTLN warping factor that the user wants
          to be applied when computing features for this utterance.  Will
          normally be 1.0, meaning no warping is to be done.  The value will
@@ -121,8 +97,7 @@ class FbankComputer {
      @param [out] feature  Pointer to a vector of size this->Dim(), to which
          the computed feature will be written.
   */
-  void Compute(BaseFloat signal_log_energy,
-               BaseFloat vtln_warp,
+  void Compute(BaseFloat vtln_warp,
                VectorBase<BaseFloat> *signal_frame,
                VectorBase<BaseFloat> *feature);
 
@@ -133,7 +108,6 @@ class FbankComputer {
 
 
   FbankOptions opts_;
-  BaseFloat log_energy_floor_;
   std::map<BaseFloat, MelBanks*> mel_banks_;  // BaseFloat is VTLN coefficient.
   SplitRadixRealFft<BaseFloat> *srfft_;
   // Disallow assignment.
diff --git a/src/feat/feature-mfcc-test.cc b/src/feat/feature-mfcc-test.cc
index c4367139707..305ac5abe50 100644
--- a/src/feat/feature-mfcc-test.cc
+++ b/src/feat/feature-mfcc-test.cc
@@ -88,14 +88,10 @@ static void UnitTestSimple() {
   // the parametrization object
   MfccOptions op;
   // trying to have same opts as baseline.
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.preemph_coeff = 0.0;
   op.frame_opts.window_type = "rectangular";
   op.frame_opts.remove_dc_offset = false;
   op.frame_opts.round_to_power_of_two = true;
   op.mel_opts.low_freq = 0.0;
-  op.mel_opts.htk_mode = true;
-  op.htk_compat = true;
 
   Mfcc mfcc(op);
   // use default parameters
@@ -129,14 +125,10 @@ static void UnitTestHTKCompare1() {
 
   // use mfcc with default configuration...
   MfccOptions op;
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.preemph_coeff = 0.0;
   op.frame_opts.window_type = "hamming";
   op.frame_opts.remove_dc_offset = false;
   op.frame_opts.round_to_power_of_two = true;
   op.mel_opts.low_freq = 0.0;
-  op.mel_opts.htk_mode = true;
-  op.htk_compat = true;
   op.use_energy = false;  // C0 not energy.
 
   Mfcc mfcc(op);
@@ -188,7 +180,7 @@ static void UnitTestHTKCompare1() {
   }
 
   std::cout << "Test passed :)\n\n";
-  
+
   unlink("tmp.test.wav.fea_kaldi.1");
 }
 
@@ -213,14 +205,10 @@ static void UnitTestHTKCompare2() {
 
   // use mfcc with default configuration...
   MfccOptions op;
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.preemph_coeff = 0.0;
   op.frame_opts.window_type = "hamming";
   op.frame_opts.remove_dc_offset = false;
   op.frame_opts.round_to_power_of_two = true;
   op.mel_opts.low_freq = 0.0;
-  op.mel_opts.htk_mode = true;
-  op.htk_compat = true;
   op.use_energy = true;  // Use energy.
 
   Mfcc mfcc(op);
@@ -272,7 +260,7 @@ static void UnitTestHTKCompare2() {
   }
 
   std::cout << "Test passed :)\n\n";
-  
+
   unlink("tmp.test.wav.fea_kaldi.2");
 }
 
@@ -297,16 +285,11 @@ static void UnitTestHTKCompare3() {
 
   // use mfcc with default configuration...
   MfccOptions op;
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.preemph_coeff = 0.0;
   op.frame_opts.window_type = "hamming";
   op.frame_opts.remove_dc_offset = false;
   op.frame_opts.round_to_power_of_two = true;
-  op.htk_compat = true;
   op.use_energy = true;  // Use energy.
   op.mel_opts.low_freq = 20.0;
-  //op.mel_opts.debug_mel = true;
-  op.mel_opts.htk_mode = true;
 
   Mfcc mfcc(op);
 
@@ -357,7 +340,7 @@ static void UnitTestHTKCompare3() {
   }
 
   std::cout << "Test passed :)\n\n";
-  
+
   unlink("tmp.test.wav.fea_kaldi.3");
 }
 
@@ -382,14 +365,11 @@ static void UnitTestHTKCompare4() {
 
   // use mfcc with default configuration...
   MfccOptions op;
-  op.frame_opts.dither = 0.0;
   op.frame_opts.window_type = "hamming";
   op.frame_opts.remove_dc_offset = false;
   op.frame_opts.round_to_power_of_two = true;
   op.mel_opts.low_freq = 0.0;
-  op.htk_compat = true;
   op.use_energy = true;  // Use energy.
-  op.mel_opts.htk_mode = true;
 
   Mfcc mfcc(op);
 
@@ -440,7 +420,7 @@ static void UnitTestHTKCompare4() {
   }
 
   std::cout << "Test passed :)\n\n";
-  
+
   unlink("tmp.test.wav.fea_kaldi.4");
 }
 
@@ -465,16 +445,13 @@ static void UnitTestHTKCompare5() {
 
   // use mfcc with default configuration...
   MfccOptions op;
-  op.frame_opts.dither = 0.0;
   op.frame_opts.window_type = "hamming";
   op.frame_opts.remove_dc_offset = false;
   op.frame_opts.round_to_power_of_two = true;
-  op.htk_compat = true;
   op.use_energy = true;  // Use energy.
   op.mel_opts.low_freq = 0.0;
   op.mel_opts.vtln_low = 100.0;
   op.mel_opts.vtln_high = 7500.0;
-  op.mel_opts.htk_mode = true;
 
   BaseFloat vtln_warp = 1.1; // our approach identical to htk for warp factor >1,
   // differs slightly for higher mel bins if warp_factor <0.9
@@ -528,7 +505,7 @@ static void UnitTestHTKCompare5() {
   }
 
   std::cout << "Test passed :)\n\n";
-  
+
   unlink("tmp.test.wav.fea_kaldi.5");
 }
 
@@ -553,15 +530,12 @@ static void UnitTestHTKCompare6() {
 
   // use mfcc with default configuration...
   MfccOptions op;
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.preemph_coeff = 0.97;
   op.frame_opts.window_type = "hamming";
   op.frame_opts.remove_dc_offset = false;
   op.frame_opts.round_to_power_of_two = true;
   op.mel_opts.num_bins = 24;
   op.mel_opts.low_freq = 125.0;
   op.mel_opts.high_freq = 7800.0;
-  op.htk_compat = true;
   op.use_energy = false;  // C0 not energy.
 
   Mfcc mfcc(op);
@@ -613,7 +587,7 @@ static void UnitTestHTKCompare6() {
   }
 
   std::cout << "Test passed :)\n\n";
-  
+
   unlink("tmp.test.wav.fea_kaldi.6");
 }
 
@@ -682,5 +656,3 @@ int main() {
     return 1;
   }
 }
-
-
diff --git a/src/feat/feature-mfcc.cc b/src/feat/feature-mfcc.cc
index 899988c2822..ffa3b5450b5 100644
--- a/src/feat/feature-mfcc.cc
+++ b/src/feat/feature-mfcc.cc
@@ -25,18 +25,18 @@
 namespace kaldi {
 
 
-void MfccComputer::Compute(BaseFloat signal_log_energy,
-                           BaseFloat vtln_warp,
+void MfccComputer::Compute(BaseFloat vtln_warp,
                            VectorBase<BaseFloat> *signal_frame,
                            VectorBase<BaseFloat> *feature) {
   KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() &&
                feature->Dim() == this->Dim());
 
-  const MelBanks &mel_banks = *(GetMelBanks(vtln_warp));
-
-  if (opts_.use_energy && !opts_.raw_energy)
+  BaseFloat signal_log_energy;
+  if (opts_.use_energy)
     signal_log_energy = Log(std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame),
-                                     std::numeric_limits<float>::min()));
+                                                opts_.energy_floor));
+
+  const MelBanks &mel_banks = *(GetMelBanks(vtln_warp));
 
   if (srfft_ != NULL)  // Compute FFT using the split-radix algorithm.
     srfft_->Compute(signal_frame->Data(), true);
@@ -50,33 +50,15 @@ void MfccComputer::Compute(BaseFloat signal_log_energy,
 
   mel_banks.Compute(power_spectrum, &mel_energies_);
 
-  // avoid log of zero (which should be prevented anyway by dithering).
-  mel_energies_.ApplyFloor(std::numeric_limits<float>::epsilon());
-  mel_energies_.ApplyLog();  // take the log.
+  mel_energies_.ApplyFloor(opts_.energy_floor);
+  mel_energies_.ApplyLog();
 
   feature->SetZero();  // in case there were NaNs.
   // feature = dct_matrix_ * mel_energies [which now have log]
   feature->AddMatVec(1.0, dct_matrix_, kNoTrans, mel_energies_, 0.0);
 
-  if (opts_.cepstral_lifter != 0.0)
-    feature->MulElements(lifter_coeffs_);
-
-  if (opts_.use_energy) {
-    if (opts_.energy_floor > 0.0 && signal_log_energy < log_energy_floor_)
-      signal_log_energy = log_energy_floor_;
+  if (opts_.use_energy)
     (*feature)(0) = signal_log_energy;
-  }
-
-  if (opts_.htk_compat) {
-    BaseFloat energy = (*feature)(0);
-    for (int32 i = 0; i < opts_.num_ceps - 1; i++)
-      (*feature)(i) = (*feature)(i+1);
-    if (!opts_.use_energy)
-      energy *= M_SQRT2;  // scale on C0 (actually removing a scale
-    // we previously added that's part of one common definition of
-    // the cosine transform.)
-    (*feature)(opts_.num_ceps - 1)  = energy;
-  }
 }
 
 MfccComputer::MfccComputer(const MfccOptions &opts):
@@ -98,12 +80,6 @@ MfccComputer::MfccComputer(const MfccOptions &opts):
   SubMatrix<BaseFloat> dct_rows(dct_matrix, 0, opts.num_ceps, 0, num_bins);
   dct_matrix_.Resize(opts.num_ceps, num_bins);
   dct_matrix_.CopyFromMat(dct_rows);  // subset of rows.
-  if (opts.cepstral_lifter != 0.0) {
-    lifter_coeffs_.Resize(opts.num_ceps);
-    ComputeLifterCoeffs(opts.cepstral_lifter, &lifter_coeffs_);
-  }
-  if (opts.energy_floor > 0.0)
-    log_energy_floor_ = Log(opts.energy_floor);
 
   int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
   if ((padded_window_size & (padded_window_size-1)) == 0)  // Is a power of two...
@@ -117,7 +93,6 @@ MfccComputer::MfccComputer(const MfccOptions &opts):
 MfccComputer::MfccComputer(const MfccComputer &other):
     opts_(other.opts_), lifter_coeffs_(other.lifter_coeffs_),
     dct_matrix_(other.dct_matrix_),
-    log_energy_floor_(other.log_energy_floor_),
     mel_banks_(other.mel_banks_),
     srfft_(NULL),
     mel_energies_(other.mel_energies_.Dim(), kUndefined) {
diff --git a/src/feat/feature-mfcc.h b/src/feat/feature-mfcc.h
index 66c52e89821..83aea3fb9bb 100644
--- a/src/feat/feature-mfcc.h
+++ b/src/feat/feature-mfcc.h
@@ -1,7 +1,7 @@
 // feat/feature-mfcc.h
 
 // Copyright 2009-2011  Karel Vesely;  Petr Motlicek;  Saarland University
-//           2014-2016  Johns Hopkins University (author: Daniel Povey)
+//           2014-2019  Johns Hopkins University (author: Daniel Povey)
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -39,25 +39,14 @@ struct MfccOptions {
   FrameExtractionOptions frame_opts;
   MelBanksOptions mel_opts;
   int32 num_ceps;  // e.g. 13: num cepstral coeffs, counting zero.
-  bool use_energy;  // use energy; else C0
-  BaseFloat energy_floor;  // 0 by default; set to a value like 1.0 or 0.1 if
-                           // you disable dithering.
-  bool raw_energy;  // If true, compute energy before preemphasis and windowing
-  BaseFloat cepstral_lifter;  // Scaling factor on cepstra for HTK compatibility.
-                              // if 0.0, no liftering is done.
-  bool htk_compat;  // if true, put energy/C0 last and introduce a factor of
-                    // sqrt(2) on C0 to be the same as HTK.
+  bool use_energy;  // if true, use energy; else C0
+  BaseFloat energy_floor;
 
   MfccOptions() : mel_opts(23),
-                  // defaults the #mel-banks to 23 for the MFCC computations.
-                  // this seems to be common for 16khz-sampled data,
-                  // but for 8khz-sampled data, 15 may be better.
                   num_ceps(13),
                   use_energy(true),
-                  energy_floor(0.0),
-                  raw_energy(true),
-                  cepstral_lifter(22.0),
-                  htk_compat(false) {}
+                  energy_floor(1.0e-10) { }
+
 
   void Register(OptionsItf *opts) {
     frame_opts.Register(opts);
@@ -67,17 +56,8 @@ struct MfccOptions {
     opts->Register("use-energy", &use_energy,
                    "Use energy (not C0) in MFCC computation");
     opts->Register("energy-floor", &energy_floor,
-                   "Floor on energy (absolute, not relative) in MFCC computation. "
-                   "Only makes a difference if --use-energy=true; only necessary if "
-                   "--dither=0.0.  Suggested values: 0.1 or 1.0");
-    opts->Register("raw-energy", &raw_energy,
-                   "If true, compute energy before preemphasis and windowing");
-    opts->Register("cepstral-lifter", &cepstral_lifter,
-                   "Constant that controls scaling of MFCCs");
-    opts->Register("htk-compat", &htk_compat,
-                   "If true, put energy or C0 last and use a factor of sqrt(2) on "
-                   "C0.  Warning: not sufficient to get HTK compatible features "
-                   "(need to change other parameters).");
+                   "Floor on energy (absolute, not relative) of mel bins etc. "
+                   "in MFCC computation. ");
   }
 };
 
@@ -96,17 +76,10 @@ class MfccComputer {
 
   int32 Dim() const { return opts_.num_ceps; }
 
-  bool NeedRawLogEnergy() { return opts_.use_energy && opts_.raw_energy; }
-
   /**
      Function that computes one frame of features from
      one frame of signal.
 
-     @param [in] signal_raw_log_energy The log-energy of the frame of the signal
-         prior to windowing and pre-emphasis, or
-         log(numeric_limits<float>::min()), whichever is greater.  Must be
-         ignored by this function if this class returns false from
-         this->NeedsRawLogEnergy().
      @param [in] vtln_warp  The VTLN warping factor that the user wants
          to be applied when computing features for this utterance.  Will
          normally be 1.0, meaning no warping is to be done.  The value will
@@ -119,8 +92,7 @@ class MfccComputer {
      @param [out] feature  Pointer to a vector of size this->Dim(), to which
          the computed feature will be written.
   */
-  void Compute(BaseFloat signal_log_energy,
-               BaseFloat vtln_warp,
+  void Compute(BaseFloat vtln_warp,
                VectorBase<BaseFloat> *signal_frame,
                VectorBase<BaseFloat> *feature);
 
@@ -134,7 +106,6 @@ class MfccComputer {
   MfccOptions opts_;
   Vector<BaseFloat> lifter_coeffs_;
   Matrix<BaseFloat> dct_matrix_;  // matrix we left-multiply by to perform DCT.
-  BaseFloat log_energy_floor_;
   std::map<BaseFloat, MelBanks*> mel_banks_;  // BaseFloat is VTLN coefficient.
   SplitRadixRealFft<BaseFloat> *srfft_;
 
diff --git a/src/feat/feature-plp-test.cc b/src/feat/feature-plp-test.cc
deleted file mode 100644
index ad872cffcd0..00000000000
--- a/src/feat/feature-plp-test.cc
+++ /dev/null
@@ -1,177 +0,0 @@
-// feat/feature-plp-test.cc
-
-// Copyright 2009-2011  Karel Vesely;  Petr Motlicek
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include <iostream>
-
-#include "feat/feature-plp.h"
-#include "base/kaldi-math.h"
-#include "matrix/kaldi-matrix-inl.h"
-#include "feat/wave-reader.h"
-
-using namespace kaldi;
-
-
-
-
-
-/**
- */
-static void UnitTestSimple() {
-  std::cout << "=== UnitTestSimple() ===\n";
-
-  Vector<BaseFloat> v(100000);
-  Matrix<BaseFloat> m;
-
-  // init with noise
-  for (int32 i = 0; i < v.Dim(); i++) {
-    v(i) = (abs( i * 433024253 ) % 65535) - (65535 / 2);
-  }
-
-  std::cout << "<<<=== Just make sure it runs... Nothing is compared\n";
-  // the parametrization object
-  PlpOptions op;
-  // trying to have same opts as baseline.
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.preemph_coeff = 0.0;
-  op.frame_opts.window_type = "rectangular";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
-  op.mel_opts.low_freq = 0.0;
-//  op.htk_compat = true;
-
-  Plp plp(op);
-  // use default parameters
-
-  // compute mfccs.
-  plp.Compute(v, 1.0, &m);
-
-  // possibly dump
-  //   std::cout << "== Output features == \n" << m;
-  std::cout << "Test passed :)\n\n";
-}
-
-
-static void UnitTestHTKCompare1() {
-  std::cout << "=== UnitTestHTKCompare1() ===\n";
-
-  std::ifstream is("test_data/test.wav", std::ios_base::binary);
-  WaveData wave;
-  wave.Read(is);
-  KALDI_ASSERT(wave.Data().NumRows() == 1);
-  SubVector<BaseFloat> waveform(wave.Data(), 0);
-
-  // read the HTK features
-  Matrix<BaseFloat> htk_features;
-  {
-    std::ifstream is("test_data/test.wav.plp_htk.1",
-                     std::ios::in | std::ios_base::binary);
-    bool ans = ReadHtk(is, &htk_features, 0);
-    KALDI_ASSERT(ans);
-  }
-
-  // use plp with default configuration...
-  PlpOptions op;
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.preemph_coeff = 0.0;
-  op.frame_opts.window_type = "hamming";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
-  op.mel_opts.low_freq = 0.0;
-  op.htk_compat = true;
-  op.use_energy = false;  // C0 not energy.
-  op.cepstral_scale = 1.0;
-
-  Plp plp(op);
-
-  // calculate kaldi features
-  Matrix<BaseFloat> kaldi_raw_features;
-  plp.Compute(waveform, 1.0, &kaldi_raw_features);
-
-  DeltaFeaturesOptions delta_opts;
-  Matrix<BaseFloat> kaldi_features;
-  ComputeDeltas(delta_opts,
-                kaldi_raw_features,
-                &kaldi_features);
-
-  // compare the results
-  bool passed = true;
-  int32 i_old = -1;
-  KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows());
-  KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols());
-  // Ignore ends-- we make slightly different choices than
-  // HTK about how to treat the deltas at the ends.
-  for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) {
-    for (int32 j = 0; j < kaldi_features.NumCols(); j++) {
-      BaseFloat a = kaldi_features(i, j), b = htk_features(i, j);
-      if ((std::abs(b - a)) > 0.10) {  //<< TOLERANCE TO DIFFERENCES!!!!!
-        // print the non-matching data only once per-line
-        if (i_old != i) {
-          std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n";
-          std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n";
-          i_old = i;
-        }
-        // print indices of non-matching cells
-        std::cout << "[" << i << ", " << j << "]";
-        passed = false;
-  }}}
-  if (!passed) KALDI_ERR << "Test failed";
-
-  // write the htk features for later inspection
-  HtkHeader header = {
-    kaldi_features.NumRows(),
-    100000,  // 10ms
-    static_cast<int16>(sizeof(float)*kaldi_features.NumCols()),
-    021413  // PLP_D_A_0
-  };
-  {
-    std::ofstream os("tmp.test.wav.plp_kaldi.1",
-                     std::ios::out|std::ios::binary);
-    WriteHtk(os, kaldi_features, header);
-  }
-
-  std::cout << "Test passed :)\n\n";
-  
-  unlink("tmp.test.wav.plp_kaldi.1");
-}
-
-
-
-
-static void UnitTestFeat() {
-  UnitTestSimple();
-  UnitTestHTKCompare1();
-}
-
-
-
-
-int main() {
-  try {
-    for (int i = 0; i < 5; i++)
-      UnitTestFeat();
-    std::cout << "Tests succeeded.\n";
-    return 0;
-  } catch (const std::exception &e) {
-    std::cerr << e.what();
-    return 1;
-  }
-}
-
-
diff --git a/src/feat/feature-plp.cc b/src/feat/feature-plp.cc
deleted file mode 100644
index 8f4a7d66161..00000000000
--- a/src/feat/feature-plp.cc
+++ /dev/null
@@ -1,191 +0,0 @@
-// feat/feature-plp.cc
-
-// Copyright 2009-2011  Petr Motlicek;  Karel Vesely
-//                2016  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "feat/feature-plp.h"
-
-namespace kaldi {
-
-PlpComputer::PlpComputer(const PlpOptions &opts):
-    opts_(opts), srfft_(NULL),
-    mel_energies_duplicated_(opts_.mel_opts.num_bins + 2, kUndefined),
-    autocorr_coeffs_(opts_.lpc_order + 1, kUndefined),
-    lpc_coeffs_(opts_.lpc_order, kUndefined),
-    raw_cepstrum_(opts_.lpc_order, kUndefined) {
-
-  if (opts.cepstral_lifter != 0.0) {
-    lifter_coeffs_.Resize(opts.num_ceps);
-    ComputeLifterCoeffs(opts.cepstral_lifter, &lifter_coeffs_);
-  }
-  InitIdftBases(opts_.lpc_order + 1, opts_.mel_opts.num_bins + 2,
-                &idft_bases_);
-
-  if (opts.energy_floor > 0.0)
-    log_energy_floor_ = Log(opts.energy_floor);
-
-  int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
-  if ((padded_window_size & (padded_window_size-1)) == 0)  // Is a power of two...
-    srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);
-
-  // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
-  // [note: this call caches it.]
-  GetMelBanks(1.0);
-}
-
-PlpComputer::PlpComputer(const PlpComputer &other):
-    opts_(other.opts_), lifter_coeffs_(other.lifter_coeffs_),
-    idft_bases_(other.idft_bases_), log_energy_floor_(other.log_energy_floor_),
-    mel_banks_(other.mel_banks_), equal_loudness_(other.equal_loudness_),
-    srfft_(NULL),
-    mel_energies_duplicated_(opts_.mel_opts.num_bins + 2, kUndefined),
-    autocorr_coeffs_(opts_.lpc_order + 1, kUndefined),
-    lpc_coeffs_(opts_.lpc_order, kUndefined),
-    raw_cepstrum_(opts_.lpc_order, kUndefined) {
-  for (std::map<BaseFloat, MelBanks*>::iterator iter = mel_banks_.begin();
-       iter != mel_banks_.end(); ++iter)
-    iter->second = new MelBanks(*(iter->second));
-  for (std::map<BaseFloat, Vector<BaseFloat>*>::iterator
-           iter = equal_loudness_.begin();
-       iter != equal_loudness_.end(); ++iter)
-    iter->second = new Vector<BaseFloat>(*(iter->second));
-  if (other.srfft_ != NULL)
-    srfft_ = new SplitRadixRealFft<BaseFloat>(*(other.srfft_));
-}
-
-PlpComputer::~PlpComputer() {
-  for (std::map<BaseFloat, MelBanks*>::iterator iter = mel_banks_.begin();
-      iter != mel_banks_.end(); ++iter)
-    delete iter->second;
-  for (std::map<BaseFloat, Vector<BaseFloat>* >::iterator
-           iter = equal_loudness_.begin();
-       iter != equal_loudness_.end(); ++iter)
-    delete iter->second;
-  delete srfft_;
-}
-
-const MelBanks *PlpComputer::GetMelBanks(BaseFloat vtln_warp) {
-  MelBanks *this_mel_banks = NULL;
-  std::map<BaseFloat, MelBanks*>::iterator iter = mel_banks_.find(vtln_warp);
-  if (iter == mel_banks_.end()) {
-    this_mel_banks = new MelBanks(opts_.mel_opts,
-                                  opts_.frame_opts,
-                                  vtln_warp);
-    mel_banks_[vtln_warp] = this_mel_banks;
-  } else {
-    this_mel_banks = iter->second;
-  }
-  return this_mel_banks;
-}
-
-const Vector<BaseFloat> *PlpComputer::GetEqualLoudness(BaseFloat vtln_warp) {
-  const MelBanks *this_mel_banks = GetMelBanks(vtln_warp);
-  Vector<BaseFloat> *ans = NULL;
-  std::map<BaseFloat, Vector<BaseFloat>*>::iterator iter
-      = equal_loudness_.find(vtln_warp);
-  if (iter == equal_loudness_.end()) {
-    ans = new Vector<BaseFloat>;
-    GetEqualLoudnessVector(*this_mel_banks, ans);
-    equal_loudness_[vtln_warp] = ans;
-  } else {
-    ans = iter->second;
-  }
-  return ans;
-}
-
-void PlpComputer::Compute(BaseFloat signal_log_energy,
-                          BaseFloat vtln_warp,
-                          VectorBase<BaseFloat> *signal_frame,
-                          VectorBase<BaseFloat> *feature) {
-  KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() &&
-               feature->Dim() == this->Dim());
-
-  const MelBanks &mel_banks = *GetMelBanks(vtln_warp);
-  const Vector<BaseFloat> &equal_loudness = *GetEqualLoudness(vtln_warp);
-
-
-  KALDI_ASSERT(opts_.num_ceps <= opts_.lpc_order+1);  // our num-ceps includes C0.
-
-
-  if (opts_.use_energy && !opts_.raw_energy)
-    signal_log_energy = Log(std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame),
-                                     std::numeric_limits<float>::min()));
-
-  if (srfft_ != NULL)  // Compute FFT using split-radix algorithm.
-    srfft_->Compute(signal_frame->Data(), true);
-  else  // An alternative algorithm that works for non-powers-of-two.
-    RealFft(signal_frame, true);
-
-  // Convert the FFT into a power spectrum.
-  ComputePowerSpectrum(signal_frame);  // elements 0 ... signal_frame->Dim()/2
-
-  SubVector<BaseFloat> power_spectrum(*signal_frame,
-                                      0, signal_frame->Dim() / 2 + 1);
-
-  int32 num_mel_bins = opts_.mel_opts.num_bins;
-
-  SubVector<BaseFloat> mel_energies(mel_energies_duplicated_, 1, num_mel_bins);
-
-  mel_banks.Compute(power_spectrum, &mel_energies);
-
-  mel_energies.MulElements(equal_loudness);
-
-  mel_energies.ApplyPow(opts_.compress_factor);
-
-  // duplicate first and last elements
-  mel_energies_duplicated_(0) = mel_energies_duplicated_(1);
-  mel_energies_duplicated_(num_mel_bins + 1) =
-      mel_energies_duplicated_(num_mel_bins);
-
-  autocorr_coeffs_.SetZero();  // In case of NaNs or infs
-  autocorr_coeffs_.AddMatVec(1.0, idft_bases_, kNoTrans,
-                             mel_energies_duplicated_,  0.0);
-
-  BaseFloat residual_log_energy = ComputeLpc(autocorr_coeffs_, &lpc_coeffs_);
-
-  residual_log_energy = std::max<BaseFloat>(residual_log_energy,
-                                 std::numeric_limits<float>::min());
-
-  Lpc2Cepstrum(opts_.lpc_order, lpc_coeffs_.Data(), raw_cepstrum_.Data());
-  feature->Range(1, opts_.num_ceps - 1).CopyFromVec(
-      raw_cepstrum_.Range(0, opts_.num_ceps - 1));
-  (*feature)(0) = residual_log_energy;
-
-  if (opts_.cepstral_lifter != 0.0)
-    feature->MulElements(lifter_coeffs_);
-
-  if (opts_.cepstral_scale != 1.0)
-    feature->Scale(opts_.cepstral_scale);
-
-  if (opts_.use_energy) {
-    if (opts_.energy_floor > 0.0 && signal_log_energy < log_energy_floor_)
-      signal_log_energy = log_energy_floor_;
-    (*feature)(0) = signal_log_energy;
-  }
-
-  if (opts_.htk_compat) {  // reorder the features.
-    BaseFloat log_energy = (*feature)(0);
-    for (int32 i = 0; i < opts_.num_ceps-1; i++)
-      (*feature)(i) = (*feature)(i+1);
-    (*feature)(opts_.num_ceps-1)  = log_energy;
-  }
-}
-
-
-}  // namespace kaldi
diff --git a/src/feat/feature-plp.h b/src/feat/feature-plp.h
deleted file mode 100644
index 958c5706e89..00000000000
--- a/src/feat/feature-plp.h
+++ /dev/null
@@ -1,176 +0,0 @@
-// feat/feature-plp.h
-
-// Copyright 2009-2011  Petr Motlicek;  Karel Vesely
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_FEAT_FEATURE_PLP_H_
-#define KALDI_FEAT_FEATURE_PLP_H_
-
-#include <map>
-#include <string>
-
-#include "feat/feature-common.h"
-#include "feat/feature-functions.h"
-#include "feat/feature-window.h"
-#include "feat/mel-computations.h"
-#include "itf/options-itf.h"
-
-namespace kaldi {
-/// @addtogroup  feat FeatureExtraction
-/// @{
-
-
-
-/// PlpOptions contains basic options for computing PLP features.
-/// It only includes things that can be done in a "stateless" way, i.e.
-/// it does not include energy max-normalization.
-/// It does not include delta computation.
-struct PlpOptions {
-  FrameExtractionOptions frame_opts;
-  MelBanksOptions mel_opts;
-  int32 lpc_order;
-  int32 num_ceps;  // num cepstra including zero
-  bool use_energy;  // use energy; else C0
-  BaseFloat energy_floor;
-  bool raw_energy;  // If true, compute energy before preemphasis and windowing
-  BaseFloat compress_factor;
-  int32 cepstral_lifter;
-  BaseFloat cepstral_scale;
-
-  bool htk_compat;  // if true, put energy/C0 last and introduce a factor of
-                    // sqrt(2) on C0 to be the same as HTK.
-
-  PlpOptions() : mel_opts(23),
-                 // default number of mel-banks for the PLP computation; this
-                 // seems to be common for 16kHz-sampled data. For 8kHz-sampled
-                 // data, 15 may be better.
-                 lpc_order(12),
-                 num_ceps(13),
-                 use_energy(true),
-                 energy_floor(0.0),
-                 raw_energy(true),
-                 compress_factor(0.33333),
-                 cepstral_lifter(22),
-                 cepstral_scale(1.0),
-                 htk_compat(false) {}
-
-  void Register(OptionsItf *opts) {
-    frame_opts.Register(opts);
-    mel_opts.Register(opts);
-    opts->Register("lpc-order", &lpc_order,
-                   "Order of LPC analysis in PLP computation");
-    opts->Register("num-ceps", &num_ceps,
-                   "Number of cepstra in PLP computation (including C0)");
-    opts->Register("use-energy", &use_energy,
-                   "Use energy (not C0) for zeroth PLP feature");
-    opts->Register("energy-floor", &energy_floor,
-                   "Floor on energy (absolute, not relative) in PLP computation. "
-                   "Only makes a difference if --use-energy=true; only necessary if "
-                   "--dither=0.0.  Suggested values: 0.1 or 1.0");
-    opts->Register("raw-energy", &raw_energy,
-                   "If true, compute energy before preemphasis and windowing");
-    opts->Register("compress-factor", &compress_factor,
-                   "Compression factor in PLP computation");
-    opts->Register("cepstral-lifter", &cepstral_lifter,
-                   "Constant that controls scaling of PLPs");
-    opts->Register("cepstral-scale", &cepstral_scale,
-                   "Scaling constant in PLP computation");
-    opts->Register("htk-compat", &htk_compat,
-                   "If true, put energy or C0 last.  Warning: not sufficient "
-                   "to get HTK compatible features (need to change other "
-                   "parameters).");
-  }
-};
-
-
-/// This is the new-style interface to the PLP computation.
-class PlpComputer {
- public:
-  typedef PlpOptions Options;
-  explicit PlpComputer(const PlpOptions &opts);
-  PlpComputer(const PlpComputer &other);
-
-  const FrameExtractionOptions &GetFrameOptions() const {
-    return opts_.frame_opts;
-  }
-
-  int32 Dim() const { return opts_.num_ceps; }
-
-  bool NeedRawLogEnergy() { return opts_.use_energy && opts_.raw_energy; }
-
-  /**
-     Function that computes one frame of features from
-     one frame of signal.
-
-     @param [in] signal_raw_log_energy The log-energy of the frame of the signal
-         prior to windowing and pre-emphasis, or
-         log(numeric_limits<float>::min()), whichever is greater.  Must be
-         ignored by this function if this class returns false from
-         this->NeedsRawLogEnergy().
-     @param [in] vtln_warp  The VTLN warping factor that the user wants
-         to be applied when computing features for this utterance.  Will
-         normally be 1.0, meaning no warping is to be done.  The value will
-         be ignored for feature types that don't support VLTN, such as
-         spectrogram features.
-     @param [in] signal_frame  One frame of the signal,
-       as extracted using the function ExtractWindow() using the options
-       returned by this->GetFrameOptions().  The function will use the
-       vector as a workspace, which is why it's a non-const pointer.
-     @param [out] feature  Pointer to a vector of size this->Dim(), to which
-         the computed feature will be written.
-  */
-  void Compute(BaseFloat signal_log_energy,
-               BaseFloat vtln_warp,
-               VectorBase<BaseFloat> *signal_frame,
-               VectorBase<BaseFloat> *feature);
-
-  ~PlpComputer();
- private:
-
-  const MelBanks *GetMelBanks(BaseFloat vtln_warp);
-
-  const Vector<BaseFloat> *GetEqualLoudness(BaseFloat vtln_warp);
-
-  PlpOptions opts_;
-  Vector<BaseFloat> lifter_coeffs_;
-  Matrix<BaseFloat> idft_bases_;
-  BaseFloat log_energy_floor_;
-  std::map<BaseFloat, MelBanks*> mel_banks_;  // BaseFloat is VTLN coefficient.
-  std::map<BaseFloat, Vector<BaseFloat>* > equal_loudness_;
-  SplitRadixRealFft<BaseFloat> *srfft_;
-
-  // temporary vector used inside Compute; size is opts_.mel_opts.num_bins + 2
-  Vector<BaseFloat> mel_energies_duplicated_;
-  // temporary vector used inside Compute; size is opts_.lpc_order + 1
-  Vector<BaseFloat> autocorr_coeffs_;
-  // temporary vector used inside Compute; size is opts_.lpc_order
-  Vector<BaseFloat> lpc_coeffs_;
-  // temporary vector used inside Compute; size is opts_.lpc_order
-  Vector<BaseFloat> raw_cepstrum_;
-
-  // Disallow assignment.
-  PlpComputer &operator =(const PlpComputer &other);
-};
-
-typedef OfflineFeatureTpl<PlpComputer> Plp;
-
-/// @} End of "addtogroup feat"
-
-}  // namespace kaldi
-
-
-#endif  // KALDI_FEAT_FEATURE_PLP_H_
diff --git a/src/feat/feature-window.cc b/src/feat/feature-window.cc
index c5d4cc29831..b68b8854128 100644
--- a/src/feat/feature-window.cc
+++ b/src/feat/feature-window.cc
@@ -1,7 +1,7 @@
 // feat/feature-window.cc
 
 // Copyright 2009-2011  Karel Vesely;  Petr Motlicek;  Microsoft Corporation
-//           2013-2016  Johns Hopkins University (author: Daniel Povey)
+//           2013-2019  Johns Hopkins University (author: Daniel Povey)
 //                2014  IMSL, PKU-HKUST (author: Wei Shi)
 
 // See ../../COPYING for clarification regarding multiple authors
@@ -30,13 +30,9 @@ namespace kaldi {
 int64 FirstSampleOfFrame(int32 frame,
                          const FrameExtractionOptions &opts) {
   int64 frame_shift = opts.WindowShift();
-  if (opts.snip_edges) {
-    return frame * frame_shift;
-  } else {
-    int64 midpoint_of_frame = frame_shift * frame  +  frame_shift / 2,
-        beginning_of_frame = midpoint_of_frame  -  opts.WindowSize() / 2;
-    return beginning_of_frame;
-  }
+  int64 midpoint_of_frame = frame_shift * frame  +  frame_shift / 2,
+      beginning_of_frame = midpoint_of_frame  -  opts.WindowSize() / 2;
+  return beginning_of_frame;
 }
 
 int32 NumFrames(int64 num_samples,
@@ -44,85 +40,54 @@ int32 NumFrames(int64 num_samples,
                 bool flush) {
   int64 frame_shift = opts.WindowShift();
   int64 frame_length = opts.WindowSize();
-  if (opts.snip_edges) {
-    // with --snip-edges=true (the default), we use a HTK-like approach to
-    // determining the number of frames-- all frames have to fit completely into
-    // the waveform, and the first frame begins at sample zero.
-    if (num_samples < frame_length)
-      return 0;
-    else
-      return (1 + ((num_samples - frame_length) / frame_shift));
-    // You can understand the expression above as follows: 'num_samples -
-    // frame_length' is how much room we have to shift the frame within the
-    // waveform; 'frame_shift' is how much we shift it each time; and the ratio
-    // is how many times we can shift it (integer arithmetic rounds down).
-  } else {
-    // if --snip-edges=false, the number of frames is determined by rounding the
-    // (file-length / frame-shift) to the nearest integer.  The point of this
-    // formula is to make the number of frames an obvious and predictable
-    // function of the frame shift and signal length, which makes many
-    // segmentation-related questions simpler.
-    //
-    // Because integer division in C++ rounds toward zero, we add (half the
-    // frame-shift minus epsilon) before dividing, to have the effect of
-    // rounding towards the closest integer.
-    int32 num_frames = (num_samples + (frame_shift / 2)) / frame_shift;
-
-    if (flush)
-      return num_frames;
-
-    // note: 'end' always means the last plus one, i.e. one past the last.
-    int64 end_sample_of_last_frame = FirstSampleOfFrame(num_frames - 1, opts)
-        + frame_length;
-
-    // the following code is optimized more for clarity than efficiency.
-    // If flush == false, we can't output frames that extend past the end
-    // of the signal.
-    while (num_frames > 0 && end_sample_of_last_frame > num_samples) {
-      num_frames--;
-      end_sample_of_last_frame -= frame_shift;
-    }
+
+  // The number of frames is determined by rounding the
+  // (file-length / frame-shift) to the nearest integer.  The point of this
+  // formula is to make the number of frames an obvious and predictable
+  // function of the frame shift and signal length, which makes many
+  // segmentation-related questions simpler.
+  //
+  // Because integer division in C++ rounds toward zero, we add (half the
+  // frame-shift minus epsilon) before dividing, to have the effect of
+  // rounding towards the closest integer.
+  int32 num_frames = (num_samples + (frame_shift / 2)) / frame_shift;
+
+  if (flush)
     return num_frames;
-  }
-}
 
+  // note: 'end' always means the last plus one, i.e. one past the last.
+  int64 end_sample_of_last_frame = FirstSampleOfFrame(num_frames - 1, opts)
+      + frame_length;
 
-void Dither(VectorBase<BaseFloat> *waveform, BaseFloat dither_value) {
-  if (dither_value == 0.0)
-    return;
-  int32 dim = waveform->Dim();
-  BaseFloat *data = waveform->Data();
-  RandomState rstate;
-  for (int32 i = 0; i < dim; i++)
-    data[i] += RandGauss(&rstate) * dither_value;
+  // the following code is optimized more for clarity than efficiency.
+  // If flush == false, we can't output frames that extend past the end
+  // of the signal.
+  while (num_frames > 0 && end_sample_of_last_frame > num_samples) {
+    num_frames--;
+    end_sample_of_last_frame -= frame_shift;
+  }
+  return num_frames;
 }
 
 
-void Preemphasize(VectorBase<BaseFloat> *waveform, BaseFloat preemph_coeff) {
-  if (preemph_coeff == 0.0) return;
-  KALDI_ASSERT(preemph_coeff >= 0.0 && preemph_coeff <= 1.0);
-  for (int32 i = waveform->Dim()-1; i > 0; i--)
-    (*waveform)(i) -= preemph_coeff * (*waveform)(i-1);
-  (*waveform)(0) -= preemph_coeff * (*waveform)(0);
-}
-
-FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts) {
+void InitFeatureWindowFunction(const FrameExtractionOptions &opts,
+                               Vector<BaseFloat> *window_function) {
   int32 frame_length = opts.WindowSize();
   KALDI_ASSERT(frame_length > 0);
-  window.Resize(frame_length);
+  window_function->Resize(frame_length);
   double a = M_2PI / (frame_length-1);
   for (int32 i = 0; i < frame_length; i++) {
     double i_fl = static_cast<double>(i);
     if (opts.window_type == "hanning") {
-      window(i) = 0.5  - 0.5*cos(a * i_fl);
+      (*window_function)(i) = 0.5  - 0.5*cos(a * i_fl);
     } else if (opts.window_type == "hamming") {
-      window(i) = 0.54 - 0.46*cos(a * i_fl);
+      (*window_function)(i) = 0.54 - 0.46*cos(a * i_fl);
     } else if (opts.window_type == "povey") {  // like hamming but goes to zero at edges.
-      window(i) = pow(0.5 - 0.5*cos(a * i_fl), 0.85);
+      (*window_function)(i) = pow(0.5 - 0.5*cos(a * i_fl), 0.85);
     } else if (opts.window_type == "rectangular") {
-      window(i) = 1.0;
+      (*window_function)(i) = 1.0;
     } else if (opts.window_type == "blackman") {
-      window(i) = opts.blackman_coeff - 0.5*cos(a * i_fl) +
+      (*window_function)(i) = opts.blackman_coeff - 0.5*cos(a * i_fl) +
         (0.5 - opts.blackman_coeff) * cos(2 * a * i_fl);
     } else {
       KALDI_ERR << "Invalid window type " << opts.window_type;
@@ -131,54 +96,32 @@ FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts)
 }
 
 void ProcessWindow(const FrameExtractionOptions &opts,
-                   const FeatureWindowFunction &window_function,
-                   VectorBase<BaseFloat> *window,
-                   BaseFloat *log_energy_pre_window) {
+                   const VectorBase<BaseFloat> *window_function,
+                   VectorBase<BaseFloat> *window) {
   int32 frame_length = opts.WindowSize();
   KALDI_ASSERT(window->Dim() == frame_length);
 
-  if (opts.dither != 0.0)
-    Dither(window, opts.dither);
-
   if (opts.remove_dc_offset)
     window->Add(-window->Sum() / frame_length);
 
-  if (log_energy_pre_window != NULL) {
-    BaseFloat energy = std::max<BaseFloat>(VecVec(*window, *window),
-                                std::numeric_limits<float>::epsilon());
-    *log_energy_pre_window = Log(energy);
-  }
-
-  if (opts.preemph_coeff != 0.0)
-    Preemphasize(window, opts.preemph_coeff);
-
-  window->MulElements(window_function.window);
+  window->MulElements(*window_function);
 }
 
 
 // ExtractWindow extracts a windowed frame of waveform with a power-of-two,
-// padded size.  It does mean subtraction, pre-emphasis and dithering as
-// requested.
+// padded size.  It does mean subtraction if requested.
 void ExtractWindow(int64 sample_offset,
                    const VectorBase<BaseFloat> &wave,
                    int32 f,  // with 0 <= f < NumFrames(feats, opts)
                    const FrameExtractionOptions &opts,
-                   const FeatureWindowFunction &window_function,
-                   Vector<BaseFloat> *window,
-                   BaseFloat *log_energy_pre_window) {
+                   const Vector<BaseFloat> &window_function,
+                   Vector<BaseFloat> *window) {
   KALDI_ASSERT(sample_offset >= 0 && wave.Dim() != 0);
   int32 frame_length = opts.WindowSize(),
       frame_length_padded = opts.PaddedWindowSize();
-  int64 num_samples = sample_offset + wave.Dim(),
-      start_sample = FirstSampleOfFrame(f, opts),
-      end_sample = start_sample + frame_length;
+  int64 start_sample = FirstSampleOfFrame(f, opts);
 
-  if (opts.snip_edges) {
-    KALDI_ASSERT(start_sample >= sample_offset &&
-                 end_sample <= num_samples);
-  } else {
-    KALDI_ASSERT(sample_offset == 0 || start_sample >= sample_offset);
-  }
+  KALDI_ASSERT(sample_offset == 0 || start_sample >= sample_offset);
 
   if (window->Dim() != frame_length_padded)
     window->Resize(frame_length_padded, kUndefined);
@@ -216,7 +159,7 @@ void ExtractWindow(int64 sample_offset,
 
   SubVector<BaseFloat> frame(*window, 0, frame_length);
 
-  ProcessWindow(opts, window_function, &frame, log_energy_pre_window);
+  ProcessWindow(opts, &window_function, &frame);
 }
 
 }  // namespace kaldi
diff --git a/src/feat/feature-window.h b/src/feat/feature-window.h
index 2fccaefb9a1..ccbc1cd2d9b 100644
--- a/src/feat/feature-window.h
+++ b/src/feat/feature-window.h
@@ -36,8 +36,6 @@ struct FrameExtractionOptions {
   BaseFloat samp_freq;
   BaseFloat frame_shift_ms;  // in milliseconds.
   BaseFloat frame_length_ms;  // in milliseconds.
-  BaseFloat dither;  // Amount of dithering, 0.0 means no dither.
-  BaseFloat preemph_coeff;  // Preemphasis coefficient.
   bool remove_dc_offset;  // Subtract mean of wave before FFT.
   std::string window_type;  // e.g. Hamming window
   // May be "hamming", "rectangular", "povey", "hanning", "blackman"
@@ -46,7 +44,6 @@ struct FrameExtractionOptions {
   // I just don't think the Hamming window makes sense as a windowing function.
   bool round_to_power_of_two;
   BaseFloat blackman_coeff;
-  bool snip_edges;
   bool allow_downsample;
   bool allow_upsample;
   int max_feature_vectors;
@@ -54,16 +51,14 @@ struct FrameExtractionOptions {
       samp_freq(16000),
       frame_shift_ms(10.0),
       frame_length_ms(25.0),
-      dither(1.0),
-      preemph_coeff(0.97),
       remove_dc_offset(true),
       window_type("povey"),
       round_to_power_of_two(true),
       blackman_coeff(0.42),
-      snip_edges(true),
       allow_downsample(false),
-      max_feature_vectors(-1),
-      allow_upsample(false) { }
+      allow_upsample(false),
+      max_feature_vectors(-1) { }
+
 
   void Register(OptionsItf *opts) {
     opts->Register("sample-frequency", &samp_freq,
@@ -71,13 +66,8 @@ struct FrameExtractionOptions {
                    "if specified there)");
     opts->Register("frame-length", &frame_length_ms, "Frame length in milliseconds");
     opts->Register("frame-shift", &frame_shift_ms, "Frame shift in milliseconds");
-    opts->Register("preemphasis-coefficient", &preemph_coeff,
-                   "Coefficient for use in signal preemphasis");
     opts->Register("remove-dc-offset", &remove_dc_offset,
                    "Subtract mean from waveform on each frame");
-    opts->Register("dither", &dither, "Dithering constant (0.0 means no dither). "
-                   "If you turn this off, you should set the --energy-floor "
-                   "option, e.g. to 1.0 or 0.1");
     opts->Register("window-type", &window_type, "Type of window "
                    "(\"hamming\"|\"hanning\"|\"povey\"|\"rectangular\""
                    "|\"blackmann\")");
@@ -86,11 +76,6 @@ struct FrameExtractionOptions {
     opts->Register("round-to-power-of-two", &round_to_power_of_two,
                    "If true, round window size to power of two by zero-padding "
                    "input to FFT.");
-    opts->Register("snip-edges", &snip_edges,
-                   "If true, end effects will be handled by outputting only frames that "
-                   "completely fit in the file, and the number of frames depends on the "
-                   "frame-length.  If false, the number of frames depends only on the "
-                   "frame-shift, and we reflect the data at the ends.");
     opts->Register("allow-downsample", &allow_downsample,
                    "If true, allow the input waveform to have a higher frequency than "
                    "the specified --sample-frequency (and we'll downsample).");
@@ -115,13 +100,11 @@ struct FrameExtractionOptions {
 };
 
 
-struct FeatureWindowFunction {
-  FeatureWindowFunction() {}
-  explicit FeatureWindowFunction(const FrameExtractionOptions &opts);
-  FeatureWindowFunction(const FeatureWindowFunction &other):
-      window(other.window) { }
-  Vector<BaseFloat> window;
-};
+// Sets up the feature window function (e.g. Hamming) as specified by the
+// options.
+void InitFeatureWindowFunction(
+    const FrameExtractionOptions &opts,
+    Vector<BaseFloat> *window_function);
 
 
 /**
@@ -157,13 +140,12 @@ int64 FirstSampleOfFrame(int32 frame,
 
 void Dither(VectorBase<BaseFloat> *waveform, BaseFloat dither_value);
 
-void Preemphasize(VectorBase<BaseFloat> *waveform, BaseFloat preemph_coeff);
 
 /**
-  This function does all the windowing steps after actually
-  extracting the windowed signal: depeding on the
-  configuration, it does dithering, dc offset removal,
-  preemphasis, and multiplication by the windowing function.
+  This function does all the windowing steps after actually extracting the
+  windowed signal: depending on the configuration, it does DC offset removal
+  and multiplication by the windowing function.
+
    @param [in] opts  The options class to be used
    @param [in] window_function  The windowing function-- should have
                     been initialized using 'opts'.
@@ -172,14 +154,10 @@ void Preemphasize(VectorBase<BaseFloat> *waveform, BaseFloat preemph_coeff);
       opts.PaddedWindowSize(), with the remaining samples zero,
       as the FFT code is more efficient if it operates on data with
       power-of-two size.
-   @param [out]   log_energy_pre_window If non-NULL, then after dithering and
-      DC offset removal, this function will write to this pointer the log of
-      the total energy (i.e. sum-squared) of the frame.
  */
 void ProcessWindow(const FrameExtractionOptions &opts,
-                   const FeatureWindowFunction &window_function,
-                   VectorBase<BaseFloat> *window,
-                   BaseFloat *log_energy_pre_window = NULL);
+                   const VectorBase<BaseFloat> &window_function,
+                   VectorBase<BaseFloat> *window);
 
 
 /*
@@ -201,18 +179,15 @@ void ProcessWindow(const FrameExtractionOptions &opts,
   @param [in] window_function  The windowing function, as derived from the
                     options class.
   @param [out] window  The windowed, possibly-padded waveform to be
-                     extracted.  Will be resized as needed.
-  @param [out] log_energy_pre_window  If non-NULL, the log-energy of
-                   the signal prior to pre-emphasis and multiplying by
-                   the windowing function will be written to here.
+                    extracted.  Will be resized as needed.
 */
 void ExtractWindow(int64 sample_offset,
                    const VectorBase<BaseFloat> &wave,
                    int32 f,
                    const FrameExtractionOptions &opts,
-                   const FeatureWindowFunction &window_function,
-                   Vector<BaseFloat> *window,
-                   BaseFloat *log_energy_pre_window = NULL);
+                   const VectorBase<BaseFloat> &window_function,
+                   Vector<BaseFloat> *window);
+
 
 
 /// @} End of "addtogroup feat"
diff --git a/src/feat/mel-computations.cc b/src/feat/mel-computations.cc
index 810b6247e93..1772caadf4a 100644
--- a/src/feat/mel-computations.cc
+++ b/src/feat/mel-computations.cc
@@ -32,8 +32,7 @@ namespace kaldi {
 
 MelBanks::MelBanks(const MelBanksOptions &opts,
                    const FrameExtractionOptions &frame_opts,
-                   BaseFloat vtln_warp_factor):
-    htk_mode_(opts.htk_mode) {
+                   BaseFloat vtln_warp_factor) {
   int32 num_bins = opts.num_bins;
   if (num_bins < 3) KALDI_ERR << "Must have at least 3 mel bins";
   BaseFloat sample_freq = frame_opts.samp_freq;
@@ -128,10 +127,6 @@ MelBanks::MelBanks(const MelBanksOptions &opts,
     bins_[bin].second.Resize(size);
     bins_[bin].second.CopyFromVec(this_bin.Range(first_index, size));
 
-    // Replicate a bug in HTK, for testing purposes.
-    if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0)
-      bins_[bin].second(0) = 0.0;
-
   }
   if (debug_) {
     for (size_t i = 0; i < bins_.size(); i++) {
@@ -144,8 +139,7 @@ MelBanks::MelBanks(const MelBanksOptions &opts,
 MelBanks::MelBanks(const MelBanks &other):
     center_freqs_(other.center_freqs_),
     bins_(other.bins_),
-    debug_(other.debug_),
-    htk_mode_(other.htk_mode_) { }
+    debug_(other.debug_) { }
 
 BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_low_cutoff,  // upper+lower frequency cutoffs for VTLN.
                                  BaseFloat vtln_high_cutoff,
@@ -232,8 +226,6 @@ void MelBanks::Compute(const VectorBase<BaseFloat> &power_spectrum,
     int32 offset = bins_[i].first;
     const Vector<BaseFloat> &v(bins_[i].second);
     BaseFloat energy = VecVec(v, power_spectrum.Range(offset, v.Dim()));
-    // HTK-like flooring- for testing purposes (we prefer dither)
-    if (htk_mode_ && energy < 1.0) energy = 1.0;
     (*mel_energies_out)(i) = energy;
 
     // The following assert was added due to a problem with OpenBlas that
@@ -250,91 +242,8 @@ void MelBanks::Compute(const VectorBase<BaseFloat> &power_spectrum,
   }
 }
 
-void ComputeLifterCoeffs(BaseFloat Q, VectorBase<BaseFloat> *coeffs) {
-  // Compute liftering coefficients (scaling on cepstral coeffs)
-  // coeffs are numbered slightly differently from HTK: the zeroth
-  // index is C0, which is not affected.
-  for (int32 i = 0; i < coeffs->Dim(); i++)
-    (*coeffs)(i) = 1.0 + 0.5 * Q * sin (M_PI * i / Q);
-}
-
-
-// Durbin's recursion - converts autocorrelation coefficients to the LPC
-// pTmp - temporal place [n]
-// pAC - autocorrelation coefficients [n + 1]
-// pLP - linear prediction coefficients [n] (predicted_sn = sum_1^P{a[i] * s[n-i]}})
-//       F(z) = 1 / (1 - A(z)), 1 is not stored in the demoninator
-BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp) {
-  BaseFloat ki;                // reflection coefficient
-  int i;
-  int j;
-
-  BaseFloat E = pAC[0];
-
-  for (i = 0; i < n; i++) {
-    // next reflection coefficient
-    ki = pAC[i + 1];
-    for (j = 0; j < i; j++)
-      ki += pLP[j] * pAC[i - j];
-    ki = ki / E;
-
-    // new error
-    BaseFloat c = 1 - ki * ki;
-    if (c < 1.0e-5) // remove NaNs for constan signal
-      c = 1.0e-5;
-    E *= c;
-
-    // new LP coefficients
-    pTmp[i] = -ki;
-    for (j = 0; j < i; j++)
-      pTmp[j] = pLP[j] - ki * pLP[i - j - 1];
-
-    for (j = 0; j <= i; j++)
-      pLP[j] = pTmp[j];
-  }
-
-  return E;
-}
-
-
-void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst) {
-  for (int32 i = 0; i < n; i++) {
-    double sum = 0.0;
-    int j;
-    for (j = 0; j < i; j++) {
-      sum += static_cast<BaseFloat>(i - j) * pLPC[j] * pCepst[i - j - 1];
-    }
-    pCepst[i] = -pLPC[i] - sum / static_cast<BaseFloat>(i + 1);
-  }
-}
-
-void GetEqualLoudnessVector(const MelBanks &mel_banks,
-                            Vector<BaseFloat> *ans) {
-  int32 n = mel_banks.NumBins();
-  // Central frequency of each mel bin.
-  const Vector<BaseFloat> &f0 = mel_banks.GetCenterFreqs();
-  ans->Resize(n);
-  for (int32 i = 0; i < n; i++) {
-    BaseFloat fsq = f0(i) * f0(i);
-    BaseFloat fsub = fsq / (fsq + 1.6e5);
-    (*ans)(i) = fsub * fsub * ((fsq + 1.44e6) / (fsq + 9.61e6));
-  }
-}
 
 
-// Compute LP coefficients from autocorrelation coefficients.
-BaseFloat ComputeLpc(const VectorBase<BaseFloat> &autocorr_in,
-                     Vector<BaseFloat> *lpc_out) {
-  int32 n = autocorr_in.Dim() - 1;
-  KALDI_ASSERT(lpc_out->Dim() == n);
-  Vector<BaseFloat> tmp(n);
-  BaseFloat ans = Durbin(n, autocorr_in.Data(),
-                         lpc_out->Data(),
-                         tmp.Data());
-  if (ans <= 0.0)
-    KALDI_WARN << "Zero energy in LPC computation";
-  return -Log(1.0 / ans);  // forms the C0 value
-}
 
 
 }  // namespace kaldi
diff --git a/src/feat/mel-computations.h b/src/feat/mel-computations.h
index 7053da54f3a..6c56a9ab83d 100644
--- a/src/feat/mel-computations.h
+++ b/src/feat/mel-computations.h
@@ -1,7 +1,7 @@
 // feat/mel-computations.h
 
 // Copyright 2009-2011  Phonexia s.r.o.;  Microsoft Corporation
-//                2016  Johns Hopkins University (author: Daniel Povey)
+//           2016-2019  Johns Hopkins University (author: Daniel Povey)
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -44,18 +44,14 @@ struct MelBanksOptions {
   int32 num_bins;  // e.g. 25; number of triangular bins
   BaseFloat low_freq;  // e.g. 20; lower frequency cutoff
   BaseFloat high_freq;  // an upper frequency cutoff; 0 -> no cutoff, negative
-  // ->added to the Nyquist frequency to get the cutoff.
+                        // ->added to the Nyquist frequency to get the cutoff.
   BaseFloat vtln_low;  // vtln lower cutoff of warping function.
   BaseFloat vtln_high;  // vtln upper cutoff of warping function: if negative, added
                         // to the Nyquist frequency to get the cutoff.
   bool debug_mel;
-  // htk_mode is a "hidden" config, it does not show up on command line.
-  // Enables more exact compatibibility with HTK, for testing purposes.  Affects
-  // mel-energy flooring and reproduces a bug in HTK.
-  bool htk_mode;
   explicit MelBanksOptions(int num_bins = 25)
       : num_bins(num_bins), low_freq(20), high_freq(0), vtln_low(100),
-        vtln_high(-500), debug_mel(false), htk_mode(false) {}
+        vtln_high(-500), debug_mel(false) { }
 
   void Register(OptionsItf *opts) {
     opts->Register("num-mel-bins", &num_bins,
@@ -87,10 +83,9 @@ class MelBanks {
   }
 
   static BaseFloat VtlnWarpFreq(BaseFloat vtln_low_cutoff,
-                                BaseFloat vtln_high_cutoff,  // discontinuities in warp func
+                                BaseFloat vtln_high_cutoff,
                                 BaseFloat low_freq,
-                                BaseFloat high_freq,  // upper+lower frequency cutoffs in
-                                // the mel computation
+                                BaseFloat high_freq,
                                 BaseFloat vtln_warp_factor,
                                 BaseFloat freq);
 
@@ -106,7 +101,7 @@ class MelBanks {
            const FrameExtractionOptions &frame_opts,
            BaseFloat vtln_warp_factor);
 
-  /// Compute Mel energies (note: not log enerties).
+  /// Compute Mel energies (note: not log energies).
   /// At input, "fft_energies" contains the FFT energies (not log).
   void Compute(const VectorBase<BaseFloat> &fft_energies,
                VectorBase<BaseFloat> *mel_energies_out) const;
@@ -131,35 +126,9 @@ class MelBanks {
   std::vector<std::pair<int32, Vector<BaseFloat> > > bins_;
 
   bool debug_;
-  bool htk_mode_;
 };
 
 
-// Compute liftering coefficients (scaling on cepstral coeffs)
-// coeffs are numbered slightly differently from HTK: the zeroth
-// index is C0, which is not affected.
-void ComputeLifterCoeffs(BaseFloat Q, VectorBase<BaseFloat> *coeffs);
-
-
-// Durbin's recursion - converts autocorrelation coefficients to the LPC
-// pTmp - temporal place [n]
-// pAC - autocorrelation coefficients [n + 1]
-// pLP - linear prediction coefficients [n] (predicted_sn = sum_1^P{a[i] * s[n-i]}})
-//       F(z) = 1 / (1 - A(z)), 1 is not stored in the demoninator
-// Returns log energy of residual (I think)
-BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp);
-
-// Compute LP coefficients from autocorrelation coefficients.
-// Returns log energy of residual (I think)
-BaseFloat ComputeLpc(const VectorBase<BaseFloat> &autocorr_in,
-                     Vector<BaseFloat> *lpc_out);
-
-void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst);
-
-
-
-void GetEqualLoudnessVector(const MelBanks &mel_banks,
-                            Vector<BaseFloat> *ans);
 
 /// @} End of "addtogroup feat"
 }  // namespace kaldi
diff --git a/src/feat/online-feature-test.cc b/src/feat/online-feature-test.cc
index 7ba6c7c32be..c5a2ae44ec7 100644
--- a/src/feat/online-feature-test.cc
+++ b/src/feat/online-feature-test.cc
@@ -195,55 +195,6 @@ void TestOnlineMfcc() {
   }
 }
 
-void TestOnlinePlp() {
-  std::ifstream is("../feat/test_data/test.wav", std::ios_base::binary);
-  WaveData wave;
-  wave.Read(is);
-  KALDI_ASSERT(wave.Data().NumRows() == 1);
-  SubVector<BaseFloat> waveform(wave.Data(), 0);
-
-  // the parametrization object
-  PlpOptions op;
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.preemph_coeff = 0.0;
-  op.frame_opts.window_type = "hamming";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
-  op.frame_opts.samp_freq = wave.SampFreq();
-  op.mel_opts.low_freq = 0.0;
-  op.htk_compat = false;
-  op.use_energy = false;  // C0 not energy.
-  Plp plp(op);
-
-  // compute plp offline
-  Matrix<BaseFloat> plp_feats;
-  plp.Compute(waveform, 1.0, &plp_feats);  // vtln not supported
-
-  // compare
-  // The test waveform is about 1.44s long, so
-  // we try to break it into from 5 pieces to 9(not essential to do so)
-  for (int32 num_piece = 5; num_piece < 10; num_piece++) {
-    OnlinePlp online_plp(op);
-    std::vector<int32> piece_length(num_piece);
-    bool ret = RandomSplit(waveform.Dim(), &piece_length, num_piece);
-    KALDI_ASSERT(ret);
-
-    int32 offset_start = 0;
-    for (int32 i = 0; i < num_piece; i++) {
-      Vector<BaseFloat> wave_piece(
-        waveform.Range(offset_start, piece_length[i]));
-      online_plp.AcceptWaveform(wave.SampFreq(), wave_piece);
-      offset_start += piece_length[i];
-    }
-    online_plp.InputFinished();
-
-    Matrix<BaseFloat> online_plp_feats;
-    GetOutput(&online_plp, &online_plp_feats);
-
-    AssertEqual(plp_feats, online_plp_feats);
-  }
-}
-
 void TestOnlineTransform() {
   std::ifstream is("../feat/test_data/test.wav", std::ios_base::binary);
   WaveData wave;
@@ -332,9 +283,9 @@ void TestOnlineAppendFeature() {
   // The test waveform is about 1.44s long, so
   // we try to break it into from 5 pieces to 9(not essential to do so)
   for (int32 num_piece = 5; num_piece < 10; num_piece++) {
-    OnlineMfcc online_mfcc(mfcc_op);
-    OnlinePlp online_plp(plp_op);
-    OnlineAppendFeature online_mfcc_plp(&online_mfcc, &online_plp);
+    OnlineMfcc online_mfcc(mfcc_op),
+        online_mfcc2(mfcc_op);
+    OnlineAppendFeature online_mfcc_doubled(&online_mfcc, &online_mfcc2);
 
     std::vector<int32> piece_length(num_piece);
     bool ret = RandomSplit(waveform.Dim(), &piece_length, num_piece);
@@ -344,32 +295,32 @@ void TestOnlineAppendFeature() {
       Vector<BaseFloat> wave_piece(
         waveform.Range(offset_start, piece_length[i]));
       online_mfcc.AcceptWaveform(wave.SampFreq(), wave_piece);
-      online_plp.AcceptWaveform(wave.SampFreq(), wave_piece);
+      online_mfcc2.AcceptWaveform(wave.SampFreq(), wave_piece);
       offset_start += piece_length[i];
     }
     online_mfcc.InputFinished();
-    online_plp.InputFinished();
+    online_mfcc2.InputFinished();
 
-    Matrix<BaseFloat> online_mfcc_plp_feats;
-    GetOutput(&online_mfcc_plp, &online_mfcc_plp_feats);
+    Matrix<BaseFloat> online_mfcc_doubled_feats;
+    GetOutput(&online_mfcc_doubled, &online_mfcc_doubled_feats);
 
-    // compare mfcc_feats & plp_features with online_mfcc_plp_feats
-    KALDI_ASSERT(mfcc_feats.NumRows() == online_mfcc_plp_feats.NumRows()
-      && plp_feats.NumRows() == online_mfcc_plp_feats.NumRows()
+    // compare mfcc_feats & plp_feats with online_mfcc_doubled_feats
+    KALDI_ASSERT(mfcc_feats.NumRows() == online_mfcc_doubled_feats.NumRows()
+      && plp_feats.NumRows() == online_mfcc_doubled_feats.NumRows()
       && mfcc_feats.NumCols() + plp_feats.NumCols()
-         == online_mfcc_plp_feats.NumCols());
-    for (MatrixIndexT i = 0; i < online_mfcc_plp_feats.NumRows(); i++) {
+         == online_mfcc_doubled_feats.NumCols());
+    for (MatrixIndexT i = 0; i < online_mfcc_doubled_feats.NumRows(); i++) {
       for (MatrixIndexT j = 0; j < mfcc_feats.NumCols(); j++) {
-        KALDI_ASSERT(std::abs(mfcc_feats(i, j) - online_mfcc_plp_feats(i, j))
+        KALDI_ASSERT(std::abs(mfcc_feats(i, j) - online_mfcc_doubled_feats(i, j))
           < 0.0001*std::max(1.0, static_cast<double>(std::abs(mfcc_feats(i, j))
-                                    + std::abs(online_mfcc_plp_feats(i, j)))));
+                                    + std::abs(online_mfcc_doubled_feats(i, j)))));
       }
       for (MatrixIndexT k = 0; k < plp_feats.NumCols(); k++) {
         KALDI_ASSERT(
           std::abs(plp_feats(i, k) -
-            online_mfcc_plp_feats(i, mfcc_feats.NumCols() + k))
+            online_mfcc_doubled_feats(i, mfcc_feats.NumCols() + k))
           < 0.0001*std::max(1.0, static_cast<double>(std::abs(plp_feats(i, k))
-            +std::abs(online_mfcc_plp_feats(i, mfcc_feats.NumCols() + k)))));
+            +std::abs(online_mfcc_doubled_feats(i, mfcc_feats.NumCols() + k)))));
       }
     }
   }
diff --git a/src/feat/online-feature.cc b/src/feat/online-feature.cc
index 813e7b16f0c..138dabe2236 100644
--- a/src/feat/online-feature.cc
+++ b/src/feat/online-feature.cc
@@ -69,9 +69,13 @@ void OnlineGenericBaseFeature<C>::GetFrame(int32 frame,
 template<class C>
 OnlineGenericBaseFeature<C>::OnlineGenericBaseFeature(
     const typename C::Options &opts):
-    computer_(opts), window_function_(computer_.GetFrameOptions()),
-    input_finished_(false), waveform_offset_(0),
-    features_(opts.frame_opts.max_feature_vectors) { }
+    computer_(opts),
+    features_(opts.frame_opts.max_feature_vectors),
+    input_finished_(false),
+    waveform_offset_(0) {
+  InitFeatureWindowFunction(computer_.GetFrameOptions(),
+                            &window_function_);
+}
 
 template<class C>
 void OnlineGenericBaseFeature<C>::AcceptWaveform(BaseFloat sampling_rate,
@@ -105,17 +109,14 @@ void OnlineGenericBaseFeature<C>::ComputeFeatures() {
   KALDI_ASSERT(num_frames_new >= num_frames_old);
 
   Vector<BaseFloat> window;
-  bool need_raw_log_energy = computer_.NeedRawLogEnergy();
   for (int32 frame = num_frames_old; frame < num_frames_new; frame++) {
-    BaseFloat raw_log_energy = 0.0;
     ExtractWindow(waveform_offset_, waveform_remainder_, frame,
-                  frame_opts, window_function_, &window,
-                  need_raw_log_energy ? &raw_log_energy : NULL);
+                  frame_opts, window_function_, &window);
     Vector<BaseFloat> *this_feature = new Vector<BaseFloat>(computer_.Dim(),
                                                             kUndefined);
     // note: this online feature-extraction code does not support VTLN.
     BaseFloat vtln_warp = 1.0;
-    computer_.Compute(raw_log_energy, vtln_warp, &window, this_feature);
+    computer_.Compute(vtln_warp, &window, this_feature);
     features_.PushBack(this_feature);
   }
   // OK, we will now discard any portion of the signal that will not be
@@ -142,7 +143,6 @@ void OnlineGenericBaseFeature<C>::ComputeFeatures() {
 
 // instantiate the templates defined here for MFCC, PLP and filterbank classes.
 template class OnlineGenericBaseFeature<MfccComputer>;
-template class OnlineGenericBaseFeature<PlpComputer>;
 template class OnlineGenericBaseFeature<FbankComputer>;
 
 
diff --git a/src/feat/online-feature.h b/src/feat/online-feature.h
index d47a6b13e9b..0ddc2601dec 100644
--- a/src/feat/online-feature.h
+++ b/src/feat/online-feature.h
@@ -32,7 +32,6 @@
 #include "base/kaldi-error.h"
 #include "feat/feature-functions.h"
 #include "feat/feature-mfcc.h"
-#include "feat/feature-plp.h"
 #include "feat/feature-fbank.h"
 #include "itf/online-feature-itf.h"
 
@@ -72,7 +71,7 @@ class RecyclingVector {
 
 
 /// This is a templated class for online feature extraction;
-/// it's templated on a class like MfccComputer or PlpComputer
+/// it's templated on a class like MfccComputer
 /// that does the basic feature extraction.
 template<class C>
 class OnlineGenericBaseFeature: public OnlineBaseFeature {
@@ -127,11 +126,11 @@ class OnlineGenericBaseFeature: public OnlineBaseFeature {
   // waveform_remainder_ while incrementing waveform_offset_ by the same amount.
   void ComputeFeatures();
 
-  C computer_;  // class that does the MFCC or PLP or filterbank computation
+  C computer_;  // class that does the MFCC or filterbank computation
 
-  FeatureWindowFunction window_function_;
+  Vector<BaseFloat> window_function_;
 
-  // features_ is the Mfcc or Plp or Fbank features that we have already computed.
+  // features_ is the Mfcc or Fbank features that we have already computed.
 
   RecyclingVector features_;
 
@@ -153,7 +152,6 @@ class OnlineGenericBaseFeature: public OnlineBaseFeature {
 };
 
 typedef OnlineGenericBaseFeature<MfccComputer> OnlineMfcc;
-typedef OnlineGenericBaseFeature<PlpComputer> OnlinePlp;
 typedef OnlineGenericBaseFeature<FbankComputer> OnlineFbank;
 
 
@@ -594,7 +592,7 @@ class OnlineCacheFeature: public OnlineFeatureInterface {
 
 
 /// This online-feature class implements combination of two feature
-/// streams (such as pitch, plp) into one stream.
+/// streams (such as pitch) into one stream.
 class OnlineAppendFeature: public OnlineFeatureInterface {
  public:
   virtual int32 Dim() const { return src1_->Dim() + src2_->Dim(); }
diff --git a/src/feat/pitch-functions-test.cc b/src/feat/pitch-functions-test.cc
index 0e481c18674..e3953acb884 100644
--- a/src/feat/pitch-functions-test.cc
+++ b/src/feat/pitch-functions-test.cc
@@ -25,7 +25,6 @@
 #include <iostream>
 
 #include "base/kaldi-math.h"
-#include "feat/feature-plp.h"
 #include "feat/pitch-functions.h"
 #include "feat/wave-reader.h"
 #include "sys/stat.h"
diff --git a/src/feat/wave-reader.cc b/src/feat/wave-reader.cc
index f8259a3a82e..bd35b1cff43 100644
--- a/src/feat/wave-reader.cc
+++ b/src/feat/wave-reader.cc
@@ -308,7 +308,11 @@ void WaveData::Read(std::istream &is) {
 
   uint16 *data_ptr = reinterpret_cast<uint16*>(&buffer[0]);
 
-  // The matrix is arranged row per channel, column per sample.
+  // Scale the wave data to the range [-1, 1].  Prior to kaldi-10,
+  // it was in the range [-32768.0, 32768.0].
+  const BaseFloat scale = 1.0 / 32768.0;
+
+  // The row-indexes are channels; column-indexes are samples.
   data_.Resize(header.NumChannels(),
                buffer.size() / header.BlockAlign());
   for (uint32 i = 0; i < data_.NumCols(); ++i) {
@@ -316,7 +320,7 @@ void WaveData::Read(std::istream &is) {
       int16 k = *data_ptr++;
       if (header.ReverseBytes())
         KALDI_SWAP2(k);
-      data_(j, i) =  k;
+      data_(j, i) =  k * scale;
     }
   }
 }
@@ -358,9 +362,13 @@ void WaveData::Write(std::ostream &os) const {
   int32 stride = data_.Stride();
 
   int num_clipped = 0;
+
+  // This scaling factor is because we are writing 16-bit data.
+  const BaseFloat scale = 32768.0;
+
   for (int32 i = 0; i < num_samp; i++) {
     for (int32 j = 0; j < num_chan; j++) {
-      int32 elem = static_cast<int32>(trunc(data_ptr[j * stride + i]));
+      int32 elem = static_cast<int32>(trunc(data_ptr[j * stride + i] * scale));
       int16 elem_16 = static_cast<int16>(elem);
       if (elem < std::numeric_limits<int16>::min()) {
         elem_16 = std::numeric_limits<int16>::min();
diff --git a/src/feat/wave-reader.h b/src/feat/wave-reader.h
index 7ba981c2c24..6c7fb5b5258 100644
--- a/src/feat/wave-reader.h
+++ b/src/feat/wave-reader.h
@@ -2,7 +2,7 @@
 
 // Copyright 2009-2011  Karel Vesely;  Microsoft Corporation
 //                2013  Florent Masson
-//                2013  Johns Hopkins University (author: Daniel Povey)
+//           2013-2019  Johns Hopkins University (author: Daniel Povey)
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -57,10 +57,6 @@
 
 namespace kaldi {
 
-/// For historical reasons, we scale waveforms to the range
-/// (2^15-1)*[-1, 1], not the usual default DSP range [-1, 1].
-const BaseFloat kWaveSampleMax = 32768.0;
-
 /// This class reads and hold wave file header information.
 class WaveInfo {
  public:
@@ -121,6 +117,8 @@ class WaveData {
   // This function returns the wave data-- it's in a matrix
   // becase there may be multiple channels.  In the normal case
   // there's just one channel so Data() will have one row.
+  // This data will be in the range [-1, 1].  This is a difference
+  // from pre-kaldi10.
   const Matrix<BaseFloat> &Data() const { return data_; }
 
   BaseFloat SampFreq() const { return samp_freq_; }
diff --git a/src/featbin/Makefile b/src/featbin/Makefile
index 861ba3f7a93..1067244b2db 100644
--- a/src/featbin/Makefile
+++ b/src/featbin/Makefile
@@ -8,7 +8,7 @@ BINFILES = add-deltas add-deltas-sdc append-post-to-feats \
            compose-transforms compute-and-process-kaldi-pitch-feats \
            compute-cmvn-stats compute-cmvn-stats-two-channel \
            compute-fbank-feats compute-kaldi-pitch-feats compute-mfcc-feats \
-           compute-plp-feats compute-spectrogram-feats concat-feats copy-feats \
+           concat-feats copy-feats \
            copy-feats-to-htk copy-feats-to-sphinx extend-transform-dim \
            extract-feature-segments extract-segments feat-to-dim \
            feat-to-len fmpe-acc-stats fmpe-apply-transform fmpe-est \
@@ -26,6 +26,6 @@ TESTFILES =
 ADDLIBS = ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
           ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
-          ../base/kaldi-base.a 
+          ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
diff --git a/src/featbin/compute-plp-feats.cc b/src/featbin/compute-plp-feats.cc
deleted file mode 100644
index 3e9fe9d7423..00000000000
--- a/src/featbin/compute-plp-feats.cc
+++ /dev/null
@@ -1,184 +0,0 @@
-// featbin/compute-plp-feats.cc
-
-// Copyright 2009-2012  Microsoft Corporation
-//                      Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "feat/feature-plp.h"
-#include "feat/wave-reader.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    const char *usage =
-        "Create PLP feature files.\n"
-        "Usage:  compute-plp-feats [options...] <wav-rspecifier> <feats-wspecifier>\n";
-
-    // construct all the global objects
-    ParseOptions po(usage);
-    PlpOptions plp_opts;
-    bool subtract_mean = false;
-    BaseFloat vtln_warp = 1.0;
-    std::string vtln_map_rspecifier;
-    std::string utt2spk_rspecifier;
-    int32 channel = -1;
-    BaseFloat min_duration = 0.0;
-    // Define defaults for gobal options
-    std::string output_format = "kaldi";
-
-    // Register the options
-    po.Register("output-format", &output_format, "Format of the output "
-                "files [kaldi, htk]");
-    po.Register("subtract-mean", &subtract_mean, "Subtract mean of each "
-                "feature file [CMS]. ");
-    po.Register("vtln-warp", &vtln_warp, "Vtln warp factor (only applicable "
-                "if vtln-map not specified)");
-    po.Register("vtln-map", &vtln_map_rspecifier, "Map from utterance or "
-                "speaker-id to vtln warp factor (rspecifier)");
-    po.Register("utt2spk", &utt2spk_rspecifier, "Utterance to speaker-id "
-                "map (if doing VTLN and you have warps per speaker)");
-    po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, "
-                "0 -> left, 1 -> right)");
-    po.Register("min-duration", &min_duration, "Minimum duration of segments "
-                "to process (in seconds).");
-
-    plp_opts.Register(&po);
-
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string wav_rspecifier = po.GetArg(1);
-
-    std::string output_wspecifier = po.GetArg(2);
-
-    Plp plp(plp_opts);
-
-    SequentialTableReader<WaveHolder> reader(wav_rspecifier);
-    BaseFloatMatrixWriter kaldi_writer;  // typedef to TableWriter<something>.
-    TableWriter<HtkMatrixHolder> htk_writer;
-
-    if (utt2spk_rspecifier != "")
-      KALDI_ASSERT(vtln_map_rspecifier != "" && "the utt2spk option is only "
-                   "needed if the vtln-map option is used.");
-    RandomAccessBaseFloatReaderMapped vtln_map_reader(vtln_map_rspecifier,
-                                                      utt2spk_rspecifier);
-    
-    if (output_format == "kaldi") {
-      if (!kaldi_writer.Open(output_wspecifier))
-        KALDI_ERR << "Could not initialize output with wspecifier "
-                  << output_wspecifier;
-    } else if (output_format == "htk") {
-      if (!htk_writer.Open(output_wspecifier))
-        KALDI_ERR << "Could not initialize output with wspecifier "
-                  << output_wspecifier;
-    } else {
-      KALDI_ERR << "Invalid output_format string " << output_format;
-    }
-
-    int32 num_utts = 0, num_success = 0;
-    for (; !reader.Done(); reader.Next()) {
-      num_utts++;
-      std::string utt = reader.Key();
-      const WaveData &wave_data = reader.Value();
-      if (wave_data.Duration() < min_duration) {
-        KALDI_WARN << "File: " << utt << " is too short ("
-                   << wave_data.Duration() << " sec): producing no output.";
-        continue;
-      }
-      int32 num_chan = wave_data.Data().NumRows(), this_chan = channel;
-      {  // This block works out the channel (0=left, 1=right...)
-        KALDI_ASSERT(num_chan > 0);  // should have been caught in
-        // reading code if no channels.
-        if (channel == -1) {
-          this_chan = 0;
-          if (num_chan != 1)
-            KALDI_WARN << "Channel not specified but you have data with "
-                       << num_chan  << " channels; defaulting to zero";
-        } else {
-          if (this_chan >= num_chan) {
-            KALDI_WARN << "File with id " << utt << " has "
-                       << num_chan << " channels but you specified channel "
-                       << channel << ", producing no output.";
-            continue;
-          }
-        }
-      }
-      BaseFloat vtln_warp_local;  // Work out VTLN warp factor.
-      if (vtln_map_rspecifier != "") {
-        if (!vtln_map_reader.HasKey(utt)) {
-          KALDI_WARN << "No vtln-map entry for utterance-id (or speaker-id) "
-                     << utt;
-          continue;
-        }
-        vtln_warp_local = vtln_map_reader.Value(utt);
-      } else {
-        vtln_warp_local = vtln_warp;
-      }
-
-      SubVector<BaseFloat> waveform(wave_data.Data(), this_chan);
-      Matrix<BaseFloat> features;
-      try {
-        plp.ComputeFeatures(waveform, wave_data.SampFreq(), vtln_warp_local, &features);
-      } catch (...) {
-        KALDI_WARN << "Failed to compute features for utterance "
-                   << utt;
-        continue;
-      }
-      if (subtract_mean) {
-        Vector<BaseFloat> mean(features.NumCols());
-        mean.AddRowSumMat(1.0, features);
-        mean.Scale(1.0 / features.NumRows());
-        for (size_t i = 0; i < features.NumRows(); i++)
-          features.Row(i).AddVec(-1.0, mean);
-      }
-      if (output_format == "kaldi") {
-        kaldi_writer.Write(utt, features);
-      } else {
-        std::pair<Matrix<BaseFloat>, HtkHeader> p;
-        p.first.Resize(features.NumRows(), features.NumCols());
-        p.first.CopyFromMat(features);
-        HtkHeader header = {
-          features.NumRows(),
-          100000,  // 10ms shift
-          static_cast<int16>(sizeof(float)*features.NumCols()),
-          013 | // PLP
-          020000 // C0 [no option currently to use energy in PLP.
-        };
-        p.second = header;
-        htk_writer.Write(utt, p);
-      }
-      if (num_utts % 10 == 0)
-        KALDI_LOG << "Processed " << num_utts << " utterances";
-      KALDI_VLOG(2) << "Processed features for key " << utt;
-      num_success++;
-    }
-    KALDI_LOG << " Done " << num_success << " out of " << num_utts
-              << " utterances.";
-    return (num_success != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
diff --git a/src/featbin/compute-spectrogram-feats.cc b/src/featbin/compute-spectrogram-feats.cc
deleted file mode 100644
index 3b40a6fa5c7..00000000000
--- a/src/featbin/compute-spectrogram-feats.cc
+++ /dev/null
@@ -1,158 +0,0 @@
-// featbin/compute-spectrogram-feats.cc
-
-// Copyright 2009-2011  Microsoft Corporation
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "feat/feature-spectrogram.h"
-#include "feat/wave-reader.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    const char *usage =
-        "Create spectrogram feature files.\n"
-        "Usage:  compute-spectrogram-feats [options...] <wav-rspecifier> <feats-wspecifier>\n";
-
-    // construct all the global objects
-    ParseOptions po(usage);
-    SpectrogramOptions spec_opts;
-    bool subtract_mean = false;
-    int32 channel = -1;
-    BaseFloat min_duration = 0.0;
-    // Define defaults for gobal options
-    std::string output_format = "kaldi";
-
-    // Register the option struct
-    spec_opts.Register(&po);
-    // Register the options
-    po.Register("output-format", &output_format, "Format of the output files [kaldi, htk]");
-    po.Register("subtract-mean", &subtract_mean, "Subtract mean of each feature file [CMS]; not recommended to do it this way. ");
-    po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right)");
-    po.Register("min-duration", &min_duration, "Minimum duration of segments to process (in seconds).");
-
-    // OPTION PARSING ..........................................................
-    //
-
-    // parse options (+filling the registered variables)
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string wav_rspecifier = po.GetArg(1);
-
-    std::string output_wspecifier = po.GetArg(2);
-
-    Spectrogram spec(spec_opts);
-
-    SequentialTableReader<WaveHolder> reader(wav_rspecifier);
-    BaseFloatMatrixWriter kaldi_writer;  // typedef to TableWriter<something>.
-    TableWriter<HtkMatrixHolder> htk_writer;
-
-    if (output_format == "kaldi") {
-      if (!kaldi_writer.Open(output_wspecifier))
-        KALDI_ERR << "Could not initialize output with wspecifier "
-                  << output_wspecifier;
-    } else if (output_format == "htk") {
-      if (!htk_writer.Open(output_wspecifier))
-        KALDI_ERR << "Could not initialize output with wspecifier "
-                  << output_wspecifier;
-    } else {
-      KALDI_ERR << "Invalid output_format string " << output_format;
-    }
-
-    int32 num_utts = 0, num_success = 0;
-    for (; !reader.Done(); reader.Next()) {
-      num_utts++;
-      std::string utt = reader.Key();
-      const WaveData &wave_data = reader.Value();
-      if (wave_data.Duration() < min_duration) {
-        KALDI_WARN << "File: " << utt << " is too short ("
-                   << wave_data.Duration() << " sec): producing no output.";
-        continue;
-      }
-      int32 num_chan = wave_data.Data().NumRows(), this_chan = channel;
-      {  // This block works out the channel (0=left, 1=right...)
-        KALDI_ASSERT(num_chan > 0);  // should have been caught in
-        // reading code if no channels.
-        if (channel == -1) {
-          this_chan = 0;
-          if (num_chan != 1)
-            KALDI_WARN << "Channel not specified but you have data with "
-                       << num_chan  << " channels; defaulting to zero";
-        } else {
-          if (this_chan >= num_chan) {
-            KALDI_WARN << "File with id " << utt << " has "
-                       << num_chan << " channels but you specified channel "
-                       << channel << ", producing no output.";
-            continue;
-          }
-        }
-      }
-
-      SubVector<BaseFloat> waveform(wave_data.Data(), this_chan);
-      Matrix<BaseFloat> features;
-      try {
-        spec.ComputeFeatures(waveform, wave_data.SampFreq(), 1.0, &features);
-      } catch (...) {
-        KALDI_WARN << "Failed to compute features for utterance "
-                   << utt;
-        continue;
-      }
-      if (subtract_mean) {
-        Vector<BaseFloat> mean(features.NumCols());
-        mean.AddRowSumMat(1.0, features);
-        mean.Scale(1.0 / features.NumRows());
-        for (int32 i = 0; i < features.NumRows(); i++)
-          features.Row(i).AddVec(-1.0, mean);
-      }
-      if (output_format == "kaldi") {
-        kaldi_writer.Write(utt, features);
-      } else {
-        std::pair<Matrix<BaseFloat>, HtkHeader> p;
-        p.first.Resize(features.NumRows(), features.NumCols());
-        p.first.CopyFromMat(features);
-        int32 frame_shift = spec_opts.frame_opts.frame_shift_ms * 10000;
-        HtkHeader header = {
-          features.NumRows(),
-          frame_shift,
-          static_cast<int16>(sizeof(float)*features.NumCols()),
-          007 | 020000
-        };
-        p.second = header;
-        htk_writer.Write(utt, p);
-      }
-      if(num_utts % 10 == 0)
-        KALDI_LOG << "Processed " << num_utts << " utterances";
-      KALDI_VLOG(2) << "Processed features for key " << utt;
-      num_success++;
-    }
-    KALDI_LOG << " Done " << num_success << " out of " << num_utts
-              << " utterances.";
-    return (num_success != 0 ? 0 : 1);
-  } catch(const std::exception& e) {
-    std::cerr << e.what();
-    return -1;
-  }
-  return 0;
-}
-
diff --git a/src/fgmmbin/fgmm-global-info.cc b/src/fgmmbin/fgmm-global-info.cc
index e00384fe13f..867db3bdc50 100644
--- a/src/fgmmbin/fgmm-global-info.cc
+++ b/src/fgmmbin/fgmm-global-info.cc
@@ -20,7 +20,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/full-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 int main(int argc, char *argv[]) {
   try {
diff --git a/src/fgmmbin/fgmm-gselect.cc b/src/fgmmbin/fgmm-gselect.cc
index ab36af74275..3d962972127 100644
--- a/src/fgmmbin/fgmm-gselect.cc
+++ b/src/fgmmbin/fgmm-gselect.cc
@@ -21,7 +21,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/full-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 int main(int argc, char *argv[]) {
   try {
diff --git a/src/gmm/Makefile b/src/gmm/Makefile
index caee6734afe..9b770bb4845 100644
--- a/src/gmm/Makefile
+++ b/src/gmm/Makefile
@@ -9,13 +9,13 @@ TESTFILES = diag-gmm-test mle-diag-gmm-test full-gmm-test mle-full-gmm-test \
 
 OBJFILES = diag-gmm.o diag-gmm-normal.o mle-diag-gmm.o am-diag-gmm.o \
            mle-am-diag-gmm.o full-gmm.o full-gmm-normal.o mle-full-gmm.o \
-					 model-common.o decodable-am-diag-gmm.o model-test-common.o \
-					 ebw-diag-gmm.o indirect-diff-diag-gmm.o
+ 		   model-common.o decodable-am-diag-gmm.o model-test-common.o \
+		 ebw-diag-gmm.o indirect-diff-diag-gmm.o
 
 LIBNAME = kaldi-gmm
 
 ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
-          ../base/kaldi-base.a 
+          ../base/kaldi-base.a
 
 
 
diff --git a/src/gmm/decodable-am-diag-gmm.h b/src/gmm/decodable-am-diag-gmm.h
index 745b4f61b14..f2e03005708 100644
--- a/src/gmm/decodable-am-diag-gmm.h
+++ b/src/gmm/decodable-am-diag-gmm.h
@@ -26,11 +26,9 @@
 
 #include "base/kaldi-common.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "itf/decodable-itf.h"
-#include "transform/regression-tree.h"
-#include "transform/regtree-fmllr-diag-gmm.h"
-#include "transform/regtree-mllr-diag-gmm.h"
+
 
 namespace kaldi {
 
@@ -46,13 +44,13 @@ class DecodableAmDiagGmmUnmapped : public DecodableInterface {
  public:
   /// If you set log_sum_exp_prune to a value greater than 0 it will prune
   /// in the LogSumExp operation (larger = more exact); I suggest 5.
-  /// This is advisable if it's spending a long time doing exp 
-  /// operations. 
+  /// This is advisable if it's spending a long time doing exp
+  /// operations.
   DecodableAmDiagGmmUnmapped(const AmDiagGmm &am,
                              const Matrix<BaseFloat> &feats,
                              BaseFloat log_sum_exp_prune = -1.0):
     acoustic_model_(am), feature_matrix_(feats),
-    previous_frame_(-1), log_sum_exp_prune_(log_sum_exp_prune), 
+    previous_frame_(-1), log_sum_exp_prune_(log_sum_exp_prune),
     data_squared_(feats.NumCols()) {
     ResetLogLikeCache();
   }
@@ -63,7 +61,7 @@ class DecodableAmDiagGmmUnmapped : public DecodableInterface {
     return LogLikelihoodZeroBased(frame, state_index - 1);
   }
   virtual int32 NumFramesReady() const { return feature_matrix_.NumRows(); }
-  
+
   // Indices are one-based!  This is for compatibility with OpenFst.
   virtual int32 NumIndices() const { return acoustic_model_.NumPdfs(); }
 
@@ -98,7 +96,7 @@ class DecodableAmDiagGmmUnmapped : public DecodableInterface {
 class DecodableAmDiagGmm: public DecodableAmDiagGmmUnmapped {
  public:
   DecodableAmDiagGmm(const AmDiagGmm &am,
-                     const TransitionModel &tm,
+                     const Transitions &tm,
                      const Matrix<BaseFloat> &feats,
                      BaseFloat log_sum_exp_prune = -1.0)
     : DecodableAmDiagGmmUnmapped(am, feats, log_sum_exp_prune),
@@ -107,21 +105,21 @@ class DecodableAmDiagGmm: public DecodableAmDiagGmmUnmapped {
   // Note, frames are numbered from zero.
   virtual BaseFloat LogLikelihood(int32 frame, int32 tid) {
     return LogLikelihoodZeroBased(frame,
-                                  trans_model_.TransitionIdToPdf(tid));
+                                  trans_model_.TransitionIdToPdfFast(tid));
   }
   // Indices are one-based!  This is for compatibility with OpenFst.
   virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); }
 
-  const TransitionModel *TransModel() { return &trans_model_; }
+  const Transitions *TransModel() { return &trans_model_; }
  private: // want to access public to have pdf id information
-  const TransitionModel &trans_model_;  // for tid to pdf mapping
+  const Transitions &trans_model_;  // for tid to pdf mapping
   KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmDiagGmm);
 };
 
 class DecodableAmDiagGmmScaled: public DecodableAmDiagGmmUnmapped {
  public:
   DecodableAmDiagGmmScaled(const AmDiagGmm &am,
-                           const TransitionModel &tm,
+                           const Transitions &tm,
                            const Matrix<BaseFloat> &feats,
                            BaseFloat scale,
                            BaseFloat log_sum_exp_prune = -1.0):
@@ -131,7 +129,7 @@ class DecodableAmDiagGmmScaled: public DecodableAmDiagGmmUnmapped {
   // This version of the initializer takes ownership of the pointer
   // "feats" and will delete it when this class is destroyed.
   DecodableAmDiagGmmScaled(const AmDiagGmm &am,
-                           const TransitionModel &tm,
+                           const Transitions &tm,
                            BaseFloat scale,
                            BaseFloat log_sum_exp_prune,
                            Matrix<BaseFloat> *feats):
@@ -140,20 +138,20 @@ class DecodableAmDiagGmmScaled: public DecodableAmDiagGmmUnmapped {
 
   // Note, frames are numbered from zero but transition-ids from one.
   virtual BaseFloat LogLikelihood(int32 frame, int32 tid) {
-    return scale_*LogLikelihoodZeroBased(frame,
-                                         trans_model_.TransitionIdToPdf(tid));
+    return scale_ * LogLikelihoodZeroBased(
+        frame, trans_model_.TransitionIdToPdfFast(tid));
   }
   // Indices are one-based!  This is for compatibility with OpenFst.
   virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); }
 
-  const TransitionModel *TransModel() { return &trans_model_; }
+  const Transitions *TransModel() { return &trans_model_; }
 
   virtual ~DecodableAmDiagGmmScaled() {
     delete delete_feats_;
   }
-  
+
  private: // want to access it public to have pdf id information
-  const TransitionModel &trans_model_;  // for transition-id to pdf mapping
+  const Transitions &trans_model_;  // for transition-id to pdf mapping
   BaseFloat scale_;
   Matrix<BaseFloat> *delete_feats_;
   KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmDiagGmmScaled);
diff --git a/src/gmmbin/Makefile b/src/gmmbin/Makefile
index 82d10abe9ce..1e926e88432 100644
--- a/src/gmmbin/Makefile
+++ b/src/gmmbin/Makefile
@@ -6,25 +6,24 @@ include ../kaldi.mk
 
 BINFILES = gmm-init-mono gmm-est gmm-acc-stats-ali gmm-align \
            gmm-decode-faster gmm-decode-simple gmm-align-compiled \
-           gmm-sum-accs gmm-est-regtree-fmllr gmm-acc-stats-twofeats \
+           gmm-sum-accs gmm-acc-stats-twofeats \
            gmm-acc-stats gmm-init-lvtln gmm-est-lvtln-trans gmm-train-lvtln-special \
            gmm-acc-mllt gmm-mixup gmm-init-model gmm-transform-means \
-           gmm-make-regtree gmm-decode-faster-regtree-fmllr gmm-post-to-gpost \
-           gmm-est-fmllr-gpost gmm-est-fmllr gmm-est-regtree-fmllr-ali \
-           gmm-est-regtree-mllr gmm-compute-likes \
-           gmm-decode-faster-regtree-mllr gmm-latgen-simple \
+           gmm-post-to-gpost \
+           gmm-est-fmllr-gpost gmm-est-fmllr gmm-compute-likes \
+           gmm-latgen-simple \
            gmm-rescore-lattice gmm-decode-biglm-faster \
            gmm-est-gaussians-ebw gmm-est-weights-ebw gmm-latgen-faster gmm-copy \
            gmm-global-acc-stats gmm-global-est gmm-global-sum-accs gmm-gselect \
            gmm-latgen-biglm-faster gmm-ismooth-stats gmm-global-get-frame-likes \
            gmm-global-est-fmllr gmm-global-to-fgmm gmm-global-acc-stats-twofeats \
-           gmm-global-copy gmm-fmpe-acc-stats gmm-acc-stats2 gmm-init-model-flat gmm-info \
+           gmm-global-copy gmm-acc-stats2 gmm-init-model-flat gmm-info \
            gmm-get-stats-deriv gmm-est-rescale gmm-boost-silence \
            gmm-basis-fmllr-accs gmm-basis-fmllr-training gmm-est-basis-fmllr \
            gmm-est-map gmm-adapt-map gmm-latgen-map gmm-basis-fmllr-accs-gpost \
            gmm-est-basis-fmllr-gpost gmm-latgen-faster-parallel \
-           gmm-est-fmllr-raw gmm-est-fmllr-raw-gpost gmm-global-init-from-feats \
-           gmm-global-info gmm-latgen-faster-regtree-fmllr gmm-est-fmllr-global \
+           gmm-global-init-from-feats \
+           gmm-global-info gmm-est-fmllr-global \
            gmm-acc-mllt-global gmm-transform-means-global gmm-global-get-post \
            gmm-global-gselect-to-post gmm-global-est-lvtln-trans gmm-init-biphone
 
@@ -38,7 +37,7 @@ ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a \
           ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
           ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
-          ../base/kaldi-base.a 
+          ../base/kaldi-base.a
 
 
 include ../makefiles/default_rules.mk
diff --git a/src/gmmbin/gmm-acc-mllt-global.cc b/src/gmmbin/gmm-acc-mllt-global.cc
index bed91c053d3..ac3ec2237c9 100644
--- a/src/gmmbin/gmm-acc-mllt-global.cc
+++ b/src/gmmbin/gmm-acc-mllt-global.cc
@@ -23,7 +23,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "transform/mllt.h"
 #include "hmm/posterior.h"
 
diff --git a/src/gmmbin/gmm-acc-mllt.cc b/src/gmmbin/gmm-acc-mllt.cc
index 6e57f082a62..be0d501b3f5 100644
--- a/src/gmmbin/gmm-acc-mllt.cc
+++ b/src/gmmbin/gmm-acc-mllt.cc
@@ -22,7 +22,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "transform/mllt.h"
 #include "hmm/posterior.h"
 
@@ -58,7 +58,7 @@ int main(int argc, char *argv[]) {
     typedef kaldi::int32 int32;
 
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary;
       Input ki(model_filename, &binary);
diff --git a/src/gmmbin/gmm-acc-stats-ali.cc b/src/gmmbin/gmm-acc-stats-ali.cc
index 5552d45738e..b20212b4771 100644
--- a/src/gmmbin/gmm-acc-stats-ali.cc
+++ b/src/gmmbin/gmm-acc-stats-ali.cc
@@ -21,7 +21,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "gmm/mle-am-diag-gmm.h"
 
 
@@ -53,7 +53,7 @@ int main(int argc, char *argv[]) {
         accs_wxfilename = po.GetArg(4);
 
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary;
       Input ki(model_filename, &binary);
diff --git a/src/gmmbin/gmm-acc-stats-twofeats.cc b/src/gmmbin/gmm-acc-stats-twofeats.cc
index 05f94ff5ef6..3bae910233b 100644
--- a/src/gmmbin/gmm-acc-stats-twofeats.cc
+++ b/src/gmmbin/gmm-acc-stats-twofeats.cc
@@ -23,7 +23,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "gmm/mle-am-diag-gmm.h"
 #include "hmm/posterior.h"
 
@@ -59,7 +59,7 @@ int main(int argc, char *argv[]) {
     typedef kaldi::int32 int32;
 
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary;
       Input ki(model_filename, &binary);
diff --git a/src/gmmbin/gmm-acc-stats.cc b/src/gmmbin/gmm-acc-stats.cc
index e213fffdeff..beeee8ec758 100644
--- a/src/gmmbin/gmm-acc-stats.cc
+++ b/src/gmmbin/gmm-acc-stats.cc
@@ -22,7 +22,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "gmm/mle-am-diag-gmm.h"
 #include "hmm/posterior.h"
 
@@ -59,7 +59,7 @@ int main(int argc, char *argv[]) {
 
 
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary;
       Input ki(model_filename, &binary);
diff --git a/src/gmmbin/gmm-acc-stats2.cc b/src/gmmbin/gmm-acc-stats2.cc
index 70730c8ca7d..30f3ff80e10 100644
--- a/src/gmmbin/gmm-acc-stats2.cc
+++ b/src/gmmbin/gmm-acc-stats2.cc
@@ -21,7 +21,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "gmm/mle-am-diag-gmm.h"
 #include "hmm/posterior.h"
 
@@ -62,7 +62,7 @@ int main(int argc, char *argv[]) {
 
     
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary;
       Input ki(model_rxfilename, &binary);
diff --git a/src/gmmbin/gmm-adapt-map.cc b/src/gmmbin/gmm-adapt-map.cc
index ec3eb8cea9b..30fbc1e8d73 100644
--- a/src/gmmbin/gmm-adapt-map.cc
+++ b/src/gmmbin/gmm-adapt-map.cc
@@ -25,7 +25,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "gmm/mle-am-diag-gmm.h"
 #include "hmm/posterior.h"
 
@@ -72,7 +72,7 @@ int main(int argc, char *argv[]) {
     MapAmDiagGmmWriter map_am_writer(map_am_wspecifier);
 
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary;
       Input is(model_filename, &binary);
diff --git a/src/gmmbin/gmm-align-compiled.cc b/src/gmmbin/gmm-align-compiled.cc
index 36349774773..02beb372b60 100644
--- a/src/gmmbin/gmm-align-compiled.cc
+++ b/src/gmmbin/gmm-align-compiled.cc
@@ -22,7 +22,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/decoder-wrappers.h"
@@ -77,7 +77,7 @@ int main(int argc, char *argv[]) {
         alignment_wspecifier = po.GetArg(4),
         scores_wspecifier = po.GetOptArg(5);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
       bool binary;
diff --git a/src/gmmbin/gmm-align.cc b/src/gmmbin/gmm-align.cc
index c9c2fde11f6..e84a90cdb9a 100644
--- a/src/gmmbin/gmm-align.cc
+++ b/src/gmmbin/gmm-align.cc
@@ -21,7 +21,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-utils.h"
 #include "decoder/decoder-wrappers.h"
 #include "decoder/training-graph-compiler.h"
@@ -73,7 +73,7 @@ int main(int argc, char *argv[]) {
     ContextDependency ctx_dep;
     ReadKaldiObject(tree_in_filename, &ctx_dep);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
       bool binary;
diff --git a/src/gmmbin/gmm-basis-fmllr-accs-gpost.cc b/src/gmmbin/gmm-basis-fmllr-accs-gpost.cc
index f8f7b5d3433..9001b64ae82 100644
--- a/src/gmmbin/gmm-basis-fmllr-accs-gpost.cc
+++ b/src/gmmbin/gmm-basis-fmllr-accs-gpost.cc
@@ -26,7 +26,7 @@ using std::vector;
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "transform/fmllr-diag-gmm.h"
 #include "transform/basis-fmllr-diag-gmm.h"
 #include "hmm/posterior.h"
@@ -34,7 +34,7 @@ using std::vector;
 namespace kaldi {
 void AccumulateForUtterance(const Matrix<BaseFloat> &feats,
                             const GaussPost &gpost,
-                            const TransitionModel &trans_model,
+                            const Transitions &trans_model,
                             const AmDiagGmm &am_gmm,
                             FmllrDiagGmmAccs *spk_stats) {
   for (size_t i = 0; i < gpost.size(); i++) {
@@ -81,7 +81,7 @@ int main(int argc, char *argv[]) {
         gpost_rspecifier = po.GetArg(3),
         accs_wspecifier = po.GetArg(4);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
       bool binary;
diff --git a/src/gmmbin/gmm-basis-fmllr-accs.cc b/src/gmmbin/gmm-basis-fmllr-accs.cc
index 58b365318f0..d78d652dfc5 100644
--- a/src/gmmbin/gmm-basis-fmllr-accs.cc
+++ b/src/gmmbin/gmm-basis-fmllr-accs.cc
@@ -26,7 +26,7 @@ using std::vector;
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "transform/fmllr-diag-gmm.h"
 #include "transform/basis-fmllr-diag-gmm.h"
 #include "hmm/posterior.h"
@@ -34,7 +34,7 @@ using std::vector;
 namespace kaldi {
 void AccumulateForUtterance(const Matrix<BaseFloat> &feats,
                             const Posterior &post,
-                            const TransitionModel &trans_model,
+                            const Transitions &trans_model,
                             const AmDiagGmm &am_gmm,
                             FmllrDiagGmmAccs *spk_stats) {
   Posterior pdf_post;
@@ -82,7 +82,7 @@ int main(int argc, char *argv[]) {
         post_rspecifier = po.GetArg(3),
         accs_wspecifier = po.GetArg(4);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
       bool binary;
diff --git a/src/gmmbin/gmm-basis-fmllr-training.cc b/src/gmmbin/gmm-basis-fmllr-training.cc
index 3d93c3ca877..d433f6903f6 100644
--- a/src/gmmbin/gmm-basis-fmllr-training.cc
+++ b/src/gmmbin/gmm-basis-fmllr-training.cc
@@ -25,7 +25,7 @@ using std::vector;
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "transform/fmllr-diag-gmm.h"
 #include "transform/basis-fmllr-diag-gmm.h"
 
@@ -53,7 +53,7 @@ int main(int argc, char *argv[]) {
         model_rxfilename = po.GetArg(1),
         basis_wspecifier = po.GetArg(2);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
       bool binary;
diff --git a/src/gmmbin/gmm-boost-silence.cc b/src/gmmbin/gmm-boost-silence.cc
index 7c9e4c82806..ef57f1190cb 100644
--- a/src/gmmbin/gmm-boost-silence.cc
+++ b/src/gmmbin/gmm-boost-silence.cc
@@ -19,7 +19,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "gmm/am-diag-gmm.h"
 
 int main(int argc, char *argv[]) {
@@ -67,7 +67,7 @@ int main(int argc, char *argv[]) {
     }
     
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary_read;
       Input ki(model_rxfilename, &binary_read);
diff --git a/src/gmmbin/gmm-compute-likes.cc b/src/gmmbin/gmm-compute-likes.cc
index 78c813e1c3b..c7101f1a9ae 100644
--- a/src/gmmbin/gmm-compute-likes.cc
+++ b/src/gmmbin/gmm-compute-likes.cc
@@ -21,7 +21,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "base/timer.h"
 
@@ -55,7 +55,7 @@ int main(int argc, char *argv[]) {
     AmDiagGmm am_gmm;
     {
       bool binary;
-      TransitionModel trans_model;  // not needed.
+      Transitions trans_model;  // not needed.
       Input ki(model_in_filename, &binary);
       trans_model.Read(ki.Stream(), binary);
       am_gmm.Read(ki.Stream(), binary);
diff --git a/src/gmmbin/gmm-copy.cc b/src/gmmbin/gmm-copy.cc
index 0b33bc6d67f..bd42aeb2a25 100644
--- a/src/gmmbin/gmm-copy.cc
+++ b/src/gmmbin/gmm-copy.cc
@@ -20,7 +20,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 int main(int argc, char *argv[]) {
   try {
@@ -54,7 +54,7 @@ int main(int argc, char *argv[]) {
         model_out_filename = po.GetArg(2);
 
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary_read;
       Input ki(model_in_filename, &binary_read);
diff --git a/src/gmmbin/gmm-decode-biglm-faster.cc b/src/gmmbin/gmm-decode-biglm-faster.cc
index 6e47d68de3c..9e7845e7849 100644
--- a/src/gmmbin/gmm-decode-biglm-faster.cc
+++ b/src/gmmbin/gmm-decode-biglm-faster.cc
@@ -21,7 +21,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/biglm-faster-decoder.h"
 #include "gmm/decodable-am-diag-gmm.h"
@@ -111,7 +111,7 @@ int main(int argc, char *argv[])
         alignment_wspecifier = po.GetOptArg(7),
         lattice_wspecifier = po.GetOptArg(8);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
       bool binary;
diff --git a/src/gmmbin/gmm-decode-faster-regtree-fmllr.cc b/src/gmmbin/gmm-decode-faster-regtree-fmllr.cc
deleted file mode 100644
index ca39cbe8cb7..00000000000
--- a/src/gmmbin/gmm-decode-faster-regtree-fmllr.cc
+++ /dev/null
@@ -1,290 +0,0 @@
-// gmmbin/gmm-decode-faster-regtree-fmllr.cc
-
-// Copyright 2009-2012  Microsoft Corporation;  Saarland University;
-//                      Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-#include <vector>
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
-#include "transform/regression-tree.h"
-#include "transform/regtree-fmllr-diag-gmm.h"
-#include "transform/fmllr-diag-gmm.h"
-#include "fstext/fstext-lib.h"
-#include "decoder/faster-decoder.h"
-#include "transform/decodable-am-diag-gmm-regtree.h"
-#include "base/timer.h"
-#include "lat/kaldi-lattice.h" // for {Compact}LatticeArc
-
-using fst::SymbolTable;
-using fst::VectorFst;
-using fst::StdArc;
-using kaldi::BaseFloat;
-using std::string;
-using std::vector;
-using kaldi::LatticeWeight;
-using kaldi::LatticeArc;
-
-struct DecodeInfo {
- public:
-  DecodeInfo(const kaldi::AmDiagGmm &am,
-             const kaldi::TransitionModel &tm, kaldi::FasterDecoder *decoder,
-             BaseFloat scale, bool allow_partial,
-             const kaldi::Int32VectorWriter &wwriter,
-             const kaldi::Int32VectorWriter &awriter, fst::SymbolTable *wsyms)
-      : acoustic_model(am), trans_model(tm), decoder(decoder),
-        acoustic_scale(scale), allow_partial(allow_partial), words_writer(wwriter),
-        alignment_writer(awriter), word_syms(wsyms) {}
-
-  const kaldi::AmDiagGmm &acoustic_model;
-  const kaldi::TransitionModel &trans_model;
-  kaldi::FasterDecoder *decoder;
-  BaseFloat acoustic_scale;
-  bool allow_partial;
-  const kaldi::Int32VectorWriter &words_writer;
-  const kaldi::Int32VectorWriter &alignment_writer;
-  fst::SymbolTable *word_syms;
-
- private:
-  KALDI_DISALLOW_COPY_AND_ASSIGN(DecodeInfo);
-};
-
-bool DecodeUtterance(kaldi::FasterDecoder *decoder,
-                     kaldi::DecodableInterface *decodable,
-                     DecodeInfo *info,
-                     const string &uttid,
-                     int32 num_frames,
-                     BaseFloat *total_like) {
-  decoder->Decode(decodable);
-  KALDI_LOG << "Length of file is " << num_frames;
-
-  VectorFst<LatticeArc> decoded;  // linear FST.
-  if ( (info->allow_partial || decoder->ReachedFinal())
-       && decoder->GetBestPath(&decoded) ) {
-    if (!decoder->ReachedFinal())
-      KALDI_WARN << "Decoder did not reach end-state, outputting partial "
-          "traceback.";
-    
-    vector<kaldi::int32> alignment, words;
-    LatticeWeight weight;
-    GetLinearSymbolSequence(decoded, &alignment, &words, &weight);
-
-    info->words_writer.Write(uttid, words);
-    if (info->alignment_writer.IsOpen())
-      info->alignment_writer.Write(uttid, alignment);
-    if (info->word_syms != NULL) {
-      std::ostringstream ss;
-      ss << uttid << ' ';
-      for (size_t i = 0; i < words.size(); i++) {
-        string s = info->word_syms->Find(words[i]);
-        if (s == "")
-          KALDI_ERR << "Word-id " << words[i] << " not in symbol table.";
-        ss << s << ' ';
-      }
-      ss << '\n';
-      KALDI_LOG << ss.str();
-    }
-
-    BaseFloat like = -weight.Value1() -weight.Value2();
-    KALDI_LOG << "Log-like per frame = " << (like/num_frames);
-    (*total_like) += like;
-    return true;
-  } else {
-    KALDI_WARN << "Did not successfully decode utterance, length = "
-               << num_frames;
-    return false;
-  }
-}
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-
-    const char *usage = "Decode features using GMM-based model.\n"
-              "Usage: gmm-decode-faster-regtree-fmllr [options] model-in fst-in "
-              "regtree-in features-rspecifier transforms-rspecifier "
-              "words-wspecifier [alignments-wspecifier]\n";
-    ParseOptions po(usage);
-    bool binary = true;
-    bool allow_partial = true;
-    BaseFloat acoustic_scale = 0.1;
-    
-    std::string word_syms_filename, utt2spk_rspecifier;
-    FasterDecoderOptions decoder_opts;
-    decoder_opts.Register(&po, true);  // true == include obscure settings.
-    po.Register("utt2spk", &utt2spk_rspecifier, "rspecifier for utterance to "
-                "speaker map");
-    po.Register("binary", &binary, "Write output in binary mode");
-    po.Register("acoustic-scale", &acoustic_scale,
-        "Scaling factor for acoustic likelihoods");
-    po.Register("word-symbol-table", &word_syms_filename,
-        "Symbol table for words [for debug output]");
-    po.Register("allow-partial", &allow_partial,
-                "Produce output even when final state was not reached");
-    po.Read(argc, argv);
-
-    if (po.NumArgs() < 6 || po.NumArgs() > 7) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_in_filename = po.GetArg(1),
-        fst_in_filename = po.GetArg(2),
-        regtree_filename = po.GetArg(3),
-        feature_rspecifier = po.GetArg(4),
-        xforms_rspecifier = po.GetArg(5),
-        words_wspecifier = po.GetArg(6),
-        alignment_wspecifier = po.GetOptArg(7);
-
-    TransitionModel trans_model;
-    AmDiagGmm am_gmm;
-    {
-      bool binary_read;
-      Input ki(model_in_filename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_gmm.Read(ki.Stream(), binary_read);
-    }
-
-    VectorFst<StdArc> *decode_fst = fst::ReadFstKaldi(fst_in_filename);
-
-    RegressionTree regtree;
-    {
-      bool binary_read;
-      Input in(regtree_filename, &binary_read);
-      regtree.Read(in.Stream(), binary_read, am_gmm);
-    }
-
-    RandomAccessRegtreeFmllrDiagGmmReaderMapped fmllr_reader(xforms_rspecifier,
-                                                             utt2spk_rspecifier);
-
-    Int32VectorWriter words_writer(words_wspecifier);
-
-    Int32VectorWriter alignment_writer(alignment_wspecifier);
-
-    fst::SymbolTable *word_syms = NULL;
-    if (word_syms_filename != "") {
-      word_syms = fst::SymbolTable::ReadText(word_syms_filename);
-      if (!word_syms) {
-        KALDI_ERR << "Could not read symbol table from file "
-            << word_syms_filename;
-      }
-    }
-
-    BaseFloat tot_like = 0.0;
-    kaldi::int64 frame_count = 0;
-    int num_success = 0, num_fail = 0;
-    FasterDecoder decoder(*decode_fst, decoder_opts);
-
-    Timer timer;
-
-    DecodeInfo decode_info(am_gmm, trans_model, &decoder, acoustic_scale,
-                           allow_partial, words_writer, alignment_writer,
-                           word_syms);
-
-    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    for (; !feature_reader.Done(); feature_reader.Next()) {
-      string utt = feature_reader.Key();
-
-      Matrix<BaseFloat> features(feature_reader.Value());
-      feature_reader.FreeCurrent();
-      if (features.NumRows() == 0) {
-        KALDI_WARN << "Zero-length utterance: " << utt;
-        num_fail++;
-        continue;
-      }
-
-      if (!fmllr_reader.HasKey(utt)) {  // Decode without FMLLR if none found
-        KALDI_WARN << "No FMLLR transform for key " << utt <<
-            ", decoding without fMLLR.";
-        kaldi::DecodableAmDiagGmmScaled gmm_decodable(am_gmm, trans_model,
-                                                      features,
-                                                      acoustic_scale);
-        if (DecodeUtterance(&decoder, &gmm_decodable, &decode_info,
-                            utt, features.NumRows(), &tot_like)) {
-          frame_count += gmm_decodable.NumFramesReady();
-          num_success++;
-        } else {
-          num_fail++;
-        }
-        continue;
-      }
-
-      // If found, load the transforms for the current utterance.
-      RegtreeFmllrDiagGmm fmllr(fmllr_reader.Value(utt));
-      if (fmllr.NumRegClasses() == 1) {
-        Matrix<BaseFloat> xformed_features(features);
-        Matrix<BaseFloat> fmllr_matrix;
-        fmllr.GetXformMatrix(0, &fmllr_matrix);
-        for (int32 i = 0; i < xformed_features.NumRows(); i++) {
-          SubVector<BaseFloat> row(xformed_features, i);
-          ApplyAffineTransform(fmllr_matrix, &row);
-        }
-        kaldi::DecodableAmDiagGmmScaled gmm_decodable(am_gmm, trans_model,
-                                                      xformed_features,
-                                                      acoustic_scale);
-
-        if (DecodeUtterance(&decoder, &gmm_decodable, &decode_info,
-                            utt, xformed_features.NumRows(), &tot_like)) {
-          frame_count += gmm_decodable.NumFramesReady();
-          num_success++;
-        } else {
-          num_fail++;
-        }
-      } else {
-        kaldi::DecodableAmDiagGmmRegtreeFmllr gmm_decodable(am_gmm, trans_model,
-                                                            features, fmllr,
-                                                            regtree,
-                                                            acoustic_scale);
-        if (DecodeUtterance(&decoder, &gmm_decodable, &decode_info,
-                            utt, features.NumRows(), &tot_like)) {
-          frame_count += gmm_decodable.NumFramesReady();
-          num_success++;
-        } else {
-          num_fail++;
-        }
-      }
-    }  // end looping over all utterances
-
-    KALDI_LOG << "Average log-likelihood per frame is " << (tot_like
-                                                            / frame_count) << " over " << frame_count << " frames.";
-
-    double elapsed = timer.Elapsed();
-    KALDI_LOG << "Time taken [excluding initialization] " << elapsed
-              << "s: real-time factor assuming 100 frames/sec is "
-              << (elapsed * 100.0 / frame_count);
-    KALDI_LOG << "Done " << num_success << " utterances, failed for "
-              << num_fail;
-
-    delete word_syms;
-    delete decode_fst;
-    if (num_success != 0)
-      return 0;
-    else
-      return 1;
-  }
-  catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/gmmbin/gmm-decode-faster-regtree-mllr.cc b/src/gmmbin/gmm-decode-faster-regtree-mllr.cc
deleted file mode 100644
index 9a5d9486b9f..00000000000
--- a/src/gmmbin/gmm-decode-faster-regtree-mllr.cc
+++ /dev/null
@@ -1,267 +0,0 @@
-// gmmbin/gmm-decode-faster-regtree-mllr.cc
-
-// Copyright 2009-2013  Microsoft Corporation;  Saarland University;
-//                      Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-#include <vector>
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
-#include "transform/regression-tree.h"
-#include "transform/regtree-mllr-diag-gmm.h"
-#include "fstext/fstext-lib.h"
-#include "decoder/faster-decoder.h"
-#include "transform/decodable-am-diag-gmm-regtree.h"
-#include "base/timer.h"
-#include "lat/kaldi-lattice.h" // for {Compact}LatticeArc
-
-using fst::SymbolTable;
-using fst::VectorFst;
-using fst::StdArc;
-using kaldi::BaseFloat;
-using std::string;
-using std::vector;
-using kaldi::LatticeWeight;
-using kaldi::LatticeArc;
-
-struct DecodeInfo {
- public:
-  DecodeInfo(const kaldi::AmDiagGmm &am,
-             const kaldi::TransitionModel &tm, kaldi::FasterDecoder *decoder,
-             BaseFloat scale, bool allow_partial,
-             const kaldi::Int32VectorWriter &wwriter,
-             const kaldi::Int32VectorWriter &awriter, fst::SymbolTable *wsyms)
-      : acoustic_model(am), trans_model(tm), decoder(decoder),
-        acoustic_scale(scale), allow_partial(allow_partial), words_writer(wwriter),
-        alignment_writer(awriter), word_syms(wsyms) {}
-
-  const kaldi::AmDiagGmm &acoustic_model;
-  const kaldi::TransitionModel &trans_model;
-  kaldi::FasterDecoder *decoder;
-  BaseFloat acoustic_scale;
-  bool allow_partial;
-  const kaldi::Int32VectorWriter &words_writer;
-  const kaldi::Int32VectorWriter &alignment_writer;
-  fst::SymbolTable *word_syms;
-
- private:
-  KALDI_DISALLOW_COPY_AND_ASSIGN(DecodeInfo);
-};
-
-bool DecodeUtterance(kaldi::FasterDecoder *decoder,
-                     kaldi::DecodableInterface *decodable,
-                     DecodeInfo *info,
-                     const string &uttid,
-                     int32 num_frames,
-                     BaseFloat *total_like) {
-  decoder->Decode(decodable);
-  KALDI_LOG << "Length of file is " << num_frames;;
-
-  VectorFst<LatticeArc> decoded;  // linear FST.
-  if ( (info->allow_partial || decoder->ReachedFinal())
-       && decoder->GetBestPath(&decoded) ) {
-    if (!decoder->ReachedFinal())
-      KALDI_WARN << "Decoder did not reach end-state, outputting partial "
-          "traceback.";
-    
-    vector<kaldi::int32> alignment, words;
-    LatticeWeight weight;
-    GetLinearSymbolSequence(decoded, &alignment, &words, &weight);
-
-    info->words_writer.Write(uttid, words);
-    if (info->alignment_writer.IsOpen())
-      info->alignment_writer.Write(uttid, alignment);
-    if (info->word_syms != NULL) {
-      std::ostringstream ss;
-      ss << uttid << ' ';
-      for (size_t i = 0; i < words.size(); i++) {
-        string s = info->word_syms->Find(words[i]);
-        if (s == "")
-          KALDI_ERR << "Word-id " << words[i] << " not in symbol table.";
-        ss << s << ' ';
-      }
-      ss << '\n';
-      KALDI_LOG << ss.str();
-    }
-
-    BaseFloat like = -weight.Value1() -weight.Value2();
-    KALDI_LOG << "Log-like per frame = " << (like/num_frames);
-    (*total_like) += like;
-    return true;
-  } else {
-    KALDI_WARN << "Did not successfully decode utterance, length = "
-               << num_frames;
-    return false;
-  }
-}
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-
-    const char *usage = "Decode features using GMM-based model.\n"
-              "Usage: gmm-decode-faster-regtree-mllr [options] model-in fst-in "
-              "regtree-in features-rspecifier transforms-rspecifier "
-              "words-wspecifier [alignments-wspecifier]\n";
-    ParseOptions po(usage);
-    bool binary = true;
-    bool allow_partial = true;
-    BaseFloat acoustic_scale = 0.1;
-    
-    std::string word_syms_filename, utt2spk_rspecifier;
-    FasterDecoderOptions decoder_opts;
-    decoder_opts.Register(&po, true);  // true == include obscure settings.
-    po.Register("utt2spk", &utt2spk_rspecifier, "rspecifier for utterance to "
-                "speaker map");
-    po.Register("binary", &binary, "Write output in binary mode");
-    po.Register("acoustic-scale", &acoustic_scale,
-        "Scaling factor for acoustic likelihoods");
-    po.Register("word-symbol-table", &word_syms_filename,
-        "Symbol table for words [for debug output]");
-    po.Register("allow-partial", &allow_partial,
-                "Produce output even when final state was not reached");
-    po.Read(argc, argv);
-
-    if (po.NumArgs() < 6 || po.NumArgs() > 7) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_in_filename = po.GetArg(1),
-        fst_in_filename = po.GetArg(2),
-        regtree_filename = po.GetArg(3),
-        feature_rspecifier = po.GetArg(4),
-        xforms_rspecifier = po.GetArg(5),
-        words_wspecifier = po.GetArg(6),
-        alignment_wspecifier = po.GetOptArg(7);
-
-    TransitionModel trans_model;
-    AmDiagGmm am_gmm;
-    {
-      bool binary_read;
-      Input ki(model_in_filename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_gmm.Read(ki.Stream(), binary_read);
-    }
-
-    VectorFst<StdArc> *decode_fst = fst::ReadFstKaldi(fst_in_filename);
-
-    RegressionTree regtree;
-    {
-      bool binary_read;
-      Input in(regtree_filename, &binary_read);
-      regtree.Read(in.Stream(), binary_read, am_gmm);
-    }
-
-    RandomAccessRegtreeMllrDiagGmmReaderMapped mllr_reader(xforms_rspecifier,
-                                                           utt2spk_rspecifier);
-
-    Int32VectorWriter words_writer(words_wspecifier);
-
-    Int32VectorWriter alignment_writer(alignment_wspecifier);
-
-    fst::SymbolTable *word_syms = NULL;
-    if (word_syms_filename != "") {
-      word_syms = fst::SymbolTable::ReadText(word_syms_filename);
-      if (!word_syms) {
-        KALDI_ERR << "Could not read symbol table from file "
-            << word_syms_filename;
-      }
-    }
-
-    BaseFloat tot_like = 0.0;
-    kaldi::int64 frame_count = 0;
-    int num_success = 0, num_fail = 0;
-    FasterDecoder decoder(*decode_fst, decoder_opts);
-
-    Timer timer;
-
-    DecodeInfo decode_info(am_gmm, trans_model, &decoder, acoustic_scale,
-                           allow_partial, words_writer, alignment_writer,
-                           word_syms);
-
-    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    for (; !feature_reader.Done(); feature_reader.Next()) {
-      string utt = feature_reader.Key();
-
-      Matrix<BaseFloat> features(feature_reader.Value());
-      feature_reader.FreeCurrent();
-      if (features.NumRows() == 0) {
-        KALDI_WARN << "Zero-length utterance: " << utt;
-        num_fail++;
-        continue;
-      }
-
-      if (!mllr_reader.HasKey(utt)) {  // Decode without MLLR if none found
-        KALDI_WARN << "No MLLR transform for key " << utt <<
-            ", decoding without MLLR.";
-        kaldi::DecodableAmDiagGmmScaled gmm_decodable(am_gmm, trans_model,
-                                                      features,
-                                                      acoustic_scale);
-        if (DecodeUtterance(&decoder, &gmm_decodable, &decode_info,
-                            utt, features.NumRows(), &tot_like)) {
-          frame_count += gmm_decodable.NumFramesReady();
-          num_success++;
-        } else {
-          num_fail++;
-        }
-        continue;
-      }
-
-      // If found, load the transforms for the current utterance.
-      const RegtreeMllrDiagGmm &mllr = mllr_reader.Value(utt);
-      kaldi::DecodableAmDiagGmmRegtreeMllr gmm_decodable(am_gmm, trans_model,
-                                                         features, mllr,
-                                                         regtree,
-                                                         acoustic_scale);
-      if (DecodeUtterance(&decoder, &gmm_decodable, &decode_info,
-                          utt, features.NumRows(), &tot_like)) {
-        frame_count += gmm_decodable.NumFramesReady();
-        num_success++;
-      } else {
-        num_fail++;
-      }
-    }  // end looping over all utterances
-
-    double elapsed = timer.Elapsed();
-    KALDI_LOG << "Time taken [excluding initialization] " << elapsed
-              << "s: real-time factor assuming 100 frames/sec is "
-              << (elapsed * 100.0 / frame_count);
-    KALDI_LOG << "Done " << num_success << " utterances, failed for "
-              << num_fail;
-    KALDI_LOG << "Overall log-likelihood per frame is "
-              << (tot_like / frame_count) << " over " << frame_count
-              << " frames.";
-    
-    delete decode_fst;
-    if (num_success != 0)
-      return 0;
-    else
-      return 1;
-  }
-  catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/gmmbin/gmm-decode-faster.cc b/src/gmmbin/gmm-decode-faster.cc
index 34c4ff2c37e..438e3d9c9d1 100644
--- a/src/gmmbin/gmm-decode-faster.cc
+++ b/src/gmmbin/gmm-decode-faster.cc
@@ -22,7 +22,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/faster-decoder.h"
 #include "gmm/decodable-am-diag-gmm.h"
@@ -101,7 +101,7 @@ int main(int argc, char *argv[]) {
         alignment_wspecifier = po.GetOptArg(5),
         lattice_wspecifier = po.GetOptArg(6);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
       bool binary;
diff --git a/src/gmmbin/gmm-decode-simple.cc b/src/gmmbin/gmm-decode-simple.cc
index 5ef35552dc0..ef87585cc1e 100644
--- a/src/gmmbin/gmm-decode-simple.cc
+++ b/src/gmmbin/gmm-decode-simple.cc
@@ -23,7 +23,7 @@
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/simple-decoder.h"
 #include "gmm/decodable-am-diag-gmm.h"
@@ -78,7 +78,7 @@ int main(int argc, char *argv[]) {
         alignment_wspecifier = po.GetOptArg(5),
         lattice_wspecifier = po.GetOptArg(6);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
       bool binary;
diff --git a/src/gmmbin/gmm-est-basis-fmllr-gpost.cc b/src/gmmbin/gmm-est-basis-fmllr-gpost.cc
index 54b92d8aa61..3d864c88086 100644
--- a/src/gmmbin/gmm-est-basis-fmllr-gpost.cc
+++ b/src/gmmbin/gmm-est-basis-fmllr-gpost.cc
@@ -26,7 +26,7 @@ using std::vector;
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "transform/fmllr-diag-gmm.h"
 #include "transform/basis-fmllr-diag-gmm.h"
 #include "hmm/posterior.h"
@@ -34,7 +34,7 @@ using std::vector;
 namespace kaldi {
 void AccumulateForUtterance(const Matrix<BaseFloat> &feats,
                             const GaussPost &gpost,
-                            const TransitionModel &trans_model,
+                            const Transitions &trans_model,
                             const AmDiagGmm &am_gmm,
                             FmllrDiagGmmAccs *spk_stats) {
   for (size_t i = 0; i < gpost.size(); i++) {
@@ -87,7 +87,7 @@ int main(int argc, char *argv[]) {
         gpost_rspecifier = po.GetArg(4),
         trans_wspecifier = po.GetArg(5);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
       bool binary;
diff --git a/src/gmmbin/gmm-est-basis-fmllr.cc b/src/gmmbin/gmm-est-basis-fmllr.cc
index 0d163169ce2..fe64a1b2166 100644
--- a/src/gmmbin/gmm-est-basis-fmllr.cc
+++ b/src/gmmbin/gmm-est-basis-fmllr.cc
@@ -26,7 +26,7 @@ using std::vector;
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "transform/fmllr-diag-gmm.h"
 #include "transform/basis-fmllr-diag-gmm.h"
 #include "hmm/posterior.h"
@@ -34,7 +34,7 @@ using std::vector;
 namespace kaldi {
 void AccumulateForUtterance(const Matrix<BaseFloat> &feats,
                             const Posterior &post,
-                            const TransitionModel &trans_model,
+                            const Transitions &trans_model,
                             const AmDiagGmm &am_gmm,
                             FmllrDiagGmmAccs *spk_stats) {
   Posterior pdf_post;
@@ -89,7 +89,7 @@ int main(int argc, char *argv[]) {
         post_rspecifier = po.GetArg(4),
         trans_wspecifier = po.GetArg(5);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
       bool binary;
diff --git a/src/gmmbin/gmm-est-fmllr-global.cc b/src/gmmbin/gmm-est-fmllr-global.cc
index b3af0780aa5..d167ba25890 100644
--- a/src/gmmbin/gmm-est-fmllr-global.cc
+++ b/src/gmmbin/gmm-est-fmllr-global.cc
@@ -27,7 +27,7 @@ using std::vector;
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "transform/fmllr-diag-gmm.h"
 #include "hmm/posterior.h"
 
diff --git a/src/gmmbin/gmm-est-fmllr-gpost.cc b/src/gmmbin/gmm-est-fmllr-gpost.cc
index d1cae0d7f48..9d830737718 100644
--- a/src/gmmbin/gmm-est-fmllr-gpost.cc
+++ b/src/gmmbin/gmm-est-fmllr-gpost.cc
@@ -27,14 +27,14 @@ using std::vector;
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "transform/fmllr-diag-gmm.h"
 #include "hmm/posterior.h"
 
 namespace kaldi {
 void AccumulateForUtterance(const Matrix<BaseFloat> &feats,
                             const GaussPost &gpost,
-                            const TransitionModel &trans_model,
+                            const Transitions &trans_model,
                             const AmDiagGmm &am_gmm,
                             FmllrDiagGmmAccs *spk_stats) {
   for (size_t i = 0; i < gpost.size(); i++) {
@@ -81,7 +81,7 @@ int main(int argc, char *argv[]) {
         gpost_rspecifier = po.GetArg(3),
         trans_wspecifier = po.GetArg(4);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
       bool binary;
diff --git a/src/gmmbin/gmm-est-fmllr-raw-gpost.cc b/src/gmmbin/gmm-est-fmllr-raw-gpost.cc
deleted file mode 100644
index 1f5a09f233b..00000000000
--- a/src/gmmbin/gmm-est-fmllr-raw-gpost.cc
+++ /dev/null
@@ -1,198 +0,0 @@
-// gmmbin/gmm-est-fmllr-raw-gpost.cc
-
-// Copyright 2013  Johns Hopkins University (author: Daniel Povey)
-//           2014  Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "transform/fmllr-raw.h"
-#include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
-#include "util/common-utils.h"
-#include "hmm/posterior.h"
-
-namespace kaldi {
-
-
-void AccStatsForUtterance(const TransitionModel &trans_model,
-                          const AmDiagGmm &am_gmm,
-                          const GaussPost &gpost,
-                          const Matrix<BaseFloat> &feats,
-                          FmllrRawAccs *accs) {
-  for (size_t t = 0; t < gpost.size(); t++) {
-    for (size_t i = 0; i < gpost[t].size(); i++) {
-      int32 pdf = gpost[t][i].first;
-      const Vector<BaseFloat> &posterior(gpost[t][i].second);      
-      accs->AccumulateFromPosteriors(am_gmm.GetPdf(pdf),
-                                     feats.Row(t), posterior);
-    }
-  }
-}
-
-
-}
-
-int main(int argc, char *argv[]) {
-  try {
-    typedef kaldi::int32 int32;
-    using namespace kaldi;
-    const char *usage =
-        "Estimate fMLLR transforms in the space before splicing and linear transforms\n"
-        "such as LDA+MLLT, but using models in the space transformed by these transforms\n"
-        "Requires the original spliced features, and the full LDA+MLLT (or similar) matrix\n"
-        "including the 'rejected' rows (see the program get-full-lda-mat).  Reads in\n"
-        "Gaussian-level posteriors.\n"
-        "Usage: gmm-est-fmllr-raw-gpost [options] <model-in> <full-lda-mat-in> "
-        "<feature-rspecifier> <gpost-rspecifier> <transform-wspecifier>\n";
-
-
-    int32 raw_feat_dim = 13;
-    ParseOptions po(usage);
-    FmllrRawOptions opts;
-    std::string spk2utt_rspecifier;
-    po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to "
-                "utterance-list map");
-    po.Register("raw-feat-dim", &raw_feat_dim, "Dimension of raw features "
-                "prior to splicing");
-    opts.Register(&po);
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 5) {
-      po.PrintUsage();
-      exit(1);
-    }
-    
-    std::string model_rxfilename = po.GetArg(1),
-        full_lda_mat_rxfilename = po.GetArg(2),
-        feature_rspecifier = po.GetArg(3),
-        gpost_rspecifier = po.GetArg(4),
-        transform_wspecifier = po.GetArg(5);
-
-    AmDiagGmm am_gmm;
-    TransitionModel trans_model;
-    {
-      bool binary;
-      Input ki(model_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_gmm.Read(ki.Stream(), binary);
-    }
-
-    Matrix<BaseFloat> full_lda_mat;
-    ReadKaldiObject(full_lda_mat_rxfilename, &full_lda_mat);
-    
-    RandomAccessGaussPostReader gpost_reader(gpost_rspecifier);
-    BaseFloatMatrixWriter transform_writer(transform_wspecifier);
-    
-    double tot_auxf_impr = 0.0, tot_count = 0.0;
-    
-    int32 num_done = 0, num_err = 0;
-    if (!spk2utt_rspecifier.empty()) { // Adapting per speaker
-      SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
-      RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
-      
-      for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
-        FmllrRawAccs accs(raw_feat_dim, am_gmm.Dim(), full_lda_mat);
-        std::string spk = spk2utt_reader.Key();
-        const std::vector<std::string> &uttlist = spk2utt_reader.Value();
-        for (size_t i = 0; i < uttlist.size(); i++) {
-          std::string utt = uttlist[i];
-          if (!feature_reader.HasKey(utt)) {
-            KALDI_WARN << "Features not found for utterance " << utt;
-            num_err++;
-            continue;
-          }
-          if (!gpost_reader.HasKey(utt)) {
-            KALDI_WARN << "Gaussian-level posteriors not found for utterance " << utt;
-            num_err++;
-            continue;
-          }
-          const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
-          const GaussPost &gpost = gpost_reader.Value(utt);
-          if (static_cast<int32>(gpost.size()) != feats.NumRows()) {
-            KALDI_WARN << "Size mismatch between gposteriors " << gpost.size()
-                       << " and features " << feats.NumRows();
-            num_err++;
-            continue;
-          }
-
-          AccStatsForUtterance(trans_model, am_gmm, gpost, feats, &accs);
-          num_done++;
-        }
-        
-        BaseFloat auxf_impr, count;
-        {
-          Matrix<BaseFloat> transform(raw_feat_dim, raw_feat_dim + 1);
-          transform.SetUnit();
-          accs.Update(opts, &transform, &auxf_impr, &count);
-          transform_writer.Write(spk, transform);
-        }
-        KALDI_LOG << "For speaker " << spk << ", auxf-impr from raw fMLLR is "
-                  << (auxf_impr/count) << " over " << count << " frames.";
-        tot_auxf_impr += auxf_impr;
-        tot_count += count;
-      }
-    } else {  // --spk2utt option not given -> adapt per utterance.
-      SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-      for (; !feature_reader.Done(); feature_reader.Next()) {
-        std::string utt = feature_reader.Key();
-        if (!gpost_reader.HasKey(utt)) {
-          KALDI_WARN << "Gaussian-level posteriors not found for utterance " << utt;
-          num_err++;
-          continue;
-        }
-        const Matrix<BaseFloat> &feats = feature_reader.Value();
-        const GaussPost &gpost = gpost_reader.Value(utt);
-
-        if (static_cast<int32>(gpost.size()) != feats.NumRows()) {
-          KALDI_WARN << "Size mismatch between posteriors " << gpost.size()
-                     << " and features " << feats.NumRows();
-          num_err++;
-          continue;
-        }
-
-        FmllrRawAccs accs(raw_feat_dim, am_gmm.Dim(), full_lda_mat);
-
-        AccStatsForUtterance(trans_model, am_gmm, gpost, feats, &accs);
-        
-        BaseFloat auxf_impr, count;        
-        {
-          Matrix<BaseFloat> transform(raw_feat_dim, raw_feat_dim + 1);
-          transform.SetUnit();
-          accs.Update(opts, &transform, &auxf_impr, &count);
-          transform_writer.Write(utt, transform);
-        }
-        KALDI_LOG << "For utterance " << utt << ", auxf-impr from raw fMLLR is "
-                  << (auxf_impr/count) << " over " << count << " frames.";
-        tot_auxf_impr += auxf_impr;
-        tot_count += count;
-        num_done++;
-      }
-    }
-
-    KALDI_LOG << "Processed " << num_done << " utterances, "
-              << num_err << " had errors.";
-    KALDI_LOG << "Overall raw-fMLLR auxf impr per frame is "
-              << (tot_auxf_impr / tot_count) << " over " << tot_count
-              << " frames.";
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
diff --git a/src/gmmbin/gmm-est-fmllr-raw.cc b/src/gmmbin/gmm-est-fmllr-raw.cc
deleted file mode 100644
index 5e83bfb1fb3..00000000000
--- a/src/gmmbin/gmm-est-fmllr-raw.cc
+++ /dev/null
@@ -1,199 +0,0 @@
-// gmmbin/gmm-est-fmllr-raw.cc
-
-// Copyright 2013  Johns Hopkins University (author: Daniel Povey)
-//           2014  Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "transform/fmllr-raw.h"
-#include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
-#include "util/common-utils.h"
-#include "hmm/posterior.h"
-
-namespace kaldi {
-
-
-void AccStatsForUtterance(const TransitionModel &trans_model,
-                          const AmDiagGmm &am_gmm,
-                          const Posterior &post,
-                          const Matrix<BaseFloat> &feats,
-                          FmllrRawAccs *accs) {
-  Posterior pdf_post;
-  ConvertPosteriorToPdfs(trans_model, post, &pdf_post);
-  for (size_t t = 0; t < post.size(); t++) {
-    for (size_t i = 0; i < pdf_post[t].size(); i++) {
-      int32 pdf = pdf_post[t][i].first;
-      BaseFloat weight = pdf_post[t][i].second;
-      accs->AccumulateForGmm(am_gmm.GetPdf(pdf),
-                             feats.Row(t), weight);
-    }
-  }
-}
-
-
-}
-
-int main(int argc, char *argv[]) {
-  try {
-    typedef kaldi::int32 int32;
-    using namespace kaldi;
-    const char *usage =
-        "Estimate fMLLR transforms in the space before splicing and linear transforms\n"
-        "such as LDA+MLLT, but using models in the space transformed by these transforms\n"
-        "Requires the original spliced features, and the full LDA+MLLT (or similar) matrix\n"
-        "including the 'rejected' rows (see the program get-full-lda-mat)\n"
-        "Usage: gmm-est-fmllr-raw [options] <model-in> <full-lda-mat-in> "
-        "<feature-rspecifier> <post-rspecifier> <transform-wspecifier>\n";
-
-
-    int32 raw_feat_dim = 13;
-    ParseOptions po(usage);
-    FmllrRawOptions opts;
-    std::string spk2utt_rspecifier;
-    po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to "
-                "utterance-list map");
-    po.Register("raw-feat-dim", &raw_feat_dim, "Dimension of raw features "
-                "prior to splicing");
-    opts.Register(&po);
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 5) {
-      po.PrintUsage();
-      exit(1);
-    }
-    
-    std::string model_rxfilename = po.GetArg(1),
-        full_lda_mat_rxfilename = po.GetArg(2),
-        feature_rspecifier = po.GetArg(3),
-        post_rspecifier = po.GetArg(4),
-        transform_wspecifier = po.GetArg(5);
-
-    AmDiagGmm am_gmm;
-    TransitionModel trans_model;
-    {
-      bool binary;
-      Input ki(model_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_gmm.Read(ki.Stream(), binary);
-    }
-
-    Matrix<BaseFloat> full_lda_mat;
-    ReadKaldiObject(full_lda_mat_rxfilename, &full_lda_mat);
-    
-    RandomAccessPosteriorReader post_reader(post_rspecifier);
-    BaseFloatMatrixWriter transform_writer(transform_wspecifier);
-    
-    double tot_auxf_impr = 0.0, tot_count = 0.0;
-    
-    int32 num_done = 0, num_err = 0;
-    if (!spk2utt_rspecifier.empty()) { // Adapting per speaker
-      SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
-      RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
-      
-      for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
-        FmllrRawAccs accs(raw_feat_dim, am_gmm.Dim(), full_lda_mat);
-        std::string spk = spk2utt_reader.Key();
-        const std::vector<std::string> &uttlist = spk2utt_reader.Value();
-        for (size_t i = 0; i < uttlist.size(); i++) {
-          std::string utt = uttlist[i];
-          if (!feature_reader.HasKey(utt)) {
-            KALDI_WARN << "Features not found for utterance " << utt;
-            num_err++;
-            continue;
-          }
-          if (!post_reader.HasKey(utt)) {
-            KALDI_WARN << "Posteriors not found for utterance " << utt;
-            num_err++;
-            continue;
-          }
-          const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
-          const Posterior &post = post_reader.Value(utt);
-          if (static_cast<int32>(post.size()) != feats.NumRows()) {
-            KALDI_WARN << "Size mismatch between posteriors " << post.size()
-                       << " and features " << feats.NumRows();
-            num_err++;
-            continue;
-          }
-
-          AccStatsForUtterance(trans_model, am_gmm, post, feats, &accs);
-          num_done++;
-        }
-        
-        BaseFloat auxf_impr, count;
-        {
-          Matrix<BaseFloat> transform(raw_feat_dim, raw_feat_dim + 1);
-          transform.SetUnit();
-          accs.Update(opts, &transform, &auxf_impr, &count);
-          transform_writer.Write(spk, transform);
-        }
-        KALDI_LOG << "For speaker " << spk << ", auxf-impr from raw fMLLR is "
-                  << (auxf_impr/count) << " over " << count << " frames.";
-        tot_auxf_impr += auxf_impr;
-        tot_count += count;
-      }
-    } else {  // --spk2utt option not given -> adapt per utterance.
-      SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-      for (; !feature_reader.Done(); feature_reader.Next()) {
-        std::string utt = feature_reader.Key();
-        if (!post_reader.HasKey(utt)) {
-          KALDI_WARN << "Posteriors not found for utterance " << utt;
-          num_err++;
-          continue;
-        }
-        const Matrix<BaseFloat> &feats = feature_reader.Value();
-        const Posterior &post = post_reader.Value(utt);
-
-        if (static_cast<int32>(post.size()) != feats.NumRows()) {
-          KALDI_WARN << "Size mismatch between posteriors " << post.size()
-                     << " and features " << feats.NumRows();
-          num_err++;
-          continue;
-        }
-
-        FmllrRawAccs accs(raw_feat_dim, am_gmm.Dim(), full_lda_mat);
-
-        AccStatsForUtterance(trans_model, am_gmm, post, feats, &accs);
-        
-        BaseFloat auxf_impr, count;        
-        {
-          Matrix<BaseFloat> transform(raw_feat_dim, raw_feat_dim + 1);
-          transform.SetUnit();
-          accs.Update(opts, &transform, &auxf_impr, &count);
-          transform_writer.Write(utt, transform);
-        }
-        KALDI_LOG << "For utterance " << utt << ", auxf-impr from raw fMLLR is "
-                  << (auxf_impr/count) << " over " << count << " frames.";
-        tot_auxf_impr += auxf_impr;
-        tot_count += count;
-        num_done++;
-      }
-    }
-
-    KALDI_LOG << "Processed " << num_done << " utterances, "
-              << num_err << " had errors.";
-    KALDI_LOG << "Overall raw-fMLLR auxf impr per frame is "
-              << (tot_auxf_impr / tot_count) << " over " << tot_count
-              << " frames.";
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
diff --git a/src/gmmbin/gmm-est-fmllr.cc b/src/gmmbin/gmm-est-fmllr.cc
index 9f8dfd89143..c44a284b2f8 100644
--- a/src/gmmbin/gmm-est-fmllr.cc
+++ b/src/gmmbin/gmm-est-fmllr.cc
@@ -27,14 +27,14 @@ using std::vector;
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "transform/fmllr-diag-gmm.h"
 #include "hmm/posterior.h"
 
 namespace kaldi {
 void AccumulateForUtterance(const Matrix<BaseFloat> &feats,
                             const Posterior &post,
-                            const TransitionModel &trans_model,
+                            const Transitions &trans_model,
                             const AmDiagGmm &am_gmm,
                             FmllrDiagGmmAccs *spk_stats) {
   Posterior pdf_post;
@@ -83,7 +83,7 @@ int main(int argc, char *argv[]) {
         post_rspecifier = po.GetArg(3),
         trans_wspecifier = po.GetArg(4);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
       bool binary;
diff --git a/src/gmmbin/gmm-est-gaussians-ebw.cc b/src/gmmbin/gmm-est-gaussians-ebw.cc
index bbd53c2bec0..cfbb8ece02d 100644
--- a/src/gmmbin/gmm-est-gaussians-ebw.cc
+++ b/src/gmmbin/gmm-est-gaussians-ebw.cc
@@ -21,7 +21,7 @@
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "gmm/ebw-diag-gmm.h"
 
 int main(int argc, char *argv[]) {
@@ -62,7 +62,7 @@ int main(int argc, char *argv[]) {
         model_out_filename = po.GetArg(4);
 
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary_read;
       Input ki(model_in_filename, &binary_read);
diff --git a/src/gmmbin/gmm-est-lvtln-trans.cc b/src/gmmbin/gmm-est-lvtln-trans.cc
index abfc24a6585..849560dd437 100644
--- a/src/gmmbin/gmm-est-lvtln-trans.cc
+++ b/src/gmmbin/gmm-est-lvtln-trans.cc
@@ -26,7 +26,7 @@ using std::vector;
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "transform/lvtln.h"
 #include "hmm/posterior.h"
 
@@ -86,7 +86,7 @@ int main(int argc, char *argv[]) {
     {
       bool binary;
       Input ki(model_rxfilename, &binary);
-      TransitionModel trans_model;
+      Transitions trans_model;
       trans_model.Read(ki.Stream(), binary);
       am_gmm.Read(ki.Stream(), binary);
     }
diff --git a/src/gmmbin/gmm-est-map.cc b/src/gmmbin/gmm-est-map.cc
index 22ea8acda51..eb2b44d5961 100644
--- a/src/gmmbin/gmm-est-map.cc
+++ b/src/gmmbin/gmm-est-map.cc
@@ -22,7 +22,7 @@
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "gmm/mle-am-diag-gmm.h"
 
 int main(int argc, char *argv[]) {
@@ -65,7 +65,7 @@ int main(int argc, char *argv[]) {
         model_out_filename = po.GetArg(3);
 
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary_read;
       Input ki(model_in_filename, &binary_read);
diff --git a/src/gmmbin/gmm-est-regtree-fmllr-ali.cc b/src/gmmbin/gmm-est-regtree-fmllr-ali.cc
deleted file mode 100644
index 0158bae8298..00000000000
--- a/src/gmmbin/gmm-est-regtree-fmllr-ali.cc
+++ /dev/null
@@ -1,202 +0,0 @@
-// gmmbin/gmm-est-regtree-fmllr-ali.cc
-
-// Copyright 2009-2011  Saarland University;  Microsoft Corporation
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-using std::string;
-#include <vector>
-using std::vector;
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
-#include "transform/regtree-fmllr-diag-gmm.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    typedef kaldi::int32 int32;
-    using namespace kaldi;
-    const char *usage =
-        "Compute FMLLR transforms per-utterance (default) or per-speaker for "
-        "the supplied set of speakers (spk2utt option).  Note: writes RegtreeFmllrDiagGmm objects\n"
-        "Usage: gmm-est-regtree-fmllr-ali  [options] <model-in> <feature-rspecifier> "
-        "<alignments-rspecifier> <regression-tree> <transforms-wspecifier>\n";
-
-    ParseOptions po(usage);
-    string spk2utt_rspecifier;
-    bool binary = true;
-    po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to "
-                "utterance-list map");
-    po.Register("binary", &binary, "Write output in binary mode");
-    // register other modules
-    RegtreeFmllrOptions opts;
-    opts.Register(&po);
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 5) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    string model_filename = po.GetArg(1),
-        feature_rspecifier = po.GetArg(2),
-        alignments_rspecifier = po.GetArg(3),
-        regtree_filename = po.GetArg(4),
-        xforms_wspecifier = po.GetArg(5);
-
-    RandomAccessInt32VectorReader alignments_reader(alignments_rspecifier);
-    RegtreeFmllrDiagGmmWriter fmllr_writer(xforms_wspecifier);
-
-    AmDiagGmm am_gmm;
-    TransitionModel trans_model;
-    {
-      bool binary;
-      Input ki(model_filename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_gmm.Read(ki.Stream(), binary);
-    }
-    RegressionTree regtree;
-    {
-      bool binary;
-      Input in(regtree_filename, &binary);
-      regtree.Read(in.Stream(), binary, am_gmm);
-    }
-
-    RegtreeFmllrDiagGmm fmllr_xforms;
-    RegtreeFmllrDiagGmmAccs fmllr_accs;
-    fmllr_accs.Init(regtree.NumBaseclasses(), am_gmm.Dim());
-
-    double tot_like = 0.0;
-    kaldi::int64 tot_t = 0;
-
-    int32 num_done = 0, num_no_alignment = 0, num_other_error = 0;
-    double tot_objf_impr = 0.0, tot_t_objf = 0.0;
-    if (spk2utt_rspecifier != "") {  // per-speaker adaptation
-      SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
-      RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
-      for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
-        string spk = spk2utt_reader.Key();
-        fmllr_accs.SetZero();
-        const vector<string> &uttlist = spk2utt_reader.Value();
-        for (vector<string>::const_iterator utt_itr = uttlist.begin(),
-            itr_end = uttlist.end(); utt_itr != itr_end; ++utt_itr) {
-          if (!feature_reader.HasKey(*utt_itr)) {
-            KALDI_WARN << "Did not find features for utterance " << *utt_itr;
-            continue;
-          }
-          if (!alignments_reader.HasKey(*utt_itr)) {
-            KALDI_WARN << "Did not find aligned transcription for utterance "
-                << *utt_itr;
-            num_no_alignment++;
-            continue;
-          }
-          const Matrix<BaseFloat> &feats = feature_reader.Value(*utt_itr);
-          const vector<int32> &alignment = alignments_reader.Value(*utt_itr);
-          if (static_cast<int32>(alignment.size()) != feats.NumRows()) {
-            KALDI_WARN << "Alignments has wrong size " << (alignment.size())
-                << " vs. " << (feats.NumRows());
-            num_other_error++;
-            continue;
-          }
-
-          BaseFloat file_like = 0.0;
-          for (size_t i = 0; i < alignment.size(); i++) {
-            int32 pdf_id = trans_model.TransitionIdToPdf(alignment[i]);
-            file_like += fmllr_accs.AccumulateForGmm(regtree, am_gmm,
-                feats.Row(i), pdf_id, 1.0);
-          }
-          KALDI_VLOG(2) << "Average like for this file is " << (file_like
-              / alignment.size()) << " over " << alignment.size()
-              << " frames.\n";
-          tot_like += file_like;
-          tot_t += alignment.size();
-          num_done++;
-          if (num_done % 10 == 0) KALDI_VLOG(1)
-              << "Avg like per frame so far is " << (tot_like / tot_t) << '\n';
-        }  // end looping over all utterances of the current speaker
-        BaseFloat objf_impr, t;
-        fmllr_accs.Update(regtree, opts, &fmllr_xforms, &objf_impr, &t);
-        KALDI_LOG << "fMLLR objf improvement for speaker " << spk << " is "
-                  << (objf_impr/(t+1.0e-10)) << " per frame over " << t
-                  << " frames.";
-        tot_objf_impr += objf_impr;
-        tot_t_objf += t;
-        fmllr_writer.Write(spk, fmllr_xforms);
-      }  // end looping over speakers
-    } else {  // per-utterance adaptation
-      SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-      for (; !feature_reader.Done(); feature_reader.Next()) {
-        string key = feature_reader.Key();
-        if (!alignments_reader.HasKey(key)) {
-          KALDI_WARN << "Did not find aligned transcription for utterance "
-              << key;
-          num_no_alignment++;
-          continue;
-        }
-        const Matrix<BaseFloat> &feats = feature_reader.Value();
-        const vector<int32> &alignment = alignments_reader.Value(key);
-
-        if (static_cast<int32>(alignment.size()) != feats.NumRows()) {
-          KALDI_WARN << "Alignments has wrong size " << (alignment.size())
-              << " vs. " << (feats.NumRows());
-          num_other_error++;
-          continue;
-        }
-
-        num_done++;
-        BaseFloat file_like = 0.0;
-        fmllr_accs.SetZero();
-        for (size_t i = 0; i < alignment.size(); i++) {
-          int32 pdf_id = trans_model.TransitionIdToPdf(alignment[i]);
-          file_like += fmllr_accs.AccumulateForGmm(regtree, am_gmm,
-              feats.Row(i), pdf_id, 1.0);
-        }
-        KALDI_VLOG(2) << "Average like for this file is " << (file_like
-            / alignment.size()) << " over " << alignment.size() << " frames.";
-        tot_like += file_like;
-        tot_t += alignment.size();
-        if (num_done % 10 == 0) KALDI_VLOG(1)
-            << "Avg like per frame so far is " << (tot_like / tot_t);
-        BaseFloat objf_impr, t;
-        fmllr_accs.Update(regtree, opts, &fmllr_xforms, &objf_impr, &t);
-        KALDI_LOG << "fMLLR objf improvement for utterance " << key << " is "
-                  << (objf_impr/(t+1.0e-10)) << " per frame over " << t
-                  << " frames.";
-        tot_objf_impr += objf_impr;
-        tot_t_objf += t;
-        fmllr_writer.Write(feature_reader.Key(), fmllr_xforms);
-      }
-    }
-
-    KALDI_LOG << "Overall objf improvement from fMLLR is "
-              << (tot_objf_impr/tot_t_objf)
-              << " per frame over " << tot_t_objf << " frames.";
-    KALDI_LOG << "Done " << num_done << " files, " << num_no_alignment
-              << " with no alignments, " << num_other_error
-              << " with other errors.";
-    KALDI_LOG << "Overall acoustic like per frame = " << (tot_like / tot_t)
-              << " over " << tot_t << " frames.";
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
diff --git a/src/gmmbin/gmm-est-regtree-fmllr.cc b/src/gmmbin/gmm-est-regtree-fmllr.cc
deleted file mode 100644
index ca807f07fd4..00000000000
--- a/src/gmmbin/gmm-est-regtree-fmllr.cc
+++ /dev/null
@@ -1,216 +0,0 @@
-// gmmbin/gmm-est-regtree-fmllr.cc
-
-// Copyright 2009-2011  Saarland University;  Microsoft Corporation
-//                2014  Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-using std::string;
-#include <vector>
-using std::vector;
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
-#include "hmm/posterior.h"
-#include "transform/regtree-fmllr-diag-gmm.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    typedef kaldi::int32 int32;
-    using namespace kaldi;
-    const char *usage =
-        "Compute FMLLR transforms per-utterance (default) or per-speaker for "
-        "the supplied set of speakers (spk2utt option).  Note: writes RegtreeFmllrDiagGmm objects\n"
-        "Usage: gmm-est-regtree-fmllr  [options] <model-in> <feature-rspecifier> "
-        "<posteriors-rspecifier> <regression-tree> <transforms-wspecifier>\n";
-
-    ParseOptions po(usage);
-    string spk2utt_rspecifier;
-    bool binary = true;
-    po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to "
-                "utterance-list map");
-    po.Register("binary", &binary, "Write output in binary mode");
-    // register other modules
-    RegtreeFmllrOptions opts;
-    opts.Register(&po);
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 5) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    string model_filename = po.GetArg(1),
-        feature_rspecifier = po.GetArg(2),
-        posteriors_rspecifier = po.GetArg(3),
-        regtree_filename = po.GetArg(4),
-        xforms_wspecifier = po.GetArg(5);
-
-    RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);
-    RegtreeFmllrDiagGmmWriter fmllr_writer(xforms_wspecifier);
-
-    AmDiagGmm am_gmm;
-    TransitionModel trans_model;
-    {
-      bool binary;
-      Input ki(model_filename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_gmm.Read(ki.Stream(), binary);
-    }
-    RegressionTree regtree;
-    {
-      bool binary;
-      Input in(regtree_filename, &binary);
-      regtree.Read(in.Stream(), binary, am_gmm);
-    }
-
-    RegtreeFmllrDiagGmm fmllr_xforms;
-    RegtreeFmllrDiagGmmAccs fmllr_accs;
-    fmllr_accs.Init(regtree.NumBaseclasses(), am_gmm.Dim());
-
-    double tot_like = 0.0, tot_t = 0;
-
-    int32 num_done = 0, num_no_posterior = 0, num_other_error = 0;
-    double tot_objf_impr = 0.0, tot_t_objf = 0.0;
-    if (spk2utt_rspecifier != "") {  // per-speaker adaptation
-      SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
-      RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
-      for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
-        string spk = spk2utt_reader.Key();
-        fmllr_accs.SetZero();
-        const vector<string> &uttlist = spk2utt_reader.Value();
-        for (vector<string>::const_iterator utt_itr = uttlist.begin(),
-            itr_end = uttlist.end(); utt_itr != itr_end; ++utt_itr) {
-          if (!feature_reader.HasKey(*utt_itr)) {
-            KALDI_WARN << "Did not find features for utterance " << *utt_itr;
-            continue;
-          }
-          if (!posteriors_reader.HasKey(*utt_itr)) {
-            KALDI_WARN << "Did not find posteriors for utterance "
-                << *utt_itr;
-            num_no_posterior++;
-            continue;
-          }
-          const Matrix<BaseFloat> &feats = feature_reader.Value(*utt_itr);
-          const Posterior &posterior = posteriors_reader.Value(*utt_itr);
-          if (static_cast<int32>(posterior.size()) != feats.NumRows()) {
-            KALDI_WARN << "Posteriors has wrong size " << (posterior.size())
-                << " vs. " << (feats.NumRows());
-            num_other_error++;
-            continue;
-          }
-
-          BaseFloat file_like = 0.0, file_t = 0.0;
-          Posterior pdf_posterior;
-          ConvertPosteriorToPdfs(trans_model, posterior, &pdf_posterior);
-          for (size_t i = 0; i < posterior.size(); i++) {
-            for (size_t j = 0; j < pdf_posterior[i].size(); j++) {
-              int32 pdf_id = pdf_posterior[i][j].first;
-              BaseFloat prob = pdf_posterior[i][j].second;
-              file_like += fmllr_accs.AccumulateForGmm(regtree, am_gmm,
-                                                       feats.Row(i), pdf_id,
-                                                       prob);
-              file_t += prob;
-            }
-          }
-          KALDI_VLOG(2) << "Average like for this file is " << (file_like/file_t)
-                        << " over " << file_t << " frames.";
-          tot_like += file_like;
-          tot_t += file_t;
-          num_done++;
-          if (num_done % 10 == 0)
-            KALDI_VLOG(1) << "Avg like per frame so far is "
-                          << (tot_like / tot_t);
-        }  // end looping over all utterances of the current speaker
-        BaseFloat objf_impr, t;
-        fmllr_accs.Update(regtree, opts, &fmllr_xforms, &objf_impr, &t);
-        KALDI_LOG << "fMLLR objf improvement for speaker " << spk << " is "
-                  << (objf_impr/(t+1.0e-10)) << " per frame over " << t
-                  << " frames.";
-        tot_objf_impr += objf_impr;
-        tot_t_objf += t;
-        fmllr_writer.Write(spk, fmllr_xforms);
-      }  // end looping over speakers
-    } else {  // per-utterance adaptation
-      SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-      for (; !feature_reader.Done(); feature_reader.Next()) {
-        string key = feature_reader.Key();
-        if (!posteriors_reader.HasKey(key)) {
-          KALDI_WARN << "Did not find posteriors for utterance "
-              << key;
-          num_no_posterior++;
-          continue;
-        }
-        const Matrix<BaseFloat> &feats = feature_reader.Value();
-        const Posterior &posterior = posteriors_reader.Value(key);
-
-        if (static_cast<int32>(posterior.size()) != feats.NumRows()) {
-          KALDI_WARN << "Posteriors has wrong size " << (posterior.size())
-              << " vs. " << (feats.NumRows());
-          num_other_error++;
-          continue;
-        }
-
-        num_done++;
-        BaseFloat file_like = 0.0, file_t = 0.0;
-        fmllr_accs.SetZero();
-        Posterior pdf_posterior;
-        ConvertPosteriorToPdfs(trans_model, posterior, &pdf_posterior);
-        for (size_t i = 0; i < posterior.size(); i++) {
-          for (size_t j = 0; j < pdf_posterior[i].size(); j++) {
-            int32 pdf_id = pdf_posterior[i][j].first;
-            BaseFloat prob = pdf_posterior[i][j].second;
-            file_like += fmllr_accs.AccumulateForGmm(regtree, am_gmm,
-                                                     feats.Row(i), pdf_id,
-                                                     prob);
-            file_t += prob;
-          }
-        }
-        KALDI_VLOG(2) << "Average like for this file is " << (file_like/file_t)
-                      << " over " << file_t << " frames.";
-        tot_like += file_like;
-        tot_t += file_t;
-        if (num_done % 10 == 0)
-          KALDI_VLOG(1) << "Avg like per frame so far is "
-                        << (tot_like / tot_t);
-        BaseFloat objf_impr, t;
-        fmllr_accs.Update(regtree, opts, &fmllr_xforms, &objf_impr, &t);
-        KALDI_LOG << "fMLLR objf improvement for utterance " << key << " is "
-                  << (objf_impr/(t+1.0e-10)) << " per frame over " << t
-                  << " frames.";
-        tot_objf_impr += objf_impr;
-        tot_t_objf += t;
-        fmllr_writer.Write(feature_reader.Key(), fmllr_xforms);
-      }
-    }
-    KALDI_LOG << "Done " << num_done << " files, " << num_no_posterior
-              << " with no posteriors, " << num_other_error
-              << " with other errors.";
-    KALDI_LOG << "Overall objf improvement from MLLR is " << (tot_objf_impr/tot_t_objf)
-              << " per frame " << " over " << tot_t_objf << " frames.";
-    KALDI_LOG << "Overall acoustic likelihood was " << (tot_like/tot_t)
-              << " over " << tot_t << " frames.";
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
diff --git a/src/gmmbin/gmm-est-regtree-mllr.cc b/src/gmmbin/gmm-est-regtree-mllr.cc
deleted file mode 100644
index a4df5cc84c1..00000000000
--- a/src/gmmbin/gmm-est-regtree-mllr.cc
+++ /dev/null
@@ -1,215 +0,0 @@
-// gmmbin/gmm-est-regtree-mllr.cc
-
-// Copyright 2009-2011  Saarland University;  Microsoft Corporation
-//                2014  Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-using std::string;
-#include <vector>
-using std::vector;
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
-#include "transform/regtree-mllr-diag-gmm.h"
-#include "hmm/posterior.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    typedef kaldi::int32 int32;
-    using namespace kaldi;
-    const char *usage =
-        "Compute MLLR transforms per-utterance (default) or per-speaker for "
-        "the supplied set of speakers (spk2utt option).  Note: writes RegtreeMllrDiagGmm objects\n"
-        "Usage: gmm-est-regtree-mllr  [options] <model-in> <feature-rspecifier> "
-        "<posteriors-rspecifier> <regression-tree> <transforms-wspecifier>\n";
-
-    ParseOptions po(usage);
-    string spk2utt_rspecifier;
-    bool binary = true;
-    po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to "
-                "utterance-list map");
-    po.Register("binary", &binary, "Write output in binary mode");
-    // register other modules
-    RegtreeMllrOptions opts;
-    opts.Register(&po);
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 5) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    string model_filename = po.GetArg(1),
-        feature_rspecifier = po.GetArg(2),
-        posteriors_rspecifier = po.GetArg(3),
-        regtree_filename = po.GetArg(4),
-        xforms_wspecifier = po.GetArg(5);
-
-    RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);
-    RegtreeMllrDiagGmmWriter mllr_writer(xforms_wspecifier);
-
-    AmDiagGmm am_gmm;
-    TransitionModel trans_model;
-    {
-      bool binary;
-      Input ki(model_filename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_gmm.Read(ki.Stream(), binary);
-    }
-    RegressionTree regtree;
-    {
-      bool binary;
-      Input in(regtree_filename, &binary);
-      regtree.Read(in.Stream(), binary, am_gmm);
-    }
-
-    RegtreeMllrDiagGmm mllr_xforms;
-    RegtreeMllrDiagGmmAccs mllr_accs;
-    mllr_accs.Init(regtree.NumBaseclasses(), am_gmm.Dim());
-
-    double tot_like = 0.0, tot_t = 0;
-
-    int32 num_done = 0, num_no_posterior = 0, num_other_error = 0;
-    double tot_objf_impr = 0.0, tot_t_objf = 0.0;
-    if (spk2utt_rspecifier != "") {  // per-speaker adaptation
-      SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
-      RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
-      for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
-        string spk = spk2utt_reader.Key();
-        mllr_accs.SetZero();
-        const vector<string> &uttlist = spk2utt_reader.Value();
-        for (vector<string>::const_iterator utt_itr = uttlist.begin(),
-            itr_end = uttlist.end(); utt_itr != itr_end; ++utt_itr) {
-          if (!feature_reader.HasKey(*utt_itr)) {
-            KALDI_WARN << "Did not find features for utterance " << *utt_itr;
-            continue;
-          }
-          if (!posteriors_reader.HasKey(*utt_itr)) {
-            KALDI_WARN << "Did not find posteriors for utterance "
-                << *utt_itr;
-            num_no_posterior++;
-            continue;
-          }
-          const Matrix<BaseFloat> &feats = feature_reader.Value(*utt_itr);
-          const Posterior &posterior = posteriors_reader.Value(*utt_itr);
-          if (posterior.size() != feats.NumRows()) {
-            KALDI_WARN << "Posteriors has wrong size " << (posterior.size())
-                << " vs. " << (feats.NumRows());
-            num_other_error++;
-            continue;
-          }
-
-          BaseFloat file_like = 0.0, file_t = 0.0;
-          Posterior pdf_posterior;
-          ConvertPosteriorToPdfs(trans_model, posterior, &pdf_posterior);
-          for (size_t i = 0; i < posterior.size(); i++) {
-            for (size_t j = 0; j < pdf_posterior[i].size(); j++) {
-              int32 pdf_id = pdf_posterior[i][j].first;
-              BaseFloat prob = pdf_posterior[i][j].second;
-              file_like += mllr_accs.AccumulateForGmm(regtree, am_gmm,
-                                                      feats.Row(i), pdf_id,
-                                                      prob);
-              file_t += prob;
-            }
-          }
-          KALDI_VLOG(2) << "Average like for this file is " << (file_like/file_t)
-                        << " over " << file_t << " frames.";
-          tot_like += file_like;
-          tot_t += file_t;
-          num_done++;
-          if (num_done % 10 == 0)
-            KALDI_VLOG(1) << "Avg like per frame so far is "
-                          << (tot_like / tot_t);
-        }  // end looping over all utterances of the current speaker
-        BaseFloat objf_impr, t;
-        mllr_accs.Update(regtree, opts, &mllr_xforms, &objf_impr, &t);
-        KALDI_LOG << "MLLR objf improvement for speaker " << spk << " is "
-                  << (objf_impr/(t+1.0e-10)) << " per frame over " << t
-                  << " frames.";
-        tot_objf_impr += objf_impr;
-        tot_t_objf += t;
-        mllr_writer.Write(spk, mllr_xforms);
-      }  // end looping over speakers
-    } else {  // per-utterance adaptation
-      SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-      for (; !feature_reader.Done(); feature_reader.Next()) {
-        string key = feature_reader.Key();
-        if (!posteriors_reader.HasKey(key)) {
-          KALDI_WARN << "Did not find aligned transcription for utterance "
-              << key;
-          num_no_posterior++;
-          continue;
-        }
-        const Matrix<BaseFloat> &feats = feature_reader.Value();
-        const Posterior &posterior = posteriors_reader.Value(key);
-
-        if (posterior.size() != feats.NumRows()) {
-          KALDI_WARN << "Posteriors has wrong size " << (posterior.size())
-              << " vs. " << (feats.NumRows());
-          num_other_error++;
-          continue;
-        }
-
-        num_done++;
-        BaseFloat file_like = 0.0, file_t = 0.0;
-        mllr_accs.SetZero();
-        Posterior pdf_posterior;
-        ConvertPosteriorToPdfs(trans_model, posterior, &pdf_posterior);
-        for (size_t i = 0; i < posterior.size(); i++) {
-          for (size_t j = 0; j < pdf_posterior[i].size(); j++) {
-            int32 pdf_id = pdf_posterior[i][j].first;
-            BaseFloat prob = pdf_posterior[i][j].second;
-            file_like += mllr_accs.AccumulateForGmm(regtree, am_gmm,
-                                                    feats.Row(i), pdf_id,
-                                                    prob);
-            file_t += prob;
-          }
-        }
-        KALDI_VLOG(2) << "Average like for this file is " << (file_like/file_t)
-                      << " over " << file_t << " frames.";
-        tot_like += file_like;
-        tot_t += file_t;
-        if (num_done % 10 == 0)
-          KALDI_VLOG(1) << "Avg like per frame so far is " << (tot_like / tot_t);
-        BaseFloat objf_impr, t;
-        mllr_accs.Update(regtree, opts, &mllr_xforms, &objf_impr, &t);
-        KALDI_LOG << "MLLR objf improvement for utterance " << key << " is "
-                  << (objf_impr/(t+1.0e-10)) << " per frame over " << t
-                  << " frames.";
-        tot_objf_impr += objf_impr;
-        tot_t_objf += t;
-        mllr_writer.Write(feature_reader.Key(), mllr_xforms);
-      }
-    }
-    KALDI_LOG << "Done " << num_done << " files, " << num_no_posterior
-              << " with no posteriors, " << num_other_error
-              << " with other errors.";
-    KALDI_LOG << "Overall objf improvement from MLLR is " << (tot_objf_impr/tot_t_objf)
-              << " per frame " << " over " << tot_t_objf << " frames.";
-    KALDI_LOG << "Overall acoustic likelihood was " << (tot_like/tot_t)
-              << " over " << tot_t << " frames.";
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
diff --git a/src/gmmbin/gmm-est-rescale.cc b/src/gmmbin/gmm-est-rescale.cc
index a432b3d77f6..1e9c1e2aa84 100644
--- a/src/gmmbin/gmm-est-rescale.cc
+++ b/src/gmmbin/gmm-est-rescale.cc
@@ -21,7 +21,7 @@
 #include "util/common-utils.h"
 #include "gmm/indirect-diff-diag-gmm.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 int main(int argc, char *argv[]) {
   using namespace kaldi;
@@ -62,7 +62,7 @@ int main(int argc, char *argv[]) {
         model_wxfilename = po.GetArg(4);
 
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary_read;
       Input ki(model_rxfilename, &binary_read);
diff --git a/src/gmmbin/gmm-est-weights-ebw.cc b/src/gmmbin/gmm-est-weights-ebw.cc
index f19343a7ac4..9cf2c2d7d04 100644
--- a/src/gmmbin/gmm-est-weights-ebw.cc
+++ b/src/gmmbin/gmm-est-weights-ebw.cc
@@ -21,7 +21,7 @@
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "gmm/ebw-diag-gmm.h"
 
 int main(int argc, char *argv[]) {
@@ -62,7 +62,7 @@ int main(int argc, char *argv[]) {
         model_out_filename = po.GetArg(4);
 
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary_read;
       Input ki(model_in_filename, &binary_read);
diff --git a/src/gmmbin/gmm-est.cc b/src/gmmbin/gmm-est.cc
index 18c836a1f50..545bbc054ef 100644
--- a/src/gmmbin/gmm-est.cc
+++ b/src/gmmbin/gmm-est.cc
@@ -21,7 +21,7 @@
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "gmm/mle-am-diag-gmm.h"
 
 int main(int argc, char *argv[]) {
@@ -79,7 +79,7 @@ int main(int argc, char *argv[]) {
         model_out_filename = po.GetArg(3);
 
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary_read;
       Input ki(model_in_filename, &binary_read);
diff --git a/src/gmmbin/gmm-fmpe-acc-stats.cc b/src/gmmbin/gmm-fmpe-acc-stats.cc
index 4868b63b6ae..17cba7dc489 100644
--- a/src/gmmbin/gmm-fmpe-acc-stats.cc
+++ b/src/gmmbin/gmm-fmpe-acc-stats.cc
@@ -21,7 +21,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "transform/fmpe.h"
 
 
@@ -60,7 +60,7 @@ int main(int argc, char *argv[]) {
         stats_wxfilename = po.GetArg(6);
     
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary;
       Input ki(model_rxfilename, &binary);
diff --git a/src/gmmbin/gmm-get-stats-deriv.cc b/src/gmmbin/gmm-get-stats-deriv.cc
index 939fe260b34..a6fd9764719 100644
--- a/src/gmmbin/gmm-get-stats-deriv.cc
+++ b/src/gmmbin/gmm-get-stats-deriv.cc
@@ -21,7 +21,7 @@
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "gmm/indirect-diff-diag-gmm.h"
 
 int main(int argc, char *argv[]) {
@@ -64,7 +64,7 @@ int main(int argc, char *argv[]) {
         deriv_wxfilename = po.GetArg(5);
         
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary_read;
       Input ki(model_rxfilename, &binary_read);
diff --git a/src/gmmbin/gmm-global-est-fmllr.cc b/src/gmmbin/gmm-global-est-fmllr.cc
index b1d5b68e594..951b8addf2d 100644
--- a/src/gmmbin/gmm-global-est-fmllr.cc
+++ b/src/gmmbin/gmm-global-est-fmllr.cc
@@ -25,7 +25,7 @@ using std::vector;
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "transform/fmllr-diag-gmm.h"
 
 namespace kaldi {
diff --git a/src/gmmbin/gmm-global-est-lvtln-trans.cc b/src/gmmbin/gmm-global-est-lvtln-trans.cc
index 10bb5bec5d5..95b56503f2c 100644
--- a/src/gmmbin/gmm-global-est-lvtln-trans.cc
+++ b/src/gmmbin/gmm-global-est-lvtln-trans.cc
@@ -26,7 +26,7 @@ using std::vector;
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "transform/lvtln.h"
 #include "hmm/posterior.h"
 
diff --git a/src/gmmbin/gmm-global-info.cc b/src/gmmbin/gmm-global-info.cc
index 7c21005b449..00222ef81c3 100644
--- a/src/gmmbin/gmm-global-info.cc
+++ b/src/gmmbin/gmm-global-info.cc
@@ -20,7 +20,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 int main(int argc, char *argv[]) {
   try {
diff --git a/src/gmmbin/gmm-gselect.cc b/src/gmmbin/gmm-gselect.cc
index a873b962591..357998e996d 100644
--- a/src/gmmbin/gmm-gselect.cc
+++ b/src/gmmbin/gmm-gselect.cc
@@ -22,7 +22,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 int main(int argc, char *argv[]) {
   try {
diff --git a/src/gmmbin/gmm-info.cc b/src/gmmbin/gmm-info.cc
index 31f7aea0921..f1c436cd57e 100644
--- a/src/gmmbin/gmm-info.cc
+++ b/src/gmmbin/gmm-info.cc
@@ -20,7 +20,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 int main(int argc, char *argv[]) {
   try {
@@ -46,7 +46,7 @@ int main(int argc, char *argv[]) {
     std::string model_in_filename = po.GetArg(1);
 
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary_read;
       Input ki(model_in_filename, &binary_read);
diff --git a/src/gmmbin/gmm-init-biphone.cc b/src/gmmbin/gmm-init-biphone.cc
index 0775a5c7b23..10fc9ad4048 100644
--- a/src/gmmbin/gmm-init-biphone.cc
+++ b/src/gmmbin/gmm-init-biphone.cc
@@ -23,8 +23,8 @@
 #include "gmm/am-diag-gmm.h"
 #include "tree/event-map.h"
 #include "tree/context-dep.h"
-#include "hmm/hmm-topology.h"
-#include "hmm/transition-model.h"
+#include "hmm/topology.h"
+#include "hmm/transitions.h"
 
 namespace kaldi {
 // This function reads a file like:
@@ -314,7 +314,7 @@ int main(int argc, char *argv[]) {
     Vector<BaseFloat> glob_mean(dim);
     glob_mean.Set(1.0);
 
-    HmmTopology topo;
+    Topology topo;
     bool binary_in;
     Input ki(topo_filename, &binary_in);
     topo.Read(ki.Stream(), binary_in);
@@ -375,7 +375,7 @@ int main(int argc, char *argv[]) {
       am_gmm.AddPdf(gmm);
 
     // Now the transition model:
-    TransitionModel trans_model(*ctx_dep, topo);
+    Transitions trans_model(*ctx_dep, topo);
 
     {
       Output ko(model_filename, binary);
diff --git a/src/gmmbin/gmm-init-model-flat.cc b/src/gmmbin/gmm-init-model-flat.cc
index fecd91f49fd..d41b99c35e6 100644
--- a/src/gmmbin/gmm-init-model-flat.cc
+++ b/src/gmmbin/gmm-init-model-flat.cc
@@ -21,7 +21,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "gmm/mle-am-diag-gmm.h"
 #include "tree/build-tree-utils.h"
 #include "tree/context-dep.h"
@@ -104,7 +104,7 @@ int main(int argc, char *argv[]) {
     ContextDependency ctx_dep;
     ReadKaldiObject(tree_filename, &ctx_dep);
 
-    HmmTopology topo; 
+    Topology topo; 
     ReadKaldiObject(topo_filename, &topo);
 
     Vector<BaseFloat> global_inverse_var, global_mean;
@@ -138,7 +138,7 @@ int main(int argc, char *argv[]) {
     for (int i = 0; i < num_pdfs; i++)
       am_gmm.AddPdf(gmm);
     
-    TransitionModel trans_model(ctx_dep, topo);
+    Transitions trans_model(ctx_dep, topo);
 
     {
       Output ko(model_out_filename, binary);
diff --git a/src/gmmbin/gmm-init-model.cc b/src/gmmbin/gmm-init-model.cc
index e2d943b19eb..a081f326b1c 100644
--- a/src/gmmbin/gmm-init-model.cc
+++ b/src/gmmbin/gmm-init-model.cc
@@ -22,7 +22,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "gmm/mle-am-diag-gmm.h"
 #include "tree/build-tree-utils.h"
 #include "tree/context-dep.h"
@@ -35,7 +35,7 @@ namespace kaldi {
 void InitAmGmm(const BuildTreeStatsType &stats,
                const EventMap &to_pdf_map,
                AmDiagGmm *am_gmm,
-               const TransitionModel &trans_model,
+               const Transitions &trans_model,
                BaseFloat var_floor) {
   // Get stats split by tree-leaf ( == pdf):
   std::vector<BuildTreeStatsType> split_stats;
@@ -126,7 +126,7 @@ void InitAmGmmFromOld(const BuildTreeStatsType &stats,
   ContextDependency old_tree;
   {  // Read old_gm_gmm
     bool binary_in;
-    TransitionModel old_trans_model;
+    Transitions old_trans_model;
     Input ki(old_model_rxfilename, &binary_in);
     old_trans_model.Read(ki.Stream(), binary_in);
     old_am_gmm.Read(ki.Stream(), binary_in);
@@ -270,12 +270,12 @@ int main(int argc, char *argv[]) {
     }
     KALDI_LOG << "Number of separate statistics is " << stats.size();
 
-    HmmTopology topo;
+    Topology topo;
     ReadKaldiObject(topo_filename, &topo);
 
     const EventMap &to_pdf = ctx_dep.ToPdfMap();  // not owned here.
 
-    TransitionModel trans_model(ctx_dep, topo);
+    Transitions trans_model(ctx_dep, topo);
     
     // Now, the summed_stats will be used to initialize the GMM.
     AmDiagGmm am_gmm;
diff --git a/src/gmmbin/gmm-init-mono.cc b/src/gmmbin/gmm-init-mono.cc
index 3c370c36515..a91948e446b 100644
--- a/src/gmmbin/gmm-init-mono.cc
+++ b/src/gmmbin/gmm-init-mono.cc
@@ -21,8 +21,8 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/hmm-topology.h"
-#include "hmm/transition-model.h"
+#include "hmm/topology.h"
+#include "hmm/transitions.h"
 #include "tree/context-dep.h"
 
 namespace kaldi {
@@ -116,7 +116,7 @@ int main(int argc, char *argv[]) {
       glob_mean.CopyFromVec(mean_stats);
     }
 
-    HmmTopology topo;
+    Topology topo;
     bool binary_in;
     Input ki(topo_filename, &binary_in);
     topo.Read(ki.Stream(), binary_in);
@@ -164,7 +164,7 @@ int main(int argc, char *argv[]) {
     }
 
     // Now the transition model:
-    TransitionModel trans_model(*ctx_dep, topo);
+    Transitions trans_model(*ctx_dep, topo);
 
     {
       Output ko(model_filename, binary);
diff --git a/src/gmmbin/gmm-ismooth-stats.cc b/src/gmmbin/gmm-ismooth-stats.cc
index b29e1efc1c3..a524d27b47b 100644
--- a/src/gmmbin/gmm-ismooth-stats.cc
+++ b/src/gmmbin/gmm-ismooth-stats.cc
@@ -21,7 +21,7 @@
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "gmm/ebw-diag-gmm.h"
 
 int main(int argc, char *argv[]) {
@@ -77,7 +77,7 @@ int main(int argc, char *argv[]) {
       stats.Write(ko.Stream(), binary_write);
     } else if (smooth_from_model) { // Smoothing from model...
       AmDiagGmm am_gmm;
-      TransitionModel trans_model;
+      Transitions trans_model;
       Vector<double> dst_transition_accs;
       AccumAmDiagGmm dst_stats;
       { // read src model
diff --git a/src/gmmbin/gmm-latgen-biglm-faster.cc b/src/gmmbin/gmm-latgen-biglm-faster.cc
index d4e0645b16c..0d881b41ebb 100644
--- a/src/gmmbin/gmm-latgen-biglm-faster.cc
+++ b/src/gmmbin/gmm-latgen-biglm-faster.cc
@@ -24,7 +24,7 @@
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/lattice-biglm-faster-decoder.h"
 #include "gmm/decodable-am-diag-gmm.h"
@@ -35,7 +35,7 @@ namespace kaldi {
 // Takes care of output.  Returns true on success.
 bool DecodeUtterance(LatticeBiglmFasterDecoder &decoder, // not const but is really an input.
                      DecodableInterface &decodable, // not const but is really an input.
-                     const TransitionModel &trans_model,
+                     const Transitions &trans_model,
                      const fst::SymbolTable *word_syms,
                      std::string utt,
                      double acoustic_scale,
@@ -186,7 +186,7 @@ int main(int argc, char *argv[]) {
         words_wspecifier = po.GetOptArg(7),
         alignment_wspecifier = po.GetOptArg(8);
     
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
       bool binary;
diff --git a/src/gmmbin/gmm-latgen-faster-parallel.cc b/src/gmmbin/gmm-latgen-faster-parallel.cc
index 41f414bcb9c..8cc0aa5dad4 100644
--- a/src/gmmbin/gmm-latgen-faster-parallel.cc
+++ b/src/gmmbin/gmm-latgen-faster-parallel.cc
@@ -24,7 +24,7 @@
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/decoder-wrappers.h"
 #include "gmm/decodable-am-diag-gmm.h"
@@ -82,7 +82,7 @@ int main(int argc, char *argv[]) {
         words_wspecifier = po.GetOptArg(5),
         alignment_wspecifier = po.GetOptArg(6);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
       bool binary;
diff --git a/src/gmmbin/gmm-latgen-faster-regtree-fmllr.cc b/src/gmmbin/gmm-latgen-faster-regtree-fmllr.cc
deleted file mode 100644
index 36031b13c1e..00000000000
--- a/src/gmmbin/gmm-latgen-faster-regtree-fmllr.cc
+++ /dev/null
@@ -1,218 +0,0 @@
-// gmmbin/gmm-latgen-faster-regtree-fmllr.cc
-
-// Copyright 2009-2012  Microsoft Corporation
-//           2012-2013  Johns Hopkins University (author: Daniel Povey)
-//                2014  Alpha Cephei Inc.
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "gmm/am-diag-gmm.h"
-#include "tree/context-dep.h"
-#include "hmm/transition-model.h"
-#include "fstext/fstext-lib.h"
-#include "decoder/decoder-wrappers.h"
-#include "gmm/decodable-am-diag-gmm.h"
-#include "base/timer.h"
-#include "transform/regression-tree.h"
-#include "transform/regtree-fmllr-diag-gmm.h"
-#include "transform/decodable-am-diag-gmm-regtree.h"
-#include "feat/feature-functions.h"  // feature reversal
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-    using fst::SymbolTable;
-    using fst::Fst;
-    using fst::StdArc;
-
-    const char *usage =
-        "Generate lattices using GMM-based model and RegTree-FMLLR adaptation.\n"
-        "Usage: gmm-latgen-faster-regtree-fmllr [options] model-in regtree-in (fst-in|fsts-rspecifier) features-rspecifier transform-rspecifier"
-        " lattice-wspecifier [ words-wspecifier [alignments-wspecifier] ]\n";
-    ParseOptions po(usage);
-    Timer timer;
-    bool allow_partial = false;
-    BaseFloat acoustic_scale = 0.1;
-    LatticeFasterDecoderConfig config;
-
-    std::string word_syms_filename, utt2spk_rspecifier;
-    config.Register(&po);
-    po.Register("utt2spk", &utt2spk_rspecifier, "rspecifier for utterance to "
-                "speaker map used to load the transform");
-    po.Register("acoustic-scale", &acoustic_scale,
-                "Scaling factor for acoustic likelihoods");
-    po.Register("word-symbol-table", &word_syms_filename,
-                "Symbol table for words [for debug output]");
-    po.Register("allow-partial", &allow_partial,
-                "If true, produce output even if end state was not reached.");
-    
-    po.Read(argc, argv);
-
-    if (po.NumArgs() < 4 || po.NumArgs() > 6) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_in_filename = po.GetArg(1),
-        regtree_in_str = po.GetArg(2),
-        fst_in_str = po.GetArg(3),
-        feature_rspecifier = po.GetArg(4),
-        xforms_rspecifier = po.GetArg(5),
-        lattice_wspecifier = po.GetArg(6),
-        words_wspecifier = po.GetOptArg(7),
-        alignment_wspecifier = po.GetOptArg(8);
-
-    TransitionModel trans_model;
-    AmDiagGmm am_gmm;
-    {
-      bool binary;
-      Input ki(model_in_filename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_gmm.Read(ki.Stream(), binary);
-    }
-
-    RegressionTree regtree;
-    {
-      bool binary_read;
-      Input in(regtree_in_str, &binary_read);
-      regtree.Read(in.Stream(), binary_read, am_gmm);
-    }
-
-    RandomAccessRegtreeFmllrDiagGmmReaderMapped fmllr_reader(xforms_rspecifier,
-                                                             utt2spk_rspecifier);
-
-    bool determinize = config.determinize_lattice;
-    CompactLatticeWriter compact_lattice_writer;
-    LatticeWriter lattice_writer;
-    if (! (determinize ? compact_lattice_writer.Open(lattice_wspecifier)
-           : lattice_writer.Open(lattice_wspecifier)))
-      KALDI_ERR << "Could not open table for writing lattices: "
-                 << lattice_wspecifier;
-
-    Int32VectorWriter words_writer(words_wspecifier);
-
-    Int32VectorWriter alignment_writer(alignment_wspecifier);
-
-    fst::SymbolTable *word_syms = NULL;
-    if (word_syms_filename != "") 
-      if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename)))
-        KALDI_ERR << "Could not read symbol table from file "
-                   << word_syms_filename;
-
-    double tot_like = 0.0;
-    kaldi::int64 frame_count = 0;
-    int num_done = 0, num_err = 0;
-
-    if (ClassifyRspecifier(fst_in_str, NULL, NULL) == kNoRspecifier) {
-      SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-      // Input FST is just one FST, not a table of FSTs.
-      Fst<StdArc> *decode_fst = fst::ReadFstKaldiGeneric(fst_in_str);
-      
-      {
-        LatticeFasterDecoder decoder(*decode_fst, config);
-    
-        for (; !feature_reader.Done(); feature_reader.Next()) {
-          std::string utt = feature_reader.Key();
-          Matrix<BaseFloat> features (feature_reader.Value());
-          feature_reader.FreeCurrent();
-          if (features.NumRows() == 0) {
-            KALDI_WARN << "Zero-length utterance: " << utt;
-            num_err++;
-            continue;
-          }
-          if (!fmllr_reader.HasKey(utt)) {
-            KALDI_WARN << "Not decoding utterance " << utt
-                       << " because no transform available.";
-            num_err++;
-            continue;
-          }
-
-          RegtreeFmllrDiagGmm fmllr(fmllr_reader.Value(utt));
-
-          kaldi::DecodableAmDiagGmmRegtreeFmllr gmm_decodable(am_gmm, trans_model,
-                                                            features, fmllr,
-                                                            regtree,
-                                                            acoustic_scale);
-          double like;
-          if (DecodeUtteranceLatticeFaster(
-                  decoder, gmm_decodable, trans_model, word_syms, utt, acoustic_scale,
-                  determinize, allow_partial, &alignment_writer, &words_writer,
-                  &compact_lattice_writer, &lattice_writer, &like)) {
-            tot_like += like;
-            frame_count += features.NumRows();
-            num_done++;
-          } else num_err++;
-        }
-      }
-      delete decode_fst; // delete this only after decoder goes out of scope.
-    } else { // We have different FSTs for different utterances.
-      SequentialTableReader<fst::VectorFstHolder> fst_reader(fst_in_str);
-      RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);          
-      for (; !fst_reader.Done(); fst_reader.Next()) {
-        std::string utt = fst_reader.Key();
-        const Matrix<BaseFloat> &features = feature_reader.Value(utt);
-        if (features.NumRows() == 0) {
-          KALDI_WARN << "Zero-length utterance: " << utt;
-          num_err++;
-          continue;
-        }
-        if (!fmllr_reader.HasKey(utt)) {
-          KALDI_WARN << "Not decoding utterance " << utt
-                     << " because no transform available.";
-          num_err++;
-          continue;
-        }
-
-        RegtreeFmllrDiagGmm fmllr(fmllr_reader.Value(utt));
-        kaldi::DecodableAmDiagGmmRegtreeFmllr gmm_decodable(am_gmm, trans_model,
-                                                            features, fmllr,
-                                                            regtree,
-                                                            acoustic_scale);
-
-        LatticeFasterDecoder decoder(fst_reader.Value(), config);
-        double like;
-        if (DecodeUtteranceLatticeFaster(
-                decoder, gmm_decodable, trans_model, word_syms, utt, acoustic_scale,
-                determinize, allow_partial, &alignment_writer, &words_writer,
-                &compact_lattice_writer, &lattice_writer, &like)) {
-          tot_like += like;
-          frame_count += features.NumRows();
-          num_done++;
-        } else num_err++;
-      }
-    }
-      
-    double elapsed = timer.Elapsed();
-    KALDI_LOG << "Time taken "<< elapsed
-              << "s: real-time factor assuming 100 frames/sec is "
-              << (elapsed*100.0/frame_count);
-    KALDI_LOG << "Done " << num_done << " utterances, failed for "
-              << num_err;
-    KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over "
-              << frame_count << " frames.";
-
-    delete word_syms;
-    if (num_done != 0) return 0;
-    else return 1;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/gmmbin/gmm-latgen-faster.cc b/src/gmmbin/gmm-latgen-faster.cc
index 6bc475d1b79..75a9d95aacd 100644
--- a/src/gmmbin/gmm-latgen-faster.cc
+++ b/src/gmmbin/gmm-latgen-faster.cc
@@ -24,7 +24,7 @@
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/decoder-wrappers.h"
 #include "gmm/decodable-am-diag-gmm.h"
@@ -72,7 +72,7 @@ int main(int argc, char *argv[]) {
         words_wspecifier = po.GetOptArg(5),
         alignment_wspecifier = po.GetOptArg(6);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
       bool binary;
diff --git a/src/gmmbin/gmm-latgen-map.cc b/src/gmmbin/gmm-latgen-map.cc
index 541b031fe6c..b7462b93e0f 100644
--- a/src/gmmbin/gmm-latgen-map.cc
+++ b/src/gmmbin/gmm-latgen-map.cc
@@ -26,7 +26,7 @@
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
 #include "gmm/mle-am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "transform/fmllr-diag-gmm.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/decoder-wrappers.h"
@@ -84,7 +84,7 @@ int main(int argc, char *argv[]) {
         words_wspecifier = po.GetOptArg(6),
         alignment_wspecifier = po.GetOptArg(7);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary_read;
       Input is(model_in_filename, &binary_read);
diff --git a/src/gmmbin/gmm-latgen-simple.cc b/src/gmmbin/gmm-latgen-simple.cc
index 812bee7fef4..d7ffe86c4ae 100644
--- a/src/gmmbin/gmm-latgen-simple.cc
+++ b/src/gmmbin/gmm-latgen-simple.cc
@@ -24,7 +24,7 @@
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/decoder-wrappers.h"
 #include "gmm/decodable-am-diag-gmm.h"
@@ -71,7 +71,7 @@ int main(int argc, char *argv[]) {
         words_wspecifier = po.GetOptArg(5),
         alignment_wspecifier = po.GetOptArg(6);
     
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
       bool binary;
diff --git a/src/gmmbin/gmm-make-regtree.cc b/src/gmmbin/gmm-make-regtree.cc
deleted file mode 100644
index 8c79d013e0d..00000000000
--- a/src/gmmbin/gmm-make-regtree.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-// gmmbin/gmm-make-regtree.cc
-
-// Copyright 2009-2011  Saarland University;  Microsoft Corporation
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/kaldi-io.h"
-#include "util/text-utils.h"
-#include "gmm/mle-am-diag-gmm.h"
-#include "tree/context-dep.h"
-#include "hmm/transition-model.h"
-#include "transform/regression-tree.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    typedef kaldi::int32 int32;
-    typedef kaldi::BaseFloat BaseFloat;
-
-    const char *usage =
-        "Build regression class tree.\n"
-        "Usage: gmm-make-regtree [options] <model-file> <regtree-out>\n"
-        "E.g.: gmm-make-regtree --silphones=1:2:3 --state-occs=1.occs 1.mdl 1.regtree\n"
-        " [Note: state-occs come from --write-occs option of gmm-est]\n";
-
-    std::string occs_in_filename;
-    std::string sil_phones_str;
-    bool binary_write = true;
-    int32 max_leaves = 1;
-    kaldi::ParseOptions po(usage);
-    po.Register("state-occs", &occs_in_filename, "File containing state occupancies (use --write-occs in gmm-est)");
-    po.Register("sil-phones", &sil_phones_str, "Colon-separated list of integer ids of silence phones, e.g. 1:2:3; if used, create top-level speech/sil split (only one reg-class for silence).");
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("max-leaves", &max_leaves, "Maximum number of leaves in regression tree.");
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_in_filename = po.GetArg(1),
-        tree_out_filename = po.GetArg(2);
-
-    kaldi::AmDiagGmm am_gmm;
-    kaldi::TransitionModel trans_model;
-    {
-      bool binary_read;
-      kaldi::Input ki(model_in_filename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_gmm.Read(ki.Stream(), binary_read);
-    }
-
-    kaldi::Vector<BaseFloat> state_occs;
-    if (occs_in_filename != "") {
-      bool binary_read;
-      kaldi::Input ki(occs_in_filename, &binary_read);
-      state_occs.Read(ki.Stream(), binary_read);
-    } else {
-      KALDI_LOG << "--state-occs option not provided so using constant occupancies.";
-      state_occs.Resize(am_gmm.NumPdfs());
-      state_occs.Set(1.0);
-    }
-
-    std::vector<int32> sil_pdfs;
-    if (sil_phones_str != "") {
-      std::vector<int32> sil_phones;
-      if (!kaldi::SplitStringToIntegers(sil_phones_str, ":", false, &sil_phones))
-        KALDI_ERR << "invalid sil-phones option " << sil_phones_str;
-      std::sort(sil_phones.begin(), sil_phones.end());
-      bool ans = GetPdfsForPhones(trans_model, sil_phones, &sil_pdfs);
-      if (!ans)
-        KALDI_WARN << "Pdfs associated with silence phones are not only "
-            "associated with silence phones: your speech-silence split "
-            "may not be meaningful.";
-    }
-
-    kaldi::RegressionTree regtree;
-    regtree.BuildTree(state_occs, sil_pdfs, am_gmm, max_leaves);
-    // Write out the regression tree
-    {
-      kaldi::Output ko(tree_out_filename, binary_write);
-      regtree.Write(ko.Stream(), binary_write);
-    }
-
-    KALDI_LOG << "Written regression tree to " << tree_out_filename;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/gmmbin/gmm-mixup.cc b/src/gmmbin/gmm-mixup.cc
index a76b3805d89..51919560b10 100644
--- a/src/gmmbin/gmm-mixup.cc
+++ b/src/gmmbin/gmm-mixup.cc
@@ -21,7 +21,7 @@
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "gmm/mle-am-diag-gmm.h"
 
 int main(int argc, char *argv[]) {
@@ -70,7 +70,7 @@ int main(int argc, char *argv[]) {
         model_out_filename = po.GetArg(3);
 
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary_read;
       Input ki(model_in_filename, &binary_read);
diff --git a/src/gmmbin/gmm-post-to-gpost.cc b/src/gmmbin/gmm-post-to-gpost.cc
index 59da0f9a1ac..1260c9b922a 100644
--- a/src/gmmbin/gmm-post-to-gpost.cc
+++ b/src/gmmbin/gmm-post-to-gpost.cc
@@ -22,7 +22,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/posterior.h"
 
 int main(int argc, char *argv[]) {
@@ -56,7 +56,7 @@ int main(int argc, char *argv[]) {
     typedef kaldi::int32 int32;
 
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary;
       Input ki(model_filename, &binary);
diff --git a/src/gmmbin/gmm-rescore-lattice.cc b/src/gmmbin/gmm-rescore-lattice.cc
index 54156442e64..36088cac304 100644
--- a/src/gmmbin/gmm-rescore-lattice.cc
+++ b/src/gmmbin/gmm-rescore-lattice.cc
@@ -22,7 +22,7 @@
 #include "util/common-utils.h"
 #include "util/stl-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "lat/kaldi-lattice.h"
 #include "lat/lattice-functions.h"
@@ -61,7 +61,7 @@ int main(int argc, char *argv[]) {
         lats_wspecifier = po.GetArg(4);
 
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary;
       Input ki(model_filename, &binary);
diff --git a/src/gmmbin/gmm-sum-accs.cc b/src/gmmbin/gmm-sum-accs.cc
index c9886e867f5..49146925bab 100644
--- a/src/gmmbin/gmm-sum-accs.cc
+++ b/src/gmmbin/gmm-sum-accs.cc
@@ -19,7 +19,7 @@
 
 #include "util/common-utils.h"
 #include "gmm/mle-am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 
 int main(int argc, char *argv[]) {
diff --git a/src/gmmbin/gmm-transform-means-global.cc b/src/gmmbin/gmm-transform-means-global.cc
index 6b1a6be8330..857b602c19b 100644
--- a/src/gmmbin/gmm-transform-means-global.cc
+++ b/src/gmmbin/gmm-transform-means-global.cc
@@ -22,7 +22,7 @@
 #include "util/common-utils.h"
 #include "gmm/diag-gmm.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "transform/mllt.h"
 
 int main(int argc, char *argv[]) {
diff --git a/src/gmmbin/gmm-transform-means.cc b/src/gmmbin/gmm-transform-means.cc
index 5c08ec32b10..3a27d73a947 100644
--- a/src/gmmbin/gmm-transform-means.cc
+++ b/src/gmmbin/gmm-transform-means.cc
@@ -22,7 +22,7 @@
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "transform/mllt.h"
 
 int main(int argc, char *argv[]) {
@@ -55,7 +55,7 @@ int main(int argc, char *argv[]) {
     ReadKaldiObject(mat_rxfilename, &mat);
 
     AmDiagGmm am_gmm;
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary_read;
       Input ki(model_in_rxfilename, &binary_read);
diff --git a/src/gst-plugin/gst-online-gmm-decode-faster.cc b/src/gst-plugin/gst-online-gmm-decode-faster.cc
index 958bce41d80..094d398960a 100644
--- a/src/gst-plugin/gst-online-gmm-decode-faster.cc
+++ b/src/gst-plugin/gst-online-gmm-decode-faster.cc
@@ -389,7 +389,7 @@ gst_online_gmm_decode_faster_allocate(GstOnlineGmmDecodeFaster * filter) {
       Input ki(filter->lda_mat_rspecifier_, &binary_in);
       filter->lda_transform_->Read(ki.Stream(), binary_in);
     }
-    filter->trans_model_ = new TransitionModel();
+    filter->trans_model_ = new Transitions();
     filter->am_gmm_ = new AmDiagGmm();
     {
       bool binary;
diff --git a/src/gst-plugin/gst-online-gmm-decode-faster.h b/src/gst-plugin/gst-online-gmm-decode-faster.h
index b950d1e0a12..529c510115a 100644
--- a/src/gst-plugin/gst-online-gmm-decode-faster.h
+++ b/src/gst-plugin/gst-online-gmm-decode-faster.h
@@ -65,7 +65,7 @@ struct _GstOnlineGmmDecodeFaster {
 
   OnlineFasterDecoder *decoder_;
   Matrix<BaseFloat> *lda_transform_;
-  TransitionModel *trans_model_;
+  Transitions *trans_model_;
   AmDiagGmm *am_gmm_;
   fst::Fst<fst::StdArc> *decode_fst_;
   fst::SymbolTable *word_syms_;
diff --git a/src/hmm/Makefile b/src/hmm/Makefile
index 0ad5da74c28..0315a51b214 100644
--- a/src/hmm/Makefile
+++ b/src/hmm/Makefile
@@ -3,14 +3,13 @@ all:
 
 include ../kaldi.mk
 
-TESTFILES = hmm-topology-test hmm-utils-test transition-model-test posterior-test
+TESTFILES = topology-test hmm-utils-test transitions-test posterior-test
 
-OBJFILES = hmm-topology.o transition-model.o hmm-utils.o tree-accu.o \
+OBJFILES = topology.o transitions.o hmm-utils.o tree-accu.o \
         posterior.o hmm-test-utils.o
 
 LIBNAME = kaldi-hmm
 ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
-          ../base/kaldi-base.a 
+          ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
-
diff --git a/src/hmm/hmm-test-utils.cc b/src/hmm/hmm-test-utils.cc
index ceca116c828..5f00474219b 100644
--- a/src/hmm/hmm-test-utils.cc
+++ b/src/hmm/hmm-test-utils.cc
@@ -23,7 +23,7 @@
 
 namespace kaldi {
 
-TransitionModel *GenRandTransitionModel(ContextDependency **ctx_dep_out) {
+Transitions *GenRandTransitionModel(ContextDependency **ctx_dep_out) {
   std::vector<int32> phones;
   phones.push_back(1);
   for (int32 i = 2; i < 20; i++)
@@ -38,16 +38,16 @@ TransitionModel *GenRandTransitionModel(ContextDependency **ctx_dep_out) {
       GenRandContextDependencyLarge(phones, N, P,
                                     true, &num_pdf_classes);
 
-  HmmTopology topo = GenRandTopology(phones, num_pdf_classes);
+  Topology topo = GenRandTopology(phones, num_pdf_classes);
 
-  TransitionModel *trans_model = new TransitionModel(*ctx_dep, topo);
+  Transitions *trans_model = new Transitions(*ctx_dep, topo);
 
   if (ctx_dep_out == NULL) delete ctx_dep;
   else *ctx_dep_out = ctx_dep;
   return trans_model;
 }
 
-HmmTopology GetDefaultTopology(const std::vector<int32> &phones_in) {
+Topology GetDefaultTopology(const std::vector<int32> &phones_in) {
   std::vector<int32> phones(phones_in);
   std::sort(phones.begin(), phones.end());
   KALDI_ASSERT(IsSortedAndUniq(phones) && !phones.empty());
@@ -76,7 +76,7 @@ HmmTopology GetDefaultTopology(const std::vector<int32> &phones_in) {
       " </TopologyEntry>\n"
       " </Topology>\n";
 
-  HmmTopology topo;
+  Topology topo;
   std::istringstream iss(topo_string.str());
   topo.Read(iss, false);
   return topo;
@@ -84,7 +84,7 @@ HmmTopology GetDefaultTopology(const std::vector<int32> &phones_in) {
 }
 
 
-HmmTopology GenRandTopology(const std::vector<int32> &phones_in,
+Topology GenRandTopology(const std::vector<int32> &phones_in,
                             const std::vector<int32> &num_pdf_classes) {
   std::vector<int32> phones(phones_in);
   std::sort(phones.begin(), phones.end());
@@ -165,13 +165,13 @@ HmmTopology GenRandTopology(const std::vector<int32> &phones_in,
   }
   topo_string << "</Topology>\n";
 
-  HmmTopology topo;
+  Topology topo;
   std::istringstream iss(topo_string.str());
   topo.Read(iss, false);
   return topo;
 }
 
-HmmTopology GenRandTopology() {
+Topology GenRandTopology() {
   std::vector<int32> phones;
   phones.push_back(1);
   for (int32 i = 2; i < 20; i++)
@@ -187,12 +187,12 @@ HmmTopology GenRandTopology() {
   }
 }
 
-void GeneratePathThroughHmm(const HmmTopology &topology,
+void GeneratePathThroughHmm(const Topology &topology,
                             bool reorder,
                             int32 phone,
                             std::vector<std::pair<int32, int32> > *path) {
   path->clear();
-  const HmmTopology::TopologyEntry &this_entry =
+  const Topology::TopologyEntry &this_entry =
       topology.TopologyForPhone(phone);
   int32 cur_state = 0;  // start-state is always state zero.
   int32 num_states = this_entry.size(), final_state = num_states - 1;
@@ -200,7 +200,7 @@ void GeneratePathThroughHmm(const HmmTopology &topology,
   // that's different from the start state.
   std::vector<std::pair<int32, int32> > pending_self_loops;
   while (cur_state != final_state) {
-    const HmmTopology::HmmState &cur_hmm_state = this_entry[cur_state];
+    const Topology::HmmState &cur_hmm_state = this_entry[cur_state];
     int32 num_transitions = cur_hmm_state.transitions.size(),
         transition_index = RandInt(0, num_transitions - 1);
     if (cur_hmm_state.forward_pdf_class != -1) {
@@ -230,7 +230,7 @@ void GeneratePathThroughHmm(const HmmTopology &topology,
 
 
 void GenerateRandomAlignment(const ContextDependencyInterface &ctx_dep,
-                             const TransitionModel &trans_model,
+                             const Transitions &trans_model,
                              bool reorder,
                              const std::vector<int32> &phone_sequence,
                              std::vector<int32> *alignment) {
@@ -253,7 +253,7 @@ void GenerateRandomAlignment(const ContextDependencyInterface &ctx_dep,
     int32 phone = phone_sequence[i];
     GeneratePathThroughHmm(trans_model.GetTopo(), reorder, phone, &path);
     for (size_t k = 0; k < path.size(); k++) {
-      const HmmTopology::TopologyEntry &entry =
+      const Topology::TopologyEntry &entry =
           trans_model.GetTopo().TopologyForPhone(phone);
       int32 hmm_state = path[k].first,
           transition_index = path[k].second,
diff --git a/src/hmm/hmm-test-utils.h b/src/hmm/hmm-test-utils.h
index 4faaa92fa66..f9f516e7d4c 100644
--- a/src/hmm/hmm-test-utils.h
+++ b/src/hmm/hmm-test-utils.h
@@ -21,38 +21,38 @@
 #ifndef KALDI_HMM_HMM_TEST_UTILS_H_
 #define KALDI_HMM_HMM_TEST_UTILS_H_
 
-#include "hmm/hmm-topology.h"
-#include "hmm/transition-model.h"
+#include "hmm/topology.h"
+#include "hmm/transitions.h"
 #include "lat/kaldi-lattice.h"
 #include "tree/context-dep.h"
 
 namespace kaldi {
 
-// Here we put a convenience function for generating a TransitionModel object --
+// Here we put a convenience function for generating a Transitions object --
 // useful in test code.  We may put other testing-related things here in time.
 
-// This function returns a randomly generated TransitionModel object.
+// This function returns a randomly generated Transitions object.
 // If 'ctx_dep' is not NULL, it outputs to *ctx_dep a pointer to the
 // tree that was used to generate the transition model.
-TransitionModel *GenRandTransitionModel(ContextDependency **ctx_dep);
+Transitions *GenRandTransitionModel(ContextDependency **ctx_dep);
 
-/// This function returns a HmmTopology object giving a normal 3-state topology,
+/// This function returns a Topology object giving a normal 3-state topology,
 /// covering all phones in the list "phones".  This is mainly of use in testing
 /// code.
-HmmTopology GetDefaultTopology(const std::vector<int32> &phones);
+Topology GetDefaultTopology(const std::vector<int32> &phones);
 
 
-/// This method of generating an arbitrary HmmTopology object allows you to
+/// This method of generating an arbitrary Topology object allows you to
 /// specify the number of pdf-classes for each phone separately.
 /// 'num_pdf_classes' is indexed by the phone-index (so the length will be
 /// longer than the length of the 'phones' vector, which for example lacks the
 /// zero index and may have gaps).
-HmmTopology GenRandTopology(const std::vector<int32> &phones,
+Topology GenRandTopology(const std::vector<int32> &phones,
                             const std::vector<int32> &num_pdf_classes);
 
 /// This version of GenRandTopology() generates the phone list and number of pdf
 /// classes randomly.
-HmmTopology GenRandTopology();
+Topology GenRandTopology();
 
 /// This function generates a random path through the HMM for the given
 /// phone.  The 'path' output is a list of pairs (HMM-state, transition-index)
@@ -60,7 +60,7 @@ HmmTopology GenRandTopology();
 /// used in other test code.
 /// the 'reorder' option is as described in the documentation; if true, the
 /// self-loops from a state are reordered to come after the forward-transition.
-void GeneratePathThroughHmm(const HmmTopology &topology,
+void GeneratePathThroughHmm(const Topology &topology,
                             bool reorder,
                             int32 phone,
                             std::vector<std::pair<int32, int32> > *path);
@@ -69,7 +69,7 @@ void GeneratePathThroughHmm(const HmmTopology &topology,
 /// For use in test code, this function generates an alignment (a sequence of
 /// transition-ids) corresponding to a given phone sequence.
 void GenerateRandomAlignment(const ContextDependencyInterface &ctx_dep,
-                             const TransitionModel &trans_model,
+                             const Transitions &trans_model,
                              bool reorder,
                              const std::vector<int32> &phone_sequence,
                              std::vector<int32> *alignment);
diff --git a/src/hmm/hmm-topology-test.cc b/src/hmm/hmm-topology-test.cc
index 14081d2355d..9a3a65b61a4 100644
--- a/src/hmm/hmm-topology-test.cc
+++ b/src/hmm/hmm-topology-test.cc
@@ -18,13 +18,13 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hmm/hmm-topology.h"
+#include "hmm/topology.h"
 #include "hmm/hmm-test-utils.h"
 
 namespace kaldi {
 
 
-void TestHmmTopology() {
+void TestTopology() {
   bool binary = (Rand()%2 == 0);
 
   std::string input_str = "<Topology>\n"
@@ -69,7 +69,7 @@ void TestHmmTopology() {
       "</TopologyEntry>\n"
       "</Topology>\n";
 
-  HmmTopology topo;
+  Topology topo;
 
   if (RandInt(0, 1) == 0) {
     topo = GenRandTopology();
@@ -83,7 +83,7 @@ void TestHmmTopology() {
   std::ostringstream oss;
   topo.Write(oss, binary);
 
-  HmmTopology topo2;
+  Topology topo2;
   // std::cout << oss.str() << '\n' << std::flush;
   std::istringstream iss2(oss.str());
   topo2.Read(iss2, binary);
@@ -96,7 +96,7 @@ void TestHmmTopology() {
   }
 
   {  // test chain topology
-    HmmTopology chain_topo;
+    Topology chain_topo;
     std::istringstream chain_iss(chain_input_str);
     chain_topo.Read(chain_iss, false);
     KALDI_ASSERT(chain_topo.MinLength(3) == 1);
@@ -116,7 +116,7 @@ void TestHmmTopology() {
 int main() {
   // repeat the test ten times
   for (int i = 0; i < 10; i++) {
-    kaldi::TestHmmTopology();
+    kaldi::TestTopology();
   }
   std::cout << "Test OK.\n";
 }
diff --git a/src/hmm/hmm-topology.h b/src/hmm/hmm-topology.h
deleted file mode 100644
index 750d35bcfe4..00000000000
--- a/src/hmm/hmm-topology.h
+++ /dev/null
@@ -1,194 +0,0 @@
-// hmm/hmm-topology.h
-
-// Copyright 2009-2011  Microsoft Corporation
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_HMM_HMM_TOPOLOGY_H_
-#define KALDI_HMM_HMM_TOPOLOGY_H_
-
-#include "base/kaldi-common.h"
-#include "util/const-integer-set.h"
-
-
-namespace kaldi {
-
-
-/// \addtogroup hmm_group
-/// @{
-
-/*
- // The following would be the text form for the "normal" HMM topology.
- // Note that the first state is the start state, and the final state,
- // which must have no output transitions and must be nonemitting, has
- // an exit probability of one (no other state can have nonzero exit
- // probability; you can treat the transition probability to the final
- // state as an exit probability).
- // Note also that it's valid to omit the "<PdfClass>" entry of the <State>, which
- // will mean we won't have a pdf on that state [non-emitting state].  This is equivalent
- // to setting the <PdfClass> to -1.  We do this normally just for the final state.
- // The Topology object can have multiple <TopologyEntry> blocks.
- // This is useful if there are multiple types of topology in the system.
-
- <Topology>
- <TopologyEntry>
- <ForPhones> 1 2 3 4 5 6 7 8 </ForPhones>
- <State> 0 <PdfClass> 0
- <Transition> 0 0.5
- <Transition> 1 0.5
- </State>
- <State> 1 <PdfClass> 1
- <Transition> 1 0.5
- <Transition> 2 0.5
- </State>
- <State> 2 <PdfClass> 2
- <Transition> 2 0.5
- <Transition> 3 0.5
- <Final> 0.5
- </State>
- <State> 3
- </State>
- </TopologyEntry>
- </Topology>
-*/
-
-// kNoPdf is used where pdf_class or pdf would be used, to indicate,
-// none is there.  Mainly useful in skippable models, but also used
-// for end states.
-// A caveat with nonemitting states is that their out-transitions
-// are not trainable, due to technical issues with the way
-// we decided to accumulate the stats.  Any transitions arising from (*)
-// HMM states with "kNoPdf" as the label are second-class transitions,
-// They do not have "transition-states" or "transition-ids" associated
-// with them.  They are used to create the FST version of the
-// HMMs, where they lead to epsilon arcs.
-// (*) "arising from" is a bit of a technical term here, due to the way
-// (if reorder == true), we put the transition-id associated with the
-// outward arcs of the state, on the input transition to the state.
-
-/// A constant used in the HmmTopology class as the \ref pdf_class "pdf-class"
-/// kNoPdf, which is used when a HMM-state is nonemitting (has no associated
-/// PDF).
-
-static const int32 kNoPdf = -1;
-
-/// A class for storing topology information for phones.  See  \ref hmm for context.
-/// This object is sometimes accessed in a file by itself, but more often
-/// as a class member of the Transition class (this is for convenience to reduce
-/// the number of files programs have to access).
-
-class HmmTopology {
- public:
-  /// A structure defined inside HmmTopology to represent a HMM state.
-  struct HmmState {
-    /// The \ref pdf_class forward-pdf-class, typically 0, 1 or 2 (the same as the HMM-state index),
-    /// but may be different to enable us to hardwire sharing of state, and may be
-    /// equal to \ref kNoPdf == -1 in order to specify nonemitting states (unusual).
-    int32 forward_pdf_class;
-
-    /// The \ref pdf_class self-loop pdf-class, similar to \ref pdf_class forward-pdf-class.
-    /// They will either both be \ref kNoPdf, or neither be \ref kNoPdf.
-    int32 self_loop_pdf_class;
-
-    /// A list of transitions, indexed by what we call a 'transition-index'.
-    /// The first member of each pair is the index of the next HmmState, and the
-    /// second is the default transition probability (before training).
-    std::vector<std::pair<int32, BaseFloat> > transitions;
-
-    explicit HmmState(int32 pdf_class) {
-      this->forward_pdf_class = pdf_class;
-      this->self_loop_pdf_class = pdf_class;
-    }
-    explicit HmmState(int32 forward_pdf_class, int32 self_loop_pdf_class) {
-      KALDI_ASSERT((forward_pdf_class != kNoPdf && self_loop_pdf_class != kNoPdf) ||
-                   (forward_pdf_class == kNoPdf && self_loop_pdf_class == kNoPdf));
-      this->forward_pdf_class = forward_pdf_class;
-      this->self_loop_pdf_class = self_loop_pdf_class;
-    }
-
-    bool operator == (const HmmState &other) const {
-      return (forward_pdf_class == other.forward_pdf_class &&
-              self_loop_pdf_class == other.self_loop_pdf_class &&
-              transitions == other.transitions);
-    }
-
-    HmmState(): forward_pdf_class(-1), self_loop_pdf_class(-1) { }
-  };
-
-  /// TopologyEntry is a typedef that represents the topology of
-  /// a single (prototype) state.
-  typedef std::vector<HmmState> TopologyEntry;
-
-  void Read(std::istream &is, bool binary);
-  void Write(std::ostream &os, bool binary) const;
-
-  // Checks that the object is valid, and throw exception otherwise.
-  void Check();
-
-  /// Returns true if this HmmTopology is really 'hmm-like', i.e. the pdf-class on
-  /// the self-loops and forward transitions of all states are identical. [note: in HMMs,
-  /// the densities are associated with the states.] We have extended this to
-  /// support 'non-hmm-like' topologies (where those pdf-classes are different),
-  /// in order to make for more compact decoding graphs in our so-called 'chain models'
-  /// (AKA lattice-free MMI), where we use 1-state topologies that have different pdf-classes
-  /// for the self-loop and the forward transition. Note that we always use the 'reorder=true'
-  /// option so the 'forward transition' actually comes before the self-loop.
-  bool IsHmm() const;
-
-  /// Returns the topology entry (i.e. vector of HmmState) for this phone;
-  /// will throw exception if phone not covered by the topology.
-  const TopologyEntry &TopologyForPhone(int32 phone) const;
-
-  /// Returns the number of \ref pdf_class "pdf-classes" for this phone;
-  /// throws exception if phone not covered by this topology.
-  int32 NumPdfClasses(int32 phone) const;
-
-  /// Returns a reference to a sorted, unique list of phones covered by
-  /// the topology (these phones will be positive integers, and usually
-  /// contiguous and starting from one but the toolkit doesn't assume
-  /// they are contiguous).
-  const std::vector<int32> &GetPhones() const { return phones_; };
-
-  /// Outputs a vector of int32, indexed by phone, that gives the
-  /// number of \ref pdf_class pdf-classes for the phones; this is
-  /// used by tree-building code such as BuildTree().
-  void GetPhoneToNumPdfClasses(std::vector<int32> *phone2num_pdf_classes) const;
-
-  // Returns the minimum number of frames it takes to traverse this model for
-  // this phone: e.g. 3 for the normal HMM topology.
-  int32 MinLength(int32 phone) const;
-
-  HmmTopology() {}
-
-  bool operator == (const HmmTopology &other) const {
-    return phones_ == other.phones_ && phone2idx_ == other.phone2idx_
-        && entries_ == other.entries_;
-  }
-  // Allow default assignment operator and copy constructor.
- private:
-  std::vector<int32> phones_;  // list of all phones we have topology for.  Sorted, uniq.  no epsilon (zero) phone.
-  std::vector<int32> phone2idx_;  // map from phones to indexes into the entries vector (or -1 for not present).
-  std::vector<TopologyEntry> entries_;
-};
-
-
-/// @} end "addtogroup hmm_group"
-
-
-} // end namespace kaldi
-
-
-#endif
diff --git a/src/hmm/hmm-utils-test.cc b/src/hmm/hmm-utils-test.cc
index 69728cc8ca7..cf282ac03c5 100644
--- a/src/hmm/hmm-utils-test.cc
+++ b/src/hmm/hmm-utils-test.cc
@@ -202,7 +202,7 @@ void TestAccumulateTreeStatsOptions() {
 
 void TestSplitToPhones() {
   ContextDependency *ctx_dep = NULL;
-  TransitionModel *trans_model = GenRandTransitionModel(&ctx_dep);
+  Transitions *trans_model = GenRandTransitionModel(&ctx_dep);
   std::vector<int32> phone_seq;
   int32 num_phones = RandInt(0, 10);
   const std::vector<int32> &phone_list = trans_model->GetPhones();
@@ -273,11 +273,11 @@ void TestConvertAlignment() {
   }
 
 
-  HmmTopology topo_old = GenRandTopology(phones, num_pdf_classes_old),
+  Topology topo_old = GenRandTopology(phones, num_pdf_classes_old),
       topo_new =  (new_topology ?
                    GenRandTopology(phones, num_pdf_classes_new) : topo_old);
 
-  TransitionModel trans_model_old(*ctx_dep_old, topo_old),
+  Transitions trans_model_old(*ctx_dep_old, topo_old),
       trans_model_new(*ctx_dep_new, topo_new);
 
   std::vector<int32> phone_sequence;
diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc
index 06edf8d5976..a70dc5275c2 100644
--- a/src/hmm/hmm-utils.cc
+++ b/src/hmm/hmm-utils.cc
@@ -32,7 +32,7 @@ namespace kaldi {
 fst::VectorFst<fst::StdArc> *GetHmmAsFsa(
     std::vector<int32> phone_window,
     const ContextDependencyInterface &ctx_dep,
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     const HTransducerConfig &config,
     HmmCacheType *cache) {
   using namespace fst;
@@ -48,8 +48,8 @@ fst::VectorFst<fst::StdArc> *GetHmmAsFsa(
     KALDI_ERR << "phone == 0.  Some mismatch happened, or there is "
           "a code error.";
 
-  const HmmTopology &topo = trans_model.GetTopo();
-  const HmmTopology::TopologyEntry &entry  = topo.TopologyForPhone(phone);
+  const Topology &topo = trans_model.GetTopo();
+  const Topology::TopologyEntry &entry  = topo.TopologyForPhone(phone);
 
   // vector of the pdfs, indexed by pdf-class (pdf-classes must start from zero
   // and be contiguous).
@@ -154,7 +154,7 @@ fst::VectorFst<fst::StdArc> *GetHmmAsFsa(
 fst::VectorFst<fst::StdArc>*
 GetHmmAsFsaSimple(std::vector<int32> phone_window,
                   const ContextDependencyInterface &ctx_dep,
-                  const TransitionModel &trans_model,
+                  const Transitions &trans_model,
                   BaseFloat prob_scale) {
   using namespace fst;
 
@@ -167,8 +167,8 @@ GetHmmAsFsaSimple(std::vector<int32> phone_window,
   int32 phone = phone_window[P];
   KALDI_ASSERT(phone != 0);
 
-  const HmmTopology &topo = trans_model.GetTopo();
-  const HmmTopology::TopologyEntry &entry  = topo.TopologyForPhone(phone);
+  const Topology &topo = trans_model.GetTopo();
+  const Topology::TopologyEntry &entry  = topo.TopologyForPhone(phone);
 
   VectorFst<StdArc> *ans = new VectorFst<StdArc>;
 
@@ -253,7 +253,7 @@ static inline fst::VectorFst<fst::StdArc> *MakeTrivialAcceptor(int32 label) {
 // The H transducer has a separate outgoing arc for each of the symbols in ilabel_info.
 fst::VectorFst<fst::StdArc> *GetHTransducer(const std::vector<std::vector<int32> > &ilabel_info,
                                             const ContextDependencyInterface &ctx_dep,
-                                            const TransitionModel &trans_model,
+                                            const Transitions &trans_model,
                                             const HTransducerConfig &config,
                                             std::vector<int32> *disambig_syms_left) {
   KALDI_ASSERT(ilabel_info.size() >= 1 && ilabel_info[0].size() == 0);  // make sure that eps == eps.
@@ -334,7 +334,7 @@ fst::VectorFst<fst::StdArc> *GetHTransducer(const std::vector<std::vector<int32>
 
 void GetIlabelMapping (const std::vector<std::vector<int32> > &ilabel_info_old,
                        const ContextDependencyInterface &ctx_dep,
-                       const TransitionModel &trans_model,
+                       const Transitions &trans_model,
                        std::vector<int32> *old2new_map) {
   KALDI_ASSERT(old2new_map != NULL);
 
@@ -404,7 +404,7 @@ void GetIlabelMapping (const std::vector<std::vector<int32> > &ilabel_info_old,
 
 
 
-fst::VectorFst<fst::StdArc> *GetPdfToTransitionIdTransducer(const TransitionModel &trans_model) {
+fst::VectorFst<fst::StdArc> *GetPdfToTransitionIdTransducer(const Transitions &trans_model) {
   using namespace fst;
   VectorFst<StdArc> *ans = new VectorFst<StdArc>;
   typedef VectorFst<StdArc>::Weight Weight;
@@ -437,7 +437,7 @@ class TidToTstateMapper {
   // with values over 100000/kNontermBigNumber) to zero.
   // Its point is to provide an equivalence class on labels that's relevant to what
   // the self-loop will be on the following (or preceding) state.
-  TidToTstateMapper(const TransitionModel &trans_model,
+  TidToTstateMapper(const Transitions &trans_model,
                     const std::vector<int32> &disambig_syms,
                     bool check_no_self_loops):
       trans_model_(trans_model),
@@ -461,7 +461,7 @@ class TidToTstateMapper {
   }
 
 private:
-  const TransitionModel &trans_model_;
+  const Transitions &trans_model_;
   const std::vector<int32> &disambig_syms_;  // sorted.
   bool check_no_self_loops_;
 };
@@ -469,7 +469,7 @@ class TidToTstateMapper {
 // This is the code that expands an FST from transition-states to
 // transition-ids, in the case where reorder == true, i.e. the non-optional
 // transition is before the self-loop.
-static void AddSelfLoopsReorder(const TransitionModel &trans_model,
+static void AddSelfLoopsReorder(const Transitions &trans_model,
                                 const std::vector<int32> &disambig_syms,
                                 BaseFloat self_loop_scale,
                                 bool check_no_self_loops,
@@ -553,7 +553,7 @@ static void AddSelfLoopsReorder(const TransitionModel &trans_model,
 // transition-ids, in the case where reorder == false, i.e. non-optional
 // transition is after the self-loop.
 static void AddSelfLoopsNoReorder(
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     const std::vector<int32> &disambig_syms,
     BaseFloat self_loop_scale,
     bool check_no_self_loops,
@@ -599,7 +599,7 @@ static void AddSelfLoopsNoReorder(
   }
 }
 
-void AddSelfLoops(const TransitionModel &trans_model,
+void AddSelfLoops(const Transitions &trans_model,
                   const std::vector<int32> &disambig_syms,
                   BaseFloat self_loop_scale,
                   bool reorder,
@@ -622,7 +622,7 @@ void AddSelfLoops(const TransitionModel &trans_model,
 // code doesn't care what the answer is.
 // The "alignment" vector contains a sequence of TransitionIds.
 
-static bool IsReordered(const TransitionModel &trans_model,
+static bool IsReordered(const Transitions &trans_model,
                         const std::vector<int32> &alignment) {
   for (size_t i = 0; i + 1 < alignment.size(); i++) {
     int32 tstate1 = trans_model.TransitionIdToTransitionState(alignment[i]),
@@ -656,7 +656,7 @@ static bool IsReordered(const TransitionModel &trans_model,
 // checks (if the input does not start at the start of a phone or does not
 // end at the end of a phone, we should expect that false will be returned).
 
-static bool SplitToPhonesInternal(const TransitionModel &trans_model,
+static bool SplitToPhonesInternal(const Transitions &trans_model,
                                   const std::vector<int32> &alignment,
                                   bool reordered,
                                   std::vector<std::vector<int32> > *split_output) {
@@ -720,7 +720,7 @@ static bool SplitToPhonesInternal(const TransitionModel &trans_model,
 }
 
 
-bool SplitToPhones(const TransitionModel &trans_model,
+bool SplitToPhones(const Transitions &trans_model,
                    const std::vector<int32> &alignment,
                    std::vector<std::vector<int32> > *split_alignment) {
   KALDI_ASSERT(split_alignment != NULL);
@@ -740,8 +740,8 @@ bool SplitToPhones(const TransitionModel &trans_model,
     'subsample' value is not 1).
  */
 static inline void ConvertAlignmentForPhone(
-    const TransitionModel &old_trans_model,
-    const TransitionModel &new_trans_model,
+    const Transitions &old_trans_model,
+    const Transitions &new_trans_model,
     const ContextDependencyInterface &new_ctx_dep,
     const std::vector<int32> &old_phone_alignment,
     const std::vector<int32> &new_phone_window,
@@ -754,7 +754,7 @@ static inline void ConvertAlignmentForPhone(
       old_central_phone = old_trans_model.TransitionIdToPhone(
           old_phone_alignment[0]),
       new_central_phone = new_phone_window[P];
-  const HmmTopology &old_topo = old_trans_model.GetTopo(),
+  const Topology &old_topo = old_trans_model.GetTopo(),
       &new_topo = new_trans_model.GetTopo();
 
   bool topology_mismatch = !(old_topo.TopologyForPhone(old_central_phone) ==
@@ -846,7 +846,7 @@ static inline void ConvertAlignmentForPhone(
                                 reduced-frame-rate system.
    @param new_lengths [out]     The vector for storing new lengths.
 */
-static bool ComputeNewPhoneLengths(const HmmTopology &topology,
+static bool ComputeNewPhoneLengths(const Topology &topology,
                                    const std::vector<int32> &mapped_phones,
                                    const std::vector<int32> &old_lengths,
                                    int32 conversion_shift,
@@ -923,8 +923,8 @@ static bool ComputeNewPhoneLengths(const HmmTopology &topology,
   'conversion_shift' is for.
 */
 
-static bool ConvertAlignmentInternal(const TransitionModel &old_trans_model,
-                      const TransitionModel &new_trans_model,
+static bool ConvertAlignmentInternal(const Transitions &old_trans_model,
+                      const Transitions &new_trans_model,
                       const ContextDependencyInterface &new_ctx_dep,
                       const std::vector<int32> &old_alignment,
                       int32 conversion_shift,
@@ -1010,8 +1010,8 @@ static bool ConvertAlignmentInternal(const TransitionModel &old_trans_model,
   return true;
 }
 
-bool ConvertAlignment(const TransitionModel &old_trans_model,
-                      const TransitionModel &new_trans_model,
+bool ConvertAlignment(const Transitions &old_trans_model,
+                      const Transitions &new_trans_model,
                       const ContextDependencyInterface &new_ctx_dep,
                       const std::vector<int32> &old_alignment,
                       int32 subsample_factor,
@@ -1062,7 +1062,7 @@ bool ConvertAlignment(const TransitionModel &old_trans_model,
 }
 
 // Returns the scaled, but not negated, log-prob, with the given scaling factors.
-static BaseFloat GetScaledTransitionLogProb(const TransitionModel &trans_model,
+static BaseFloat GetScaledTransitionLogProb(const Transitions &trans_model,
                                             int32 trans_id,
                                             BaseFloat transition_scale,
                                             BaseFloat self_loop_scale) {
@@ -1085,7 +1085,7 @@ static BaseFloat GetScaledTransitionLogProb(const TransitionModel &trans_model,
 
 
 
-void AddTransitionProbs(const TransitionModel &trans_model,
+void AddTransitionProbs(const Transitions &trans_model,
                         const std::vector<int32> &disambig_syms,  // may be empty
                         BaseFloat transition_scale,
                         BaseFloat self_loop_scale,
@@ -1118,7 +1118,7 @@ void AddTransitionProbs(const TransitionModel &trans_model,
   }
 }
 
-void AddTransitionProbs(const TransitionModel &trans_model,
+void AddTransitionProbs(const Transitions &trans_model,
                         BaseFloat transition_scale,
                         BaseFloat self_loop_scale,
                         Lattice *lat) {
@@ -1205,7 +1205,7 @@ bool ConvertPhnxToProns(const std::vector<int32> &phnx,
 
 
 void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep,
-                                const TransitionModel &trans_model,
+                                const Transitions &trans_model,
                                 const std::vector<int32> &phone_window,
                                 std::vector<int32> *alignment) {
   typedef fst::StdArc Arc;
@@ -1257,7 +1257,7 @@ void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep,
   delete fst;
 }
 
-void ChangeReorderingOfAlignment(const TransitionModel &trans_model,
+void ChangeReorderingOfAlignment(const Transitions &trans_model,
                                  std::vector<int32> *alignment) {
   int32 start_pos = 0, size = alignment->size();
   while (start_pos != size) {
diff --git a/src/hmm/hmm-utils.h b/src/hmm/hmm-utils.h
index a8ad846949e..9cefa557bb3 100644
--- a/src/hmm/hmm-utils.h
+++ b/src/hmm/hmm-utils.h
@@ -20,8 +20,8 @@
 #ifndef KALDI_HMM_HMM_UTILS_H_
 #define KALDI_HMM_HMM_UTILS_H_
 
-#include "hmm/hmm-topology.h"
-#include "hmm/transition-model.h"
+#include "hmm/topology.h"
+#include "hmm/transitions.h"
 #include "lat/kaldi-lattice.h"
 
 namespace kaldi {
@@ -93,7 +93,7 @@ typedef unordered_map<std::pair<int32, std::vector<int32> >,
 fst::VectorFst<fst::StdArc> *GetHmmAsFsa(
     std::vector<int32> context_window,
     const ContextDependencyInterface &ctx_dep,
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     const HTransducerConfig &config,
     HmmCacheType *cache = NULL);
 
@@ -104,7 +104,7 @@ fst::VectorFst<fst::StdArc> *GetHmmAsFsa(
 fst::VectorFst<fst::StdArc>*
 GetHmmAsFsaSimple(std::vector<int32> context_window,
                   const ContextDependencyInterface &ctx_dep,
-                  const TransitionModel &trans_model,
+                  const Transitions &trans_model,
                   BaseFloat prob_scale);
 
 
@@ -126,7 +126,7 @@ GetHmmAsFsaSimple(std::vector<int32> context_window,
 fst::VectorFst<fst::StdArc>*
 GetHTransducer(const std::vector<std::vector<int32> > &ilabel_info,
                const ContextDependencyInterface &ctx_dep,
-               const TransitionModel &trans_model,
+               const Transitions &trans_model,
                const HTransducerConfig &config,
                std::vector<int32> *disambig_syms_left);
 
@@ -148,7 +148,7 @@ GetHTransducer(const std::vector<std::vector<int32> > &ilabel_info,
   */
 void GetIlabelMapping(const std::vector<std::vector<int32> > &ilabel_info_old,
                       const ContextDependencyInterface &ctx_dep,
-                      const TransitionModel &trans_model,
+                      const Transitions &trans_model,
                       std::vector<int32> *old2new_map);
 
 
@@ -182,7 +182,7 @@ void GetIlabelMapping(const std::vector<std::vector<int32> > &ilabel_info_old,
   *                      which emulates the behavior of older code.
   * @param  fst [in, out] The FST to be modified.
   */
-void AddSelfLoops(const TransitionModel &trans_model,
+void AddSelfLoops(const Transitions &trans_model,
                   const std::vector<int32> &disambig_syms,  // used as a check only.
                   BaseFloat self_loop_scale,
                   bool reorder,
@@ -206,7 +206,7 @@ void AddSelfLoops(const TransitionModel &trans_model,
   *                      see \ref hmm_scale.
   * @param  fst [in, out] The FST to be modified.
   */
-void AddTransitionProbs(const TransitionModel &trans_model,
+void AddTransitionProbs(const Transitions &trans_model,
                         const std::vector<int32> &disambig_syms,
                         BaseFloat transition_scale,
                         BaseFloat self_loop_scale,
@@ -216,7 +216,7 @@ void AddTransitionProbs(const TransitionModel &trans_model,
    This is as AddSelfLoops(), but operates on a Lattice, where
    it affects the graph part of the weight (the first element
    of the pair). */
-void AddTransitionProbs(const TransitionModel &trans_model,
+void AddTransitionProbs(const Transitions &trans_model,
                         BaseFloat transition_scale,
                         BaseFloat self_loop_scale,
                         Lattice *lat);
@@ -225,11 +225,11 @@ void AddTransitionProbs(const TransitionModel &trans_model,
 /// Returns a transducer from pdfs plus one (input) to  transition-ids (output).
 /// Currenly of use only for testing.
 fst::VectorFst<fst::StdArc>*
-GetPdfToTransitionIdTransducer(const TransitionModel &trans_model);
+GetPdfToTransitionIdTransducer(const Transitions &trans_model);
 
 /// Converts all transition-ids in the FST to pdfs plus one.
 /// Placeholder: not implemented yet!
-void ConvertTransitionIdsToPdfs(const TransitionModel &trans_model,
+void ConvertTransitionIdsToPdfs(const Transitions &trans_model,
                                 const std::vector<int32> &disambig_syms,
                                 fst::VectorFst<fst::StdArc> *fst);
 
@@ -248,7 +248,7 @@ void ConvertTransitionIdsToPdfs(const TransitionModel &trans_model,
 /// die or throw an exception.
 /// This function works out by itself whether the graph was created
 /// with "reordering", and just does the right thing.
-bool SplitToPhones(const TransitionModel &trans_model,
+bool SplitToPhones(const Transitions &trans_model,
                    const std::vector<int32> &alignment,
                    std::vector<std::vector<int32> > *split_alignment);
 
@@ -279,13 +279,13 @@ bool SplitToPhones(const TransitionModel &trans_model,
                                 the same as the input where possible.]
    @param reorder [in]          True if you want the pdf-ids on the new alignment to
                                 be 'reordered'. (vs. the way they appear in
-                                the HmmTopology object)
+                                the Topology object)
    @param phone_map [in]        If non-NULL, map from old to new phones.
    @param new_alignment [out]   The converted alignment.
 */
 
-bool ConvertAlignment(const TransitionModel &old_trans_model,
-                      const TransitionModel &new_trans_model,
+bool ConvertAlignment(const Transitions &old_trans_model,
+                      const Transitions &new_trans_model,
                       const ContextDependencyInterface &new_ctx_dep,
                       const std::vector<int32> &old_alignment,
                       int32 subsample_factor,  // 1 in the normal case -> no subsampling.
@@ -319,14 +319,14 @@ bool ConvertPhnxToProns(const std::vector<int32> &phnx,
    The alignment will be without 'reordering'.
 */
 void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep,
-                                const TransitionModel &trans_model,
+                                const Transitions &trans_model,
                                 const std::vector<int32> &phone_window,
                                 std::vector<int32> *alignment);
 
 /*
   If the alignment was non-reordered makes it reordered, and vice versa.
 */
-void ChangeReorderingOfAlignment(const TransitionModel &trans_model,
+void ChangeReorderingOfAlignment(const Transitions &trans_model,
                                  std::vector<int32> *alignment);
 
 /// @} end "addtogroup hmm_group"
diff --git a/src/hmm/posterior.cc b/src/hmm/posterior.cc
index 860a979a0ce..3089be237b2 100644
--- a/src/hmm/posterior.cc
+++ b/src/hmm/posterior.cc
@@ -299,8 +299,8 @@ void AlignmentToPosterior(const std::vector<int32> &ali,
 }
 
 struct ComparePosteriorByPdfs {
-  const TransitionModel *tmodel_;
-  ComparePosteriorByPdfs(const TransitionModel &tmodel): tmodel_(&tmodel) {}
+  const Transitions *tmodel_;
+  ComparePosteriorByPdfs(const Transitions &tmodel): tmodel_(&tmodel) {}
   bool operator() (const std::pair<int32, BaseFloat> &a,
                    const std::pair<int32, BaseFloat> &b) {
     if (tmodel_->TransitionIdToPdf(a.first)
@@ -311,7 +311,7 @@ struct ComparePosteriorByPdfs {
   }
 };
 
-void SortPosteriorByPdfs(const TransitionModel &tmodel,
+void SortPosteriorByPdfs(const Transitions &tmodel,
                          Posterior *post) {
   ComparePosteriorByPdfs compare(tmodel);
   for (size_t i = 0; i < post->size(); i++) {
@@ -319,7 +319,7 @@ void SortPosteriorByPdfs(const TransitionModel &tmodel,
   }
 }
 
-void ConvertPosteriorToPdfs(const TransitionModel &tmodel,
+void ConvertPosteriorToPdfs(const Transitions &tmodel,
                             const Posterior &post_in,
                             Posterior *post_out) {
   post_out->clear();
@@ -345,7 +345,7 @@ void ConvertPosteriorToPdfs(const TransitionModel &tmodel,
   }
 }
 
-void ConvertPosteriorToPhones(const TransitionModel &tmodel,
+void ConvertPosteriorToPhones(const Transitions &tmodel,
                               const Posterior &post_in,
                               Posterior *post_out) {
   post_out->clear();
@@ -372,7 +372,7 @@ void ConvertPosteriorToPhones(const TransitionModel &tmodel,
 }
 
 
-void WeightSilencePost(const TransitionModel &trans_model,
+void WeightSilencePost(const Transitions &trans_model,
                        const ConstIntegerSet<int32> &silence_set,
                        BaseFloat silence_scale,
                        Posterior *post) {
@@ -395,7 +395,7 @@ void WeightSilencePost(const TransitionModel &trans_model,
 }
 
 
-void WeightSilencePostDistributed(const TransitionModel &trans_model,
+void WeightSilencePostDistributed(const Transitions &trans_model,
                                   const ConstIntegerSet<int32> &silence_set,
                                   BaseFloat silence_scale,
                                   Posterior *post) {
@@ -537,7 +537,7 @@ template void PosteriorToMatrix<double>(const Posterior &post,
 
 template <typename Real>
 void PosteriorToPdfMatrix(const Posterior &post,
-                          const TransitionModel &model,
+                          const Transitions &model,
                           Matrix<Real> *mat) {
   // Allocate the matrix,
   int32 num_rows = post.size(),
@@ -557,10 +557,10 @@ void PosteriorToPdfMatrix(const Posterior &post,
 }
 // instantiate the template function,
 template void PosteriorToPdfMatrix<float>(const Posterior &post,
-                                          const TransitionModel &model,
+                                          const Transitions &model,
                                           Matrix<float> *mat);
 template void PosteriorToPdfMatrix<double>(const Posterior &post,
-                                           const TransitionModel &model,
+                                           const Transitions &model,
                                            Matrix<double> *mat);
 
 } // End namespace kaldi
diff --git a/src/hmm/posterior.h b/src/hmm/posterior.h
index e153c249740..7663cf0ce42 100644
--- a/src/hmm/posterior.h
+++ b/src/hmm/posterior.h
@@ -26,7 +26,7 @@
 #include "base/kaldi-common.h"
 #include "util/const-integer-set.h"
 #include "util/kaldi-table.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "matrix/kaldi-matrix.h"
 
 
@@ -205,19 +205,19 @@ void AlignmentToPosterior(const std::vector<int32> &ali,
 
 /// Sorts posterior entries so that transition-ids with same pdf-id are next to
 /// each other.
-void SortPosteriorByPdfs(const TransitionModel &tmodel,
+void SortPosteriorByPdfs(const Transitions &tmodel,
                          Posterior *post);
 
 
 /// Converts a posterior over transition-ids to be a posterior
 /// over pdf-ids.
-void ConvertPosteriorToPdfs(const TransitionModel &tmodel,
+void ConvertPosteriorToPdfs(const Transitions &tmodel,
                             const Posterior &post_in,
                             Posterior *post_out);
 
 /// Converts a posterior over transition-ids to be a posterior
 /// over phones.
-void ConvertPosteriorToPhones(const TransitionModel &tmodel,
+void ConvertPosteriorToPhones(const Transitions &tmodel,
                               const Posterior &post_in,
                               Posterior *post_out);
 
@@ -225,7 +225,7 @@ void ConvertPosteriorToPhones(const TransitionModel &tmodel,
 /// in the set "silence_set" by scale "silence_scale".
 /// The interface was changed in Feb 2014 to do the modification
 /// "in-place" rather than having separate input and output.
-void WeightSilencePost(const TransitionModel &trans_model,
+void WeightSilencePost(const Transitions &trans_model,
                        const ConstIntegerSet<int32> &silence_set,
                        BaseFloat silence_scale,
                        Posterior *post);
@@ -236,7 +236,7 @@ void WeightSilencePost(const TransitionModel &trans_model,
 /// has the effect that frames that are mostly silence get down-weighted.
 /// The interface was changed in Feb 2014 to do the modification
 /// "in-place" rather than having separate input and output.
-void WeightSilencePostDistributed(const TransitionModel &trans_model,
+void WeightSilencePostDistributed(const Transitions &trans_model,
                                   const ConstIntegerSet<int32> &silence_set,
                                   BaseFloat silence_scale,
                                   Posterior *post);
@@ -250,11 +250,11 @@ void PosteriorToMatrix(const Posterior &post,
 
 /// This converts a Posterior to a Matrix. The number of matrix-rows is the same
 /// as the 'post.size()', the number of matrix-columns is defined by 'NumPdfs'
-/// in the TransitionModel.
+/// in the Transitions.
 /// The elements which are not specified in 'Posterior' are equal to zero.
 template <typename Real>
 void PosteriorToPdfMatrix(const Posterior &post,
-                          const TransitionModel &model,
+                          const Transitions &model,
                           Matrix<Real> *mat);
 
 /// @} end "addtogroup posterior_group"
diff --git a/src/hmm/hmm-topology.cc b/src/hmm/topology.cc
similarity index 89%
rename from src/hmm/hmm-topology.cc
rename to src/hmm/topology.cc
index cf134065dbf..a0563f90c0d 100644
--- a/src/hmm/hmm-topology.cc
+++ b/src/hmm/topology.cc
@@ -1,7 +1,7 @@
-// hmm/hmm-topology.cc
+// hmm/topology.cc
 
 // Copyright 2009-2011  Microsoft Corporation
-//                2014  Johns Hopkins University (author: Daniel Povey)
+//           2014-2019  Johns Hopkins University (author: Daniel Povey)
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -20,7 +20,7 @@
 
 #include <vector>
 
-#include "hmm/hmm-topology.h"
+#include "hmm/topology.h"
 #include "util/text-utils.h"
 
 
@@ -28,7 +28,7 @@ namespace kaldi {
 
 
 
-void HmmTopology::GetPhoneToNumPdfClasses(std::vector<int32> *phone2num_pdf_classes) const {
+void Topology::GetPhoneToNumPdfClasses(std::vector<int32> *phone2num_pdf_classes) const {
   KALDI_ASSERT(!phones_.empty());
   phone2num_pdf_classes->clear();
   phone2num_pdf_classes->resize(phones_.back() + 1, -1);
@@ -36,7 +36,7 @@ void HmmTopology::GetPhoneToNumPdfClasses(std::vector<int32> *phone2num_pdf_clas
     (*phone2num_pdf_classes)[phones_[i]] = NumPdfClasses(phones_[i]);
 }
 
-void HmmTopology::Read(std::istream &is, bool binary) {
+void Topology::Read(std::istream &is, bool binary) {
   ExpectToken(is, binary, "<Topology>");
   if (!binary) {  // Text-mode read, different "human-readable" format.
     phones_.clear();
@@ -46,19 +46,19 @@ void HmmTopology::Read(std::istream &is, bool binary) {
     while ( ! (is >> token).fail() ) {
       if (token == "</Topology>") { break; } // finished parsing.
       else  if (token != "<TopologyEntry>") {
-        KALDI_ERR << "Reading HmmTopology object, expected </Topology> or <TopologyEntry>, got "<<token;
+        KALDI_ERR << "Reading Topology object, expected </Topology> or <TopologyEntry>, got "<<token;
       } else {
         ExpectToken(is, binary, "<ForPhones>");
         std::vector<int32> phones;
         std::string s;
         while (1) {
           is >> s;
-          if (is.fail()) KALDI_ERR << "Reading HmmTopology object, unexpected end of file while expecting phones.";
+          if (is.fail()) KALDI_ERR << "Reading Topology object, unexpected end of file while expecting phones.";
           if (s == "</ForPhones>") break;
           else {
             int32 phone;
             if (!ConvertStringToInteger(s, &phone))
-              KALDI_ERR << "Reading HmmTopology object, expected "
+              KALDI_ERR << "Reading Topology object, expected "
                         << "integer, got instead " << s;
             phones.push_back(phone);
           }
@@ -105,7 +105,7 @@ void HmmTopology::Read(std::istream &is, bool binary) {
           if(token == "<Final>") // TODO: remove this clause after a while.
             KALDI_ERR << "You are trying to read old-format topology with new Kaldi.";
           if (token != "</State>")
-            KALDI_ERR << "Reading HmmTopology,  unexpected token "<<token;
+            KALDI_ERR << "Reading Topology,  unexpected token "<<token;
           ReadToken(is, binary, &token);
         }
         int32 my_index = entries_.size();
@@ -161,7 +161,7 @@ void HmmTopology::Read(std::istream &is, bool binary) {
 }
 
 
-void HmmTopology::Write(std::ostream &os, bool binary) const {
+void Topology::Write(std::ostream &os, bool binary) const {
   bool is_hmm = IsHmm();
   WriteToken(os, binary, "<Topology>");
   if (!binary) {  // Text-mode write.
@@ -228,15 +228,15 @@ void HmmTopology::Write(std::ostream &os, bool binary) const {
   if (!binary) os << "\n";
 }
 
-void HmmTopology::Check() {
+void Topology::Check() {
   if (entries_.empty() || phones_.empty() || phone2idx_.empty())
-    KALDI_ERR << "HmmTopology::Check(), empty object.";
+    KALDI_ERR << "Topology::Check(), empty object.";
   std::vector<bool> is_seen(entries_.size(), false);
   for (size_t i = 0; i < phones_.size(); i++) {
     int32 phone = phones_[i];
     if (static_cast<size_t>(phone) >= phone2idx_.size() ||
         static_cast<size_t>(phone2idx_[phone]) >= entries_.size())
-      KALDI_ERR << "HmmTopology::Check(), phone has no valid index.";
+      KALDI_ERR << "Topology::Check(), phone has no valid index.";
     is_seen[phone2idx_[phone]] = true;
   }
   for (size_t i = 0; i < entries_.size(); i++) {
@@ -244,13 +244,13 @@ void HmmTopology::Check() {
       KALDI_ERR << "HmmTopoloy::Check(), entry with no corresponding phones.";
     int32 num_states = static_cast<int32>(entries_[i].size());
     if (num_states <= 1)
-      KALDI_ERR << "HmmTopology::Check(), cannot only have one state (i.e., must "
+      KALDI_ERR << "Topology::Check(), cannot only have one state (i.e., must "
           "have at least one emitting state).";
     if (!entries_[i][num_states-1].transitions.empty())
-      KALDI_ERR << "HmmTopology::Check(), last state must have no transitions.";
+      KALDI_ERR << "Topology::Check(), last state must have no transitions.";
     // not sure how necessary this next stipulation is.
     if (entries_[i][num_states-1].forward_pdf_class != kNoPdf)
-      KALDI_ERR << "HmmTopology::Check(), last state must not be emitting.";
+      KALDI_ERR << "Topology::Check(), last state must not be emitting.";
 
     std::vector<bool> has_trans_in(num_states, false);
     std::vector<int32> seen_pdf_classes;
@@ -267,7 +267,7 @@ void HmmTopology::Check() {
            k++) {
         tot_prob += entries_[i][j].transitions[k].second;
         if (entries_[i][j].transitions[k].second <= 0.0)
-          KALDI_ERR << "HmmTopology::Check(), negative or zero transition prob.";
+          KALDI_ERR << "Topology::Check(), negative or zero transition prob.";
         int32 dst_state = entries_[i][j].transitions[k].first;
         // The commented code in the next few lines disallows a completely
         // skippable phone, as this would cause to stop working some mechanisms
@@ -280,9 +280,9 @@ void HmmTopology::Check() {
               "stop the SplitToPhones function from identifying the last state "
               "of a phone.";
         if (dst_state < 0 || dst_state >= num_states)
-          KALDI_ERR << "HmmTopology::Check(), invalid dest state " << (dst_state);
+          KALDI_ERR << "Topology::Check(), invalid dest state " << (dst_state);
         if (seen_transition.count(dst_state) != 0)
-          KALDI_ERR << "HmmTopology::Check(), duplicate transition found.";
+          KALDI_ERR << "Topology::Check(), duplicate transition found.";
         if (dst_state == k) {  // self_loop...
           KALDI_ASSERT(entries_[i][j].self_loop_pdf_class != kNoPdf &&
                        "Nonemitting states cannot have self-loops.");
@@ -302,17 +302,17 @@ void HmmTopology::Check() {
     // make sure all but start state have input transitions.
     for (int32 j = 1; j < num_states; j++)
       if (!has_trans_in[j])
-        KALDI_ERR << "HmmTopology::Check, state "<<(j)<<" has no input transitions.";
+        KALDI_ERR << "Topology::Check, state "<<(j)<<" has no input transitions.";
     SortAndUniq(&seen_pdf_classes);
     if (seen_pdf_classes.front() != 0 ||
         seen_pdf_classes.back() != static_cast<int32>(seen_pdf_classes.size()) - 1) {
-      KALDI_ERR << "HmmTopology::Check(), pdf_classes are expected to be "
+      KALDI_ERR << "Topology::Check(), pdf_classes are expected to be "
           "contiguous and start from zero.";
     }
   }
 }
 
-bool HmmTopology::IsHmm() const {
+bool Topology::IsHmm() const {
   const std::vector<int32> &phones = GetPhones();
   KALDI_ASSERT(!phones.empty());
   for (size_t i = 0; i < phones.size(); i++) {
@@ -328,14 +328,14 @@ bool HmmTopology::IsHmm() const {
   return true;
 }
 
-const HmmTopology::TopologyEntry& HmmTopology::TopologyForPhone(int32 phone) const {  // Will throw if phone not covered.
+const Topology::TopologyEntry& Topology::TopologyForPhone(int32 phone) const {  // Will throw if phone not covered.
   if (static_cast<size_t>(phone) >= phone2idx_.size() || phone2idx_[phone] == -1) {
     KALDI_ERR << "TopologyForPhone(), phone "<<(phone)<<" not covered.";
   }
   return entries_[phone2idx_[phone]];
 }
 
-int32 HmmTopology::NumPdfClasses(int32 phone) const {
+int32 Topology::NumPdfClasses(int32 phone) const {
   // will throw if phone not covered.
   const TopologyEntry &entry = TopologyForPhone(phone);
   int32 max_pdf_class = 0;
@@ -346,7 +346,7 @@ int32 HmmTopology::NumPdfClasses(int32 phone) const {
   return max_pdf_class+1;
 }
 
-int32 HmmTopology::MinLength(int32 phone) const {
+int32 Topology::MinLength(int32 phone) const {
   const TopologyEntry &entry = TopologyForPhone(phone);
   // min_length[state] gives the minimum length for sequences up to and
   // including that state.
diff --git a/src/hmm/topology.h b/src/hmm/topology.h
new file mode 100644
index 00000000000..eae0640af08
--- /dev/null
+++ b/src/hmm/topology.h
@@ -0,0 +1,138 @@
+// hmm/topology.h
+
+// Copyright 2009-2011  Microsoft Corporation
+//                2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_HMM_TOPOLOGY_H_
+#define KALDI_HMM_TOPOLOGY_H_
+
+#include <fst/fstlib.h>
+#include "base/kaldi-common.h"
+
+
+namespace kaldi {
+
+
+/// \addtogroup hmm_group
+/// @{
+
+/*
+  The text at the end of this comment would be the text form for the "normal"
+  3-state HMM topology ("Bakis model"), with the typical reordering that we do
+  to improve the compactness of the compiled FSTs.  The format is the OpenFst acceptor format.
+  The fields are, for transitions,
+     <from-state> <to-state> <pdf-class> <transition-cost>
+  and, for final states,
+     <state> <final-cost>
+
+  The <transition-cost> may be interpreted as a negative log probability.
+  We normally set the costs so that the probabilities leaving each state
+  (including the final-prob) sum to one, in order to keep the fully compiled
+  (HCLG) graph fairly stochastic (meaning: sum-to-one, like an HMM).
+
+  The integers on the arcs, which we call 'pdf-classes', define which
+  arcs share the same "pdf" and which ones are distinct.
+
+  Preconditions on topology:
+     - pdf-classes (3rd field on arcs) must
+       form a contiguous list of numbers starting from 1, although
+       different arcs with the same pdf-class are allowed.  (We avoid 0
+       because it is "special" in OpenFst: it is used for epsilon.)
+     - The start state must be state 0 and there must be no
+       transitions entering it except (possibly) a self-loop (although
+       a self-loop on state 0 is not advised for decoding-graph-size
+       reasons)
+     - The start state must not be final.
+
+
+ <Topology>
+ <TopologyEntry>
+ <ForPhones> 1 2 3 4 5 6 7 8 </ForPhones>
+ 0  1  1  0.0
+ 1  1  1  0.693
+ 1  2  2  0.693
+ 2  2  2  0.693
+ 2  3  3  0.693
+ 3  3  3  0.693
+ 3  0.693
+ </TopologyEntry>
+ </Topology>
+*/
+
+
+/// A class for storing topology information for phones.  See \ref hmm for context.
+/// This object is sometimes accessed in a file by itself, but more often
+/// as a class member of the Transitions class (this is for convenience, to reduce
+/// the number of files programs have to access).
+
+class Topology {
+ public:
+  /// Read/Write (de)serialize this object from/to a stream; 'binary' selects the mode.
+  void Read(std::istream &is, bool binary);
+  void Write(std::ostream &os, bool binary) const;
+
+  /// Checks that the object is valid, and throws an exception otherwise.
+  void Check();
+
+  /// Returns the topology entry (an FST) for this phone;
+  /// will throw an exception if the phone is not covered by the topology.
+  const fst::StdFst &TopologyForPhone(int32 phone) const;
+
+  /// Returns the number of \ref pdf_class "pdf-classes" for this phone;
+  /// throws exception if phone not covered by this topology.
+  int32 NumPdfClasses(int32 phone) const;
+
+  /// Returns a reference to a sorted, unique list of phones covered by
+  /// the topology (these phones will be positive integers, and usually
+  /// contiguous and starting from one but the toolkit doesn't assume
+  /// they are contiguous).
+  const std::vector<int32> &GetPhones() const { return phones_; }
+
+  /// Outputs a vector of int32, indexed by phone, that gives the
+  /// number of \ref pdf_class "pdf-classes" for the phones; this is
+  /// used by tree-building code such as BuildTree().
+  void GetPhoneToNumPdfClasses(std::vector<int32> *phone2num_pdf_classes) const;
+
+  /// Returns the minimum number of arcs/frames it takes to traverse this model
+  /// for this phone: e.g. 3 for the normal HMM topology.
+  int32 MinLength(int32 phone) const;
+
+  Topology() {}
+
+  bool operator == (const Topology &other) const;
+
+  // TODO: implement operator ==; we probably need Equal() on FSTs.  The
+  // pre-FST version of this class compared the members directly:
+  //   return phones_ == other.phones_ && phone2idx_ == other.phone2idx_
+  //       && entries_ == other.entries_;
+
+  // Allow default assignment operator and copy constructor.
+ private:
+  std::vector<int32> phones_;  // list of all phones we have topology for.  Sorted, uniq.  no epsilon (zero) phone.
+  std::vector<int32> phone2idx_;  // map from phones to indexes into the entries vector (or -1 for not present).
+  std::vector<fst::StdVectorFst> entries_;  // concrete FSTs (fst::StdFst is abstract, so it cannot be a vector element).
+};
+
+
+/// @} end "addtogroup hmm_group"
+
+
+} // end namespace kaldi
+
+
+#endif
diff --git a/src/hmm/transition-model.h b/src/hmm/transition-model.h
deleted file mode 100644
index e453c24f9cb..00000000000
--- a/src/hmm/transition-model.h
+++ /dev/null
@@ -1,371 +0,0 @@
-// hmm/transition-model.h
-
-// Copyright 2009-2012  Microsoft Corporation
-//                      Johns Hopkins University (author: Guoguo Chen)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_HMM_TRANSITION_MODEL_H_
-#define KALDI_HMM_TRANSITION_MODEL_H_
-
-#include "base/kaldi-common.h"
-#include "util/const-integer-set.h"
-#include "fst/fst-decl.h" // forward declarations.
-#include "hmm/hmm-topology.h"
-#include "itf/options-itf.h"
-#include "itf/context-dep-itf.h"
-#include "matrix/kaldi-vector.h"
-
-namespace kaldi {
-
-/// \addtogroup hmm_group
-/// @{
-
-// The class TransitionModel is a repository for the transition probabilities.
-// It also handles certain integer mappings.
-// The basic model is as follows.  Each phone has a HMM topology defined in
-// hmm-topology.h.  Each HMM-state of each of these phones has a number of
-// transitions (and final-probs) out of it.  Each HMM-state defined in the
-// HmmTopology class has an associated "pdf_class".  This gets replaced with
-// an actual pdf-id via the tree.  The transition model associates the
-// transition probs with the (phone, HMM-state, pdf-id).  We associate with
-// each such triple a transition-state.  Each
-// transition-state has a number of associated probabilities to estimate;
-// this depends on the number of transitions/final-probs in the topology for
-// that (phone, HMM-state).  Each probability has an associated transition-index.
-// We associate with each (transition-state, transition-index) a unique transition-id.
-// Each individual probability estimated by the transition-model is asociated with a
-// transition-id.
-//
-// List of the various types of quantity referred to here and what they mean:
-//           phone:  a phone index (1, 2, 3 ...)
-//       HMM-state:  a number (0, 1, 2...) that indexes TopologyEntry (see hmm-topology.h)
-//          pdf-id:  a number output by the Compute function of ContextDependency (it
-//                   indexes pdf's, either forward or self-loop).  Zero-based.
-// transition-state:  the states for which we estimate transition probabilities for transitions
-//                    out of them.  In some topologies, will map one-to-one with pdf-ids.
-//                    One-based, since it appears on FSTs.
-// transition-index:  identifier of a transition (or final-prob) in the HMM.  Indexes the
-//                    "transitions" vector in HmmTopology::HmmState.  [if it is out of range,
-//                    equal to transitions.size(), it refers to the final-prob.]
-//                    Zero-based.
-//   transition-id:   identifier of a unique parameter of the TransitionModel.
-//                    Associated with a (transition-state, transition-index) pair.
-//                    One-based, since it appears on FSTs.
-//
-// List of the possible mappings TransitionModel can do:
-//   (phone, HMM-state, forward-pdf-id, self-loop-pdf-id) -> transition-state
-//                   (transition-state, transition-index) -> transition-id
-//  Reverse mappings:
-//                        transition-id -> transition-state
-//                        transition-id -> transition-index
-//                     transition-state -> phone
-//                     transition-state -> HMM-state
-//                     transition-state -> forward-pdf-id
-//                     transition-state -> self-loop-pdf-id
-//
-// The main things the TransitionModel object can do are:
-//    Get initialized (need ContextDependency and HmmTopology objects).
-//    Read/write.
-//    Update [given a vector of counts indexed by transition-id].
-//    Do the various integer mappings mentioned above.
-//    Get the probability (or log-probability) associated with a particular transition-id.
-
-
-// Note: this was previously called TransitionUpdateConfig.
-struct MleTransitionUpdateConfig {
-  BaseFloat floor;
-  BaseFloat mincount;
-  bool share_for_pdfs; // If true, share all transition parameters that have the same pdf.
-  MleTransitionUpdateConfig(BaseFloat floor = 0.01,
-                            BaseFloat mincount = 5.0,
-                            bool share_for_pdfs = false):
-      floor(floor), mincount(mincount), share_for_pdfs(share_for_pdfs) {}
-
-  void Register (OptionsItf *opts) {
-    opts->Register("transition-floor", &floor,
-                   "Floor for transition probabilities");
-    opts->Register("transition-min-count", &mincount,
-                   "Minimum count required to update transitions from a state");
-    opts->Register("share-for-pdfs", &share_for_pdfs,
-                   "If true, share all transition parameters where the states "
-                   "have the same pdf.");
-  }
-};
-
-struct MapTransitionUpdateConfig {
-  BaseFloat tau;
-  bool share_for_pdfs; // If true, share all transition parameters that have the same pdf.
-  MapTransitionUpdateConfig(): tau(5.0), share_for_pdfs(false) { }
-
-  void Register (OptionsItf *opts) {
-    opts->Register("transition-tau", &tau, "Tau value for MAP estimation of transition "
-                   "probabilities.");
-    opts->Register("share-for-pdfs", &share_for_pdfs,
-                   "If true, share all transition parameters where the states "
-                   "have the same pdf.");
-  }
-};
-
-class TransitionModel {
-
- public:
-  /// Initialize the object [e.g. at the start of training].
-  /// The class keeps a copy of the HmmTopology object, but not
-  /// the ContextDependency object.
-  TransitionModel(const ContextDependencyInterface &ctx_dep,
-                  const HmmTopology &hmm_topo);
-
-
-  /// Constructor that takes no arguments: typically used prior to calling Read.
-  TransitionModel(): num_pdfs_(0) { }
-
-  void Read(std::istream &is, bool binary);  // note, no symbol table: topo object always read/written w/o symbols.
-  void Write(std::ostream &os, bool binary) const;
-
-
-  /// return reference to HMM-topology object.
-  const HmmTopology &GetTopo() const { return topo_; }
-
-  /// \name Integer mapping functions
-  /// @{
-
-  int32 TupleToTransitionState(int32 phone, int32 hmm_state, int32 pdf, int32 self_loop_pdf) const;
-  int32 PairToTransitionId(int32 trans_state, int32 trans_index) const;
-  int32 TransitionIdToTransitionState(int32 trans_id) const;
-  int32 TransitionIdToTransitionIndex(int32 trans_id) const;
-  int32 TransitionStateToPhone(int32 trans_state) const;
-  int32 TransitionStateToHmmState(int32 trans_state) const;
-  int32 TransitionStateToForwardPdfClass(int32 trans_state) const;
-  int32 TransitionStateToSelfLoopPdfClass(int32 trans_state) const;
-  int32 TransitionStateToForwardPdf(int32 trans_state) const;
-  int32 TransitionStateToSelfLoopPdf(int32 trans_state) const;
-  int32 SelfLoopOf(int32 trans_state) const;  // returns the self-loop transition-id, or zero if
-  // this state doesn't have a self-loop.
-
-  inline int32 TransitionIdToPdf(int32 trans_id) const;
-  // TransitionIdToPdfFast is as TransitionIdToPdf but skips an assertion
-  // (unless we're in paranoid mode).
-  inline int32 TransitionIdToPdfFast(int32 trans_id) const;
-
-  int32 TransitionIdToPhone(int32 trans_id) const;
-  int32 TransitionIdToPdfClass(int32 trans_id) const;
-  int32 TransitionIdToHmmState(int32 trans_id) const;
-
-  /// @}
-
-  bool IsFinal(int32 trans_id) const;  // returns true if this trans_id goes to the final state
-  // (which is bound to be nonemitting).
-  bool IsSelfLoop(int32 trans_id) const;  // return true if this trans_id corresponds to a self-loop.
-
-  /// Returns the total number of transition-ids (note, these are one-based).
-  inline int32 NumTransitionIds() const { return id2state_.size()-1; }
-
-  /// Returns the number of transition-indices for a particular transition-state.
-  /// Note: "Indices" is the plural of "index".   Index is not the same as "id",
-  /// here.  A transition-index is a zero-based offset into the transitions
-  /// out of a particular transition state.
-  int32 NumTransitionIndices(int32 trans_state) const;
-
-  /// Returns the total number of transition-states (note, these are one-based).
-  int32 NumTransitionStates() const { return tuples_.size(); }
-
-  // NumPdfs() actually returns the highest-numbered pdf we ever saw, plus one.
-  // In normal cases this should equal the number of pdfs in the system, but if you
-  // initialized this object with fewer than all the phones, and it happens that
-  // an unseen phone has the highest-numbered pdf, this might be different.
-  int32 NumPdfs() const { return num_pdfs_; }
-
-  // This loops over the tuples and finds the highest phone index present. If
-  // the FST symbol table for the phones is created in the expected way, i.e.:
-  // starting from 1 (<eps> is 0) and numbered contiguously till the last phone,
-  // this will be the total number of phones.
-  int32 NumPhones() const;
-
-  /// Returns a sorted, unique list of phones.
-  const std::vector<int32> &GetPhones() const { return topo_.GetPhones(); }
-
-  // Transition-parameter-getting functions:
-  BaseFloat GetTransitionProb(int32 trans_id) const;
-  BaseFloat GetTransitionLogProb(int32 trans_id) const;
-
-  // The following functions are more specialized functions for getting
-  // transition probabilities, that are provided for convenience.
-
-  /// Returns the log-probability of a particular non-self-loop transition
-  /// after subtracting the probability mass of the self-loop and renormalizing;
-  /// will crash if called on a self-loop.  Specifically:
-  /// for non-self-loops it returns the log of (that prob divided by (1 minus
-  /// self-loop-prob-for-that-state)).
-  BaseFloat GetTransitionLogProbIgnoringSelfLoops(int32 trans_id) const;
-
-  /// Returns the log-prob of the non-self-loop probability
-  /// mass for this transition state. (you can get the self-loop prob, if a self-loop
-  /// exists, by calling GetTransitionLogProb(SelfLoopOf(trans_state)).
-  BaseFloat GetNonSelfLoopLogProb(int32 trans_state) const;
-
-  /// Does Maximum Likelihood estimation.  The stats are counts/weights, indexed
-  /// by transition-id.  This was previously called Update().
-  void MleUpdate(const Vector<double> &stats,
-                 const MleTransitionUpdateConfig &cfg,
-                 BaseFloat *objf_impr_out,
-                 BaseFloat *count_out);
-
-  /// Does Maximum A Posteriori (MAP) estimation.  The stats are counts/weights,
-  /// indexed by transition-id.
-  void MapUpdate(const Vector<double> &stats,
-                 const MapTransitionUpdateConfig &cfg,
-                 BaseFloat *objf_impr_out,
-                 BaseFloat *count_out);
-
-  /// Print will print the transition model in a human-readable way, for purposes of human
-  /// inspection.  The "occs" are optional (they are indexed by pdf-id).
-  void Print(std::ostream &os,
-             const std::vector<std::string> &phone_names,
-             const Vector<double> *occs = NULL);
-
-
-  void InitStats(Vector<double> *stats) const { stats->Resize(NumTransitionIds()+1); }
-
-  void Accumulate(BaseFloat prob, int32 trans_id, Vector<double> *stats) const {
-    KALDI_ASSERT(trans_id <= NumTransitionIds());
-    (*stats)(trans_id) += prob;
-    // This is trivial and doesn't require class members, but leaves us more open
-    // to design changes than doing it manually.
-  }
-
-  /// returns true if all the integer class members are identical (but does not
-  /// compare the transition probabilities.
-  bool Compatible(const TransitionModel &other) const;
-
- private:
-  void MleUpdateShared(const Vector<double> &stats,
-                       const MleTransitionUpdateConfig &cfg,
-                       BaseFloat *objf_impr_out, BaseFloat *count_out);
-  void MapUpdateShared(const Vector<double> &stats,
-                       const MapTransitionUpdateConfig &cfg,
-                       BaseFloat *objf_impr_out, BaseFloat *count_out);
-  void ComputeTuples(const ContextDependencyInterface &ctx_dep);  // called from constructor.  initializes tuples_.
-  void ComputeTuplesIsHmm(const ContextDependencyInterface &ctx_dep);
-  void ComputeTuplesNotHmm(const ContextDependencyInterface &ctx_dep);
-  void ComputeDerived();  // called from constructor and Read function: computes state2id_ and id2state_.
-  void ComputeDerivedOfProbs();  // computes quantities derived from log-probs (currently just
-  // non_self_loop_log_probs_; called whenever log-probs change.
-  void InitializeProbs();  // called from constructor.
-  void Check() const;
-  bool IsHmm() const;
-
-  struct Tuple {
-    int32 phone;
-    int32 hmm_state;
-    int32 forward_pdf;
-    int32 self_loop_pdf;
-    Tuple() { }
-    Tuple(int32 phone, int32 hmm_state, int32 forward_pdf, int32 self_loop_pdf):
-      phone(phone), hmm_state(hmm_state), forward_pdf(forward_pdf), self_loop_pdf(self_loop_pdf) { }
-    bool operator < (const Tuple &other) const {
-      if (phone < other.phone) return true;
-      else if (phone > other.phone) return false;
-      else if (hmm_state < other.hmm_state) return true;
-      else if (hmm_state > other.hmm_state) return false;
-      else if (forward_pdf < other.forward_pdf) return true;
-      else if (forward_pdf > other.forward_pdf) return false;
-      else return (self_loop_pdf < other.self_loop_pdf);
-    }
-    bool operator == (const Tuple &other) const {
-      return (phone == other.phone && hmm_state == other.hmm_state
-              && forward_pdf == other.forward_pdf && self_loop_pdf == other.self_loop_pdf);
-    }
-  };
-
-  HmmTopology topo_;
-
-  /// Tuples indexed by transition state minus one;
-  /// the tuples are in sorted order which allows us to do the reverse mapping from
-  /// tuple to transition state
-  std::vector<Tuple> tuples_;
-
-  /// Gives the first transition_id of each transition-state; indexed by
-  /// the transition-state.  Array indexed 1..num-transition-states+1 (the last one
-  /// is needed so we can know the num-transitions of the last transition-state.
-  std::vector<int32> state2id_;
-
-  /// For each transition-id, the corresponding transition
-  /// state (indexed by transition-id).
-  std::vector<int32> id2state_;
-
-  std::vector<int32> id2pdf_id_;
-
-  /// For each transition-id, the corresponding log-prob.  Indexed by transition-id.
-  Vector<BaseFloat> log_probs_;
-
-  /// For each transition-state, the log of (1 - self-loop-prob).  Indexed by
-  /// transition-state.
-  Vector<BaseFloat> non_self_loop_log_probs_;
-
-  /// This is actually one plus the highest-numbered pdf we ever got back from the
-  /// tree (but the tree numbers pdfs contiguously from zero so this is the number
-  /// of pdfs).
-  int32 num_pdfs_;
-
-  KALDI_DISALLOW_COPY_AND_ASSIGN(TransitionModel);
-};
-
-inline int32 TransitionModel::TransitionIdToPdf(int32 trans_id) const {
-  KALDI_ASSERT(
-      static_cast<size_t>(trans_id) < id2pdf_id_.size() &&
-      "Likely graph/model mismatch (graph built from wrong model?)");
-  return id2pdf_id_[trans_id];
-}
-
-inline int32 TransitionModel::TransitionIdToPdfFast(int32 trans_id) const {
-  // Note: it's a little dangerous to assert this only in paranoid mode.
-  // However, this function is called in the inner loop of decoders and
-  // the assertion likely takes a significant amount of time.  We make
-  // sure that past the end of thd id2pdf_id_ array there are big
-  // numbers, which will make the calling code more likely to segfault
-  // (rather than silently die) if this is called for out-of-range values.
-  KALDI_PARANOID_ASSERT(
-      static_cast<size_t>(trans_id) < id2pdf_id_.size() &&
-      "Likely graph/model mismatch (graph built from wrong model?)");
-  return id2pdf_id_[trans_id];
-}
-
-/// Works out which pdfs might correspond to the given phones.  Will return true
-/// if these pdfs correspond *just* to these phones, false if these pdfs are also
-/// used by other phones.
-/// @param trans_model [in] Transition-model used to work out this information
-/// @param phones [in] A sorted, uniq vector that represents a set of phones
-/// @param pdfs [out] Will be set to a sorted, uniq list of pdf-ids that correspond
-///                   to one of this set of phones.
-/// @return  Returns true if all of the pdfs output to "pdfs" correspond to phones from
-///          just this set (false if they may be shared with phones outside this set).
-bool GetPdfsForPhones(const TransitionModel &trans_model,
-                      const std::vector<int32> &phones,
-                      std::vector<int32> *pdfs);
-
-/// Works out which phones might correspond to the given pdfs. Similar to the
-/// above GetPdfsForPhones(, ,)
-bool GetPhonesForPdfs(const TransitionModel &trans_model,
-                      const std::vector<int32> &pdfs,
-                      std::vector<int32> *phones);
-/// @}
-
-
-} // end namespace kaldi
-
-
-#endif
diff --git a/src/hmm/transition-model-test.cc b/src/hmm/transitions-test.cc
similarity index 87%
rename from src/hmm/transition-model-test.cc
rename to src/hmm/transitions-test.cc
index 841c714efb1..9b9d7099801 100644
--- a/src/hmm/transition-model-test.cc
+++ b/src/hmm/transitions-test.cc
@@ -17,22 +17,22 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-test-utils.h"
 
 namespace kaldi {
 
 
-void TestTransitionModel() {
+void TestTransitions() {
 
-  TransitionModel *trans_model = GenRandTransitionModel(NULL);
+  Transitions *trans_model = GenRandTransitionModel(NULL);
 
   bool binary = (rand() % 2 == 0);
 
   std::ostringstream os;
   trans_model->Write(os, binary);
 
-  TransitionModel trans_model2;
+  Transitions trans_model2;
   std::istringstream is2(os.str());
   trans_model2.Read(is2, binary);
 
@@ -50,7 +50,7 @@ void TestTransitionModel() {
 
 int main() {
   for (int i = 0; i < 2; i++)
-    kaldi::TestTransitionModel();
+    kaldi::TestTransitions();
   KALDI_LOG << "Test OK.\n";
 }
 
diff --git a/src/hmm/transition-model.cc b/src/hmm/transitions.cc
similarity index 88%
rename from src/hmm/transition-model.cc
rename to src/hmm/transitions.cc
index 5ecb7776f00..4198ea9cd45 100644
--- a/src/hmm/transition-model.cc
+++ b/src/hmm/transitions.cc
@@ -1,7 +1,9 @@
-// hmm/transition-model.cc
+// hmm/transitions.cc
 
-// Copyright 2009-2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
+// Copyright 2009-2012  Microsoft Corporation
 //        Johns Hopkins University (author: Guoguo Chen)
+//        2012-2019 Johns Hopkins University (Author: Daniel Povey)
+
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -19,12 +21,17 @@
 // limitations under the License.
 
 #include <vector>
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "tree/context-dep.h"
 
 namespace kaldi {
 
-void TransitionModel::ComputeTuples(const ContextDependencyInterface &ctx_dep) {
+bool Transitions::operator == (const Transitions &other) {
+  return topo_ == other.topo_ && info_ == other.info_ &&
+      num_pdfs_ == other.num_pdfs_;
+}
+
+void Transitions::ComputeTuples(const ContextDependencyInterface &ctx_dep) {
   if (IsHmm())
     ComputeTuplesIsHmm(ctx_dep);
   else
@@ -35,7 +42,7 @@ void TransitionModel::ComputeTuples(const ContextDependencyInterface &ctx_dep) {
   // this sorting defines the transition-ids.
 }
 
-void TransitionModel::ComputeTuplesIsHmm(const ContextDependencyInterface &ctx_dep) {
+void Transitions::ComputeTuplesIsHmm(const ContextDependencyInterface &ctx_dep) {
   const std::vector<int32> &phones = topo_.GetPhones();
   KALDI_ASSERT(!phones.empty());
 
@@ -54,7 +61,7 @@ void TransitionModel::ComputeTuplesIsHmm(const ContextDependencyInterface &ctx_d
   // can correspond to.
   for (size_t i = 0; i < phones.size(); i++) {  // setting up to_hmm_state_list.
     int32 phone = phones[i];
-    const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone);
+    const Topology::TopologyEntry &entry = topo_.TopologyForPhone(phone);
     for (int32 j = 0; j < static_cast<int32>(entry.size()); j++) {  // for each state...
       int32 pdf_class = entry[j].forward_pdf_class;
       if (pdf_class != kNoPdf) {
@@ -79,7 +86,7 @@ void TransitionModel::ComputeTuplesIsHmm(const ContextDependencyInterface &ctx_d
   }
 }
 
-void TransitionModel::ComputeTuplesNotHmm(const ContextDependencyInterface &ctx_dep) {
+void Transitions::ComputeTuplesNotHmm(const ContextDependencyInterface &ctx_dep) {
   const std::vector<int32> &phones = topo_.GetPhones();
   KALDI_ASSERT(!phones.empty());
 
@@ -94,7 +101,7 @@ void TransitionModel::ComputeTuplesNotHmm(const ContextDependencyInterface &ctx_
   pdf_class_pairs.resize(1 + *std::max_element(phones.begin(), phones.end()));
   for (size_t i = 0; i < phones.size(); i++) {
     int32 phone = phones[i];
-    const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone);
+    const Topology::TopologyEntry &entry = topo_.TopologyForPhone(phone);
     for (int32 j = 0; j < static_cast<int32>(entry.size()); j++) {  // for each state...
       int32 forward_pdf_class = entry[j].forward_pdf_class, self_loop_pdf_class = entry[j].self_loop_pdf_class;
       if (forward_pdf_class != kNoPdf)
@@ -110,7 +117,7 @@ void TransitionModel::ComputeTuplesNotHmm(const ContextDependencyInterface &ctx_
   // can correspond to.
   for (size_t i = 0; i < phones.size(); i++) {  // setting up to_hmm_state_list.
     int32 phone = phones[i];
-    const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone);
+    const Topology::TopologyEntry &entry = topo_.TopologyForPhone(phone);
     std::map<std::pair<int32, int32>, std::vector<int32> > phone_to_hmm_state_list;
     for (int32 j = 0; j < static_cast<int32>(entry.size()); j++) {  // for each state...
       int32 forward_pdf_class = entry[j].forward_pdf_class, self_loop_pdf_class = entry[j].self_loop_pdf_class;
@@ -141,7 +148,7 @@ void TransitionModel::ComputeTuplesNotHmm(const ContextDependencyInterface &ctx_
   }
 }
 
-void TransitionModel::ComputeDerived() {
+void Transitions::ComputeDerived() {
   state2id_.resize(tuples_.size()+2);  // indexed by transition-state, which
   // is one based, but also an entry for one past end of list.
 
@@ -158,7 +165,7 @@ void TransitionModel::ComputeDerived() {
           self_loop_pdf = tuples_[tstate-1].self_loop_pdf;
       num_pdfs_ = std::max(num_pdfs_, 1 + forward_pdf);
       num_pdfs_ = std::max(num_pdfs_, 1 + self_loop_pdf);
-      const HmmTopology::HmmState &state = topo_.TopologyForPhone(phone)[hmm_state];
+      const Topology::HmmState &state = topo_.TopologyForPhone(phone)[hmm_state];
       int32 my_num_ids = static_cast<int32>(state.transitions.size());
       cur_transition_id += my_num_ids;  // # trans out of this state.
     }
@@ -187,26 +194,26 @@ void TransitionModel::ComputeDerived() {
   id2pdf_id_.resize(cur_transition_id);
 }
 
-void TransitionModel::InitializeProbs() {
+void Transitions::InitializeProbs() {
   log_probs_.Resize(NumTransitionIds()+1);  // one-based array, zeroth element empty.
   for (int32 trans_id = 1; trans_id <= NumTransitionIds(); trans_id++) {
     int32 trans_state = id2state_[trans_id];
     int32 trans_index = trans_id - state2id_[trans_state];
     const Tuple &tuple = tuples_[trans_state-1];
-    const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(tuple.phone);
+    const Topology::TopologyEntry &entry = topo_.TopologyForPhone(tuple.phone);
     KALDI_ASSERT(static_cast<size_t>(tuple.hmm_state) < entry.size());
     BaseFloat prob = entry[tuple.hmm_state].transitions[trans_index].second;
     if (prob <= 0.0)
-      KALDI_ERR << "TransitionModel::InitializeProbs, zero "
+      KALDI_ERR << "Transitions::InitializeProbs, zero "
           "probability [should remove that entry in the topology]";
     if (prob > 1.0)
-      KALDI_WARN << "TransitionModel::InitializeProbs, prob greater than one.";
+      KALDI_WARN << "Transitions::InitializeProbs, prob greater than one.";
     log_probs_(trans_id) = Log(prob);
   }
   ComputeDerivedOfProbs();
 }
 
-void TransitionModel::Check() const {
+void Transitions::Check() const {
   KALDI_ASSERT(NumTransitionIds() != 0 && NumTransitionStates() != 0);
   {
     int32 sum = 0;
@@ -228,12 +235,12 @@ void TransitionModel::Check() const {
   }
 }
 
-bool TransitionModel::IsHmm() const {
+bool Transitions::IsHmm() const {
   const std::vector<int32> &phones = topo_.GetPhones();
   KALDI_ASSERT(!phones.empty());
   for (size_t i = 0; i < phones.size(); i++) {
     int32 phone = phones[i];
-    const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone);
+    const Topology::TopologyEntry &entry = topo_.TopologyForPhone(phone);
     for (int32 j = 0; j < static_cast<int32>(entry.size()); j++) {  // for each state...
       if (entry[j].forward_pdf_class != entry[j].self_loop_pdf_class)
         return false;
@@ -242,8 +249,8 @@ bool TransitionModel::IsHmm() const {
   return true;
 }
 
-TransitionModel::TransitionModel(const ContextDependencyInterface &ctx_dep,
-                                 const HmmTopology &hmm_topo): topo_(hmm_topo) {
+Transitions::Transitions(const ContextDependencyInterface &ctx_dep,
+                         const Topology &hmm_topo): topo_(hmm_topo) {
   // First thing is to get all possible tuples.
   ComputeTuples(ctx_dep);
   ComputeDerived();
@@ -251,7 +258,7 @@ TransitionModel::TransitionModel(const ContextDependencyInterface &ctx_dep,
   Check();
 }
 
-int32 TransitionModel::TupleToTransitionState(int32 phone, int32 hmm_state, int32 pdf, int32 self_loop_pdf) const {
+int32 Transitions::TupleToTransitionState(int32 phone, int32 hmm_state, int32 pdf, int32 self_loop_pdf) const {
   Tuple tuple(phone, hmm_state, pdf, self_loop_pdf);
   // Note: if this ever gets too expensive, which is unlikely, we can refactor
   // this code to sort first on pdf, and then index on pdf, so those
@@ -259,7 +266,7 @@ int32 TransitionModel::TupleToTransitionState(int32 phone, int32 hmm_state, int3
   std::vector<Tuple>::const_iterator iter =
       std::lower_bound(tuples_.begin(), tuples_.end(), tuple);
   if (iter == tuples_.end() || !(*iter == tuple)) {
-    KALDI_ERR << "TransitionModel::TupleToTransitionState, tuple not found."
+    KALDI_ERR << "Transitions::TupleToTransitionState, tuple not found."
               << " (incompatible tree and model?)";
   }
   // tuples_ is indexed by transition_state-1, so add one.
@@ -267,68 +274,68 @@ int32 TransitionModel::TupleToTransitionState(int32 phone, int32 hmm_state, int3
 }
 
 
-int32 TransitionModel::NumTransitionIndices(int32 trans_state) const {
+int32 Transitions::NumTransitionIndices(int32 trans_state) const {
   KALDI_ASSERT(static_cast<size_t>(trans_state) <= tuples_.size());
   return static_cast<int32>(state2id_[trans_state+1]-state2id_[trans_state]);
 }
 
-int32 TransitionModel::TransitionIdToTransitionState(int32 trans_id) const {
+int32 Transitions::TransitionIdToTransitionState(int32 trans_id) const {
   KALDI_ASSERT(trans_id != 0 &&  static_cast<size_t>(trans_id) < id2state_.size());
   return id2state_[trans_id];
 }
 
-int32 TransitionModel::TransitionIdToTransitionIndex(int32 trans_id) const {
+int32 Transitions::TransitionIdToTransitionIndex(int32 trans_id) const {
   KALDI_ASSERT(trans_id != 0 && static_cast<size_t>(trans_id) < id2state_.size());
   return trans_id - state2id_[id2state_[trans_id]];
 }
 
-int32 TransitionModel::TransitionStateToPhone(int32 trans_state) const {
+int32 Transitions::TransitionStateToPhone(int32 trans_state) const {
   KALDI_ASSERT(static_cast<size_t>(trans_state) <= tuples_.size());
   return tuples_[trans_state-1].phone;
 }
 
-int32 TransitionModel::TransitionStateToForwardPdf(int32 trans_state) const {
+int32 Transitions::TransitionStateToForwardPdf(int32 trans_state) const {
   KALDI_ASSERT(static_cast<size_t>(trans_state) <= tuples_.size());
   return tuples_[trans_state-1].forward_pdf;
 }
 
-int32 TransitionModel::TransitionStateToForwardPdfClass(
+int32 Transitions::TransitionStateToForwardPdfClass(
     int32 trans_state) const {
   KALDI_ASSERT(static_cast<size_t>(trans_state) <= tuples_.size());
   const Tuple &t = tuples_[trans_state-1];
-  const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(t.phone);
+  const Topology::TopologyEntry &entry = topo_.TopologyForPhone(t.phone);
   KALDI_ASSERT(static_cast<size_t>(t.hmm_state) < entry.size());
   return entry[t.hmm_state].forward_pdf_class;
 }
 
 
-int32 TransitionModel::TransitionStateToSelfLoopPdfClass(
+int32 Transitions::TransitionStateToSelfLoopPdfClass(
     int32 trans_state) const {
   KALDI_ASSERT(static_cast<size_t>(trans_state) <= tuples_.size());
   const Tuple &t = tuples_[trans_state-1];
-  const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(t.phone);
+  const Topology::TopologyEntry &entry = topo_.TopologyForPhone(t.phone);
   KALDI_ASSERT(static_cast<size_t>(t.hmm_state) < entry.size());
   return entry[t.hmm_state].self_loop_pdf_class;
 }
 
 
-int32 TransitionModel::TransitionStateToSelfLoopPdf(int32 trans_state) const {
+int32 Transitions::TransitionStateToSelfLoopPdf(int32 trans_state) const {
   KALDI_ASSERT(static_cast<size_t>(trans_state) <= tuples_.size());
   return tuples_[trans_state-1].self_loop_pdf;
 }
 
-int32 TransitionModel::TransitionStateToHmmState(int32 trans_state) const {
+int32 Transitions::TransitionStateToHmmState(int32 trans_state) const {
   KALDI_ASSERT(static_cast<size_t>(trans_state) <= tuples_.size());
   return tuples_[trans_state-1].hmm_state;
 }
 
-int32 TransitionModel::PairToTransitionId(int32 trans_state, int32 trans_index) const {
+int32 Transitions::PairToTransitionId(int32 trans_state, int32 trans_index) const {
   KALDI_ASSERT(static_cast<size_t>(trans_state) <= tuples_.size());
   KALDI_ASSERT(trans_index < state2id_[trans_state+1] - state2id_[trans_state]);
   return state2id_[trans_state] + trans_index;
 }
 
-int32 TransitionModel::NumPhones() const {
+int32 Transitions::NumPhones() const {
   int32 num_trans_state = tuples_.size();
   int32 max_phone_id = 0;
   for (int32 i = 0; i < num_trans_state; ++i) {
@@ -339,12 +346,12 @@ int32 TransitionModel::NumPhones() const {
 }
 
 
-bool TransitionModel::IsFinal(int32 trans_id) const {
+bool Transitions::IsFinal(int32 trans_id) const {
   KALDI_ASSERT(static_cast<size_t>(trans_id) < id2state_.size());
   int32 trans_state = id2state_[trans_id];
   int32 trans_index = trans_id - state2id_[trans_state];
   const Tuple &tuple = tuples_[trans_state-1];
-  const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(tuple.phone);
+  const Topology::TopologyEntry &entry = topo_.TopologyForPhone(tuple.phone);
   KALDI_ASSERT(static_cast<size_t>(tuple.hmm_state) < entry.size());
   KALDI_ASSERT(static_cast<size_t>(tuple.hmm_state) < entry.size());
   KALDI_ASSERT(static_cast<size_t>(trans_index) <
@@ -357,12 +364,12 @@ bool TransitionModel::IsFinal(int32 trans_id) const {
 
 
 
-int32 TransitionModel::SelfLoopOf(int32 trans_state) const {  // returns the self-loop transition-id,
+int32 Transitions::SelfLoopOf(int32 trans_state) const {  // returns the self-loop transition-id,
   KALDI_ASSERT(static_cast<size_t>(trans_state-1) < tuples_.size());
   const Tuple &tuple = tuples_[trans_state-1];
   // or zero if does not exist.
   int32 phone = tuple.phone, hmm_state = tuple.hmm_state;
-  const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone);
+  const Topology::TopologyEntry &entry = topo_.TopologyForPhone(phone);
   KALDI_ASSERT(static_cast<size_t>(hmm_state) < entry.size());
   for (int32 trans_index = 0;
       trans_index < static_cast<int32>(entry[hmm_state].transitions.size());
@@ -372,7 +379,7 @@ int32 TransitionModel::SelfLoopOf(int32 trans_state) const {  // returns the sel
   return 0;  // invalid transition id.
 }
 
-void TransitionModel::ComputeDerivedOfProbs() {
+void Transitions::ComputeDerivedOfProbs() {
   non_self_loop_log_probs_.Resize(NumTransitionStates()+1);  // this array indexed
   //  by transition-state with nothing in zeroth element.
   for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) {
@@ -391,8 +398,8 @@ void TransitionModel::ComputeDerivedOfProbs() {
   }
 }
 
-void TransitionModel::Read(std::istream &is, bool binary) {
-  ExpectToken(is, binary, "<TransitionModel>");
+void Transitions::Read(std::istream &is, bool binary) {
+  ExpectToken(is, binary, "<Transitions>");
   topo_.Read(is, binary);
   std::string token;
   ReadToken(is, binary, &token);
@@ -414,14 +421,14 @@ void TransitionModel::Read(std::istream &is, bool binary) {
   ExpectToken(is, binary, "<LogProbs>");
   log_probs_.Read(is, binary);
   ExpectToken(is, binary, "</LogProbs>");
-  ExpectToken(is, binary, "</TransitionModel>");
+  ExpectToken(is, binary, "</Transitions>");
   ComputeDerivedOfProbs();
   Check();
 }
 
-void TransitionModel::Write(std::ostream &os, bool binary) const {
+void Transitions::Write(std::ostream &os, bool binary) const {
   bool is_hmm = IsHmm();
-  WriteToken(os, binary, "<TransitionModel>");
+  WriteToken(os, binary, "<Transitions>");
   if (!binary) os << "\n";
   topo_.Write(os, binary);
   if (is_hmm)
@@ -448,31 +455,31 @@ void TransitionModel::Write(std::ostream &os, bool binary) const {
   log_probs_.Write(os, binary);
   WriteToken(os, binary, "</LogProbs>");
   if (!binary) os << "\n";
-  WriteToken(os, binary, "</TransitionModel>");
+  WriteToken(os, binary, "</Transitions>");
   if (!binary) os << "\n";
 }
 
-BaseFloat TransitionModel::GetTransitionProb(int32 trans_id) const {
+BaseFloat Transitions::GetTransitionProb(int32 trans_id) const {
   return Exp(log_probs_(trans_id));
 }
 
-BaseFloat TransitionModel::GetTransitionLogProb(int32 trans_id) const {
+BaseFloat Transitions::GetTransitionLogProb(int32 trans_id) const {
   return log_probs_(trans_id);
 }
 
-BaseFloat TransitionModel::GetNonSelfLoopLogProb(int32 trans_state) const {
+BaseFloat Transitions::GetNonSelfLoopLogProb(int32 trans_state) const {
   KALDI_ASSERT(trans_state != 0);
   return non_self_loop_log_probs_(trans_state);
 }
 
-BaseFloat TransitionModel::GetTransitionLogProbIgnoringSelfLoops(int32 trans_id) const {
+BaseFloat Transitions::GetTransitionLogProbIgnoringSelfLoops(int32 trans_id) const {
   KALDI_ASSERT(trans_id != 0);
   KALDI_PARANOID_ASSERT(!IsSelfLoop(trans_id));
   return log_probs_(trans_id) - GetNonSelfLoopLogProb(TransitionIdToTransitionState(trans_id));
 }
 
 // stats are counts/weights, indexed by transition-id.
-void TransitionModel::MleUpdate(const Vector<double> &stats,
+void Transitions::MleUpdate(const Vector<double> &stats,
                                 const MleTransitionUpdateConfig &cfg,
                                 BaseFloat *objf_impr_out,
                                 BaseFloat *count_out) {
@@ -525,7 +532,7 @@ void TransitionModel::MleUpdate(const Vector<double> &stats,
       }
     }
   }
-  KALDI_LOG << "TransitionModel::Update, objf change is "
+  KALDI_LOG << "Transitions::Update, objf change is "
             << (objf_impr_sum / count_sum) << " per frame over " << count_sum
             << " frames. ";
   KALDI_LOG <<  num_floored << " probabilities floored, " << num_skipped
@@ -538,7 +545,7 @@ void TransitionModel::MleUpdate(const Vector<double> &stats,
 
 
 // stats are counts/weights, indexed by transition-id.
-void TransitionModel::MapUpdate(const Vector<double> &stats,
+void Transitions::MapUpdate(const Vector<double> &stats,
                                 const MapTransitionUpdateConfig &cfg,
                                 BaseFloat *objf_impr_out,
                                 BaseFloat *count_out) {
@@ -596,7 +603,7 @@ void TransitionModel::MapUpdate(const Vector<double> &stats,
 /// This version of the Update() function is for if the user specifies
 /// --share-for-pdfs=true.  We share the transitions for all states that
 /// share the same pdf.
-void TransitionModel::MleUpdateShared(const Vector<double> &stats,
+void Transitions::MleUpdateShared(const Vector<double> &stats,
                                       const MleTransitionUpdateConfig &cfg,
                                       BaseFloat *objf_impr_out,
                                       BaseFloat *count_out) {
@@ -695,7 +702,7 @@ void TransitionModel::MleUpdateShared(const Vector<double> &stats,
 /// This version of the MapUpdate() function is for if the user specifies
 /// --share-for-pdfs=true.  We share the transitions for all states that
 /// share the same pdf.
-void TransitionModel::MapUpdateShared(const Vector<double> &stats,
+void Transitions::MapUpdateShared(const Vector<double> &stats,
                                       const MapTransitionUpdateConfig &cfg,
                                       BaseFloat *objf_impr_out,
                                       BaseFloat *count_out) {
@@ -782,18 +789,18 @@ void TransitionModel::MapUpdateShared(const Vector<double> &stats,
 }
 
 
-int32 TransitionModel::TransitionIdToPhone(int32 trans_id) const {
+int32 Transitions::TransitionIdToPhone(int32 trans_id) const {
   KALDI_ASSERT(trans_id != 0 && static_cast<size_t>(trans_id) < id2state_.size());
   int32 trans_state = id2state_[trans_id];
   return tuples_[trans_state-1].phone;
 }
 
-int32 TransitionModel::TransitionIdToPdfClass(int32 trans_id) const {
+int32 Transitions::TransitionIdToPdfClass(int32 trans_id) const {
   KALDI_ASSERT(trans_id != 0 && static_cast<size_t>(trans_id) < id2state_.size());
   int32 trans_state = id2state_[trans_id];
 
   const Tuple &t = tuples_[trans_state-1];
-  const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(t.phone);
+  const Topology::TopologyEntry &entry = topo_.TopologyForPhone(t.phone);
   KALDI_ASSERT(static_cast<size_t>(t.hmm_state) < entry.size());
   if (IsSelfLoop(trans_id))
     return entry[t.hmm_state].self_loop_pdf_class;
@@ -802,14 +809,14 @@ int32 TransitionModel::TransitionIdToPdfClass(int32 trans_id) const {
 }
 
 
-int32 TransitionModel::TransitionIdToHmmState(int32 trans_id) const {
+int32 Transitions::TransitionIdToHmmState(int32 trans_id) const {
   KALDI_ASSERT(trans_id != 0 && static_cast<size_t>(trans_id) < id2state_.size());
   int32 trans_state = id2state_[trans_id];
   const Tuple &t = tuples_[trans_state-1];
   return t.hmm_state;
 }
 
-void TransitionModel::Print(std::ostream &os,
+void Transitions::Print(std::ostream &os,
                             const std::vector<std::string> &phone_names,
                             const Vector<double> *occs) {
   if (occs != NULL)
@@ -841,7 +848,7 @@ void TransitionModel::Print(std::ostream &os,
       if (IsSelfLoop(tid)) os << " [self-loop]\n";
       else {
         int32 hmm_state = tuple.hmm_state;
-        const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(tuple.phone);
+        const Topology::TopologyEntry &entry = topo_.TopologyForPhone(tuple.phone);
         KALDI_ASSERT(static_cast<size_t>(hmm_state) < entry.size());
         int32 next_hmm_state = entry[hmm_state].transitions[tidx].first;
         KALDI_ASSERT(next_hmm_state != hmm_state);
@@ -851,7 +858,7 @@ void TransitionModel::Print(std::ostream &os,
   }
 }
 
-bool GetPdfsForPhones(const TransitionModel &trans_model,
+bool GetPdfsForPhones(const Transitions &trans_model,
                       const std::vector<int32> &phones,
                       std::vector<int32> *pdfs) {
   KALDI_ASSERT(IsSortedAndUniq(phones));
@@ -877,7 +884,7 @@ bool GetPdfsForPhones(const TransitionModel &trans_model,
   return true;
 }
 
-bool GetPhonesForPdfs(const TransitionModel &trans_model,
+bool GetPhonesForPdfs(const Transitions &trans_model,
                      const std::vector<int32> &pdfs,
                      std::vector<int32> *phones) {
   KALDI_ASSERT(IsSortedAndUniq(pdfs));
@@ -903,19 +910,19 @@ bool GetPhonesForPdfs(const TransitionModel &trans_model,
   return true;
 }
 
-bool TransitionModel::Compatible(const TransitionModel &other) const {
+bool Transitions::Compatible(const Transitions &other) const {
   return (topo_ == other.topo_ && tuples_ == other.tuples_ &&
           state2id_ == other.state2id_ && id2state_ == other.id2state_
           && num_pdfs_ == other.num_pdfs_);
 }
 
-bool TransitionModel::IsSelfLoop(int32 trans_id) const {
+bool Transitions::IsSelfLoop(int32 trans_id) const {
   KALDI_ASSERT(static_cast<size_t>(trans_id) < id2state_.size());
   int32 trans_state = id2state_[trans_id];
   int32 trans_index = trans_id - state2id_[trans_state];
   const Tuple &tuple = tuples_[trans_state-1];
   int32 phone = tuple.phone, hmm_state = tuple.hmm_state;
-  const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone);
+  const Topology::TopologyEntry &entry = topo_.TopologyForPhone(phone);
   KALDI_ASSERT(static_cast<size_t>(hmm_state) < entry.size());
   return (static_cast<size_t>(trans_index) < entry[hmm_state].transitions.size()
           && entry[hmm_state].transitions[trans_index].first == hmm_state);
diff --git a/src/hmm/transitions.h b/src/hmm/transitions.h
new file mode 100644
index 00000000000..b446e4cc6c4
--- /dev/null
+++ b/src/hmm/transitions.h
@@ -0,0 +1,263 @@
+// hmm/transitions.h
+
+// Copyright 2009-2012  Microsoft Corporation
+//                2015  Guoguo Chen
+//                2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_HMM_TRANSITIONS_H_
+#define KALDI_HMM_TRANSITIONS_H_
+
+#include "base/kaldi-common.h"
+#include "util/const-integer-set.h"
+#include "fst/fst-decl.h" // forward declarations.
+#include "hmm/topology.h"
+#include "itf/options-itf.h"
+#include "itf/context-dep-itf.h"
+#include "matrix/kaldi-vector.h"
+
+namespace kaldi {
+
+
+// The class Transitions handles various integer mappings.
+// It used to be the home for the trainable transitions, but these
+// no longer exist.  This class can be initialized from the
+// tree and the topology.
+//
+// The topology of an individual phone is as defined in topology.h.
+//
+//  This class basically defines the concept of a "transition-id",
+//  which is a construct that we use in compiled decoding graphs
+//  to make it easy to look up the 'pdf-id' (think of this as the
+//  distribution or neural net output column associated with this
+//  state) and also figure out which phone we are in and which
+//  arc in that phone.
+//
+//  In the original Kaldi, this object contained trainable transition
+//  probabilities, but these have been removed to simplify things.
+//
+//  A transition-id maps to a 4-tuple as follows:
+//       (pdf-id, phone, topo-state, arc-index)
+//  where 'topo-state' is the state index in the fst::StdFst
+//  for the topology, and 'arc-index' is the index of
+//  the arc leaving that state (zero for the first-listed one,
+//  one for the second, etc.)
+
+
+// List of the various types of quantity referred to here and what they mean:
+//           phone:  a phone index (1, 2, 3 ...)
+//       topo-state:  a state index in the phone-topology FST (see topology.h)
+//       arc-index:  The index of the arc leaving this topo-state:
+//                   0 for the first-listed one, 1 for the second.  Will be used
+//                   to Seek() in the ArcIterator.
+//          pdf-id:  A number output by the Compute() function of ContextDependency (it
+//                   indexes pdf's, either forward or self-loop).  Zero-based.
+//                   In DNN-based systems this would be the column index of
+//                   the neural net output.
+// (*)self-loop-pdf-id:  The pdf-id associated with the self-loop of this state,
+//                   if there is one (we do not allow >1), or -1 if there is no
+//                   self-loop.  This will be the same as pdf-id if this transition
+//                   *is* the self-loop.  It might seem odd that we require this
+//                   to get the transition-id for a non-self-loop arc; the reason
+//                   why it's necessary is that we initially create the graph
+//                   without self-loops (for efficiency) and we need to be able
+//                   to look up the corresponding self-loop transition-id to
+//                   add self-loops to the graph.
+//
+//   transition-id:  The numbers that we put on the decoding-graph arcs.
+//                   Each transition-id is associated with a 4-tuple
+//                   (pdf-id, phone, topo-state, arc-index).
+//
+
+
+class Transitions {
+
+ public:
+  /// Initialize the object.  This is deterministic, so initializing
+  /// from the same objects will give you an equivalent numbering.
+  /// The class keeps a copy of the Topology object, but not
+  /// the ContextDependency object.
+  Transitions(const ContextDependencyInterface &ctx_dep,
+              const Topology &topo);
+
+  /// Constructor that takes no arguments: typically used prior to calling Read.
+  Transitions(): num_pdfs_(0) { }
+
+  void Read(std::istream &is, bool binary);
+  void Write(std::ostream &os, bool binary) const;
+
+  // This struct is the information associated with one transition-id.
+  // You can work out the transition-id from the first 5 fields.
+  struct TransitionIdInfo {
+    int32 phone;      // The phone
+    int32 topo_state; // The state in the topology FST for this phone
+    int32 arc_index;  // The arc-index leaving this state
+    int32 pdf_id;     // The pdf-id associated with this arc (obtained from the
+                      // tree and phonetic-context information, etc.)
+
+    int32 self_loop_pdf_id;  // The pdf-id associated with the self-loop
+                             // transition (if any) leaving the *destination*
+                             // state of this arc, or -1 if that state has no
+                             // self-loop (pdf-ids are zero-based, so 0 is a
+                             // valid pdf-id).  Search for (*) above for
+                             // explanation.
+
+    // The remaining fields are 'derived information' that are worked out
+    // from the information above and from the phone topology, and placed
+    // here for convenience.
+
+    // is_self_loop is true if this is a self-loop (a transition to the same
+    // state).  We often need to know this, so it's convenient to have this
+    // information here.
+    bool is_self_loop;
+    // is_initial is true if this is a transition leaving the initial state,
+    // i.e. the first transition taken as you transition through the HMM
+    // (we check that the topology has no other transitions to the first
+    // HMM-state).
+    bool is_initial;
+
+    // is_final is true if this is a transition entering a final
+    // state.  This is used together with is_initial (and boundary
+    // information) to locate phone boundaries, e.g. for lattice
+    // word alignment: an 'is_final' transition-id followed by an
+    // 'is_initial' transition-id marks a phone boundary, which
+    // we know because we do not allow the start-state in
+    // topologies to be final.
+    bool is_final;
+
+    // transition_cost is the cost (negative log-prob) of this transition.
+    BaseFloat transition_cost;
+    // The transition-id associated with the self-loop of the *destination* of
+    // this arc, if there is one, or 0 if there is no such self-loop.
+    int32 self_loop_transition_id;
+
+    // Lexicographic order on all five identifying fields; arc_index breaks
+    // ties so that two arcs leaving the same topo_state never compare equal.
+    bool operator < (const TransitionIdInfo &other) const {
+      if (phone != other.phone) return phone < other.phone;
+      if (topo_state != other.topo_state) return topo_state < other.topo_state;
+      if (pdf_id != other.pdf_id) return pdf_id < other.pdf_id;
+      if (self_loop_pdf_id != other.self_loop_pdf_id)
+        return self_loop_pdf_id < other.self_loop_pdf_id;
+      return arc_index < other.arc_index;
+    }
+    // TODO.  operator == can compare all members.
+    bool operator == (const TransitionIdInfo &other) const;
+  };
+
+  /// return reference to HMM-topology object.
+  const Topology &GetTopo() const { return topo_; }
+
+  const TransitionIdInfo &InfoForTransitionId(int32 transition_id) const;
+
+  inline int32 TransitionIdToPdfFast(int32 trans_id) const;
+
+  /// This allows you to look up a transition-id.  It returns 0 if nothing
+  /// was found.
+  int32 TupleToTransitionId(int32 phone, int32 topo_state, int32 arc_index,
+                            int32 pdf_id, int32 self_loop_pdf_id) const;
+
+  /// Returns the total number of transition-ids (note, these are one-based).
+  /// Returns 0 if info_ is still empty (e.g. default-constructed object).
+  inline int32 NumTransitionIds() const {
+    return info_.empty() ? 0 : static_cast<int32>(info_.size()) - 1;
+  }
+
+  // NumPdfs() returns the number of pdfs (pdf-ids) in the tree,
+  // as returned by ctx_dep.NumPdfs() for the tree passed to the constructor.
+  int32 NumPdfs() const { return num_pdfs_; }
+
+  /// Returns a sorted, unique list of phones.
+  const std::vector<int32> &GetPhones() const { return topo_.GetPhones(); }
+
+  /// Print will print the transition model in a human-readable way, for purposes of human
+  /// inspection.  The "occs" are optional (they are indexed by pdf-id).
+  void Print(std::ostream &os,
+             const std::vector<std::string> &phone_names,
+             const Vector<double> *occs = NULL);
+
+  /// returns true if this is identical to 'other'
+  bool operator == (const Transitions &other) const;
+
+ private:
+
+  // Called from constructor.  initializes info_ (at least, the first 5
+  // fields); you then have to call ComputeDerived() to initialize the rest.
+  void ComputeInfo(const ContextDependencyInterface &ctx_dep);
+
+  void ComputeDerived();  // called from constructor and Read function: computes the derived fields of info_.
+
+  void Check() const;
+
+  Topology topo_;
+
+  /// Information about transition-ids, indexed by transition-id.
+  /// the tuples are in sorted order which allows us to do the reverse mapping from
+  /// tuple to transition-id.  (The element type must be non-const:
+  /// std::vector does not support const element types.)
+  std::vector<TransitionIdInfo> info_;
+
+  /// Accessing pdf_ids_[i] allows us to look up info_[i].pdf_id in a way that
+  /// is more friendly to memory caches than accessing info_; this is done in
+  /// the inner loops of decoders so it makes sense to optimize for it.
+  std::vector<int32> pdf_ids_;
+
+  /// This is a copy of the NumPdfs() returned by the tree when we constructed
+  /// this object.  Note: pdf-ids are zero-based.
+  int32 num_pdfs_;
+
+  KALDI_DISALLOW_COPY_AND_ASSIGN(Transitions);
+};
+
+inline int32 Transitions::TransitionIdToPdfFast(int32 trans_id) const {
+  // Note: it's a little dangerous to assert this only in paranoid mode.
+  // However, this function is called in the inner loop of decoders and
+  // the assertion likely takes a significant amount of time.  The intent is
+  // that past the end of the pdf_ids_ array there are big numbers, making the
+  // calling code more likely to segfault (rather than silently die) if this is
+  // called for out-of-range values.  TODO(review): confirm pdf_ids_ is padded.
+  KALDI_PARANOID_ASSERT(
+      static_cast<size_t>(trans_id) < pdf_ids_.size() &&
+      "Likely graph/model mismatch (graph built from wrong model?)");
+  return pdf_ids_[trans_id];
+}
+
+/// Works out which pdfs might correspond to the given phones.  Will return true
+/// if these pdfs correspond *just* to these phones, false if these pdfs are also
+/// used by other phones.
+/// @param trans_model [in] Transition-model used to work out this information
+/// @param phones [in] A sorted, uniq vector that represents a set of phones
+/// @param pdfs [out] Will be set to a sorted, uniq list of pdf-ids that correspond
+///                   to one of this set of phones.
+/// @return  Returns true if all of the pdfs output to "pdfs" correspond to phones from
+///          just this set (false if they may be shared with phones outside this set).
+bool GetPdfsForPhones(const Transitions &trans_model,
+                      const std::vector<int32> &phones,
+                      std::vector<int32> *pdfs);
+
+/// Works out which phones might correspond to the given pdfs (the converse of
+/// GetPdfsForPhones() above).  'pdfs' must be sorted and unique.
+bool GetPhonesForPdfs(const Transitions &trans_model,
+                      const std::vector<int32> &pdfs,
+                      std::vector<int32> *phones);
+/// @}
+
+
+} // end namespace kaldi
+
+
+#endif
diff --git a/src/hmm/tree-accu.cc b/src/hmm/tree-accu.cc
index c8ce49d9bc7..80041d275e6 100644
--- a/src/hmm/tree-accu.cc
+++ b/src/hmm/tree-accu.cc
@@ -33,7 +33,7 @@ static int32 MapPhone(const std::vector<int32> &phone_map,
 }
 
 
-void AccumulateTreeStats(const TransitionModel &trans_model,
+void AccumulateTreeStats(const Transitions &trans_model,
                          const AccumulateTreeStatsInfo &info,
                          const std::vector<int32> &alignment,
                          const Matrix<BaseFloat> &features,
diff --git a/src/hmm/tree-accu.h b/src/hmm/tree-accu.h
index 92e83c535c7..fd3e09567b5 100644
--- a/src/hmm/tree-accu.h
+++ b/src/hmm/tree-accu.h
@@ -23,7 +23,7 @@
 #include <cctype>  // For isspace.
 #include <limits>
 #include "base/kaldi-common.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "tree/clusterable-classes.h"
 #include "tree/build-tree-questions.h" // needed for this typedef:
 // typedef std::vector<std::pair<EventVector, Clusterable*> > BuildTreeStatsType;
@@ -74,7 +74,7 @@ struct AccumulateTreeStatsInfo {
 /// "normal" way).  It adds to 'stats' the stats obtained from this file.  Any
 /// new GaussClusterable* pointers in "stats" will be allocated with "new".
 
-void AccumulateTreeStats(const TransitionModel &trans_model,
+void AccumulateTreeStats(const Transitions &trans_model,
                          const AccumulateTreeStatsInfo &info,
                          const std::vector<int32> &alignment,
                          const Matrix<BaseFloat> &features,
diff --git a/src/itf/context-dep-itf.h b/src/itf/context-dep-itf.h
index 40681bb5ccd..1fda7b93020 100644
--- a/src/itf/context-dep-itf.h
+++ b/src/itf/context-dep-itf.h
@@ -62,9 +62,9 @@ class ContextDependencyInterface {
 
   /// GetPdfInfo returns a vector indexed by pdf-id, saying for each pdf which
   /// pairs of (phone, pdf-class) it can correspond to.  (Usually just one).
-  /// c.f. hmm/hmm-topology.h for meaning of pdf-class.
+  /// c.f. hmm/topology.h for meaning of pdf-class.
   /// This is the old, simpler interface of GetPdfInfo(), and that this one can
-  /// only be called if the HmmTopology object's IsHmm() function call returns
+  /// only be called if the Topology object's IsHmm() function call returns
   /// true.
   virtual void GetPdfInfo(
       const std::vector<int32> &phones,  // list of phones
diff --git a/src/lat/determinize-lattice-pruned.cc b/src/lat/determinize-lattice-pruned.cc
index 22eae8199ff..64d8c3fffc0 100644
--- a/src/lat/determinize-lattice-pruned.cc
+++ b/src/lat/determinize-lattice-pruned.cc
@@ -1290,7 +1290,7 @@ bool DeterminizeLatticePruned(const ExpandedFst<ArcTpl<Weight> > &ifst,
 
 template<class Weight>
 typename ArcTpl<Weight>::Label DeterminizeLatticeInsertPhones(
-    const kaldi::TransitionModel &trans_model,
+    const kaldi::Transitions &trans_model,
     MutableFst<ArcTpl<Weight> > *fst) {
   // Define some types.
   typedef ArcTpl<Weight> Arc;
@@ -1312,32 +1312,28 @@ typename ArcTpl<Weight>::Label DeterminizeLatticeInsertPhones(
          !aiter.Done(); aiter.Next()) {
       Arc arc = aiter.Value();
 
-      // Note: the words are on the input symbol side and transition-id's are on
+      // Note: the words are on the input symbol side and transition-ids are on
       // the output symbol side.
-      if ((arc.olabel != 0)
-          && (trans_model.TransitionIdToHmmState(arc.olabel) == 0)
-          && (!trans_model.IsSelfLoop(arc.olabel))) {
-        Label phone =
-            static_cast<Label>(trans_model.TransitionIdToPhone(arc.olabel));
-
-        // Skips <eps>.
-        KALDI_ASSERT(phone != 0);
-
-        if (arc.ilabel == 0) {
-          // If there is no word on the arc, insert the phone directly.
-          arc.ilabel = first_phone_label + phone;
-        } else {
-          // Otherwise, add an additional arc.
-          StateId additional_state = fst->AddState();
-          StateId next_state = arc.nextstate;
-          arc.nextstate = additional_state;
-          fst->AddArc(additional_state,
-                      Arc(first_phone_label + phone, 0,
-                          Weight::One(), next_state));
+      if (arc.olabel != 0) {
+        const auto &info = trans_model.InfoForTransitionId(arc.olabel);
+        if (info.is_initial && !info.is_self_loop) {
+          Label phone = static_cast<Label>(info.phone);
+          KALDI_ASSERT(phone != 0);
+          if (arc.ilabel == 0) {
+            // If there is no word on the arc, insert the phone directly.
+            arc.ilabel = first_phone_label + phone;
+          } else {
+            // Otherwise, add an additional arc (via a new state) for the phone.
+            StateId additional_state = fst->AddState();
+            StateId next_state = arc.nextstate;
+            arc.nextstate = additional_state;
+            fst->AddArc(additional_state,
+                        Arc(first_phone_label + phone, 0,
+                            Weight::One(), next_state));
+          }
+          aiter.SetValue(arc);
+        }
       }
-
-      aiter.SetValue(arc);
     }
   }
 
@@ -1387,7 +1383,7 @@ void DeterminizeLatticeDeletePhones(
 */
 template<class Weight, class IntType>
 bool DeterminizeLatticePhonePrunedFirstPass(
-    const kaldi::TransitionModel &trans_model,
+    const kaldi::Transitions &trans_model,
     double beam,
     MutableFst<ArcTpl<Weight> > *fst,
     const DeterminizeLatticePrunedOptions &opts) {
@@ -1410,7 +1406,7 @@ bool DeterminizeLatticePhonePrunedFirstPass(
 // lattice might be modified.
 template<class Weight, class IntType>
 bool DeterminizeLatticePhonePruned(
-    const kaldi::TransitionModel &trans_model,
+    const kaldi::Transitions &trans_model,
     MutableFst<ArcTpl<Weight> > *ifst,
     double beam,
     MutableFst<ArcTpl<CompactLatticeWeightTpl<Weight, IntType> > > *ofst,
@@ -1471,7 +1467,7 @@ bool DeterminizeLatticePhonePruned(
 // will be kept as unchanged.
 template<class Weight, class IntType>
 bool DeterminizeLatticePhonePruned(
-    const kaldi::TransitionModel &trans_model,
+    const kaldi::Transitions &trans_model,
     const ExpandedFst<ArcTpl<Weight> > &ifst,
     double beam,
     MutableFst<ArcTpl<CompactLatticeWeightTpl<Weight, IntType> > > *ofst,
@@ -1482,7 +1478,7 @@ bool DeterminizeLatticePhonePruned(
 }
 
 bool DeterminizeLatticePhonePrunedWrapper(
-    const kaldi::TransitionModel &trans_model,
+    const kaldi::Transitions &trans_model,
     MutableFst<kaldi::LatticeArc> *ifst,
     double beam,
     MutableFst<kaldi::CompactLatticeArc> *ofst,
@@ -1524,7 +1520,7 @@ bool DeterminizeLatticePruned<kaldi::LatticeWeight>(
 
 template
 bool DeterminizeLatticePhonePruned<kaldi::LatticeWeight, kaldi::int32>(
-    const kaldi::TransitionModel &trans_model,
+    const kaldi::Transitions &trans_model,
     const ExpandedFst<kaldi::LatticeArc> &ifst,
     double prune,
     MutableFst<kaldi::CompactLatticeArc> *ofst,
@@ -1532,7 +1528,7 @@ bool DeterminizeLatticePhonePruned<kaldi::LatticeWeight, kaldi::int32>(
 
 template
 bool DeterminizeLatticePhonePruned<kaldi::LatticeWeight, kaldi::int32>(
-    const kaldi::TransitionModel &trans_model,
+    const kaldi::Transitions &trans_model,
     MutableFst<kaldi::LatticeArc> *ifst,
     double prune,
     MutableFst<kaldi::CompactLatticeArc> *ofst,
diff --git a/src/lat/determinize-lattice-pruned.h b/src/lat/determinize-lattice-pruned.h
index 8e1858aa2b1..63154109ec2 100644
--- a/src/lat/determinize-lattice-pruned.h
+++ b/src/lat/determinize-lattice-pruned.h
@@ -28,7 +28,7 @@
 #include <set>
 #include <vector>
 #include "fstext/lattice-weight.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "itf/options-itf.h"
 #include "lat/kaldi-lattice.h"
 
@@ -222,7 +222,7 @@ bool DeterminizeLatticePruned(
 */
 template<class Weight>
 typename ArcTpl<Weight>::Label DeterminizeLatticeInsertPhones(
-    const kaldi::TransitionModel &trans_model,
+    const kaldi::Transitions &trans_model,
     MutableFst<ArcTpl<Weight> > *fst);
 
 /** This function takes in lattices and deletes "phones" from them. The "phones"
@@ -253,7 +253,7 @@ void DeterminizeLatticeDeletePhones(
 */
 template<class Weight, class IntType>
 bool DeterminizeLatticePhonePruned(
-    const kaldi::TransitionModel &trans_model,
+    const kaldi::Transitions &trans_model,
     const ExpandedFst<ArcTpl<Weight> > &ifst,
     double prune,
     MutableFst<ArcTpl<CompactLatticeWeightTpl<Weight, IntType> > > *ofst,
@@ -265,7 +265,7 @@ bool DeterminizeLatticePhonePruned(
 */
 template<class Weight, class IntType>
 bool DeterminizeLatticePhonePruned(
-    const kaldi::TransitionModel &trans_model,
+    const kaldi::Transitions &trans_model,
     MutableFst<ArcTpl<Weight> > *ifst,
     double prune,
     MutableFst<ArcTpl<CompactLatticeWeightTpl<Weight, IntType> > > *ofst,
@@ -282,7 +282,7 @@ bool DeterminizeLatticePhonePruned(
     code.
 */
 bool DeterminizeLatticePhonePrunedWrapper(
-    const kaldi::TransitionModel &trans_model,
+    const kaldi::Transitions &trans_model,
     MutableFst<kaldi::LatticeArc> *ifst,
     double prune,
     MutableFst<kaldi::CompactLatticeArc> *ofst,
diff --git a/src/lat/lattice-functions.cc b/src/lat/lattice-functions.cc
index 54c856a9403..878a99d79e3 100644
--- a/src/lat/lattice-functions.cc
+++ b/src/lat/lattice-functions.cc
@@ -24,7 +24,7 @@
 
 
 #include "lat/lattice-functions.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "util/stl-utils.h"
 #include "base/kaldi-math.h"
 #include "hmm/hmm-utils.h"
@@ -396,7 +396,7 @@ BaseFloat LatticeForwardBackward(const Lattice &lat, Posterior *post,
 }
 
 
-void LatticeActivePhones(const Lattice &lat, const TransitionModel &trans,
+void LatticeActivePhones(const Lattice &lat, const Transitions &trans,
                          const vector<int32> &silence_phones,
                          vector< std::set<int32> > *active_phones) {
   KALDI_ASSERT(IsSortedAndUniq(silence_phones));
@@ -411,7 +411,7 @@ void LatticeActivePhones(const Lattice &lat, const TransitionModel &trans,
         aiter.Next()) {
       const LatticeArc &arc = aiter.Value();
       if (arc.ilabel != 0) {  // Non-epsilon arc
-        int32 phone = trans.TransitionIdToPhone(arc.ilabel);
+        int32 phone = trans.InfoForTransitionId(arc.ilabel).phone;
         if (!std::binary_search(silence_phones.begin(),
                                 silence_phones.end(), phone))
           (*active_phones)[cur_time].insert(phone);
@@ -420,7 +420,7 @@ void LatticeActivePhones(const Lattice &lat, const TransitionModel &trans,
   }  // end looping over states
 }
 
-void ConvertLatticeToPhones(const TransitionModel &trans,
+void ConvertLatticeToPhones(const Transitions &trans,
                             Lattice *lat) {
   typedef LatticeArc Arc;
   int32 num_states = lat->NumStates();
@@ -429,11 +429,11 @@ void ConvertLatticeToPhones(const TransitionModel &trans,
         aiter.Next()) {
       Arc arc(aiter.Value());
       arc.olabel = 0; // remove any word.
-      if ((arc.ilabel != 0) // has a transition-id on input..
-          && (trans.TransitionIdToHmmState(arc.ilabel) == 0)
-          && (!trans.IsSelfLoop(arc.ilabel))) {
-         // && trans.IsFinal(arc.ilabel)) // there is one of these per phone...
-        arc.olabel = trans.TransitionIdToPhone(arc.ilabel);
+
+      if (arc.ilabel != 0) { // has a transition-id on input..
+        auto info = trans.InfoForTransitionId(arc.ilabel);
+        if (info.is_initial && !info.is_self_loop)
+          arc.olabel = info.phone;
       }
       aiter.SetValue(arc);
     }  // end looping over arcs
@@ -697,7 +697,7 @@ void CompactLatticeDepthPerFrame(const CompactLattice &clat,
 
 
 
-void ConvertCompactLatticeToPhones(const TransitionModel &trans,
+void ConvertCompactLatticeToPhones(const Transitions &trans,
                                    CompactLattice *clat) {
   typedef CompactLatticeArc Arc;
   typedef Arc::Weight Weight;
@@ -711,8 +711,9 @@ void ConvertCompactLatticeToPhones(const TransitionModel &trans,
       const std::vector<int32> &tid_seq = arc.weight.String();
       for (std::vector<int32>::const_iterator iter = tid_seq.begin();
            iter != tid_seq.end(); ++iter) {
-        if (trans.IsFinal(*iter))// note: there is one of these per phone...
-          phone_seq.push_back(trans.TransitionIdToPhone(*iter));
+        auto info = trans.InfoForTransitionId(*iter);
+        if (info.is_initial && !info.is_self_loop) // note: there is one of these per phone.
+          phone_seq.push_back(info.phone);
       }
       arc.weight.SetString(phone_seq);
       aiter.SetValue(arc);
@@ -723,8 +724,9 @@ void ConvertCompactLatticeToPhones(const TransitionModel &trans,
       const std::vector<int32> &tid_seq = f.String();
       for (std::vector<int32>::const_iterator iter = tid_seq.begin();
            iter != tid_seq.end(); ++iter) {
-        if (trans.IsFinal(*iter))// note: there is one of these per phone...
-          phone_seq.push_back(trans.TransitionIdToPhone(*iter));
+        auto info = trans.InfoForTransitionId(*iter);
+        if (info.is_initial && !info.is_self_loop) // note: there is one of these per phone.
+          phone_seq.push_back(info.phone);
       }
       f.SetString(phone_seq);
       clat->SetFinal(state, f);
@@ -732,7 +734,7 @@ void ConvertCompactLatticeToPhones(const TransitionModel &trans,
   }  // end looping over states
 }
 
-bool LatticeBoost(const TransitionModel &trans,
+bool LatticeBoost(const Transitions &trans,
                   const std::vector<int32> &alignment,
                   const std::vector<int32> &silence_phones,
                   BaseFloat b,
@@ -761,8 +763,8 @@ bool LatticeBoost(const TransitionModel &trans,
                      << "lattice/model mismatch?";
           return false;
         }
-        int32 phone = trans.TransitionIdToPhone(arc.ilabel),
-            ref_phone = trans.TransitionIdToPhone(alignment[cur_time]);
+        int32 phone = trans.InfoForTransitionId(arc.ilabel).phone,
+            ref_phone = trans.InfoForTransitionId(alignment[cur_time]).phone;
         BaseFloat frame_error;
         if (phone == ref_phone) {
           frame_error = 0.0;
@@ -792,7 +794,7 @@ bool LatticeBoost(const TransitionModel &trans,
 
 
 BaseFloat LatticeForwardBackwardMpeVariants(
-    const TransitionModel &trans,
+    const Transitions &trans,
     const std::vector<int32> &silence_phones,
     const Lattice &lat,
     const std::vector<int32> &num_ali,
@@ -873,8 +875,8 @@ BaseFloat LatticeForwardBackwardMpeVariants(
       double frame_acc = 0.0;
       if (arc.ilabel != 0) {
         int32 cur_time = state_times[s];
-        int32 phone = trans.TransitionIdToPhone(arc.ilabel),
-            ref_phone = trans.TransitionIdToPhone(num_ali[cur_time]);
+        int32 phone = trans.InfoForTransitionId(arc.ilabel).phone,
+            ref_phone = trans.InfoForTransitionId(num_ali[cur_time]).phone;
         bool phone_is_sil = std::binary_search(silence_phones.begin(),
                                                silence_phones.end(),
                                                phone),
@@ -883,8 +885,8 @@ BaseFloat LatticeForwardBackwardMpeVariants(
                                                   ref_phone),
             both_sil = phone_is_sil && ref_phone_is_sil;
         if (!is_mpfe) { // smbr.
-          int32 pdf = trans.TransitionIdToPdf(arc.ilabel),
-              ref_pdf = trans.TransitionIdToPdf(num_ali[cur_time]);
+          int32 pdf = trans.InfoForTransitionId(arc.ilabel).pdf_id,
+              ref_pdf = trans.InfoForTransitionId(num_ali[cur_time]).pdf_id;
           if (!one_silence_class)  // old behavior
             frame_acc = (pdf == ref_pdf && !phone_is_sil) ? 1.0 : 0.0;
           else
@@ -918,8 +920,8 @@ BaseFloat LatticeForwardBackwardMpeVariants(
       int32 transition_id = arc.ilabel;
       if (arc.ilabel != 0) {
         int32 cur_time = state_times[s];
-        int32 phone = trans.TransitionIdToPhone(arc.ilabel),
-            ref_phone = trans.TransitionIdToPhone(num_ali[cur_time]);
+        int32 phone = trans.InfoForTransitionId(arc.ilabel).phone,
+            ref_phone = trans.InfoForTransitionId(num_ali[cur_time]).phone;
         bool phone_is_sil = std::binary_search(silence_phones.begin(),
                                                silence_phones.end(), phone),
             ref_phone_is_sil = std::binary_search(silence_phones.begin(),
@@ -927,8 +929,8 @@ BaseFloat LatticeForwardBackwardMpeVariants(
                                                   ref_phone),
             both_sil = phone_is_sil && ref_phone_is_sil;
         if (!is_mpfe) { // smbr.
-          int32 pdf = trans.TransitionIdToPdf(arc.ilabel),
-              ref_pdf = trans.TransitionIdToPdf(num_ali[cur_time]);
+          int32 pdf = trans.InfoForTransitionId(arc.ilabel).pdf_id,
+              ref_pdf = trans.InfoForTransitionId(num_ali[cur_time]).pdf_id;
           if (!one_silence_class)  // old behavior
             frame_acc = (pdf == ref_pdf && !phone_is_sil) ? 1.0 : 0.0;
           else
@@ -1024,7 +1026,7 @@ bool CompactLatticeToWordAlignment(const CompactLattice &clat,
 
 
 bool CompactLatticeToWordProns(
-    const TransitionModel &tmodel,
+    const Transitions &tmodel,
     const CompactLattice &clat,
     std::vector<int32> *words,
     std::vector<int32> *begin_times,
@@ -1080,7 +1082,7 @@ bool CompactLatticeToWordProns(
       std::vector<int32> plengths(split_alignment.size());
       for (size_t i = 0; i < split_alignment.size(); i++) {
         KALDI_ASSERT(!split_alignment[i].empty());
-        phones[i] = tmodel.TransitionIdToPhone(split_alignment[i][0]);
+        phones[i] = tmodel.InfoForTransitionId(split_alignment[i][0]).phone;
         plengths[i] = split_alignment[i].size();
       }
       prons->push_back(phones);
@@ -1215,7 +1217,7 @@ struct ClatRescoreTuple {
     RescoreCompactLattice, "tmodel" will be NULL and speedup_factor will be 1.0.
  */
 bool RescoreCompactLatticeInternal(
-    const TransitionModel *tmodel,
+    const Transitions *tmodel,
     BaseFloat speedup_factor,
     DecodableInterface *decodable,
     CompactLattice *clat) {
@@ -1286,10 +1288,10 @@ bool RescoreCompactLatticeInternal(
     BaseFloat frame_scale = 1.0;
     KALDI_ASSERT(!time_to_state[t].empty());
     if (tmodel != NULL) {
-      int32 pdf_id = tmodel->TransitionIdToPdf(time_to_state[t][0].tid);
+      int32 pdf_id = tmodel->InfoForTransitionId(time_to_state[t][0].tid).pdf_id;
       bool frame_has_multiple_pdfs = false;
       for (size_t i = 1; i < time_to_state[t].size(); i++) {
-        if (tmodel->TransitionIdToPdf(time_to_state[t][i].tid) != pdf_id) {
+        if (tmodel->InfoForTransitionId(time_to_state[t][i].tid).pdf_id != pdf_id) {
           frame_has_multiple_pdfs = true;
           break;
         }
@@ -1345,7 +1347,7 @@ bool RescoreCompactLatticeInternal(
 
 
 bool RescoreCompactLatticeSpeedup(
-    const TransitionModel &tmodel,
+    const Transitions &tmodel,
     BaseFloat speedup_factor,
     DecodableInterface *decodable,
     CompactLattice *clat) {
@@ -1413,7 +1415,7 @@ bool RescoreLattice(DecodableInterface *decodable,
 
 
 BaseFloat LatticeForwardBackwardMmi(
-    const TransitionModel &tmodel,
+    const Transitions &tmodel,
     const Lattice &lat,
     const std::vector<int32> &num_ali,
     bool drop_frames,
diff --git a/src/lat/lattice-functions.h b/src/lat/lattice-functions.h
index c7fe4833a4a..b54b12551c8 100644
--- a/src/lat/lattice-functions.h
+++ b/src/lat/lattice-functions.h
@@ -30,7 +30,7 @@
 #include "base/kaldi-common.h"
 #include "hmm/posterior.h"
 #include "fstext/fstext-lib.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "lat/kaldi-lattice.h"
 #include "itf/decodable-itf.h"
 
@@ -137,21 +137,21 @@ void CompactLatticeLimitDepth(int32 max_arcs_per_frame,
 /// outputs for each frame the set of phones active on that frame.  If
 /// sil_phones (which must be sorted and uniq) is nonempty, it excludes
 /// phones in this list.
-void LatticeActivePhones(const Lattice &lat, const TransitionModel &trans,
+void LatticeActivePhones(const Lattice &lat, const Transitions &trans,
                          const std::vector<int32> &sil_phones,
                          std::vector<std::set<int32> > *active_phones);
 
 /// Given a lattice, and a transition model to map pdf-ids to phones,
 /// replace the output symbols (presumably words), with phones; we
-/// use the TransitionModel to work out the phone sequence.  Note
+/// use the Transitions object to work out the phone sequence.  Note
 /// that the phone labels are not exactly aligned with the phone
 /// boundaries.  We put a phone label to coincide with any transition
 /// to the final, nonemitting state of a phone (this state always exists,
-/// we ensure this in HmmTopology::Check()).  This would be the last
+/// we ensure this in Topology::Check()).  This would be the last
 /// transition-id in the phone if reordering is not done (but typically
 /// we do reorder).
 /// Also see PhoneAlignLattice, in phone-align-lattice.h.
-void ConvertLatticeToPhones(const TransitionModel &trans_model,
+void ConvertLatticeToPhones(const Transitions &trans_model,
                             Lattice *lat);
 
 /// Prunes a lattice or compact lattice.  Returns true on success, false if
@@ -164,7 +164,7 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
 /// replace the sequences of transition-ids with sequences of phones.
 /// Note that this is different from ConvertLatticeToPhones, in that
 /// we replace the transition-ids not the words.
-void ConvertCompactLatticeToPhones(const TransitionModel &trans_model,
+void ConvertCompactLatticeToPhones(const Transitions &trans_model,
                                    CompactLattice *clat);
 
 /// Boosts LM probabilities by b * [number of frame errors]; equivalently, adds
@@ -172,14 +172,14 @@ void ConvertCompactLatticeToPhones(const TransitionModel &trans_model,
 /// There is a frame error if a particular transition-id on a particular frame
 /// corresponds to a phone not matching transcription's alignment for that frame.
 /// This is used in "margin-inspired" discriminative training, esp. Boosted MMI.
-/// The TransitionModel is used to map transition-ids in the lattice
+/// The Transitions object is used to map transition-ids in the lattice
 /// input-side to phones; the phones appearing in
 /// "silence_phones" are treated specially in that we replace the frame error f
 /// (either zero or 1) for a frame, with the minimum of f or max_silence_error.
 /// For the normal recipe, max_silence_error would be zero.
 /// Returns true on success, false if there was some kind of mismatch.
 /// At input, silence_phones must be sorted and unique.
-bool LatticeBoost(const TransitionModel &trans,
+bool LatticeBoost(const Transitions &trans,
                   const std::vector<int32> &alignment,
                   const std::vector<int32> &silence_phones,
                   BaseFloat b,
@@ -226,7 +226,7 @@ bool LatticeBoost(const TransitionModel &trans,
                         pseudo log-likelihoods of states at each frame.
 */
 BaseFloat LatticeForwardBackwardMpeVariants(
-    const TransitionModel &trans,
+    const Transitions &trans,
     const std::vector<int32> &silence_phones,
     const Lattice &lat,
     const std::vector<int32> &num_ali,
@@ -261,7 +261,7 @@ BaseFloat LatticeForwardBackwardMpeVariants(
 
    It returns the forward-backward likelihood of the lattice. */
 BaseFloat LatticeForwardBackwardMmi(
-    const TransitionModel &trans,
+    const Transitions &trans,
     const Lattice &lat,
     const std::vector<int32> &num_ali,
     bool drop_frames,
@@ -298,7 +298,7 @@ bool CompactLatticeToWordAlignment(const CompactLattice &clat,
 /// did not have the correct format (e.g. if it is empty or it is not
 /// linear).
 bool CompactLatticeToWordProns(
-    const TransitionModel &tmodel,
+    const Transitions &tmodel,
     const CompactLattice &clat,
     std::vector<int32> *words,
     std::vector<int32> *begin_times,
@@ -350,7 +350,7 @@ int32 LongestSentenceLength(const CompactLattice &lat);
 /// speedup_factor; otherwise we set them to zero.  This gives the right
 /// expected probability so our corpus-level diagnostics will be about right.
 bool RescoreCompactLatticeSpeedup(
-    const TransitionModel &tmodel,
+    const Transitions &tmodel,
     BaseFloat speedup_factor,
     DecodableInterface *decodable,
     CompactLattice *clat);
diff --git a/src/lat/minimize-lattice.cc b/src/lat/minimize-lattice.cc
index 2132d324d20..2cae91a2563 100644
--- a/src/lat/minimize-lattice.cc
+++ b/src/lat/minimize-lattice.cc
@@ -22,7 +22,7 @@
 
 
 #include "lat/minimize-lattice.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "util/stl-utils.h"
 
 namespace fst {
diff --git a/src/lat/phone-align-lattice.cc b/src/lat/phone-align-lattice.cc
index a8da7b76a0f..6b7fb520a54 100644
--- a/src/lat/phone-align-lattice.cc
+++ b/src/lat/phone-align-lattice.cc
@@ -20,7 +20,7 @@
 
 
 #include "lat/phone-align-lattice.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "util/stl-utils.h"
 
 namespace kaldi {
@@ -58,7 +58,7 @@ class LatticePhoneAligner {
     /// wrong so don't trust the output too fully.
     /// Note: the "next_state" of the arc will not be set, you have to do that
     /// yourself.
-    bool OutputPhoneArc(const TransitionModel &tmodel,
+    bool OutputPhoneArc(const Transitions &tmodel,
                         const PhoneAlignLatticeOptions &opts,
                         CompactLatticeArc *arc_out,
                         bool *error);
@@ -67,7 +67,7 @@ class LatticePhoneAligner {
     /// the arc won't have any transition-ids on it.  This is intended to fix
     /// a particular pathology where too many words were pending and we had
     /// blowup.
-    bool OutputWordArc(const TransitionModel &tmodel,
+    bool OutputWordArc(const Transitions &tmodel,
                        const PhoneAlignLatticeOptions &opts,
                        CompactLatticeArc *arc_out,
                        bool *error);
@@ -91,7 +91,7 @@ class LatticePhoneAligner {
     /// will consist of partial words, and this will only
     /// happen for lattices that were somehow broken, i.e.
     /// had not reached the final state.
-    void OutputArcForce(const TransitionModel &tmodel,
+    void OutputArcForce(const Transitions &tmodel,
                         const PhoneAlignLatticeOptions &opts,
                         CompactLatticeArc *arc_out,
                         bool *error);
@@ -242,7 +242,7 @@ class LatticePhoneAligner {
   }
 
   LatticePhoneAligner(const CompactLattice &lat,
-                      const TransitionModel &tmodel,
+                      const Transitions &tmodel,
                       const PhoneAlignLatticeOptions &opts,
                      CompactLattice *lat_out):
       lat_(lat), tmodel_(tmodel), opts_(opts), lat_out_(lat_out),
@@ -280,7 +280,7 @@ class LatticePhoneAligner {
   }
 
   CompactLattice lat_;
-  const TransitionModel &tmodel_;
+  const Transitions &tmodel_;
   const PhoneAlignLatticeOptions &opts_;
   CompactLattice *lat_out_;
 
@@ -290,76 +290,71 @@ class LatticePhoneAligner {
 };
 
 bool LatticePhoneAligner::ComputationState::OutputPhoneArc(
-    const TransitionModel &tmodel,
+    const Transitions &tmodel,
     const PhoneAlignLatticeOptions &opts,
     CompactLatticeArc *arc_out,
     bool *error) {
   if (transition_ids_.empty()) return false;
-  int32 phone = tmodel.TransitionIdToPhone(transition_ids_[0]);
-  // we assume the start of transition_ids_ is the start of the phone;
-  // this is a precondition.
-  size_t len = transition_ids_.size(), i;
-  // Keep going till we reach a "final" transition-id; note, if
-  // reorder==true, we have to go a bit further after this.
-  for (i = 0; i < len; i++) {
-    int32 tid = transition_ids_[i];
-    int32 this_phone = tmodel.TransitionIdToPhone(tid);
-    if (this_phone != phone && ! *error) { // error condition: should have
-                                           // reached final transition-id first.
-      *error = true;
-      KALDI_WARN << phone << " -> " << this_phone;
-      KALDI_WARN << "Phone changed before final transition-id found "
-          "[broken lattice or mismatched model or wrong --reorder option?]";
-    }
-    if (tmodel.IsFinal(tid))
-      break;
-  }
-  if (i == len) return false; // fell off loop.
-  i++; // go past the one for which IsFinal returned true.
-  if (opts.reorder) // we have to consume the following self-loop transition-ids.
-    while (i < len && tmodel.IsSelfLoop(transition_ids_[i])) i++;
-  if (i == len) return false; // we don't know if it ends here... so can't output arc.
 
-  // interpret i as the number of transition-ids to consume.
-  std::vector<int32> tids_out(transition_ids_.begin(),
-                              transition_ids_.begin()+i);
+  const Transitions::TransitionIdInfo *prev_info = &tmodel.InfoForTransitionId(
+      transition_ids_[0]);
+  if (!prev_info->is_initial)
+    return false;
 
-  Label output_label = 0;
-  if (!word_labels_.empty()) {
-    output_label = word_labels_[0];
-    word_labels_.erase(word_labels_.begin(), word_labels_.begin()+1);
+  size_t len = transition_ids_.size(), i;
+  for (i = 1; i < len; i++) {
+    const Transitions::TransitionIdInfo *this_info = &tmodel.InfoForTransitionId(
+        transition_ids_[i]);
+    if (prev_info->is_final && this_info->is_initial) {
+      // This is a phone boundary.
+      Label output_label = 0;
+      if (!word_labels_.empty()) {
+        // Note: this word label won't necessarily be meaningfully aligned with
+        // the phones.
+        output_label = word_labels_[0];
+        word_labels_.erase(word_labels_.begin());
+      }
+      if (opts.replace_output_symbols)
+        output_label = prev_info->phone;
+      std::vector<int32> tids_out(transition_ids_.begin(),
+                                  transition_ids_.begin() + i);
+      *arc_out = CompactLatticeArc(output_label, output_label,
+                                   CompactLatticeWeight(weight_, tids_out),
+                                   fst::kNoStateId);
+      transition_ids_.erase(transition_ids_.begin(), transition_ids_.begin()+i);
+      weight_ = LatticeWeight::One(); // we just output the weight.
+      return true;
+    }
+    prev_info = this_info;
   }
-  if (opts.replace_output_symbols)
-    output_label = phone;
-  *arc_out = CompactLatticeArc(output_label, output_label,
-                               CompactLatticeWeight(weight_, tids_out),
-                               fst::kNoStateId);
-  transition_ids_.erase(transition_ids_.begin(), transition_ids_.begin()+i);
-  weight_ = LatticeWeight::One(); // we just output the weight.
-  return true;
+  return false;
 }
 
 bool LatticePhoneAligner::ComputationState::OutputWordArc(
-    const TransitionModel &tmodel,
+    const Transitions &tmodel,
     const PhoneAlignLatticeOptions &opts,
     CompactLatticeArc *arc_out,
     bool *error) {
   // output a word but no phones.
+
+  // I believe the reason we don't do this if there is just one word, is that we
+  // have reason to believe there is still a way to output that word on a
+  // regular phone arc.
   if (word_labels_.size() < 2) return false;
 
   int32 output_label = word_labels_[0];
-  word_labels_.erase(word_labels_.begin(), word_labels_.begin()+1);
+  word_labels_.erase(word_labels_.begin());
 
   *arc_out = CompactLatticeArc(output_label, output_label,
                                CompactLatticeWeight(weight_, std::vector<int32>()),
                                fst::kNoStateId);
-  weight_ = LatticeWeight::One(); // we just output the weight, so set it to one.
+  weight_ = LatticeWeight::One(); // we just output the cost, so remove it.
   return true;
 }
 
 
 void LatticePhoneAligner::ComputationState::OutputArcForce(
-    const TransitionModel &tmodel,
+    const Transitions &tmodel,
     const PhoneAlignLatticeOptions &opts,
     CompactLatticeArc *arc_out,
     bool *error) {
@@ -369,24 +364,25 @@ void LatticePhoneAligner::ComputationState::OutputArcForce(
   // although it might not be obvious from superficially checking
   // the code.  IsEmpty() would be true if we had transition_ids_.empty()
   // and opts.replace_output_symbols, so we would already die by assertion;
-  // in fact, this function would neve be called.
+  // in fact, this function would never have been called.
 
-  if (!transition_ids_.empty()) { // Do some checking here.
+  if (!transition_ids_.empty()) {
+    // Do some checking here.  We expect there to be exactly one phone on this
+    // arc.  This code is reached at the end of a lattice.
     int32 tid = transition_ids_[0];
-    phone = tmodel.TransitionIdToPhone(tid);
-    int32 num_final = 0;
+    phone = tmodel.InfoForTransitionId(tid).phone;
     for (int32 i = 0; i < transition_ids_.size(); i++) { // A check.
       int32 this_tid = transition_ids_[i];
-      int32 this_phone = tmodel.TransitionIdToPhone(this_tid);
-      bool is_final = tmodel.IsFinal(this_tid); // should be exactly one.
-      if (is_final) num_final++;
+      int32 this_phone = tmodel.InfoForTransitionId(this_tid).phone;
       if (this_phone != phone && ! *error) {
-        KALDI_WARN << "Mismatch in phone: error in lattice or mismatched transition model?";
+        KALDI_WARN << "Mismatch in phone: error in lattice or mismatched "
+            "transition model?";
         *error = true;
       }
     }
-    if (num_final != 1 && ! *error) {
-      KALDI_WARN << "Problem phone-aligning lattice: saw " << num_final
+    if (!tmodel.InfoForTransitionId(transition_ids_.back()).is_final &&
+        ! *error) {
+      KALDI_WARN << "Problem phone-aligning lattice: found no"
                  << " final-states in last phone in lattice (forced out?) "
                  << "Producing partial lattice.";
       *error = true;
@@ -396,7 +392,7 @@ void LatticePhoneAligner::ComputationState::OutputArcForce(
   Label output_label = 0;
   if (!word_labels_.empty()) {
     output_label = word_labels_[0];
-    word_labels_.erase(word_labels_.begin(), word_labels_.begin()+1);
+    word_labels_.erase(word_labels_.begin());
   }
   if (opts.replace_output_symbols)
     output_label = phone;
@@ -404,11 +400,11 @@ void LatticePhoneAligner::ComputationState::OutputArcForce(
                                CompactLatticeWeight(weight_, transition_ids_),
                                fst::kNoStateId);
   transition_ids_.clear();
-  weight_ = LatticeWeight::One(); // we just output the weight.
+  weight_ = LatticeWeight::One();
 }
 
 bool PhoneAlignLattice(const CompactLattice &lat,
-                       const TransitionModel &tmodel,
+                       const Transitions &tmodel,
                        const PhoneAlignLatticeOptions &opts,
                        CompactLattice *lat_out) {
   LatticePhoneAligner aligner(lat, tmodel, opts, lat_out);
diff --git a/src/lat/phone-align-lattice.h b/src/lat/phone-align-lattice.h
index 106e5e03e21..b8916f34e94 100644
--- a/src/lat/phone-align-lattice.h
+++ b/src/lat/phone-align-lattice.h
@@ -25,7 +25,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "fstext/fstext-lib.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "lat/kaldi-lattice.h"
 
 namespace kaldi {
@@ -59,7 +59,7 @@ struct PhoneAlignLatticeOptions {
 /// everything was OK, false if some kind of error was detected (e.g. the
 /// "reorder" option was incorrectly specified.)
 bool PhoneAlignLattice(const CompactLattice &lat,
-                       const TransitionModel &tmodel,
+                       const Transitions &tmodel,
                        const PhoneAlignLatticeOptions &opts,
                        CompactLattice *lat_out);
 
diff --git a/src/lat/push-lattice.cc b/src/lat/push-lattice.cc
index 616c8c5ad06..cf2464c6be8 100644
--- a/src/lat/push-lattice.cc
+++ b/src/lat/push-lattice.cc
@@ -22,7 +22,7 @@
 
 
 #include "lat/push-lattice.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "util/stl-utils.h"
 
 namespace fst {
diff --git a/src/lat/word-align-lattice-lexicon-test.cc b/src/lat/word-align-lattice-lexicon-test.cc
index 240153417b1..db70e21d43e 100644
--- a/src/lat/word-align-lattice-lexicon-test.cc
+++ b/src/lat/word-align-lattice-lexicon-test.cc
@@ -172,7 +172,7 @@ void GenerateCompactLatticeRandomly(const std::vector<int32> &alignment,
 
 void TestWordAlignLatticeLexicon() {
   ContextDependency *ctx_dep;
-  TransitionModel *trans_model = GenRandTransitionModel(&ctx_dep);
+  Transitions *trans_model = GenRandTransitionModel(&ctx_dep);
   bool allow_zero_words = true;
   bool allow_empty_word = true;
   bool allow_multiple_prons = true;
diff --git a/src/lat/word-align-lattice-lexicon.cc b/src/lat/word-align-lattice-lexicon.cc
index 63284b771de..2a268f90dbf 100644
--- a/src/lat/word-align-lattice-lexicon.cc
+++ b/src/lat/word-align-lattice-lexicon.cc
@@ -21,7 +21,7 @@
 #include "lat/phone-align-lattice.h"
 #include "lat/word-align-lattice-lexicon.h"
 #include "lat/lattice-functions.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "util/stl-utils.h"
 
@@ -71,7 +71,7 @@ class LatticeLexiconWordAligner {
     /// previously did PhoneAlignLattice, we can assume this arc corresponds to
     /// exactly one or zero phones.
     void Advance(const CompactLatticeArc &arc,
-                 const TransitionModel &tmodel,
+                 const Transitions &tmodel,
                  LatticeWeight *leftover_weight);
 
     /// Returns true if, assuming we were to add one or more phones by calling
@@ -283,7 +283,7 @@ class LatticeLexiconWordAligner {
   }
 
   LatticeLexiconWordAligner(const CompactLattice &lat,
-                            const TransitionModel &tmodel,
+                            const Transitions &tmodel,
                             const WordAlignLatticeLexiconInfo &lexicon_info,
                             int32 max_states,
                             int32 partial_word_label,
@@ -343,7 +343,7 @@ class LatticeLexiconWordAligner {
   }
 
   CompactLattice lat_in_;
-  const TransitionModel &tmodel_;
+  const Transitions &tmodel_;
   const WordAlignLatticeLexiconInfo &lexicon_info_;
   int32 max_states_;
   CompactLattice *lat_out_;
@@ -571,14 +571,18 @@ void LatticeLexiconWordAligner::ProcessFinalForceOut() {
 }
 
 void LatticeLexiconWordAligner::ComputationState::Advance(
-    const CompactLatticeArc &arc, const TransitionModel &tmodel, LatticeWeight *weight) {
+    const CompactLatticeArc &arc, const Transitions &tmodel, LatticeWeight *weight) {
   const std::vector<int32> &tids = arc.weight.String();
   int32 phone;
   if (tids.empty()) phone = 0;
   else {
-    phone = tmodel.TransitionIdToPhone(tids.front());
-    KALDI_ASSERT(phone == tmodel.TransitionIdToPhone(tids.back()) &&
-                 "Error: lattice is not phone-aligned.");
+    const Transitions::TransitionIdInfo
+        &start_info = tmodel.InfoForTransitionId(tids.front()),
+        &end_info = tmodel.InfoForTransitionId(tids.back());
+    if (!start_info.is_initial || !end_info.is_final ||
+        start_info.phone != end_info.phone)
+      KALDI_ERR << "Error: lattice is not phone-aligned.";
+    phone = start_info.phone;
   }
   if (arc.ilabel != 0) { // note: arc.ilabel==arc.olabel (acceptor)
     words_.push_back(arc.ilabel);
@@ -743,7 +747,7 @@ bool LatticeLexiconWordAligner::ComputationState::TakeTransition(
 // has the same input-word and output-word.  The other case is complex
 // to test.
 static bool IsPlausibleWord(const WordAlignLatticeLexiconInfo &lexicon_info,
-                            const TransitionModel &tmodel,
+                            const Transitions &tmodel,
                             int32 word_id,
                             const std::vector<int32> &transition_ids) {
 
@@ -754,7 +758,7 @@ static bool IsPlausibleWord(const WordAlignLatticeLexiconInfo &lexicon_info,
   std::vector<int32> phones(split_alignment.size());
   for (size_t i = 0; i < split_alignment.size(); i++) {
     KALDI_ASSERT(!split_alignment[i].empty());
-    phones[i] = tmodel.TransitionIdToPhone(split_alignment[i][0]);
+    phones[i] = tmodel.InfoForTransitionId(split_alignment[i][0]).phone;
   }
   std::vector<int32> lexicon_entry;
   lexicon_entry.push_back(word_id);
@@ -925,7 +929,7 @@ static void MapSymbols(const WordAlignLatticeLexiconInfo &lexicon_info,
 }
 
 static bool TestWordAlignedLattice(const WordAlignLatticeLexiconInfo &lexicon_info,
-                                   const TransitionModel &tmodel,
+                                   const Transitions &tmodel,
                                    CompactLattice clat,
                                    CompactLattice aligned_clat,
                                    bool allow_duplicate_paths) {
@@ -999,7 +1003,7 @@ static bool TestWordAlignedLattice(const WordAlignLatticeLexiconInfo &lexicon_in
 
 // This is the wrapper function for users to call.
 bool WordAlignLatticeLexicon(const CompactLattice &lat,
-                             const TransitionModel &tmodel,
+                             const Transitions &tmodel,
                              const WordAlignLatticeLexiconInfo &lexicon_info,
                              const WordAlignLatticeLexiconOpts &opts,
                              CompactLattice *lat_out) {
@@ -1065,4 +1069,3 @@ bool ReadLexiconForWordAlign (std::istream &is,
 }
 
 }  // namespace kaldi
-
diff --git a/src/lat/word-align-lattice-lexicon.h b/src/lat/word-align-lattice-lexicon.h
index 915142234a0..823def61f80 100644
--- a/src/lat/word-align-lattice-lexicon.h
+++ b/src/lat/word-align-lattice-lexicon.h
@@ -25,7 +25,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "fstext/fstext-lib.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "lat/kaldi-lattice.h"
 
 namespace kaldi {
@@ -161,7 +161,7 @@ struct WordAlignLatticeLexiconOpts {
 /// error including when the the lattice seems to have been "forced out"
 /// (did not reach end state, resulting in partial word at end).
 bool WordAlignLatticeLexicon(const CompactLattice &lat,
-                             const TransitionModel &tmodel,
+                             const Transitions &tmodel,
                              const WordAlignLatticeLexiconInfo &lexicon_info,
                              const WordAlignLatticeLexiconOpts &opts,
                              CompactLattice *lat_out);
@@ -177,7 +177,7 @@ bool WordAlignLatticeLexicon(const CompactLattice &lat,
 ///   partial-word arcs, with the partial-word label.
 ///   silence arcs, with the silence label.
 void TestWordAlignedLatticeLexicon(const CompactLattice &lat,
-                                   const TransitionModel &tmodel,
+                                   const Transitions &tmodel,
                                    const std::vector<std::vector<int32> > &lexicon,
                                    const CompactLattice &aligned_lat,
                                    bool allow_duplicate_paths);
diff --git a/src/lat/word-align-lattice.cc b/src/lat/word-align-lattice.cc
index 3cc43d54100..56514822130 100644
--- a/src/lat/word-align-lattice.cc
+++ b/src/lat/word-align-lattice.cc
@@ -19,7 +19,7 @@
 
 
 #include "lat/word-align-lattice.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "util/stl-utils.h"
 
 namespace kaldi {
@@ -57,7 +57,7 @@ class LatticeWordAligner {
     /// Note: the "next_state" of the arc will not be set, you have to do that
     /// yourself.
     bool OutputArc(const WordBoundaryInfo &info,
-                   const TransitionModel &tmodel,
+                   const Transitions &tmodel,
                    CompactLatticeArc *arc_out,
                    bool *error) {
       // order of this ||-expression doesn't matter for
@@ -69,15 +69,15 @@ class LatticeWordAligner {
     }
 
     bool OutputSilenceArc(const WordBoundaryInfo &info,
-                          const TransitionModel &tmodel,
+                          const Transitions &tmodel,
                           CompactLatticeArc *arc_out,
                           bool *error);
     bool OutputOnePhoneWordArc(const WordBoundaryInfo &info,
-                               const TransitionModel &tmodel,
+                               const Transitions &tmodel,
                                CompactLatticeArc *arc_out,
                                bool *error);
     bool OutputNormalWordArc(const WordBoundaryInfo &info,
-                             const TransitionModel &tmodel,
+                             const Transitions &tmodel,
                              CompactLatticeArc *arc_out,
                              bool *error);
 
@@ -101,7 +101,7 @@ class LatticeWordAligner {
     /// happen for lattices that were somehow broken, i.e.
     /// had not reached the final state.
     void OutputArcForce(const WordBoundaryInfo &info,
-                        const TransitionModel &tmodel,
+                        const Transitions &tmodel,
                         CompactLatticeArc *arc_out,
                         bool *error);
 
@@ -185,7 +185,7 @@ class LatticeWordAligner {
       // have returned false or we wouldn't have been called, so we have to
       // force it out.
       CompactLatticeArc lat_arc;
-      tuple.comp_state.OutputArcForce(info_, tmodel_, &lat_arc, &error_);
+      tuple.comp_state.OutputArcForce(wb_info_, tmodel_, &lat_arc, &error_);
       // True in the next line means add it to the queue.
       lat_arc.nextstate = GetStateForTuple(tuple, true);
       // The final-prob stuff will get called again from ProcessQueueElement().
@@ -211,7 +211,7 @@ class LatticeWordAligner {
     // epsilon-sequencing rules encoded by the filters in
     // composition.
     CompactLatticeArc lat_arc;
-    if (tuple.comp_state.OutputArc(info_, tmodel_, &lat_arc, &error_)) {
+    if (tuple.comp_state.OutputArc(wb_info_, tmodel_, &lat_arc, &error_)) {
       // note: this function changes the tuple (when it returns true).
       lat_arc.nextstate = GetStateForTuple(tuple, true); // true == add to queue,
       // if not already present.
@@ -250,11 +250,11 @@ class LatticeWordAligner {
   }
 
   LatticeWordAligner(const CompactLattice &lat,
-                     const TransitionModel &tmodel,
+                     const Transitions &tmodel,
                      const WordBoundaryInfo &info,
                      int32 max_states,
                      CompactLattice *lat_out):
-      lat_(lat), tmodel_(tmodel), info_in_(info), info_(info),
+      lat_(lat), tmodel_(tmodel), wb_info_in_(info), wb_info_(info),
       max_states_(max_states), lat_out_(lat_out),
       error_(false) {
     bool test = true;
@@ -272,17 +272,17 @@ class LatticeWordAligner {
     // stage, where we don't want the arcs corresponding to silence or
     // partial words to be removed-- only the arcs with nothing at all
     // on them.
-    if (info_.partial_word_label == 0 || info_.silence_label == 0) {
+    if (wb_info_.partial_word_label == 0 || wb_info_.silence_label == 0) {
       int32 unused_label = 1 + HighestNumberedOutputSymbol(lat);
-      if (info_.partial_word_label >= unused_label)
-        unused_label = info_.partial_word_label + 1;
-      if (info_.silence_label >= unused_label)
-        unused_label = info_.silence_label + 1;
+      if (wb_info_.partial_word_label >= unused_label)
+        unused_label = wb_info_.partial_word_label + 1;
+      if (wb_info_.silence_label >= unused_label)
+        unused_label = wb_info_.silence_label + 1;
       KALDI_ASSERT(unused_label > 0);
-      if (info_.partial_word_label == 0)
-        info_.partial_word_label = unused_label++;
-      if (info_.silence_label == 0)
-        info_.silence_label = unused_label;
+      if (wb_info_.partial_word_label == 0)
+        wb_info_.partial_word_label = unused_label++;
+      if (wb_info_.silence_label == 0)
+        wb_info_.silence_label = unused_label;
     }
   }
 
@@ -294,10 +294,10 @@ class LatticeWordAligner {
     // Remove epsilon arcs from output lattice.
     RmEpsilon(lat_out_, true); // true = connect.
     std::vector<int32> syms_to_remove;
-    if (info_in_.partial_word_label == 0)
-      syms_to_remove.push_back(info_.partial_word_label);
-    if (info_in_.silence_label == 0)
-      syms_to_remove.push_back(info_.silence_label);
+    if (wb_info_in_.partial_word_label == 0)
+      syms_to_remove.push_back(wb_info_.partial_word_label);
+    if (wb_info_in_.silence_label == 0)
+      syms_to_remove.push_back(wb_info_.silence_label);
     if (!syms_to_remove.empty()) {
       RemoveSomeInputSymbols(syms_to_remove, lat_out_);
       Project(lat_out_, fst::PROJECT_INPUT);
@@ -332,9 +332,9 @@ class LatticeWordAligner {
   }
 
   CompactLattice lat_;
-  const TransitionModel &tmodel_;
-  const WordBoundaryInfo &info_in_;
-  WordBoundaryInfo info_;
+  const Transitions &tmodel_;
+  const WordBoundaryInfo &wb_info_in_;
+  WordBoundaryInfo wb_info_;
   int32 max_states_;
   CompactLattice *lat_out_;
 
@@ -348,222 +348,175 @@ class LatticeWordAligner {
 };
 
 bool LatticeWordAligner::ComputationState::OutputSilenceArc(
-    const WordBoundaryInfo &info, const TransitionModel &tmodel,
+    const WordBoundaryInfo &wb_info, const Transitions &tmodel,
     CompactLatticeArc *arc_out,  bool *error) {
   if (transition_ids_.empty()) return false;
-  int32 phone = tmodel.TransitionIdToPhone(transition_ids_[0]);
-  if (info.TypeOfPhone(phone) != WordBoundaryInfo::kNonWordPhone) return false;
+  const Transitions::TransitionIdInfo *prev_info = &tmodel.InfoForTransitionId(
+      transition_ids_[0]);
+
+  if (wb_info.TypeOfPhone(prev_info->phone) != WordBoundaryInfo::kNonWordPhone)
+    return false;
+
 
   // we assume the start of transition_ids_ is the start of the phone [silence];
   // this is a precondition.
+  if (!prev_info->is_initial) {
+    KALDI_WARN << "Something went wrong in word alignment; likely model mismatch.";
+    return false;  // NOTE(review): *error is not set on this path -- confirm intended.
+  }
+
   size_t len = transition_ids_.size(), i;
-  // Keep going till we reach a "final" transition-id; note, if
-  // reorder==true, we have to go a bit further after this.
-  for (i = 0; i < len; i++) {
-    int32 tid = transition_ids_[i];
-    int32 this_phone = tmodel.TransitionIdToPhone(tid);
-    if (this_phone != phone && ! *error) { // error condition: should have reached final transition-id first.
-      *error = true;
-      KALDI_WARN << "Phone changed before final transition-id found "
-          "[broken lattice or mismatched model or wrong --reorder option?]";
+
+  for (i = 1; i < len; i++) {  // compare each transition-id with the one before it.
+    const Transitions::TransitionIdInfo *this_info = &tmodel.InfoForTransitionId(
+        transition_ids_[i]);
+    if (prev_info->is_final && this_info->is_initial) {
+      // Phone boundary: emit transition-ids [0, i) as one silence-labelled arc.
+      std::vector<int32> tids_out(transition_ids_.begin(),
+                                  transition_ids_.begin() + i);
+      *arc_out = CompactLatticeArc(wb_info.silence_label, wb_info.silence_label,
+                                   CompactLatticeWeight(weight_, tids_out), fst::kNoStateId);
+      transition_ids_.erase(transition_ids_.begin(), transition_ids_.begin() + i);
+      weight_ = LatticeWeight::One(); // we just output the weight.
+      return true;
     }
-    if (tmodel.IsFinal(tid))
-      break;
-  }
-  if (i == len) return false; // fell off loop.
-  i++; // go past the one for which IsFinal returned true.
-  if (info.reorder) // we have to consume the following self-loop transition-ids.
-    while (i < len && tmodel.IsSelfLoop(transition_ids_[i])) i++;
-  if (i == len) return false; // we don't know if it ends here... so can't output arc.
-
-  if (tmodel.TransitionIdToPhone(transition_ids_[i-1]) != phone
-      && ! *error) { // another check.
-    KALDI_WARN << "Phone changed unexpectedly in lattice "
-        "[broken lattice or mismatched model?]";
+    prev_info = this_info;
   }
-  // interpret i as the number of transition-ids to consume.
-  std::vector<int32> tids_out(transition_ids_.begin(), transition_ids_.begin()+i);
-
-  // consumed transition ids from our internal state.
-  *arc_out = CompactLatticeArc(info.silence_label, info.silence_label,
-                               CompactLatticeWeight(weight_, tids_out), fst::kNoStateId);
-  transition_ids_.erase(transition_ids_.begin(), transition_ids_.begin()+i); // delete these
-  weight_ = LatticeWeight::One(); // we just output the weight.
-  return true;
+  // We couldn't find a phone boundary.  Note: we also return false if the
+  // silence phone ended exactly at the end of this sequence, because at this point
+  // we can't tell that it ended there.  End of lattice effects will be handled separately.
+  return false;
 }
 
 
 bool LatticeWordAligner::ComputationState::OutputOnePhoneWordArc(
-    const WordBoundaryInfo &info, const TransitionModel &tmodel,
+    const WordBoundaryInfo &wb_info, const Transitions &tmodel,
     CompactLatticeArc *arc_out,  bool *error) {
   if (transition_ids_.empty()) return false;
   if (word_labels_.empty()) return false;
-  int32 phone = tmodel.TransitionIdToPhone(transition_ids_[0]);
-  if (info.TypeOfPhone(phone) != WordBoundaryInfo::kWordBeginAndEndPhone)
+  const Transitions::TransitionIdInfo *prev_info = &tmodel.InfoForTransitionId(
+      transition_ids_[0]);
+  if (wb_info.TypeOfPhone(prev_info->phone) != WordBoundaryInfo::kWordBeginAndEndPhone)
     return false;
-  // we assume the start of transition_ids_ is the start of the phone.
-  // this is a precondition.
+  if (!prev_info->is_initial) {  // the buffered sequence must start on an is_initial transition-id.
+    KALDI_WARN << "Something went wrong in word alignment; likely model mismatch.";
+    return false;  // NOTE(review): *error is not set on this path -- confirm intended.
+  }
+
   size_t len = transition_ids_.size(), i;
-  for (i = 0; i < len; i++) {
-    int32 tid = transition_ids_[i];
-    int32 this_phone = tmodel.TransitionIdToPhone(tid);
-    if (this_phone != phone && ! *error) { // error condition: should have reached final transition-id first.
-      KALDI_WARN << "Phone changed before final transition-id found "
-          "[broken lattice or mismatched model or wrong --reorder option?]";
-      // just continue, ignoring this-- we'll probably output something...
+  for (i = 1; i < len; i++) {  // compare each transition-id with the one before it.
+    const Transitions::TransitionIdInfo *this_info = &tmodel.InfoForTransitionId(
+        transition_ids_[i]);
+    if (prev_info->is_final && this_info->is_initial) {
+      // Phone boundary: emit transition-ids [0, i) as an arc labelled with the first pending word.
+      int32 word = word_labels_[0];
+      std::vector<int32> tids_out(transition_ids_.begin(),
+                                  transition_ids_.begin() + i);
+      *arc_out = CompactLatticeArc(word, word,
+                                   CompactLatticeWeight(weight_, tids_out), fst::kNoStateId);
+      transition_ids_.erase(transition_ids_.begin(),
+                            transition_ids_.begin() + i);
+      weight_ = LatticeWeight::One();  // we just output the weight.
+      word_labels_.erase(word_labels_.begin());  // remove the word that we just output.
+      return true;
     }
-    if (tmodel.IsFinal(tid))
-      break;
-  }
-  if (i == len) return false; // fell off loop.
-  i++; // go past the one for which IsFinal returned true.
-  if (info.reorder) // we have to consume the following self-loop transition-ids.
-    while (i < len && tmodel.IsSelfLoop(transition_ids_[i])) i++;
-  if (i == len) return false; // we don't know if it ends here... so can't output arc.
-
-  if (tmodel.TransitionIdToPhone(transition_ids_[i-1]) != phone
-      && ! *error) { // another check.
-    KALDI_WARN << "Phone changed unexpectedly in lattice "
-        "[broken lattice or mismatched model?]";
-    *error = true;
+    prev_info = this_info;
   }
-
-  // interpret i as the number of transition-ids to consume.
-  std::vector<int32> tids_out(transition_ids_.begin(),
-                              transition_ids_.begin() + i);
-
-  // consumed transition ids from our internal state.
-  int32 word = word_labels_[0];
-  *arc_out = CompactLatticeArc(word, word,
-                               CompactLatticeWeight(weight_, tids_out), fst::kNoStateId);
-  transition_ids_.erase(transition_ids_.begin(),
-                        transition_ids_.begin() + i); // delete these
-  // Remove the word that we just output.
-  word_labels_.erase(word_labels_.begin(), word_labels_.begin() + 1);
-  weight_ = LatticeWeight::One(); // we just output the weight.
-  return true;
+  // We couldn't find the end of the phone, i.e. of this one-phone word.  Note: we
+  // also return false if the phone ended exactly at the end of this sequence, because
+  // at this point we can't tell that it ended there.  End of lattice effects will be handled separately.
+  return false;
 }
 
 
 /// This function tries to see if it can output a normal word arc--
 /// one with at least two phones in it.
 bool LatticeWordAligner::ComputationState::OutputNormalWordArc(
-    const WordBoundaryInfo &info, const TransitionModel &tmodel,
+    const WordBoundaryInfo &wb_info, const Transitions &tmodel,
     CompactLatticeArc *arc_out,  bool *error) {
   if (transition_ids_.empty()) return false;
   if (word_labels_.empty()) return false;
-  int32 begin_phone = tmodel.TransitionIdToPhone(transition_ids_[0]);
-  if (info.TypeOfPhone(begin_phone) != WordBoundaryInfo::kWordBeginPhone)
+  const Transitions::TransitionIdInfo *prev_info = &tmodel.InfoForTransitionId(
+      transition_ids_[0]);
+  if (wb_info.TypeOfPhone(prev_info->phone) != WordBoundaryInfo::kWordBeginPhone)
     return false;
-  // we assume the start of transition_ids_ is the start of the phone.
+  // Note, we assume the start of transition_ids_ is the start of the phone.
   // this is a precondition.
   size_t len = transition_ids_.size(), i;
 
-  // Eat up the transition-ids of this word-begin phone until we get to the
-  // "final" transition-id.  [there may be self-loops following this though,
-  // if reorder==true]
-  for (i = 0; i < len && !tmodel.IsFinal(transition_ids_[i]); i++);
-  if (i == len) return false;
-  i++; // Skip over this final-transition.
-  if (info.reorder) // Skip over any reordered self-loops for this final-transition
-    for (; i < len && tmodel.IsSelfLoop(transition_ids_[i]); i++);
-  if (i == len) return false;
-  if (tmodel.TransitionIdToPhone(transition_ids_[i-1]) != begin_phone
-      && ! *error) { // another check.
-    KALDI_WARN << "Phone changed unexpectedly in lattice "
-        "[broken lattice or mismatched model?]";
-    *error = true;
-  }
-  // Now keep going till we hit a word-ending phone.
-  // Note: we don't expect anything except word-internal phones
-  // here, but we'll just print a warning if we get something
-  // else.
-  for (; i < len; i++) {
-    int32 this_phone = tmodel.TransitionIdToPhone(transition_ids_[i]);
-    if (info.TypeOfPhone(this_phone) == WordBoundaryInfo::kWordEndPhone)
+  for (i = 1; i < len; i++) {
+    const Transitions::TransitionIdInfo *this_info = &tmodel.InfoForTransitionId(
+        transition_ids_[i]);
+    if (prev_info->is_final && this_info->is_initial) {
+      // This is a phone boundary: the word-initial phone ends just before i.
       break;
-    if (info.TypeOfPhone(this_phone) != WordBoundaryInfo::kWordInternalPhone
-        && !*error) {
-      KALDI_WARN << "Unexpected phone " << this_phone
-                 << " found inside a word.";
-      *error = true;
     }
+    prev_info = this_info;
   }
-  if (i == len) return false;
-
-  // OK, we hit a word-ending phone.  Continue till we get to
-  // a "final-transition".
-
-  // this variable just used for checks.
-  int32 final_phone = tmodel.TransitionIdToPhone(transition_ids_[i]);
-  for (; i < len; i++) {
-    int32 this_phone = tmodel.TransitionIdToPhone(transition_ids_[i]);
-    if (this_phone != final_phone && ! *error) {
-      *error = true;
-      KALDI_WARN << "Phone changed before final transition-id found "
-          "[broken lattice or mismatched model or wrong --reorder option?]";
-    }
-    if (tmodel.IsFinal(transition_ids_[i])) break;
+  // OK, we just consumed the word-initial phone; transition_ids_[i], if
+  // present, is the first transition-id of the phone that follows it.
+  if (i == len)
+    return false;
+  // Continue scanning from position i (restarting from 1 would re-detect the
+  // boundary of the word-initial phone).  Each boundary found below ends either
+  // a word-internal phone, which we skip, or the word-final phone.
+  prev_info = &tmodel.InfoForTransitionId(transition_ids_[i]);
+  i++;
   }
-  if (i == len) return false;
-  i++;
-  // We got to the final-transition of the final phone;
-  // if reorder==true, continue eating up the self-loop.
-  if (info.reorder == true)
-    while (i < len && tmodel.IsSelfLoop(transition_ids_[i])) i++;
-  if (i == len) return false;
-  if (tmodel.TransitionIdToPhone(transition_ids_[i-1]) != final_phone
-      && ! *error) {
-    *error = true;
-    KALDI_WARN << "Phone changed while following final self-loop "
-        "[broken lattice or mismatched model or wrong --reorder option?]";
+  // Look for the end of the word-final phone; we need to see the first
+  // transition-id of the following phone to know that the phone has ended.
+  for (; i < len; i++) {
+    const Transitions::TransitionIdInfo *this_info = &tmodel.InfoForTransitionId(
+        transition_ids_[i]);
+    if (prev_info->is_final && this_info->is_initial) {
+      // This is a phone boundary.
+      if (wb_info.TypeOfPhone(prev_info->phone) ==
+          WordBoundaryInfo::kWordInternalPhone) {
+        prev_info = this_info;  // word-internal phone ended: keep scanning.
+        continue;
+      } else if (wb_info.TypeOfPhone(prev_info->phone) !=
+                 WordBoundaryInfo::kWordEndPhone) {
+        if (! *error) {
+          *error = true;
+          KALDI_WARN << "Unexpected phone sequences found.. something is wrong.";
+        }
+        return false;
+      }
+      int32 word = word_labels_[0];
+      std::vector<int32> tids_out(transition_ids_.begin(),
+                                  transition_ids_.begin() + i);
+      *arc_out = CompactLatticeArc(word, word,
+                                   CompactLatticeWeight(weight_, tids_out), fst::kNoStateId);
+      transition_ids_.erase(transition_ids_.begin(), transition_ids_.begin() + i);
+      weight_ = LatticeWeight::One();  // we just output the weight.
+      word_labels_.erase(word_labels_.begin());
+      return true;
+    }
+    prev_info = this_info;
   }
-
-  // OK, we're ready to output the word.
-  // Interpret i as the number of transition-ids to consume.
-  std::vector<int32> tids_out(transition_ids_.begin(),
-                              transition_ids_.begin() + i);
-
-  // consumed transition ids from our internal state.
-  int32 word = word_labels_[0];
-  *arc_out = CompactLatticeArc(word, word,
-                               CompactLatticeWeight(weight_, tids_out),
-                               fst::kNoStateId);
-  transition_ids_.erase(transition_ids_.begin(),
-                        transition_ids_.begin() + i); // delete these
-  // Remove the word that we just output.
-  word_labels_.erase(word_labels_.begin(),
-                     word_labels_.begin() + 1);
-  weight_ = LatticeWeight::One(); // we just output the weight.
-  return true;
+  return false;  // ran out of transition-ids before the word was seen to end.
 }
 
 // Returns true if this vector of transition-ids could be a valid
 // word.  Note: the checks are not 100% exhaustive.
-static bool IsPlausibleWord(const WordBoundaryInfo &info,
-                            const TransitionModel &tmodel,
+static bool IsPlausibleWord(const WordBoundaryInfo &wb_info,
+                            const Transitions &tmodel,
                             const std::vector<int32> &transition_ids) {
   if (transition_ids.empty()) return false;
-  int32 first_phone = tmodel.TransitionIdToPhone(transition_ids.front()),
-      last_phone = tmodel.TransitionIdToPhone(transition_ids.back());
-  if ( (info.TypeOfPhone(first_phone) == WordBoundaryInfo::kWordBeginAndEndPhone
-        && first_phone == last_phone)
-       ||
-       (info.TypeOfPhone(first_phone) == WordBoundaryInfo::kWordBeginPhone &&
-        info.TypeOfPhone(last_phone) == WordBoundaryInfo::kWordEndPhone) ) {
-    if (! info.reorder) {
-      return (tmodel.IsFinal(transition_ids.back()));
-    } else {
-      int32 i = transition_ids.size() - 1;
-      while (i > 0 && tmodel.IsSelfLoop(transition_ids[i])) i--;
-      return tmodel.IsFinal(transition_ids[i]);
-    }
-  } else return false;
+  const Transitions::TransitionIdInfo
+      &first_info = tmodel.InfoForTransitionId(transition_ids.front()),
+      &last_info = tmodel.InfoForTransitionId(transition_ids.back());
+  if (!first_info.is_initial || !last_info.is_final)
+    return false;  // must start on an is_initial and end on an is_final transition-id.
+  int32 first_phone = first_info.phone, last_phone = last_info.phone;
+  return ((wb_info.TypeOfPhone(first_phone) == WordBoundaryInfo::kWordBeginAndEndPhone
+           && first_phone == last_phone) ||  // first and last phone are the same begin-and-end phone.
+          (wb_info.TypeOfPhone(first_phone) == WordBoundaryInfo::kWordBeginPhone &&
+           wb_info.TypeOfPhone(last_phone) == WordBoundaryInfo::kWordEndPhone) );  // begin phone ... end phone.
 }
 
 
 void LatticeWordAligner::ComputationState::OutputArcForce(
-    const WordBoundaryInfo &info, const TransitionModel &tmodel,
+    const WordBoundaryInfo &info, const Transitions &tmodel,
     CompactLatticeArc *arc_out,  bool *error) {
 
   KALDI_ASSERT(!IsEmpty());
@@ -600,10 +553,10 @@ void LatticeWordAligner::ComputationState::OutputArcForce(
     word_labels_.clear();
   } else if (!transition_ids_.empty() && word_labels_.empty()) {
     // Transition-ids but no word label-- either silence or partial word.
-    int32 first_phone = tmodel.TransitionIdToPhone(transition_ids_[0]);
+    int32 first_phone = tmodel.InfoForTransitionId(transition_ids_[0]).phone;
     if (info.TypeOfPhone(first_phone) == WordBoundaryInfo::kNonWordPhone) {
       // first phone is silence...
-      if (first_phone != tmodel.TransitionIdToPhone(transition_ids_.back())
+      if (first_phone != tmodel.InfoForTransitionId(transition_ids_.back()).phone
           && ! *error) {
         *error = true;
         // Phone changed-- this is a code error, because the regular OutputArc
@@ -612,16 +565,12 @@ void LatticeWordAligner::ComputationState::OutputArcForce(
         KALDI_ERR << "Broken silence arc at end of utterance (the phone "
             "changed); code error";
       }
-      if (!*error) { // Check that it ends at the end state of silence; error otherwise.
-        int32 i = transition_ids_.size() - 1;
-        if (info.reorder)
-          while (tmodel.IsSelfLoop(transition_ids_[i]) && i > 0)
-            i--;
-        if (!tmodel.IsFinal(transition_ids_[i])) {
-          *error = true;
-          KALDI_WARN << "Broken silence arc at end of utterance (does not "
-              "reach end of silence)";
-        }
+      if (!*error &&
+          !tmodel.InfoForTransitionId(transition_ids_.back()).is_final) {
+        // warn but output it anyway.
+        *error = true;
+        KALDI_WARN << "Broken silence arc at end of utterance (does not "
+            "reach end of silence)";
       }
       CompactLatticeWeight cw(weight_, transition_ids_);
       *arc_out = CompactLatticeArc(info.silence_label, info.silence_label,
@@ -673,20 +622,17 @@ WordBoundaryInfo::WordBoundaryInfo(const WordBoundaryInfoOpts &opts) {
   SetOptions(opts.winternal_phones, kWordInternalPhone);
   SetOptions(opts.silence_phones, (opts.silence_has_olabels ?
                                    kWordBeginAndEndPhone : kNonWordPhone));
-  reorder = opts.reorder;
   silence_label = opts.silence_label;
   partial_word_label = opts.partial_word_label;
 }
 
 WordBoundaryInfo::WordBoundaryInfo(const WordBoundaryInfoNewOpts &opts) {
-  reorder = opts.reorder;
   silence_label = opts.silence_label;
   partial_word_label = opts.partial_word_label;
 }
 
 WordBoundaryInfo::WordBoundaryInfo(const WordBoundaryInfoNewOpts &opts,
                                    std::string word_boundary_file) {
-  reorder = opts.reorder;
   silence_label = opts.silence_label;
   partial_word_label = opts.partial_word_label;
   bool binary_in;
@@ -721,7 +667,7 @@ void WordBoundaryInfo::Init(std::istream &stream) {
 }
 
 bool WordAlignLattice(const CompactLattice &lat,
-                      const TransitionModel &tmodel,
+                      const Transitions &tmodel,
                       const WordBoundaryInfo &info,
                       int32 max_states,
                       CompactLattice *lat_out) {
@@ -734,165 +680,94 @@ bool WordAlignLattice(const CompactLattice &lat,
 class WordAlignedLatticeTester {
  public:
   WordAlignedLatticeTester(const CompactLattice &lat,
-                           const TransitionModel &tmodel,
+                           const Transitions &tmodel,
                            const WordBoundaryInfo &info,
                            const CompactLattice &aligned_lat):
-      lat_(lat), tmodel_(tmodel), info_(info), aligned_lat_(aligned_lat) { }
+      lat_(lat), tmodel_(tmodel), wb_info_(info), aligned_lat_(aligned_lat) { }
 
   void Test() {
     // First test that each aligned arc is valid.
     typedef CompactLattice::StateId StateId ;
+    typedef CompactLattice::Arc Arc;
     for (StateId s = 0; s < aligned_lat_.NumStates(); s++) {
       for (fst::ArcIterator<CompactLattice> iter(aligned_lat_, s);
            !iter.Done();
            iter.Next()) {
-        TestArc(iter.Value());
-      }
-      if (aligned_lat_.Final(s) != CompactLatticeWeight::Zero()) {
-        TestFinal(aligned_lat_.Final(s));
+        const Arc &arc = iter.Value();
+        if (!TestArc(arc))
+          KALDI_ERR << "Invalid arc in aligned CompactLattice: "
+                    << arc.ilabel << " " << arc.olabel << " " << arc.nextstate
+                    << " " << arc.weight;
       }
+      if (aligned_lat_.Final(s) != CompactLatticeWeight::Zero() &&
+          !aligned_lat_.Final(s).String().empty())
+        KALDI_ERR << "Expect to have no strings on final-weights of word-aligned "
+            "lattices.";
     }
     TestEquivalent();
   }
  private:
-  void TestArc(const CompactLatticeArc &arc) {
-    if (! (TestArcSilence(arc) || TestArcNormalWord(arc) || TestArcOnePhoneWord(arc)
-           || TestArcEmpty(arc)))
-      KALDI_ERR << "Invalid arc in aligned CompactLattice: "
-                << arc.ilabel << " " << arc.olabel << " " << arc.nextstate
-                << " " << arc.weight;
-  }
-  bool TestArcEmpty(const CompactLatticeArc &arc) {
-    if (arc.ilabel != 0) return false; // Check there is no label.  Note, ilabel==olabel.
-    const std::vector<int32> &tids = arc.weight.String();
-    return tids.empty();
-  }
-  bool TestArcSilence(const CompactLatticeArc &arc) {
-    // This only applies when silence doesn't have word labels.
-    if (arc.ilabel !=  info_.silence_label) return false; // Check the label is
-    // the silence label. Note, ilabel==olabel.
-    const std::vector<int32> &tids = arc.weight.String();
-    if (tids.empty()) return false;
-    int32 first_phone = tmodel_.TransitionIdToPhone(tids.front());
-    if (info_.TypeOfPhone(first_phone) != WordBoundaryInfo::kNonWordPhone)
+  bool TestArc(const CompactLatticeArc &arc) {
+    std::vector<int32> phones;
+    if (!SplitArcToPhones(arc, &phones))
       return false;
-    for (size_t i = 0; i < tids.size(); i++)
-      if (tmodel_.TransitionIdToPhone(tids[i]) != first_phone) return false;
-
-    if (!info_.reorder) return tmodel_.IsFinal(tids.back());
-    else {
-      for (size_t i = 0; i < tids.size(); i++) {
-        if (tmodel_.IsFinal(tids[i])) { // got the "final" transition, which is
-          // reordered to actually not be final.  Make sure that all the
-          // rest of the transition ids are the self-loop of that same
-          // transition-state.
-          for (size_t j = i+1; j < tids.size(); j++) {
-            if (!(tmodel_.TransitionIdToTransitionState(tids[j])
-                  == tmodel_.TransitionIdToTransitionState(tids[i]))) return false;
-          }
-          return true;
-        }
-      }
-      return false; // fell off loop.  No final-state present.
-    }
-  }
-
-  bool TestArcOnePhoneWord(const CompactLatticeArc &arc) {
-    if (arc.ilabel == 0) return false; // Check there's a label.  Note, ilabel==olabel.
-    const std::vector<int32> &tids = arc.weight.String();
-    if (tids.empty()) return false;
-    int32 first_phone = tmodel_.TransitionIdToPhone(tids.front());
-    if (info_.TypeOfPhone(first_phone) !=
-        WordBoundaryInfo::kWordBeginAndEndPhone) return false;
-    for (size_t i = 0; i < tids.size(); i++)
-      if (tmodel_.TransitionIdToPhone(tids[i]) != first_phone) return false;
-
-    if (!info_.reorder) return tmodel_.IsFinal(tids.back());
-    else {
-      for (size_t i = 0; i < tids.size(); i++) {
-        if (tmodel_.IsFinal(tids[i])) { // got the "final" transition, which is
-          // reordered to actually not be final.  Make sure that all the
-          // rest of the transition ids are the self-loop of that same
-          // transition-state.
-          for (size_t j = i+1; j < tids.size(); j++) {
-            if (tmodel_.TransitionIdToTransitionState(tids[j])
-                != tmodel_.TransitionIdToTransitionState(tids[i])) return false;
-          }
-          return true;
-        }
-      }
-      return false; // fell off loop.  No final-state present.
+    if (arc.ilabel == 0 && phones.empty())
+      return true;  // epsilon/empty arc (allowed).
+    if (arc.ilabel == wb_info_.silence_label &&
+        phones.size() == 1 &&
+        wb_info_.TypeOfPhone(phones.front()) == WordBoundaryInfo::kNonWordPhone)
+      return true;  // could be a silence arc.
+    if (arc.ilabel != 0 && phones.size() == 1 &&
+        wb_info_.TypeOfPhone(phones.front()) == WordBoundaryInfo::kWordBeginAndEndPhone)
+      return true;  // could be single-phone word arc.
+
+
+    {  // Now test if it could be a normal (non-single-phone) word arc.
+      if (phones.size() < 2 || arc.ilabel == 0) return false;
+      if (wb_info_.TypeOfPhone(phones.front()) != WordBoundaryInfo::kWordBeginPhone)
+        return false;
+      for (size_t i = 1; i + 1 < phones.size(); i++)  // interior phones only.
+        if (wb_info_.TypeOfPhone(phones[i]) != WordBoundaryInfo::kWordInternalPhone)
+          return false;
+      if (wb_info_.TypeOfPhone(phones.back()) != WordBoundaryInfo::kWordEndPhone)
+        return false;
+      return true;  // A normal word arc
     }
   }
 
-  bool TestArcNormalWord(const CompactLatticeArc &arc) {
-    if (arc.ilabel == 0) return false; // Check there's a label.  Note, ilabel==olabel.
+  // This function, used in testing code, splits up the transition_ids on an arc into
+  // a sequence of phones.  It returns false if the arc does not contain "whole phones",
+  // i.e. if it doesn't start at the start of a phone and end at the end of a phone.
+  bool SplitArcToPhones(const CompactLatticeArc &arc,
+                        std::vector<int32> *phones) {
     const std::vector<int32> &tids = arc.weight.String();
-    if (tids.empty()) return false;
-    int32 first_phone = tmodel_.TransitionIdToPhone(tids.front());
-    if (info_.TypeOfPhone(first_phone) != WordBoundaryInfo::kWordBeginPhone)
+    phones->clear();
+    if (tids.empty())
+      return true;
+    const Transitions::TransitionIdInfo *cur_info = &tmodel_.InfoForTransitionId(
+        tids[0]);
+    if (!cur_info->is_initial)
       return false;
-    size_t i;
-    { // first phone.
-      int num_final = 0;
-      for (i = 0; i < tids.size(); i++) {
-        if (tmodel_.TransitionIdToPhone(tids[i]) != first_phone) break;
-        if (tmodel_.IsFinal(tids[i])) num_final++;
-      }
-      if (num_final != 1)
-        return false; // Something went wrong-- perhaps we
-      // got two beginning phones in a row.
-    }
-    { // middle phones.  Skip over them.
-      while (i < tids.size() &&
-             info_.TypeOfPhone(tmodel_.TransitionIdToPhone(tids[i]))
-             == WordBoundaryInfo::kWordInternalPhone)
-        i++;
-    }
-    if (i == tids.size()) return false;
-    int32 final_phone = tmodel_.TransitionIdToPhone(tids[i]);
-    if (info_.TypeOfPhone(final_phone) != WordBoundaryInfo::kWordEndPhone)
-      return false; // not word-ending.
-    for (size_t j = i; j < tids.size(); j++) // make sure only this final phone till end.
-      if (tmodel_.TransitionIdToPhone(tids[j]) != final_phone)
-        return false; // Other phones after final phone.
-
-    for (size_t j = i; j < tids.size(); j++) {
-      if (tmodel_.IsFinal(tids[j])) { // Found "final transition"..   Note:
-        // may be "reordered" with its self loops.
-        if (!info_.reorder) return (j+1 == tids.size());
-        else {
-          // Make sure the only thing that follows this is self-loops
-          // of the final transition-state.
-          for (size_t k = j + 1; k < tids.size(); k++)
-            if (tmodel_.TransitionIdToTransitionState(tids[k])
-                != tmodel_.TransitionIdToTransitionState(tids[j])
-                || !tmodel_.IsSelfLoop(tids[k]))
-              return false;
-          return true;
-        }
+    size_t len = tids.size(), i;
+    for (i = 0; i < len; i++) {
+      cur_info = &tmodel_.InfoForTransitionId(tids[i]);
+      if (cur_info->is_initial && !cur_info->is_self_loop) {
+        // there is exactly one such arc per phone.
+        phones->push_back(cur_info->phone);
       }
+      // Keep scanning; the end-of-phone (is_final) check is done after the loop.
     }
-    return false; // Found no final state.
+    if (!cur_info->is_final)
+      return false;
+    return true;
   }
 
-  bool TestArcPartialWord(const CompactLatticeArc &arc) {
-    if (arc.ilabel != info_.partial_word_label) return false; // label should
-    // be the partial-word label.
-    const std::vector<int32> &tids = arc.weight.String();
-    if (tids.empty()) return false;
-    return true; // We're pretty liberal when it comes to partial words here.
-  }
 
-  void TestFinal(const CompactLatticeWeight &w) {
-    if (!w.String().empty())
-      KALDI_ERR << "Expect to have no strings on final-weights of lattices.";
-  }
   void TestEquivalent() {
     CompactLattice aligned_lat(aligned_lat_);
-    if (info_.silence_label != 0) { // remove silence labels.
+    if (wb_info_.silence_label != 0) { // remove silence labels.
       std::vector<int32> to_remove;
-      to_remove.push_back(info_.silence_label);
+      to_remove.push_back(wb_info_.silence_label);
       RemoveSomeInputSymbols(to_remove, &aligned_lat);
       Project(&aligned_lat, fst::PROJECT_INPUT);
     }
@@ -904,8 +779,8 @@ class WordAlignedLatticeTester {
   }
 
   const CompactLattice &lat_;
-  const TransitionModel &tmodel_;
-  const WordBoundaryInfo &info_;
+  const Transitions &tmodel_;
+  const WordBoundaryInfo &wb_info_;
   const CompactLattice &aligned_lat_;
 };
 
@@ -916,7 +791,7 @@ class WordAlignedLatticeTester {
 /// succeeded and it wasn't a forced-out lattice); otherwise the test will most
 /// likely fail.
 void TestWordAlignedLattice(const CompactLattice &lat,
-                            const TransitionModel &tmodel,
+                            const Transitions &tmodel,
                             const WordBoundaryInfo &info,
                             const CompactLattice &aligned_lat) {
   WordAlignedLatticeTester t(lat, tmodel, info, aligned_lat);
diff --git a/src/lat/word-align-lattice.h b/src/lat/word-align-lattice.h
index 41be075d108..725341bc867 100644
--- a/src/lat/word-align-lattice.h
+++ b/src/lat/word-align-lattice.h
@@ -25,7 +25,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "fstext/fstext-lib.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "lat/kaldi-lattice.h"
 
 namespace kaldi {
@@ -34,7 +34,7 @@ namespace kaldi {
 struct WordBoundaryInfoOpts {
   // Note: use of this structure
   // is deprecated, see WordBoundaryInfoNewOpts.
-  
+
   // Note: this structure (and the code in word-align-lattice.{h,cc}
   // makes stronger assumptions than the rest of the Kaldi toolkit:
   // that is, it assumes you have word-position-dependent phones,
@@ -51,14 +51,13 @@ struct WordBoundaryInfoOpts {
   std::string silence_phones;
   int32 silence_label;
   int32 partial_word_label;
-  bool reorder;
   bool silence_may_be_word_internal;
   bool silence_has_olabels;
-  
+
   WordBoundaryInfoOpts(): silence_label(0), partial_word_label(0),
-                          reorder(true), silence_may_be_word_internal(false),
+                          silence_may_be_word_internal(false),
                           silence_has_olabels(false) { }
-  
+
   void Register(OptionsItf *opts) {
     opts->Register("wbegin-phones", &wbegin_phones, "Colon-separated list of "
                    "numeric ids of phones that begin a word");
@@ -80,12 +79,9 @@ struct WordBoundaryInfoOpts {
                    "word symbol that is to be used for arcs in the word-aligned "
                    "lattice corresponding to partial words at the end of "
                    "\"forced-out\" utterances (zero is OK)");
-    opts->Register("reorder", &reorder, "True if the lattices were generated "
-                   "from graphs that had the --reorder option true, relating to "
-                   "reordering self-loops (typically true)");
     opts->Register("silence-may-be-word-internal", &silence_may_be_word_internal,
                    "If true, silence may appear inside words' prons (but not at begin/end!)\n");
-    opts->Register("silence-has-olabels", &silence_has_olabels, 
+    opts->Register("silence-has-olabels", &silence_has_olabels,
                    "If true, silence phones have output labels in the lattice, just\n"
                    "like regular words.  [This means you can't have un-labeled silences]");
   }
@@ -96,11 +92,9 @@ struct WordBoundaryInfoOpts {
 struct WordBoundaryInfoNewOpts {
   int32 silence_label;
   int32 partial_word_label;
-  bool reorder;
-  
-  WordBoundaryInfoNewOpts(): silence_label(0), partial_word_label(0),
-                             reorder(true) { }
-  
+
+  WordBoundaryInfoNewOpts(): silence_label(0), partial_word_label(0) { }
+
   void Register(OptionsItf *opts) {
     opts->Register("silence-label", &silence_label, "Numeric id of word symbol "
                    "that is to be used for silence arcs in the word-aligned "
@@ -109,9 +103,6 @@ struct WordBoundaryInfoNewOpts {
                    "word symbol that is to be used for arcs in the word-aligned "
                    "lattice corresponding to partial words at the end of "
                    "\"forced-out\" utterances (zero is OK)");
-    opts->Register("reorder", &reorder, "True if the lattices were generated "
-                   "from graphs that had the --reorder option true, relating to "
-                   "reordering self-loops (typically true)");
   }
 };
 
@@ -150,7 +141,7 @@ struct WordBoundaryInfo {
           "word-boundary file (or options)";
     return phone_to_type[p];
   }
-  
+
   std::vector<PhoneType> phone_to_type;
 
   int32 silence_label; // The integer label we give to silence words.
@@ -189,7 +180,7 @@ struct WordBoundaryInfo {
 /// abort the computation, return false and produce an empty
 /// lattice out.
 bool WordAlignLattice(const CompactLattice &lat,
-                      const TransitionModel &tmodel,
+                      const Transitions &tmodel,
                       const WordBoundaryInfo &info,
                       int32 max_states,
                       CompactLattice *lat_out);
@@ -203,7 +194,7 @@ bool WordAlignLattice(const CompactLattice &lat,
 ///   partial-word arcs, with the partial-word label.
 ///   silence arcs, with the silence label.
 void TestWordAlignedLattice(const CompactLattice &lat,
-                            const TransitionModel &tmodel,
+                            const Transitions &tmodel,
                             const WordBoundaryInfo &info,
                             const CompactLattice &aligned_lat);
 
diff --git a/src/latbin/lattice-add-trans-probs.cc b/src/latbin/lattice-add-trans-probs.cc
index 0fa79338f8e..7f764756930 100644
--- a/src/latbin/lattice-add-trans-probs.cc
+++ b/src/latbin/lattice-add-trans-probs.cc
@@ -23,7 +23,7 @@
 #include "fstext/fstext-lib.h"
 #include "lat/kaldi-lattice.h"
 #include "lat/lattice-functions.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 
 int main(int argc, char *argv[]) {
@@ -68,7 +68,7 @@ int main(int argc, char *argv[]) {
 
     int32 n_done = 0;
 
-    TransitionModel trans_model;
+    Transitions trans_model;
 
     ReadKaldiObject(model_rxfilename, &trans_model);
 
diff --git a/src/latbin/lattice-align-phones.cc b/src/latbin/lattice-align-phones.cc
index 9367fb1f3a7..6781487e962 100644
--- a/src/latbin/lattice-align-phones.cc
+++ b/src/latbin/lattice-align-phones.cc
@@ -63,7 +63,7 @@ int main(int argc, char *argv[]) {
         lats_rspecifier = po.GetArg(2),
         lats_wspecifier = po.GetArg(3);
     
-    TransitionModel tmodel;
+    Transitions tmodel;
     ReadKaldiObject(model_rxfilename, &tmodel);
     
     SequentialCompactLatticeReader clat_reader(lats_rspecifier);
diff --git a/src/latbin/lattice-align-words-lexicon.cc b/src/latbin/lattice-align-words-lexicon.cc
index 72226731c7c..37dd670f9f8 100644
--- a/src/latbin/lattice-align-words-lexicon.cc
+++ b/src/latbin/lattice-align-words-lexicon.cc
@@ -80,7 +80,7 @@ int main(int argc, char *argv[]) {
       }
     }
 
-    TransitionModel tmodel;
+    Transitions tmodel;
     ReadKaldiObject(model_rxfilename, &tmodel);
     
     SequentialCompactLatticeReader clat_reader(lats_rspecifier);
diff --git a/src/latbin/lattice-align-words.cc b/src/latbin/lattice-align-words.cc
index 7f024258c42..0b8841fffe2 100644
--- a/src/latbin/lattice-align-words.cc
+++ b/src/latbin/lattice-align-words.cc
@@ -71,7 +71,7 @@ int main(int argc, char *argv[]) {
         lats_rspecifier = po.GetArg(3),
         lats_wspecifier = po.GetArg(4);
 
-    TransitionModel tmodel;
+    Transitions tmodel;
     ReadKaldiObject(model_rxfilename, &tmodel);
     
     SequentialCompactLatticeReader clat_reader(lats_rspecifier);
diff --git a/src/latbin/lattice-arc-post.cc b/src/latbin/lattice-arc-post.cc
index 38a5d6d304d..63d25383aa5 100644
--- a/src/latbin/lattice-arc-post.cc
+++ b/src/latbin/lattice-arc-post.cc
@@ -34,7 +34,7 @@ class ArcPosteriorComputer {
   ArcPosteriorComputer(const CompactLattice &clat,
                        BaseFloat min_post,
                        bool print_alignment,
-                       const TransitionModel *trans_model = NULL):
+                       const Transitions *trans_model = NULL):
       clat_(clat), min_post_(min_post), print_alignment_(print_alignment),
       trans_model_(trans_model) { }
 
@@ -103,7 +103,7 @@ class ArcPosteriorComputer {
 
   BaseFloat min_post_;
   bool print_alignment_;
-  const TransitionModel *trans_model_;
+  const Transitions *trans_model_;
 };
 
 }
@@ -159,7 +159,7 @@ int main(int argc, char *argv[]) {
     if (acoustic_scale == 0.0)
       KALDI_ERR << "Do not use a zero acoustic scale (cannot be inverted)";
 
-    kaldi::TransitionModel trans_model;
+    kaldi::Transitions trans_model;
 
     std::string lats_rspecifier, output_wxfilename;
     if (po.NumArgs() == 3) {
diff --git a/src/latbin/lattice-boost-ali.cc b/src/latbin/lattice-boost-ali.cc
index 98913fdb034..18b6887062d 100644
--- a/src/latbin/lattice-boost-ali.cc
+++ b/src/latbin/lattice-boost-ali.cc
@@ -79,7 +79,7 @@ int main(int argc, char *argv[]) {
     kaldi::RandomAccessInt32VectorReader alignment_reader(ali_rspecifier);
     kaldi::CompactLatticeWriter compact_lattice_writer(lats_wspecifier);
 
-    kaldi::TransitionModel trans;
+    kaldi::Transitions trans;
     {
       bool binary_in;
       kaldi::Input ki(model_rxfilename, &binary_in);
diff --git a/src/latbin/lattice-determinize-phone-pruned-parallel.cc b/src/latbin/lattice-determinize-phone-pruned-parallel.cc
index 6d273d433c6..0221fd8eb47 100644
--- a/src/latbin/lattice-determinize-phone-pruned-parallel.cc
+++ b/src/latbin/lattice-determinize-phone-pruned-parallel.cc
@@ -17,7 +17,7 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 #include "base/kaldi-common.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "lat/kaldi-lattice.h"
 #include "lat/determinize-lattice-pruned.h"
 #include "lat/lattice-functions.h"
@@ -31,7 +31,7 @@ class DeterminizeLatticeTask {
  public:
   // Initializer takes ownership of "lat".
   DeterminizeLatticeTask(
-      const TransitionModel &trans_model,
+      const Transitions &trans_model,
       fst::DeterminizeLatticePhonePrunedOptions &opts,
       std::string key,
       BaseFloat acoustic_scale,
@@ -69,7 +69,7 @@ class DeterminizeLatticeTask {
     clat_writer_->Write(key_, det_clat_);
   }
  private:
-  const TransitionModel *trans_model_;
+  const Transitions *trans_model_;
   const fst::DeterminizeLatticePhonePrunedOptions &opts_;
   std::string key_;
   BaseFloat acoustic_scale_;
@@ -130,7 +130,7 @@ int main(int argc, char *argv[]) {
         lats_rspecifier = po.GetArg(2),
         lats_wspecifier = po.GetArg(3);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_rxfilename, &trans_model);
 
     // Reads as regular lattice-- this is the form the determinization code
diff --git a/src/latbin/lattice-determinize-phone-pruned.cc b/src/latbin/lattice-determinize-phone-pruned.cc
index 94a8530273b..94a484dabbb 100644
--- a/src/latbin/lattice-determinize-phone-pruned.cc
+++ b/src/latbin/lattice-determinize-phone-pruned.cc
@@ -18,7 +18,7 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 #include "base/kaldi-common.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "lat/kaldi-lattice.h"
 #include "lat/determinize-lattice-pruned.h"
 #include "lat/lattice-functions.h"
@@ -71,7 +71,7 @@ int main(int argc, char *argv[]) {
         lats_rspecifier = po.GetArg(2),
         lats_wspecifier = po.GetArg(3);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_rxfilename, &trans_model);
 
     // Reads as regular lattice-- this is the form the determinization code
diff --git a/src/latbin/lattice-rescore-mapped.cc b/src/latbin/lattice-rescore-mapped.cc
index 9dcc63219ee..d0ce5c64526 100644
--- a/src/latbin/lattice-rescore-mapped.cc
+++ b/src/latbin/lattice-rescore-mapped.cc
@@ -21,14 +21,14 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "util/stl-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "lat/kaldi-lattice.h"
 #include "lat/lattice-functions.h"
 
 namespace kaldi {
 
-void LatticeAcousticRescore(const TransitionModel &trans_model,
+void LatticeAcousticRescore(const Transitions &trans_model,
                             const Matrix<BaseFloat> &log_likes,
                             const std::vector<int32> &state_times,
                             Lattice *lat) {
@@ -109,7 +109,7 @@ int main(int argc, char *argv[]) {
         loglike_rspecifier = po.GetArg(3),
         lats_wspecifier = po.GetArg(4);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary;
       Input ki(model_filename, &binary);
diff --git a/src/latbin/lattice-to-mpe-post.cc b/src/latbin/lattice-to-mpe-post.cc
index e7f0f334a45..ddc6382d1a9 100644
--- a/src/latbin/lattice-to-mpe-post.cc
+++ b/src/latbin/lattice-to-mpe-post.cc
@@ -24,7 +24,7 @@
 #include "lat/kaldi-lattice.h"
 #include "lat/lattice-functions.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 int main(int argc, char *argv[]) {
   try {
@@ -86,7 +86,7 @@ int main(int argc, char *argv[]) {
     }
     RandomAccessInt32VectorReader alignments_reader(alignments_rspecifier);
     
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary;
       Input ki(model_filename, &binary);
diff --git a/src/latbin/lattice-to-phone-lattice.cc b/src/latbin/lattice-to-phone-lattice.cc
index 10da2b47bf1..2e62a498d2d 100644
--- a/src/latbin/lattice-to-phone-lattice.cc
+++ b/src/latbin/lattice-to-phone-lattice.cc
@@ -23,7 +23,7 @@
 #include "fstext/fstext-lib.h"
 #include "lat/kaldi-lattice.h"
 #include "lat/lattice-functions.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 int main(int argc, char *argv[]) {
   try {
@@ -66,7 +66,7 @@ int main(int argc, char *argv[]) {
     
     int32 n_done = 0;
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     
     ReadKaldiObject(model_rxfilename, &trans_model);
     
diff --git a/src/latbin/lattice-to-smbr-post.cc b/src/latbin/lattice-to-smbr-post.cc
index 5e78ea9996c..5e4634d9185 100644
--- a/src/latbin/lattice-to-smbr-post.cc
+++ b/src/latbin/lattice-to-smbr-post.cc
@@ -24,7 +24,7 @@
 #include "lat/kaldi-lattice.h"
 #include "lat/lattice-functions.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 int main(int argc, char *argv[]) {
   try {
@@ -87,7 +87,7 @@ int main(int argc, char *argv[]) {
     }
     RandomAccessInt32VectorReader alignments_reader(alignments_rspecifier);
     
-    TransitionModel trans_model;
+    Transitions trans_model;
     {
       bool binary;
       Input ki(model_filename, &binary);
diff --git a/src/latbin/nbest-to-prons.cc b/src/latbin/nbest-to-prons.cc
index aa6326e031c..4999e81b30c 100644
--- a/src/latbin/nbest-to-prons.cc
+++ b/src/latbin/nbest-to-prons.cc
@@ -63,7 +63,7 @@ int main(int argc, char *argv[]) {
         wxfilename = po.GetArg(3);
 
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     ReadKaldiObject(model_rxfilename, &trans_model);
 
     SequentialCompactLatticeReader clat_reader(lats_rspecifier);
diff --git a/src/nnet/Makefile b/src/nnet/Makefile
deleted file mode 100644
index 7f324479a0f..00000000000
--- a/src/nnet/Makefile
+++ /dev/null
@@ -1,22 +0,0 @@
-
-
-all:
-
-include ../kaldi.mk
-
-LDFLAGS += $(CUDA_LDFLAGS)
-LDLIBS += $(CUDA_LDLIBS)
-
-TESTFILES = nnet-randomizer-test nnet-component-test
-
-OBJFILES = nnet-nnet.o nnet-component.o nnet-loss.o \
-           nnet-pdf-prior.o nnet-randomizer.o
-
-LIBNAME = kaldi-nnet
-
-ADDLIBS = ../cudamatrix/kaldi-cudamatrix.a ../hmm/kaldi-hmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
-          ../base/kaldi-base.a 
-
-include ../makefiles/default_rules.mk
-
diff --git a/src/nnet/nnet-activation.h b/src/nnet/nnet-activation.h
deleted file mode 100644
index 74b0ebad650..00000000000
--- a/src/nnet/nnet-activation.h
+++ /dev/null
@@ -1,373 +0,0 @@
-// nnet/nnet-activation.h
-
-// Copyright 2011-2016  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_ACTIVATION_H_
-#define KALDI_NNET_NNET_ACTIVATION_H_
-
-#include <string>
-#include <vector>
-#include <cmath>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-utils.h"
-#include "cudamatrix/cu-math.h"
-#include "cudamatrix/cu-rand.h"
-#include "util/text-utils.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-class Softmax : public Component {
- public:
-  Softmax(int32 dim_in, int32 dim_out):
-    Component(dim_in, dim_out)
-  { }
-
-  ~Softmax()
-  { }
-
-  Component* Copy() const { return new Softmax(*this); }
-  ComponentType GetType() const { return kSoftmax; }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // y = e^x_j/sum_j(e^x_j)
-    out->ApplySoftMaxPerRow(in);
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // simply copy the error derivative
-    // (ie. assume crossentropy error function,
-    // while in_diff contains (net_output-target) :
-    // this is already derivative of the error with
-    // respect to activations of last layer neurons)
-    in_diff->CopyFromMat(out_diff);
-  }
-};
-
-
-class HiddenSoftmax : public Component {
- public:
-  HiddenSoftmax(int32 dim_in, int32 dim_out) :
-    Component(dim_in, dim_out)
-  { }
-
-  ~HiddenSoftmax()
-  { }
-
-  Component* Copy() const { return new HiddenSoftmax(*this); }
-  ComponentType GetType() const { return kHiddenSoftmax; }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // y = e^x_j/sum_j(e^x_j)
-    out->ApplySoftMaxPerRow(in);
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // This Softmax should be used for a hidden layer, it calculates
-    // the true Jacobian of Softmax: J = diag(out) - out*out^T
-
-    // The backpropagation formual is:
-    // in_diff = out_diff \odot out - out(out_diff^T * out)
-    // (where \odot is Hadamard product)
-
-    // 1st term, out_diff \odot out,
-    in_diff->CopyFromMat(out_diff);
-    in_diff->MulElements(out);
-
-    // 2nd term, -out(out_diff^T * out),
-    diag_out_diff_out_.Resize(out.NumRows());
-    diag_out_diff_out_.AddDiagMatMat(1.0, out_diff, kNoTrans, out, kTrans, 0.0);
-    in_diff->AddDiagVecMat(-1.0, diag_out_diff_out_, out, kNoTrans, 1.0);
-  }
-
- private:
-  /// buffer for dot-products in BackpropagateFnc,
-  CuVector<BaseFloat> diag_out_diff_out_;
-};
-
-class BlockSoftmax : public Component {
- public:
-  BlockSoftmax(int32 dim_in, int32 dim_out):
-    Component(dim_in, dim_out)
-  { }
-
-  ~BlockSoftmax()
-  { }
-
-  Component* Copy() const { return new BlockSoftmax(*this); }
-  ComponentType GetType() const { return kBlockSoftmax; }
-
-  void InitData(std::istream &is) {
-    // parse config
-    std::string token,
-      dims_str;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<BlockDims>") is >> dims_str;
-      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-                     << " (BlockDims)";
-    }
-    // parse dims,
-    if (!kaldi::SplitStringToIntegers(dims_str, ",:", false, &block_dims))
-      KALDI_ERR << "Invalid block-dims " << dims_str;
-    // sanity check
-    int32 sum = 0;
-    for (int32 i = 0; i < block_dims.size(); i++) {
-      sum += block_dims[i];
-    }
-    KALDI_ASSERT(sum == OutputDim());
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    ReadIntegerVector(is, binary, &block_dims);
-    block_offset.resize(block_dims.size()+1, 0);
-    for (int32 i = 0; i < block_dims.size(); i++) {
-      block_offset[i+1] = block_offset[i] + block_dims[i];
-    }
-    // check
-    KALDI_ASSERT(OutputDim() == block_offset[block_offset.size()-1]);
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    WriteIntegerVector(os, binary, block_dims);
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // perform softmax per block:
-    for (int32 bl = 0; bl < block_dims.size(); bl++) {
-      // get the blocks,
-      CuSubMatrix<BaseFloat> in_bl =
-        in.ColRange(block_offset[bl], block_dims[bl]);
-      CuSubMatrix<BaseFloat> out_bl =
-        out->ColRange(block_offset[bl], block_dims[bl]);
-      // y = e^x_j/sum_j(e^x_j),
-      out_bl.ApplySoftMaxPerRow(in_bl);
-    }
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // copy the error derivative:
-    // (assuming we already got softmax-cross-entropy derivative in out_diff)
-    in_diff->CopyFromMat(out_diff);
-
-    // Set the derivatives to zero for the matrix-lines in which
-    // the sum of 'derivatives' was 1.0 (i.e. there was no target):
-    for (int32 bl = 0; bl < block_dims.size(); bl++) {
-      // get the block,
-      CuSubMatrix<BaseFloat> diff_bl =
-        in_diff->ColRange(block_offset[bl], block_dims[bl]);
-      // get the sum of each row,
-      CuVector<BaseFloat> row_sum(diff_bl.NumRows());
-      row_sum.AddColSumMat(1.0, diff_bl, 0.0);  // 0: keep as-is, 1: zero-out
-      // we'll scale rows by 0/1 masks,
-      CuVector<BaseFloat> row_diff_mask(row_sum);
-      row_diff_mask.Scale(-1.0);  // 0: keep as-is, -1: zero-out
-      row_diff_mask.Add(1.0);  // 1: keep as-is, 0: zero-out
-      // here we should have only 0's and 1's,
-      diff_bl.MulRowsVec(row_diff_mask);
-    }
-  }
-
-  std::string Info() const {
-    return "\n  softmax-dims " + ToString(block_dims);
-  }
-
-  std::vector<int32> block_dims;
-  std::vector<int32> block_offset;
-};
-
-
-
-
-class Sigmoid : public Component {
- public:
-  Sigmoid(int32 dim_in, int32 dim_out):
-    Component(dim_in, dim_out)
-  { }
-
-  ~Sigmoid()
-  { }
-
-  Component* Copy() const { return new Sigmoid(*this); }
-  ComponentType GetType() const { return kSigmoid; }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // y = 1/(1+e^-x)
-    out->Sigmoid(in);
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // ey = y(1-y)ex,
-    in_diff->DiffSigmoid(out, out_diff);
-  }
-};
-
-
-
-class Tanh : public Component {
- public:
-  Tanh(int32 dim_in, int32 dim_out):
-    Component(dim_in, dim_out)
-  { }
-
-  ~Tanh()
-  { }
-
-  Component* Copy() const { return new Tanh(*this); }
-  ComponentType GetType() const { return kTanh; }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // y = (e^x - e^(-x)) / (e^x + e^(-x)),
-    out->Tanh(in);
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // ey = (1 - y^2)ex
-    in_diff->DiffTanh(out, out_diff);
-  }
-};
-
-
-
-class Dropout : public Component {
- public:
-  Dropout(int32 dim_in, int32 dim_out):
-      Component(dim_in, dim_out),
-      dropout_rate_(0.5)
-  { }
-
-  ~Dropout()
-  { }
-
-  Component* Copy() const { return new Dropout(*this); }
-  ComponentType GetType() const { return kDropout; }
-
-  void InitData(std::istream &is) {
-    is >> std::ws;  // eat-up whitespace
-    // parse config
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<DropoutRate>") ReadBasicType(is, false, &dropout_rate_);
-      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-                     << " (DropoutRate)";
-    }
-    KALDI_ASSERT(dropout_rate_ >= 0.0 && dropout_rate_ < 1.0);
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    // Read all the '<Tokens>' in arbitrary order,
-    bool finished = false;
-    while ('<' == Peek(is, binary) && !finished) {
-      std::string token;
-      int first_char = PeekToken(is, binary);
-      switch (first_char) {
-        case 'D': ReadToken(is, false, &token);
-          /**/ if (token == "<DropoutRate>") ReadBasicType(is, binary, &dropout_rate_);
-          else if (token == "<DropoutRetention>") { /* compatibility */
-            BaseFloat dropout_retention;
-            ReadBasicType(is, binary, &dropout_retention);
-            dropout_rate_ = 1.0 - dropout_retention;
-          } else KALDI_ERR << "Unknown token: " << token;
-          break;
-        case '!': ExpectToken(is, binary, "<!EndOfComponent>");
-          finished = true;
-          break;
-        default: ReadToken(is, false, &token);
-          KALDI_ERR << "Unknown token: " << token;
-      }
-    }
-    KALDI_ASSERT(dropout_rate_ >= 0.0 && dropout_rate_ < 1.0);
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    WriteToken(os, binary, "<DropoutRate>");
-    WriteBasicType(os, binary, dropout_rate_);
-  }
-
-  std::string Info() const {
-    return std::string("<DropoutRate> ") + ToString(dropout_rate_);
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    out->CopyFromMat(in);
-    // set N inputs to zero, according to the 'dropout_rate_' ...
-    dropout_mask_.Resize(out->NumRows(), out->NumCols());
-    rand_.RandUniform(&dropout_mask_);  // [0..1]
-    dropout_mask_.Add(-dropout_rate_);  // [(-rate)..(1-rate)]
-    dropout_mask_.Heaviside(dropout_mask_); // (x > 0.0 ? 1 : 0)
-    out->MulElements(dropout_mask_);
-    // rescale to keep the same dynamic range as w/o dropout,
-    out->Scale(1.0 / (1.0 - dropout_rate_));
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    in_diff->CopyFromMat(out_diff);
-    // use same mask on the error derivatives...
-    in_diff->MulElements(dropout_mask_);
-    // enlarge the output to fit same dynamic range as w/o dropout
-    in_diff->Scale(1.0 / (1.0 - dropout_rate_));
-  }
-
-  BaseFloat GetDropoutRate() { return dropout_rate_; }
-
-  void SetDropoutRate(BaseFloat dr) {
-    dropout_rate_ = dr;
-    KALDI_ASSERT(dropout_rate_ >= 0.0 && dropout_rate_ < 1.0);
-  }
-
- private:
-  BaseFloat dropout_rate_;  ///< probability that a neuron is dropped,
-
-  CuRand<BaseFloat> rand_;  ///< generator of random numbers,
-
-  CuMatrix<BaseFloat> dropout_mask_;  // random binary mask,
-                                      // 1 = keep neuron, 0 = drop neuron,
-};
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_ACTIVATION_H_
-
diff --git a/src/nnet/nnet-affine-transform.h b/src/nnet/nnet-affine-transform.h
deleted file mode 100644
index 0dc84fae6d8..00000000000
--- a/src/nnet/nnet-affine-transform.h
+++ /dev/null
@@ -1,247 +0,0 @@
-// nnet/nnet-affine-transform.h
-
-// Copyright 2011-2014  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_AFFINE_TRANSFORM_H_
-#define KALDI_NNET_NNET_AFFINE_TRANSFORM_H_
-
-#include <string>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-utils.h"
-#include "cudamatrix/cu-math.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-class AffineTransform : public UpdatableComponent {
- public:
-  AffineTransform(int32 dim_in, int32 dim_out):
-    UpdatableComponent(dim_in, dim_out),
-    linearity_(dim_out, dim_in), bias_(dim_out),
-    linearity_corr_(dim_out, dim_in), bias_corr_(dim_out),
-    max_norm_(0.0)
-  { }
-  ~AffineTransform()
-  { }
-
-  Component* Copy() const { return new AffineTransform(*this); }
-  ComponentType GetType() const { return kAffineTransform; }
-
-  void InitData(std::istream &is) {
-    // define options
-    float bias_mean = -2.0, bias_range = 2.0, param_stddev = 0.1;
-    // parse config
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<ParamStddev>") ReadBasicType(is, false, &param_stddev);
-      else if (token == "<BiasMean>")    ReadBasicType(is, false, &bias_mean);
-      else if (token == "<BiasRange>")   ReadBasicType(is, false, &bias_range);
-      else if (token == "<LearnRateCoef>") ReadBasicType(is, false, &learn_rate_coef_);
-      else if (token == "<BiasLearnRateCoef>") ReadBasicType(is, false, &bias_learn_rate_coef_);
-      else if (token == "<MaxNorm>") ReadBasicType(is, false, &max_norm_);
-      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-                     << " (ParamStddev|BiasMean|BiasRange|LearnRateCoef|BiasLearnRateCoef)";
-    }
-
-    //
-    // Initialize trainable parameters,
-    //
-    // Gaussian with given std_dev (mean = 0),
-    linearity_.Resize(OutputDim(), InputDim());
-    RandGauss(0.0, param_stddev, &linearity_);
-    // Uniform,
-    bias_.Resize(OutputDim());
-    RandUniform(bias_mean, bias_range, &bias_);
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    // Read all the '<Tokens>' in arbitrary order,
-    while ('<' == Peek(is, binary)) {
-      int first_char = PeekToken(is, binary);
-      switch (first_char) {
-        case 'L': ExpectToken(is, binary, "<LearnRateCoef>");
-          ReadBasicType(is, binary, &learn_rate_coef_);
-          break;
-        case 'B': ExpectToken(is, binary, "<BiasLearnRateCoef>");
-          ReadBasicType(is, binary, &bias_learn_rate_coef_);
-          break;
-        case 'M': ExpectToken(is, binary, "<MaxNorm>");
-          ReadBasicType(is, binary, &max_norm_);
-          break;
-        default:
-          std::string token;
-          ReadToken(is, false, &token);
-          KALDI_ERR << "Unknown token: " << token;
-      }
-    }
-    // Read the data (data follow the tokens),
-
-    // weight matrix,
-    linearity_.Read(is, binary);
-    // bias vector,
-    bias_.Read(is, binary);
-
-    KALDI_ASSERT(linearity_.NumRows() == output_dim_);
-    KALDI_ASSERT(linearity_.NumCols() == input_dim_);
-    KALDI_ASSERT(bias_.Dim() == output_dim_);
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    WriteToken(os, binary, "<LearnRateCoef>");
-    WriteBasicType(os, binary, learn_rate_coef_);
-    WriteToken(os, binary, "<BiasLearnRateCoef>");
-    WriteBasicType(os, binary, bias_learn_rate_coef_);
-    WriteToken(os, binary, "<MaxNorm>");
-    WriteBasicType(os, binary, max_norm_);
-    if (!binary) os << "\n";
-    // weights
-    linearity_.Write(os, binary);
-    bias_.Write(os, binary);
-  }
-
-  int32 NumParams() const {
-    return linearity_.NumRows()*linearity_.NumCols() + bias_.Dim();
-  }
-
-  void GetGradient(VectorBase<BaseFloat>* gradient) const {
-    KALDI_ASSERT(gradient->Dim() == NumParams());
-    int32 linearity_num_elem = linearity_.NumRows() * linearity_.NumCols();
-    gradient->Range(0, linearity_num_elem).CopyRowsFromMat(linearity_corr_);
-    gradient->Range(linearity_num_elem, bias_.Dim()).CopyFromVec(bias_corr_);
-  }
-
-  void GetParams(VectorBase<BaseFloat>* params) const {
-    KALDI_ASSERT(params->Dim() == NumParams());
-    int32 linearity_num_elem = linearity_.NumRows() * linearity_.NumCols();
-    params->Range(0, linearity_num_elem).CopyRowsFromMat(linearity_);
-    params->Range(linearity_num_elem, bias_.Dim()).CopyFromVec(bias_);
-  }
-
-  void SetParams(const VectorBase<BaseFloat>& params) {
-    KALDI_ASSERT(params.Dim() == NumParams());
-    int32 linearity_num_elem = linearity_.NumRows() * linearity_.NumCols();
-    linearity_.CopyRowsFromVec(params.Range(0, linearity_num_elem));
-    bias_.CopyFromVec(params.Range(linearity_num_elem, bias_.Dim()));
-  }
-
-  std::string Info() const {
-    return std::string("\n  linearity") +
-      MomentStatistics(linearity_) +
-      ", lr-coef " + ToString(learn_rate_coef_) +
-      ", max-norm " + ToString(max_norm_) +
-      "\n  bias" + MomentStatistics(bias_) +
-      ", lr-coef " + ToString(bias_learn_rate_coef_);
-  }
-  std::string InfoGradient() const {
-    return std::string("\n  linearity_grad") +
-      MomentStatistics(linearity_corr_) +
-      ", lr-coef " + ToString(learn_rate_coef_) +
-      ", max-norm " + ToString(max_norm_) +
-      "\n  bias_grad" + MomentStatistics(bias_corr_) +
-      ", lr-coef " + ToString(bias_learn_rate_coef_);
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // precopy bias
-    out->AddVecToRows(1.0, bias_, 0.0);
-    // multiply by weights^t
-    out->AddMatMat(1.0, in, kNoTrans, linearity_, kTrans, 1.0);
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // multiply error derivative by weights
-    in_diff->AddMatMat(1.0, out_diff, kNoTrans, linearity_, kNoTrans, 0.0);
-  }
-
-
-  void Update(const CuMatrixBase<BaseFloat> &input,
-              const CuMatrixBase<BaseFloat> &diff) {
-    // we use following hyperparameters from the option class
-    const BaseFloat lr = opts_.learn_rate * learn_rate_coef_;
-    const BaseFloat lr_bias = opts_.learn_rate * bias_learn_rate_coef_;
-    const BaseFloat mmt = opts_.momentum;
-    const BaseFloat l2 = opts_.l2_penalty;
-    const BaseFloat l1 = opts_.l1_penalty;
-    // we will also need the number of frames in the mini-batch
-    const int32 num_frames = input.NumRows();
-    // compute gradient (incl. momentum)
-    linearity_corr_.AddMatMat(1.0, diff, kTrans, input, kNoTrans, mmt);
-    bias_corr_.AddRowSumMat(1.0, diff, mmt);
-    // l2 regularization
-    if (l2 != 0.0) {
-      linearity_.AddMat(-lr*l2*num_frames, linearity_);
-    }
-    // l1 regularization
-    if (l1 != 0.0) {
-      cu::RegularizeL1(&linearity_, &linearity_corr_, lr*l1*num_frames, lr);
-    }
-    // update
-    linearity_.AddMat(-lr, linearity_corr_);
-    bias_.AddVec(-lr_bias, bias_corr_);
-    // max-norm
-    if (max_norm_ > 0.0) {
-      CuMatrix<BaseFloat> lin_sqr(linearity_);
-      lin_sqr.MulElements(linearity_);
-      CuVector<BaseFloat> l2(OutputDim());
-      l2.AddColSumMat(1.0, lin_sqr, 0.0);
-      l2.ApplyPow(0.5);  // we have per-neuron L2 norms,
-      CuVector<BaseFloat> scl(l2);
-      scl.Scale(1.0/max_norm_);
-      scl.ApplyFloor(1.0);
-      scl.InvertElements();
-      linearity_.MulRowsVec(scl);  // shink to sphere!
-    }
-  }
-
-  /// Accessors to the component parameters,
-  const CuVectorBase<BaseFloat>& GetBias() const { return bias_; }
-
-  void SetBias(const CuVectorBase<BaseFloat>& bias) {
-    KALDI_ASSERT(bias.Dim() == bias_.Dim());
-    bias_.CopyFromVec(bias);
-  }
-
-  const CuMatrixBase<BaseFloat>& GetLinearity() const { return linearity_; }
-
-  void SetLinearity(const CuMatrixBase<BaseFloat>& linearity) {
-    KALDI_ASSERT(linearity.NumRows() == linearity_.NumRows());
-    KALDI_ASSERT(linearity.NumCols() == linearity_.NumCols());
-    linearity_.CopyFromMat(linearity);
-  }
-
- private:
-  CuMatrix<BaseFloat> linearity_;
-  CuVector<BaseFloat> bias_;
-
-  CuMatrix<BaseFloat> linearity_corr_;
-  CuVector<BaseFloat> bias_corr_;
-
-  BaseFloat max_norm_;
-};
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_AFFINE_TRANSFORM_H_
diff --git a/src/nnet/nnet-average-pooling-2d-component.h b/src/nnet/nnet-average-pooling-2d-component.h
deleted file mode 100644
index 17ae87f94db..00000000000
--- a/src/nnet/nnet-average-pooling-2d-component.h
+++ /dev/null
@@ -1,209 +0,0 @@
-// nnet/nnet-average-pooling-2d-component.h
-
-// Copyright 2014  Brno University of Technology (author: Karel Vesely)
-//                 Johns Hopkins University (author: Sri Harish Mallidi)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_AVERAGE_POOLING_2D_COMPONENT_H_
-#define KALDI_NNET_NNET_AVERAGE_POOLING_2D_COMPONENT_H_
-
-#include <string>
-#include <vector>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-utils.h"
-#include "cudamatrix/cu-math.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-/**
- * AveragePoolingComponent :
- * The input/output matrices are split to submatrices with width 'pool_stride_'.
- * The pooling is done over 3rd axis, of the set of 2d matrices.
- * Our pooling supports overlaps, overlaps occur when (pool_step_ < pool_size_).
- */
-class AveragePooling2DComponent : public Component {
- public:
-  AveragePooling2DComponent(int32 dim_in, int32 dim_out):
-    Component(dim_in, dim_out),
-    fmap_x_len_(0), fmap_y_len_(0),
-    pool_x_len_(0), pool_y_len_(0),
-    pool_x_step_(0), pool_y_step_(0)
-  { }
-  ~AveragePooling2DComponent()
-  { }
-
-  Component* Copy() const { return new AveragePooling2DComponent(*this); }
-  ComponentType GetType() const { return kAveragePooling2DComponent; }
-
-  void InitData(std::istream &is) {
-    // parse config
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<FmapXLen>") ReadBasicType(is, false, &fmap_x_len_);
-      else if (token == "<FmapYLen>") ReadBasicType(is, false, &fmap_y_len_);
-      else if (token == "<PoolXLen>") ReadBasicType(is, false, &pool_x_len_);
-      else if (token == "<PoolYLen>") ReadBasicType(is, false, &pool_y_len_);
-      else if (token == "<PoolXStep>") ReadBasicType(is, false, &pool_x_step_);
-      else if (token == "<PoolYStep>") ReadBasicType(is, false, &pool_y_step_);
-      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-             << " (FmapXLen|FmapYLen|PoolXLen|PoolYLen|PoolXStep|PoolYStep)";
-    }
-    // check
-    KALDI_ASSERT(fmap_x_len_ * fmap_y_len_ != 0);
-    KALDI_ASSERT(pool_x_len_ * pool_y_len_ != 0);
-    KALDI_ASSERT(pool_x_step_ * pool_y_step_  != 0);
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    // pooling hyperparameters
-    ExpectToken(is, binary, "<FmapXLen>");
-    ReadBasicType(is, binary, &fmap_x_len_);
-    ExpectToken(is, binary, "<FmapYLen>");
-    ReadBasicType(is, binary, &fmap_y_len_);
-    ExpectToken(is, binary, "<PoolXLen>");
-    ReadBasicType(is, binary, &pool_x_len_);
-    ExpectToken(is, binary, "<PoolYLen>");
-    ReadBasicType(is, binary, &pool_y_len_);
-    ExpectToken(is, binary, "<PoolXStep>");
-    ReadBasicType(is, binary, &pool_x_step_);
-    ExpectToken(is, binary, "<PoolYStep>");
-    ReadBasicType(is, binary, &pool_y_step_);
-
-    //
-    // Sanity checks:
-    //
-    // input sanity checks
-    // input_dim_ should be multiple of (fmap_x_len_ * fmap_y_len_)
-    KALDI_ASSERT(input_dim_ % (fmap_x_len_ * fmap_y_len_) == 0);
-    int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_);
-    KALDI_LOG << "num_fmaps " << num_input_fmaps;
-    // check if step is in sync with fmap_len and filt_len
-    KALDI_ASSERT((fmap_x_len_ - pool_x_len_) % (pool_x_step_) == 0);
-    KALDI_ASSERT((fmap_y_len_ - pool_y_len_) % (pool_y_step_) == 0);
-    int32 out_fmap_x_len = (fmap_x_len_ - pool_x_len_)/pool_x_step_ + 1;
-    int32 out_fmap_y_len = (fmap_y_len_ - pool_y_len_)/pool_y_step_ + 1;
-    //    int32 out_fmap_size = out_fmap_x_len*out_fmap_y_len;
-    // output sanity checks
-    KALDI_ASSERT(output_dim_ % (out_fmap_x_len * out_fmap_y_len)  == 0);
-    int32 num_output_fmaps = output_dim_ / (out_fmap_x_len * out_fmap_y_len);
-    KALDI_ASSERT(num_input_fmaps == num_output_fmaps);
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    // pooling hyperparameters
-    WriteToken(os, binary, "<FmapXLen>");
-    WriteBasicType(os, binary, fmap_x_len_);
-    WriteToken(os, binary, "<FmapYLen>");
-    WriteBasicType(os, binary, fmap_y_len_);
-    WriteToken(os, binary, "<PoolXLen>");
-    WriteBasicType(os, binary, pool_x_len_);
-    WriteToken(os, binary, "<PoolYLen>");
-    WriteBasicType(os, binary, pool_y_len_);
-    WriteToken(os, binary, "<PoolXStep>");
-    WriteBasicType(os, binary, pool_x_step_);
-    WriteToken(os, binary, "<PoolYStep>");
-    WriteBasicType(os, binary, pool_y_step_);
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // useful dims
-    int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_);
-    int out_fmap_cnt = 0;
-    for (int32 m = 0; m < fmap_x_len_-pool_x_len_+1; m = m+pool_x_step_) {
-      for (int32 n = 0; n < fmap_y_len_-pool_y_len_+1; n = n+pool_y_step_) {
-        int32 st = 0;
-        st = (m * fmap_y_len_ + n) * num_input_fmaps;
-        CuSubMatrix<BaseFloat> pool(out->ColRange(out_fmap_cnt * num_input_fmaps, num_input_fmaps));
-        pool.SetZero();  // reset
-        for (int32 i = 0; i < pool_x_len_; i++) {
-          for (int32 j = 0; j < pool_y_len_; j++) {
-            int32 c = 0;
-            c = st + i * (num_input_fmaps * fmap_y_len_)
-                   + j * num_input_fmaps;
-            pool.AddMat(1.0, in.ColRange(c, num_input_fmaps));
-          }
-        }
-        pool.Scale(1.0 / (pool_x_len_ * pool_y_len_));
-        out_fmap_cnt++;
-      }
-    }
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // useful dims
-    int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_);
-    int32 inp_fmap_size = fmap_x_len_ * fmap_y_len_;
-    //
-    // here we note how many diff matrices are summed for each input patch,
-    std::vector<int32> patch_summands(inp_fmap_size, 0);
-    // this metainfo will be used to divide diff of patches
-    // used in more than one pool.
-    //
-
-    in_diff->SetZero();  // reset
-    int out_fmap_cnt = 0;
-    for (int32 m = 0; m < fmap_x_len_-pool_x_len_+1; m = m+pool_x_step_) {
-      for (int32 n = 0; n < fmap_y_len_-pool_y_len_+1; n = n+pool_y_step_) {
-        int32 st = 0;
-        st = (m * fmap_y_len_ + n) * num_input_fmaps;
-        CuSubMatrix<BaseFloat> src(out_diff.ColRange(out_fmap_cnt * num_input_fmaps, num_input_fmaps));
-        for (int32 i = 0; i < pool_x_len_; i++) {
-          for (int32 j = 0; j < pool_y_len_; j++) {
-            int32 c = 0;
-            c = st + i * (num_input_fmaps * fmap_y_len_)
-                   + j * num_input_fmaps;
-            CuSubMatrix<BaseFloat> tgt(in_diff->ColRange(c, num_input_fmaps));
-            tgt.AddMat(1.0, src);
-            patch_summands[c / num_input_fmaps] += 1;
-          }
-        }
-        out_fmap_cnt++;
-      }
-    }
-
-    // divide diff by average-pooling-dim (derivative of averaging)
-    in_diff->Scale(1.0 / (pool_x_len_ * pool_y_len_));
-
-    // divide diff by #summands (compensate for patches used in more pools)
-    for (int i = 0; i < fmap_x_len_; i++) {
-      for (int32 j = 0; j < fmap_y_len_; j++) {
-        int32 c = i * fmap_y_len_ + j;
-        CuSubMatrix<BaseFloat> tgt(in_diff->ColRange(c*num_input_fmaps, num_input_fmaps));
-        KALDI_ASSERT(patch_summands[c] > 0);  // patch at least in one pool
-        tgt.Scale(1.0 / patch_summands[c]);
-      }
-    }
-  }
-
- private:
-  int32 fmap_x_len_, fmap_y_len_,
-        pool_x_len_, pool_y_len_,
-        pool_x_step_, pool_y_step_;
-};
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_AVERAGE_POOLING_2D_COMPONENT_H_
diff --git a/src/nnet/nnet-average-pooling-component.h b/src/nnet/nnet-average-pooling-component.h
deleted file mode 100644
index 605c6ba327a..00000000000
--- a/src/nnet/nnet-average-pooling-component.h
+++ /dev/null
@@ -1,169 +0,0 @@
-// nnet/nnet-average-pooling-component.h
-
-// Copyright 2014  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_AVERAGE_POOLING_COMPONENT_H_
-#define KALDI_NNET_NNET_AVERAGE_POOLING_COMPONENT_H_
-
-#include <string>
-#include <vector>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-utils.h"
-#include "cudamatrix/cu-math.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-/**
- * AveragePoolingComponent :
- * The input/output matrices are split to submatrices with width 'pool_stride_'.
- * The pooling is done over 3rd axis, of the set of 2d matrices.
- * Our pooling supports overlaps, overlaps occur when (pool_step_ < pool_size_).
- */
-class AveragePoolingComponent : public Component {
- public:
-  AveragePoolingComponent(int32 dim_in, int32 dim_out):
-    Component(dim_in, dim_out),
-    pool_size_(0),
-    pool_step_(0),
-    pool_stride_(0)
-  { }
-
-  ~AveragePoolingComponent()
-  { }
-
-  Component* Copy() const { return new AveragePoolingComponent(*this); }
-  ComponentType GetType() const { return kAveragePoolingComponent; }
-
-  void InitData(std::istream &is) {
-    // parse config
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<PoolSize>") ReadBasicType(is, false, &pool_size_);
-      else if (token == "<PoolStep>") ReadBasicType(is, false, &pool_step_);
-      else if (token == "<PoolStride>") ReadBasicType(is, false, &pool_stride_);
-      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-                     << " (PoolSize|PoolStep|PoolStride)";
-    }
-    // check
-    KALDI_ASSERT(pool_size_ != 0 && pool_step_ != 0 && pool_stride_ != 0);
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    // pooling hyperparameters
-    ExpectToken(is, binary, "<PoolSize>");
-    ReadBasicType(is, binary, &pool_size_);
-    ExpectToken(is, binary, "<PoolStep>");
-    ReadBasicType(is, binary, &pool_step_);
-    ExpectToken(is, binary, "<PoolStride>");
-    ReadBasicType(is, binary, &pool_stride_);
-
-    //
-    // Sanity checks:
-    //
-    // number of patches:
-    KALDI_ASSERT(input_dim_ % pool_stride_ == 0);
-    int32 num_patches = input_dim_ / pool_stride_;
-    // number of pools:
-    KALDI_ASSERT((num_patches - pool_size_) % pool_step_ == 0);
-    int32 num_pools = 1 + (num_patches - pool_size_) / pool_step_;
-    // check output dim:
-    KALDI_ASSERT(output_dim_ == num_pools * pool_stride_);
-    //
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    // pooling hyperparameters
-    WriteToken(os, binary, "<PoolSize>");
-    WriteBasicType(os, binary, pool_size_);
-    WriteToken(os, binary, "<PoolStep>");
-    WriteBasicType(os, binary, pool_step_);
-    WriteToken(os, binary, "<PoolStride>");
-    WriteBasicType(os, binary, pool_stride_);
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // useful dims
-    int32 num_patches = input_dim_ / pool_stride_;
-    int32 num_pools = 1 + (num_patches - pool_size_) / pool_step_;
-
-    // do the average-pooling (pools indexed by q)
-    for (int32 q = 0; q < num_pools; q++) {
-      // get output buffer of the pool
-      CuSubMatrix<BaseFloat> pool(out->ColRange(q*pool_stride_, pool_stride_));
-      pool.SetZero();  // reset,
-      for (int32 r = 0; r < pool_size_; r++) {  // sum
-        int32 p = r + q * pool_step_;  // p = input patch
-        pool.AddMat(1.0, in.ColRange(p*pool_stride_, pool_stride_));
-      }
-      pool.Scale(1.0 / pool_size_);  // divide by #summands
-    }
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // useful dims
-    int32 num_patches = input_dim_ / pool_stride_;
-    int32 num_pools = 1 + (num_patches - pool_size_) / pool_step_;
-
-    //
-    // here we note how many diff matrices are summed for each input patch,
-    std::vector<int32> patch_summands(num_patches, 0);
-    // this metainfo will be used to divide diff of patches
-    // used in more than one pool.
-    //
-
-    in_diff->SetZero();  // reset
-
-    for (int32 q = 0; q < num_pools; q++) {  // sum
-      for (int32 r = 0; r < pool_size_; r++) {
-        int32 p = r + q * pool_step_;
-        CuSubMatrix<BaseFloat> tgt(in_diff->ColRange(p*pool_stride_, pool_stride_));
-        CuSubMatrix<BaseFloat> src(out_diff.ColRange(q*pool_stride_, pool_stride_));
-        tgt.AddMat(1.0, src);
-        patch_summands[p] += 1;
-      }
-    }
-
-    // divide diff by average-pooling-dim (derivative of averaging)
-    in_diff->Scale(1.0 / pool_size_);
-
-    // divide diff by #summands (compensate for patches used in more pools)
-    for (int32 p = 0; p < num_patches; p++) {
-      CuSubMatrix<BaseFloat> tgt(in_diff->ColRange(p*pool_stride_, pool_stride_));
-      KALDI_ASSERT(patch_summands[p] > 0);  // patch at least in one pool
-      tgt.Scale(1.0/patch_summands[p]);
-    }
-  }
-
- private:
-  int32 pool_size_,    // input patches used for pooling
-        pool_step_,    // shift used for pooling (allow overlapping pools)
-        pool_stride_;  // stride used to cut input to a vector of matrices
-};
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_AVERAGE_POOLING_COMPONENT_H_
diff --git a/src/nnet/nnet-blstm-projected.h b/src/nnet/nnet-blstm-projected.h
deleted file mode 100644
index 45851f5d9fc..00000000000
--- a/src/nnet/nnet-blstm-projected.h
+++ /dev/null
@@ -1,1206 +0,0 @@
-// nnet/nnet-blstm-projected-streams.h
-
-// Copyright 2016  Brno University of Techology (author: Karel Vesely)
-// Copyright 2015  Chongjia Ni
-// Copyright 2014  Jiayu DU (Jerry), Wei Li
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_BLSTM_PROJECTED_H_
-#define KALDI_NNET_NNET_BLSTM_PROJECTED_H_
-
-#include <string>
-#include <vector>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-utils.h"
-#include "cudamatrix/cu-math.h"
-
-/*************************************
- * x: input neuron
- * g: squashing neuron near input
- * i: Input gate
- * f: Forget gate
- * o: Output gate
- * c: memory Cell (CEC)
- * h: squashing neuron near output
- * m: output neuron of Memory block
- * r: recurrent projection neuron
- * y: output neuron of LSTMP
- * f-*: forward direction
- * b-*: backward direction
- *************************************/
-
-namespace kaldi {
-namespace nnet1 {
-
-class BlstmProjected : public MultistreamComponent {
- public:
-  BlstmProjected(int32 input_dim, int32 output_dim):
-    MultistreamComponent(input_dim, output_dim),
-    cell_dim_(0),
-    proj_dim_(static_cast<int32>(output_dim/2)),
-    cell_clip_(50.0),
-    diff_clip_(1.0),
-    cell_diff_clip_(0.0),
-    grad_clip_(250.0)
-  { }
-
-  ~BlstmProjected()
-  { }
-
-  Component* Copy() const { return new BlstmProjected(*this); }
-  ComponentType GetType() const { return kBlstmProjected; }
-
-  void InitData(std::istream &is) {
-    // define options,
-    float param_range = 0.1;
-    // parse the line from prototype,
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<ParamRange>") ReadBasicType(is, false, &param_range);
-      else if (token == "<CellDim>") ReadBasicType(is, false, &cell_dim_);
-      else if (token == "<LearnRateCoef>") ReadBasicType(is, false, &learn_rate_coef_);
-      else if (token == "<BiasLearnRateCoef>") ReadBasicType(is, false, &bias_learn_rate_coef_);
-      else if (token == "<CellClip>") ReadBasicType(is, false, &cell_clip_);
-      else if (token == "<DiffClip>") ReadBasicType(is, false, &diff_clip_);
-      else if (token == "<CellDiffClip>") ReadBasicType(is, false, &cell_diff_clip_);
-      else if (token == "<GradClip>") ReadBasicType(is, false, &grad_clip_);
-      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-                     << " (ParamRange|CellDim|LearnRateCoef|BiasLearnRateCoef|CellClip|DiffClip|GradClip)";
-    }
-
-    // init the weights and biases (from uniform dist.),
-    // forward direction,
-    f_w_gifo_x_.Resize(4*cell_dim_, input_dim_, kUndefined);
-    f_w_gifo_r_.Resize(4*cell_dim_, proj_dim_, kUndefined);
-    f_bias_.Resize(4*cell_dim_, kUndefined);
-    f_peephole_i_c_.Resize(cell_dim_, kUndefined);
-    f_peephole_f_c_.Resize(cell_dim_, kUndefined);
-    f_peephole_o_c_.Resize(cell_dim_, kUndefined);
-    f_w_r_m_.Resize(proj_dim_, cell_dim_, kUndefined);
-    //       (mean), (range)
-    RandUniform(0.0, 2.0 * param_range, &f_w_gifo_x_);
-    RandUniform(0.0, 2.0 * param_range, &f_w_gifo_r_);
-    RandUniform(0.0, 2.0 * param_range, &f_bias_);
-    RandUniform(0.0, 2.0 * param_range, &f_peephole_i_c_);
-    RandUniform(0.0, 2.0 * param_range, &f_peephole_f_c_);
-    RandUniform(0.0, 2.0 * param_range, &f_peephole_o_c_);
-    RandUniform(0.0, 2.0 * param_range, &f_w_r_m_);
-
-    // Add 1.0 to forget-gate bias
-    // [Miao IS16: AN EMPIRICAL EXPLORATION...]
-    f_bias_.Range(2*cell_dim_, cell_dim_).Add(1.0);
-
-    // backward direction,
-    b_w_gifo_x_.Resize(4*cell_dim_, input_dim_, kUndefined);
-    b_w_gifo_r_.Resize(4*cell_dim_, proj_dim_, kUndefined);
-    b_bias_.Resize(4*cell_dim_, kUndefined);
-    b_peephole_i_c_.Resize(cell_dim_, kUndefined);
-    b_peephole_f_c_.Resize(cell_dim_, kUndefined);
-    b_peephole_o_c_.Resize(cell_dim_, kUndefined);
-    b_w_r_m_.Resize(proj_dim_, cell_dim_, kUndefined);
-
-    RandUniform(0.0, 2.0 * param_range, &b_w_gifo_x_);
-    RandUniform(0.0, 2.0 * param_range, &b_w_gifo_r_);
-    RandUniform(0.0, 2.0 * param_range, &b_bias_);
-    RandUniform(0.0, 2.0 * param_range, &b_peephole_i_c_);
-    RandUniform(0.0, 2.0 * param_range, &b_peephole_f_c_);
-    RandUniform(0.0, 2.0 * param_range, &b_peephole_o_c_);
-    RandUniform(0.0, 2.0 * param_range, &b_w_r_m_);
-
-    // Add 1.0 to forget-gate bias,
-    // [Miao IS16: AN EMPIRICAL EXPLORATION...]
-    b_bias_.Range(2*cell_dim_, cell_dim_).Add(1.0);
-
-    KALDI_ASSERT(cell_dim_ > 0);
-    KALDI_ASSERT(learn_rate_coef_ >= 0.0);
-    KALDI_ASSERT(bias_learn_rate_coef_ >= 0.0);
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    // Read all the '<Tokens>' in arbitrary order,
-    while ('<' == Peek(is, binary)) {
-      std::string token;
-      int first_char = PeekToken(is, binary);
-      switch (first_char) {
-        case 'C': ReadToken(is, false, &token);
-          /**/ if (token == "<CellDim>") ReadBasicType(is, binary, &cell_dim_);
-          else if (token == "<CellClip>") ReadBasicType(is, binary, &cell_clip_);
-          else if (token == "<CellDiffClip>") ReadBasicType(is, binary, &cell_diff_clip_);
-          else if (token == "<ClipGradient>") ReadBasicType(is, binary, &grad_clip_); // bwd-compat.
-          else KALDI_ERR << "Unknown token: " << token;
-          break;
-        case 'L': ExpectToken(is, binary, "<LearnRateCoef>");
-          ReadBasicType(is, binary, &learn_rate_coef_);
-          break;
-        case 'B': ExpectToken(is, binary, "<BiasLearnRateCoef>");
-          ReadBasicType(is, binary, &bias_learn_rate_coef_);
-          break;
-        case 'D': ExpectToken(is, binary, "<DiffClip>");
-          ReadBasicType(is, binary, &diff_clip_);
-          break;
-        case 'G': ExpectToken(is, binary, "<GradClip>");
-          ReadBasicType(is, binary, &grad_clip_);
-          break;
-        default: ReadToken(is, false, &token);
-          KALDI_ERR << "Unknown token: " << token;
-      }
-    }
-    KALDI_ASSERT(cell_dim_ != 0);
-    // Read the data (data follow the tokens),
-
-    // reading parameters corresponding to forward direction
-    f_w_gifo_x_.Read(is, binary);
-    f_w_gifo_r_.Read(is, binary);
-    f_bias_.Read(is, binary);
-
-    f_peephole_i_c_.Read(is, binary);
-    f_peephole_f_c_.Read(is, binary);
-    f_peephole_o_c_.Read(is, binary);
-
-    f_w_r_m_.Read(is, binary);
-
-    // reading parameters corresponding to backward direction
-    b_w_gifo_x_.Read(is, binary);
-    b_w_gifo_r_.Read(is, binary);
-    b_bias_.Read(is, binary);
-
-    b_peephole_i_c_.Read(is, binary);
-    b_peephole_f_c_.Read(is, binary);
-    b_peephole_o_c_.Read(is, binary);
-
-    b_w_r_m_.Read(is, binary);
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    WriteToken(os, binary, "<CellDim>");
-    WriteBasicType(os, binary, cell_dim_);
-
-    WriteToken(os, binary, "<LearnRateCoef>");
-    WriteBasicType(os, binary, learn_rate_coef_);
-    WriteToken(os, binary, "<BiasLearnRateCoef>");
-    WriteBasicType(os, binary, bias_learn_rate_coef_);
-
-    WriteToken(os, binary, "<CellClip>");
-    WriteBasicType(os, binary, cell_clip_);
-    WriteToken(os, binary, "<DiffClip>");
-    WriteBasicType(os, binary, diff_clip_);
-    WriteToken(os, binary, "<CellDiffClip>");
-    WriteBasicType(os, binary, cell_diff_clip_);
-    WriteToken(os, binary, "<GradClip>");
-    WriteBasicType(os, binary, grad_clip_);
-
-    if (!binary) os << "\n";
-    // writing parameters, forward direction,
-    f_w_gifo_x_.Write(os, binary);
-    f_w_gifo_r_.Write(os, binary);
-    f_bias_.Write(os, binary);
-
-    f_peephole_i_c_.Write(os, binary);
-    f_peephole_f_c_.Write(os, binary);
-    f_peephole_o_c_.Write(os, binary);
-
-    f_w_r_m_.Write(os, binary);
-
-    if (!binary) os << "\n";
-    // writing parameters, backward direction,
-    b_w_gifo_x_.Write(os, binary);
-    b_w_gifo_r_.Write(os, binary);
-    b_bias_.Write(os, binary);
-
-    b_peephole_i_c_.Write(os, binary);
-    b_peephole_f_c_.Write(os, binary);
-    b_peephole_o_c_.Write(os, binary);
-
-    b_w_r_m_.Write(os, binary);
-  }
-
-  int32 NumParams() const {
-    return 2 * ( f_w_gifo_x_.NumRows() * f_w_gifo_x_.NumCols() +
-      f_w_gifo_r_.NumRows() * f_w_gifo_r_.NumCols() +
-      f_bias_.Dim() +
-      f_peephole_i_c_.Dim() +
-      f_peephole_f_c_.Dim() +
-      f_peephole_o_c_.Dim() +
-      f_w_r_m_.NumRows() * f_w_r_m_.NumCols() );
-  }
-
-  void GetGradient(VectorBase<BaseFloat>* gradient) const {
-    KALDI_ASSERT(gradient->Dim() == NumParams());
-    int32 offset, len;
-
-    // Copying parameters corresponding to forward direction
-    offset = 0;    len = f_w_gifo_x_.NumRows() * f_w_gifo_x_.NumCols();
-    gradient->Range(offset, len).CopyRowsFromMat(f_w_gifo_x_corr_);
-
-    offset += len; len = f_w_gifo_r_.NumRows() * f_w_gifo_r_.NumCols();
-    gradient->Range(offset, len).CopyRowsFromMat(f_w_gifo_r_corr_);
-
-    offset += len; len = f_bias_.Dim();
-    gradient->Range(offset, len).CopyFromVec(f_bias_corr_);
-
-    offset += len; len = f_peephole_i_c_.Dim();
-    gradient->Range(offset, len).CopyFromVec(f_peephole_i_c_corr_);
-
-    offset += len; len = f_peephole_f_c_.Dim();
-    gradient->Range(offset, len).CopyFromVec(f_peephole_f_c_corr_);
-
-    offset += len; len = f_peephole_o_c_.Dim();
-    gradient->Range(offset, len).CopyFromVec(f_peephole_o_c_corr_);
-
-    offset += len; len = f_w_r_m_.NumRows() * f_w_r_m_.NumCols();
-    gradient->Range(offset, len).CopyRowsFromMat(f_w_r_m_corr_);
-
-    // Copying parameters corresponding to backward direction
-    offset += len; len = b_w_gifo_x_.NumRows() * b_w_gifo_x_.NumCols();
-    gradient->Range(offset, len).CopyRowsFromMat(b_w_gifo_x_corr_);
-
-    offset += len; len = b_w_gifo_r_.NumRows() * b_w_gifo_r_.NumCols();
-    gradient->Range(offset, len).CopyRowsFromMat(b_w_gifo_r_corr_);
-
-    offset += len; len = b_bias_.Dim();
-    gradient->Range(offset, len).CopyFromVec(b_bias_corr_);
-
-    offset += len; len = b_peephole_i_c_.Dim();
-    gradient->Range(offset, len).CopyFromVec(b_peephole_i_c_corr_);
-
-    offset += len; len = b_peephole_f_c_.Dim();
-    gradient->Range(offset, len).CopyFromVec(b_peephole_f_c_corr_);
-
-    offset += len; len = b_peephole_o_c_.Dim();
-    gradient->Range(offset, len).CopyFromVec(b_peephole_o_c_corr_);
-
-    offset += len; len = b_w_r_m_.NumRows() * b_w_r_m_.NumCols();
-    gradient->Range(offset, len).CopyRowsFromMat(b_w_r_m_corr_);
-
-    // check the dim,
-    offset += len;
-    KALDI_ASSERT(offset == NumParams());
-  }
-
-  void GetParams(VectorBase<BaseFloat>* params) const {
-    KALDI_ASSERT(params->Dim() == NumParams());
-    int32 offset, len;
-
-    // Copying parameters corresponding to forward direction
-    offset = 0;    len = f_w_gifo_x_.NumRows() * f_w_gifo_x_.NumCols();
-    params->Range(offset, len).CopyRowsFromMat(f_w_gifo_x_);
-
-    offset += len; len = f_w_gifo_r_.NumRows() * f_w_gifo_r_.NumCols();
-    params->Range(offset, len).CopyRowsFromMat(f_w_gifo_r_);
-
-    offset += len; len = f_bias_.Dim();
-    params->Range(offset, len).CopyFromVec(f_bias_);
-
-    offset += len; len = f_peephole_i_c_.Dim();
-    params->Range(offset, len).CopyFromVec(f_peephole_i_c_);
-
-    offset += len; len = f_peephole_f_c_.Dim();
-    params->Range(offset, len).CopyFromVec(f_peephole_f_c_);
-
-    offset += len; len = f_peephole_o_c_.Dim();
-    params->Range(offset, len).CopyFromVec(f_peephole_o_c_);
-
-    offset += len; len = f_w_r_m_.NumRows() * f_w_r_m_.NumCols();
-    params->Range(offset, len).CopyRowsFromMat(f_w_r_m_);
-
-    // Copying parameters corresponding to backward direction
-    offset += len; len = b_w_gifo_x_.NumRows() * b_w_gifo_x_.NumCols();
-    params->Range(offset, len).CopyRowsFromMat(b_w_gifo_x_);
-
-    offset += len; len = b_w_gifo_r_.NumRows() * b_w_gifo_r_.NumCols();
-    params->Range(offset, len).CopyRowsFromMat(b_w_gifo_r_);
-
-    offset += len; len = b_bias_.Dim();
-    params->Range(offset, len).CopyFromVec(b_bias_);
-
-    offset += len; len = b_peephole_i_c_.Dim();
-    params->Range(offset, len).CopyFromVec(b_peephole_i_c_);
-
-    offset += len; len = b_peephole_f_c_.Dim();
-    params->Range(offset, len).CopyFromVec(b_peephole_f_c_);
-
-    offset += len; len = b_peephole_o_c_.Dim();
-    params->Range(offset, len).CopyFromVec(b_peephole_o_c_);
-
-    offset += len; len = b_w_r_m_.NumRows() * b_w_r_m_.NumCols();
-    params->Range(offset, len).CopyRowsFromMat(b_w_r_m_);
-
-    // check the dim,
-    offset += len;
-    KALDI_ASSERT(offset == NumParams());
-  }
-
-  void SetParams(const VectorBase<BaseFloat>& params) {
-    KALDI_ASSERT(params.Dim() == NumParams());
-    int32 offset, len;
-
-    // Copying parameters corresponding to forward direction
-    offset = 0;    len = f_w_gifo_x_.NumRows() * f_w_gifo_x_.NumCols();
-    f_w_gifo_x_.CopyRowsFromVec(params.Range(offset, len));
-
-    offset += len; len = f_w_gifo_r_.NumRows() * f_w_gifo_r_.NumCols();
-    f_w_gifo_r_.CopyRowsFromVec(params.Range(offset, len));
-
-    offset += len; len = f_bias_.Dim();
-    f_bias_.CopyFromVec(params.Range(offset, len));
-
-    offset += len; len = f_peephole_i_c_.Dim();
-    f_peephole_i_c_.CopyFromVec(params.Range(offset, len));
-
-    offset += len; len = f_peephole_f_c_.Dim();
-    f_peephole_f_c_.CopyFromVec(params.Range(offset, len));
-
-    offset += len; len = f_peephole_o_c_.Dim();
-    f_peephole_o_c_.CopyFromVec(params.Range(offset, len));
-
-    offset += len; len = f_w_r_m_.NumRows() * f_w_r_m_.NumCols();
-    f_w_r_m_.CopyRowsFromVec(params.Range(offset, len));
-
-    // Copying parameters corresponding to backward direction
-    offset += len; len = b_w_gifo_x_.NumRows() * b_w_gifo_x_.NumCols();
-    b_w_gifo_x_.CopyRowsFromVec(params.Range(offset, len));
-
-    offset += len; len = b_w_gifo_r_.NumRows() * b_w_gifo_r_.NumCols();
-    b_w_gifo_r_.CopyRowsFromVec(params.Range(offset, len));
-
-    offset += len; len = b_bias_.Dim();
-    b_bias_.CopyFromVec(params.Range(offset, len));
-
-    offset += len; len = b_peephole_i_c_.Dim();
-    b_peephole_i_c_.CopyFromVec(params.Range(offset, len));
-
-    offset += len; len = b_peephole_f_c_.Dim();
-    b_peephole_f_c_.CopyFromVec(params.Range(offset, len));
-
-    offset += len; len = b_peephole_o_c_.Dim();
-    b_peephole_o_c_.CopyFromVec(params.Range(offset, len));
-
-    offset += len; len = b_w_r_m_.NumRows() * b_w_r_m_.NumCols();
-    b_w_r_m_.CopyRowsFromVec(params.Range(offset, len));
-
-    // check the dim,
-    offset += len;
-    KALDI_ASSERT(offset == NumParams());
-  }
-
-
-  std::string Info() const {
-    return std::string("cell-dim 2x") + ToString(cell_dim_) + " " +
-      "( learn_rate_coef_ " + ToString(learn_rate_coef_) +
-      ", bias_learn_rate_coef_ " + ToString(bias_learn_rate_coef_) +
-      ", cell_clip_ " + ToString(cell_clip_) +
-      ", diff_clip_ " + ToString(diff_clip_) +
-      ", grad_clip_ " + ToString(grad_clip_) + " )" +
-      "\n  Forward Direction weights:" +
-      "\n  f_w_gifo_x_  "     + MomentStatistics(f_w_gifo_x_) +
-      "\n  f_w_gifo_r_  "     + MomentStatistics(f_w_gifo_r_) +
-      "\n  f_bias_  "         + MomentStatistics(f_bias_) +
-      "\n  f_peephole_i_c_  " + MomentStatistics(f_peephole_i_c_) +
-      "\n  f_peephole_f_c_  " + MomentStatistics(f_peephole_f_c_) +
-      "\n  f_peephole_o_c_  " + MomentStatistics(f_peephole_o_c_) +
-      "\n  f_w_r_m_  "        + MomentStatistics(f_w_r_m_) +
-      "\n  Backward Direction weights:" +
-      "\n  b_w_gifo_x_  "     + MomentStatistics(b_w_gifo_x_) +
-      "\n  b_w_gifo_r_  "     + MomentStatistics(b_w_gifo_r_) +
-      "\n  b_bias_  "         + MomentStatistics(b_bias_) +
-      "\n  b_peephole_i_c_  " + MomentStatistics(b_peephole_i_c_) +
-      "\n  b_peephole_f_c_  " + MomentStatistics(b_peephole_f_c_) +
-      "\n  b_peephole_o_c_  " + MomentStatistics(b_peephole_o_c_) +
-      "\n  b_w_r_m_  "        + MomentStatistics(b_w_r_m_);
-  }
-
-
-  std::string InfoGradient() const {
-    // forward-direction activations,
-    const CuSubMatrix<BaseFloat> YG_FW(f_propagate_buf_.ColRange(0*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YI_FW(f_propagate_buf_.ColRange(1*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YF_FW(f_propagate_buf_.ColRange(2*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YO_FW(f_propagate_buf_.ColRange(3*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YC_FW(f_propagate_buf_.ColRange(4*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YH_FW(f_propagate_buf_.ColRange(5*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YM_FW(f_propagate_buf_.ColRange(6*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YR_FW(f_propagate_buf_.ColRange(7*cell_dim_, proj_dim_));
-
-    // forward-direction derivatives,
-    const CuSubMatrix<BaseFloat> DG_FW(f_backpropagate_buf_.ColRange(0*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DI_FW(f_backpropagate_buf_.ColRange(1*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DF_FW(f_backpropagate_buf_.ColRange(2*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DO_FW(f_backpropagate_buf_.ColRange(3*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DC_FW(f_backpropagate_buf_.ColRange(4*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DH_FW(f_backpropagate_buf_.ColRange(5*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DM_FW(f_backpropagate_buf_.ColRange(6*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DR_FW(f_backpropagate_buf_.ColRange(7*cell_dim_, proj_dim_));
-
-    // backward-direction activations,
-    const CuSubMatrix<BaseFloat> YG_BW(b_propagate_buf_.ColRange(0*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YI_BW(b_propagate_buf_.ColRange(1*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YF_BW(b_propagate_buf_.ColRange(2*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YO_BW(b_propagate_buf_.ColRange(3*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YC_BW(b_propagate_buf_.ColRange(4*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YH_BW(b_propagate_buf_.ColRange(5*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YM_BW(b_propagate_buf_.ColRange(6*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YR_BW(b_propagate_buf_.ColRange(7*cell_dim_, proj_dim_));
-
-    // backward-direction derivatives,
-    const CuSubMatrix<BaseFloat> DG_BW(b_backpropagate_buf_.ColRange(0*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DI_BW(b_backpropagate_buf_.ColRange(1*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DF_BW(b_backpropagate_buf_.ColRange(2*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DO_BW(b_backpropagate_buf_.ColRange(3*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DC_BW(b_backpropagate_buf_.ColRange(4*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DH_BW(b_backpropagate_buf_.ColRange(5*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DM_BW(b_backpropagate_buf_.ColRange(6*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DR_BW(b_backpropagate_buf_.ColRange(7*cell_dim_, proj_dim_));
-
-    return std::string("") +
-      "( learn_rate_coef_ " + ToString(learn_rate_coef_) +
-      ", bias_learn_rate_coef_ " + ToString(bias_learn_rate_coef_) +
-      ", cell_clip_ " + ToString(cell_clip_) +
-      ", diff_clip_ " + ToString(diff_clip_) +
-      ", grad_clip_ " + ToString(grad_clip_) + " )" +
-      "\n  ### Gradients " +
-      "\n  f_w_gifo_x_corr_  "     + MomentStatistics(f_w_gifo_x_corr_) +
-      "\n  f_w_gifo_r_corr_  "     + MomentStatistics(f_w_gifo_r_corr_) +
-      "\n  f_bias_corr_  "         + MomentStatistics(f_bias_corr_) +
-      "\n  f_peephole_i_c_corr_  " + MomentStatistics(f_peephole_i_c_corr_) +
-      "\n  f_peephole_f_c_corr_  " + MomentStatistics(f_peephole_f_c_corr_) +
-      "\n  f_peephole_o_c_corr_  " + MomentStatistics(f_peephole_o_c_corr_) +
-      "\n  f_w_r_m_corr_  "        + MomentStatistics(f_w_r_m_corr_) +
-      "\n  ---" +
-      "\n  b_w_gifo_x_corr_  "     + MomentStatistics(b_w_gifo_x_corr_) +
-      "\n  b_w_gifo_r_corr_  "     + MomentStatistics(b_w_gifo_r_corr_) +
-      "\n  b_bias_corr_  "         + MomentStatistics(b_bias_corr_) +
-      "\n  b_peephole_i_c_corr_  " + MomentStatistics(b_peephole_i_c_corr_) +
-      "\n  b_peephole_f_c_corr_  " + MomentStatistics(b_peephole_f_c_corr_) +
-      "\n  b_peephole_o_c_corr_  " + MomentStatistics(b_peephole_o_c_corr_) +
-      "\n  b_w_r_m_corr_  "        + MomentStatistics(b_w_r_m_corr_) +
-      "\n" +
-      "\n  ### Activations (mostly after non-linearities)" +
-      "\n  YI_FW(0..1)^  " + MomentStatistics(YI_FW) +
-      "\n  YF_FW(0..1)^  " + MomentStatistics(YF_FW) +
-      "\n  YO_FW(0..1)^  " + MomentStatistics(YO_FW) +
-      "\n  YG_FW(-1..1)  " + MomentStatistics(YG_FW) +
-      "\n  YC_FW(-R..R)* " + MomentStatistics(YC_FW) +
-      "\n  YH_FW(-1..1)  " + MomentStatistics(YH_FW) +
-      "\n  YM_FW(-1..1)  " + MomentStatistics(YM_FW) +
-      "\n  YR_FW(-R..R)  " + MomentStatistics(YR_FW) +
-      "\n  ---" +
-      "\n  YI_BW(0..1)^  " + MomentStatistics(YI_BW) +
-      "\n  YF_BW(0..1)^  " + MomentStatistics(YF_BW) +
-      "\n  YO_BW(0..1)^  " + MomentStatistics(YO_BW) +
-      "\n  YG_BW(-1..1)  " + MomentStatistics(YG_BW) +
-      "\n  YC_BW(-R..R)* " + MomentStatistics(YC_BW) +
-      "\n  YH_BW(-1..1)  " + MomentStatistics(YH_BW) +
-      "\n  YM_BW(-1..1)  " + MomentStatistics(YM_BW) +
-      "\n  YR_BW(-R..R)  " + MomentStatistics(YR_BW) +
-      "\n" +
-      "\n  ### Derivatives (w.r.t. inputs of non-linearities)" +
-      "\n  DI_FW^ " + MomentStatistics(DI_FW) +
-      "\n  DF_FW^ " + MomentStatistics(DF_FW) +
-      "\n  DO_FW^ " + MomentStatistics(DO_FW) +
-      "\n  DG_FW  " + MomentStatistics(DG_FW) +
-      "\n  DC_FW* " + MomentStatistics(DC_FW) +
-      "\n  DH_FW  " + MomentStatistics(DH_FW) +
-      "\n  DM_FW  " + MomentStatistics(DM_FW) +
-      "\n  DR_FW  " + MomentStatistics(DR_FW) +
-      "\n  ---" +
-      "\n  DI_BW^ " + MomentStatistics(DI_BW) +
-      "\n  DF_BW^ " + MomentStatistics(DF_BW) +
-      "\n  DO_BW^ " + MomentStatistics(DO_BW) +
-      "\n  DG_BW  " + MomentStatistics(DG_BW) +
-      "\n  DC_BW* " + MomentStatistics(DC_BW) +
-      "\n  DH_BW  " + MomentStatistics(DH_BW) +
-      "\n  DM_BW  " + MomentStatistics(DM_BW) +
-      "\n  DR_BW  " + MomentStatistics(DR_BW);
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-
-    KALDI_ASSERT(in.NumRows() % NumStreams() == 0);
-    int32 S = NumStreams();
-    int32 T = in.NumRows() / NumStreams();
-
-    // buffers,
-    f_propagate_buf_.Resize((T+2)*S, 7 * cell_dim_ + proj_dim_, kSetZero);
-    b_propagate_buf_.Resize((T+2)*S, 7 * cell_dim_ + proj_dim_, kSetZero);
-
-    // forward-direction activations,
-    CuSubMatrix<BaseFloat> F_YG(f_propagate_buf_.ColRange(0*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_YI(f_propagate_buf_.ColRange(1*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_YF(f_propagate_buf_.ColRange(2*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_YO(f_propagate_buf_.ColRange(3*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_YC(f_propagate_buf_.ColRange(4*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_YH(f_propagate_buf_.ColRange(5*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_YM(f_propagate_buf_.ColRange(6*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_YR(f_propagate_buf_.ColRange(7*cell_dim_, proj_dim_));
-    CuSubMatrix<BaseFloat> F_YGIFO(f_propagate_buf_.ColRange(0, 4*cell_dim_));
-
-    // backward-direction activations,
-    CuSubMatrix<BaseFloat> B_YG(b_propagate_buf_.ColRange(0*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_YI(b_propagate_buf_.ColRange(1*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_YF(b_propagate_buf_.ColRange(2*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_YO(b_propagate_buf_.ColRange(3*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_YC(b_propagate_buf_.ColRange(4*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_YH(b_propagate_buf_.ColRange(5*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_YM(b_propagate_buf_.ColRange(6*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_YR(b_propagate_buf_.ColRange(7*cell_dim_, proj_dim_));
-    CuSubMatrix<BaseFloat> B_YGIFO(b_propagate_buf_.ColRange(0, 4*cell_dim_));
-
-    // FORWARD DIRECTION,
-    // x -> g, i, f, o, not recurrent, do it all in once
-    F_YGIFO.RowRange(1*S, T*S).AddMatMat(1.0, in, kNoTrans, f_w_gifo_x_, kTrans, 0.0);
-
-    // bias -> g, i, f, o
-    F_YGIFO.RowRange(1*S, T*S).AddVecToRows(1.0, f_bias_);
-
-    // BufferPadding [T0]:dummy, [1, T]:current sequence, [T+1]:dummy
-    for (int t = 1; t <= T; t++) {
-      // multistream buffers for current time-step,
-      CuSubMatrix<BaseFloat> y_all(f_propagate_buf_.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_g(F_YG.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_i(F_YI.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_f(F_YF.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_o(F_YO.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_c(F_YC.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_h(F_YH.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_m(F_YM.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_r(F_YR.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_gifo(F_YGIFO.RowRange(t*S, S));
-
-      // r(t-1) -> g, i, f, o
-      y_gifo.AddMatMat(1.0, F_YR.RowRange((t-1)*S, S), kNoTrans, f_w_gifo_r_, kTrans, 1.0);
-
-      // c(t-1) -> i(t) via peephole
-      y_i.AddMatDiagVec(1.0, F_YC.RowRange((t-1)*S, S), kNoTrans, f_peephole_i_c_, 1.0);
-
-      // c(t-1) -> f(t) via peephole
-      y_f.AddMatDiagVec(1.0, F_YC.RowRange((t-1)*S, S), kNoTrans, f_peephole_f_c_, 1.0);
-
-      // i, f sigmoid squashing
-      y_i.Sigmoid(y_i);
-      y_f.Sigmoid(y_f);
-
-      // g tanh squashing
-      y_g.Tanh(y_g);
-
-      // g * i -> c
-      y_c.AddMatMatElements(1.0, y_g, y_i, 0.0);
-      // c(t-1) * f -> c(t) via forget-gate
-      y_c.AddMatMatElements(1.0, F_YC.RowRange((t-1)*S, S), y_f, 1.0);
-
-      if (cell_clip_ > 0.0) {
-        y_c.ApplyFloor(-cell_clip_);   // Optional clipping of cell activation,
-        y_c.ApplyCeiling(cell_clip_);  // Google paper Interspeech2014: LSTM for LVCSR
-      }
-
-      // c(t) -> o(t) via peephole (not recurrent, using c(t))
-      y_o.AddMatDiagVec(1.0, y_c, kNoTrans, f_peephole_o_c_, 1.0);
-
-      // o sigmoid squashing,
-      y_o.Sigmoid(y_o);
-
-      // c -> h, tanh squashing,
-      y_h.Tanh(y_c);
-
-      // h * o -> m via output gate,
-      y_m.AddMatMatElements(1.0, y_h, y_o, 0.0);
-
-      // m -> r
-      y_r.AddMatMat(1.0, y_m, kNoTrans, f_w_r_m_, kTrans, 0.0);
-
-      // set zeros to padded frames,
-      if (sequence_lengths_.size() > 0) {
-        for (int s = 0; s < S; s++) {
-          if (t > sequence_lengths_[s]) {
-            y_all.Row(s).SetZero();
-          }
-        }
-      }
-    }
-
-    // BACKWARD DIRECTION,
-    // x -> g, i, f, o, not recurrent, do it all in once
-    B_YGIFO.RowRange(1*S, T*S).AddMatMat(1.0, in, kNoTrans, b_w_gifo_x_, kTrans, 0.0);
-
-    // bias -> g, i, f, o
-    B_YGIFO.RowRange(1*S, T*S).AddVecToRows(1.0, b_bias_);
-
-    // BufferPadding [T0]:dummy, [1, T]:current sequence, [T+1]:dummy
-    for (int t = T; t >= 1; t--) {
-      // multistream buffers for current time-step,
-      CuSubMatrix<BaseFloat> y_all(b_propagate_buf_.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_g(B_YG.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_i(B_YI.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_f(B_YF.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_o(B_YO.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_c(B_YC.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_h(B_YH.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_m(B_YM.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_r(B_YR.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_gifo(B_YGIFO.RowRange(t*S, S));
-
-      // r(t+1) -> g, i, f, o
-      y_gifo.AddMatMat(1.0, B_YR.RowRange((t+1)*S, S), kNoTrans, b_w_gifo_r_, kTrans, 1.0);
-
-      // c(t+1) -> i(t) via peephole
-      y_i.AddMatDiagVec(1.0, B_YC.RowRange((t+1)*S, S), kNoTrans, b_peephole_i_c_, 1.0);
-
-      // c(t+1) -> f(t) via peephole
-      y_f.AddMatDiagVec(1.0, B_YC.RowRange((t+1)*S, S), kNoTrans, b_peephole_f_c_, 1.0);
-
-      // i, f sigmoid squashing
-      y_i.Sigmoid(y_i);
-      y_f.Sigmoid(y_f);
-
-      // g tanh squashing
-      y_g.Tanh(y_g);
-
-      // g * i -> c
-      y_c.AddMatMatElements(1.0, y_g, y_i, 0.0);
-      // c(t+1) * f -> c(t) via forget-gate
-      y_c.AddMatMatElements(1.0, B_YC.RowRange((t+1)*S, S), y_f, 1.0);
-
-      if (cell_clip_ > 0.0) {
-        y_c.ApplyFloor(-cell_clip_);   // optional clipping of cell activation,
-        y_c.ApplyCeiling(cell_clip_);  // google paper Interspeech2014: LSTM for LVCSR
-      }
-
-      // c(t) -> o(t) via peephole (not recurrent, using c(t))
-      y_o.AddMatDiagVec(1.0, y_c, kNoTrans, b_peephole_o_c_, 1.0);
-
-      // o sigmoid squashing,
-      y_o.Sigmoid(y_o);
-
-      // h tanh squashing,
-      y_h.Tanh(y_c);
-
-      // h * o -> m via output gate,
-      y_m.AddMatMatElements(1.0, y_h, y_o, 0.0);
-
-      // m -> r
-      y_r.AddMatMat(1.0, y_m, kNoTrans, b_w_r_m_, kTrans, 0.0);
-
-      // set zeros to padded frames,
-      if (sequence_lengths_.size() > 0) {
-        for (int s = 0; s < S; s++) {
-          if (t > sequence_lengths_[s]) {
-            y_all.Row(s).SetZero();
-          }
-        }
-      }
-    }
-
-    CuMatrix<BaseFloat> YR_FB;
-    YR_FB.Resize((T+2)*S, 2 * proj_dim_, kSetZero);
-    // forward part
-    YR_FB.ColRange(0, proj_dim_).CopyFromMat(f_propagate_buf_.ColRange(7*cell_dim_, proj_dim_));
-    // backward part
-    YR_FB.ColRange(proj_dim_, proj_dim_).CopyFromMat(b_propagate_buf_.ColRange(7*cell_dim_, proj_dim_));
-    // recurrent projection layer is also feed-forward as BLSTM output
-    out->CopyFromMat(YR_FB.RowRange(1*S, T*S));
-  }
-
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-
-    // the number of sequences to be processed in parallel
-    int32 T = in.NumRows() / NumStreams();
-    int32 S = NumStreams();
-
-    // buffers,
-    f_backpropagate_buf_.Resize((T+2)*S, 7 * cell_dim_ + proj_dim_, kSetZero);
-    b_backpropagate_buf_.Resize((T+2)*S, 7 * cell_dim_ + proj_dim_, kSetZero);
-
-    // FORWARD DIRECTION,
-    // forward-direction activations,
-    CuSubMatrix<BaseFloat> F_YG(f_propagate_buf_.ColRange(0*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_YI(f_propagate_buf_.ColRange(1*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_YF(f_propagate_buf_.ColRange(2*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_YO(f_propagate_buf_.ColRange(3*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_YC(f_propagate_buf_.ColRange(4*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_YH(f_propagate_buf_.ColRange(5*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_YM(f_propagate_buf_.ColRange(6*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_YR(f_propagate_buf_.ColRange(7*cell_dim_, proj_dim_));
-
-    // forward-direction derivatives,
-    CuSubMatrix<BaseFloat> F_DG(f_backpropagate_buf_.ColRange(0*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_DI(f_backpropagate_buf_.ColRange(1*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_DF(f_backpropagate_buf_.ColRange(2*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_DO(f_backpropagate_buf_.ColRange(3*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_DC(f_backpropagate_buf_.ColRange(4*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_DH(f_backpropagate_buf_.ColRange(5*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_DM(f_backpropagate_buf_.ColRange(6*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> F_DR(f_backpropagate_buf_.ColRange(7*cell_dim_, proj_dim_));
-    CuSubMatrix<BaseFloat> F_DGIFO(f_backpropagate_buf_.ColRange(0, 4*cell_dim_));
-
-    // pre-copy partial derivatives from the BLSTM output,
-    F_DR.RowRange(1*S, T*S).CopyFromMat(out_diff.ColRange(0, proj_dim_));
-
-    // BufferPadding [T0]:dummy, [1,T]:current sequence, [T+1]: dummy,
-    for (int t = T; t >= 1; t--) {
-      CuSubMatrix<BaseFloat> y_g(F_YG.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_i(F_YI.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_f(F_YF.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_o(F_YO.RowRange(t*S, S));
-      // CuSubMatrix<BaseFloat> y_c(F_YC.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_h(F_YH.RowRange(t*S, S));
-      // CuSubMatrix<BaseFloat> y_m(F_YM.RowRange(t*S, S));
-      // CuSubMatrix<BaseFloat> y_r(F_YR.RowRange(t*S, S));
-
-      CuSubMatrix<BaseFloat> d_all(f_backpropagate_buf_.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_g(F_DG.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_i(F_DI.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_f(F_DF.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_o(F_DO.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_c(F_DC.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_h(F_DH.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_m(F_DM.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_r(F_DR.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_gifo(F_DGIFO.RowRange(t*S, S));
-
-      // r
-      //   Version 1 (precise gradients):
-      //   backprop error from g(t+1), i(t+1), f(t+1), o(t+1) to r(t)
-      d_r.AddMatMat(1.0, F_DGIFO.RowRange((t+1)*S, S), kNoTrans, f_w_gifo_r_, kNoTrans, 1.0);
-
-      /*
-      //   Version 2 (Alex Graves' PhD dissertation):
-      //   only backprop g(t+1) to r(t)
-      CuSubMatrix<BaseFloat> w_g_r_(w_gifo_r_.RowRange(0, cell_dim_));
-      d_r.AddMatMat(1.0, DG.RowRange((t+1)*S,S), kNoTrans, w_g_r_, kNoTrans, 1.0);
-      */
-
-      /*
-      //   Version 3 (Felix Gers' PhD dissertation):
-      //   truncate gradients of g(t+1), i(t+1), f(t+1), o(t+1) once they leak out memory block
-      //   CEC(with forget connection) is the only "error-bridge" through time
-      ;
-      */
-
-      // r -> m
-      d_m.AddMatMat(1.0, d_r, kNoTrans, f_w_r_m_, kNoTrans, 0.0);
-
-      // m -> h, via output gate
-      d_h.AddMatMatElements(1.0, d_m, y_o, 0.0);
-      d_h.DiffTanh(y_h, d_h);
-
-      // o
-      d_o.AddMatMatElements(1.0, d_m, y_h, 0.0);
-      d_o.DiffSigmoid(y_o, d_o);
-
-      // c
-      // 1. diff from h(t)
-      // 2. diff from c(t+1) (via forget-gate between CEC)
-      // 3. diff from i(t+1) (via peephole)
-      // 4. diff from f(t+1) (via peephole)
-      // 5. diff from o(t)   (via peephole, not recurrent)
-      d_c.AddMat(1.0, d_h);
-      d_c.AddMatMatElements(1.0, F_DC.RowRange((t+1)*S, S), F_YF.RowRange((t+1)*S, S), 1.0);
-      d_c.AddMatDiagVec(1.0, F_DI.RowRange((t+1)*S, S), kNoTrans, f_peephole_i_c_, 1.0);
-      d_c.AddMatDiagVec(1.0, F_DF.RowRange((t+1)*S, S), kNoTrans, f_peephole_f_c_, 1.0);
-      d_c.AddMatDiagVec(1.0, d_o                      , kNoTrans, f_peephole_o_c_, 1.0);
-      // optionally clip the cell_derivative,
-      if (cell_diff_clip_ > 0.0) {
-        d_c.ApplyFloor(-cell_diff_clip_);
-        d_c.ApplyCeiling(cell_diff_clip_);
-      }
-
-      // f
-      d_f.AddMatMatElements(1.0, d_c, F_YC.RowRange((t-1)*S, S), 0.0);
-      d_f.DiffSigmoid(y_f, d_f);
-
-      // i
-      d_i.AddMatMatElements(1.0, d_c, y_g, 0.0);
-      d_i.DiffSigmoid(y_i, d_i);
-
-      // c -> g, via input gate
-      d_g.AddMatMatElements(1.0, d_c, y_i, 0.0);
-      d_g.DiffTanh(y_g, d_g);
-
-      // Clipping per-frame derivatives for the next `t'.
-      // Clipping applied to gates and input gate (as done in Google).
-      // [ICASSP2015, Sak, Learning acoustic frame labelling...],
-      //
-      // The path from 'out_diff' to 'd_c' via 'd_h' is unclipped,
-      // which is probably important for the 'Constant Error Carousel'
-      // to work well.
-      //
-      if (diff_clip_ > 0.0) {
-        d_gifo.ApplyFloor(-diff_clip_);
-        d_gifo.ApplyCeiling(diff_clip_);
-      }
-
-      // set zeros to padded frames,
-      if (sequence_lengths_.size() > 0) {
-        for (int s = 0; s < S; s++) {
-          if (t > sequence_lengths_[s]) {
-            d_all.Row(s).SetZero();
-          }
-        }
-      }
-    }
-
-    // BACKWARD DIRECTION,
-    // backward-direction activations,
-    CuSubMatrix<BaseFloat> B_YG(b_propagate_buf_.ColRange(0*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_YI(b_propagate_buf_.ColRange(1*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_YF(b_propagate_buf_.ColRange(2*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_YO(b_propagate_buf_.ColRange(3*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_YC(b_propagate_buf_.ColRange(4*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_YH(b_propagate_buf_.ColRange(5*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_YM(b_propagate_buf_.ColRange(6*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_YR(b_propagate_buf_.ColRange(7*cell_dim_, proj_dim_));
-
-    // backward-direction derivatives,
-    CuSubMatrix<BaseFloat> B_DG(b_backpropagate_buf_.ColRange(0*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_DI(b_backpropagate_buf_.ColRange(1*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_DF(b_backpropagate_buf_.ColRange(2*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_DO(b_backpropagate_buf_.ColRange(3*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_DC(b_backpropagate_buf_.ColRange(4*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_DH(b_backpropagate_buf_.ColRange(5*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_DM(b_backpropagate_buf_.ColRange(6*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> B_DR(b_backpropagate_buf_.ColRange(7*cell_dim_, proj_dim_));
-    CuSubMatrix<BaseFloat> B_DGIFO(b_backpropagate_buf_.ColRange(0, 4*cell_dim_));
-
-    // pre-copy partial derivatives from the BLSTM output,
-    B_DR.RowRange(1*S, T*S).CopyFromMat(out_diff.ColRange(proj_dim_, proj_dim_));
-
-    // BufferPadding [T0]:dummy, [1,T]:current sequence, [T+1]: dummy,
-    for (int t = 1; t <= T; t++) {
-      CuSubMatrix<BaseFloat> y_g(B_YG.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_i(B_YI.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_f(B_YF.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_o(B_YO.RowRange(t*S, S));
-      // CuSubMatrix<BaseFloat> y_c(B_YC.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_h(B_YH.RowRange(t*S, S));
-      // CuSubMatrix<BaseFloat> y_m(B_YM.RowRange(t*S, S));
-      // CuSubMatrix<BaseFloat> y_r(B_YR.RowRange(t*S, S));
-
-      CuSubMatrix<BaseFloat> d_all(b_backpropagate_buf_.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_g(B_DG.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_i(B_DI.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_f(B_DF.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_o(B_DO.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_c(B_DC.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_h(B_DH.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_m(B_DM.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_r(B_DR.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_gifo(B_DGIFO.RowRange(t*S, S));
-
-      // r
-      //   Version 1 (precise gradients):
-      //   backprop error from g(t-1), i(t-1), f(t-1), o(t-1) to r(t)
-      d_r.AddMatMat(1.0, B_DGIFO.RowRange((t-1)*S, S), kNoTrans, b_w_gifo_r_, kNoTrans, 1.0);
-
-      /*
-      //   Version 2 (Alex Graves' PhD dissertation):
-      //   only backprop g(t+1) to r(t)
-      CuSubMatrix<BaseFloat> w_g_r_(w_gifo_r_.RowRange(0, cell_dim_));
-      d_r.AddMatMat(1.0, DG.RowRange((t+1)*S,S), kNoTrans, w_g_r_, kNoTrans, 1.0);
-      */
-
-      /*
-      //   Version 3 (Felix Gers' PhD dissertation):
-      //   truncate gradients of g(t+1), i(t+1), f(t+1), o(t+1) once they leak out memory block
-      //   CEC(with forget connection) is the only "error-bridge" through time
-      */
-
-      // r -> m
-      d_m.AddMatMat(1.0, d_r, kNoTrans, b_w_r_m_, kNoTrans, 0.0);
-
-      // m -> h via output gate
-      d_h.AddMatMatElements(1.0, d_m, y_o, 0.0);
-      d_h.DiffTanh(y_h, d_h);
-
-      // o
-      d_o.AddMatMatElements(1.0, d_m, y_h, 0.0);
-      d_o.DiffSigmoid(y_o, d_o);
-
-      // c
-      // 1. diff from h(t)
-      // 2. diff from c(t+1) (via forget-gate between CEC)
-      // 3. diff from i(t+1) (via peephole)
-      // 4. diff from f(t+1) (via peephole)
-      // 5. diff from o(t)   (via peephole, not recurrent)
-      d_c.AddMat(1.0, d_h);
-      d_c.AddMatMatElements(1.0, B_DC.RowRange((t-1)*S, S), B_YF.RowRange((t-1)*S, S), 1.0);
-      d_c.AddMatDiagVec(1.0, B_DI.RowRange((t-1)*S, S), kNoTrans, b_peephole_i_c_, 1.0);
-      d_c.AddMatDiagVec(1.0, B_DF.RowRange((t-1)*S, S), kNoTrans, b_peephole_f_c_, 1.0);
-      d_c.AddMatDiagVec(1.0, d_o                      , kNoTrans, b_peephole_o_c_, 1.0);
-      // optionally clip the cell_derivative,
-      if (cell_diff_clip_ > 0.0) {
-        d_c.ApplyFloor(-cell_diff_clip_);
-        d_c.ApplyCeiling(cell_diff_clip_);
-      }
-
-      // f
-      d_f.AddMatMatElements(1.0, d_c, B_YC.RowRange((t-1)*S, S), 0.0);
-      d_f.DiffSigmoid(y_f, d_f);
-
-      // i
-      d_i.AddMatMatElements(1.0, d_c, y_g, 0.0);
-      d_i.DiffSigmoid(y_i, d_i);
-
-      // c -> g, via input gate,
-      d_g.AddMatMatElements(1.0, d_c, y_i, 0.0);
-      d_g.DiffTanh(y_g, d_g);
-
-      // Clipping per-frame derivatives for the next `t'.
-      // Clipping applied to gates and input gate (as done in Google).
-      // [ICASSP2015, Sak, Learning acoustic frame labelling...],
-      //
-      // The path from 'out_diff' to 'd_c' via 'd_h' is unclipped,
-      // which is probably important for the 'Constant Error Carousel'
-      // to work well.
-      //
-      if (diff_clip_ > 0.0) {
-        d_gifo.ApplyFloor(-diff_clip_);
-        d_gifo.ApplyCeiling(diff_clip_);
-      }
-
-      // set zeros to padded frames,
-      if (sequence_lengths_.size() > 0) {
-        for (int s = 0; s < S; s++) {
-          if (t > sequence_lengths_[s]) {
-            d_all.Row(s).SetZero();
-          }
-        }
-      }
-    }
-
-    // g,i,f,o -> x, calculating input derivatives,
-    // forward direction difference
-    in_diff->AddMatMat(1.0, F_DGIFO.RowRange(1*S, T*S), kNoTrans, f_w_gifo_x_, kNoTrans, 0.0);
-    // backward direction difference
-    in_diff->AddMatMat(1.0, B_DGIFO.RowRange(1*S, T*S), kNoTrans, b_w_gifo_x_, kNoTrans, 1.0);
-
-    // lazy initialization of udpate buffers,
-    if (f_w_gifo_x_corr_.NumRows() == 0) {
-      // init delta buffers,
-      // forward direction,
-      f_w_gifo_x_corr_.Resize(4*cell_dim_, input_dim_, kSetZero);
-      f_w_gifo_r_corr_.Resize(4*cell_dim_, proj_dim_, kSetZero);
-      f_bias_corr_.Resize(4*cell_dim_, kSetZero);
-      f_peephole_i_c_corr_.Resize(cell_dim_, kSetZero);
-      f_peephole_f_c_corr_.Resize(cell_dim_, kSetZero);
-      f_peephole_o_c_corr_.Resize(cell_dim_, kSetZero);
-      f_w_r_m_corr_.Resize(proj_dim_, cell_dim_, kSetZero);
-
-      // backward direction,
-      b_w_gifo_x_corr_.Resize(4*cell_dim_, input_dim_, kSetZero);
-      b_w_gifo_r_corr_.Resize(4*cell_dim_, proj_dim_, kSetZero);
-      b_bias_corr_.Resize(4*cell_dim_, kSetZero);
-      b_peephole_i_c_corr_.Resize(cell_dim_, kSetZero);
-      b_peephole_f_c_corr_.Resize(cell_dim_, kSetZero);
-      b_peephole_o_c_corr_.Resize(cell_dim_, kSetZero);
-      b_w_r_m_corr_.Resize(proj_dim_, cell_dim_, kSetZero);
-    }
-
-    // calculate delta
-    const BaseFloat mmt = opts_.momentum;
-
-    // forward direction
-    // weight x -> g, i, f, o
-    f_w_gifo_x_corr_.AddMatMat(1.0, F_DGIFO.RowRange(1*S, T*S), kTrans,
-                                    in,                        kNoTrans, mmt);
-    // recurrent weight r -> g, i, f, o
-    f_w_gifo_r_corr_.AddMatMat(1.0, F_DGIFO.RowRange(1*S, T*S), kTrans,
-                                    F_YR.RowRange(0*S, T*S),    kNoTrans, mmt);
-    // bias of g, i, f, o
-    f_bias_corr_.AddRowSumMat(1.0, F_DGIFO.RowRange(1*S, T*S), mmt);
-
-    // recurrent peephole c -> i
-    f_peephole_i_c_corr_.AddDiagMatMat(1.0, F_DI.RowRange(1*S, T*S), kTrans,
-                                            F_YC.RowRange(0*S, T*S), kNoTrans, mmt);
-    // recurrent peephole c -> f
-    f_peephole_f_c_corr_.AddDiagMatMat(1.0, F_DF.RowRange(1*S, T*S), kTrans,
-                                            F_YC.RowRange(0*S, T*S), kNoTrans, mmt);
-    // peephole c -> o
-    f_peephole_o_c_corr_.AddDiagMatMat(1.0, F_DO.RowRange(1*S, T*S), kTrans,
-                                            F_YC.RowRange(1*S, T*S), kNoTrans, mmt);
-
-    f_w_r_m_corr_.AddMatMat(1.0, F_DR.RowRange(1*S, T*S), kTrans,
-                                 F_YM.RowRange(1*S, T*S), kNoTrans, mmt);
-
-    // backward direction backpropagate
-    // weight x -> g, i, f, o
-    b_w_gifo_x_corr_.AddMatMat(1.0, B_DGIFO.RowRange(1*S, T*S), kTrans, in, kNoTrans, mmt);
-    // recurrent weight r -> g, i, f, o
-    b_w_gifo_r_corr_.AddMatMat(1.0, B_DGIFO.RowRange(1*S, T*S), kTrans,
-                                    B_YR.RowRange(0*S, T*S)   , kNoTrans, mmt);
-    // bias of g, i, f, o
-    b_bias_corr_.AddRowSumMat(1.0, B_DGIFO.RowRange(1*S, T*S), mmt);
-
-    // recurrent peephole c -> i, c(t+1) --> i
-    b_peephole_i_c_corr_.AddDiagMatMat(1.0, B_DI.RowRange(1*S, T*S), kTrans,
-                                            B_YC.RowRange(2*S, T*S), kNoTrans, mmt);
-    // recurrent peephole c -> f, c(t+1) --> f
-    b_peephole_f_c_corr_.AddDiagMatMat(1.0, B_DF.RowRange(1*S, T*S), kTrans,
-                                            B_YC.RowRange(2*S, T*S), kNoTrans, mmt);
-    // peephole c -> o
-    b_peephole_o_c_corr_.AddDiagMatMat(1.0, B_DO.RowRange(1*S, T*S), kTrans,
-                                            B_YC.RowRange(1*S, T*S), kNoTrans, mmt);
-
-    b_w_r_m_corr_.AddMatMat(1.0, B_DR.RowRange(1*S, T*S), kTrans,
-                                 B_YM.RowRange(1*S, T*S), kNoTrans, mmt);
-  }
-
-  void Update(const CuMatrixBase<BaseFloat> &input,
-              const CuMatrixBase<BaseFloat> &diff) {
-
-    // apply the gradient clipping,
-    if (grad_clip_ > 0.0) {
-      f_w_gifo_x_corr_.ApplyFloor(-grad_clip_);
-      f_w_gifo_x_corr_.ApplyCeiling(grad_clip_);
-      f_w_gifo_r_corr_.ApplyFloor(-grad_clip_);
-      f_w_gifo_r_corr_.ApplyCeiling(grad_clip_);
-      f_bias_corr_.ApplyFloor(-grad_clip_);
-      f_bias_corr_.ApplyCeiling(grad_clip_);
-      f_w_r_m_corr_.ApplyFloor(-grad_clip_);
-      f_w_r_m_corr_.ApplyCeiling(grad_clip_);
-      f_peephole_i_c_corr_.ApplyFloor(-grad_clip_);
-      f_peephole_i_c_corr_.ApplyCeiling(grad_clip_);
-      f_peephole_f_c_corr_.ApplyFloor(-grad_clip_);
-      f_peephole_f_c_corr_.ApplyCeiling(grad_clip_);
-      f_peephole_o_c_corr_.ApplyFloor(-grad_clip_);
-      f_peephole_o_c_corr_.ApplyCeiling(grad_clip_);
-
-      b_w_gifo_x_corr_.ApplyFloor(-grad_clip_);
-      b_w_gifo_x_corr_.ApplyCeiling(grad_clip_);
-      b_w_gifo_r_corr_.ApplyFloor(-grad_clip_);
-      b_w_gifo_r_corr_.ApplyCeiling(grad_clip_);
-      b_bias_corr_.ApplyFloor(-grad_clip_);
-      b_bias_corr_.ApplyCeiling(grad_clip_);
-      b_w_r_m_corr_.ApplyFloor(-grad_clip_);
-      b_w_r_m_corr_.ApplyCeiling(grad_clip_);
-      b_peephole_i_c_corr_.ApplyFloor(-grad_clip_);
-      b_peephole_i_c_corr_.ApplyCeiling(grad_clip_);
-      b_peephole_f_c_corr_.ApplyFloor(-grad_clip_);
-      b_peephole_f_c_corr_.ApplyCeiling(grad_clip_);
-      b_peephole_o_c_corr_.ApplyFloor(-grad_clip_);
-      b_peephole_o_c_corr_.ApplyCeiling(grad_clip_);
-    }
-
-    const BaseFloat lr = opts_.learn_rate;
-
-    // forward direction update
-    f_w_gifo_x_.AddMat(-lr * learn_rate_coef_, f_w_gifo_x_corr_);
-    f_w_gifo_r_.AddMat(-lr * learn_rate_coef_, f_w_gifo_r_corr_);
-    f_bias_.AddVec(-lr * bias_learn_rate_coef_, f_bias_corr_, 1.0);
-
-    f_peephole_i_c_.AddVec(-lr * bias_learn_rate_coef_, f_peephole_i_c_corr_, 1.0);
-    f_peephole_f_c_.AddVec(-lr * bias_learn_rate_coef_, f_peephole_f_c_corr_, 1.0);
-    f_peephole_o_c_.AddVec(-lr * bias_learn_rate_coef_, f_peephole_o_c_corr_, 1.0);
-
-    f_w_r_m_.AddMat(-lr * learn_rate_coef_, f_w_r_m_corr_);
-
-    // backward direction update
-    b_w_gifo_x_.AddMat(-lr * learn_rate_coef_, b_w_gifo_x_corr_);
-    b_w_gifo_r_.AddMat(-lr * learn_rate_coef_, b_w_gifo_r_corr_);
-    b_bias_.AddVec(-lr * bias_learn_rate_coef_, b_bias_corr_, 1.0);
-
-    b_peephole_i_c_.AddVec(-lr * bias_learn_rate_coef_, b_peephole_i_c_corr_, 1.0);
-    b_peephole_f_c_.AddVec(-lr * bias_learn_rate_coef_, b_peephole_f_c_corr_, 1.0);
-    b_peephole_o_c_.AddVec(-lr * bias_learn_rate_coef_, b_peephole_o_c_corr_, 1.0);
-
-    b_w_r_m_.AddMat(-lr * learn_rate_coef_, b_w_r_m_corr_);
-  }
-
- private:
-  // dims
-  int32 cell_dim_;  ///< the number of memory-cell blocks,
-  int32 proj_dim_;  ///< recurrent projection layer dim,
-
-  BaseFloat cell_clip_;  ///< Clipping of 'cell-values' in forward pass (per-frame),
-  BaseFloat diff_clip_;  ///< Clipping of 'derivatives' in backprop (per-frame),
-  BaseFloat cell_diff_clip_; ///< Clipping of 'cell-derivatives' accumulated over CEC (per-frame),
-  BaseFloat grad_clip_;  ///< Clipping of the updates,
-
-  // feed-forward connections: from x to [g, i, f, o]
-  // forward direction
-  CuMatrix<BaseFloat> f_w_gifo_x_;
-  CuMatrix<BaseFloat> f_w_gifo_x_corr_;
-  // backward direction
-  CuMatrix<BaseFloat> b_w_gifo_x_;
-  CuMatrix<BaseFloat> b_w_gifo_x_corr_;
-
-  // recurrent projection connections: from r to [g, i, f, o]
-  // forward direction
-  CuMatrix<BaseFloat> f_w_gifo_r_;
-  CuMatrix<BaseFloat> f_w_gifo_r_corr_;
-  // backward direction
-  CuMatrix<BaseFloat> b_w_gifo_r_;
-  CuMatrix<BaseFloat> b_w_gifo_r_corr_;
-
-  // biases of [g, i, f, o]
-  // forward direction
-  CuVector<BaseFloat> f_bias_;
-  CuVector<BaseFloat> f_bias_corr_;
-  // backward direction
-  CuVector<BaseFloat> b_bias_;
-  CuVector<BaseFloat> b_bias_corr_;
-
-  // peephole from c to i, f, g
-  // peephole connections are diagonal, so we use vector form,
-  // forward direction
-  CuVector<BaseFloat> f_peephole_i_c_;
-  CuVector<BaseFloat> f_peephole_f_c_;
-  CuVector<BaseFloat> f_peephole_o_c_;
-  // backward direction
-  CuVector<BaseFloat> b_peephole_i_c_;
-  CuVector<BaseFloat> b_peephole_f_c_;
-  CuVector<BaseFloat> b_peephole_o_c_;
-
-  // forward direction
-  CuVector<BaseFloat> f_peephole_i_c_corr_;
-  CuVector<BaseFloat> f_peephole_f_c_corr_;
-  CuVector<BaseFloat> f_peephole_o_c_corr_;
-  // backward direction
-  CuVector<BaseFloat> b_peephole_i_c_corr_;
-  CuVector<BaseFloat> b_peephole_f_c_corr_;
-  CuVector<BaseFloat> b_peephole_o_c_corr_;
-
-  // projection layer r: from m to r
-  // forward direction
-  CuMatrix<BaseFloat> f_w_r_m_;
-  CuMatrix<BaseFloat> f_w_r_m_corr_;
-  // backward direction
-  CuMatrix<BaseFloat> b_w_r_m_;
-  CuMatrix<BaseFloat> b_w_r_m_corr_;
-
-  // propagate buffer: output of [g, i, f, o, c, h, m, r]
-  // forward direction
-  CuMatrix<BaseFloat> f_propagate_buf_;
-  // backward direction
-  CuMatrix<BaseFloat> b_propagate_buf_;
-
-  // back-propagate buffer: diff-input of [g, i, f, o, c, h, m, r]
-  // forward direction
-  CuMatrix<BaseFloat> f_backpropagate_buf_;
-  // backward direction
-  CuMatrix<BaseFloat> b_backpropagate_buf_;
-};  // class BlstmProjected
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_BLSTM_PROJECTED_H_
diff --git a/src/nnet/nnet-component-test.cc b/src/nnet/nnet-component-test.cc
deleted file mode 100644
index da181bd18f6..00000000000
--- a/src/nnet/nnet-component-test.cc
+++ /dev/null
@@ -1,451 +0,0 @@
-// nnet/nnet-component-test.cc
-// Copyright 2014-2015  Brno University of Technology (author: Karel Vesely),
-//                      The Johns Hopkins University (author: Sri Harish Mallidi)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <sstream>
-#include <fstream>
-#include <algorithm>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-nnet.h"
-#include "nnet/nnet-convolutional-component.h"
-#include "nnet/nnet-convolutional-2d-component.h"
-#include "nnet/nnet-max-pooling-component.h"
-#include "nnet/nnet-max-pooling-2d-component.h"
-#include "nnet/nnet-average-pooling-2d-component.h"
-#include "util/common-utils.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-  /*
-   * Helper functions
-   */
-  template<typename Real>
-  void ReadCuMatrixFromString(const std::string& s, CuMatrix<Real>* m) {
-    std::istringstream is(s + "\n");
-    m->Read(is, false);  // false for ascii
-  }
-
-  Component* ReadComponentFromString(const std::string& s) {
-    std::istringstream is(s + "\n");
-    return Component::Read(is, false);  // false for ascii
-  }
-
-
-  /*
-   * Unit tests,
-   */
-  void UnitTestLengthNorm() {
-    // make L2-length normalization component,
-    Component* c = ReadComponentFromString("<LengthNormComponent> 5 5");
-    // prepare input,
-    CuMatrix<BaseFloat> mat_in;
-    ReadCuMatrixFromString("[ 1 2 3 4 5 \n 2 3 5 6 8 ] ", &mat_in);
-    // propagate,
-    CuMatrix<BaseFloat> mat_out;
-    c->Propagate(mat_in, &mat_out);
-    // check the length,
-    mat_out.MulElements(mat_out);  // ^2,
-    CuVector<BaseFloat> check_length_is_one(2);
-    check_length_is_one.AddColSumMat(1.0, mat_out, 0.0);  // sum_of_cols(x^2),
-    check_length_is_one.ApplyPow(0.5);  // L2norm = sqrt(sum_of_cols(x^2)),
-    CuVector<BaseFloat> ones(2);
-    ones.Set(1.0);
-    AssertEqual(check_length_is_one, ones);
-  }
-
-  void UnitTestSimpleSentenceAveragingComponent() {
-    // make SimpleSentenceAveraging component,
-    Component* c = ReadComponentFromString(
-      "<SimpleSentenceAveragingComponent> 2 2 <GradientBoost> 10.0"
-    );
-    // prepare input,
-    CuMatrix<BaseFloat> mat_in;
-    ReadCuMatrixFromString("[ 0 0.5 \n 1 1 \n 2 1.5 ] ", &mat_in);
-
-    // propagate,
-    CuMatrix<BaseFloat> mat_out;
-    c->Propagate(mat_in, &mat_out);
-    // check the output,
-    CuVector<BaseFloat> ones(2);
-    ones.Set(1.0);
-    for (int32 i = 0; i < mat_out.NumRows(); i++) {
-      AssertEqual(mat_out.Row(i), ones);
-    }
-
-    // backpropagate,
-    CuMatrix<BaseFloat> dummy1(3, 2), dummy2(3, 2), diff_out(mat_in), diff_in;
-    // the average 1.0 in 'diff_in' will be boosted by 10.0,
-    c->Backpropagate(dummy1, dummy2, diff_out, &diff_in);
-    // check the output,
-    CuVector<BaseFloat> tens(2); tens.Set(10);
-    for (int32 i = 0; i < diff_in.NumRows(); i++) {
-      AssertEqual(diff_in.Row(i), tens);
-    }
-  }
-
-  void UnitTestConvolutionalComponentUnity() {
-    // make 'identity' convolutional component,
-    Component* c = ReadComponentFromString("<ConvolutionalComponent> 5 5 \
-      <PatchDim> 1 <PatchStep> 1 <PatchStride> 5 \
-      <LearnRateCoef> 1.0 <BiasLearnRateCoef> 1.0 \
-      <MaxNorm> 0 \
-      <Filters> [ 1 \
-      ] <Bias> [ 0 ]"
-    );
-
-    // prepare input,
-    CuMatrix<BaseFloat> mat_in;
-    ReadCuMatrixFromString("[ 1 2 3 4 5 ] ", &mat_in);
-
-    // propagate,
-    CuMatrix<BaseFloat> mat_out;
-    c->Propagate(mat_in, &mat_out);
-    KALDI_LOG << "mat_in" << mat_in << "mat_out" << mat_out;
-    AssertEqual(mat_in, mat_out);
-
-    // backpropagate,
-    CuMatrix<BaseFloat> mat_out_diff(mat_in), mat_in_diff;
-    c->Backpropagate(mat_in, mat_out, mat_out_diff, &mat_in_diff);
-    KALDI_LOG << "mat_out_diff " << mat_out_diff
-              << " mat_in_diff " << mat_in_diff;
-    AssertEqual(mat_out_diff, mat_in_diff);
-
-    // clean,
-    delete c;
-  }
-
-  void UnitTestConvolutionalComponent3x3() {
-    // make 3x3 convolutional component,
-    // design such weights and input so output is zero,
-    Component* c = ReadComponentFromString("<ConvolutionalComponent> 9 15 \
-      <PatchDim> 3 <PatchStep> 1 <PatchStride> 5 \
-      <LearnRateCoef> 1.0 <BiasLearnRateCoef> 1.0 \
-      <MaxNorm> 0 \
-      <Filters> [ -1 -2 -7   0 0 0   1 2 7 ; \
-                  -1  0  1  -3 0 3  -2 2 0 ; \
-                  -4  0  0  -3 0 3   4 0 0 ] \
-      <Bias> [ -20 -20 -20 ]"
-    );
-
-    // prepare input, reference output,
-    CuMatrix<BaseFloat> mat_in;
-    ReadCuMatrixFromString("[ 1 3 5 7 9  2 4 6 8 10  3 5 7 9 11 ]", &mat_in);
-    CuMatrix<BaseFloat> mat_out_ref;
-    ReadCuMatrixFromString("[ 0 0 0  0 0 0  0 0 0 ]", &mat_out_ref);
-
-    // propagate,
-    CuMatrix<BaseFloat> mat_out;
-    c->Propagate(mat_in, &mat_out);
-    KALDI_LOG << "mat_in" << mat_in << "mat_out" << mat_out;
-    AssertEqual(mat_out, mat_out_ref);
-
-    // prepare mat_out_diff, mat_in_diff_ref,
-    CuMatrix<BaseFloat> mat_out_diff;
-    ReadCuMatrixFromString("[ 1 0 0  1 1 0  1 1 1 ]", &mat_out_diff);
-    // hand-computed back-propagated values,
-    CuMatrix<BaseFloat> mat_in_diff_ref;
-    ReadCuMatrixFromString("[ -1 -4 -15 -8 -6   0 -3 -6 3 6   1 1 14 11 7 ]",
-                           &mat_in_diff_ref);
-
-    // backpropagate,
-    CuMatrix<BaseFloat> mat_in_diff;
-    c->Backpropagate(mat_in, mat_out, mat_out_diff, &mat_in_diff);
-    KALDI_LOG << "mat_in_diff " << mat_in_diff
-              << " mat_in_diff_ref " << mat_in_diff_ref;
-    AssertEqual(mat_in_diff, mat_in_diff_ref);
-
-    // clean,
-    delete c;
-  }
-
-
-  void UnitTestMaxPoolingComponent() {
-    // make max-pooling component, assuming 4 conv. neurons,
-    // non-overlapping pool of size 3,
-    Component* c = Component::Init(
-        "<MaxPoolingComponent> <InputDim> 24 <OutputDim> 8 \
-         <PoolSize> 3 <PoolStep> 3 <PoolStride> 4"
-    );
-
-    // input matrix,
-    CuMatrix<BaseFloat> mat_in;
-    ReadCuMatrixFromString("[ 3 8 2 9 \
-                              8 3 9 3 \
-                              2 4 9 6 \
-                              \
-                              2 4 2 0 \
-                              6 4 9 4 \
-                              7 3 0 3;\
-                              \
-                              5 4 7 8 \
-                              3 9 5 6 \
-                              3 4 8 9 \
-                              \
-                              5 4 5 6 \
-                              3 1 4 5 \
-                              8 2 1 7 ]", &mat_in);
-
-    // expected output (max values in columns),
-    CuMatrix<BaseFloat> mat_out_ref;
-    ReadCuMatrixFromString("[ 8 8 9 9 \
-                              7 4 9 4;\
-                              5 9 8 9 \
-                              8 4 5 7 ]", &mat_out_ref);
-
-    // propagate,
-    CuMatrix<BaseFloat> mat_out;
-    c->Propagate(mat_in, &mat_out);
-    KALDI_LOG << "mat_out" << mat_out << "mat_out_ref" << mat_out_ref;
-    AssertEqual(mat_out, mat_out_ref);
-
-    // locations of max values will be shown,
-    CuMatrix<BaseFloat> mat_out_diff(mat_out);
-    mat_out_diff.Set(1);
-    // expected backpropagated values (hand-computed),
-    CuMatrix<BaseFloat> mat_in_diff_ref;
-    ReadCuMatrixFromString("[ 0 1 0 1 \
-                              1 0 1 0 \
-                              0 0 1 0 \
-                              \
-                              0 1 0 0 \
-                              0 1 1 1 \
-                              1 0 0 0;\
-                              \
-                              1 0 0 0 \
-                              0 1 0 0 \
-                              0 0 1 1 \
-                              \
-                              0 1 1 0 \
-                              0 0 0 0 \
-                              1 0 0 1 ]", &mat_in_diff_ref);
-    // backpropagate,
-    CuMatrix<BaseFloat> mat_in_diff;
-    c->Backpropagate(mat_in, mat_out, mat_out_diff, &mat_in_diff);
-    KALDI_LOG << "mat_in_diff " << mat_in_diff
-              << " mat_in_diff_ref " << mat_in_diff_ref;
-    AssertEqual(mat_in_diff, mat_in_diff_ref);
-
-    delete c;
-  }
-
-  void UnitTestMaxPooling2DComponent() { /* Implemented by Harish Mallidi */
-    // make max-pooling2d component
-    Component* c = Component::Init(
-      "<MaxPooling2DComponent> <InputDim> 56 <OutputDim> 18 \
-       <FmapXLen> 4 <FmapYLen> 7 <PoolXLen> 2 <PoolYLen> 3 \
-       <PoolXStep> 1 <PoolYStep> 2"
-    );
-
-    // input matrix,
-    CuMatrix<BaseFloat> mat_in;
-    ReadCuMatrixFromString("[ 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 \
-      11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 18 19 19 20 20 21 21 \
-      22 22 23 23 24 24 25 25 26 26 27 27 ]", &mat_in);
-
-    // expected output (max values in the patch)
-    CuMatrix<BaseFloat> mat_out_ref;
-    ReadCuMatrixFromString("[ 9 9 11 11 13 13 16 16 18 18 \
-      20 20 23 23 25 25 27 27 ]", &mat_out_ref);
-
-    // propagate,
-    CuMatrix<BaseFloat> mat_out;
-    c->Propagate(mat_in, &mat_out);
-    KALDI_LOG << "mat_out" << mat_out << "mat_out_ref" << mat_out_ref;
-    AssertEqual(mat_out, mat_out_ref);
-
-
-    // locations of max values will be shown
-    CuMatrix<BaseFloat> mat_out_diff(mat_out);
-    ReadCuMatrixFromString(
-      "[ 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 ]", &mat_out_diff
-    );
-
-    // expected backpropagated values,
-    CuMatrix<BaseFloat> mat_in_diff_ref;  // hand-computed back-propagated values,
-    ReadCuMatrixFromString("[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 \
-      0.25 0.25 0 0 1 1 0 0 0 0 0.75 0.75 0 0 1 1 0 0 2.5 2.5 \
-      0 0 0 0 3 3 0 0 3.5 3.5 0 0 8 8 ]", &mat_in_diff_ref
-    );
-
-    // backpropagate,
-    CuMatrix<BaseFloat> mat_in_diff;
-    c->Backpropagate(mat_in, mat_out, mat_out_diff, &mat_in_diff);
-    KALDI_LOG << "mat_in_diff " << mat_in_diff
-              << " mat_in_diff_ref " << mat_in_diff_ref;
-    AssertEqual(mat_in_diff, mat_in_diff_ref);
-
-    delete c;
-  }
-
-  void UnitTestAveragePooling2DComponent() { /* Implemented by Harish Mallidi */
-    // make average-pooling2d component
-    Component* c = Component::Init(
-      "<AveragePooling2DComponent> <InputDim> 56 <OutputDim> 18 \
-       <FmapXLen> 4 <FmapYLen> 7 <PoolXLen> 2 <PoolYLen> 3 \
-       <PoolXStep> 1 <PoolYStep> 2"
-    );
-
-    // input matrix,
-    CuMatrix<BaseFloat> mat_in;
-    ReadCuMatrixFromString("[ 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 \
-      11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 18 19 19 20 20 \
-      21 21 22 22 23 23 24 24 25 25 26 26 27 27 ]", &mat_in);
-
-    // expected output (max values in the patch)
-    CuMatrix<BaseFloat> mat_out_ref;
-    ReadCuMatrixFromString("[ 4.5 4.5 6.5 6.5 8.5 8.5 11.5 11.5 13.5 13.5 \
-      15.5 15.5 18.5 18.5 20.5 20.5 22.5 22.5 ]", &mat_out_ref);
-
-    // propagate,
-    CuMatrix<BaseFloat> mat_out;
-    c->Propagate(mat_in, &mat_out);
-    KALDI_LOG << "mat_out" << mat_out << "mat_out_ref" << mat_out_ref;
-    AssertEqual(mat_out, mat_out_ref);
-
-
-    // locations of max values will be shown
-    CuMatrix<BaseFloat> mat_out_diff(mat_out);
-    ReadCuMatrixFromString("[ 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 ]", &mat_out_diff);
-
-    // expected backpropagated values,
-    CuMatrix<BaseFloat> mat_in_diff_ref;  // hand-computed back-propagated values,
-    ReadCuMatrixFromString("[  0 0 0 0 0.0833333 0.0833333 0.166667 0.166667 \
-      0.25 0.25 0.333333 0.333333 0.333333 0.333333 0.25 0.25 0.25 0.25 \
-      0.333333 0.333333 0.416667 0.416667 0.5 0.5 0.583333 0.583333 0.583333 \
-      0.583333 0.75 0.75 0.75 0.75 0.833333 0.833333 0.916667 0.916667 1 1 \
-      1.08333 1.08333 1.08333 1.08333 1 1 1 1 1.08333 1.08333 1.16667 1.16667 \
-      1.25 1.25 1.33333 1.33333 1.33333 1.33333 ]", &mat_in_diff_ref
-    );
-
-    // backpropagate,
-    CuMatrix<BaseFloat> mat_in_diff;
-    c->Backpropagate(mat_in, mat_out, mat_out_diff, &mat_in_diff);
-    KALDI_LOG << "mat_in_diff " << mat_in_diff
-              << " mat_in_diff_ref " << mat_in_diff_ref;
-    AssertEqual(mat_in_diff, mat_in_diff_ref);
-
-    delete c;
-  }
-
-
-  void UnitTestConvolutional2DComponent() { /* Implemented by Harish Mallidi */
-    // Convolutional2D component
-    Component* c = ReadComponentFromString("<Convolutional2DComponent> 18 56 \
-      <LearnRateCoef> 0 <BiasLearnRateCoef> 0 <FmapXLen> 4 <FmapYLen> 7 \
-      <FiltXLen> 2 <FiltYLen> 3 <FiltXStep> 1 <FiltYStep> 2 <ConnectFmap> 1 \
-      <Filters> [ 0 0 1 1 2 2 3 3 4 4 5 5 ; 0 0 1 1 2 2 3 3 4 4 5 5 ] \
-      <Bias> [ 0 0 ]"
-    );
-
-    // input matrix
-    CuMatrix<BaseFloat> mat_in;
-    ReadCuMatrixFromString("[ 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 \
-      11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 18 19 19 20 20 \
-      21 21 22 22 23 23 24 24 25 25 26 26 27 27 ]", &mat_in);
-
-    CuMatrix<BaseFloat> mat_out_ref;
-    ReadCuMatrixFromString("[ 206 206 266 266 326 326 416 416 476 476 536 536 \
-      626 626 686 686 746 746 ]", &mat_out_ref);
-
-    // propagate
-    CuMatrix<BaseFloat> mat_out;
-    c->Propagate(mat_in, &mat_out);
-    KALDI_LOG << "mat_out" << mat_out << "mat_out" << mat_out_ref;
-    AssertEqual(mat_out, mat_out_ref);
-
-    // prepare mat_out_diff, mat_in_diff_ref,
-    CuMatrix<BaseFloat> mat_out_diff;
-    ReadCuMatrixFromString("[ 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 ]",
-                           &mat_out_diff);
-
-    CuMatrix<BaseFloat> mat_in_diff_ref;
-    ReadCuMatrixFromString("[ 0 0 0 0 0 0 2 2 2 2 4 4 8 8 0 0 3 3 4.5 4.5 8 8 \
-      9.5 9.5 13 13 20 20 9 9 18 18 19.5 19.5 23 23 24.5 24.5 28 28 41 41 \
-      36 36 48 48 51 51 56 56 59 59 64 64 80 80 ]", &mat_in_diff_ref);
-
-    // backpropagate
-    CuMatrix<BaseFloat> mat_in_diff;
-    c->Backpropagate(mat_in, mat_out, mat_out_diff, &mat_in_diff);
-    KALDI_LOG << "mat_in_diff " << mat_in_diff
-              << " mat_in_diff_ref " << mat_in_diff_ref;
-    AssertEqual(mat_in_diff, mat_in_diff_ref);
-
-    delete c;
-  }
-
-  void UnitTestDropoutComponent() {
-    Component* c = ReadComponentFromString("<Dropout> 100 100 <DropoutRetention> 0.7");
-    // buffers,
-    CuMatrix<BaseFloat> in(777, 100),
-                        out,
-                        out_diff,
-                        in_diff;
-    // init,
-    in.Set(2.0);
-
-    // propagate,
-    c->Propagate(in, &out);
-    AssertEqual(in.Sum(), out.Sum(), 0.01);
-
-    // backprop,
-    out_diff = in;
-    c->Backpropagate(in, out, out_diff, &in_diff);
-    AssertEqual(in_diff, out);
-
-    delete c;
-  }
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-int main() {
-  using namespace kaldi;
-  using namespace kaldi::nnet1;
-
-  for (kaldi::int32 loop = 0; loop < 2; loop++) {
-#if HAVE_CUDA == 1
-    if (loop == 0)
-      // use no GPU,
-      CuDevice::Instantiate().SelectGpuId("no");
-    else
-      // use GPU when available,
-      CuDevice::Instantiate().SelectGpuId("optional");
-#endif
-    // unit-tests :
-    UnitTestLengthNorm();
-    UnitTestSimpleSentenceAveragingComponent();
-    UnitTestConvolutionalComponentUnity();
-    UnitTestConvolutionalComponent3x3();
-    UnitTestMaxPoolingComponent();
-    UnitTestConvolutional2DComponent();
-    UnitTestMaxPooling2DComponent();
-    UnitTestAveragePooling2DComponent();
-    UnitTestDropoutComponent();
-    // end of unit-tests,
-    if (loop == 0)
-        KALDI_LOG << "Tests without GPU use succeeded.";
-      else
-        KALDI_LOG << "Tests with GPU use (if available) succeeded.";
-  }
-#if HAVE_CUDA == 1
-  CuDevice::Instantiate().PrintProfile();
-#endif
-  return 0;
-}
diff --git a/src/nnet/nnet-component.cc b/src/nnet/nnet-component.cc
deleted file mode 100644
index 34f988972a0..00000000000
--- a/src/nnet/nnet-component.cc
+++ /dev/null
@@ -1,288 +0,0 @@
-// nnet/nnet-component.cc
-
-// Copyright 2011-2013  Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include <algorithm>
-#include <sstream>
-
-#include "nnet/nnet-component.h"
-
-#include "nnet/nnet-nnet.h"
-#include "nnet/nnet-activation.h"
-#include "nnet/nnet-kl-hmm.h"
-#include "nnet/nnet-affine-transform.h"
-#include "nnet/nnet-linear-transform.h"
-#include "nnet/nnet-rbm.h"
-#include "nnet/nnet-various.h"
-
-#include "nnet/nnet-convolutional-component.h"
-#include "nnet/nnet-average-pooling-component.h"
-#include "nnet/nnet-max-pooling-component.h"
-
-#include "nnet/nnet-convolutional-2d-component.h"
-#include "nnet/nnet-average-pooling-2d-component.h"
-#include "nnet/nnet-max-pooling-2d-component.h"
-
-#include "nnet/nnet-lstm-projected.h"
-#include "nnet/nnet-blstm-projected.h"
-#include "nnet/nnet-recurrent.h"
-
-#include "nnet/nnet-sentence-averaging-component.h"
-#include "nnet/nnet-frame-pooling-component.h"
-#include "nnet/nnet-parallel-component.h"
-#include "nnet/nnet-multibasis-component.h"
-#include "nnet/nnet-parametric-relu.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-const struct Component::key_value Component::kMarkerMap[] = {
-  { Component::kAffineTransform, "<AffineTransform>" },
-  { Component::kLinearTransform, "<LinearTransform>" },
-  { Component::kConvolutionalComponent, "<ConvolutionalComponent>" },
-  { Component::kConvolutional2DComponent, "<Convolutional2DComponent>" },
-  { Component::kLstmProjected, "<LstmProjected>" },
-  { Component::kLstmProjected, "<LstmProjectedStreams>" }, // bwd compat.
-  { Component::kBlstmProjected, "<BlstmProjected>" },
-  { Component::kBlstmProjected, "<BlstmProjectedStreams>" }, // bwd compat.
-  { Component::kRecurrentComponent, "<RecurrentComponent>" },
-  { Component::kSoftmax, "<Softmax>" },
-  { Component::kHiddenSoftmax, "<HiddenSoftmax>" },
-  { Component::kBlockSoftmax, "<BlockSoftmax>" },
-  { Component::kSigmoid, "<Sigmoid>" },
-  { Component::kTanh, "<Tanh>" },
-  { Component::kParametricRelu,"<ParametricRelu>" },
-  { Component::kDropout, "<Dropout>" },
-  { Component::kLengthNormComponent, "<LengthNormComponent>" },
-  { Component::kRbm, "<Rbm>" },
-  { Component::kSplice, "<Splice>" },
-  { Component::kCopy, "<Copy>" },
-  { Component::kAddShift, "<AddShift>" },
-  { Component::kRescale, "<Rescale>" },
-  { Component::kKlHmm, "<KlHmm>" },
-  { Component::kAveragePoolingComponent, "<AveragePoolingComponent>" },
-  { Component::kAveragePooling2DComponent, "<AveragePooling2DComponent>" },
-  { Component::kMaxPoolingComponent, "<MaxPoolingComponent>" },
-  { Component::kMaxPooling2DComponent, "<MaxPooling2DComponent>" },
-  { Component::kSentenceAveragingComponent, "<SentenceAveragingComponent>" },
-  { Component::kSimpleSentenceAveragingComponent, "<SimpleSentenceAveragingComponent>" },
-  { Component::kFramePoolingComponent, "<FramePoolingComponent>" },
-  { Component::kParallelComponent, "<ParallelComponent>" },
-  { Component::kMultiBasisComponent, "<MultiBasisComponent>" },
-};
-
-
-const char* Component::TypeToMarker(ComponentType t) {
-  // Retuns the 1st '<string>' corresponding to the type in 'kMarkerMap',
-  int32 N = sizeof(kMarkerMap) / sizeof(kMarkerMap[0]);
-  for (int i = 0; i < N; i++) {
-    if (kMarkerMap[i].key == t) return kMarkerMap[i].value;
-  }
-  KALDI_ERR << "Unknown type : " << t;
-  return NULL;
-}
-
-Component::ComponentType Component::MarkerToType(const std::string &s) {
-  std::string s_lowercase(s);
-  std::transform(s.begin(), s.end(), s_lowercase.begin(), ::tolower);  // lc
-  int32 N = sizeof(kMarkerMap) / sizeof(kMarkerMap[0]);
-  for (int i = 0; i < N; i++) {
-    std::string m(kMarkerMap[i].value);
-    std::string m_lowercase(m);
-    std::transform(m.begin(), m.end(), m_lowercase.begin(), ::tolower);
-    if (s_lowercase == m_lowercase) return kMarkerMap[i].key;
-  }
-  KALDI_ERR << "Unknown 'Component' marker : '" << s << "'\n"
-            << "(isn't the model 'too old' or incompatible?)";
-  return kUnknown;
-}
-
-
-Component* Component::NewComponentOfType(ComponentType comp_type,
-                      int32 input_dim, int32 output_dim) {
-  Component *ans = NULL;
-  switch (comp_type) {
-    case Component::kAffineTransform :
-      ans = new AffineTransform(input_dim, output_dim);
-      break;
-    case Component::kLinearTransform :
-      ans = new LinearTransform(input_dim, output_dim);
-      break;
-    case Component::kConvolutionalComponent :
-      ans = new ConvolutionalComponent(input_dim, output_dim);
-      break;
-    case Component::kConvolutional2DComponent :
-      ans = new Convolutional2DComponent(input_dim, output_dim);
-      break;
-    case Component::kLstmProjected :
-      ans = new LstmProjected(input_dim, output_dim);
-      break;
-    case Component::kBlstmProjected :
-      ans = new BlstmProjected(input_dim, output_dim);
-      break;
-    case Component::kRecurrentComponent :
-      ans = new RecurrentComponent(input_dim, output_dim);
-      break;
-    case Component::kSoftmax :
-      ans = new Softmax(input_dim, output_dim);
-      break;
-    case Component::kHiddenSoftmax :
-      ans = new HiddenSoftmax(input_dim, output_dim);
-      break;
-    case Component::kBlockSoftmax :
-      ans = new BlockSoftmax(input_dim, output_dim);
-      break;
-    case Component::kSigmoid :
-      ans = new Sigmoid(input_dim, output_dim);
-      break;
-    case Component::kTanh :
-      ans = new Tanh(input_dim, output_dim);
-      break;
-    case Component::kParametricRelu :
-      ans = new ParametricRelu(input_dim, output_dim);
-      break;
-    case Component::kDropout :
-      ans = new Dropout(input_dim, output_dim);
-      break;
-    case Component::kLengthNormComponent :
-      ans = new LengthNormComponent(input_dim, output_dim);
-      break;
-    case Component::kRbm :
-      ans = new Rbm(input_dim, output_dim);
-      break;
-    case Component::kSplice :
-      ans = new Splice(input_dim, output_dim);
-      break;
-    case Component::kCopy :
-      ans = new CopyComponent(input_dim, output_dim);
-      break;
-    case Component::kAddShift :
-      ans = new AddShift(input_dim, output_dim);
-      break;
-    case Component::kRescale :
-      ans = new Rescale(input_dim, output_dim);
-      break;
-    case Component::kKlHmm :
-      ans = new KlHmm(input_dim, output_dim);
-      break;
-    case Component::kSentenceAveragingComponent :
-      ans = new SentenceAveragingComponent(input_dim, output_dim);
-      break;
-    case Component::kSimpleSentenceAveragingComponent :
-      ans = new SimpleSentenceAveragingComponent(input_dim, output_dim);
-      break;
-    case Component::kAveragePoolingComponent :
-      ans = new AveragePoolingComponent(input_dim, output_dim);
-      break;
-    case Component::kAveragePooling2DComponent :
-      ans = new AveragePooling2DComponent(input_dim, output_dim);
-      break;
-    case Component::kMaxPoolingComponent :
-      ans = new MaxPoolingComponent(input_dim, output_dim);
-      break;
-    case Component::kMaxPooling2DComponent :
-      ans = new MaxPooling2DComponent(input_dim, output_dim);
-      break;
-    case Component::kFramePoolingComponent :
-      ans = new FramePoolingComponent(input_dim, output_dim);
-      break;
-    case Component::kParallelComponent :
-      ans = new ParallelComponent(input_dim, output_dim);
-      break;
-    case Component::kMultiBasisComponent :
-      ans = new MultiBasisComponent(input_dim, output_dim);
-      break;
-    case Component::kUnknown :
-    default :
-      KALDI_ERR << "Missing type: " << TypeToMarker(comp_type);
-  }
-  return ans;
-}
-
-
-Component* Component::Init(const std::string &conf_line) {
-  std::istringstream is(conf_line);
-  std::string component_type_string;
-  int32 input_dim, output_dim;
-
-  // initialize component w/o internal data
-  ReadToken(is, false, &component_type_string);
-  ComponentType component_type = MarkerToType(component_type_string);
-  ExpectToken(is, false, "<InputDim>");
-  ReadBasicType(is, false, &input_dim);
-  ExpectToken(is, false, "<OutputDim>");
-  ReadBasicType(is, false, &output_dim);
-  Component *ans = NewComponentOfType(component_type, input_dim, output_dim);
-
-  // initialize internal data with the remaining part of config line
-  ans->InitData(is);
-
-  return ans;
-}
-
-
-Component* Component::Read(std::istream &is, bool binary) {
-  int32 dim_out, dim_in;
-  std::string token;
-
-  int first_char = Peek(is, binary);
-  if (first_char == EOF) return NULL;
-
-  ReadToken(is, binary, &token);
-  // Skip the optional initial token,
-  if (token == "<Nnet>") {
-    ReadToken(is, binary, &token);
-  }
-  // Network ends after terminal token appears,
-  if (token == "</Nnet>") {
-    return NULL;
-  }
-
-  // Read the dims,
-  ReadBasicType(is, binary, &dim_out);
-  ReadBasicType(is, binary, &dim_in);
-
-  // Create the component,
-  Component *ans = NewComponentOfType(MarkerToType(token), dim_in, dim_out);
-
-  // Read the content,
-  ans->ReadData(is, binary);
-
-  // 'Eat' the component separtor (can be already consumed by 'ReadData(.)'),
-  if ('<' == Peek(is, binary) && '!' == PeekToken(is, binary)) {
-    ExpectToken(is, binary, "<!EndOfComponent>");
-  }
-
-  return ans;
-}
-
-
-void Component::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, Component::TypeToMarker(GetType()));
-  WriteBasicType(os, binary, OutputDim());
-  WriteBasicType(os, binary, InputDim());
-  if (!binary) os << "\n";
-  this->WriteData(os, binary);
-  WriteToken(os, binary, "<!EndOfComponent>");  // Write component separator.
-  if (!binary) os << "\n";
-}
-
-
-}  // namespace nnet1
-}  // namespace kaldi
diff --git a/src/nnet/nnet-component.h b/src/nnet/nnet-component.h
deleted file mode 100644
index 2ef56622ca8..00000000000
--- a/src/nnet/nnet-component.h
+++ /dev/null
@@ -1,358 +0,0 @@
-// nnet/nnet-component.h
-
-// Copyright 2011-2016  Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-
-#ifndef KALDI_NNET_NNET_COMPONENT_H_
-#define KALDI_NNET_NNET_COMPONENT_H_
-
-#include <iostream>
-#include <string>
-
-#include "base/kaldi-common.h"
-#include "matrix/matrix-lib.h"
-#include "cudamatrix/cu-matrix.h"
-#include "cudamatrix/cu-vector.h"
-#include "nnet/nnet-trnopts.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-/**
- * Abstract class, building block of the network.
- * It is able to propagate (PropagateFnc: compute the output based on its input)
- * and backpropagate (BackpropagateFnc: i.e. transform loss derivative w.r.t. output to derivative w.r.t. the input)
- * the formulas are implemented in descendant classes (AffineTransform,Sigmoid,Softmax,...).
- */
-class Component {
- /// Component type identification mechanism,
- public:
-  /// Types of Components,
-  typedef enum {
-    kUnknown = 0x0,
-
-    kUpdatableComponent = 0x0100,
-    kAffineTransform,
-    kLinearTransform,
-    kConvolutionalComponent,
-    kConvolutional2DComponent,
-    kLstmProjected,
-    kBlstmProjected,
-    kRecurrentComponent,
-
-    kActivationFunction = 0x0200,
-    kSoftmax,
-    kHiddenSoftmax,
-    kBlockSoftmax,
-    kSigmoid,
-    kTanh,
-    kParametricRelu,
-    kDropout,
-    kLengthNormComponent,
-
-    kTranform = 0x0400,
-    kRbm,
-    kSplice,
-    kCopy,
-    kTranspose,
-    kBlockLinearity,
-    kAddShift,
-    kRescale,
-
-    kKlHmm = 0x0800,
-    kSentenceAveragingComponent, /* deprecated */
-    kSimpleSentenceAveragingComponent,
-    kAveragePoolingComponent,
-    kAveragePooling2DComponent,
-    kMaxPoolingComponent,
-    kMaxPooling2DComponent,
-    kFramePoolingComponent,
-    kParallelComponent,
-    kMultiBasisComponent
-  } ComponentType;
-
-  /// A pair of type and marker,
-  struct key_value {
-    const Component::ComponentType key;
-    const char *value;
-  };
-
-  /// The table with pairs of Component types and markers
-  /// (defined in nnet-component.cc),
-  static const struct key_value kMarkerMap[];
-
-  /// Converts component type to marker,
-  static const char* TypeToMarker(ComponentType t);
-
-  /// Converts marker to component type (case insensitive),
-  static ComponentType MarkerToType(const std::string &s);
-
- /// Generic interface of a component,
- public:
-  Component(int32 input_dim, int32 output_dim):
-    input_dim_(input_dim),
-    output_dim_(output_dim)
-  { }
-
-  virtual ~Component()
-  { }
-
-  /// Copy component (deep copy),
-  virtual Component* Copy() const = 0;
-
-  /// Get Type Identification of the component,
-  virtual ComponentType GetType() const = 0;
-
-  /// Check if componeny has 'Updatable' interface (trainable components),
-  virtual bool IsUpdatable() const {
-    return false;
-  }
-
-  /// Check if component has 'Recurrent' interface (trainable and recurrent),
-  virtual bool IsMultistream() const {
-    return false;
-  }
-
-  /// Get the dimension of the input,
-  int32 InputDim() const {
-    return input_dim_;
-  }
-
-  /// Get the dimension of the output,
-  int32 OutputDim() const {
-    return output_dim_;
-  }
-
-  /// Perform forward-pass propagation 'in' -> 'out',
-  void Propagate(const CuMatrixBase<BaseFloat> &in, CuMatrix<BaseFloat> *out);
-
-  /// Perform backward-pass propagation 'out_diff' -> 'in_diff'.
-  /// Note: 'in' and 'out' will be used only sometimes...
-  void Backpropagate(const CuMatrixBase<BaseFloat> &in,
-                     const CuMatrixBase<BaseFloat> &out,
-                     const CuMatrixBase<BaseFloat> &out_diff,
-                     CuMatrix<BaseFloat> *in_diff);
-
-  /// Initialize component from a line in config file,
-  static Component* Init(const std::string &conf_line);
-
-  /// Read the component from a stream (static method),
-  static Component* Read(std::istream &is, bool binary);
-
-  /// Write the component to a stream,
-  void Write(std::ostream &os, bool binary) const;
-
-  /// Print some additional info (after <ComponentName> and the dims),
-  virtual std::string Info() const { return ""; }
-
-  /// Print some additional info about gradient (after <...> and dims),
-  virtual std::string InfoGradient() const { return ""; }
-
-
- /// Abstract interface for propagation/backpropagation
- protected:
-  /// Forward pass transformation (to be implemented by descending class...)
-  virtual void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                            CuMatrixBase<BaseFloat> *out) = 0;
-
-  /// Backward pass transformation (to be implemented by descending class...)
-  virtual void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                                const CuMatrixBase<BaseFloat> &out,
-                                const CuMatrixBase<BaseFloat> &out_diff,
-                                CuMatrixBase<BaseFloat> *in_diff) = 0;
-
- /// Virtual interface for initialization and I/O,
- protected:
-  /// Initialize internal data of a component
-  virtual void InitData(std::istream &is) { }
-
-  /// Reads the component content
-  virtual void ReadData(std::istream &is, bool binary) { }
-
-  /// Writes the component content
-  virtual void WriteData(std::ostream &os, bool binary) const { }
-
- /// Data members,
- protected:
-  int32 input_dim_;  ///< Dimension of the input of the Component,
-  int32 output_dim_;  ///< Dimension of the output of the Component,
-
- /// Private members (descending classes cannot call this),
- private:
-  /// Create a new intance of component,
-  static Component* NewComponentOfType(
-    ComponentType t, int32 input_dim, int32 output_dim
-  );
-};
-
-
-/**
- * Class UpdatableComponent is a Component which has trainable parameters,
- * it contains SGD training hyper-parameters in NnetTrainOptions.
- * The constants 'learning_rate_coef_' and 'bias_learn_rate_coef_'
- * are separate, and should be stored by ::WriteData(...),
- */
-class UpdatableComponent : public Component {
- public:
-  UpdatableComponent(int32 input_dim, int32 output_dim):
-    Component(input_dim, output_dim),
-    learn_rate_coef_(1.0),
-    bias_learn_rate_coef_(1.0)
-  { }
-
-  virtual ~UpdatableComponent()
-  { }
-
-  /// Check if contains trainable parameters,
-  bool IsUpdatable() const {
-    return true;
-  }
-
-  /// Number of trainable parameters,
-  virtual int32 NumParams() const = 0;
-
-  /// Get gradient reshaped as a vector,
-  virtual void GetGradient(VectorBase<BaseFloat> *gradient) const = 0;
-
-  /// Get the trainable parameters reshaped as a vector,
-  virtual void GetParams(VectorBase<BaseFloat> *params) const = 0;
-
-  /// Set the trainable parameters from, reshaped as a vector,
-  virtual void SetParams(const VectorBase<BaseFloat> &params) = 0;
-
-  /// Compute gradient and update parameters,
-  virtual void Update(const CuMatrixBase<BaseFloat> &input,
-                      const CuMatrixBase<BaseFloat> &diff) = 0;
-
-  /// Set the training options to the component,
-  virtual void SetTrainOptions(const NnetTrainOptions &opts) {
-    opts_ = opts;
-  }
-
-  /// Get the training options from the component,
-  const NnetTrainOptions& GetTrainOptions() const {
-    return opts_;
-  }
-
-  /// Set the learn-rate coefficient,
-  virtual void SetLearnRateCoef(BaseFloat val) {
-    learn_rate_coef_ = val;
-  }
-
-  /// Set the learn-rate coefficient for bias,
-  virtual void SetBiasLearnRateCoef(BaseFloat val) {
-    bias_learn_rate_coef_ = val;
-  }
-
-  /// Initialize the content of the component by the 'line' from the prototype,
-  virtual void InitData(std::istream &is) = 0;
-
- protected:
-  /// Option-class with training hyper-parameters,
-  NnetTrainOptions opts_;
-
-  /// Scalar applied to learning rate for weight matrices
-  /// (to be used in ::Update method),
-  BaseFloat learn_rate_coef_;
-
-  /// Scalar applied to learning rate for bias
-  /// (to be used in ::Update method),
-  BaseFloat bias_learn_rate_coef_;
-};
-
-
-/**
- * Class MultistreamComponent is an extension of UpdatableComponent
- * for recurrent networks, which are trained with parallel sequences.
- */
-class MultistreamComponent : public UpdatableComponent {
- public:
-  MultistreamComponent(int32 input_dim, int32 output_dim):
-    UpdatableComponent(input_dim, output_dim)
-  { }
-
-  bool IsMultistream() const {
-    return true;
-  }
-
-  virtual void SetSeqLengths(const std::vector<int32>& sequence_lengths) {
-    sequence_lengths_ = sequence_lengths;
-  }
-
-  int32 NumStreams() const {
-    return std::max<int32>(1, sequence_lengths_.size());
-  }
-
-  /// Optional function to reset the transfer of context (not used for BLSTMs
-  virtual void ResetStreams(const std::vector<int32>& stream_reset_flag)
-  { }
-
- protected:
-  std::vector<int32> sequence_lengths_;
-};
-
-
-/*
- * Inline methods for ::Component,
- */
-inline void Component::Propagate(const CuMatrixBase<BaseFloat> &in,
-                                 CuMatrix<BaseFloat> *out) {
-  // Check the dims
-  if (input_dim_ != in.NumCols()) {
-    KALDI_ERR << "Non-matching dims on the input of " << TypeToMarker(GetType())
-              << " component. The input-dim is " << input_dim_
-              << ", the data had " << in.NumCols() << " dims.";
-  }
-  // Allocate target buffer
-  out->Resize(in.NumRows(), output_dim_, kSetZero);  // reset
-  // Call the propagation implementation of the component
-  PropagateFnc(in, out);
-}
-
-inline void Component::Backpropagate(const CuMatrixBase<BaseFloat> &in,
-                                     const CuMatrixBase<BaseFloat> &out,
-                                     const CuMatrixBase<BaseFloat> &out_diff,
-                                     CuMatrix<BaseFloat> *in_diff) {
-  // Check the dims,
-  if (OutputDim() != out_diff.NumCols()) {
-    KALDI_ERR << "Non-matching dims! Component output dim " << OutputDim()
-              << ", the dim of output derivatives " << out_diff.NumCols();
-  }
-
-  int32 num_frames = out_diff.NumRows();
-  KALDI_ASSERT(num_frames == in.NumRows());
-  KALDI_ASSERT(num_frames == out.NumRows());
-
-  KALDI_ASSERT(InputDim() == in.NumCols());
-  KALDI_ASSERT(OutputDim() == out.NumCols());
-
-  // Allocate target buffer,
-  KALDI_ASSERT(in_diff != NULL);
-  in_diff->Resize(num_frames, InputDim(), kSetZero);  // reset,
-
-  // Call the 'virtual' backprop function,
-  BackpropagateFnc(in, out, out_diff, in_diff);
-}
-
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-
-#endif  // KALDI_NNET_NNET_COMPONENT_H_
diff --git a/src/nnet/nnet-convolutional-2d-component.h b/src/nnet/nnet-convolutional-2d-component.h
deleted file mode 100644
index 135ce894541..00000000000
--- a/src/nnet/nnet-convolutional-2d-component.h
+++ /dev/null
@@ -1,495 +0,0 @@
-// nnet/nnet-convolutional-2d-component.h
-
-// Copyright 2014-2015  Johns Hopkins University (author: Sri Harish Mallidi)
-//                      Brno University of Technology (author: Karel Vesely),
-//
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_CONVOLUTIONAL_2D_COMPONENT_H_
-#define KALDI_NNET_NNET_CONVOLUTIONAL_2D_COMPONENT_H_
-
-#include <string>
-#include <vector>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-various.h"
-#include "cudamatrix/cu-math.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-/**
- * Convolutional2DComponent implements convolution over 2-axis (frequency and temporal)
- * (i.e. frequency axis in case we are the 1st component in NN).
- * // We don't do convolution along temporal axis, which simplifies the
- * // implementation (and was not helpful for Tara).
- *
- * We assume the input featrues are spliced, i.e. each frame
- * is in fact a set of stacked frames, where we can form patches
- * which span over several frequency bands and time axes.
- *
- * The convolution is done over whole axis with same filters,
- * i.e. we don't use separate filters for different 'regions'
- * of frequency axis.
- *
- * In order to have a fast implementations, the filters
- * are represented in vectorized form, where each rectangular
- * filter corresponds to a row in a matrix, where all filters
- * are stored. The features are then re-shaped to a set of matrices,
- * where one matrix corresponds to single patch-position,
- * where the filters get applied.
- *
- * The type of convolution is controled by hyperparameters:
- * x_patch_dim_,y_patch_dim_     ... temporal and frequency axes sizes of the patch (e.g. (9,9) for 9x9 2D filter)
- * x_patch_step_,y_patch_step_    ... temporal and frequencey sizes of shifts in the convolution (e.g. (1,1) 2D filter with 1 step shift in both axes)
- * x_patch_stride_,y_patch_stride_  ... dimension of the feature (maps if inside convolutional layer) (e.g. (11,32) for 32-band 11 frame spliced spectrogram patch)
- * The type of convolution is controlled by hyperparameters:
- * fmap_x_len_, fmap_y_len_ ... dimension of the feature (maps if inside convolutional layer) (e.g. (11,32) for 32-band 11 frame spliced spectrogram patch)
- * filt_x_len_, filt_y_len_ ... temporal and frequency sizes of the filters (e.g. (9,9) for 9x9 2D filter)
- * filt_x_step_, filt_y_step_ ... temporal and frequency sizes of the filters (e.g. (1,1) for 2D-filter, with 1 step shift in both axes)
- *
- *
- * Due to convolution same weights are used repeateadly,
- * the final gradient is average of all position-specific
- * gradients.
- *
- */
-class Convolutional2DComponent : public UpdatableComponent {
- public:
-  Convolutional2DComponent(int32 dim_in, int32 dim_out):
-    UpdatableComponent(dim_in, dim_out),
-    fmap_x_len_(0), fmap_y_len_(0),
-    filt_x_len_(0), filt_y_len_(0),
-    filt_x_step_(0), filt_y_step_(0),
-    connect_fmap_(0)
-  { }
-
-  ~Convolutional2DComponent()
-  { }
-
-  Component* Copy() const { return new Convolutional2DComponent(*this); }
-  ComponentType GetType() const { return kConvolutional2DComponent; }
-
-  void InitData(std::istream &is) {
-    // define options
-    BaseFloat bias_mean = -2.0, bias_range = 2.0, param_stddev = 0.1;
-    // parse config
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<ParamStddev>") ReadBasicType(is, false, &param_stddev);
-      else if (token == "<BiasMean>")    ReadBasicType(is, false, &bias_mean);
-      else if (token == "<BiasRange>")   ReadBasicType(is, false, &bias_range);
-      else if (token == "<FmapXLen>")    ReadBasicType(is, false, &fmap_x_len_);
-      else if (token == "<FmapYLen>")    ReadBasicType(is, false, &fmap_y_len_);
-      else if (token == "<FiltXLen>")    ReadBasicType(is, false, &filt_x_len_);
-      else if (token == "<FiltYLen>")    ReadBasicType(is, false, &filt_y_len_);
-      else if (token == "<FiltXStep>")   ReadBasicType(is, false, &filt_x_step_);
-      else if (token == "<FiltYStep>")   ReadBasicType(is, false, &filt_y_step_);
-      else if (token == "<ConnectFmap>") ReadBasicType(is, false, &connect_fmap_);
-      else if (token == "<LearnRateCoef>") ReadBasicType(is, false, &learn_rate_coef_);
-      else if (token == "<BiasLearnRateCoef>") ReadBasicType(is, false, &bias_learn_rate_coef_);
-      else KALDI_ERR << "Unknown token " << token << ", a typo in config? "
-                     << "(ParamStddev|BiasMean|BiasRange|FmapXLen|FmapYLen|"
-                        "FiltXLen|FiltYLen|FiltXStep|FiltYStep|ConnectFmap|"
-                        "LearnRateCoef|BiasLearnRateCoef)";
-    }
-
-    //
-    // Sanity checks:
-    //
-    // input sanity checks
-    // input_dim_ should be multiple of (fmap_x_len_ * fmap_y_len_)
-    KALDI_ASSERT(input_dim_ % (fmap_x_len_ * fmap_y_len_) == 0);
-    int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_);
-    KALDI_LOG << "num_input_fmaps " << num_input_fmaps;
-    // check if step is in sync with fmap_len and filt_len
-    KALDI_ASSERT((fmap_x_len_ - filt_x_len_) % (filt_x_step_) == 0);
-    KALDI_ASSERT((fmap_y_len_ - filt_y_len_) % (filt_y_step_) == 0);
-    int32 out_fmap_x_len = (fmap_x_len_ - filt_x_len_)/filt_x_step_ + 1;
-    int32 out_fmap_y_len = (fmap_y_len_ - filt_y_len_)/filt_y_step_ + 1;
-    // output sanity checks
-    KALDI_ASSERT(output_dim_ % (out_fmap_x_len * out_fmap_y_len)  == 0);
-    int32 num_output_fmaps = output_dim_ / (out_fmap_x_len * out_fmap_y_len);
-    KALDI_LOG << "num_output_fmaps " << num_output_fmaps;
-    int32 num_filters = output_dim_/(out_fmap_x_len*out_fmap_y_len);
-    KALDI_LOG << "num_filters " << num_filters;
-
-    //
-    // Initialize trainable parameters,
-    //
-    filters_.Resize(num_filters, num_input_fmaps*filt_x_len_*filt_y_len_);
-    RandGauss(0.0, param_stddev, &filters_);
-    //
-    bias_.Resize(num_filters);
-    RandUniform(bias_mean, bias_range, &bias_);
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    ExpectToken(is, binary, "<LearnRateCoef>");
-    ReadBasicType(is, binary, &learn_rate_coef_);
-    ExpectToken(is, binary, "<BiasLearnRateCoef>");
-    ReadBasicType(is, binary, &bias_learn_rate_coef_);
-    // convolution hyperparameters
-    ExpectToken(is, binary, "<FmapXLen>");
-    ReadBasicType(is, binary, &fmap_x_len_);
-    ExpectToken(is, binary, "<FmapYLen>");
-    ReadBasicType(is, binary, &fmap_y_len_);
-    ExpectToken(is, binary, "<FiltXLen>");
-    ReadBasicType(is, binary, &filt_x_len_);
-    ExpectToken(is, binary, "<FiltYLen>");
-    ReadBasicType(is, binary, &filt_y_len_);
-    ExpectToken(is, binary, "<FiltXStep>");
-    ReadBasicType(is, binary, &filt_x_step_);
-    ExpectToken(is, binary, "<FiltYStep>");
-    ReadBasicType(is, binary, &filt_y_step_);
-    ExpectToken(is, binary, "<ConnectFmap>");
-    ReadBasicType(is, binary, &connect_fmap_);
-
-    // trainable parameters
-    ExpectToken(is, binary, "<Filters>");
-    filters_.Read(is, binary);
-    ExpectToken(is, binary, "<Bias>");
-    bias_.Read(is, binary);
-
-    //
-    // Sanity checks:
-    //
-    // input sanity checks
-    // input_dim_ should be multiple of (fmap_x_len_ * fmap_y_len_)
-    KALDI_ASSERT(input_dim_ % (fmap_x_len_ * fmap_y_len_) == 0);
-    // int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_);
-    // KALDI_LOG << "num_input_fmaps " << num_input_fmaps;
-    // check if step is in sync with fmap_len and filt_len
-    KALDI_ASSERT((fmap_x_len_ - filt_x_len_) % (filt_x_step_) == 0);
-    KALDI_ASSERT((fmap_y_len_ - filt_y_len_) % (filt_y_step_) == 0);
-    int32 out_fmap_x_len = (fmap_x_len_ - filt_x_len_)/filt_x_step_ + 1;
-    int32 out_fmap_y_len = (fmap_y_len_ - filt_y_len_)/filt_y_step_ + 1;
-
-    // output sanity checks
-    KALDI_ASSERT(output_dim_ % (out_fmap_x_len * out_fmap_y_len)  == 0);
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    WriteToken(os, binary, "<LearnRateCoef>");
-    WriteBasicType(os, binary, learn_rate_coef_);
-    WriteToken(os, binary, "<BiasLearnRateCoef>");
-    WriteBasicType(os, binary, bias_learn_rate_coef_);
-    if (!binary) os << "\n";
-
-    // convolution hyperparameters
-    WriteToken(os, binary, "<FmapXLen>");
-    WriteBasicType(os, binary, fmap_x_len_);
-    WriteToken(os, binary, "<FmapYLen>");
-    WriteBasicType(os, binary, fmap_y_len_);
-    WriteToken(os, binary, "<FiltXLen>");
-    WriteBasicType(os, binary, filt_x_len_);
-    WriteToken(os, binary, "<FiltYLen>");
-    WriteBasicType(os, binary, filt_y_len_);
-    WriteToken(os, binary, "<FiltXStep>");
-    WriteBasicType(os, binary, filt_x_step_);
-    WriteToken(os, binary, "<FiltYStep>");
-    WriteBasicType(os, binary, filt_y_step_);
-    WriteToken(os, binary, "<ConnectFmap>");
-    WriteBasicType(os, binary, connect_fmap_);
-    if (!binary) os << "\n";
-
-    // trainable parameters
-    WriteToken(os, binary, "<Filters>");
-    if (!binary) os << "\n";
-    filters_.Write(os, binary);
-    WriteToken(os, binary, "<Bias>");
-    if (!binary) os << "\n";
-    bias_.Write(os, binary);
-  }
-
-  int32 NumParams() const {
-    return filters_.NumRows()*filters_.NumCols() + bias_.Dim();
-  }
-
-  void GetGradient(VectorBase<BaseFloat>* gradient) const {
-    KALDI_ASSERT(gradient->Dim() == NumParams());
-    int32 filters_num_elem = filters_.NumRows() * filters_.NumCols();
-    gradient->Range(0, filters_num_elem).CopyRowsFromMat(filters_);
-    gradient->Range(filters_num_elem, bias_.Dim()).CopyFromVec(bias_);
-  }
-
-  void GetParams(VectorBase<BaseFloat>* params) const {
-    KALDI_ASSERT(params->Dim() == NumParams());
-    int32 filters_num_elem = filters_.NumRows() * filters_.NumCols();
-    params->Range(0, filters_num_elem).CopyRowsFromMat(filters_);
-    params->Range(filters_num_elem, bias_.Dim()).CopyFromVec(bias_);
-  }
-
-  void SetParams(const VectorBase<BaseFloat>& params) {
-    KALDI_ASSERT(params.Dim() == NumParams());
-    int32 filters_num_elem = filters_.NumRows() * filters_.NumCols();
-    filters_.CopyRowsFromVec(params.Range(0, filters_num_elem));
-    bias_.CopyFromVec(params.Range(filters_num_elem, bias_.Dim()));
-  }
-
-  std::string Info() const {
-    return std::string("\n  filters") + MomentStatistics(filters_) +
-           ", lr-coef " + ToString(learn_rate_coef_) +
-           "\n  bias" + MomentStatistics(bias_) +
-           ", lr-coef " + ToString(bias_learn_rate_coef_);
-  }
-  std::string InfoGradient() const {
-    return std::string("\n  filters_grad") + MomentStatistics(filters_grad_) +
-           ", lr-coef " + ToString(learn_rate_coef_) +
-           "\n  bias_grad" + MomentStatistics(bias_grad_) +
-           ", lr-coef " + ToString(bias_learn_rate_coef_);
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // useful dims
-    int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_);
-    // int32 inp_fmap_size = fmap_x_len_ * fmap_y_len_;
-    int32 out_fmap_x_len = (fmap_x_len_ - filt_x_len_)/filt_x_step_ + 1;
-    int32 out_fmap_y_len = (fmap_y_len_ - filt_y_len_)/filt_y_step_ + 1;
-    int32 out_fmap_size = out_fmap_x_len*out_fmap_y_len;
-    int32 num_output_fmaps = output_dim_ / (out_fmap_x_len * out_fmap_y_len);
-    // this is total num_filters,
-    // so each input_fmap has size num_filters/num_input_fmaps
-    int32 num_filters = filters_.NumRows();
-    KALDI_ASSERT(num_filters == num_output_fmaps);
-    // int32 filter_size = filt_x_len_*filt_y_len_;
-    int32 num_frames = in.NumRows();
-
-    // we will need the buffers
-    if (vectorized_feature_patches_.size() == 0) {
-      vectorized_feature_patches_.resize(out_fmap_size);
-      feature_patch_diffs_.resize(out_fmap_size);
-    }
-
-    for (int32 p = 0; p < out_fmap_size; p++) {
-      vectorized_feature_patches_[p].Resize(num_frames, filters_.NumCols());
-    }
-
-    // Checked for num_input_fmaps=1, check for num_inp_fmaps>1
-    int32 out_fmap_cnt = 0;
-    for (int32 m = 0; m < fmap_x_len_-filt_x_len_+1; m = m+filt_x_step_) {
-      for (int32 n = 0; n < fmap_y_len_-filt_y_len_+1; n = n+filt_y_step_) {
-    std::vector<int32> column_mask;
-    int32 st = 0;
-    if (connect_fmap_ == 1) {
-      st = (m * fmap_y_len_ + n) * num_input_fmaps;
-    } else {
-      st = m * fmap_y_len_ * num_input_fmaps + n;
-    }
-
-    for (int32 i = 0; i < filt_x_len_; i++) {
-      for (int32 j = 0; j < filt_y_len_*num_input_fmaps; j++) {
-        int32 c = 0;
-        if (connect_fmap_ == 1) {
-          c = st + i * (num_input_fmaps*fmap_y_len_) + j;
-        } else {
-          c = st + i * (num_input_fmaps * fmap_y_len_)
-                     + (j / num_input_fmaps)
-                     + (j % num_input_fmaps) * fmap_y_len_;
-        }
-        column_mask.push_back(c);
-      }
-    }
-    CuArray<int32> cu_column_mask(column_mask);
-    vectorized_feature_patches_[out_fmap_cnt].CopyCols(in, cu_column_mask);
-    out_fmap_cnt++;
-      }
-    }
-
-    for (int32 p = 0; p < out_fmap_size; p++) {
-      CuSubMatrix<BaseFloat> tgt(out->ColRange(p*num_filters, num_filters));
-      tgt.AddVecToRows(1.0, bias_, 0.0);
-      tgt.AddMatMat(1.0, vectorized_feature_patches_[p], kNoTrans, filters_, kTrans, 1.0);
-    }
-  }
-
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // useful dims
-    int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_);
-
-    int32 out_fmap_x_len = (fmap_x_len_ - filt_x_len_)/filt_x_step_ + 1;
-    int32 out_fmap_y_len = (fmap_y_len_ - filt_y_len_)/filt_y_step_ + 1;
-    int32 out_fmap_size = out_fmap_x_len * out_fmap_y_len;
-    int32 num_output_fmaps = output_dim_ / (out_fmap_x_len * out_fmap_y_len);
-    // this is total num_filters,
-    // so each input_fmap has num_filters/num_input_fmaps
-    int32 num_filters = filters_.NumRows();
-    KALDI_ASSERT(num_filters == num_output_fmaps);
-    // int32 filter_size = filt_x_len_*filt_y_len_;
-    int32 num_frames = in.NumRows();
-
-    for (int32 p = 0; p < out_fmap_size; p++) {
-      feature_patch_diffs_[p].Resize(num_frames, filters_.NumCols(), kSetZero);
-      CuSubMatrix<BaseFloat> out_diff_patch(out_diff.ColRange(p*num_filters, num_filters));
-      feature_patch_diffs_[p].AddMatMat(1.0, out_diff_patch, kNoTrans, filters_, kNoTrans, 0.0);
-    }
-
-    // compute in_diff_summands_ once
-    if (in_diff_summands_.Dim() == 0) {
-      in_diff_summands_.Resize(in_diff->NumCols(), kSetZero);
-      for (int32 m = 0; m < fmap_x_len_-filt_x_len_+1; m = m+filt_x_step_) {
-        for (int32 n = 0; n < fmap_y_len_-filt_y_len_+1; n = n+filt_y_step_) {
-          int32 st = 0;
-          if (connect_fmap_ == 1) {
-            st = (m * fmap_y_len_ + n) * num_input_fmaps;
-          } else {
-            st = m * fmap_y_len_ * num_input_fmaps + n;
-          }
-          for (int32 i = 0; i < filt_x_len_; i++) {
-            for (int32 j = 0; j < filt_y_len_*num_input_fmaps; j++) {
-              int32 c = 0;
-              if (connect_fmap_ == 1) {
-                c = st + i * (num_input_fmaps * fmap_y_len_) + j;
-              } else {
-                c = st + i * (num_input_fmaps * fmap_y_len_)
-                       + (j / num_input_fmaps)
-                       + (j % num_input_fmaps) * fmap_y_len_;
-              }
-              // add 1.0
-              in_diff_summands_.Range(c, 1).Add(1.0);
-            }
-          }
-        }
-      }
-      in_diff_summands_.InvertElements();
-    }
-
-    int32 out_fmap_cnt = 0;
-
-    for (int32 m = 0; m < fmap_x_len_-filt_x_len_+1; m = m+filt_x_step_) {
-      for (int32 n = 0; n< fmap_y_len_-filt_y_len_+1; n = n+filt_y_step_) {
-        int32 st = 0;
-        if (connect_fmap_ == 1) {
-          st = (m * fmap_y_len_ + n) * num_input_fmaps;
-        } else {
-          st = m * fmap_y_len_ * num_input_fmaps + n;
-        }
-
-        for (int32 i = 0; i < filt_x_len_; i++) {
-          for (int32 j = 0; j < filt_y_len_*num_input_fmaps; j++) {
-            int32 c = 0;
-            if (connect_fmap_ == 1) {
-              c = st + i *(num_input_fmaps*fmap_y_len_)+j;
-            } else {
-              c = st + i * (num_input_fmaps * fmap_y_len_)
-                     + (j / num_input_fmaps)
-                     + (j % num_input_fmaps) * fmap_y_len_;
-            }
-            // from which col?
-            CuMatrix<BaseFloat>& diff_mat = feature_patch_diffs_[out_fmap_cnt];
-            CuSubMatrix<BaseFloat> src(diff_mat.ColRange(i*filt_y_len_*num_input_fmaps+j, 1));
-            // to which col?
-            CuSubMatrix<BaseFloat> tgt(in_diff->ColRange(c, 1));
-            tgt.AddMat(1.0, src);
-          }
-        }
-        out_fmap_cnt++;
-      }
-    }
-    // compensate for summands
-    in_diff->MulColsVec(in_diff_summands_);
-  }
-
-
-  void Update(const CuMatrixBase<BaseFloat> &input,
-              const CuMatrixBase<BaseFloat> &diff) {
-    // useful dims,
-    int32 out_fmap_x_len = (fmap_x_len_ - filt_x_len_)/filt_x_step_ + 1;
-    int32 out_fmap_y_len = (fmap_y_len_ - filt_y_len_)/filt_y_step_ + 1;
-    int32 out_fmap_size = out_fmap_x_len * out_fmap_y_len;
-    int32 num_output_fmaps = output_dim_ / (out_fmap_x_len * out_fmap_y_len);
-
-    // This is total num_filters,
-    // each input_fmap has num_filters / num_input_fmaps:
-    int32 num_filters = filters_.NumRows();
-    KALDI_ASSERT(num_filters == num_output_fmaps);
-
-    // we use following hyperparameters from the option class,
-    const BaseFloat lr = opts_.learn_rate;
-
-    //
-    // calculate the gradient
-    //
-    filters_grad_.Resize(filters_.NumRows(), filters_.NumCols(), kSetZero);
-    bias_grad_.Resize(filters_.NumRows(), kSetZero);
-    //
-    for (int32 p = 0; p < out_fmap_size; p++) {
-      CuSubMatrix<BaseFloat> diff_patch(diff.ColRange(p * num_filters, num_filters));
-      filters_grad_.AddMatMat(1.0, diff_patch, kTrans, vectorized_feature_patches_[p], kNoTrans, 1.0);
-      bias_grad_.AddRowSumMat(1.0, diff_patch, 1.0);
-    }
-    // scale
-    filters_grad_.Scale(1.0/num_output_fmaps);
-    bias_grad_.Scale(1.0/num_output_fmaps);
-
-    //
-    // update
-    //
-    filters_.AddMat(-lr * learn_rate_coef_, filters_grad_);
-    bias_.AddVec(-lr * bias_learn_rate_coef_, bias_grad_);
-  }
-
- private:
-  /// feature maps dimensions (for input x_ is usually splice
-  /// and y_ is num of fbanks) shift for 2nd dim of a patch
-  /// (i.e. frame length before splicing),
-  int32 fmap_x_len_, fmap_y_len_;
-
-  /// 2D filter dimensions, x_ temporal, y_ spectral,
-  int32 filt_x_len_, filt_y_len_;
-
-  /// 2D shifts along temporal and spectral axis,
-  int32 filt_x_step_, filt_y_step_;
-
-  int32 connect_fmap_;  ///< if connect_fmap_ = 1, then each fmap has num_filt
-
-  CuMatrix<BaseFloat> filters_;  ///< row = vectorized rectangular filter
-  CuVector<BaseFloat> bias_;  ///< bias for each filter
-
-  CuMatrix<BaseFloat> filters_grad_;  ///< gradient of filters
-  CuVector<BaseFloat> bias_grad_;  ///< gradient of biases
-
-  /** Buffer of reshaped inputs:
-   *  1row = vectorized rectangular feature patch,
-   *  1col = dim over speech frames,
-   *  std::vector-dim = patch-position
-   */
-  std::vector<CuMatrix<BaseFloat> > vectorized_feature_patches_;
-
-  /** Buffer for backpropagation:
-   *  derivatives in the domain of 'vectorized_feature_patches_',
-   *  1row = vectorized rectangular feature patch,
-   *  1col = dim over speech frames,
-   *  std::vector-dim = patch-position
-   */
-  std::vector<CuMatrix<BaseFloat> > feature_patch_diffs_;
-
-  /// Auxiliary vector for compensating #summands when backpropagating
-  CuVector<BaseFloat> in_diff_summands_;
-};
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_CONVOLUTIONAL_2D_COMPONENT_H_
diff --git a/src/nnet/nnet-convolutional-component.h b/src/nnet/nnet-convolutional-component.h
deleted file mode 100644
index bd4da7d3c0c..00000000000
--- a/src/nnet/nnet-convolutional-component.h
+++ /dev/null
@@ -1,482 +0,0 @@
-// nnet/nnet-convolutional-component.h
-
-// Copyright 2014  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_CONVOLUTIONAL_COMPONENT_H_
-#define KALDI_NNET_NNET_CONVOLUTIONAL_COMPONENT_H_
-
-#include <string>
-#include <vector>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-utils.h"
-#include "cudamatrix/cu-math.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-/**
- * ConvolutionalComponent implements convolution over single axis
- * (i.e. frequency axis in case we are the 1st component in NN).
- * We don't do convolution along temporal axis, which simplifies the
- * implementation (and was not helpful for Tara).
- *
- * We assume the input featrues are spliced, i.e. each frame
- * is in fact a set of stacked frames, where we can form patches
- * which span over several frequency bands and whole time axis.
- *
- * The convolution is done over whole axis with same filters,
- * i.e. we don't use separate filters for different 'regions'
- * of frequency axis.
- *
- * In order to have a fast implementations, the filters
- * are represented in vectorized form, where each rectangular
- * filter corresponds to a row in a matrix, where all the filters
- * are stored. The features are then re-shaped to a set of matrices,
- * where one matrix corresponds to single patch-position,
- * where all the filters get applied.
- *
- * The type of convolution is controled by hyperparameters:
- * patch_dim_     ... frequency axis size of the patch
- * patch_step_    ... size of shift in the convolution
- * patch_stride_  ... shift for 2nd dim of a patch
- *                    (i.e. frame length before splicing)
- *
- * Due to convolution same weights are used repeateadly,
- * the final gradient is a sum of all position-specific
- * gradients (the sum was found better than averaging).
- *
- */
-class ConvolutionalComponent : public UpdatableComponent {
- public:
-  ConvolutionalComponent(int32 dim_in, int32 dim_out):
-    UpdatableComponent(dim_in, dim_out),
-    patch_dim_(0),
-    patch_step_(0),
-    patch_stride_(0),
-    max_norm_(0.0)
-  { }
-
-  ~ConvolutionalComponent()
-  { }
-
-  Component* Copy() const { return new ConvolutionalComponent(*this); }
-  ComponentType GetType() const { return kConvolutionalComponent; }
-
-  void InitData(std::istream &is) {
-    // define options
-    BaseFloat bias_mean = -2.0, bias_range = 2.0, param_stddev = 0.1;
-    // parse config
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<ParamStddev>") ReadBasicType(is, false, &param_stddev);
-      else if (token == "<BiasMean>")    ReadBasicType(is, false, &bias_mean);
-      else if (token == "<BiasRange>")   ReadBasicType(is, false, &bias_range);
-      else if (token == "<PatchDim>")    ReadBasicType(is, false, &patch_dim_);
-      else if (token == "<PatchStep>")   ReadBasicType(is, false, &patch_step_);
-      else if (token == "<PatchStride>") ReadBasicType(is, false, &patch_stride_);
-      else if (token == "<LearnRateCoef>") ReadBasicType(is, false, &learn_rate_coef_);
-      else if (token == "<BiasLearnRateCoef>") ReadBasicType(is, false, &bias_learn_rate_coef_);
-      else if (token == "<MaxNorm>") ReadBasicType(is, false, &max_norm_);
-      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-                     << " (ParamStddev|BiasMean|BiasRange|PatchDim|PatchStep|PatchStride)";
-    }
-
-    //
-    // Sanity checks:
-    //
-    // splice (input are spliced frames):
-    KALDI_ASSERT(input_dim_ % patch_stride_ == 0);
-    int32 num_splice = input_dim_ / patch_stride_;
-    KALDI_LOG << "num_splice " << num_splice;
-    // number of patches:
-    KALDI_ASSERT((patch_stride_ - patch_dim_) % patch_step_ == 0);
-    int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-    KALDI_LOG << "num_patches " << num_patches;
-    // filter dim:
-    int32 filter_dim = num_splice * patch_dim_;
-    KALDI_LOG << "filter_dim " << filter_dim;
-    // num filters:
-    KALDI_ASSERT(output_dim_ % num_patches == 0);
-    int32 num_filters = output_dim_ / num_patches;
-    KALDI_LOG << "num_filters " << num_filters;
-    //
-
-    //
-    // Initialize trainable parameters,
-    //
-    // Gaussian with given std_dev (mean = 0),
-    filters_.Resize(num_filters, filter_dim);
-    RandGauss(0.0, param_stddev, &filters_);
-    // Uniform,
-    bias_.Resize(num_filters);
-    RandUniform(bias_mean, bias_range, &bias_);
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    // convolution hyperparameters,
-    ExpectToken(is, binary, "<PatchDim>");
-    ReadBasicType(is, binary, &patch_dim_);
-    ExpectToken(is, binary, "<PatchStep>");
-    ReadBasicType(is, binary, &patch_step_);
-    ExpectToken(is, binary, "<PatchStride>");
-    ReadBasicType(is, binary, &patch_stride_);
-
-    // variant-length list of parameters,
-    bool end_loop = false;
-    while (!end_loop) {
-      int first_char = PeekToken(is, binary);
-      switch (first_char) {
-        case 'L': ExpectToken(is, binary, "<LearnRateCoef>");
-          ReadBasicType(is, binary, &learn_rate_coef_);
-          break;
-        case 'B': ExpectToken(is, binary, "<BiasLearnRateCoef>");
-          ReadBasicType(is, binary, &bias_learn_rate_coef_);
-          break;
-        case 'M': ExpectToken(is, binary, "<MaxNorm>");
-          ReadBasicType(is, binary, &max_norm_);
-          break;
-        case '!': ExpectToken(is, binary, "<!EndOfComponent>");
-        default: end_loop = true;
-      }
-    }
-
-    // trainable parameters
-    ExpectToken(is, binary, "<Filters>");
-    filters_.Read(is, binary);
-    ExpectToken(is, binary, "<Bias>");
-    bias_.Read(is, binary);
-
-    //
-    // Sanity checks:
-    //
-    // splice (input are spliced frames):
-    KALDI_ASSERT(input_dim_ % patch_stride_ == 0);
-    int32 num_splice = input_dim_ / patch_stride_;
-    // number of patches:
-    KALDI_ASSERT((patch_stride_ - patch_dim_) % patch_step_ == 0);
-    int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-    // filter dim:
-    int32 filter_dim = num_splice * patch_dim_;
-    // num filters:
-    KALDI_ASSERT(output_dim_ % num_patches == 0);
-    int32 num_filters = output_dim_ / num_patches;
-    // check parameter dims:
-    KALDI_ASSERT(num_filters == filters_.NumRows());
-    KALDI_ASSERT(num_filters == bias_.Dim());
-    KALDI_ASSERT(filter_dim == filters_.NumCols());
-    //
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    // convolution hyperparameters
-    WriteToken(os, binary, "<PatchDim>");
-    WriteBasicType(os, binary, patch_dim_);
-    WriteToken(os, binary, "<PatchStep>");
-    WriteBasicType(os, binary, patch_step_);
-    WriteToken(os, binary, "<PatchStride>");
-    WriteBasicType(os, binary, patch_stride_);
-    if (!binary) os << "\n";
-
-    // re-scale learn rate
-    WriteToken(os, binary, "<LearnRateCoef>");
-    WriteBasicType(os, binary, learn_rate_coef_);
-    WriteToken(os, binary, "<BiasLearnRateCoef>");
-    WriteBasicType(os, binary, bias_learn_rate_coef_);
-    // max-norm regularization
-    WriteToken(os, binary, "<MaxNorm>");
-    WriteBasicType(os, binary, max_norm_);
-    if (!binary) os << "\n";
-
-    // trainable parameters
-    WriteToken(os, binary, "<Filters>");
-    if (!binary) os << "\n";
-    filters_.Write(os, binary);
-    WriteToken(os, binary, "<Bias>");
-    if (!binary) os << "\n";
-    bias_.Write(os, binary);
-  }
-
-  int32 NumParams() const {
-    return filters_.NumRows()*filters_.NumCols() + bias_.Dim();
-  }
-
-  void GetGradient(VectorBase<BaseFloat>* gradient) const {
-    KALDI_ASSERT(gradient->Dim() == NumParams());
-    int32 filters_num_elem = filters_.NumRows() * filters_.NumCols();
-    gradient->Range(0, filters_num_elem).CopyRowsFromMat(filters_);
-    gradient->Range(filters_num_elem, bias_.Dim()).CopyFromVec(bias_);
-  }
-
-  void GetParams(VectorBase<BaseFloat>* params) const {
-    KALDI_ASSERT(params->Dim() == NumParams());
-    int32 filters_num_elem = filters_.NumRows() * filters_.NumCols();
-    params->Range(0, filters_num_elem).CopyRowsFromMat(filters_);
-    params->Range(filters_num_elem, bias_.Dim()).CopyFromVec(bias_);
-  }
-
-  void SetParams(const VectorBase<BaseFloat>& params) {
-    KALDI_ASSERT(params.Dim() == NumParams());
-    int32 filters_num_elem = filters_.NumRows() * filters_.NumCols();
-    filters_.CopyRowsFromVec(params.Range(0, filters_num_elem));
-    bias_.CopyFromVec(params.Range(filters_num_elem, bias_.Dim()));
-  }
-
-  std::string Info() const {
-    return std::string("\n  filters") + MomentStatistics(filters_) +
-      ", lr-coef " + ToString(learn_rate_coef_) +
-      ", max-norm " + ToString(max_norm_) +
-      "\n  bias" + MomentStatistics(bias_) +
-      ", lr-coef " + ToString(bias_learn_rate_coef_);
-  }
-
-  std::string InfoGradient() const {
-    return std::string("\n  filters_grad") + MomentStatistics(filters_grad_) +
-      ", lr-coef " + ToString(learn_rate_coef_) +
-      ", max-norm " + ToString(max_norm_) +
-      "\n  bias_grad" + MomentStatistics(bias_grad_) +
-      ", lr-coef " + ToString(bias_learn_rate_coef_);
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // useful dims
-    int32 num_splice = input_dim_ / patch_stride_;
-    int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-    int32 num_filters = filters_.NumRows();
-    int32 num_frames = in.NumRows();
-    int32 filter_dim = filters_.NumCols();
-
-    // we will need the buffers
-    if (vectorized_feature_patches_.NumRows() != num_frames) {
-      vectorized_feature_patches_.Resize(num_frames, filter_dim * num_patches, kUndefined);
-      feature_patch_diffs_.Resize(num_frames, filter_dim * num_patches, kSetZero);
-    }
-
-    /* Prepare feature patches, the layout is:
-     * |----------|----------|----------|---------| (in = spliced frames)
-     *   xxx        xxx        xxx        xxx       (x = selected elements)
-     *
-     *   xxx : patch dim
-     *    xxx
-     *   ^---: patch step
-     * |----------| : patch stride
-     *
-     *   xxx-xxx-xxx-xxx : filter dim
-     *
-     */
-    // build-up a column selection map:
-    int32 index = 0;
-    column_map_.resize(filter_dim * num_patches);
-    for (int32 p = 0; p < num_patches; p++) {
-      for (int32 s = 0; s < num_splice; s++) {
-        for (int32 d = 0; d < patch_dim_; d++) {
-          column_map_[index] = p * patch_step_ + s * patch_stride_ + d;
-          index++;
-        }
-      }
-    }
-    // select the columns
-    CuArray<int32> cu_column_map(column_map_);
-    vectorized_feature_patches_.CopyCols(in, cu_column_map);
-
-    // compute filter activations
-    for (int32 p = 0; p < num_patches; p++) {
-      CuSubMatrix<BaseFloat> tgt(out->ColRange(p * num_filters, num_filters));
-      CuSubMatrix<BaseFloat> patch(vectorized_feature_patches_.ColRange(
-                                   p * filter_dim, filter_dim));
-      tgt.AddVecToRows(1.0, bias_, 0.0);  // add bias
-      // apply all filters
-      tgt.AddMatMat(1.0, patch, kNoTrans, filters_, kTrans, 1.0);
-    }
-  }
-
-  /*
-   This function does an operation similar to reversing a map,
-   except it handles maps that are not one-to-one by outputting
-   the reversed map as a vector of lists.
-   @param[in] forward_indexes is a vector of int32, each of whose
-              elements is between 0 and input_dim - 1.
-   @param[in] input_dim. See definitions of forward_indexes and
-              backward_indexes.
-   @param[out] backward_indexes is a vector of dimension input_dim
-              of lists, The list at (backward_indexes[i]) is a list
-              of all indexes j such that forward_indexes[j] = i.
-  */
-  void ReverseIndexes(const std::vector<int32> &forward_indexes,
-                      std::vector<std::vector<int32> > *backward_indexes) {
-    int32 i;
-    int32 size = forward_indexes.size();
-    backward_indexes->resize(input_dim_);
-    int32 reserve_size = 2+ forward_indexes.size() / input_dim_;
-    std::vector<std::vector<int32> >::iterator iter = backward_indexes->begin(),
-      end = backward_indexes->end();
-    for (; iter != end; ++iter)
-      iter->reserve(reserve_size);
-    for (int32 j = 0; j < size; j++) {
-      i = forward_indexes[j];
-      KALDI_ASSERT(i < input_dim_);
-      (*backward_indexes)[i].push_back(j);
-    }
-  }
-
-  /*
-   This function transforms a vector of lists into a list of vectors,
-   padded with -1.
-   @param[in] The input vector of lists. Let in.size() be D, and let
-              the longest list length (i.e. the max of in[i].size()) be L.
-   @param[out] The output list of vectors. The length of the list will
-              be L, each vector-dimension will be D (i.e. out[i].size() == D),
-              and if in[i] == j, then for some k we will have that
-              out[k][j] = i. The output vectors are padded with -1
-              where necessary if not all the input lists have the same side.
-  */
-  void RearrangeIndexes(const std::vector<std::vector<int32> > &in,
-                        std::vector<std::vector<int32> > *out) {
-    int32 D = in.size();
-    int32 L = 0;
-    for (int32 i = 0; i < D; i++)
-      if (in[i].size() > L)
-        L = in[i].size();
-    out->resize(L);
-    for (int32 i = 0; i < L; i++)
-      (*out)[i].resize(D, -1);
-    for (int32 i = 0; i < D; i++) {
-      for (int32 j = 0; j < in[i].size(); j++) {
-        (*out)[j][i] = in[i][j];
-      }
-    }
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // useful dims
-    int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-    int32 num_filters = filters_.NumRows();
-    int32 filter_dim = filters_.NumCols();
-
-    // backpropagate to vector of matrices
-    // (corresponding to position of a filter)
-    for (int32 p = 0; p < num_patches; p++) {
-      CuSubMatrix<BaseFloat> patch_diff(feature_patch_diffs_.ColRange(
-                                        p * filter_dim, filter_dim));
-      CuSubMatrix<BaseFloat> out_diff_patch(out_diff.ColRange(
-                                            p * num_filters, num_filters));
-      patch_diff.AddMatMat(1.0, out_diff_patch, kNoTrans,
-                           filters_, kNoTrans, 0.0);
-    }
-
-    // sum the derivatives into in_diff, we will compensate #summands
-    std::vector<std::vector<int32> > reversed_column_map;
-    ReverseIndexes(column_map_, &reversed_column_map);
-    std::vector<std::vector<int32> > rearranged_column_map;
-    RearrangeIndexes(reversed_column_map, &rearranged_column_map);
-    for (int32 p = 0; p < rearranged_column_map.size(); p++) {
-      CuArray<int32> cu_cols(rearranged_column_map[p]);
-      in_diff->AddCols(feature_patch_diffs_, cu_cols);
-    }
-  }
-
-
-  void Update(const CuMatrixBase<BaseFloat> &input,
-              const CuMatrixBase<BaseFloat> &diff) {
-    // useful dims
-    int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-    int32 num_filters = filters_.NumRows();
-    int32 filter_dim = filters_.NumCols();
-
-    // we use following hyperparameters from the option class
-    const BaseFloat lr = opts_.learn_rate;
-
-    //
-    // calculate the gradient
-    //
-    filters_grad_.Resize(num_filters, filter_dim, kSetZero);  // reset
-    bias_grad_.Resize(num_filters, kSetZero);  // reset
-    // use all the patches
-    for (int32 p = 0; p < num_patches; p++) {  // sum
-      CuSubMatrix<BaseFloat> diff_patch(diff.ColRange(p * num_filters,
-                                                      num_filters));
-      CuSubMatrix<BaseFloat> patch(vectorized_feature_patches_.ColRange(
-                                   p * filter_dim, filter_dim));
-      filters_grad_.AddMatMat(1.0, diff_patch, kTrans, patch, kNoTrans, 1.0);
-      bias_grad_.AddRowSumMat(1.0, diff_patch, 1.0);
-    }
-
-    //
-    // update
-    //
-    filters_.AddMat(-lr*learn_rate_coef_, filters_grad_);
-    bias_.AddVec(-lr*bias_learn_rate_coef_, bias_grad_);
-    //
-
-    // max-norm
-    if (max_norm_ > 0.0) {
-      CuMatrix<BaseFloat> lin_sqr(filters_);
-      lin_sqr.MulElements(filters_);
-      CuVector<BaseFloat> l2(filters_.NumRows());
-      l2.AddColSumMat(1.0, lin_sqr, 0.0);
-      l2.ApplyPow(0.5);  // we have per-neuron L2 norms
-      CuVector<BaseFloat> scl(l2);
-      scl.Scale(1.0/max_norm_);
-      scl.ApplyFloor(1.0);
-      scl.InvertElements();
-      filters_.MulRowsVec(scl);  // shink to sphere!
-    }
-  }
-
- private:
-  int32 patch_dim_,    ///< number of consecutive inputs, 1st dim of patch
-        patch_step_,   ///< step of the convolution
-                       ///  (i.e. shift between 2 patches)
-        patch_stride_;  ///< shift for 2nd dim of a patch
-                       ///  (i.e. frame length before splicing)
-
-  CuMatrix<BaseFloat> filters_;  ///< row = vectorized rectangular filter
-  CuVector<BaseFloat> bias_;  ///< bias for each filter
-
-  CuMatrix<BaseFloat> filters_grad_;  ///< gradient of filters
-  CuVector<BaseFloat> bias_grad_;  ///< gradient of biases
-
-  BaseFloat max_norm_;  ///< limit L2 norm of a neuron weights to positive value
-
-  /** Buffer of reshaped inputs:
-   *  1row = vectorized rectangular feature patches,
-   *  1col = dim over speech frames
-   *  Map of input features:
-   *  std::vector-dim = patch-position
-   */
-  CuMatrix<BaseFloat> vectorized_feature_patches_;
-  std::vector<int32> column_map_;
-
-  /** Buffer for backpropagation:
-   *  derivatives in the domain of 'vectorized_feature_patches_',
-   *  1row = vectorized rectangular feature patches,
-   *  1col = dim over speech frames,
-   */
-  CuMatrix<BaseFloat> feature_patch_diffs_;
-};
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_CONVOLUTIONAL_COMPONENT_H_
diff --git a/src/nnet/nnet-frame-pooling-component.h b/src/nnet/nnet-frame-pooling-component.h
deleted file mode 100644
index ecc71274993..00000000000
--- a/src/nnet/nnet-frame-pooling-component.h
+++ /dev/null
@@ -1,290 +0,0 @@
-// nnet/nnet-frame-pooling-component.h
-
-// Copyright 2014  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_FRAME_POOLING_COMPONENT_H_
-#define KALDI_NNET_NNET_FRAME_POOLING_COMPONENT_H_
-
-#include <string>
-#include <vector>
-#include <algorithm>
-#include <sstream>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-utils.h"
-#include "cudamatrix/cu-math.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-/**
- * FramePoolingComponent :
- * The input/output matrices are split to frames of width 'feature_dim_'.
- * Here we do weighted pooling of frames along the temporal axis,
- * given a frame-offset of leftmost frame, the pool-size is defined
- * by weight-vector size.
- */
-class FramePoolingComponent : public UpdatableComponent {
- public:
-  FramePoolingComponent(int32 dim_in, int32 dim_out):
-    UpdatableComponent(dim_in, dim_out),
-    feature_dim_(0),
-    normalize_(false)
-  { }
-
-  ~FramePoolingComponent()
-  { }
-
-  Component* Copy() const { return new FramePoolingComponent(*this); }
-  ComponentType GetType() const { return kFramePoolingComponent; }
-
-  /**
-   * Here the offsets are w.r.t. central frames, which has offset 0.
-   * Note.: both the offsets and pool sizes can be negative.
-   */
-  void InitData(std::istream &is) {
-    // temporary, for initialization,
-    std::vector<int32> pool_size;
-    std::vector<int32> central_offset;
-    Vector<BaseFloat> pool_weight;
-    float learn_rate_coef = 0.01;
-    // parse config
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<FeatureDim>") ReadBasicType(is, false, &feature_dim_);
-      else if (token == "<CentralOffset>") ReadIntegerVector(is, false, &central_offset);
-      else if (token == "<PoolSize>") ReadIntegerVector(is, false, &pool_size);
-      else if (token == "<PoolWeight>") pool_weight.Read(is, false);
-      else if (token == "<LearnRateCoef>") ReadBasicType(is, false, &learn_rate_coef);
-      else if (token == "<Normalize>") ReadBasicType(is, false, &normalize_);
-      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-                     << " (FeatureDim|CentralOffset <vec>|PoolSize <vec>|LearnRateCoef|Normalize)";
-    }
-    // check inputs:
-    KALDI_ASSERT(feature_dim_ > 0);
-    KALDI_ASSERT(central_offset.size() > 0);
-    KALDI_ASSERT(central_offset.size() == pool_size.size());
-    // initialize:
-    int32 num_frames = InputDim() / feature_dim_;
-    int32 central_frame = (num_frames -1) / 2;
-    int32 num_pools = central_offset.size();
-    offset_.resize(num_pools);
-    weight_.resize(num_pools);
-    for (int32 p = 0; p < num_pools; p++) {
-      offset_[p] = central_frame + central_offset[p] + std::min(0, pool_size[p]+1);
-      weight_[p].Resize(std::abs(pool_size[p]));
-      weight_[p].Set(1.0/std::abs(pool_size[p]));
-    }
-    learn_rate_coef_ = learn_rate_coef;
-    if (pool_weight.Dim() != 0) {
-      KALDI_LOG << "Initializing from pool-weight vector";
-      int32 num_weights = 0;
-      for (int32 p = 0; p < num_pools; p++) {
-        weight_[p].CopyFromVec(pool_weight.Range(num_weights, weight_[p].Dim()));
-        num_weights += weight_[p].Dim();
-      }
-      KALDI_ASSERT(num_weights == pool_weight.Dim());
-    }
-    // check that offsets are within the splice we had,
-    for (int32 p = 0; p < num_pools; p++) {
-      KALDI_ASSERT(offset_[p] >= 0);
-      KALDI_ASSERT(offset_[p] + weight_[p].Dim() <= num_frames);
-    }
-  }
-
-  /**
-   * Here the offsets are w.r.t. leftmost frame from splice, its offset is 0.
-   * If we spliced +/- 15 frames, the central frames has index '15'.
-   */
-  void ReadData(std::istream &is, bool binary) {
-    // get the input dimension before splicing
-    ExpectToken(is, binary, "<FeatureDim>");
-    ReadBasicType(is, binary, &feature_dim_);
-    ExpectToken(is, binary, "<LearnRateCoef>");
-    ReadBasicType(is, binary, &learn_rate_coef_);
-    ExpectToken(is, binary, "<Normalize>");
-    ReadBasicType(is, binary, &normalize_);
-    // read the offsets w.r.t. central frame
-    ExpectToken(is, binary, "<FrameOffset>");
-    ReadIntegerVector(is, binary, &offset_);
-    // read the frame-weights
-    ExpectToken(is, binary, "<FrameWeight>");
-    int32 num_pools = offset_.size();
-    weight_.resize(num_pools);
-    for (int32 p = 0; p < num_pools; p++) {
-      weight_[p].Read(is, binary);
-    }
-    //
-    // Sanity checks:
-    //
-    KALDI_ASSERT(input_dim_ % feature_dim_ == 0);
-    KALDI_ASSERT(output_dim_ % feature_dim_ == 0);
-    KALDI_ASSERT(output_dim_ / feature_dim_ == num_pools);
-    KALDI_ASSERT(offset_.size() == weight_.size());
-    // check the shifts don't exceed the splicing
-    int32 total_frame = InputDim() / feature_dim_;
-    for (int32 p = 0; p < num_pools; p++) {
-      KALDI_ASSERT(offset_[p] >= 0);
-      KALDI_ASSERT(offset_[p] + (weight_[p].Dim()-1) < total_frame);
-    }
-    //
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    WriteToken(os, binary, "<FeatureDim>");
-    WriteBasicType(os, binary, feature_dim_);
-    WriteToken(os, binary, "<LearnRateCoef>");
-    WriteBasicType(os, binary, learn_rate_coef_);
-    WriteToken(os, binary, "<Normalize>");
-    WriteBasicType(os, binary, normalize_);
-    WriteToken(os, binary, "<FrameOffset>");
-    WriteIntegerVector(os, binary, offset_);
-    // write pooling weights of individual frames
-    WriteToken(os, binary, "<FrameWeight>");
-    int32 num_pools = offset_.size();
-    for (int32 p = 0; p < num_pools; p++) {
-      weight_[p].Write(os, binary);
-    }
-  }
-
-  int32 NumParams() const {
-    int32 ans = 0;
-    for (int32 p = 0; p < weight_.size(); p++) {
-      ans += weight_[p].Dim();
-    }
-    return ans;
-  }
-
-  void GetGradient(VectorBase<BaseFloat> *gradient) const {
-    KALDI_ERR << "Unimplemented.";
-  }
-
-  void GetParams(VectorBase<BaseFloat>* params) const {
-    KALDI_ASSERT(params->Dim() == NumParams());
-    int32 offset = 0;
-    for (int32 p = 0; p < weight_.size(); p++) {
-      params->Range(offset, weight_[p].Dim()).CopyFromVec(weight_[p]);
-      offset += weight_[p].Dim();
-    }
-    KALDI_ASSERT(offset == params->Dim());
-  }
-
-  void SetParams(const VectorBase<BaseFloat>& params) {
-    KALDI_ERR << "Unimplemented.";
-  }
-
-  std::string Info() const {
-    std::ostringstream oss;
-    oss << "\n  (offset,weights) : ";
-    for (int32 p = 0; p < weight_.size(); p++) {
-      oss << "(" << offset_[p] << "," << weight_[p] << "), ";
-    }
-    return oss.str();
-  }
-
-  std::string InfoGradient() const {
-    std::ostringstream oss;
-    oss << "\n  lr-coef " << ToString(learn_rate_coef_);
-    oss << "\n  (offset,weights_grad) : ";
-    for (int32 p = 0; p < weight_diff_.size(); p++) {
-      oss << "(" << offset_[p] << ",";
-      // pass the weight vector, remove '\n' as last char
-      oss << weight_diff_[p];
-      oss.seekp(-1, std::ios_base::cur);
-      oss << "), ";
-    }
-    return oss.str();
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // check dims
-    KALDI_ASSERT(in.NumCols() % feature_dim_ == 0);
-    KALDI_ASSERT(out->NumCols() % feature_dim_ == 0);
-    // useful dims
-    int32 num_pools = offset_.size();
-    // compute the output pools
-    for (int32 p = 0; p < num_pools; p++) {
-      CuSubMatrix<BaseFloat> tgt(out->ColRange(p*feature_dim_, feature_dim_));
-      tgt.SetZero();  // reset
-      for (int32 i = 0; i < weight_[p].Dim(); i++) {
-        tgt.AddMat(weight_[p](i), in.ColRange((offset_[p]+i) * feature_dim_, feature_dim_));
-      }
-    }
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    KALDI_ERR << "Unimplemented.";
-  }
-
-
-  void Update(const CuMatrixBase<BaseFloat> &input,
-              const CuMatrixBase<BaseFloat> &diff) {
-    // useful dims
-    int32 num_pools = offset_.size();
-    // lazy init
-    if (weight_diff_.size() != num_pools) weight_diff_.resize(num_pools);
-    // get the derivatives
-    for (int32 p = 0; p < num_pools; p++) {
-      weight_diff_[p].Resize(weight_[p].Dim(), kSetZero);  // reset
-      for (int32 i = 0; i < weight_[p].Dim(); i++) {
-        // multiply matrices element-wise, and sum to get the derivative
-        CuSubMatrix<BaseFloat> in_frame(
-          input.ColRange((offset_[p]+i) * feature_dim_, feature_dim_)
-        );
-        CuSubMatrix<BaseFloat> diff_frame(
-          diff.ColRange(p * feature_dim_, feature_dim_)
-        );
-        CuMatrix<BaseFloat> mul_elems(in_frame);
-        mul_elems.MulElements(diff_frame);
-        weight_diff_[p](i) = mul_elems.Sum();
-      }
-    }
-    // update
-    for (int32 p = 0; p < num_pools; p++) {
-      weight_[p].AddVec(- learn_rate_coef_ * opts_.learn_rate, weight_diff_[p]);
-    }
-    // force to be positive, re-normalize the sum
-    if (normalize_) {
-      for (int32 p = 0; p < num_pools; p++) {
-        weight_[p].ApplyFloor(0.0);
-        weight_[p].Scale(1.0/weight_[p].Sum());
-      }
-    }
-  }
-
- private:
-  int32 feature_dim_;  // feature dimension before splicing
-  std::vector<int32> offset_;  // vector of pooling offsets
-  /// Vector of pooling weight vectors,
-  std::vector<Vector<BaseFloat> > weight_;
-  /// detivatives of weight vectors,
-  std::vector<Vector<BaseFloat> > weight_diff_;
-
-  bool normalize_;  // apply normalization after each update
-};
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_FRAME_POOLING_COMPONENT_H_
diff --git a/src/nnet/nnet-kl-hmm.h b/src/nnet/nnet-kl-hmm.h
deleted file mode 100644
index 8ba3901daa7..00000000000
--- a/src/nnet/nnet-kl-hmm.h
+++ /dev/null
@@ -1,155 +0,0 @@
-// nnet/nnet-kl-hmm.h
-
-// Copyright 2013  Idiap Research Institute (Author: David Imseng)
-//                 Karlsruhe Institute of Technology (Author: Ngoc Thang Vu)
-//                 Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_KL_HMM_H_
-#define KALDI_NNET_NNET_KL_HMM_H_
-
-#include <vector>
-
-#include "nnet/nnet-component.h"
-#include "cudamatrix/cu-math.h"
-#include "cudamatrix/cu-rand.h"
-#include "matrix/kaldi-vector.h"
-#include "matrix/kaldi-matrix.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-class KlHmm : public Component {
- public:
-  KlHmm(int32 dim_in, int32 dim_out):
-    Component(dim_in, dim_out),
-    kl_stats_(dim_out, dim_in, kSetZero)
-  { }
-
-  ~KlHmm()
-  { }
-
-  Component* Copy() const { return new KlHmm(*this); }
-  ComponentType GetType() const { return kKlHmm; }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    if (kl_inv_q_.NumRows() == 0) {
-      // Copy the CudaMatrix to a Matrix
-      Matrix<BaseFloat> in_tmp(in.NumRows(), in.NumCols());
-      in.CopyToMat(&in_tmp);
-      // Check if there are posteriors in the Matrix (check on first row),
-      BaseFloat post_sum = in_tmp.Row(0).Sum();
-      KALDI_ASSERT(ApproxEqual(post_sum, 1.0));
-      // Get a tmp Matrix of the stats
-      Matrix<BaseFloat> kl_stats_tmp(kl_stats_);
-      // Init a vector to get the sum of the rows (for normalization)
-      Vector<BaseFloat> row_sum(kl_stats_.NumRows(), kSetZero);
-      // Get the sum of the posteriors for normalization
-      row_sum.AddColSumMat(1, kl_stats_tmp);
-      // Apply floor to make sure there is no zero
-      row_sum.ApplyFloor(1e-20);
-      // Invert the sum (to normalize)
-      row_sum.InvertElements();
-      // Normalizing the statistics vector
-      kl_stats_tmp.MulRowsVec(row_sum);
-      // Apply floor before inversion and logarithm
-      kl_stats_tmp.ApplyFloor(1e-20);
-      // Apply invesion
-      kl_stats_tmp.InvertElements();
-      // Apply logarithm
-      kl_stats_tmp.ApplyLog();
-      // Inverted and logged values
-      kl_inv_q_.Resize(kl_stats_.NumRows(), kl_stats_.NumCols());
-      // Holds now log (1/Q)
-      kl_inv_q_.CopyFromMat(kl_stats_tmp);
-    }
-    // Get the logarithm of the features for the Entropy calculation
-    // Copy the CudaMatrix to a Matrix
-    Matrix<BaseFloat> in_log_tmp(in.NumRows(), in.NumCols());
-    in.CopyToMat(&in_log_tmp);
-    // Flooring and log
-    in_log_tmp.ApplyFloor(1e-20);
-    in_log_tmp.ApplyLog();
-    CuMatrix<BaseFloat> log_in(in.NumRows(), in.NumCols());
-    log_in.CopyFromMat(in_log_tmp);
-    // P*logP
-    CuMatrix<BaseFloat> tmp_entropy(in);
-    tmp_entropy.MulElements(log_in);
-    // Getting the entropy (sum P*logP)
-    CuVector<BaseFloat> in_entropy(in.NumRows(), kSetZero);
-    in_entropy.AddColSumMat(1, tmp_entropy);
-    // sum P*log (1/Q)
-    out->AddMatMat(1, in, kNoTrans, kl_inv_q_, kTrans, 0);
-    // (sum P*logP) + (sum P*log(1/Q)
-    out->AddVecToCols(1, in_entropy);
-    // return the negative KL-divergence
-    out->Scale(-1);
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    KALDI_ERR << "Unimplemented";
-  }
-
-  /// Reads the component content
-  void ReadData(std::istream &is, bool binary) {
-    kl_stats_.Read(is, binary);
-    KALDI_ASSERT(kl_stats_.NumRows() == output_dim_);
-    KALDI_ASSERT(kl_stats_.NumCols() == input_dim_);
-  }
-
-  /// Writes the component content
-  void WriteData(std::ostream &os, bool binary) const {
-    kl_stats_.Write(os, binary);
-  }
-
-  /// Set the statistics matrix
-  void SetStats(const Matrix<BaseFloat> mat) {
-    KALDI_ASSERT(mat.NumRows() == output_dim_);
-    KALDI_ASSERT(mat.NumCols() == input_dim_);
-    kl_stats_.Resize(mat.NumRows(), mat.NumCols());
-    kl_stats_.CopyFromMat(mat);
-  }
-
-  /// Accumulate the statistics for KL-HMM paramter estimation,
-  void Accumulate(const Matrix<BaseFloat> &posteriors,
-                  const std::vector<int32> &alignment) {
-    KALDI_ASSERT(posteriors.NumRows() == alignment.size());
-    KALDI_ASSERT(posteriors.NumCols() == kl_stats_.NumCols());
-    int32 num_frames = alignment.size();
-    for (int32 i = 0; i < num_frames; i++) {
-      // Casting float posterior to double (fixing numerical issue),
-      Vector<double> temp(posteriors.Row(i));
-      // Sum the postiors grouped by states from the alignment,
-      kl_stats_.Row(alignment[i]).AddVec(1, temp);
-    }
-  }
-
- private:
-  Matrix<double> kl_stats_;
-  CuMatrix<BaseFloat> kl_inv_q_;
-};
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_KL_HMM_H_
-
diff --git a/src/nnet/nnet-linear-transform.h b/src/nnet/nnet-linear-transform.h
deleted file mode 100644
index 733ad778970..00000000000
--- a/src/nnet/nnet-linear-transform.h
+++ /dev/null
@@ -1,212 +0,0 @@
-// nnet/nnet-linear-transform.h
-
-// Copyright 2011-2014  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_LINEAR_TRANSFORM_H_
-#define KALDI_NNET_NNET_LINEAR_TRANSFORM_H_
-
-#include <string>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-utils.h"
-#include "cudamatrix/cu-math.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-class LinearTransform : public UpdatableComponent {
- public:
-  LinearTransform(int32 dim_in, int32 dim_out):
-    UpdatableComponent(dim_in, dim_out),
-    linearity_(dim_out, dim_in),
-    linearity_corr_(dim_out, dim_in)
-  { }
-
-  ~LinearTransform()
-  { }
-
-  Component* Copy() const { return new LinearTransform(*this); }
-  ComponentType GetType() const { return kLinearTransform; }
-
-  void InitData(std::istream &is) {
-    // define options
-    float param_stddev = 0.1;
-    std::string read_matrix_file;
-    // parse config
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<ParamStddev>") ReadBasicType(is, false, &param_stddev);
-      else if (token == "<ReadMatrix>") ReadToken(is, false, &read_matrix_file);
-      else if (token == "<LearnRateCoef>") ReadBasicType(is, false, &learn_rate_coef_);
-      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-                     << " (ParamStddev|ReadMatrix|LearnRateCoef)";
-    }
-
-    if (read_matrix_file != "") {  // load from file,
-      bool binary;
-      Input in(read_matrix_file, &binary);
-      linearity_.Read(in.Stream(), binary);
-      in.Close();
-      // check dims,
-      if (OutputDim() != linearity_.NumRows() ||
-          InputDim() != linearity_.NumCols()) {
-        KALDI_ERR << "Dimensionality mismatch! Expected matrix"
-                  << " r=" << OutputDim() << " c=" << InputDim()
-                  << ", loaded matrix " << read_matrix_file
-                  << " with r=" << linearity_.NumRows()
-                  << " c=" << linearity_.NumCols();
-      }
-      KALDI_LOG << "Loaded <LinearTransform> matrix from file : "
-                << read_matrix_file;
-      return;
-    }
-
-    //
-    // Initialize trainable parameters,
-    //
-    // Gaussian with given std_dev (mean = 0),
-    linearity_.Resize(OutputDim(), InputDim());
-    RandGauss(0.0, param_stddev, &linearity_);
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    // Read all the '<Tokens>' in arbitrary order,
-    while ('<' == Peek(is, binary)) {
-      int first_char = PeekToken(is, binary);
-      switch (first_char) {
-        case 'L': ExpectToken(is, binary, "<LearnRateCoef>");
-          ReadBasicType(is, binary, &learn_rate_coef_);
-          break;
-        default:
-          std::string token;
-          ReadToken(is, false, &token);
-          KALDI_ERR << "Unknown token: " << token;
-      }
-    }
-    // Read the data (data follow the tokens),
-
-    // weights
-    linearity_.Read(is, binary);
-
-    KALDI_ASSERT(linearity_.NumRows() == output_dim_);
-    KALDI_ASSERT(linearity_.NumCols() == input_dim_);
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    WriteToken(os, binary, "<LearnRateCoef>");
-    WriteBasicType(os, binary, learn_rate_coef_);
-    if (!binary) os << "\n";
-    linearity_.Write(os, binary);
-  }
-
-  int32 NumParams() const {
-    return linearity_.NumRows()*linearity_.NumCols();
-  }
-
-  void GetGradient(VectorBase<BaseFloat>* gradient) const {
-    KALDI_ASSERT(gradient->Dim() == NumParams());
-    gradient->CopyRowsFromMat(linearity_corr_);
-  }
-
-  void GetParams(VectorBase<BaseFloat>* params) const {
-    KALDI_ASSERT(params->Dim() == NumParams());
-    params->CopyRowsFromMat(linearity_);
-  }
-
-  void SetParams(const VectorBase<BaseFloat>& params) {
-    KALDI_ASSERT(params.Dim() == NumParams());
-    linearity_.CopyRowsFromVec(params);
-  }
-
-  void SetLinearity(const MatrixBase<BaseFloat>& l) {
-    KALDI_ASSERT(l.NumCols() == linearity_.NumCols());
-    KALDI_ASSERT(l.NumRows() == linearity_.NumRows());
-    linearity_.CopyFromMat(l);
-  }
-
-  std::string Info() const {
-    return std::string("\n  linearity") +
-      MomentStatistics(linearity_) +
-      ", lr-coef " + ToString(learn_rate_coef_);
-  }
-  std::string InfoGradient() const {
-    return std::string("\n  linearity_grad") +
-      MomentStatistics(linearity_corr_) +
-      ", lr-coef " + ToString(learn_rate_coef_);
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // multiply by weights^t
-    out->AddMatMat(1.0, in, kNoTrans, linearity_, kTrans, 0.0);
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // multiply error derivative by weights
-    in_diff->AddMatMat(1.0, out_diff, kNoTrans, linearity_, kNoTrans, 0.0);
-  }
-
-
-  void Update(const CuMatrixBase<BaseFloat> &input,
-              const CuMatrixBase<BaseFloat> &diff) {
-    // we use following hyperparameters from the option class
-    const BaseFloat lr = opts_.learn_rate;
-    const BaseFloat mmt = opts_.momentum;
-    const BaseFloat l2 = opts_.l2_penalty;
-    const BaseFloat l1 = opts_.l1_penalty;
-    // we will also need the number of frames in the mini-batch
-    const int32 num_frames = input.NumRows();
-    // compute gradient (incl. momentum)
-    linearity_corr_.AddMatMat(1.0, diff, kTrans, input, kNoTrans, mmt);
-    // l2 regularization
-    if (l2 != 0.0) {
-      linearity_.AddMat(-lr*l2*num_frames, linearity_);
-    }
-    // l1 regularization
-    if (l1 != 0.0) {
-      cu::RegularizeL1(&linearity_, &linearity_corr_, lr*l1*num_frames, lr);
-    }
-    // update
-    linearity_.AddMat(-lr*learn_rate_coef_, linearity_corr_);
-  }
-
-  /// Accessors to the component parameters
-  const CuMatrixBase<BaseFloat>& GetLinearity() { return linearity_; }
-
-  void SetLinearity(const CuMatrixBase<BaseFloat>& linearity) {
-    KALDI_ASSERT(linearity.NumRows() == linearity_.NumRows());
-    KALDI_ASSERT(linearity.NumCols() == linearity_.NumCols());
-    linearity_.CopyFromMat(linearity);
-  }
-
-  const CuMatrixBase<BaseFloat>& GetLinearityCorr() { return linearity_corr_; }
-
- private:
-  CuMatrix<BaseFloat> linearity_;
-  CuMatrix<BaseFloat> linearity_corr_;
-};
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_LINEAR_TRANSFORM_H_
diff --git a/src/nnet/nnet-loss.cc b/src/nnet/nnet-loss.cc
deleted file mode 100644
index eb2233d33a6..00000000000
--- a/src/nnet/nnet-loss.cc
+++ /dev/null
@@ -1,460 +0,0 @@
-// nnet/nnet-loss.cc
-
-// Copyright 2011-2015  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <sstream>
-#include <iterator>
-#include <algorithm>
-#include <iomanip>
-
-#include "nnet/nnet-loss.h"
-#include "nnet/nnet-utils.h"
-#include "cudamatrix/cu-math.h"
-#include "hmm/posterior.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-
-/* Xent */
-
-/**
- * Helper function of Xent::Eval,
- * calculates number of matching elemente in 'hyp', 'ref' weighted by 'weights'.
- */
-template <typename T>
-inline void CountCorrectFramesWeighted(const CuArray<T> &hyp,
-                                       const CuArray<T> &ref,
-                                       const CuVectorBase<BaseFloat> &weights,
-                                       Vector<double> *correct) {
-  KALDI_ASSERT(hyp.Dim() == ref.Dim());
-  KALDI_ASSERT(hyp.Dim() == weights.Dim());
-  int32 dim = hyp.Dim();
-  // Get GPU data to host,
-  std::vector<T> hyp_h(dim), ref_h(dim);
-  hyp.CopyToVec(&hyp_h);
-  ref.CopyToVec(&ref_h);
-  Vector<BaseFloat> w(dim);
-  weights.CopyToVec(&w);
-  // Accumulate weighted counts of correct frames,
-  for (int32 i = 0; i < dim; i++) {
-    KALDI_ASSERT(ref_h[i] < correct->Dim());
-    (*correct)(ref_h[i]) += w(i) * (hyp_h[i] == ref_h[i] ? 1.0 : 0.0);
-  }
-}
-
-
-void Xent::Eval(const VectorBase<BaseFloat> &frame_weights,
-                const CuMatrixBase<BaseFloat> &net_out,
-                const CuMatrixBase<BaseFloat> &targets,
-                CuMatrix<BaseFloat> *diff) {
-  // check inputs,
-  KALDI_ASSERT(net_out.NumCols() == targets.NumCols());
-  KALDI_ASSERT(net_out.NumRows() == targets.NumRows());
-  KALDI_ASSERT(net_out.NumRows() == frame_weights.Dim());
-
-  KALDI_ASSERT(KALDI_ISFINITE(frame_weights.Sum()));
-  KALDI_ASSERT(KALDI_ISFINITE(net_out.Sum()));
-  KALDI_ASSERT(KALDI_ISFINITE(targets.Sum()));
-
-  // buffer initialization,
-  int32 num_classes = targets.NumCols();
-  if (frames_.Dim() == 0) {
-    frames_.Resize(num_classes);
-    xentropy_.Resize(num_classes);
-    entropy_.Resize(num_classes);
-    correct_.Resize(num_classes);
-  }
-
-  // get frame_weights to GPU,
-  frame_weights_ = frame_weights;
-
-  // There may be frames for which the sum of targets is zero.
-  // This happens in multi-lingual training when the frame
-  // has target class in the softmax of another language.
-  // We 'switch-off' such frames by masking the 'frame_weights_',
-  target_sum_.Resize(targets.NumRows());
-  target_sum_.AddColSumMat(1.0, targets, 0.0);
-  frame_weights_.MulElements(target_sum_);
-
-  // compute derivative wrt. activations of last layer of neurons,
-  *diff = net_out;
-  diff->AddMat(-1.0, targets);
-  diff->MulRowsVec(frame_weights_);  // weighting,
-
-  // count frames per class,
-  frames_aux_ = targets;
-  frames_aux_.MulRowsVec(frame_weights_);
-  frames_.AddRowSumMat(1.0, CuMatrix<double>(frames_aux_));
-
-  // evaluate the frame-level classification,
-  net_out.FindRowMaxId(&max_id_out_);  // find max in nn-output
-  targets.FindRowMaxId(&max_id_tgt_);  // find max in targets
-  CountCorrectFramesWeighted(max_id_out_, max_id_tgt_,
-                             frame_weights_, &correct_);
-
-  // calculate cross_entropy (in GPU),
-  xentropy_aux_ = net_out;  // y
-  xentropy_aux_.Add(1e-20);  // avoid log(0)
-  xentropy_aux_.ApplyLog();  // log(y)
-  xentropy_aux_.MulElements(targets);  // t*log(y)
-  xentropy_aux_.MulRowsVec(frame_weights_);  // w*t*log(y)
-  xentropy_.AddRowSumMat(-1.0, CuMatrix<double>(xentropy_aux_));
-
-  // caluculate entropy (in GPU),
-  entropy_aux_ = targets;  // t
-  entropy_aux_.Add(1e-20);  // avoid log(0)
-  entropy_aux_.ApplyLog();  // log(t)
-  entropy_aux_.MulElements(targets);  // t*log(t)
-  entropy_aux_.MulRowsVec(frame_weights_);  // w*t*log(t)
-  entropy_.AddRowSumMat(-1.0, CuMatrix<double>(entropy_aux_));
-
-  // progressive loss reporting
-  if (opts_.loss_report_frames > 0) {
-    frames_progress_ += frame_weights_.Sum();
-    xentropy_progress_ += -xentropy_aux_.Sum();
-    entropy_progress_ += -entropy_aux_.Sum();
-
-    KALDI_ASSERT(KALDI_ISFINITE(xentropy_progress_));
-    KALDI_ASSERT(KALDI_ISFINITE(entropy_progress_));
-
-    if (frames_progress_ > opts_.loss_report_frames) {
-      // loss value,
-      double progress_value =
-        (xentropy_progress_ - entropy_progress_) / frames_progress_;
-
-      // time-related info (fps is weighted),
-      double time_now = timer_.Elapsed();
-      double fps = frames_progress_ / (time_now - elapsed_seconds_);
-      double elapsed_hours = time_now / 3600;
-      elapsed_seconds_ = time_now; // store,
-
-      // print,
-      KALDI_LOG << "ProgressLoss[last "
-                << static_cast<int>(frames_progress_/100/3600) << "h of "
-                << static_cast<int>(frames_.Sum()/100/3600) << "h]: "
-                << progress_value << " (Xent)"
-                << ", fps=" << fps
-                << std::setprecision(3)
-                << ", elapsed " << elapsed_hours << "h";
-      // store,
-      loss_vec_.push_back(progress_value);
-      // reset,
-      frames_progress_ = 0;
-      xentropy_progress_ = 0.0;
-      entropy_progress_ = 0.0;
-    }
-  }
-}
-
-
-void Xent::Eval(const VectorBase<BaseFloat> &frame_weights,
-                const CuMatrixBase<BaseFloat> &net_out,
-                const Posterior &post,
-                CuMatrix<BaseFloat> *diff) {
-  int32 num_frames = net_out.NumRows(),
-    num_pdf = net_out.NumCols();
-  KALDI_ASSERT(num_frames == post.size());
-
-  // convert posterior to matrix,
-  PosteriorToMatrix(post, num_pdf, &tgt_mat_);
-
-  // call the other eval function,
-  Eval(frame_weights, net_out, tgt_mat_, diff);
-}
-
-
-std::string Xent::Report() {
-  double loss_value =
-    (xentropy_.Sum() - entropy_.Sum()) / frames_.Sum();
-  std::ostringstream oss;
-  oss << "AvgLoss: " << loss_value << " (Xent), "
-      << "[AvgXent: " << xentropy_.Sum() / frames_.Sum()
-      << ", AvgTargetEnt: " << entropy_.Sum() / frames_.Sum()
-      << "]" << std::endl;
-
-  oss << "progress: [";
-  std::copy(loss_vec_.begin(), loss_vec_.end(),
-            std::ostream_iterator<float>(oss, " "));
-  oss << "]" << std::endl;
-
-  double frame_accuracy = 100.0 * correct_.Sum() / frames_.Sum();
-  oss << "FRAME_ACCURACY >> " << frame_accuracy << "% <<" << std::endl;
-
-  return oss.str();
-}
-
-
-std::string Xent::ReportPerClass() {
-  std::ostringstream oss;
-  oss << "PER-CLASS PERFORMANCE:" << std::endl;
-  oss << "@@@ Frames per-class:" << frames_;
-  // get inverted counts,
-  CuVector<double> inv_frames(frames_);
-  inv_frames.Add(0.5);  // avoid 0-frames,
-  inv_frames.ApplyPow(-1.0);
-  // loss, kl = xentropy-entropy,
-  CuVector<double> loss(xentropy_);
-  loss.AddVec(-1.0, entropy_);
-  loss.MulElements(inv_frames);
-  oss << "@@@ Loss per-class:" << loss;
-  // frame accuracy (assuming targets are binary),
-  CuVector<double> frm_accu(correct_);
-  frm_accu.MulElements(inv_frames);
-  frm_accu.Scale(100.0);
-  oss << "@@@ Frame-accuracy per-class:" << frm_accu;
-  //
-  return oss.str();
-}
-
-
-/* Mse */
-
-void Mse::Eval(const VectorBase<BaseFloat> &frame_weights,
-               const CuMatrixBase<BaseFloat>& net_out,
-               const CuMatrixBase<BaseFloat>& target,
-               CuMatrix<BaseFloat>* diff) {
-  // check inputs,
-  KALDI_ASSERT(net_out.NumCols() == target.NumCols());
-  KALDI_ASSERT(net_out.NumRows() == target.NumRows());
-  KALDI_ASSERT(net_out.NumRows() == frame_weights.Dim());
-
-  KALDI_ASSERT(KALDI_ISFINITE(frame_weights.Sum()));
-  KALDI_ASSERT(KALDI_ISFINITE(net_out.Sum()));
-  KALDI_ASSERT(KALDI_ISFINITE(target.Sum()));
-
-  int32 num_frames = frame_weights.Sum();
-  KALDI_ASSERT(num_frames >= 0.0);
-
-  // get frame_weights to GPU,
-  frame_weights_ = frame_weights;
-
-  // compute derivative w.r.t. neural nerwork outputs
-  *diff = net_out;  // y
-  diff->AddMat(-1.0, target);  // (y - t)
-  diff->MulRowsVec(frame_weights_);  // weighting,
-
-  // Compute MeanSquareError loss of mini-batch
-  diff_pow_2_ = *diff;
-  diff_pow_2_.MulElements(diff_pow_2_);  // (y - t)^2
-  diff_pow_2_.MulRowsVec(frame_weights_);  // w*(y - t)^2
-  double mean_square_error = 0.5 * diff_pow_2_.Sum();  // sum the matrix,
-
-  KALDI_ASSERT(KALDI_ISFINITE(mean_square_error));
-
-  // accumulate
-  loss_ += mean_square_error;
-  frames_ += num_frames;
-
-  // progressive loss reporting
-  if (opts_.loss_report_frames > 0) {
-    frames_progress_ += num_frames;
-    loss_progress_ += mean_square_error;
-    if (frames_progress_ > opts_.loss_report_frames) {
-      KALDI_LOG << "ProgressLoss[last "
-                << static_cast<int>(frames_progress_/100/3600) << "h of "
-                << static_cast<int>(frames_/100/3600) << "h]: "
-                << loss_progress_/frames_progress_ << " (Mse)";
-      // store
-      loss_vec_.push_back(loss_progress_/frames_progress_);
-      // reset
-      frames_progress_ = 0;
-      loss_progress_ = 0.0;
-    }
-  }
-}
-
-
-void Mse::Eval(const VectorBase<BaseFloat> &frame_weights,
-               const CuMatrixBase<BaseFloat>& net_out,
-               const Posterior& post,
-               CuMatrix<BaseFloat>* diff) {
-  int32 num_frames = net_out.NumRows(),
-    num_nn_outputs = net_out.NumCols();
-  KALDI_ASSERT(num_frames == post.size());
-
-  // convert posterior to matrix,
-  PosteriorToMatrix(post, num_nn_outputs, &tgt_mat_);
-
-  // call the other eval function,
-  Eval(frame_weights, net_out, tgt_mat_, diff);
-}
-
-
-std::string Mse::Report() {
-  // compute root mean square,
-  int32 num_tgt = diff_pow_2_.NumCols();
-  BaseFloat root_mean_square = sqrt(loss_/frames_/num_tgt);
-  // build the message,
-  std::ostringstream oss;
-  oss << "AvgLoss: " << loss_/frames_ << " (Mse), "
-      << "[RMS " << root_mean_square << ", frames "
-      << frames_ << "]" << std::endl;
-  oss << "progress: [";
-  std::copy(loss_vec_.begin(), loss_vec_.end(),
-            std::ostream_iterator<float>(oss, " "));
-  oss << "]" << std::endl;
-  return oss.str();
-}
-
-
-/* MultiTaskLoss */
-
-void MultiTaskLoss::InitFromString(const std::string& s) {
-  std::vector<std::string> v;
-  SplitStringToVector(s, ",:" /* delimiter */, false, &v);
-
-  KALDI_ASSERT((v.size()-1) % 3 == 0);  // triplets,
-  KALDI_ASSERT(v[0] == "multitask");  // header,
-
-  // parse the definition of multitask loss,
-  std::vector<std::string>::iterator it(v.begin()+1);  // skip header,
-  for ( ; it != v.end(); ++it) {
-    // type,
-    if (*it == "xent") {
-      loss_vec_.push_back(new Xent(opts_));
-    } else if (*it == "mse") {
-      loss_vec_.push_back(new Mse(opts_));
-    } else {
-      KALDI_ERR << "Unknown objective function code : " << *it;
-    }
-    ++it;
-    // dim,
-    int32 dim;
-    if (!ConvertStringToInteger(*it, &dim)) {
-      KALDI_ERR << "Cannot convert 'dim' " << *it << " to integer!";
-    }
-    loss_dim_.push_back(dim);
-    ++it;
-    // weight,
-    BaseFloat weight;
-    if (!ConvertStringToReal(*it, &weight)) {
-      KALDI_ERR << "Cannot convert 'weight' " << *it << " to integer!";
-    }
-    KALDI_ASSERT(weight >= 0.0);
-    loss_weights_.push_back(weight);
-  }
-
-  // build vector with starting-point offsets,
-  loss_dim_offset_.resize(loss_dim_.size()+1, 0);  // 1st zero stays,
-  for (int32 i = 1; i <= loss_dim_.size(); i++) {
-    loss_dim_offset_[i] = loss_dim_offset_[i-1] + loss_dim_[i-1];
-  }
-
-  // sanity check,
-  KALDI_ASSERT(loss_vec_.size() > 0);
-  KALDI_ASSERT(loss_vec_.size() == loss_dim_.size());
-  KALDI_ASSERT(loss_vec_.size() == loss_weights_.size());
-}
-
-void MultiTaskLoss::Eval(const VectorBase<BaseFloat> &frame_weights,
-            const CuMatrixBase<BaseFloat>& net_out,
-            const Posterior& post,
-            CuMatrix<BaseFloat>* diff) {
-  int32 num_frames = net_out.NumRows(),
-    num_output = net_out.NumCols();
-  KALDI_ASSERT(num_frames == post.size());
-  KALDI_ASSERT(num_output == loss_dim_offset_.back());  // sum of loss-dims,
-
-  // convert posterior to matrix,
-  PosteriorToMatrix(post, num_output, &tgt_mat_);
-
-  // allocate diff matrix,
-  diff->Resize(num_frames, num_output);
-
-  /// One vector of frame_weights per loss-function,
-  /// The original frame weights are multiplied with
-  /// a mask of `defined targets' according to the 'Posterior'.
-  std::vector<Vector<BaseFloat> > frmwei_have_tgt;
-  for (int32 l = 0; l < loss_vec_.size(); l++) {
-    // copy original weights,
-    frmwei_have_tgt.push_back(Vector<BaseFloat>(frame_weights));
-    // We need to mask-out the frames for which the 'posterior' is not defined (= is empty):
-    int32 loss_beg = loss_dim_offset_[l];   // first column of loss target,
-    int32 loss_end = loss_dim_offset_[l+1]; // (last+1) column of loss target,
-    for (int32 f = 0; f < num_frames; f++) {
-      bool tgt_defined = false;
-      for (int32 p = 0; p < post[f].size(); p++) {
-        if (post[f][p].first >= loss_beg && post[f][p].first < loss_end) {
-          tgt_defined = true;
-          break;
-        }
-      }
-      if (!tgt_defined) {
-        frmwei_have_tgt[l](f) = 0.0; // set zero_weight for the frame with no targets!
-      }
-    }
-  }
-
-  // call the vector of loss functions,
-  CuMatrix<BaseFloat> diff_aux;
-  for (int32 l = 0; l < loss_vec_.size(); l++) {
-    loss_vec_[l]->Eval(frmwei_have_tgt[l],
-      net_out.ColRange(loss_dim_offset_[l], loss_dim_[l]),
-      tgt_mat_.ColRange(loss_dim_offset_[l], loss_dim_[l]),
-      &diff_aux);
-    // Scale the gradients,
-    diff_aux.Scale(loss_weights_[l]);
-    // Copy to diff,
-    diff->ColRange(loss_dim_offset_[l], loss_dim_[l]).CopyFromMat(diff_aux);
-  }
-}
-
-std::string MultiTaskLoss::Report() {
-  // calculate overall loss (weighted),
-  BaseFloat overall_loss = AvgLoss();
-  // copy the loss-values into a vector,
-  std::vector<BaseFloat> loss_values;
-  for (int32 i = 0; i < loss_vec_.size(); i++) {
-    loss_values.push_back(loss_vec_[i]->AvgLoss());
-  }
-
-  // build the message,
-  std::ostringstream oss;
-  oss << "MultiTaskLoss, with " << loss_vec_.size()
-      << " parallel loss functions." << std::endl;
-  // individual loss reports first,
-  for (int32 i = 0; i < loss_vec_.size(); i++) {
-    oss << "Loss " << i+1 << ", " << loss_vec_[i]->Report() << std::endl;
-  }
-
-  // overall loss is last,
-  oss << "Loss (OVERALL), "
-      << "AvgLoss: " << overall_loss << " (MultiTaskLoss), "
-      << "weights " << loss_weights_ << ", "
-      << "values " << loss_values << std::endl;
-
-  return oss.str();
-}
-
-BaseFloat MultiTaskLoss::AvgLoss() {
-  BaseFloat ans(0.0);
-  for (int32 i = 0; i < loss_vec_.size(); i++) {
-    BaseFloat val = loss_weights_[i] * loss_vec_[i]->AvgLoss();
-    if (!KALDI_ISFINITE(val)) {
-      KALDI_WARN << "Loss " << i+1 << ", has bad objective function value '"
-                 << val << "', using 0.0 instead.";
-      val = 0.0;
-    }
-    ans += val;
-  }
-  return ans;
-}
-
-}  // namespace nnet1
-}  // namespace kaldi
diff --git a/src/nnet/nnet-loss.h b/src/nnet/nnet-loss.h
deleted file mode 100644
index 9e54733d63f..00000000000
--- a/src/nnet/nnet-loss.h
+++ /dev/null
@@ -1,251 +0,0 @@
-// nnet/nnet-loss.h
-
-// Copyright 2011-2015  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET_NNET_LOSS_H_
-#define KALDI_NNET_NNET_LOSS_H_
-
-#include <string>
-#include <vector>
-
-#include "base/kaldi-common.h"
-#include "base/timer.h"
-#include "util/kaldi-holder.h"
-#include "itf/options-itf.h"
-#include "cudamatrix/cu-matrix.h"
-#include "cudamatrix/cu-vector.h"
-#include "cudamatrix/cu-array.h"
-#include "hmm/posterior.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-struct LossOptions {
-  int32 loss_report_frames; ///< Report loss value every 'report_interval' frames,
-
-  LossOptions():
-    loss_report_frames(5*3600*100) // 5h,
-  { }
-
-  void Register(OptionsItf *opts) {
-    opts->Register("loss-report-frames", &loss_report_frames,
-        "Report loss per blocks of N frames (0 = no reports)");
-  }
-};
-
-class LossItf {
- public:
-  LossItf(LossOptions& opts) {
-    opts_ = opts;
-  }
-  virtual ~LossItf() { }
-
-  /// Evaluate cross entropy using target-matrix (supports soft labels),
-  virtual void Eval(const VectorBase<BaseFloat> &frame_weights,
-            const CuMatrixBase<BaseFloat> &net_out,
-            const CuMatrixBase<BaseFloat> &target,
-            CuMatrix<BaseFloat> *diff) = 0;
-
-  /// Evaluate cross entropy using target-posteriors (supports soft labels),
-  virtual void Eval(const VectorBase<BaseFloat> &frame_weights,
-            const CuMatrixBase<BaseFloat> &net_out,
-            const Posterior &target,
-            CuMatrix<BaseFloat> *diff) = 0;
-
-  /// Generate string with error report,
-  virtual std::string Report() = 0;
-
-  /// Get loss value (frame average),
-  virtual BaseFloat AvgLoss() = 0;
-
- protected:
-  LossOptions opts_;
-  Timer timer_;
-};
-
-
-class Xent : public LossItf {
- public:
-  Xent(LossOptions &opts):
-    LossItf(opts),
-    frames_progress_(0.0),
-    xentropy_progress_(0.0),
-    entropy_progress_(0.0),
-    elapsed_seconds_(0.0)
-  { }
-
-  ~Xent()
-  { }
-
-  /// Evaluate cross entropy using target-matrix (supports soft labels),
-  void Eval(const VectorBase<BaseFloat> &frame_weights,
-            const CuMatrixBase<BaseFloat> &net_out,
-            const CuMatrixBase<BaseFloat> &target,
-            CuMatrix<BaseFloat> *diff);
-
-  /// Evaluate cross entropy using target-posteriors (supports soft labels),
-  void Eval(const VectorBase<BaseFloat> &frame_weights,
-            const CuMatrixBase<BaseFloat> &net_out,
-            const Posterior &target,
-            CuMatrix<BaseFloat> *diff);
-
-  /// Generate string with error report,
-  std::string Report();
-
-  /// Generate string with per-class error report,
-  std::string ReportPerClass();
-
-  /// Get loss value (frame average),
-  BaseFloat AvgLoss() {
-    if (frames_.Sum() == 0) return 0.0;
-    return (xentropy_.Sum() - entropy_.Sum()) / frames_.Sum();
-  }
-
- private:
-  // main stats collected per target-class,
-  CuVector<double> frames_;
-  Vector<double> correct_;
-  CuVector<double> xentropy_;
-  CuVector<double> entropy_;
-
-  // partial results during training,
-  double frames_progress_;
-  double xentropy_progress_;
-  double entropy_progress_;
-  std::vector<float> loss_vec_;
-  double elapsed_seconds_;
-
-  // weigting buffer,
-  CuVector<BaseFloat> frame_weights_;
-  CuVector<BaseFloat> target_sum_;
-
-  // loss computation buffers,
-  CuMatrix<BaseFloat> tgt_mat_;
-  CuMatrix<BaseFloat> frames_aux_;
-  CuMatrix<BaseFloat> xentropy_aux_;
-  CuMatrix<BaseFloat> entropy_aux_;
-
-  // frame classification buffers,
-  CuArray<int32> max_id_out_;
-  CuArray<int32> max_id_tgt_;
-};
-
-
-class Mse : public LossItf {
- public:
-  Mse(LossOptions &opts):
-    LossItf(opts),
-    frames_(0.0),
-    loss_(0.0),
-    frames_progress_(0.0),
-    loss_progress_(0.0)
-  { }
-
-  ~Mse()
-  { }
-
-  /// Evaluate mean square error using target-matrix,
-  void Eval(const VectorBase<BaseFloat> &frame_weights,
-            const CuMatrixBase<BaseFloat>& net_out,
-            const CuMatrixBase<BaseFloat>& target,
-            CuMatrix<BaseFloat>* diff);
-
-  /// Evaluate mean square error using target-posteior,
-  void Eval(const VectorBase<BaseFloat> &frame_weights,
-            const CuMatrixBase<BaseFloat>& net_out,
-            const Posterior& target,
-            CuMatrix<BaseFloat>* diff);
-
-  /// Generate string with error report
-  std::string Report();
-
-  /// Get loss value (frame average),
-  BaseFloat AvgLoss() {
-    if (frames_ == 0) return 0.0;
-    return loss_ / frames_;
-  }
-
- private:
-  double frames_;
-  double loss_;
-
-  double frames_progress_;
-  double loss_progress_;
-  std::vector<float> loss_vec_;
-
-  CuVector<BaseFloat> frame_weights_;
-  CuMatrix<BaseFloat> tgt_mat_;
-  CuMatrix<BaseFloat> diff_pow_2_;
-};
-
-
-class MultiTaskLoss : public LossItf {
- public:
-  MultiTaskLoss(LossOptions &opts):
-    LossItf(opts)
-  { }
-
-  ~MultiTaskLoss() {
-    while (loss_vec_.size() > 0) {
-      delete loss_vec_.back();
-      loss_vec_.pop_back();
-    }
-  }
-
-  /// Initialize from string, the format for string 's' is :
-  /// 'multitask,<type1>,<dim1>,<weight1>,...,<typeN>,<dimN>,<weightN>'
-  ///
-  /// Practically it can look like this :
-  /// 'multitask,xent,2456,1.0,mse,440,0.001'
-  void InitFromString(const std::string& s);
-
-  /// Evaluate mean square error using target-matrix,
-  void Eval(const VectorBase<BaseFloat> &frame_weights,
-            const CuMatrixBase<BaseFloat>& net_out,
-            const CuMatrixBase<BaseFloat>& target,
-            CuMatrix<BaseFloat>* diff) {
-    KALDI_ERR << "This is not supposed to be called!";
-  }
-
-  /// Evaluate mean square error using target-posteior,
-  void Eval(const VectorBase<BaseFloat> &frame_weights,
-            const CuMatrixBase<BaseFloat>& net_out,
-            const Posterior& target,
-            CuMatrix<BaseFloat>* diff);
-
-  /// Generate string with error report
-  std::string Report();
-
-  /// Get loss value (frame average),
-  BaseFloat AvgLoss();
-
- private:
-  std::vector<LossItf*>  loss_vec_;
-  std::vector<int32>     loss_dim_;
-  std::vector<BaseFloat> loss_weights_;
-
-  std::vector<int32>     loss_dim_offset_;
-
-  CuMatrix<BaseFloat>    tgt_mat_;
-};
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_LOSS_H_
-
diff --git a/src/nnet/nnet-lstm-projected.h b/src/nnet/nnet-lstm-projected.h
deleted file mode 100644
index eaf1062794f..00000000000
--- a/src/nnet/nnet-lstm-projected.h
+++ /dev/null
@@ -1,737 +0,0 @@
-// nnet/nnet-lstm-projected-streams.h
-
-// Copyright 2015-2016  Brno University of Technology (author: Karel Vesely)
-// Copyright 2014  Jiayu DU (Jerry), Wei Li
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_LSTM_PROJECTED_H_
-#define KALDI_NNET_NNET_LSTM_PROJECTED_H_
-
-#include <string>
-#include <vector>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-utils.h"
-#include "cudamatrix/cu-math.h"
-
-/*************************************
- * x: input neuron
- * g: squashing neuron near input
- * i: Input gate
- * f: Forget gate
- * o: Output gate
- * c: memory Cell (CEC)
- * h: squashing neuron near output
- * m: output neuron of Memory block
- * r: recurrent projection neuron
- * y: output neuron of LSTMP
- *************************************/
-
-namespace kaldi {
-namespace nnet1 {
-
-class LstmProjected : public MultistreamComponent {
- public:
-  LstmProjected(int32 input_dim, int32 output_dim):
-    MultistreamComponent(input_dim, output_dim),
-    cell_dim_(0),
-    proj_dim_(output_dim),
-    cell_clip_(50.0),
-    diff_clip_(1.0),
-    cell_diff_clip_(0.0),
-    grad_clip_(250.0)
-  { }
-
-  ~LstmProjected()
-  { }
-
-  Component* Copy() const { return new LstmProjected(*this); }
-  ComponentType GetType() const { return kLstmProjected; }
-
-  void InitData(std::istream &is) {
-    // define options,
-    float param_range = 0.1;
-    // parse the line from prototype,
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<ParamRange>") ReadBasicType(is, false, &param_range);
-      else if (token == "<CellDim>") ReadBasicType(is, false, &cell_dim_);
-      else if (token == "<LearnRateCoef>") ReadBasicType(is, false, &learn_rate_coef_);
-      else if (token == "<BiasLearnRateCoef>") ReadBasicType(is, false, &bias_learn_rate_coef_);
-      else if (token == "<CellClip>") ReadBasicType(is, false, &cell_clip_);
-      else if (token == "<DiffClip>") ReadBasicType(is, false, &diff_clip_);
-      else if (token == "<CellDiffClip>") ReadBasicType(is, false, &cell_diff_clip_);
-      else if (token == "<GradClip>") ReadBasicType(is, false, &grad_clip_);
-      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-                     << " (ParamRange|CellDim|LearnRateCoef|BiasLearnRateCoef|CellClip|DiffClip|GradClip)";
-    }
-
-    // init the weights and biases (from uniform dist.),
-    w_gifo_x_.Resize(4*cell_dim_, input_dim_, kUndefined);
-    w_gifo_r_.Resize(4*cell_dim_, proj_dim_, kUndefined);
-    bias_.Resize(4*cell_dim_, kUndefined);
-    peephole_i_c_.Resize(cell_dim_, kUndefined);
-    peephole_f_c_.Resize(cell_dim_, kUndefined);
-    peephole_o_c_.Resize(cell_dim_, kUndefined);
-    w_r_m_.Resize(proj_dim_, cell_dim_, kUndefined);
-    //       (mean), (range)
-    RandUniform(0.0, 2.0 * param_range, &w_gifo_x_);
-    RandUniform(0.0, 2.0 * param_range, &w_gifo_r_);
-    RandUniform(0.0, 2.0 * param_range, &bias_);
-    RandUniform(0.0, 2.0 * param_range, &peephole_i_c_);
-    RandUniform(0.0, 2.0 * param_range, &peephole_f_c_);
-    RandUniform(0.0, 2.0 * param_range, &peephole_o_c_);
-    RandUniform(0.0, 2.0 * param_range, &w_r_m_);
-
-    KALDI_ASSERT(cell_dim_ > 0);
-    KALDI_ASSERT(learn_rate_coef_ >= 0.0);
-    KALDI_ASSERT(bias_learn_rate_coef_ >= 0.0);
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    // Read all the '<Tokens>' in arbitrary order,
-    while ('<' == Peek(is, binary)) {
-      std::string token;
-      int first_char = PeekToken(is, binary);
-      switch (first_char) {
-        case 'C': ReadToken(is, false, &token);
-          /**/ if (token == "<CellDim>") ReadBasicType(is, binary, &cell_dim_);
-          else if (token == "<CellClip>") ReadBasicType(is, binary, &cell_clip_);
-          else if (token == "<CellDiffClip>") ReadBasicType(is, binary, &cell_diff_clip_);
-          else if (token == "<ClipGradient>") ReadBasicType(is, binary, &grad_clip_); // bwd-compat.
-          else KALDI_ERR << "Unknown token: " << token;
-          break;
-        case 'L': ExpectToken(is, binary, "<LearnRateCoef>");
-          ReadBasicType(is, binary, &learn_rate_coef_);
-          break;
-        case 'B': ExpectToken(is, binary, "<BiasLearnRateCoef>");
-          ReadBasicType(is, binary, &bias_learn_rate_coef_);
-          break;
-        case 'D': ExpectToken(is, binary, "<DiffClip>");
-          ReadBasicType(is, binary, &diff_clip_);
-          break;
-        case 'G': ExpectToken(is, binary, "<GradClip>");
-          ReadBasicType(is, binary, &grad_clip_);
-          break;
-        default: ReadToken(is, false, &token);
-          KALDI_ERR << "Unknown token: " << token;
-      }
-    }
-    KALDI_ASSERT(cell_dim_ != 0);
-
-    // Read the model parameters,
-    w_gifo_x_.Read(is, binary);
-    w_gifo_r_.Read(is, binary);
-    bias_.Read(is, binary);
-
-    peephole_i_c_.Read(is, binary);
-    peephole_f_c_.Read(is, binary);
-    peephole_o_c_.Read(is, binary);
-
-    w_r_m_.Read(is, binary);
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    WriteToken(os, binary, "<CellDim>");
-    WriteBasicType(os, binary, cell_dim_);
-
-    WriteToken(os, binary, "<LearnRateCoef>");
-    WriteBasicType(os, binary, learn_rate_coef_);
-    WriteToken(os, binary, "<BiasLearnRateCoef>");
-    WriteBasicType(os, binary, bias_learn_rate_coef_);
-
-    WriteToken(os, binary, "<CellClip>");
-    WriteBasicType(os, binary, cell_clip_);
-    WriteToken(os, binary, "<DiffClip>");
-    WriteBasicType(os, binary, diff_clip_);
-    WriteToken(os, binary, "<CellDiffClip>");
-    WriteBasicType(os, binary, cell_diff_clip_);
-    WriteToken(os, binary, "<GradClip>");
-    WriteBasicType(os, binary, grad_clip_);
-
-    // write model parameters,
-    if (!binary) os << "\n";
-    w_gifo_x_.Write(os, binary);
-    w_gifo_r_.Write(os, binary);
-    bias_.Write(os, binary);
-
-    peephole_i_c_.Write(os, binary);
-    peephole_f_c_.Write(os, binary);
-    peephole_o_c_.Write(os, binary);
-
-    w_r_m_.Write(os, binary);
-  }
-
-  int32 NumParams() const {
-    return ( w_gifo_x_.NumRows() * w_gifo_x_.NumCols() +
-         w_gifo_r_.NumRows() * w_gifo_r_.NumCols() +
-         bias_.Dim() +
-         peephole_i_c_.Dim() +
-         peephole_f_c_.Dim() +
-         peephole_o_c_.Dim() +
-         w_r_m_.NumRows() * w_r_m_.NumCols() );
-  }
-
-  void GetGradient(VectorBase<BaseFloat>* gradient) const {
-    KALDI_ASSERT(gradient->Dim() == NumParams());
-    int32 offset, len;
-
-    offset = 0;    len = w_gifo_x_.NumRows() * w_gifo_x_.NumCols();
-    gradient->Range(offset, len).CopyRowsFromMat(w_gifo_x_corr_);
-
-    offset += len; len = w_gifo_r_.NumRows() * w_gifo_r_.NumCols();
-    gradient->Range(offset, len).CopyRowsFromMat(w_gifo_r_corr_);
-
-    offset += len; len = bias_.Dim();
-    gradient->Range(offset, len).CopyFromVec(bias_corr_);
-
-    offset += len; len = peephole_i_c_.Dim();
-    gradient->Range(offset, len).CopyFromVec(peephole_i_c_corr_);
-
-    offset += len; len = peephole_f_c_.Dim();
-    gradient->Range(offset, len).CopyFromVec(peephole_f_c_corr_);
-
-    offset += len; len = peephole_o_c_.Dim();
-    gradient->Range(offset, len).CopyFromVec(peephole_o_c_corr_);
-
-    offset += len; len = w_r_m_.NumRows() * w_r_m_.NumCols();
-    gradient->Range(offset, len).CopyRowsFromMat(w_r_m_corr_);
-
-    offset += len;
-    KALDI_ASSERT(offset == NumParams());
-  }
-
-  void GetParams(VectorBase<BaseFloat>* params) const {
-    KALDI_ASSERT(params->Dim() == NumParams());
-    int32 offset, len;
-
-    offset = 0;    len = w_gifo_x_.NumRows() * w_gifo_x_.NumCols();
-    params->Range(offset, len).CopyRowsFromMat(w_gifo_x_);
-
-    offset += len; len = w_gifo_r_.NumRows() * w_gifo_r_.NumCols();
-    params->Range(offset, len).CopyRowsFromMat(w_gifo_r_);
-
-    offset += len; len = bias_.Dim();
-    params->Range(offset, len).CopyFromVec(bias_);
-
-    offset += len; len = peephole_i_c_.Dim();
-    params->Range(offset, len).CopyFromVec(peephole_i_c_);
-
-    offset += len; len = peephole_f_c_.Dim();
-    params->Range(offset, len).CopyFromVec(peephole_f_c_);
-
-    offset += len; len = peephole_o_c_.Dim();
-    params->Range(offset, len).CopyFromVec(peephole_o_c_);
-
-    offset += len; len = w_r_m_.NumRows() * w_r_m_.NumCols();
-    params->Range(offset, len).CopyRowsFromMat(w_r_m_);
-
-    offset += len;
-    KALDI_ASSERT(offset == NumParams());
-  }
-
-  void SetParams(const VectorBase<BaseFloat>& params) {
-    KALDI_ASSERT(params.Dim() == NumParams());
-    int32 offset, len;
-
-    offset = 0;    len = w_gifo_x_.NumRows() * w_gifo_x_.NumCols();
-    w_gifo_x_.CopyRowsFromVec(params.Range(offset, len));
-
-    offset += len; len = w_gifo_r_.NumRows() * w_gifo_r_.NumCols();
-    w_gifo_r_.CopyRowsFromVec(params.Range(offset, len));
-
-    offset += len; len = bias_.Dim();
-    bias_.CopyFromVec(params.Range(offset, len));
-
-    offset += len; len = peephole_i_c_.Dim();
-    peephole_i_c_.CopyFromVec(params.Range(offset, len));
-
-    offset += len; len = peephole_f_c_.Dim();
-    peephole_f_c_.CopyFromVec(params.Range(offset, len));
-
-    offset += len; len = peephole_o_c_.Dim();
-    peephole_o_c_.CopyFromVec(params.Range(offset, len));
-
-    offset += len; len = w_r_m_.NumRows() * w_r_m_.NumCols();
-    w_r_m_.CopyRowsFromVec(params.Range(offset, len));
-
-    offset += len;
-    KALDI_ASSERT(offset == NumParams());
-  }
-
-  std::string Info() const {
-    return std::string("cell-dim ") + ToString(cell_dim_) + " " +
-      "( learn_rate_coef_ " + ToString(learn_rate_coef_) +
-      ", bias_learn_rate_coef_ " + ToString(bias_learn_rate_coef_) +
-      ", cell_clip_ " + ToString(cell_clip_) +
-      ", diff_clip_ " + ToString(diff_clip_) +
-      ", grad_clip_ " + ToString(grad_clip_) + " )" +
-      "\n  w_gifo_x_  "   + MomentStatistics(w_gifo_x_) +
-      "\n  w_gifo_r_  "   + MomentStatistics(w_gifo_r_) +
-      "\n  bias_  "     + MomentStatistics(bias_) +
-      "\n  peephole_i_c_  " + MomentStatistics(peephole_i_c_) +
-      "\n  peephole_f_c_  " + MomentStatistics(peephole_f_c_) +
-      "\n  peephole_o_c_  " + MomentStatistics(peephole_o_c_) +
-      "\n  w_r_m_  "    + MomentStatistics(w_r_m_);
-  }
-
-  std::string InfoGradient() const {
-    // disassemble forward-propagation buffer into different neurons,
-    const CuSubMatrix<BaseFloat> YG(propagate_buf_.ColRange(0*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YI(propagate_buf_.ColRange(1*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YF(propagate_buf_.ColRange(2*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YO(propagate_buf_.ColRange(3*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YC(propagate_buf_.ColRange(4*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YH(propagate_buf_.ColRange(5*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YM(propagate_buf_.ColRange(6*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> YR(propagate_buf_.ColRange(7*cell_dim_, proj_dim_));
-
-    // disassemble backpropagate buffer into different neurons,
-    const CuSubMatrix<BaseFloat> DG(backpropagate_buf_.ColRange(0*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DI(backpropagate_buf_.ColRange(1*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DF(backpropagate_buf_.ColRange(2*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DO(backpropagate_buf_.ColRange(3*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DC(backpropagate_buf_.ColRange(4*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DH(backpropagate_buf_.ColRange(5*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DM(backpropagate_buf_.ColRange(6*cell_dim_, cell_dim_));
-    const CuSubMatrix<BaseFloat> DR(backpropagate_buf_.ColRange(7*cell_dim_, proj_dim_));
-
-    return std::string("") +
-      "( learn_rate_coef_ " + ToString(learn_rate_coef_) +
-      ", bias_learn_rate_coef_ " + ToString(bias_learn_rate_coef_) +
-      ", cell_clip_ " + ToString(cell_clip_) +
-      ", diff_clip_ " + ToString(diff_clip_) +
-      ", grad_clip_ " + ToString(grad_clip_) + " )" +
-      "\n  ### Gradients " +
-      "\n  w_gifo_x_corr_  "   + MomentStatistics(w_gifo_x_corr_) +
-      "\n  w_gifo_r_corr_  "   + MomentStatistics(w_gifo_r_corr_) +
-      "\n  bias_corr_  "     + MomentStatistics(bias_corr_) +
-      "\n  peephole_i_c_corr_  " + MomentStatistics(peephole_i_c_corr_) +
-      "\n  peephole_f_c_corr_  " + MomentStatistics(peephole_f_c_corr_) +
-      "\n  peephole_o_c_corr_  " + MomentStatistics(peephole_o_c_corr_) +
-      "\n  w_r_m_corr_  "    + MomentStatistics(w_r_m_corr_) +
-      "\n  ### Activations (mostly after non-linearities)" +
-      "\n  YI(0..1)^  " + MomentStatistics(YI) +
-      "\n  YF(0..1)^  " + MomentStatistics(YF) +
-      "\n  YO(0..1)^  " + MomentStatistics(YO) +
-      "\n  YG(-1..1)  " + MomentStatistics(YG) +
-      "\n  YC(-R..R)* " + MomentStatistics(YC) +
-      "\n  YH(-1..1)  " + MomentStatistics(YH) +
-      "\n  YM(-1..1)  " + MomentStatistics(YM) +
-      "\n  YR(-R..R)  " + MomentStatistics(YR) +
-      "\n  ### Derivatives (w.r.t. inputs of non-linearities)" +
-      "\n  DI^ " + MomentStatistics(DI) +
-      "\n  DF^ " + MomentStatistics(DF) +
-      "\n  DO^ " + MomentStatistics(DO) +
-      "\n  DG  " + MomentStatistics(DG) +
-      "\n  DC* " + MomentStatistics(DC) +
-      "\n  DH  " + MomentStatistics(DH) +
-      "\n  DM  " + MomentStatistics(DM) +
-      "\n  DR  " + MomentStatistics(DR);
-  }
-
-  /**
-   * TODO: Do we really need this?
-   */
-  void ResetStreams(const std::vector<int32>& stream_reset_flag) {
-    KALDI_ASSERT(NumStreams() == stream_reset_flag.size());
-    if (prev_nnet_state_.NumRows() != stream_reset_flag.size()) {
-      prev_nnet_state_.Resize(NumStreams(), 7*cell_dim_ + 1*proj_dim_, kSetZero);
-    } else {
-      for (int s = 0; s < NumStreams(); s++) {
-        if (stream_reset_flag[s] == 1) {
-          prev_nnet_state_.Row(s).SetZero();
-        }
-      }
-    }
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-
-    // reset context on each sentence if 'sequence_lengths_' not set
-    // (happens in 'nnet-forward' or 'single-stream' training),
-    if (sequence_lengths_.size() == 0) {
-      ResetStreams(std::vector<int32>(1, 1));
-    }
-
-    KALDI_ASSERT(in.NumRows() % NumStreams() == 0);
-    int32 T = in.NumRows() / NumStreams();
-    int32 S = NumStreams();
-
-    // buffers,
-    propagate_buf_.Resize((T+2)*S, 7 * cell_dim_ + proj_dim_, kSetZero);
-    if (prev_nnet_state_.NumRows() != NumStreams()) {
-      prev_nnet_state_.Resize(NumStreams(), 7*cell_dim_ + 1*proj_dim_, kSetZero); // lazy init,
-    } else {
-      propagate_buf_.RowRange(0, S).CopyFromMat(prev_nnet_state_); // use the 'previous-state',
-    }
-
-    // split activations by neuron types,
-    CuSubMatrix<BaseFloat> YG(propagate_buf_.ColRange(0*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> YI(propagate_buf_.ColRange(1*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> YF(propagate_buf_.ColRange(2*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> YO(propagate_buf_.ColRange(3*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> YC(propagate_buf_.ColRange(4*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> YH(propagate_buf_.ColRange(5*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> YM(propagate_buf_.ColRange(6*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> YR(propagate_buf_.ColRange(7*cell_dim_, proj_dim_));
-    CuSubMatrix<BaseFloat> YGIFO(propagate_buf_.ColRange(0, 4*cell_dim_));
-
-    // x -> g, i, f, o, not recurrent, do it all in once
-    YGIFO.RowRange(1*S, T*S).AddMatMat(1.0, in, kNoTrans, w_gifo_x_, kTrans, 0.0);
-
-    // bias -> g, i, f, o
-    YGIFO.RowRange(1*S, T*S).AddVecToRows(1.0, bias_);
-
-    // BufferPadding [T0]:dummy, [1, T]:current sequence, [T+1]:dummy
-    for (int t = 1; t <= T; t++) {
-      // multistream buffers for current time-step,
-      CuSubMatrix<BaseFloat> y_all(propagate_buf_.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_g(YG.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_i(YI.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_f(YF.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_o(YO.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_c(YC.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_h(YH.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_m(YM.RowRange(t*S, S));
-       CuSubMatrix<BaseFloat> y_r(YR.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_gifo(YGIFO.RowRange(t*S, S));
-
-      // r(t-1) -> g, i, f, o
-      y_gifo.AddMatMat(1.0, YR.RowRange((t-1)*S, S), kNoTrans, w_gifo_r_, kTrans,  1.0);
-
-      // c(t-1) -> i(t) via peephole
-      y_i.AddMatDiagVec(1.0, YC.RowRange((t-1)*S, S), kNoTrans, peephole_i_c_, 1.0);
-
-      // c(t-1) -> f(t) via peephole
-      y_f.AddMatDiagVec(1.0, YC.RowRange((t-1)*S, S), kNoTrans, peephole_f_c_, 1.0);
-
-      // i, f sigmoid squashing
-      y_i.Sigmoid(y_i);
-      y_f.Sigmoid(y_f);
-
-      // g tanh squashing
-      y_g.Tanh(y_g);
-
-      // g * i -> c
-      y_c.AddMatMatElements(1.0, y_g, y_i, 0.0);
-      // c(t-1) * f -> c(t) via forget-gate
-      y_c.AddMatMatElements(1.0, YC.RowRange((t-1)*S, S), y_f, 1.0);
-
-      if (cell_clip_ > 0.0) {
-        y_c.ApplyFloor(-cell_clip_);   // optional clipping of cell activation,
-        y_c.ApplyCeiling(cell_clip_);  // google paper Interspeech2014: LSTM for LVCSR
-      }
-
-      // c(t) -> o(t) via peephole (non-recurrent, using c(t))
-      y_o.AddMatDiagVec(1.0, y_c, kNoTrans, peephole_o_c_, 1.0);
-
-      // o sigmoid squashing,
-      y_o.Sigmoid(y_o);
-
-      // h tanh squashing,
-      y_h.Tanh(y_c);
-
-      // h * o -> m via output gate,
-      y_m.AddMatMatElements(1.0, y_h, y_o, 0.0);
-
-      // m -> r
-      y_r.AddMatMat(1.0, y_m, kNoTrans, w_r_m_, kTrans, 0.0);
-
-      // set zeros to padded frames,
-      if (sequence_lengths_.size() > 0) {
-        for (int s = 0; s < S; s++) {
-          if (t > sequence_lengths_[s]) {
-            y_all.Row(s).SetZero();
-          }
-        }
-      }
-    }
-
-    // set the 'projection layer' output as the LSTM output,
-    out->CopyFromMat(YR.RowRange(1*S, T*S));
-
-    // the state in the last 'frame' is transferred (can be zero vector)
-    prev_nnet_state_.CopyFromMat(propagate_buf_.RowRange(T*S, S));
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-
-    // the number of sequences to be processed in parallel
-    int32 T = in.NumRows() / NumStreams();
-    int32 S = NumStreams();
-
-    // buffer,
-    backpropagate_buf_.Resize((T+2)*S, 7 * cell_dim_ + proj_dim_, kSetZero);
-
-    // split activations by neuron types,
-    CuSubMatrix<BaseFloat> YG(propagate_buf_.ColRange(0*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> YI(propagate_buf_.ColRange(1*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> YF(propagate_buf_.ColRange(2*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> YO(propagate_buf_.ColRange(3*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> YC(propagate_buf_.ColRange(4*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> YH(propagate_buf_.ColRange(5*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> YM(propagate_buf_.ColRange(6*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> YR(propagate_buf_.ColRange(7*cell_dim_, proj_dim_));
-
-    // split derivatives by neuron types,
-    CuSubMatrix<BaseFloat> DG(backpropagate_buf_.ColRange(0*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> DI(backpropagate_buf_.ColRange(1*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> DF(backpropagate_buf_.ColRange(2*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> DO(backpropagate_buf_.ColRange(3*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> DC(backpropagate_buf_.ColRange(4*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> DH(backpropagate_buf_.ColRange(5*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> DM(backpropagate_buf_.ColRange(6*cell_dim_, cell_dim_));
-    CuSubMatrix<BaseFloat> DR(backpropagate_buf_.ColRange(7*cell_dim_, proj_dim_));
-    CuSubMatrix<BaseFloat> DGIFO(backpropagate_buf_.ColRange(0, 4*cell_dim_));
-
-    // pre-copy partial derivatives from the LSTM output,
-    DR.RowRange(1*S, T*S).CopyFromMat(out_diff);
-
-    // BufferPadding [T0]:dummy, [1,T]:current sequence, [T+1]: dummy,
-    for (int t = T; t >= 1; t--) {
-      CuSubMatrix<BaseFloat> y_g(YG.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_i(YI.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_f(YF.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_o(YO.RowRange(t*S, S));
-      // CuSubMatrix<BaseFloat> y_c(YC.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> y_h(YH.RowRange(t*S, S));
-      // CuSubMatrix<BaseFloat> y_m(YM.RowRange(t*S, S));
-      // CuSubMatrix<BaseFloat> y_r(YR.RowRange(t*S, S));
-
-      CuSubMatrix<BaseFloat> d_all(backpropagate_buf_.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_g(DG.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_i(DI.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_f(DF.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_o(DO.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_c(DC.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_h(DH.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_m(DM.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_r(DR.RowRange(t*S, S));
-      CuSubMatrix<BaseFloat> d_gifo(DGIFO.RowRange(t*S, S));
-
-      // r
-      //   Version 1 (precise gradients):
-      //   backprop error from g(t+1), i(t+1), f(t+1), o(t+1) to r(t)
-      d_r.AddMatMat(1.0, DGIFO.RowRange((t+1)*S, S), kNoTrans, w_gifo_r_, kNoTrans, 1.0);
-
-      /*
-      //   Version 2 (Alex Graves' PhD dissertation):
-      //   only backprop g(t+1) to r(t)
-      CuSubMatrix<BaseFloat> w_g_r_(w_gifo_r_.RowRange(0, cell_dim_));
-      d_r.AddMatMat(1.0, DG.RowRange((t+1)*S,S), kNoTrans, w_g_r_, kNoTrans, 1.0);
-      */
-
-      /*
-      //   Version 3 (Felix Gers' PhD dissertation):
-      //   truncate gradients of g(t+1), i(t+1), f(t+1), o(t+1) once they leak out memory block
-      //   CEC(with forget connection) is the only "error-bridge" through time
-      */
-
-      // r -> m
-      d_m.AddMatMat(1.0, d_r, kNoTrans, w_r_m_, kNoTrans, 0.0);
-
-      // m -> h via output gate
-      d_h.AddMatMatElements(1.0, d_m, y_o, 0.0);
-      d_h.DiffTanh(y_h, d_h);
-
-      // o
-      d_o.AddMatMatElements(1.0, d_m, y_h, 0.0);
-      d_o.DiffSigmoid(y_o, d_o);
-
-      // c
-      // 1. diff from h(t)
-      // 2. diff from c(t+1) (via forget-gate between CEC)
-      // 3. diff from i(t+1) (via peephole)
-      // 4. diff from f(t+1) (via peephole)
-      // 5. diff from o(t)   (via peephole, not recurrent)
-      d_c.AddMat(1.0, d_h);
-      d_c.AddMatMatElements(1.0, DC.RowRange((t+1)*S, S), YF.RowRange((t+1)*S,S), 1.0);
-      d_c.AddMatDiagVec(1.0, DI.RowRange((t+1)*S, S), kNoTrans, peephole_i_c_, 1.0);
-      d_c.AddMatDiagVec(1.0, DF.RowRange((t+1)*S, S), kNoTrans, peephole_f_c_, 1.0);
-      d_c.AddMatDiagVec(1.0, d_o                    , kNoTrans, peephole_o_c_, 1.0);
-      // optionally clip the cell_derivative,
-      if (cell_diff_clip_ > 0.0) {
-        d_c.ApplyFloor(-cell_diff_clip_);
-        d_c.ApplyCeiling(cell_diff_clip_);
-      }
-
-      // f
-      d_f.AddMatMatElements(1.0, d_c, YC.RowRange((t-1)*S,S), 0.0);
-      d_f.DiffSigmoid(y_f, d_f);
-
-      // i
-      d_i.AddMatMatElements(1.0, d_c, y_g, 0.0);
-      d_i.DiffSigmoid(y_i, d_i);
-
-      // c -> g via input gate
-      d_g.AddMatMatElements(1.0, d_c, y_i, 0.0);
-      d_g.DiffTanh(y_g, d_g);
-
-      // Clipping per-frame derivatives for the next `t'.
-      // Clipping applied to gates and input gate (as done in Google).
-      // [ICASSP2015, Sak, Learning acoustic frame labelling...],
-      //
-      // The path from 'out_diff' to 'd_c' via 'd_h' is unclipped,
-      // which is probably important for the 'Constant Error Carousel'
-      // to work well.
-      //
-      if (diff_clip_ > 0.0) {
-        d_gifo.ApplyFloor(-diff_clip_);
-        d_gifo.ApplyCeiling(diff_clip_);
-      }
-
-      // set zeros to padded frames,
-      if (sequence_lengths_.size() > 0) {
-        for (int s = 0; s < S; s++) {
-          if (t > sequence_lengths_[s]) {
-            d_all.Row(s).SetZero();
-          }
-        }
-      }
-    }
-
-    // g,i,f,o -> x, calculating input derivatives,
-    in_diff->AddMatMat(1.0, DGIFO.RowRange(1*S,T*S), kNoTrans, w_gifo_x_, kNoTrans, 0.0);
-
-    // lazy initialization of udpate buffers,
-    if (w_gifo_x_corr_.NumRows() == 0) {
-      w_gifo_x_corr_.Resize(4*cell_dim_, input_dim_, kSetZero);
-      w_gifo_r_corr_.Resize(4*cell_dim_, proj_dim_, kSetZero);
-      bias_corr_.Resize(4*cell_dim_, kSetZero);
-      peephole_i_c_corr_.Resize(cell_dim_, kSetZero);
-      peephole_f_c_corr_.Resize(cell_dim_, kSetZero);
-      peephole_o_c_corr_.Resize(cell_dim_, kSetZero);
-      w_r_m_corr_.Resize(proj_dim_, cell_dim_, kSetZero);
-    }
-
-    // calculate delta
-    const BaseFloat mmt = opts_.momentum;
-
-    // weight x -> g, i, f, o
-    w_gifo_x_corr_.AddMatMat(1.0, DGIFO.RowRange(1*S, T*S), kTrans,
-                                  in                      , kNoTrans, mmt);
-    // recurrent weight r -> g, i, f, o
-    w_gifo_r_corr_.AddMatMat(1.0, DGIFO.RowRange(1*S, T*S), kTrans,
-                                  YR.RowRange(0*S, T*S)   , kNoTrans, mmt);
-    // bias of g, i, f, o
-    bias_corr_.AddRowSumMat(1.0, DGIFO.RowRange(1*S, T*S), mmt);
-
-    // recurrent peephole c -> i
-    peephole_i_c_corr_.AddDiagMatMat(1.0, DI.RowRange(1*S, T*S), kTrans,
-                                          YC.RowRange(0*S, T*S), kNoTrans, mmt);
-    // recurrent peephole c -> f
-    peephole_f_c_corr_.AddDiagMatMat(1.0, DF.RowRange(1*S, T*S), kTrans,
-                                          YC.RowRange(0*S, T*S), kNoTrans, mmt);
-    // peephole c -> o
-    peephole_o_c_corr_.AddDiagMatMat(1.0, DO.RowRange(1*S, T*S), kTrans,
-                                          YC.RowRange(1*S, T*S), kNoTrans, mmt);
-
-    w_r_m_corr_.AddMatMat(1.0, DR.RowRange(1*S, T*S), kTrans,
-                               YM.RowRange(1*S, T*S), kNoTrans, mmt);
-  }
-
-  void Update(const CuMatrixBase<BaseFloat> &input,
-              const CuMatrixBase<BaseFloat> &diff) {
-
-    // apply the gradient clipping,
-    if (grad_clip_ > 0.0) {
-      w_gifo_x_corr_.ApplyFloor(-grad_clip_);
-      w_gifo_x_corr_.ApplyCeiling(grad_clip_);
-      w_gifo_r_corr_.ApplyFloor(-grad_clip_);
-      w_gifo_r_corr_.ApplyCeiling(grad_clip_);
-      bias_corr_.ApplyFloor(-grad_clip_);
-      bias_corr_.ApplyCeiling(grad_clip_);
-      w_r_m_corr_.ApplyFloor(-grad_clip_);
-      w_r_m_corr_.ApplyCeiling(grad_clip_);
-      peephole_i_c_corr_.ApplyFloor(-grad_clip_);
-      peephole_i_c_corr_.ApplyCeiling(grad_clip_);
-      peephole_f_c_corr_.ApplyFloor(-grad_clip_);
-      peephole_f_c_corr_.ApplyCeiling(grad_clip_);
-      peephole_o_c_corr_.ApplyFloor(-grad_clip_);
-      peephole_o_c_corr_.ApplyCeiling(grad_clip_);
-    }
-
-    const BaseFloat lr  = opts_.learn_rate;
-
-    w_gifo_x_.AddMat(-lr * learn_rate_coef_, w_gifo_x_corr_);
-    w_gifo_r_.AddMat(-lr * learn_rate_coef_, w_gifo_r_corr_);
-    bias_.AddVec(-lr * bias_learn_rate_coef_, bias_corr_, 1.0);
-
-    peephole_i_c_.AddVec(-lr * bias_learn_rate_coef_, peephole_i_c_corr_, 1.0);
-    peephole_f_c_.AddVec(-lr * bias_learn_rate_coef_, peephole_f_c_corr_, 1.0);
-    peephole_o_c_.AddVec(-lr * bias_learn_rate_coef_, peephole_o_c_corr_, 1.0);
-
-    w_r_m_.AddMat(-lr * learn_rate_coef_, w_r_m_corr_);
-  }
-
- private:
-  // dims
-  int32 cell_dim_;
-  int32 proj_dim_;  ///< recurrent projection layer dim
-
-  BaseFloat cell_clip_;  ///< Clipping of 'cell-values' in forward pass (per-frame),
-  BaseFloat diff_clip_;  ///< Clipping of 'derivatives' in backprop (per-frame),
-  BaseFloat cell_diff_clip_; ///< Clipping of 'cell-derivatives' accumulated over CEC (per-frame),
-  BaseFloat grad_clip_;  ///< Clipping of the updates,
-
-  // buffer for transfering state across batches,
-  CuMatrix<BaseFloat> prev_nnet_state_;
-
-  // feed-forward connections: from x to [g, i, f, o]
-  CuMatrix<BaseFloat> w_gifo_x_;
-  CuMatrix<BaseFloat> w_gifo_x_corr_;
-
-  // recurrent projection connections: from r to [g, i, f, o]
-  CuMatrix<BaseFloat> w_gifo_r_;
-  CuMatrix<BaseFloat> w_gifo_r_corr_;
-
-  // biases of [g, i, f, o]
-  CuVector<BaseFloat> bias_;
-  CuVector<BaseFloat> bias_corr_;
-
-  // peephole from c to i, f, g
-  // peephole connections are block-internal, so we use vector form
-  CuVector<BaseFloat> peephole_i_c_;
-  CuVector<BaseFloat> peephole_f_c_;
-  CuVector<BaseFloat> peephole_o_c_;
-
-  CuVector<BaseFloat> peephole_i_c_corr_;
-  CuVector<BaseFloat> peephole_f_c_corr_;
-  CuVector<BaseFloat> peephole_o_c_corr_;
-
-  // projection layer r: from m to r
-  CuMatrix<BaseFloat> w_r_m_;
-  CuMatrix<BaseFloat> w_r_m_corr_;
-
-  // propagate buffer: output of [g, i, f, o, c, h, m, r]
-  CuMatrix<BaseFloat> propagate_buf_;
-
-  // back-propagate buffer: diff-input of [g, i, f, o, c, h, m, r]
-  CuMatrix<BaseFloat> backpropagate_buf_;
-};  // class LstmProjected
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_LSTM_PROJECTED_H_
diff --git a/src/nnet/nnet-matrix-buffer.h b/src/nnet/nnet-matrix-buffer.h
deleted file mode 100644
index 1790aee7b2c..00000000000
--- a/src/nnet/nnet-matrix-buffer.h
+++ /dev/null
@@ -1,233 +0,0 @@
-// nnet/nnet-matrix-buffer.h
-
-// Copyright 2016  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_MATRIX_BUFFER_H_
-#define KALDI_NNET_NNET_MATRIX_BUFFER_H_
-
-#include <utility>
-#include <vector>
-#include <list>
-#include <string>
-
-#include "itf/options-itf.h"
-#include "util/common-utils.h"
-#include "matrix/kaldi-matrix.h"
-
-
-namespace kaldi {
-namespace nnet1 {
-
-struct MatrixBufferOptions {
-  int32 matrix_buffer_size;
-
-  MatrixBufferOptions():
-    matrix_buffer_size(3 * 1024)  // 3 x 1GB,
-  { }
-
-  void Register(OptionsItf *opts) {
-    opts->Register("matrix-buffer-size", &matrix_buffer_size,
-       "Capacity of buffer for feature matrices, in MB.");
-  }
-};
-
-
-/**
- * A buffer for caching (utterance-key, feature-matrix) pairs.
- * Typically, it reads 'matrix_buffer_size' megabytes of data,
- * and returns records with similar number of speech frames
- * through the standard Key(), Value(), Next(), Done() interface.
- *
- * The preferred length is reset by ResetLength().
- * The buffer gets refilled after having less
- * data than 50% of 'matrix_buffer_size'.
- */
-class MatrixBuffer {
- public:
-  MatrixBuffer():
-    reader_(NULL),
-    current_(NULL),
-    preferred_length_(0)
-  { }
-
-  ~MatrixBuffer()
-  { }
-
-  void Init(SequentialBaseFloatMatrixReader* reader,
-            MatrixBufferOptions opts = MatrixBufferOptions()) {
-    KALDI_ASSERT(SizeInBytes() == 0);
-    reader_ = reader;
-    opts_ = opts;
-
-    Read();
-  }
-
-  bool Done() {
-    return (reader_->Done() && NumPairs() <= 1);
-  }
-
-  void Next();
-
-  void ResetLength() {
-    preferred_length_ = 0;
-  }
-
-  std::string Key() {
-    return current_->first;
-  }
-  Matrix<BaseFloat> Value() {
-    return current_->second;
-  }
-
-  /// Total amount of features in the buffer (bytes),
-  size_t SizeInBytes() const;
-
-  /// Total amount of features in the buffer (Mega-bytes),
-  size_t SizeInMegaBytes() const;
-
-  /// Total number of (key,matrix) pairs in the buffer,
-  size_t NumPairs() const;
-
- private:
-
-  void Read();  ///< fills the buffer,
-  void DisposeValue();  ///< removes 'current_' from data structure,
-
-  SequentialBaseFloatMatrixReader* reader_;
-
-  typedef std::pair<std::string,Matrix<BaseFloat> > PairType;
-  typedef std::list<PairType> ListType;
-  typedef std::map<size_t, ListType> BufferType;
-  BufferType buffer_;  ///< Buffer indexed by 'NumRows()',
-
-  PairType* current_;  ///< The currently active (key,value) pair,
-
-  MatrixBufferOptions opts_;
-
-  size_t preferred_length_;
-};
-
-void MatrixBuffer::Next() {
-  KALDI_ASSERT(!buffer_.empty());
-
-  // remove old 'Value()' matrix,
-  DisposeValue();
-
-  // start re-filling,
-  if (SizeInMegaBytes() < 0.5 * opts_.matrix_buffer_size) {
-    Read();
-  }
-
-  KALDI_ASSERT(!buffer_.empty());
-
-  // randomly select 'length' present in the 'map',
-  // (weighted by total #frames in the bin),
-  if (preferred_length_ == 0) {
-    int32 longest = (--buffer_.end())->first;
-    // pre-fill the vector of 'keys',
-    std::vector<int32> keys;
-    BufferType::iterator it;
-    for (it = buffer_.begin(); it != buffer_.end(); ++it) {
-      int32 key = it->first; // i.e. NumRows() of matrices in the bin,
-      int32 frames_in_bin = it->second.size() * key;
-      for (int32 i = 0; i < frames_in_bin; i += longest) {
-        keys.push_back(key); // keys are repeated,
-      }
-    }
-    // choose the key,
-    std::vector<int32>::iterator it2 = keys.begin();
-    std::advance(it2, rand() % keys.size());
-    preferred_length_ = (*it2);  // NumRows(), key of the 'map',
-  }
-
-  // select list by 'preferred_length_',
-  BufferType::iterator it = buffer_.lower_bound(preferred_length_);
-  if (it == buffer_.end()) { --it; } // or the last one,
-
-  // take a front element 'ptr' from that list,
-  current_ = &(it->second.front());
-}
-
-size_t MatrixBuffer::SizeInBytes() const {
-  size_t ans = 0;
-  for (BufferType::const_iterator it = buffer_.begin(); it != buffer_.end(); ++it) {
-    for (ListType::const_iterator it2 = it->second.begin(); it2 != it->second.end(); ++it2) {
-      ans += it2->second.SizeInBytes();
-    }
-  }
-  return ans;
-}
-
-size_t MatrixBuffer::SizeInMegaBytes() const {
-  return (SizeInBytes() / (1024 * 1024));
-}
-
-size_t MatrixBuffer::NumPairs() const {
-  size_t ans = 0;
-  for (BufferType::const_iterator it = buffer_.begin(); it != buffer_.end(); ++it) {
-    ans += it->second.size();
-  }
-  return ans;
-}
-
-void MatrixBuffer::Read() {
-  if (!reader_->Done())
-    KALDI_LOG << "Read() started... Buffer size in MB: "
-              << SizeInMegaBytes() << ", max " << opts_.matrix_buffer_size
-              << ", having " << NumPairs() << " utterances.";
-  for ( ; !reader_->Done(); reader_->Next()) {
-    // see if we are full,
-    if (SizeInMegaBytes() > opts_.matrix_buffer_size) {
-      KALDI_LOG << "Read() finished... Buffer size in MB: "
-                << SizeInMegaBytes() << ", max " << opts_.matrix_buffer_size
-                << ", having " << NumPairs() << " utterances.";
-      break;
-    }
-    // get matrix,
-    const std::string& key = reader_->Key();
-    const Matrix<BaseFloat>& mat = reader_->Value();
-    size_t num_rows = mat.NumRows();
-    // see if 'num_rows' already in keys,
-    if (buffer_.find(num_rows) == buffer_.end()) {
-      buffer_[num_rows] = ListType();  // add empty list,
-    }
-    // add matrix to the buffer,
-    buffer_[num_rows].push_back(PairType(key, mat));
-  }
-}
-
-void MatrixBuffer::DisposeValue() {
-  // remove old 'Value()' matrix,
-  if (current_ != NULL) {
-    size_t r = current_->second.NumRows();
-    KALDI_ASSERT(current_ == &(buffer_[r].front()));
-    // remove the (key,value) pair,
-    buffer_[r].pop_front();
-    // eventually remove the 'NumRows()' key,
-    if (buffer_[r].empty()) { buffer_.erase(r); }
-    current_ = NULL;
-  }
-}
-
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_MATRIX_BUFFER_H_
-
diff --git a/src/nnet/nnet-max-pooling-2d-component.h b/src/nnet/nnet-max-pooling-2d-component.h
deleted file mode 100644
index 4a4045ca73d..00000000000
--- a/src/nnet/nnet-max-pooling-2d-component.h
+++ /dev/null
@@ -1,225 +0,0 @@
-// nnet/nnet-max-pooling-2d-component.h
-
-// Copyright 2014  Brno University of Technology (author: Karel Vesely),
-//                 Johns Hopkins University (author: Sri Harish Mallidi)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_MAX_POOLING_2D_COMPONENT_H_
-#define KALDI_NNET_NNET_MAX_POOLING_2D_COMPONENT_H_
-
-#include <string>
-#include <vector>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-utils.h"
-#include "cudamatrix/cu-math.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-/**
- * MaxPoolingComponent :
- * The input/output matrices are split to submatrices with width 'pool_stride_'.
- * The pooling is done over 3rd axis, of the set of 2d matrices.
- * Our pooling supports overlaps, overlaps occur when (pool_step_ < pool_size_).
- */
-class MaxPooling2DComponent : public Component {
- public:
-  MaxPooling2DComponent(int32 dim_in, int32 dim_out):
-    Component(dim_in, dim_out),
-    fmap_x_len_(0), fmap_y_len_(0),
-    pool_x_len_(0), pool_y_len_(0),
-    pool_x_step_(0), pool_y_step_(0)
-  { }
-
-  ~MaxPooling2DComponent()
-  { }
-
-  Component* Copy() const { return new MaxPooling2DComponent(*this); }
-  ComponentType GetType() const { return kMaxPooling2DComponent; }
-
-  void InitData(std::istream &is) {
-    // parse config
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<FmapXLen>") ReadBasicType(is, false, &fmap_x_len_);
-      else if (token == "<FmapYLen>") ReadBasicType(is, false, &fmap_y_len_);
-      else if (token == "<PoolXLen>") ReadBasicType(is, false, &pool_x_len_);
-      else if (token == "<PoolYLen>") ReadBasicType(is, false, &pool_y_len_);
-      else if (token == "<PoolXStep>") ReadBasicType(is, false, &pool_x_step_);
-      else if (token == "<PoolYStep>") ReadBasicType(is, false, &pool_y_step_);
-      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-                     << " (FmapXLen|FmapYLen|PoolXLen|PoolYLen|PoolXStep|PoolYStep)";
-    }
-    // check
-    KALDI_ASSERT(fmap_x_len_ * fmap_y_len_ != 0);
-    KALDI_ASSERT(pool_x_len_ * pool_y_len_ != 0);
-    KALDI_ASSERT(pool_x_step_ * pool_y_step_ != 0);
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    // pooling hyperparameters
-    ExpectToken(is, binary, "<FmapXLen>");
-    ReadBasicType(is, binary, &fmap_x_len_);
-    ExpectToken(is, binary, "<FmapYLen>");
-    ReadBasicType(is, binary, &fmap_y_len_);
-    ExpectToken(is, binary, "<PoolXLen>");
-    ReadBasicType(is, binary, &pool_x_len_);
-    ExpectToken(is, binary, "<PoolYLen>");
-    ReadBasicType(is, binary, &pool_y_len_);
-    ExpectToken(is, binary, "<PoolXStep>");
-    ReadBasicType(is, binary, &pool_x_step_);
-    ExpectToken(is, binary, "<PoolYStep>");
-    ReadBasicType(is, binary, &pool_y_step_);
-
-    //
-    // Sanity checks:
-    //
-    // input sanity checks
-    // input_dim_ should be multiple of (fmap_x_len_ * fmap_y_len_)
-    KALDI_ASSERT(input_dim_ % (fmap_x_len_ * fmap_y_len_) == 0);
-    int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_);
-    KALDI_LOG << "num_fmaps " << num_input_fmaps;
-    // check if step is in sync with fmap_len and filt_len
-    KALDI_ASSERT((fmap_x_len_ - pool_x_len_) % (pool_x_step_) == 0);
-    KALDI_ASSERT((fmap_y_len_ - pool_y_len_) % (pool_y_step_) == 0);
-    int32 out_fmap_x_len = (fmap_x_len_ - pool_x_len_)/pool_x_step_ + 1;
-    int32 out_fmap_y_len = (fmap_y_len_ - pool_y_len_)/pool_y_step_ + 1;
-    //    int32 out_fmap_size = out_fmap_x_len*out_fmap_y_len;
-    // output sanity checks
-    KALDI_ASSERT(output_dim_ % (out_fmap_x_len * out_fmap_y_len)  == 0);
-    int32 num_output_fmaps = output_dim_ / (out_fmap_x_len * out_fmap_y_len);
-    KALDI_ASSERT(num_input_fmaps == num_output_fmaps);
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    // pooling hyperparameters
-    WriteToken(os, binary, "<FmapXLen>");
-    WriteBasicType(os, binary, fmap_x_len_);
-    WriteToken(os, binary, "<FmapYLen>");
-    WriteBasicType(os, binary, fmap_y_len_);
-    WriteToken(os, binary, "<PoolXLen>");
-    WriteBasicType(os, binary, pool_x_len_);
-    WriteToken(os, binary, "<PoolYLen>");
-    WriteBasicType(os, binary, pool_y_len_);
-    WriteToken(os, binary, "<PoolXStep>");
-    WriteBasicType(os, binary, pool_x_step_);
-    WriteToken(os, binary, "<PoolYStep>");
-    WriteBasicType(os, binary, pool_y_step_);
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // useful dims
-    int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_);
-    int out_fmap_cnt = 0;
-    for (int32 m = 0; m < fmap_x_len_-pool_x_len_+1; m = m+pool_x_step_) {
-      for (int32 n = 0; n < fmap_y_len_-pool_y_len_+1; n = n+pool_y_step_) {
-        int32 st = 0;
-        st = (m * fmap_y_len_ + n) * num_input_fmaps;
-        CuSubMatrix<BaseFloat> pool(
-          out->ColRange(out_fmap_cnt * num_input_fmaps, num_input_fmaps)
-        );
-        pool.Set(-1e20);  // reset (large neg value)
-        for (int32 i = 0; i < pool_x_len_; i++) {
-          for (int32 j = 0; j < pool_y_len_; j++) {
-            int32 c = 0;
-            c = st + i * (num_input_fmaps * fmap_y_len_)
-                   + j * num_input_fmaps;
-            pool.Max(in.ColRange(c, num_input_fmaps));
-          }
-        }
-        out_fmap_cnt++;
-      }
-    }
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // useful dims
-    int32 num_input_fmaps = input_dim_ / (fmap_x_len_ * fmap_y_len_);
-    int32 inp_fmap_size = fmap_x_len_ * fmap_y_len_;
-
-    //
-    // here we note how many diff matrices are summed for each input patch,
-    std::vector<int32> patch_summands(inp_fmap_size, 0);
-    // this metainfo will be used to divide diff of patches
-    // used in more than one pool.
-    //
-
-    in_diff->SetZero();  // reset
-
-    int out_fmap_cnt = 0;
-    for (int32 m = 0; m < fmap_x_len_-pool_x_len_+1; m = m+pool_x_step_) {
-      for (int32 n = 0; n < fmap_y_len_-pool_y_len_+1; n = n+pool_y_step_) {
-        int32 st = 0;
-        st = (m*fmap_y_len_+n)*num_input_fmaps;
-
-        for (int32 i = 0; i < pool_x_len_; i++) {
-          for (int32 j = 0; j < pool_y_len_; j++) {
-            int32 c = 0;
-            c = st + i * (num_input_fmaps * fmap_y_len_)
-                   + j * num_input_fmaps;
-            //
-            CuSubMatrix<BaseFloat> in_p(in.ColRange(c, num_input_fmaps));
-            CuSubMatrix<BaseFloat> out_p(
-              out.ColRange(out_fmap_cnt*num_input_fmaps, num_input_fmaps)
-            );
-            //
-
-            CuSubMatrix<BaseFloat> tgt(in_diff->ColRange(c, num_input_fmaps));
-            CuMatrix<BaseFloat> src(
-              out_diff.ColRange(out_fmap_cnt*num_input_fmaps, num_input_fmaps)
-            );
-
-            CuMatrix<BaseFloat> mask;
-            in_p.EqualElementMask(out_p, &mask);
-            src.MulElements(mask);
-            tgt.AddMat(1.0, src);
-
-            patch_summands[c/num_input_fmaps] += 1;
-          }
-        }
-        out_fmap_cnt++;
-      }
-    }
-
-    // divide diff by #summands (compensate for patches used in more pools),
-    for (int i = 0; i < fmap_x_len_; i++) {
-      for (int32 j = 0; j < fmap_y_len_; j++) {
-        int32 c = i * fmap_y_len_ + j;
-        CuSubMatrix<BaseFloat> tgt(in_diff->ColRange(c * num_input_fmaps, num_input_fmaps));
-        KALDI_ASSERT(patch_summands[c] > 0);  // patch at least in one pool
-        tgt.Scale(1.0 / patch_summands[c]);
-      }
-    }
-  }
-
- private:
-  int32 fmap_x_len_, fmap_y_len_,
-        pool_x_len_, pool_y_len_,
-        pool_x_step_, pool_y_step_;
-};
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_MAX_POOLING_2D_COMPONENT_H_
diff --git a/src/nnet/nnet-max-pooling-component.h b/src/nnet/nnet-max-pooling-component.h
deleted file mode 100644
index c1add201b02..00000000000
--- a/src/nnet/nnet-max-pooling-component.h
+++ /dev/null
@@ -1,176 +0,0 @@
-// nnet/nnet-max-pooling-component.h
-
-// Copyright 2014  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_MAX_POOLING_COMPONENT_H_
-#define KALDI_NNET_NNET_MAX_POOLING_COMPONENT_H_
-
-#include <string>
-#include <vector>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-utils.h"
-#include "cudamatrix/cu-math.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-/**
- * MaxPoolingComponent :
- * The input/output matrices are split to submatrices with width 'pool_stride_'.
- * The pooling is done over 3rd axis, of the set of 2d matrices.
- * Our pooling supports overlaps, overlaps occur when (pool_step_ < pool_size_).
- */
-class MaxPoolingComponent : public Component {
- public:
-  MaxPoolingComponent(int32 dim_in, int32 dim_out):
-    Component(dim_in, dim_out),
-    pool_size_(0),
-    pool_step_(0),
-    pool_stride_(0)
-  { }
-
-  ~MaxPoolingComponent()
-  { }
-
-  Component* Copy() const { return new MaxPoolingComponent(*this); }
-  ComponentType GetType() const { return kMaxPoolingComponent; }
-
-  void InitData(std::istream &is) {
-    // parse config
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<PoolSize>") ReadBasicType(is, false, &pool_size_);
-      else if (token == "<PoolStep>") ReadBasicType(is, false, &pool_step_);
-      else if (token == "<PoolStride>") ReadBasicType(is, false, &pool_stride_);
-      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-                     << " (PoolSize|PoolStep|PoolStride)";
-    }
-    // check
-    KALDI_ASSERT(pool_size_ != 0 && pool_step_ != 0 && pool_stride_ != 0);
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    // pooling hyperparameters
-    ExpectToken(is, binary, "<PoolSize>");
-    ReadBasicType(is, binary, &pool_size_);
-    ExpectToken(is, binary, "<PoolStep>");
-    ReadBasicType(is, binary, &pool_step_);
-    ExpectToken(is, binary, "<PoolStride>");
-    ReadBasicType(is, binary, &pool_stride_);
-
-    //
-    // Sanity checks:
-    //
-    // number of patches:
-    KALDI_ASSERT(input_dim_ % pool_stride_ == 0);
-    int32 num_patches = input_dim_ / pool_stride_;
-    // number of pools:
-    KALDI_ASSERT((num_patches - pool_size_) % pool_step_ == 0);
-    int32 num_pools = 1 + (num_patches - pool_size_) / pool_step_;
-    // check output dim:
-    KALDI_ASSERT(output_dim_ == num_pools * pool_stride_);
-    //
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    // pooling hyperparameters
-    WriteToken(os, binary, "<PoolSize>");
-    WriteBasicType(os, binary, pool_size_);
-    WriteToken(os, binary, "<PoolStep>");
-    WriteBasicType(os, binary, pool_step_);
-    WriteToken(os, binary, "<PoolStride>");
-    WriteBasicType(os, binary, pool_stride_);
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // useful dims
-    int32 num_patches = input_dim_ / pool_stride_;
-    int32 num_pools = 1 + (num_patches - pool_size_) / pool_step_;
-
-    // do the max-pooling (pools indexed by q)
-    for (int32 q = 0; q < num_pools; q++) {
-      // get output buffer of the pool
-      CuSubMatrix<BaseFloat> pool(out->ColRange(q*pool_stride_, pool_stride_));
-      pool.Set(-1e20);  // reset (large negative value)
-      for (int32 r = 0; r < pool_size_; r++) {  // max
-        int32 p = r + q * pool_step_;  // p = input patch
-        pool.Max(in.ColRange(p*pool_stride_, pool_stride_));
-      }
-    }
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // useful dims
-    int32 num_patches = input_dim_ / pool_stride_;
-    int32 num_pools = 1 + (num_patches - pool_size_) / pool_step_;
-
-    //
-    // here we note how many diff matrices are summed for each input patch,
-    std::vector<int32> patch_summands(num_patches, 0);
-    // this metainfo will be used to divide diff of patches
-    // used in more than one pool.
-    //
-
-    in_diff->SetZero();  // reset
-
-    for (int32 q = 0; q<num_pools; q++) {  // sum
-      for (int32 r = 0; r<pool_size_; r++) {
-        int32 p = r + q * pool_step_;  // patch number
-        //
-        CuSubMatrix<BaseFloat> in_p(in.ColRange(p*pool_stride_, pool_stride_));
-        CuSubMatrix<BaseFloat> out_q(out.ColRange(q*pool_stride_, pool_stride_));
-        //
-        CuSubMatrix<BaseFloat> tgt(in_diff->ColRange(p*pool_stride_, pool_stride_));
-        CuMatrix<BaseFloat> src(out_diff.ColRange(q*pool_stride_, pool_stride_));
-
-        // Only the pool-inputs with 'max-values' are used to back-propagate into,
-        // the rest of derivatives is zeroed-out by a mask.
-        CuMatrix<BaseFloat> mask;
-        in_p.EqualElementMask(out_q, &mask);
-        src.MulElements(mask);
-        tgt.AddMat(1.0, src);
-
-        patch_summands[p] += 1;
-      }
-    }
-
-    // divide diff by #summands (compensate for patches used in more pools)
-    for (int32 p = 0; p < num_patches; p++) {
-      CuSubMatrix<BaseFloat> tgt(in_diff->ColRange(p*pool_stride_, pool_stride_));
-      KALDI_ASSERT(patch_summands[p] > 0);  // patch at least in one pool
-      tgt.Scale(1.0/patch_summands[p]);
-    }
-  }
-
- private:
-  int32 pool_size_,    // input patches used for pooling
-        pool_step_,    // shift used for pooling (allow overlapping pools)
-        pool_stride_;  // stride used to slice input to a vector of matrices
-};
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_MAX_POOLING_COMPONENT_H_
diff --git a/src/nnet/nnet-multibasis-component.h b/src/nnet/nnet-multibasis-component.h
deleted file mode 100644
index be3cd05c9ba..00000000000
--- a/src/nnet/nnet-multibasis-component.h
+++ /dev/null
@@ -1,456 +0,0 @@
-// nnet/nnet-multibasis-component.h
-
-// Copyright 2016  Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_MULTIBASIS_COMPONENT_H_
-#define KALDI_NNET_NNET_MULTIBASIS_COMPONENT_H_
-
-#include <sstream>
-#include <vector>
-#include <string>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-affine-transform.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-class MultiBasisComponent : public UpdatableComponent {
- public:
-  MultiBasisComponent(int32 dim_in, int32 dim_out) :
-    UpdatableComponent(dim_in, dim_out),
-    selector_lr_coef_(1.0),
-    threshold_(0.1)
-  { }
-
-  ~MultiBasisComponent()
-  { }
-
-  Component* Copy() const { return new MultiBasisComponent(*this); }
-  ComponentType GetType() const { return kMultiBasisComponent; }
-
-  void InitData(std::istream &is) {
-    // define options,
-    std::string selector_proto;
-    std::string selector_filename;
-    std::string basis_proto;
-    std::string basis_filename;
-    std::vector<std::string> basis_filename_vector;
-
-    // parse config
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<SelectorProto>") ReadToken(is, false, &selector_proto);
-      else if (token == "<SelectorFilename>") ReadToken(is, false, &selector_filename);
-      else if (token == "<SelectorLearnRateCoef>") ReadBasicType(is, false, &selector_lr_coef_);
-      else if (token == "<BasisProto>") ReadToken(is, false, &basis_proto);
-      else if (token == "<BasisFilename>") ReadToken(is, false, &basis_filename);
-      else if (token == "<BasisFilenameVector>") {
-        while(is >> std::ws, !is.eof()) {
-          std::string file_or_end;
-          ReadToken(is, false, &file_or_end);
-          if (file_or_end == "</BasisFilenameVector>") break;
-          basis_filename_vector.push_back(file_or_end);
-        }
-      } else KALDI_ERR << "Unknown token " << token << ", typo in config?"
-               << " (SelectorProto|SelectorFilename|BasisProto|BasisFilename|BasisFilenameVector)";
-    }
-
-    //// INITIALIZE
-
-    // selector,
-    if (selector_proto != "") {
-      KALDI_LOG << "Initializing 'selector' from : " << selector_proto;
-      selector_.Init(selector_proto);
-    }
-    if (selector_filename != "") {
-      KALDI_LOG << "Reading 'selector' from : " << selector_filename;
-      selector_.Read(selector_filename);
-    }
-
-    // as many empty basis as outputs of the selector,
-    nnet_basis_.resize(selector_.OutputDim());
-    // fill the basis,
-    if (basis_proto != "") {
-      // Initialized from prototype,
-      KALDI_LOG << "Initializing 'basis' from : " << basis_proto;
-      for (int32 i = 0; i < nnet_basis_.size(); i++) {
-        nnet_basis_[i].Init(basis_proto);
-      }
-    } else if (basis_filename != "") {
-      // Load 1 initial basis repeateadly,
-      KALDI_LOG << "Reading 'basis' from : " << basis_filename;
-      for (int32 i = 0; i < nnet_basis_.size(); i++) {
-        nnet_basis_[i].Read(basis_filename);
-      }
-    } else if (basis_filename_vector.size() > 0) {
-      // Read a list of basis functions,
-      if (basis_filename_vector.size() != nnet_basis_.size()) {
-        KALDI_ERR << "We need " << nnet_basis_.size() << " filenames. "
-                  << "We got " << basis_filename_vector.size();
-      }
-      for (int32 i = 0; i < nnet_basis_.size(); i++) {
-        KALDI_LOG << "Reading 'basis' from : "
-                  << basis_filename_vector[i];
-        nnet_basis_[i].Read(basis_filename_vector[i]);
-      }
-    } else {
-      // Initialize basis by square identity matrix,
-      int32 basis_input_dim = InputDim() - selector_.InputDim();
-      KALDI_LOG << "Initializing 'basis' to Identity <AffineTransform> "
-                << OutputDim() << "x" << basis_input_dim;
-      KALDI_ASSERT(OutputDim() == basis_input_dim);  // has to be square!
-      Matrix<BaseFloat> m(OutputDim(), basis_input_dim);
-      m.SetUnit();
-      // wrap identity into AffineTransform,
-      // (bias is vector of zeros),
-      AffineTransform identity_comp(basis_input_dim, OutputDim());
-      identity_comp.SetLinearity(CuMatrix<BaseFloat>(m));
-      //
-      for (int32 i = 0; i < nnet_basis_.size(); i++) {
-        nnet_basis_[i].AppendComponent(identity_comp);
-      }
-    }
-
-    // check,
-    KALDI_ASSERT(InputDim() == selector_.InputDim() + nnet_basis_[0].InputDim());
-    KALDI_ASSERT(OutputDim() == nnet_basis_[0].OutputDim());
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    // Read all the '<Tokens>' in arbitrary order,
-    bool end_loop = false;
-    while (!end_loop && '<' == Peek(is, binary)) {
-      std::string token;
-      int first_char = PeekToken(is, binary);
-      switch (first_char) {
-        case 'S': ReadToken(is, false, &token);
-          /**/ if (token == "<SelectorLearnRateCoef>") ReadBasicType(is, binary, &selector_lr_coef_);
-          else if (token == "<Selector>") selector_.Read(is, binary);
-          else KALDI_ERR << "Unknown token: " << token;
-          break;
-        case 'N': ExpectToken(is, binary, "<NumBasis>");
-          int32 num_basis;
-          ReadBasicType(is, binary, &num_basis);
-          nnet_basis_.resize(num_basis);
-          for (int32 i = 0; i < num_basis; i++) {
-            int32 dummy;
-            ExpectToken(is, binary, "<Basis>");
-            ReadBasicType(is, binary, &dummy);
-            nnet_basis_[i].Read(is, binary);
-          }
-          break;
-        case '!':
-          ExpectToken(is, binary, "<!EndOfComponent>");
-          end_loop=true;
-          break;
-        default:
-          ReadToken(is, false, &token);
-          KALDI_ERR << "Unknown token: " << token;
-      }
-    }
-
-    // check,
-    KALDI_ASSERT(nnet_basis_.size() == selector_.OutputDim());
-    KALDI_ASSERT(InputDim() == selector_.InputDim() + nnet_basis_[0].InputDim());
-    KALDI_ASSERT(OutputDim() == nnet_basis_[0].OutputDim());
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    int32 num_basis = nnet_basis_.size();
-    WriteToken(os, binary, "<SelectorLearnRateCoef>");
-    WriteBasicType(os, binary, selector_lr_coef_);
-    if (!binary) os << "\n\n";
-    WriteToken(os, binary, "<Selector>");
-    if (!binary) os << "\n";
-    selector_.Write(os, binary);
-    if (!binary) os << "\n";
-    WriteToken(os, binary, "<NumBasis>");
-    WriteBasicType(os, binary, num_basis);
-    if (!binary) os << "\n";
-    for (int32 i = 0; i < num_basis; i++) {
-      WriteToken(os, binary, "<Basis>");
-      WriteBasicType(os, binary, i+1);
-      if (!binary) os << "\n";
-      nnet_basis_.at(i).Write(os, binary);
-    }
-  }
-
-  Nnet& GetBasis(int32 id) { return nnet_basis_.at(id); }
-  const Nnet& GetBasis(int32 id) const { return nnet_basis_.at(id); }
-
-  int32 NumParams() const {
-    int32 num_params_sum = selector_.NumParams();
-    for (int32 i = 0; i < nnet_basis_.size(); i++) {
-      num_params_sum += nnet_basis_[i].NumParams();
-    }
-    return num_params_sum;
-  }
-
-  void GetGradient(VectorBase<BaseFloat> *gradient) const {
-    KALDI_ERR << "TODO, not yet implemented!";
-  }
-
-  void GetParams(VectorBase<BaseFloat> *params) const {
-    int32 offset = 0;
-    Vector<BaseFloat> params_tmp;
-    // selector,
-    selector_.GetParams(&params_tmp);
-    params->Range(offset, params_tmp.Dim()).CopyFromVec(params_tmp);
-    offset += params_tmp.Dim();
-    // basis,
-    for (int32 i = 0; i < nnet_basis_.size(); i++) {
-      nnet_basis_[i].GetParams(&params_tmp);
-      params->Range(offset, params_tmp.Dim()).CopyFromVec(params_tmp);
-      offset += params_tmp.Dim();
-    }
-    KALDI_ASSERT(offset == NumParams());
-  }
-
-  void SetParams(const VectorBase<BaseFloat> &params) {
-    int32 offset = 0;
-    // selector,
-    selector_.SetParams(params.Range(offset, selector_.NumParams()));
-    offset += selector_.NumParams();
-    // basis,
-    for (int32 i = 0; i < nnet_basis_.size(); i++) {
-      nnet_basis_[i].SetParams(params.Range(offset, nnet_basis_[i].NumParams()));
-      offset += nnet_basis_[i].NumParams();
-    }
-    KALDI_ASSERT(offset == NumParams());
-  }
-
-  std::string Info() const {
-    std::ostringstream os;
-    for (int32 i = 0; i < nnet_basis_.size(); i++) {
-      os << "basis_network #" << i+1 << " {\n"
-         << nnet_basis_[i].Info()
-         << "}\n";
-    }
-    os << "\nselector {\n"
-       << selector_.Info()
-       << "}";
-    return os.str();
-  }
-
-  std::string InfoGradient() const {
-    std::ostringstream os;
-    for (int32 i = 0; i < nnet_basis_.size(); i++) {
-      if (posterior_sum_(i) > threshold_) {
-        os << "basis_gradient #" << i+1 << " {\n"
-           << nnet_basis_[i].InfoGradient(false)
-           << "}\n";
-      }
-    }
-    os << "selector_gradient {\n"
-       << selector_.InfoGradient(false)
-       << "}";
-    return os.str();
-  }
-
-  std::string InfoPropagate() const {
-    std::ostringstream os;
-    for (int32 i = 0; i < nnet_basis_.size(); i++) {
-      if (posterior_sum_(i) > threshold_) {
-        os << "basis_propagate #" << i+1 << " {\n"
-           << nnet_basis_[i].InfoPropagate(false)
-           << "}\n";
-      }
-    }
-    os << "selector_propagate {\n"
-       << selector_.InfoPropagate(false)
-       << "}\n";
-    return os.str();
-  }
-
-  std::string InfoBackPropagate() const {
-    std::ostringstream os;
-    for (int32 i = 0; i < nnet_basis_.size(); i++) {
-      if (posterior_sum_(i) > threshold_) {
-        os << "basis_backpropagate #" << i+1 << "{\n"
-           << nnet_basis_[i].InfoBackPropagate(false)
-           << "}\n";
-      }
-    }
-    os << "selector_backpropagate {\n"
-       << selector_.InfoBackPropagate(false)
-       << "}\n";
-    return os.str();
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // dimensions,
-    int32 num_basis = nnet_basis_.size();
-
-    // make sure we have all the buffers,
-    if (basis_out_.size() != num_basis) {
-      basis_out_.resize(num_basis);
-    }
-
-    // split the input,
-    const CuSubMatrix<BaseFloat> in_basis(
-        in.ColRange(0, nnet_basis_[0].InputDim())
-    );
-    const CuSubMatrix<BaseFloat> in_selector(
-        in.ColRange(nnet_basis_[0].InputDim(), selector_.InputDim())
-    );
-
-    // get the 'selector_' posteriors,
-    selector_.Propagate(in_selector, &posterior_);
-    KALDI_ASSERT(posterior_.Row(0).Min() >= 0.0);
-    KALDI_ASSERT(posterior_.Row(0).Max() <= 1.0);
-    KALDI_ASSERT(ApproxEqual(posterior_.Row(0).Sum(), 1.0));
-    posterior_.Transpose();  // trans,
-
-    // sum 'selector_' posteriors over time,
-    CuVector<BaseFloat> posterior_sum(num_basis);
-    posterior_sum.AddColSumMat(1.0, posterior_, 0.0);
-    posterior_sum_ = Vector<BaseFloat>(posterior_sum);
-
-    // combine the 'basis' outputs,
-    for (int32 i = 0; i < nnet_basis_.size(); i++) {
-      if (posterior_sum_(i) > threshold_) {
-        // use only basis with occupancy >0.1,
-        nnet_basis_[i].Propagate(in_basis, &basis_out_[i]);
-        out->AddDiagVecMat(1.0, posterior_.Row(i), basis_out_[i], kNoTrans);
-      }
-    }
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // dimensions,
-    int32 num_basis = nnet_basis_.size(),
-          num_frames = in.NumRows();
-
-    // split the in_diff,
-    CuSubMatrix<BaseFloat> in_diff_basis(
-        in_diff->ColRange(0, nnet_basis_[0].InputDim())
-    );
-    CuSubMatrix<BaseFloat> in_diff_selector(
-        in_diff->ColRange(nnet_basis_[0].InputDim(), selector_.InputDim())
-    );
-
-    // backprop through 'selector',
-    CuMatrix<BaseFloat> selector_out_diff(num_basis, num_frames);
-    for (int32 i = 0; i < num_basis; i++) {
-      if (posterior_sum_(i) > threshold_) {
-        selector_out_diff.Row(i).AddDiagMatMat(1.0, out_diff, kNoTrans, basis_out_[i], kTrans, 0.0);
-      }
-    }
-    selector_out_diff.Transpose();
-    selector_out_diff.Scale(selector_lr_coef_);
-    CuMatrix<BaseFloat> in_diff_selector_tmp;
-    selector_.Backpropagate(selector_out_diff, &in_diff_selector_tmp);
-    in_diff_selector.CopyFromMat(in_diff_selector_tmp);
-
-    // backprop through 'basis',
-    CuMatrix<BaseFloat> out_diff_scaled(num_frames, OutputDim()),
-                        in_diff_basis_tmp;
-    for (int32 i = 0; i < num_basis; i++) {
-      // use only basis with occupancy >0.1,
-      if (posterior_sum_(i) > threshold_) {
-        out_diff_scaled.AddDiagVecMat(1.0, posterior_.Row(i), out_diff, kNoTrans, 0.0);
-        nnet_basis_[i].Backpropagate(out_diff_scaled, &in_diff_basis_tmp);
-        in_diff_basis.AddMat(1.0, in_diff_basis_tmp);
-      }
-    }
-  }
-
-  void Update(const CuMatrixBase<BaseFloat> &input,
-              const CuMatrixBase<BaseFloat> &diff) {
-    { }  // do nothing
-  }
-
-  /**
-   * Overriding the default,
-   * which was UpdatableComponent::SetTrainOptions(...)
-   */
-  void SetTrainOptions(const NnetTrainOptions &opts) {
-    selector_.SetTrainOptions(opts);
-    for (int32 i=0; i<nnet_basis_.size(); i++) {
-      nnet_basis_[i].SetTrainOptions(opts);
-    }
-  }
-
-  /**
-   * Overriding the default,
-   * which was UpdatableComponent::SetLearnRateCoef(...)
-   */
-  void SetLearnRateCoef(BaseFloat val) {
-    // loop over nnets,
-    for (int32 i = 0; i < nnet_basis_.size(); i++) {
-      // loop over components,
-      for (int32 j = 0; j < nnet_basis_[i].NumComponents(); j++) {
-        if (nnet_basis_[i].GetComponent(j).IsUpdatable()) {
-          UpdatableComponent& comp =
-            dynamic_cast<UpdatableComponent&>(nnet_basis_[i].GetComponent(j));
-          // set the value,
-          comp.SetLearnRateCoef(val);
-        }
-      }
-    }
-  }
-
-  /**
-   * Overriding the default,
-   * which was UpdatableComponent::SetBiasLearnRateCoef(...)
-   */
-  void SetBiasLearnRateCoef(BaseFloat val) {
-    // loop over nnets,
-    for (int32 i = 0; i < nnet_basis_.size(); i++) {
-      // loop over components,
-      for (int32 j = 0; j < nnet_basis_[i].NumComponents(); j++) {
-        if (nnet_basis_[i].GetComponent(j).IsUpdatable()) {
-          UpdatableComponent& comp =
-            dynamic_cast<UpdatableComponent&>(nnet_basis_[i].GetComponent(j));
-          // set the value,
-          comp.SetBiasLearnRateCoef(val);
-        }
-      }
-    }
-  }
-
- private:
-  /// The vector of 'basis' networks (output of basis is combined
-  /// according to the posterior_ from the selector_)
-  std::vector<Nnet> nnet_basis_;
-  std::vector<CuMatrix<BaseFloat> > basis_out_;
-
-  /// Selector network,
-  Nnet selector_;
-  BaseFloat selector_lr_coef_;
-
-  /// The output of 'selector_',
-  CuMatrix<BaseFloat> posterior_;
-  Vector<BaseFloat> posterior_sum_;
-
-  /// Threshold, applied to posterior_sum_, disables the unused basis,
-  BaseFloat threshold_;
-
-};
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_MULTIBASIS_COMPONENT_H_
diff --git a/src/nnet/nnet-nnet.cc b/src/nnet/nnet-nnet.cc
deleted file mode 100644
index 86c5f9e5ad0..00000000000
--- a/src/nnet/nnet-nnet.cc
+++ /dev/null
@@ -1,520 +0,0 @@
-// nnet/nnet-nnet.cc
-
-// Copyright 2011-2016  Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet/nnet-nnet.h"
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-parallel-component.h"
-#include "nnet/nnet-multibasis-component.h"
-#include "nnet/nnet-activation.h"
-#include "nnet/nnet-affine-transform.h"
-#include "nnet/nnet-various.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-Nnet::Nnet() {
-}
-
-Nnet::~Nnet() {
-  Destroy();
-}
-
-Nnet::Nnet(const Nnet& other) {
-  // copy the components
-  for (int32 i = 0; i < other.NumComponents(); i++) {
-    components_.push_back(other.GetComponent(i).Copy());
-  }
-  // create empty buffers
-  propagate_buf_.resize(NumComponents()+1);
-  backpropagate_buf_.resize(NumComponents()+1);
-  // copy train opts
-  SetTrainOptions(other.opts_);
-  Check();
-}
-
-Nnet& Nnet::operator= (const Nnet& other) {
-  Destroy();
-  // copy the components
-  for (int32 i = 0; i < other.NumComponents(); i++) {
-    components_.push_back(other.GetComponent(i).Copy());
-  }
-  // create empty buffers
-  propagate_buf_.resize(NumComponents()+1);
-  backpropagate_buf_.resize(NumComponents()+1);
-  // copy train opts
-  SetTrainOptions(other.opts_);
-  Check();
-  return *this;
-}
-
-/**
- * Forward propagation through the network,
- * (from first component to last).
- */
-void Nnet::Propagate(const CuMatrixBase<BaseFloat> &in,
-                     CuMatrix<BaseFloat> *out) {
-  // In case of empty network copy input to output,
-  if (NumComponents() == 0) {
-    (*out) = in;  // copy,
-    return;
-  }
-  // We need C+1 buffers,
-  if (propagate_buf_.size() != NumComponents()+1) {
-    propagate_buf_.resize(NumComponents()+1);
-  }
-  // Copy input to first buffer,
-  propagate_buf_[0] = in;
-  // Propagate through all the components,
-  for (int32 i = 0; i < static_cast<int32>(components_.size()); i++) {
-    components_[i]->Propagate(propagate_buf_[i], &propagate_buf_[i+1]);
-  }
-  // Copy the output from the last buffer,
-  (*out) = propagate_buf_[NumComponents()];
-}
-
-
-/**
- * Error back-propagation through the network,
- * (from last component to first).
- */
-void Nnet::Backpropagate(const CuMatrixBase<BaseFloat> &out_diff,
-                         CuMatrix<BaseFloat> *in_diff) {
-  // Copy the derivative in case of empty network,
-  if (NumComponents() == 0) {
-    (*in_diff) = out_diff;  // copy,
-    return;
-  }
-  // We need C+1 buffers,
-  KALDI_ASSERT(static_cast<int32>(propagate_buf_.size()) == NumComponents()+1);
-  if (backpropagate_buf_.size() != NumComponents()+1) {
-    backpropagate_buf_.resize(NumComponents()+1);
-  }
-  // Copy 'out_diff' to last buffer,
-  backpropagate_buf_[NumComponents()] = out_diff;
-  // Loop from last Component to the first,
-  for (int32 i = NumComponents()-1; i >= 0; i--) {
-    // Backpropagate through 'Component',
-    components_[i]->Backpropagate(propagate_buf_[i],
-                                  propagate_buf_[i+1],
-                                  backpropagate_buf_[i+1],
-                                  &backpropagate_buf_[i]);
-    // Update 'Component' (if applicable),
-    if (components_[i]->IsUpdatable()) {
-      UpdatableComponent* uc =
-        dynamic_cast<UpdatableComponent*>(components_[i]);
-      uc->Update(propagate_buf_[i], backpropagate_buf_[i+1]);
-    }
-  }
-  // Export the derivative (if applicable),
-  if (NULL != in_diff) {
-    (*in_diff) = backpropagate_buf_[0];
-  }
-}
-
-
-void Nnet::Feedforward(const CuMatrixBase<BaseFloat> &in,
-                       CuMatrix<BaseFloat> *out) {
-  KALDI_ASSERT(NULL != out);
-  (*out) = in;  // works even with 0 components,
-  CuMatrix<BaseFloat> tmp_in;
-  for (int32 i = 0; i < NumComponents(); i++) {
-    out->Swap(&tmp_in);
-    components_[i]->Propagate(tmp_in, out);
-  }
-}
-
-
-int32 Nnet::OutputDim() const {
-  KALDI_ASSERT(!components_.empty());
-  return components_.back()->OutputDim();
-}
-
-int32 Nnet::InputDim() const {
-  KALDI_ASSERT(!components_.empty());
-  return components_.front()->InputDim();
-}
-
-const Component& Nnet::GetComponent(int32 c) const {
-  return *(components_.at(c));
-}
-
-Component& Nnet::GetComponent(int32 c) {
-  return *(components_.at(c));
-}
-
-const Component& Nnet::GetLastComponent() const {
-  return *(components_.at(NumComponents()-1));
-}
-
-Component& Nnet::GetLastComponent() {
-  return *(components_.at(NumComponents()-1));
-}
-
-void Nnet::ReplaceComponent(int32 c, const Component& comp) {
-  delete components_.at(c);
-  components_.at(c) = comp.Copy();  // deep copy,
-  Check();
-}
-
-void Nnet::SwapComponent(int32 c, Component** comp) {
-  Component* tmp = components_.at(c);
-  components_.at(c) = *comp;
-  (*comp) = tmp;
-  Check();
-}
-
-void Nnet::AppendComponent(const Component& comp) {
-  components_.push_back(comp.Copy());  // append,
-  Check();
-}
-
-void Nnet::AppendComponentPointer(Component* dynamically_allocated_comp) {
-  components_.push_back(dynamically_allocated_comp);  // append,
-  Check();
-}
-
-void Nnet::AppendNnet(const Nnet& other) {
-  for (int32 i = 0; i < other.NumComponents(); i++) {
-    AppendComponent(other.GetComponent(i));
-  }
-  Check();
-}
-
-void Nnet::RemoveComponent(int32 c) {
-  Component* ptr = components_.at(c);
-  components_.erase(components_.begin()+c);
-  delete ptr;
-  Check();
-}
-
-void Nnet::RemoveLastComponent() {
-  RemoveComponent(NumComponents()-1);
-}
-
-int32 Nnet::NumParams() const {
-  int32 n_params = 0;
-  for (int32 n = 0; n < components_.size(); n++) {
-    if (components_[n]->IsUpdatable()) {
-      n_params +=
-        dynamic_cast<UpdatableComponent*>(components_[n])->NumParams();
-    }
-  }
-  return n_params;
-}
-
-void Nnet::GetGradient(Vector<BaseFloat>* gradient) const {
-  gradient->Resize(NumParams());
-  int32 pos = 0;
-  // loop over Components,
-  for (int32 i = 0; i < components_.size(); i++) {
-    if (components_[i]->IsUpdatable()) {
-      UpdatableComponent& c =
-        dynamic_cast<UpdatableComponent&>(*components_[i]);
-      SubVector<BaseFloat> grad_range(gradient->Range(pos, c.NumParams()));
-      c.GetGradient(&grad_range);  // getting gradient,
-      pos += c.NumParams();
-    }
-  }
-  KALDI_ASSERT(pos == NumParams());
-}
-
-void Nnet::GetParams(Vector<BaseFloat>* params) const {
-  params->Resize(NumParams());
-  int32 pos = 0;
-  // loop over Components,
-  for (int32 i = 0; i < components_.size(); i++) {
-    if (components_[i]->IsUpdatable()) {
-      UpdatableComponent& c =
-        dynamic_cast<UpdatableComponent&>(*components_[i]);
-      SubVector<BaseFloat> params_range(params->Range(pos, c.NumParams()));
-      c.GetParams(&params_range);  // getting params,
-      pos += c.NumParams();
-    }
-  }
-  KALDI_ASSERT(pos == NumParams());
-}
-
-void Nnet::SetParams(const VectorBase<BaseFloat>& params) {
-  KALDI_ASSERT(params.Dim() == NumParams());
-  int32 pos = 0;
-  // loop over Components,
-  for (int32 i = 0; i < components_.size(); i++) {
-    if (components_[i]->IsUpdatable()) {
-      UpdatableComponent& c =
-        dynamic_cast<UpdatableComponent&>(*components_[i]);
-      c.SetParams(params.Range(pos, c.NumParams()));  // setting params,
-      pos += c.NumParams();
-    }
-  }
-  KALDI_ASSERT(pos == NumParams());
-}
-
-void Nnet::SetDropoutRate(BaseFloat r)  {
-  for (int32 c = 0; c < NumComponents(); c++) {
-    if (GetComponent(c).GetType() == Component::kDropout) {
-      Dropout& comp = dynamic_cast<Dropout&>(GetComponent(c));
-      BaseFloat r_old = comp.GetDropoutRate();
-      comp.SetDropoutRate(r);
-      KALDI_LOG << "Setting dropout-rate in component " << c
-                << " from " << r_old << " to " << r;
-    }
-  }
-}
-
-
-void Nnet::ResetStreams(const std::vector<int32> &stream_reset_flag) {
-  for (int32 c = 0; c < NumComponents(); c++) {
-    if (GetComponent(c).IsMultistream()) {
-      MultistreamComponent& comp =
-        dynamic_cast<MultistreamComponent&>(GetComponent(c));
-      comp.ResetStreams(stream_reset_flag);
-    }
-  }
-}
-
-void Nnet::SetSeqLengths(const std::vector<int32> &sequence_lengths) {
-  for (int32 c = 0; c < NumComponents(); c++) {
-    if (GetComponent(c).IsMultistream()) {
-      MultistreamComponent& comp =
-        dynamic_cast<MultistreamComponent&>(GetComponent(c));
-      comp.SetSeqLengths(sequence_lengths);
-    }
-  }
-}
-
-void Nnet::Init(const std::string &proto_file) {
-  Input in(proto_file);
-  std::istream &is = in.Stream();
-  std::string proto_line, token;
-
-  // Initialize from the prototype, where each line
-  // contains the description for one component.
-  while (is >> std::ws, !is.eof()) {
-    KALDI_ASSERT(is.good());
-
-    // get a line from the proto file,
-    std::getline(is, proto_line);
-    if (proto_line == "") continue;
-    KALDI_VLOG(1) << proto_line;
-
-    // get the 1st token from the line,
-    std::istringstream(proto_line) >> std::ws >> token;
-    // ignore these tokens:
-    if (token == "<NnetProto>" || token == "</NnetProto>") continue;
-
-    // create new component, append to Nnet,
-    this->AppendComponentPointer(Component::Init(proto_line+"\n"));
-  }
-  // cleanup
-  in.Close();
-  Check();
-}
-
-
-/**
- * I/O wrapper for converting 'rxfilename' to 'istream',
- */
-void Nnet::Read(const std::string &rxfilename) {
-  bool binary;
-  Input in(rxfilename, &binary);
-  Read(in.Stream(), binary);
-  in.Close();
-  // Warn if the NN is empty
-  if (NumComponents() == 0) {
-    KALDI_WARN << "The network '" << rxfilename << "' is empty.";
-  }
-}
-
-
-void Nnet::Read(std::istream &is, bool binary) {
-  // Read the Components through the 'factory' Component::Read(...),
-  Component* comp(NULL);
-  while (comp = Component::Read(is, binary), comp != NULL) {
-    // Check dims,
-    if (NumComponents() > 0) {
-      if (components_.back()->OutputDim() != comp->InputDim()) {
-        KALDI_ERR << "Dimensionality mismatch!"
-                  << " Previous layer output:" << components_.back()->OutputDim()
-                  << " Current layer input:" << comp->InputDim();
-      }
-    }
-    // Append to 'this' Nnet,
-    AppendComponentPointer(comp);
-  }
-  Check();
-}
-
-
-/**
- * I/O wrapper for converting 'wxfilename' to 'ostream',
- */
-void Nnet::Write(const std::string &wxfilename, bool binary) const {
-  Output out(wxfilename, binary, true);
-  Write(out.Stream(), binary);
-  out.Close();
-}
-
-
-void Nnet::Write(std::ostream &os, bool binary) const {
-  Check();
-  WriteToken(os, binary, "<Nnet>");
-  if (binary == false) os << std::endl;
-  for (int32 i = 0; i < NumComponents(); i++) {
-    components_[i]->Write(os, binary);
-  }
-  WriteToken(os, binary, "</Nnet>");
-  if (binary == false) os << std::endl;
-}
-
-
-std::string Nnet::Info() const {
-  // global info
-  std::ostringstream ostr;
-  ostr << "num-components " << NumComponents() << std::endl;
-  if (NumComponents() == 0)
-    return ostr.str();
-  ostr << "input-dim " << InputDim() << std::endl;
-  ostr << "output-dim " << OutputDim() << std::endl;
-  ostr << "number-of-parameters " << static_cast<float>(NumParams())/1e6
-       << " millions" << std::endl;
-  // topology & weight stats
-  for (int32 i = 0; i < NumComponents(); i++) {
-    ostr << "component " << i+1 << " : "
-         << Component::TypeToMarker(components_[i]->GetType())
-         << ", input-dim " << components_[i]->InputDim()
-         << ", output-dim " << components_[i]->OutputDim()
-         << ", " << components_[i]->Info() << std::endl;
-  }
-  return ostr.str();
-}
-
-std::string Nnet::InfoGradient(bool header) const {
-  std::ostringstream ostr;
-  // gradient stats
-  if (header) ostr << "\n### GRADIENT STATS :\n";
-  for (int32 i = 0; i < NumComponents(); i++) {
-    ostr << "Component " << i+1 << " : "
-         << Component::TypeToMarker(components_[i]->GetType())
-         << ", " << components_[i]->InfoGradient() << std::endl;
-  }
-  if (header) ostr << "### END GRADIENT\n";
-  return ostr.str();
-}
-
-std::string Nnet::InfoPropagate(bool header) const {
-  std::ostringstream ostr;
-  // forward-pass buffer stats
-  if (header) ostr << "\n### FORWARD PROPAGATION BUFFER CONTENT :\n";
-  ostr << "[0] output of <Input> " << MomentStatistics(propagate_buf_[0])
-       << std::endl;
-  for (int32 i = 0; i < NumComponents(); i++) {
-    ostr << "[" << 1+i << "] output of "
-         << Component::TypeToMarker(components_[i]->GetType())
-         << MomentStatistics(propagate_buf_[i+1]) << std::endl;
-    // nested networks too...
-    if (Component::kParallelComponent == components_[i]->GetType()) {
-      ostr <<
-        dynamic_cast<ParallelComponent*>(components_[i])->InfoPropagate();
-    }
-    if (Component::kMultiBasisComponent == components_[i]->GetType()) {
-      ostr << dynamic_cast<MultiBasisComponent*>(components_[i])->InfoPropagate();
-    }
-  }
-  if (header) ostr << "### END FORWARD\n";
-  return ostr.str();
-}
-
-std::string Nnet::InfoBackPropagate(bool header) const {
-  std::ostringstream ostr;
-  // forward-pass buffer stats
-  if (header) ostr << "\n### BACKWARD PROPAGATION BUFFER CONTENT :\n";
-  ostr << "[0] diff of <Input> " << MomentStatistics(backpropagate_buf_[0])
-       << std::endl;
-  for (int32 i = 0; i < NumComponents(); i++) {
-    ostr << "["<<1+i<< "] diff-output of "
-         << Component::TypeToMarker(components_[i]->GetType())
-         << MomentStatistics(backpropagate_buf_[i+1]) << std::endl;
-    // nested networks too...
-    if (Component::kParallelComponent == components_[i]->GetType()) {
-      ostr <<
-        dynamic_cast<ParallelComponent*>(components_[i])->InfoBackPropagate();
-    }
-    if (Component::kMultiBasisComponent == components_[i]->GetType()) {
-      ostr << dynamic_cast<MultiBasisComponent*>(components_[i])->InfoBackPropagate();
-    }
-  }
-  if (header) ostr << "### END BACKWARD\n\n";
-  return ostr.str();
-}
-
-
-void Nnet::Check() const {
-  // check dims,
-  for (size_t i = 0; i + 1 < components_.size(); i++) {
-    KALDI_ASSERT(components_[i] != NULL);
-    int32 output_dim = components_[i]->OutputDim(),
-      next_input_dim = components_[i+1]->InputDim();
-    // show error message,
-    if (output_dim != next_input_dim) {
-      KALDI_ERR << "Component dimension mismatch!"
-                << " Output dim of [" << i << "] "
-                << Component::TypeToMarker(components_[i]->GetType())
-                << " is " << output_dim << ". "
-                << "Input dim of next [" << i+1 << "] "
-                << Component::TypeToMarker(components_[i+1]->GetType())
-                << " is " << next_input_dim << ".";
-    }
-  }
-  // check for nan/inf in network weights,
-  Vector<BaseFloat> weights;
-  GetParams(&weights);
-  BaseFloat sum = weights.Sum();
-  if (KALDI_ISINF(sum)) {
-    KALDI_ERR << "'inf' in network parameters "
-              << "(weight explosion, need lower learning rate?)";
-  }
-  if (KALDI_ISNAN(sum)) {
-    KALDI_ERR << "'nan' in network parameters (need lower learning rate?)";
-  }
-}
-
-
-void Nnet::Destroy() {
-  for (int32 i = 0; i < NumComponents(); i++) {
-    delete components_[i];
-  }
-  components_.resize(0);
-  propagate_buf_.resize(0);
-  backpropagate_buf_.resize(0);
-}
-
-
-void Nnet::SetTrainOptions(const NnetTrainOptions& opts) {
-  opts_ = opts;
-  // set values to individual components,
-  for (int32 l = 0; l < NumComponents(); l++) {
-    if (GetComponent(l).IsUpdatable()) {
-      dynamic_cast<UpdatableComponent&>(GetComponent(l)).SetTrainOptions(opts_);
-    }
-  }
-}
-
-
-}  // namespace nnet1
-}  // namespace kaldi
diff --git a/src/nnet/nnet-nnet.h b/src/nnet/nnet-nnet.h
deleted file mode 100644
index cf29f91a89d..00000000000
--- a/src/nnet/nnet-nnet.h
+++ /dev/null
@@ -1,186 +0,0 @@
-// nnet/nnet-nnet.h
-
-// Copyright 2011-2016  Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET_NNET_NNET_H_
-#define KALDI_NNET_NNET_NNET_H_
-
-#include <string>
-#include <vector>
-#include <iostream>
-#include <sstream>
-
-#include "base/kaldi-common.h"
-#include "util/kaldi-io.h"
-#include "matrix/matrix-lib.h"
-#include "nnet/nnet-trnopts.h"
-#include "nnet/nnet-component.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-class Nnet {
- public:
-  Nnet();
-  ~Nnet();
-
-  Nnet(const Nnet& other);  // Allow copy constructor.
-  Nnet& operator= (const Nnet& other);  // Allow assignment operator.
-
- public:
-  /// Perform forward pass through the network,
-  void Propagate(const CuMatrixBase<BaseFloat> &in,
-                 CuMatrix<BaseFloat> *out);
-  /// Perform backward pass through the network,
-  void Backpropagate(const CuMatrixBase<BaseFloat> &out_diff,
-                     CuMatrix<BaseFloat> *in_diff);
-  /// Perform forward pass through the network (with 2 swapping buffers),
-  void Feedforward(const CuMatrixBase<BaseFloat> &in,
-                   CuMatrix<BaseFloat> *out);
-
-  /// Dimensionality on network input (input feature dim.),
-  int32 InputDim() const;
-  /// Dimensionality of network outputs (posteriors | bn-features | etc.),
-  int32 OutputDim() const;
-
-  /// Returns the number of 'Components' which form the NN.
-  /// Typically a NN layer is composed of 2 components:
-  /// the <AffineTransform> with trainable parameters
-  /// and a non-linearity like <Sigmoid> or <Softmax>.
-  /// Usually there are 2x more Components than the NN layers.
-  int32 NumComponents() const {
-    return components_.size();
-  }
-
-  /// Component accessor,
-  const Component& GetComponent(int32 c) const;
-
-  /// Component accessor,
-  Component& GetComponent(int32 c);
-
-  /// LastComponent accessor,
-  const Component& GetLastComponent() const;
-
-  /// LastComponent accessor,
-  Component& GetLastComponent();
-
-  /// Replace c'th component in 'this' Nnet (deep copy),
-  void ReplaceComponent(int32 c, const Component& comp);
-
-  /// Swap c'th component with the pointer,
-  void SwapComponent(int32 c, Component** comp);
-
-  /// Append Component to 'this' instance of Nnet (deep copy),
-  void AppendComponent(const Component& comp);
-
-  /// Append Component* to 'this' instance of Nnet by a shallow copy
-  /// ('this' instance of Nnet over-takes the ownership of the pointer).
-  void AppendComponentPointer(Component *dynamically_allocated_comp);
-
-  /// Append other Nnet to the 'this' Nnet (copy all its components),
-  void AppendNnet(const Nnet& nnet_to_append);
-
-  /// Remove c'th component,
-  void RemoveComponent(int32 c);
-
-  /// Remove the last of the Components,
-  void RemoveLastComponent();
-
-  /// Access to the forward-pass buffers
-  const std::vector<CuMatrix<BaseFloat> >& PropagateBuffer() const {
-    return propagate_buf_;
-  }
-  /// Access to the backward-pass buffers
-  const std::vector<CuMatrix<BaseFloat> >& BackpropagateBuffer() const {
-    return backpropagate_buf_;
-  }
-
-  /// Get the number of parameters in the network,
-  int32 NumParams() const;
-
-  /// Get the gradient stored in the network,
-  void GetGradient(Vector<BaseFloat>* gradient) const;
-
-  /// Get the network weights in a supervector,
-  void GetParams(Vector<BaseFloat>* params) const;
-
-  /// Set the network weights from a supervector,
-  void SetParams(const VectorBase<BaseFloat>& params);
-
-  /// Set the dropout rate
-  void SetDropoutRate(BaseFloat r);
-
-  /// Reset streams in multi-stream training,
-  void ResetStreams(const std::vector<int32> &stream_reset_flag);
-
-  /// Set sequence length in LSTM multi-stream training,
-  void SetSeqLengths(const std::vector<int32> &sequence_lengths);
-
-  /// Initialize the Nnet from the prototype,
-  void Init(const std::string &proto_file);
-
-  /// Read Nnet from 'rxfilename',
-  void Read(const std::string &rxfilename);
-  /// Read Nnet from 'istream',
-  void Read(std::istream &in, bool binary);
-
-  /// Write Nnet to 'wxfilename',
-  void Write(const std::string &wxfilename, bool binary) const;
-  /// Write Nnet to 'ostream',
-  void Write(std::ostream &out, bool binary) const;
-
-  /// Create string with human readable description of the nnet,
-  std::string Info() const;
-  /// Create string with per-component gradient statistics,
-  std::string InfoGradient(bool header = true) const;
-  /// Create string with propagation-buffer statistics,
-  std::string InfoPropagate(bool header = true) const;
-  /// Create string with back-propagation-buffer statistics,
-  std::string InfoBackPropagate(bool header = true) const;
-  /// Consistency check,
-  void Check() const;
-  /// Relese the memory,
-  void Destroy();
-
-  /// Set hyper-parameters of the training (pushes to all UpdatableComponents),
-  void SetTrainOptions(const NnetTrainOptions& opts);
-  /// Get training hyper-parameters from the network,
-  const NnetTrainOptions& GetTrainOptions() const {
-    return opts_;
-  }
-
- private:
-  /// Vector which contains all the components composing the neural network,
-  /// the components are for example: AffineTransform, Sigmoid, Softmax
-  std::vector<Component*> components_;
-
-  /// Buffers for forward pass (on demand initialization),
-  std::vector<CuMatrix<BaseFloat> > propagate_buf_;
-  /// Buffers for backward pass (on demand initialization),
-  std::vector<CuMatrix<BaseFloat> > backpropagate_buf_;
-
-  /// Option class with hyper-parameters passed to UpdatableComponent(s)
-  NnetTrainOptions opts_;
-};
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_NNET_H_
-
-
diff --git a/src/nnet/nnet-parallel-component.h b/src/nnet/nnet-parallel-component.h
deleted file mode 100644
index 95dfddf612d..00000000000
--- a/src/nnet/nnet-parallel-component.h
+++ /dev/null
@@ -1,361 +0,0 @@
-// nnet/nnet-parallel-component.h
-
-// Copyright 2014  Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_PARALLEL_COMPONENT_H_
-#define KALDI_NNET_NNET_PARALLEL_COMPONENT_H_
-
-#include <string>
-#include <vector>
-#include <sstream>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-utils.h"
-#include "cudamatrix/cu-math.h"
-
-
-namespace kaldi {
-namespace nnet1 {
-
-class ParallelComponent : public MultistreamComponent {
- public:
-  ParallelComponent(int32 dim_in, int32 dim_out):
-    MultistreamComponent(dim_in, dim_out)
-  { }
-
-  ~ParallelComponent()
-  { }
-
-  Component* Copy() const { return new ParallelComponent(*this); }
-  ComponentType GetType() const { return kParallelComponent; }
-
-  const Nnet& GetNestedNnet(int32 id) const { return nnet_.at(id); }
-  Nnet& GetNestedNnet(int32 id) { return nnet_.at(id); }
-
-  void InitData(std::istream &is) {
-    // define options
-    std::vector<std::string> nested_nnet_proto;
-    std::vector<std::string> nested_nnet_filename;
-    // parse config
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<NestedNnet>" || token == "<NestedNnetFilename>") {
-        while (is >> std::ws, !is.eof()) {
-          std::string file_or_end;
-          ReadToken(is, false, &file_or_end);
-          if (file_or_end == "</NestedNnet>" ||
-              file_or_end == "</NestedNnetFilename>") break;
-          nested_nnet_filename.push_back(file_or_end);
-        }
-      } else if (token == "<NestedNnetProto>") {
-        while (is >> std::ws, !is.eof()) {
-          std::string file_or_end;
-          ReadToken(is, false, &file_or_end);
-          if (file_or_end == "</NestedNnetProto>") break;
-          nested_nnet_proto.push_back(file_or_end);
-        }
-      } else { KALDI_ERR << "Unknown token " << token << ", typo in config?"
-                         << " (NestedNnet|NestedNnetFilename|NestedNnetProto)";
-      }
-    }
-    // Initialize,
-    // First, read nnets from files,
-    if (nested_nnet_filename.size() > 0) {
-      for (int32 i = 0; i < nested_nnet_filename.size(); i++) {
-        Nnet nnet;
-        nnet.Read(nested_nnet_filename[i]);
-        nnet_.push_back(nnet);
-        KALDI_LOG << "Loaded nested <Nnet> from file : "
-                  << nested_nnet_filename[i];
-      }
-    }
-    // Second, initialize nnets from prototypes,
-    if (nested_nnet_proto.size() > 0) {
-      for (int32 i = 0; i < nested_nnet_proto.size(); i++) {
-        Nnet nnet;
-        nnet.Init(nested_nnet_proto[i]);
-        nnet_.push_back(nnet);
-        KALDI_LOG << "Initialized nested <Nnet> from prototype : "
-                  << nested_nnet_proto[i];
-      }
-    }
-    // Check dim-sum of nested nnets,
-    int32 nnet_input_sum = 0, nnet_output_sum = 0;
-    for (int32 i = 0; i < nnet_.size(); i++) {
-      nnet_input_sum += nnet_[i].InputDim();
-      nnet_output_sum += nnet_[i].OutputDim();
-    }
-    KALDI_ASSERT(InputDim() == nnet_input_sum);
-    KALDI_ASSERT(OutputDim() == nnet_output_sum);
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    // read
-    ExpectToken(is, binary, "<NestedNnetCount>");
-    int32 nnet_count;
-    ReadBasicType(is, binary, &nnet_count);
-    for (int32 i = 0; i < nnet_count; i++) {
-      ExpectToken(is, binary, "<NestedNnet>");
-      int32 dummy;
-      ReadBasicType(is, binary, &dummy);
-      Nnet nnet;
-      nnet.Read(is, binary);
-      nnet_.push_back(nnet);
-    }
-    ExpectToken(is, binary, "</ParallelComponent>");
-
-    // check dim-sum of nested nnets
-    int32 nnet_input_sum = 0, nnet_output_sum = 0;
-    for (int32 i = 0; i < nnet_.size(); i++) {
-      nnet_input_sum += nnet_[i].InputDim();
-      nnet_output_sum += nnet_[i].OutputDim();
-    }
-    KALDI_ASSERT(InputDim() == nnet_input_sum);
-    KALDI_ASSERT(OutputDim() == nnet_output_sum);
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    // useful dims
-    int32 nnet_count = nnet_.size();
-    //
-    WriteToken(os, binary, "<NestedNnetCount>");
-    WriteBasicType(os, binary, nnet_count);
-    if (!binary) os << "\n";
-    for (int32 i = 0; i < nnet_count; i++) {
-      WriteToken(os, binary, "<NestedNnet>");
-      WriteBasicType(os, binary, i+1);
-      if (!binary) os << "\n";
-      nnet_[i].Write(os, binary);
-    }
-    WriteToken(os, binary, "</ParallelComponent>");
-  }
-
-  int32 NumParams() const {
-    int32 ans = 0;
-    for (int32 i = 0; i < nnet_.size(); i++) {
-      ans += nnet_[i].NumParams();
-    }
-    return ans;
-  }
-
-  void GetGradient(VectorBase<BaseFloat>* gradient) const {
-    KALDI_ASSERT(gradient->Dim() == NumParams());
-    int32 offset = 0;
-    for (int32 i = 0; i < nnet_.size(); i++) {
-      int32 n_params = nnet_[i].NumParams();
-      Vector<BaseFloat> gradient_aux;  // we need 'Vector<>',
-      nnet_[i].GetGradient(&gradient_aux);  // copy gradient from Nnet,
-      gradient->Range(offset, n_params).CopyFromVec(gradient_aux);
-      offset += n_params;
-    }
-    KALDI_ASSERT(offset == NumParams());
-  }
-
-  void GetParams(VectorBase<BaseFloat>* params) const {
-    KALDI_ASSERT(params->Dim() == NumParams());
-    int32 offset = 0;
-    for (int32 i = 0; i < nnet_.size(); i++) {
-      int32 n_params = nnet_[i].NumParams();
-      Vector<BaseFloat> params_aux;  // we need 'Vector<>',
-      nnet_[i].GetParams(&params_aux);  // copy params from Nnet,
-      params->Range(offset, n_params).CopyFromVec(params_aux);
-      offset += n_params;
-    }
-    KALDI_ASSERT(offset == NumParams());
-  }
-
-  void SetParams(const VectorBase<BaseFloat>& params) {
-    KALDI_ASSERT(params.Dim() == NumParams());
-    int32 offset = 0;
-    for (int32 i = 0; i < nnet_.size(); i++) {
-      int32 n_params = nnet_[i].NumParams();
-      nnet_[i].SetParams(params.Range(offset, n_params));
-      offset += n_params;
-    }
-    KALDI_ASSERT(offset == NumParams());
-  }
-
-  std::string Info() const {
-    std::ostringstream os;
-    os << "\n";
-    for (int32 i = 0; i < nnet_.size(); i++) {
-      os << "nested_network #" << i+1 << " {\n"
-         << nnet_[i].Info()
-         << "}\n";
-    }
-    std::string s(os.str());
-    s.erase(s.end() -1);  // removing last '\n'
-    return s;
-  }
-
-  std::string InfoGradient() const {
-    std::ostringstream os;
-    os << "\n";
-    for (int32 i = 0; i < nnet_.size(); i++) {
-      os << "nested_gradient #" << i+1 << " {\n"
-         << nnet_[i].InfoGradient(false)
-         << "}\n";
-    }
-    std::string s(os.str());
-    s.erase(s.end() -1);  // removing last '\n'
-    return s;
-  }
-
-  std::string InfoPropagate() const {
-    std::ostringstream os;
-    for (int32 i = 0; i < nnet_.size(); i++) {
-      os << "nested_propagate #" << i+1 << " {\n"
-         << nnet_[i].InfoPropagate(false)
-         << "}\n";
-    }
-    return os.str();
-  }
-
-  std::string InfoBackPropagate() const {
-    std::ostringstream os;
-    for (int32 i = 0; i < nnet_.size(); i++) {
-      os << "nested_backpropagate #" << i+1 << " {\n"
-         << nnet_[i].InfoBackPropagate(false)
-         << "}\n";
-    }
-    return os.str();
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // column-offsets for data buffers 'in,out',
-    int32 input_offset = 0, output_offset = 0;
-    // loop over nnets,
-    for (int32 i = 0; i < nnet_.size(); i++) {
-      // get the data 'windows',
-      CuSubMatrix<BaseFloat> src(
-        in.ColRange(input_offset, nnet_[i].InputDim())
-      );
-      CuSubMatrix<BaseFloat> tgt(
-        out->ColRange(output_offset, nnet_[i].OutputDim())
-      );
-      // forward through auxiliary matrix, as 'Propagate' requires 'CuMatrix',
-      CuMatrix<BaseFloat> tgt_aux;
-      nnet_[i].Propagate(src, &tgt_aux);
-      tgt.CopyFromMat(tgt_aux);
-      // advance the offsets,
-      input_offset += nnet_[i].InputDim();
-      output_offset += nnet_[i].OutputDim();
-    }
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // column-offsets for data buffers 'in,out',
-    int32 input_offset = 0, output_offset = 0;
-    // loop over nnets,
-    for (int32 i = 0; i < nnet_.size(); i++) {
-      // get the data 'windows',
-      CuSubMatrix<BaseFloat> src(
-        out_diff.ColRange(output_offset, nnet_[i].OutputDim())
-      );
-      CuSubMatrix<BaseFloat> tgt(
-        in_diff->ColRange(input_offset, nnet_[i].InputDim())
-      );
-      // ::Backpropagate through auxiliary matrix (CuMatrix in the interface),
-      CuMatrix<BaseFloat> tgt_aux;
-      nnet_[i].Backpropagate(src, &tgt_aux);
-      tgt.CopyFromMat(tgt_aux);
-      // advance the offsets,
-      input_offset += nnet_[i].InputDim();
-      output_offset += nnet_[i].OutputDim();
-    }
-  }
-
-  void Update(const CuMatrixBase<BaseFloat> &input,
-              const CuMatrixBase<BaseFloat> &diff) {
-    { }  // do nothing
-  }
-
-  /**
-   * Overriding the default,
-   * which was UpdatableComponent::SetTrainOptions(...)
-   */
-  void SetTrainOptions(const NnetTrainOptions &opts) {
-    for (int32 i = 0; i < nnet_.size(); i++) {
-      nnet_[i].SetTrainOptions(opts);
-    }
-  }
-
-  /**
-   * Overriding the default,
-   * which was UpdatableComponent::SetLearnRateCoef(...)
-   */
-  void SetLearnRateCoef(BaseFloat val) {
-    // loop over nnets,
-    for (int32 i = 0; i < nnet_.size(); i++) {
-      // loop over components,
-      for (int32 j = 0; j < nnet_[i].NumComponents(); j++) {
-        if (nnet_[i].GetComponent(j).IsUpdatable()) {
-          UpdatableComponent& comp =
-            dynamic_cast<UpdatableComponent&>(nnet_[i].GetComponent(j));
-          // set the value,
-          comp.SetLearnRateCoef(val);
-        }
-      }
-    }
-  }
-
-  /**
-   * Overriding the default,
-   * which was UpdatableComponent::SetBiasLearnRateCoef(...)
-   */
-  void SetBiasLearnRateCoef(BaseFloat val) {
-    // loop over nnets,
-    for (int32 i = 0; i < nnet_.size(); i++) {
-      // loop over components,
-      for (int32 j = 0; j < nnet_[i].NumComponents(); j++) {
-        if (nnet_[i].GetComponent(j).IsUpdatable()) {
-          UpdatableComponent& comp =
-            dynamic_cast<UpdatableComponent&>(nnet_[i].GetComponent(j));
-          // set the value,
-          comp.SetBiasLearnRateCoef(val);
-        }
-      }
-    }
-  }
-
-  /**
-   * Overriding the default,
-   * which was MultistreamComponent::SetSeqLengths(...)
-   */
-  void SetSeqLengths(const std::vector<int32> &sequence_lengths) {
-    sequence_lengths_ = sequence_lengths;
-    // loop over nnets,
-    for (int32 i = 0; i < nnet_.size(); i++) {
-      nnet_[i].SetSeqLengths(sequence_lengths);
-    }
-  }
-
- private:
-  std::vector<Nnet> nnet_;
-};
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_PARALLEL_COMPONENT_H_
diff --git a/src/nnet/nnet-parametric-relu.h b/src/nnet/nnet-parametric-relu.h
deleted file mode 100644
index 0cdf3347f35..00000000000
--- a/src/nnet/nnet-parametric-relu.h
+++ /dev/null
@@ -1,213 +0,0 @@
-// nnet/nnet-parametric-relu.h
-
-// Copyright 2016 Brno University of Technology (author: Murali Karthick B)
-//           2011-2014  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_PARAMETRIC_RELU_H_
-#define KALDI_NNET_NNET_PARAMETRIC_RELU_H_
-
-#include <string>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-utils.h"
-#include "cudamatrix/cu-math.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-class ParametricRelu : public UpdatableComponent {
- public:
-  ParametricRelu(int32 dim_in, int32 dim_out):
-    UpdatableComponent(dim_in, dim_out),
-    alpha_(dim_out),
-    beta_(dim_out),
-    alpha_corr_(dim_out),
-    beta_corr_(dim_out),
-    alpha_learn_rate_coef_(0.0),
-    beta_learn_rate_coef_(0.0)
-  { }
-
-  ~ParametricRelu()
-  { }
-
-  Component* Copy() const { return new ParametricRelu(*this); }
-  ComponentType GetType() const { return kParametricRelu; }
-
-  void InitData(std::istream &is) {
-    // define options
-    BaseFloat alpha = 1.0, beta = 0.0;
-
-    // parse config
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<Alpha>") ReadBasicType(is, false, &alpha);
-      else if (token == "<Beta>") ReadBasicType(is, false, &beta);
-      else if (token == "<AlphaLearnRateCoef>") ReadBasicType(is, false, &alpha_learn_rate_coef_);
-      else if (token == "<BetaLearnRateCoef>") ReadBasicType(is, false, &beta_learn_rate_coef_);
-      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-                  << " (Alpha|Beta|AlphaLearnRateCoef|BetaLearnRateCoef)";
-    }
-
-    // Initialize trainable parameters,
-    alpha_.Set(alpha);
-    beta_.Set(beta);
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    // Read all the '<Tokens>' in arbitrary order,
-    while ('<' == Peek(is, binary)) {
-      int first_char = PeekToken(is, binary);
-      switch (first_char) {
-        case 'A': ExpectToken(is, binary, "<AlphaLearnRateCoef>");
-          ReadBasicType(is, binary, &alpha_learn_rate_coef_);
-          break;
-        case 'B': ExpectToken(is, binary, "<BetaLearnRateCoef>");
-          ReadBasicType(is, binary, &beta_learn_rate_coef_);
-          break;
-        default:
-          std::string token;
-          ReadToken(is, false, &token);
-          KALDI_ERR << "Unknown token: " << token;
-      }
-    }
-    // ParametricRelu scaling parameters
-    alpha_.Read(is, binary);
-    beta_.Read(is, binary);
-    KALDI_ASSERT(alpha_.Dim() == output_dim_);
-    KALDI_ASSERT(beta_.Dim() == output_dim_);
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    WriteToken(os, binary, "<AlphaLearnRateCoef>");
-    WriteBasicType(os, binary, alpha_learn_rate_coef_);
-    WriteToken(os, binary, "<BetaLearnRateCoef>");
-    WriteBasicType(os, binary, beta_learn_rate_coef_);
-
-    // ParametricRelu scales for each neuron,
-    if (!binary) os << "\n";
-    alpha_.Write(os, binary);
-    beta_.Write(os, binary);
-  }
-
-  int32 NumParams() const {
-    return alpha_.Dim() + beta_.Dim();
-  }
-
-  void GetGradient(VectorBase<BaseFloat>* gradient) const {
-    KALDI_ASSERT(gradient->Dim() == NumParams());
-    int32 alpha_num_elem = alpha_.Dim();
-    int32 beta_num_elem = beta_.Dim();
-    gradient->Range(0, alpha_num_elem).CopyFromVec(Vector<BaseFloat>(alpha_corr_));
-    gradient->Range(alpha_num_elem, beta_num_elem).CopyFromVec(Vector<BaseFloat>(beta_corr_));
-  }
-
-  void GetParams(VectorBase<BaseFloat>* params) const {
-    KALDI_ASSERT(params->Dim() == NumParams());
-    int32 alpha_num_elem = alpha_.Dim();
-    int32 beta_num_elem = beta_.Dim();
-    params->Range(0, alpha_num_elem).CopyFromVec(Vector<BaseFloat>(alpha_));
-    params->Range(alpha_num_elem, beta_num_elem).CopyFromVec(Vector<BaseFloat>(beta_));
-  }
-
-  void SetParams(const VectorBase<BaseFloat>& params) {
-    KALDI_ASSERT(params.Dim() == NumParams());
-    int32 alpha_num_elem = alpha_.Dim();
-    int32 beta_num_elem = beta_.Dim();
-    alpha_.CopyFromVec(params.Range(0, alpha_num_elem));
-    beta_.CopyFromVec(params.Range(alpha_num_elem, beta_num_elem));
-  }
-
-  std::string Info() const {
-    return std::string("\n  alpha") +
-      MomentStatistics(alpha_) +
-      ", alpha-lr-coef " + ToString(alpha_learn_rate_coef_) +
-      "\n  beta" + MomentStatistics(beta_) +
-      ", beta-lr-coef " + ToString(beta_learn_rate_coef_);
-  }
-  std::string InfoGradient() const {
-    return std::string("\n  alpha_grad") +
-      MomentStatistics(alpha_corr_) +
-      ", alpha-lr-coef " + ToString(alpha_learn_rate_coef_) +
-      "\n  beta_grad" + MomentStatistics(beta_corr_) +
-      ", beta-lr-coef " + ToString(beta_learn_rate_coef_);
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // out = (in < 0.0 ? aplha*in : beta*in)
-    out->ParametricRelu(in, alpha_, beta_);
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // in_diff = (in > 0 ? alpha * out_diff : beta * out_diff)
-    in_diff->DiffParametricRelu(in, out_diff, alpha_, beta_);
-  }
-
-  void Update(const CuMatrixBase<BaseFloat> &input,
-              const CuMatrixBase<BaseFloat> &diff) {
-    // we use these hyperparameters,
-    const BaseFloat alpha_lr = opts_.learn_rate * alpha_learn_rate_coef_;
-    const BaseFloat beta_lr = opts_.learn_rate * beta_learn_rate_coef_;
-    const BaseFloat mmt = opts_.momentum;
-
-    if (alpha_learn_rate_coef_ > 0.0) {
-       // get gradient,
-       alpha_aux_ = input;
-       alpha_aux_.ApplyFloor(0.0); // masking positive Relu inputs,
-       alpha_aux_.MulElements(diff);
-       alpha_corr_.AddRowSumMat(1.0, alpha_aux_, mmt);
-       // update,
-       alpha_.AddVec(-alpha_lr, alpha_corr_);
-    }
-    if (beta_learn_rate_coef_ > 0.0) {
-       // get gradient,
-       beta_aux_ = input;
-       beta_aux_.ApplyCeiling(0.0); // masking positive Relu inputs,
-       beta_aux_.MulElements(diff);
-       beta_corr_.AddRowSumMat(1.0, beta_aux_, mmt);
-       beta_.AddVec(-beta_lr, beta_corr_);
-    }
-  }
-
- private:
-  CuVector<BaseFloat> alpha_;  ///< Vector of 'alphas', one value per neuron.
-  CuVector<BaseFloat> beta_;  ///< Vector of 'betas', one value per neuron.
-
-  CuVector<BaseFloat> alpha_corr_;  ///< Vector of 'alpha' updates.
-  CuVector<BaseFloat> beta_corr_;  ///< Vector of 'beta' updates.
-
-  /// Auxiliary matrix for getting 'alpha' updates,
-  CuMatrix<BaseFloat> alpha_aux_;
-  /// Auxiliary matrix for getting 'beta' updates,
-  CuMatrix<BaseFloat> beta_aux_;
-
-  /// Controls learning rate for alpha (0.0 disables learning),
-  BaseFloat alpha_learn_rate_coef_;
-  /// Controls learning rate for beta (0.0 disables learning),
-  BaseFloat beta_learn_rate_coef_;
-};
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_PARAMETRIC_RELU_H_
diff --git a/src/nnet/nnet-pdf-prior.cc b/src/nnet/nnet-pdf-prior.cc
deleted file mode 100644
index 90ee3239a39..00000000000
--- a/src/nnet/nnet-pdf-prior.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-// nnet/nnet-pdf-prior.cc
-
-// Copyright 2013  Brno University of Technology (Author: Karel Vesely);
-//                 Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet/nnet-pdf-prior.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-PdfPrior::PdfPrior(const PdfPriorOptions &opts)
-    : prior_scale_(opts.prior_scale) {
-  if (opts.class_frame_counts == "") {
-    // class_frame_counts is empty, the PdfPrior is deactivated...
-    // (for example when 'nnet-forward' generates bottleneck features)
-    return;
-  }
-
-  KALDI_LOG << "Computing pdf-priors from : " << opts.class_frame_counts;
-
-  Vector<double> frame_counts, rel_freq, log_priors;
-  {
-    Input in;
-    in.OpenTextMode(opts.class_frame_counts);
-    frame_counts.Read(in.Stream(), false);
-    in.Close();
-  }
-
-  // get relative frequencies,
-  rel_freq = frame_counts;
-  rel_freq.Scale(1.0/frame_counts.Sum());
-
-  // get the log-prior,
-  log_priors = rel_freq;
-  log_priors.Add(1e-20);
-  log_priors.ApplyLog();
-
-  // Make the priors for classes with low counts +inf (i.e. -log(0))
-  // such that the classes have 0 likelihood (i.e. -inf log-likelihood).
-  // We use sqrt(FLT_MAX) instead of -kLogZeroFloat to prevent NANs
-  // from appearing in computation.
-  int32 num_floored = 0;
-  for (int32 i = 0; i < log_priors.Dim(); i++) {
-    if (rel_freq(i) < opts.prior_floor) {
-      log_priors(i) = sqrt(FLT_MAX);
-      num_floored++;
-    }
-  }
-  KALDI_LOG << "Floored " << num_floored << " pdf-priors "
-            << "(hard-set to " << sqrt(FLT_MAX)
-            << ", which disables DNN output when decoding)";
-
-  // sanity check,
-  KALDI_ASSERT(KALDI_ISFINITE(log_priors.Sum()));
-
-  // push to GPU,
-  log_priors_ = Vector<BaseFloat>(log_priors);
-}
-
-
-void PdfPrior::SubtractOnLogpost(CuMatrixBase<BaseFloat> *llk) {
-  if (log_priors_.Dim() == 0) {
-    KALDI_ERR << "--class-frame-counts is empty: Cannot initialize priors "
-              << "without the counts.";
-  }
-  if (log_priors_.Dim() != llk->NumCols()) {
-    KALDI_ERR << "Dimensionality mismatch,"
-              << " class_frame_counts " << log_priors_.Dim()
-              << " pdf_output_llk " << llk->NumCols();
-  }
-  llk->AddVecToRows(-prior_scale_, log_priors_);
-}
-
-}  // namespace nnet1
-}  // namespace kaldi
diff --git a/src/nnet/nnet-pdf-prior.h b/src/nnet/nnet-pdf-prior.h
deleted file mode 100644
index f02e61cc993..00000000000
--- a/src/nnet/nnet-pdf-prior.h
+++ /dev/null
@@ -1,77 +0,0 @@
-// nnet/nnet-pdf-prior.h
-
-// Copyright 2013  Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET_NNET_PDF_PRIOR_H_
-#define KALDI_NNET_NNET_PDF_PRIOR_H_
-
-#include <cfloat>
-#include <string>
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "matrix/matrix-lib.h"
-#include "cudamatrix/cu-matrix.h"
-#include "cudamatrix/cu-vector.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-struct PdfPriorOptions {
-  std::string class_frame_counts;
-  BaseFloat prior_scale;
-  BaseFloat prior_floor;
-
-  PdfPriorOptions():
-    class_frame_counts(""),
-    prior_scale(1.0),
-    prior_floor(1e-10)
-  { }
-
-  void Register(OptionsItf *opts) {
-    opts->Register("class-frame-counts", &class_frame_counts,
-                   "Vector with frame-counts of pdfs to compute log-priors."
-                   " (priors are typically subtracted from log-posteriors"
-                   " or pre-softmax activations)");
-    opts->Register("prior-scale", &prior_scale,
-                   "Scaling factor to be applied on pdf-log-priors");
-    opts->Register("prior-floor", &prior_floor,
-                   "Flooring constatnt for prior probability "
-                   "(i.e. label rel. frequency)");
-  }
-};
-
-class PdfPrior {
- public:
-  /// Initialize pdf-prior from options
-  explicit PdfPrior(const PdfPriorOptions &opts);
-
-  /// Subtract pdf priors from log-posteriors to get pseudo log-likelihoods
-  void SubtractOnLogpost(CuMatrixBase<BaseFloat> *llk);
-
- private:
-  BaseFloat prior_scale_;
-  CuVector<BaseFloat> log_priors_;
-
-  KALDI_DISALLOW_COPY_AND_ASSIGN(PdfPrior);
-};
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_PDF_PRIOR_H_
diff --git a/src/nnet/nnet-randomizer-test.cc b/src/nnet/nnet-randomizer-test.cc
deleted file mode 100644
index 1f4b2564089..00000000000
--- a/src/nnet/nnet-randomizer-test.cc
+++ /dev/null
@@ -1,240 +0,0 @@
-// nnet/nnet-randomizer-test.cc
-
-// Copyright 2013  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet/nnet-randomizer.h"
-
-#include <numeric>
-#include <vector>
-#include <algorithm>
-
-using namespace kaldi;
-using namespace kaldi::nnet1;
-
-//////////////////////////////////////////////////
-
-template<class Real>
-static void InitRand(VectorBase<Real> *v) {
-  for (MatrixIndexT i = 0;i < v->Dim();i++)
-    (*v)(i) = RandGauss();
-}
-
-template<class Real>
-static void InitRand(MatrixBase<Real> *M) {
-  do {
-    for (MatrixIndexT i = 0;i < M->NumRows();i++)
-      for (MatrixIndexT j = 0;j < M->NumCols();j++)
-        (*M)(i, j) = RandGauss();
-  } while (M->NumRows() != 0 && M->Cond() > 100);
-}
-
-
-template<class Real>
-static void AssertEqual(const VectorBase<Real> &A,
-                        const VectorBase<Real> &B,
-                        float tol = 0.001) {
-  KALDI_ASSERT(A.Dim() == B.Dim());
-  for (MatrixIndexT i = 0; i < A.Dim(); i++) {
-    KALDI_ASSERT(std::abs(A(i)-B(i)) < tol);
-  }
-}
-
-
-template<class RandomAccessIterator>
-static void AssertEqual(RandomAccessIterator begin1, RandomAccessIterator end1,
-                        RandomAccessIterator begin2, RandomAccessIterator end2) {
-  KALDI_ASSERT((end1 - begin1) == (end2 - begin2));
-  KALDI_ASSERT(end1 > begin1);
-  for ( ; begin1 < end1; ++begin1, ++begin2) {
-    KALDI_ASSERT(*begin1 == *begin2);
-  }
-}
-
-
-//////////////////////////////////////////////////
-
-void UnitTestRandomizerMask() {
-  NnetDataRandomizerOptions c;
-  RandomizerMask r;
-  r.Init(c);
-  const std::vector<int32>& m = r.Generate(5);
-  KALDI_ASSERT(m.size() == 5);
-  int32 sum_of_elems = std::accumulate(m.begin(), m.end(),0);
-  KALDI_ASSERT(sum_of_elems == 4 + 3 + 2 + 1 + 0);
-}
-
-void UnitTestMatrixRandomizer() {
-  Matrix<BaseFloat> m(1111, 10);
-  InitRand(&m);
-  CuMatrix<BaseFloat> m2(m);
-  // config
-  NnetDataRandomizerOptions c;
-  c.randomizer_size = 1000;
-  c.minibatch_size = 100;
-  // randomizer
-  MatrixRandomizer r;
-  r.Init(c);
-  r.AddData(m2);
-  KALDI_ASSERT(r.IsFull());
-  // create vector with consecutive indices
-  std::vector<int32> mask(1111);
-  for (int32 i = 0; i < 1111; i++) {
-    mask[i] = i;
-  }
-  r.Randomize(mask);  // no shuffling
-  // make sure we get same data we put to randomizer
-  int32 i=0;
-  for ( ; !r.Done(); r.Next(), i++) {
-    KALDI_LOG << i;
-    const CuMatrixBase<BaseFloat> &m3 = r.Value();
-    Matrix<BaseFloat> m4(m3.NumRows(), m3.NumCols());
-    m3.CopyToMat(&m4);
-    AssertEqual(m4, m.RowRange(i * c.minibatch_size, c.minibatch_size));
-  }
-  KALDI_ASSERT(i == 11);  // 11 minibatches
-
-  KALDI_LOG << "Filling for 2nd time";
-  // try to fill buffer one more time, and empty it
-  KALDI_ASSERT(!r.IsFull());
-  r.AddData(m2);
-  KALDI_ASSERT(r.IsFull());
-  KALDI_ASSERT(r.NumFrames() == 11 + 1111);
-  {  // check last 11 rows were copied to the front in the buffer
-    const CuMatrixBase<BaseFloat> &m3 = r.Value();
-    Matrix<BaseFloat> m4(m3.NumRows(), m3.NumCols());
-    m3.CopyToMat(&m4);
-    AssertEqual(m4.RowRange(0, 11), m.RowRange(1100, 11));
-  }
-  KALDI_ASSERT(!r.Done());
-  for ( ; !r.Done(); r.Next(), i++) {
-    KALDI_LOG << i;
-    const CuMatrixBase<BaseFloat>& m3 = r.Value();
-    static_cast<const void>(m3);  // variable no longer unused,
-  }
-  KALDI_ASSERT(i == 22);  // 22 minibatches
-}
-
-void UnitTestVectorRandomizer() {
-  Vector<BaseFloat> v(1111);
-  InitRand(&v);
-  // config
-  NnetDataRandomizerOptions c;
-  c.randomizer_size = 1000;
-  c.minibatch_size = 100;
-  // randomizer
-  VectorRandomizer r;
-  r.Init(c);
-  r.AddData(v);
-  KALDI_ASSERT(r.IsFull());
-  // create vector with consecutive indices
-  std::vector<int32> mask(1111);
-  for (int32 i = 0; i < 1111; i++) {
-    mask[i] = i;
-  }
-  r.Randomize(mask);  // no shuffling
-  // make sure we get same data we put to randomizer
-  int32 i = 0;
-  for ( ; !r.Done(); r.Next(), i++) {
-    KALDI_LOG << i;
-    const VectorBase<BaseFloat> &v2 = r.Value();
-    AssertEqual(v2, v.Range(i * c.minibatch_size, c.minibatch_size));
-  }
-  KALDI_ASSERT(i == 11);  // 11 minibatches
-
-  KALDI_LOG << "Filling for 2nd time";
-  // try to fill buffer one more time, and empty it
-  KALDI_ASSERT(!r.IsFull());
-  r.AddData(v);
-  KALDI_ASSERT(r.IsFull());
-  KALDI_ASSERT(r.NumFrames() == 11 + 1111);
-  {  // check last 11 rows were copied to the front in the buffer
-    const VectorBase<BaseFloat> &v2 = r.Value();
-    AssertEqual(v2.Range(0, 11), v.Range(1100, 11));
-  }
-  KALDI_ASSERT(!r.Done());
-  for ( ; !r.Done(); r.Next(), i++) {
-    KALDI_LOG << i;
-    const VectorBase<BaseFloat>& v2 = r.Value();
-    static_cast<const void>(v2);  // variable no longer unused,
-  }
-  KALDI_ASSERT(i == 22);  // 22 minibatches
-}
-
-void UnitTestStdVectorRandomizer() {
-  // prepare vector with some data,
-  std::vector<int32> v(1111);
-  for (int32 i = 0; i < v.size(); i++) {
-    v.at(i) = i;
-  }
-  std::random_shuffle(v.begin(), v.end());
-
-  // config
-  NnetDataRandomizerOptions c;
-  c.randomizer_size = 1000;
-  c.minibatch_size = 100;
-  // randomizer
-  Int32VectorRandomizer r;
-  r.Init(c);
-  r.AddData(v);
-  KALDI_ASSERT(r.IsFull());
-  // create vector with consecutive indices
-  std::vector<int32> mask(1111);
-  for (int32 i = 0; i < 1111; i++) {
-    mask[i]=i;
-  }
-  r.Randomize(mask);  // no shuffling
-  // make sure we get same data we put to randomizer
-  int32 i = 0;
-  for ( ; !r.Done(); r.Next(), i++) {
-    KALDI_LOG << i;
-    std::vector<int32> v2 = r.Value();
-    AssertEqual(v2.begin(),
-                v2.end(),
-                v.begin() + (i * c.minibatch_size),
-                v.begin() + ((i+1) * c.minibatch_size));
-  }
-  KALDI_ASSERT(i == 11);  // 11 minibatches
-
-  KALDI_LOG << "Filling for 2nd time";
-  // try to fill buffer one more time, and empty it
-  KALDI_ASSERT(!r.IsFull());
-  r.AddData(v);
-  KALDI_ASSERT(r.IsFull());
-  KALDI_ASSERT(r.NumFrames() == 11 + 1111);
-  {  // check last 11 rows were copied to the front in the buffer
-    std::vector<int32> v2 = r.Value();
-    AssertEqual(v2.begin(), v2.begin()+11, v.begin()+1100, v.begin()+1100+11);
-  }
-  KALDI_ASSERT(!r.Done());
-  for ( ; !r.Done(); r.Next(), i++) {
-    KALDI_LOG << i;
-    std::vector<int32> v2 = r.Value();
-  }
-  KALDI_ASSERT(i == 22);  // 22 minibatches
-}
-
-
-int main() {
-  UnitTestRandomizerMask();
-  UnitTestMatrixRandomizer();
-  UnitTestVectorRandomizer();
-  UnitTestStdVectorRandomizer();
-
-  std::cout << "Tests succeeded.\n";
-}
-
diff --git a/src/nnet/nnet-randomizer.cc b/src/nnet/nnet-randomizer.cc
deleted file mode 100644
index b15214ea477..00000000000
--- a/src/nnet/nnet-randomizer.cc
+++ /dev/null
@@ -1,234 +0,0 @@
-// nnet/nnet-randomizer.cc
-
-// Copyright 2013  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet/nnet-randomizer.h"
-
-#include <vector>
-#include <algorithm>
-#include <utility>
-
-namespace kaldi {
-namespace nnet1 {
-
-/* RandomizerMask:: */
-
-void RandomizerMask::Init(const NnetDataRandomizerOptions& conf) {
-  KALDI_LOG << "Seeding by srand with : " << conf.randomizer_seed;
-  srand(conf.randomizer_seed);
-}
-
-const std::vector<int32>& RandomizerMask::Generate(int32 mask_size) {
-  mask_.resize(mask_size);
-  for (int32 i = 0; i < mask_size; i++) mask_[i] = i;
-  // shuffle using built-in random generator:
-  std::random_shuffle(mask_.begin(), mask_.end());
-  return mask_;
-}
-
-
-/* MatrixRandomizer:: */
-
-void MatrixRandomizer::AddData(const CuMatrixBase<BaseFloat>& m) {
-  // pre-allocate before 1st use
-  if (data_.NumCols() == 0) {
-    data_.Resize(conf_.randomizer_size, m.NumCols());
-  }
-  // optionally put previous left-over to front
-  if (data_begin_ > 0) {
-    KALDI_ASSERT(data_begin_ <= data_end_);  // sanity check,
-    int32 leftover = data_end_ - data_begin_;
-    KALDI_ASSERT(leftover < data_begin_);  // no overlap,
-    if (leftover > 0) {
-      data_.RowRange(0, leftover).CopyFromMat(data_.RowRange(data_begin_, leftover));
-    }
-    data_begin_ = 0;
-    data_end_ = leftover;
-    // set zero to the rest of the buffer,
-    data_.RowRange(leftover, data_.NumRows() - leftover).SetZero();
-  }
-  // extend the buffer if necessary,
-  if (data_.NumRows() < data_end_ + m.NumRows()) {
-    // CuMatrix -> Matrix -> CuMatrix (needs less GPU memory),
-    Matrix<BaseFloat> data_aux(data_);
-    // Add extra 3% rows, so we don't reallocate soon:
-    int32 extra_rows = 0.03 * data_.NumRows();
-    data_.Resize(data_end_ + m.NumRows() + extra_rows, data_.NumCols());
-    data_.RowRange(0, data_aux.NumRows()).CopyFromMat(data_aux);
-  }
-  // copy the data
-  data_.RowRange(data_end_, m.NumRows()).CopyFromMat(m);
-  data_end_ += m.NumRows();
-}
-
-void MatrixRandomizer::Randomize(const std::vector<int32>& mask) {
-  KALDI_ASSERT(data_begin_ == 0);
-  KALDI_ASSERT(data_end_ > 0);
-  KALDI_ASSERT(data_end_ == mask.size());
-  // Copy to auxiliary buffer for unshuffled data
-  data_aux_ = data_;
-  // Put the mask to GPU
-  CuArray<int32> mask_in_gpu(mask.size());
-  mask_in_gpu.CopyFromVec(mask);
-  // Randomize the data, mask is used to index rows in source matrix:
-  // (Here the vector 'mask_in_gpu' is typically shorter than number
-  //  of rows in 'data_aux_', because the buffer 'data_aux_'
-  //  is larger than capacity 'randomizer_size'.
-  //  The extra rows in 'data_aux_' do not contain speech frames and
-  //  are not copied from 'data_aux_', the extra rows in 'data_' are
-  //  unchanged by cu::Randomize.)
-  cu::Randomize(data_aux_, mask_in_gpu, &data_);
-}
-
-void MatrixRandomizer::Next() {
-  data_begin_ += conf_.minibatch_size;
-}
-
-const CuMatrixBase<BaseFloat>& MatrixRandomizer::Value() {
-  // make sure we have data for next minibatch,
-  KALDI_ASSERT(data_end_ - data_begin_ >= conf_.minibatch_size);
-  // prepare the mini-batch buffer,
-  minibatch_.Resize(conf_.minibatch_size, data_.NumCols(), kUndefined);
-  minibatch_.CopyFromMat(data_.RowRange(data_begin_, conf_.minibatch_size));
-  return minibatch_;
-}
-
-
-/* VectorRandomizer */
-
-void VectorRandomizer::AddData(const Vector<BaseFloat>& v) {
-  // pre-allocate before 1st use
-  if (data_.Dim() == 0) {
-    data_.Resize(conf_.randomizer_size);
-  }
-  // optionally put previous left-over to front
-  if (data_begin_ > 0) {
-    KALDI_ASSERT(data_begin_ <= data_end_);  // sanity check
-    int32 leftover = data_end_ - data_begin_;
-    KALDI_ASSERT(leftover < data_begin_);  // no overlap
-    if (leftover > 0) {
-      data_.Range(0, leftover).CopyFromVec(data_.Range(data_begin_, leftover));
-    }
-    data_begin_ = 0;
-    data_end_ = leftover;
-    data_.Range(leftover, data_.Dim()-leftover).SetZero();  // zeroing the rest
-  }
-  // extend the buffer if necessary
-  if (data_.Dim() < data_end_ + v.Dim()) {
-    Vector<BaseFloat> data_aux(data_);
-    data_.Resize(data_end_ + v.Dim() + 1000);  // +1000 row surplus
-    data_.Range(0, data_aux.Dim()).CopyFromVec(data_aux);
-  }
-  // copy the data
-  data_.Range(data_end_, v.Dim()).CopyFromVec(v);
-  data_end_ += v.Dim();
-}
-
-void VectorRandomizer::Randomize(const std::vector<int32>& mask) {
-  KALDI_ASSERT(data_begin_ == 0);
-  KALDI_ASSERT(data_end_ > 0);
-  KALDI_ASSERT(data_end_ == mask.size());
-  // Use auxiliary buffer for unshuffled data
-  Vector<BaseFloat> data_aux(data_);
-  // randomize the data, mask is used to index elements in source vector
-  for (int32 i = 0; i < mask.size(); i++) {
-    data_(i) = data_aux(mask.at(i));
-  }
-}
-
-void VectorRandomizer::Next() {
-  data_begin_ += conf_.minibatch_size;
-}
-
-const Vector<BaseFloat>& VectorRandomizer::Value() {
-  // make sure we have data for next minibatch,
-  KALDI_ASSERT(data_end_ - data_begin_ >= conf_.minibatch_size);
-  // prepare the mini-batch buffer,
-  minibatch_.Resize(conf_.minibatch_size, kUndefined);
-  minibatch_.CopyFromVec(data_.Range(data_begin_, conf_.minibatch_size));
-  return minibatch_;
-}
-
-
-/* StdVectorRandomizer */
-
-template<typename T>
-void StdVectorRandomizer<T>::AddData(const std::vector<T>& v) {
-  // pre-allocate before 1st use
-  if (data_.size() == 0) {
-    data_.resize(conf_.randomizer_size);
-  }
-  // optionally put previous left-over to front
-  if (data_begin_ > 0) {
-    KALDI_ASSERT(data_begin_ <= data_end_);  // sanity check
-    int32 leftover = data_end_ - data_begin_;
-    KALDI_ASSERT(leftover < data_begin_);  // no overlap
-    if (leftover > 0) {
-      typename std::vector<T>::iterator leftover_begin = data_.begin() + data_begin_;
-      std::copy(leftover_begin, leftover_begin + leftover, data_.begin());
-    }
-    data_begin_ = 0;
-    data_end_ = leftover;
-  }
-  // extend the buffer if necessary
-  if (data_.size() < data_end_ + v.size()) {
-    data_.resize(data_end_ + v.size() + 1000);  // +1000 row surplus
-  }
-  // copy the data
-  std::copy(v.begin(), v.end(), data_.begin()+data_end_);
-  data_end_ += v.size();
-}
-
-template<typename T>
-void StdVectorRandomizer<T>::Randomize(const std::vector<int32>& mask) {
-  KALDI_ASSERT(data_begin_ == 0);
-  KALDI_ASSERT(data_end_ > 0);
-  KALDI_ASSERT(data_end_ == mask.size());
-  // Use auxiliary buffer for unshuffled data
-  std::vector<T> data_aux(data_);
-  // randomize the data, mask is used to index elements in source vector
-  for (int32 i = 0; i < mask.size(); i++) {
-    data_.at(i) = data_aux.at(mask.at(i));
-  }
-}
-
-template<typename T>
-void StdVectorRandomizer<T>::Next() {
-  data_begin_ += conf_.minibatch_size;
-}
-
-template<typename T>
-const std::vector<T>& StdVectorRandomizer<T>::Value() {
-  // make sure we have enough data for minibatch,
-  KALDI_ASSERT(data_end_ - data_begin_ >= conf_.minibatch_size);
-  // prepare the mini-batch buffer,
-  minibatch_.resize(conf_.minibatch_size);
-  typename std::vector<T>::iterator first = data_.begin() + data_begin_;
-  typename std::vector<T>::iterator last  = first + conf_.minibatch_size;
-  std::copy(first, last, minibatch_.begin());
-  return minibatch_;
-}
-
-// Instantiate template StdVectorRandomizer with types we expect to operate on,
-// - Int32VectorRandomizer:
-template class StdVectorRandomizer<int32>;
-// - PosteriorRandomizer:
-template class StdVectorRandomizer<std::vector<std::pair<int32, BaseFloat> > >;
-
-}  // namespace nnet1
-}  // namespace kaldi
diff --git a/src/nnet/nnet-randomizer.h b/src/nnet/nnet-randomizer.h
deleted file mode 100644
index 71da6950599..00000000000
--- a/src/nnet/nnet-randomizer.h
+++ /dev/null
@@ -1,274 +0,0 @@
-// nnet/nnet-randomizer.h
-
-// Copyright 2013  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_RANDOMIZER_H_
-#define KALDI_NNET_NNET_RANDOMIZER_H_
-
-#include <utility>
-#include <vector>
-
-#include "base/kaldi-math.h"
-#include "itf/options-itf.h"
-#include "cudamatrix/cu-matrix.h"
-#include "cudamatrix/cu-math.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-/**
- * Configuration variables that affect how frame-level shuffling is done.
- */
-struct NnetDataRandomizerOptions {
-  int32 randomizer_size;  ///< Maximum number of samples we have in memory,
-  int32 randomizer_seed;
-  int32 minibatch_size;
-
-  NnetDataRandomizerOptions():
-    randomizer_size(32768),
-    randomizer_seed(777),
-    minibatch_size(256)
-  { }
-
-  void Register(OptionsItf *opts) {
-    opts->Register("randomizer-size", &randomizer_size,
-       "Capacity of randomizer, length of concatenated utterances which, "
-       "are used for frame-level shuffling (in frames, affects memory "
-       "consumption, max 8000000).");
-    opts->Register("randomizer-seed", &randomizer_seed,
-       "Seed value for srand, sets fixed order of frame-level shuffling");
-    opts->Register("minibatch-size", &minibatch_size, "Size of a minibatch.");
-  }
-};
-
-
-/**
- * Generates randomly ordered vector of indices,
- */
-class RandomizerMask {
- public:
-  RandomizerMask()
-  { }
-
-  explicit RandomizerMask(const NnetDataRandomizerOptions &conf) {
-    Init(conf);
-  }
-
-  /// Init, call srand,
-  void Init(const NnetDataRandomizerOptions& conf);
-
-  /// Generate randomly ordered vector of integers 0..[mask_size -1],
-  const std::vector<int32>& Generate(int32 mask_size);
-
- private:
-  std::vector<int32> mask_;
-};
-
-
-/**
- * Shuffles rows of a matrix according to the indices in the mask,
- */
-class MatrixRandomizer {
- public:
-  MatrixRandomizer():
-    data_begin_(0),
-    data_end_(0)
-  { }
-
-  explicit MatrixRandomizer(const NnetDataRandomizerOptions &conf):
-    data_begin_(0),
-    data_end_(0)
-  {
-    Init(conf);
-  }
-
-  /// Set the randomizer parameters (size)
-  void Init(const NnetDataRandomizerOptions& conf) {
-    conf_ = conf;
-  }
-
-  /// Add data to randomization buffer
-  void AddData(const CuMatrixBase<BaseFloat>& m);
-
-  /// Returns true, when capacity is full
-  bool IsFull() {
-    return ((data_begin_ == 0) && (data_end_ > conf_.randomizer_size ));
-  }
-
-  /// Number of frames stored inside the Randomizer
-  int32 NumFrames() {
-    return data_end_;
-  }
-
-  /// Randomize matrix row-order using mask
-  void Randomize(const std::vector<int32>& mask);
-
-  /// Returns true, if no more data for another mini-batch (after current one)
-  bool Done() {
-    return (data_end_ - data_begin_ < conf_.minibatch_size);
-  }
-
-  /// Sets cursor to next mini-batch
-  void Next();
-
-  /// Returns matrix-window with next mini-batch
-  const CuMatrixBase<BaseFloat>& Value();
-
- private:
-  CuMatrix<BaseFloat> data_;  // can be larger than 'randomizer_size'
-  CuMatrix<BaseFloat> data_aux_;  // auxiliary buffer for shuffling
-  CuMatrix<BaseFloat> minibatch_;  // buffer for mini-batch
-
-  /// A cursor, pointing to the 'row' where the next mini-batch begins,
-  int32 data_begin_;
-  /// A cursor, pointing to the 'row' after the end of data,
-  int32 data_end_;
-
-  NnetDataRandomizerOptions conf_;
-};
-
-
-/// Randomizes elements of a vector according to a mask
-class VectorRandomizer {
- public:
-  VectorRandomizer():
-    data_begin_(0),
-    data_end_(0)
-  { }
-
-  explicit VectorRandomizer(const NnetDataRandomizerOptions &conf):
-    data_begin_(0),
-    data_end_(0)
-  {
-    Init(conf);
-  }
-
-  /// Set the randomizer parameters (size)
-  void Init(const NnetDataRandomizerOptions& conf) {
-    conf_ = conf;
-  }
-
-  /// Add data to randomization buffer
-  void AddData(const Vector<BaseFloat>& v);
-
-  /// Returns true, when capacity is full
-  bool IsFull() {
-    return ((data_begin_ == 0) && (data_end_ > conf_.randomizer_size ));
-  }
-
-  /// Number of frames stored inside the Randomizer
-  int32 NumFrames() {
-    return data_end_;
-  }
-
-  /// Randomize matrix row-order using mask
-  void Randomize(const std::vector<int32>& mask);
-
-  /// Returns true, if no more data for another mini-batch (after current one)
-  bool Done() {
-    return (data_end_ - data_begin_ < conf_.minibatch_size);
-  }
-
-  /// Sets cursor to next mini-batch
-  void Next();
-
-  /// Returns matrix-window with next mini-batch
-  const Vector<BaseFloat>& Value();
-
- private:
-  Vector<BaseFloat> data_;  // can be larger than 'randomizer_size'
-  Vector<BaseFloat> minibatch_;  // buffer for mini-batch
-
-  /// A cursor, pointing to the 'row' where the next mini-batch begins,
-  int32 data_begin_;
-  /// A cursor, pointing to the 'row' after the end of data,
-  int32 data_end_;
-
-  NnetDataRandomizerOptions conf_;
-};
-
-
-/// Randomizes elements of a vector according to a mask
-template<typename T>
-class StdVectorRandomizer {
- public:
-  StdVectorRandomizer():
-    data_begin_(0),
-    data_end_(0)
-  { }
-
-  explicit StdVectorRandomizer(const NnetDataRandomizerOptions &conf):
-    data_begin_(0),
-    data_end_(0)
-  {
-    Init(conf);
-  }
-
-  /// Set the randomizer parameters (size)
-  void Init(const NnetDataRandomizerOptions& conf) {
-    conf_ = conf;
-  }
-
-  /// Add data to randomization buffer
-  void AddData(const std::vector<T>& v);
-
-  /// Returns true, when capacity is full
-  bool IsFull() {
-    return ((data_begin_ == 0) && (data_end_ > conf_.randomizer_size ));
-  }
-
-  /// Number of frames stored inside the Randomizer
-  int32 NumFrames() {
-    return data_end_;
-  }
-
-  /// Randomize matrix row-order using mask
-  void Randomize(const std::vector<int32>& mask);
-
-  /// Returns true, if no more data for another mini-batch (after current one)
-  bool Done() {
-    return (data_end_ - data_begin_ < conf_.minibatch_size);
-  }
-
-  /// Sets cursor to next mini-batch
-  void Next();
-
-  /// Returns matrix-window with next mini-batch
-  const std::vector<T>& Value();
-
- private:
-  std::vector<T> data_;  // can be larger than 'randomizer_size'
-  std::vector<T> minibatch_;  // buffer for mini-batch
-
-  /// A cursor, pointing to the 'row' where the next mini-batch begins,
-  int32 data_begin_;
-  /// A cursor, pointing to the 'row' after the end of data,
-  int32 data_end_;
-
-  NnetDataRandomizerOptions conf_;
-};
-
-typedef StdVectorRandomizer<int32> Int32VectorRandomizer;
-typedef StdVectorRandomizer<std::vector<std::pair<int32, BaseFloat> > > PosteriorRandomizer;
-
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_RANDOMIZER_H_
diff --git a/src/nnet/nnet-rbm.h b/src/nnet/nnet-rbm.h
deleted file mode 100644
index 4b5f4c1e24a..00000000000
--- a/src/nnet/nnet-rbm.h
+++ /dev/null
@@ -1,433 +0,0 @@
-// nnet/nnet-rbm.h
-
-// Copyright 2012-2013  Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_RBM_H_
-#define KALDI_NNET_NNET_RBM_H_
-
-#include <string>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-nnet.h"
-#include "nnet/nnet-utils.h"
-#include "nnet/nnet-various.h"
-#include "cudamatrix/cu-math.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-class RbmBase : public Component {
- public:
-  typedef enum {
-    Bernoulli,
-    Gaussian
-  } RbmNodeType;
-
-  RbmBase(int32 dim_in, int32 dim_out):
-    Component(dim_in, dim_out)
-  { }
-
-  // Inherited from Component::
-  // void Propagate(...)
-  // virtual void PropagateFnc(...) = 0
-
-  virtual void Reconstruct(
-    const CuMatrixBase<BaseFloat> &hid_state,
-    CuMatrix<BaseFloat> *vis_probs
-  ) = 0;
-  virtual void RbmUpdate(
-    const CuMatrixBase<BaseFloat> &pos_vis,
-    const CuMatrixBase<BaseFloat> &pos_hid,
-    const CuMatrixBase<BaseFloat> &neg_vis,
-    const CuMatrixBase<BaseFloat> &neg_hid
-  ) = 0;
-
-  virtual RbmNodeType VisType() const = 0;
-  virtual RbmNodeType HidType() const = 0;
-
-  virtual void WriteAsNnet(std::ostream& os, bool binary) const = 0;
-
-  /// Set training hyper-parameters to the network and its UpdatableComponent(s)
-  void SetRbmTrainOptions(const RbmTrainOptions& opts) {
-    rbm_opts_ = opts;
-  }
-  /// Get training hyper-parameters from the network
-  const RbmTrainOptions& GetRbmTrainOptions() const {
-    return rbm_opts_;
-  }
-
- protected:
-  RbmTrainOptions rbm_opts_;
-
- private:
-  //// Make inherited methods inaccessible,
-  //   as for RBMs we use Reconstruct(.)
-  void Backpropagate(const CuMatrixBase<BaseFloat> &in,
-                     const CuMatrixBase<BaseFloat> &out,
-                     const CuMatrixBase<BaseFloat> &out_diff,
-                     CuMatrix<BaseFloat> *in_diff)
-  { }
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff)
-  { }
-  ////
-};
-
-
-
-class Rbm : public RbmBase {
- public:
-  Rbm(int32 dim_in, int32 dim_out):
-    RbmBase(dim_in, dim_out)
-  { }
-
-  ~Rbm()
-  { }
-
-  Component* Copy() const {
-    return new Rbm(*this);
-  }
-
-  ComponentType GetType() const {
-    return kRbm;
-  }
-
-  void InitData(std::istream &is) {
-    // define options,
-    std::string vis_type;
-    std::string hid_type;
-    float vis_bias_mean = 0.0, vis_bias_range = 0.0,
-          hid_bias_mean = 0.0, hid_bias_range = 0.0,
-          param_stddev = 0.1;
-    std::string vis_bias_cmvn_file;  // initialize biases to logit(p_active)
-    // parse config,
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<VisibleType>") ReadToken(is, false, &vis_type);
-      else if (token == "<HiddenType>") ReadToken(is, false, &hid_type);
-      else if (token == "<VisibleBiasMean>") ReadBasicType(is, false, &vis_bias_mean);
-      else if (token == "<VisibleBiasRange>") ReadBasicType(is, false, &vis_bias_range);
-      else if (token == "<HiddenBiasMean>") ReadBasicType(is, false, &hid_bias_mean);
-      else if (token == "<HiddenBiasRange>") ReadBasicType(is, false, &hid_bias_range);
-      else if (token == "<ParamStddev>") ReadBasicType(is, false, &param_stddev);
-      else if (token == "<VisibleBiasCmvnFilename>") ReadToken(is, false, &vis_bias_cmvn_file);
-      else KALDI_ERR << "Unknown token " << token << " Typo in config?";
-    }
-
-    // Translate the 'node' types,
-    if (vis_type == "bern" || vis_type == "Bernoulli") vis_type_ = RbmBase::Bernoulli;
-    else if (vis_type == "gauss" || vis_type == "Gaussian") vis_type_ = RbmBase::Gaussian;
-    else KALDI_ERR << "Wrong <VisibleType>" << vis_type;
-    //
-    if (hid_type == "bern" || hid_type == "Bernoulli") hid_type_ = RbmBase::Bernoulli;
-    else if (hid_type == "gauss" || hid_type == "Gaussian") hid_type_ = RbmBase::Gaussian;
-    else KALDI_ERR << "Wrong <HiddenType>" << hid_type;
-
-    //
-    // Initialize trainable parameters,
-    //
-    // visible-hidden connections,
-    vis_hid_.Resize(OutputDim(), InputDim());
-    RandGauss(0.0, param_stddev, &vis_hid_);
-    // hidden-bias,
-    hid_bias_.Resize(OutputDim());
-    RandUniform(hid_bias_mean, hid_bias_range, &hid_bias_);
-    // visible-bias,
-    if (vis_bias_cmvn_file == "") {
-      vis_bias_.Resize(InputDim());
-      RandUniform(vis_bias_mean, vis_bias_range, &vis_bias_);
-    } else {
-      KALDI_LOG << "Initializing from <VisibleBiasCmvnFilename> "
-                << vis_bias_cmvn_file;
-      // Reading Nnet with 'global-cmvn' components,
-      Nnet cmvn;
-      cmvn.Read(vis_bias_cmvn_file);
-      KALDI_ASSERT(InputDim() == cmvn.InputDim());
-      // The parameters from <AddShift> correspond to 'negative' mean values,
-      Vector<BaseFloat> p(cmvn.InputDim());
-      dynamic_cast<AddShift&>(cmvn.GetComponent(0)).GetParams(&p);
-      p.Scale(-1.0);  // 'un-do' negation of mean values,
-      p.ApplyFloor(0.0001);
-      p.ApplyCeiling(0.9999);
-      // Getting the logit,
-      Vector<BaseFloat> logit_p(p.Dim());
-      for (int32 d = 0; d < p.Dim(); d++) {
-        logit_p(d) = Log(p(d)) - Log(1.0 - p(d));
-      }
-      vis_bias_ = logit_p;
-      KALDI_ASSERT(vis_bias_.Dim() == InputDim());
-    }
-  }
-
-
-  void ReadData(std::istream &is, bool binary) {
-    std::string vis_node_type, hid_node_type;
-    ReadToken(is, binary, &vis_node_type);
-    ReadToken(is, binary, &hid_node_type);
-
-    if (vis_node_type == "bern") {
-      vis_type_ = RbmBase::Bernoulli;
-    } else if (vis_node_type == "gauss") {
-      vis_type_ = RbmBase::Gaussian;
-    }
-    if (hid_node_type == "bern") {
-      hid_type_ = RbmBase::Bernoulli;
-    } else if (hid_node_type == "gauss") {
-      hid_type_ = RbmBase::Gaussian;
-    }
-
-    vis_hid_.Read(is, binary);
-    vis_bias_.Read(is, binary);
-    hid_bias_.Read(is, binary);
-
-    KALDI_ASSERT(vis_hid_.NumRows() == output_dim_);
-    KALDI_ASSERT(vis_hid_.NumCols() == input_dim_);
-    KALDI_ASSERT(vis_bias_.Dim() == input_dim_);
-    KALDI_ASSERT(hid_bias_.Dim() == output_dim_);
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    switch (vis_type_) {
-      case Bernoulli : WriteToken(os,binary, "bern"); break;
-      case Gaussian  : WriteToken(os,binary, "gauss"); break;
-      default : KALDI_ERR << "Unknown type " << vis_type_;
-    }
-    switch (hid_type_) {
-      case Bernoulli : WriteToken(os,binary, "bern"); break;
-      case Gaussian  : WriteToken(os,binary, "gauss"); break;
-      default : KALDI_ERR << "Unknown type " << hid_type_;
-    }
-    vis_hid_.Write(os, binary);
-    vis_bias_.Write(os, binary);
-    hid_bias_.Write(os, binary);
-  }
-
-
-  // Component API
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // pre-fill with bias
-    out->AddVecToRows(1.0, hid_bias_, 0.0);
-    // multiply by weights^t
-    out->AddMatMat(1.0, in, kNoTrans, vis_hid_, kTrans, 1.0);
-    // optionally apply sigmoid
-    if (hid_type_ == RbmBase::Bernoulli) {
-      out->Sigmoid(*out);
-    }
-  }
-
-  // RBM training API
-  void Reconstruct(const CuMatrixBase<BaseFloat> &hid_state,
-                   CuMatrix<BaseFloat> *vis_probs) {
-    // check the dim
-    if (output_dim_ != hid_state.NumCols()) {
-      KALDI_ERR << "Nonmatching dims, component:" << output_dim_
-                << " data:" << hid_state.NumCols();
-    }
-    // optionally allocate buffer
-    if (input_dim_ != vis_probs->NumCols() ||
-        hid_state.NumRows() != vis_probs->NumRows()) {
-      vis_probs->Resize(hid_state.NumRows(), input_dim_);
-    }
-
-    // pre-fill with bias
-    vis_probs->AddVecToRows(1.0, vis_bias_, 0.0);
-    // multiply by weights
-    vis_probs->AddMatMat(1.0, hid_state, kNoTrans, vis_hid_, kNoTrans, 1.0);
-    // optionally apply sigmoid
-    if (vis_type_ == RbmBase::Bernoulli) {
-      vis_probs->Sigmoid(*vis_probs);
-    }
-  }
-
-  void RbmUpdate(const CuMatrixBase<BaseFloat> &pos_vis,
-                 const CuMatrixBase<BaseFloat> &pos_hid,
-                 const CuMatrixBase<BaseFloat> &neg_vis,
-                 const CuMatrixBase<BaseFloat> &neg_hid) {
-    // dims
-    KALDI_ASSERT(pos_vis.NumRows() == pos_hid.NumRows() &&
-           pos_vis.NumRows() == neg_vis.NumRows() &&
-           pos_vis.NumRows() == neg_hid.NumRows() &&
-           pos_vis.NumCols() == neg_vis.NumCols() &&
-           pos_hid.NumCols() == neg_hid.NumCols() &&
-           pos_vis.NumCols() == input_dim_ &&
-           pos_hid.NumCols() == output_dim_);
-
-    // lazy initialization of buffers
-    if ( vis_hid_corr_.NumRows() != vis_hid_.NumRows() ||
-         vis_hid_corr_.NumCols() != vis_hid_.NumCols() ||
-         vis_bias_corr_.Dim()    != vis_bias_.Dim()    ||
-         hid_bias_corr_.Dim()    != hid_bias_.Dim()     ) {
-      vis_hid_corr_.Resize(vis_hid_.NumRows(), vis_hid_.NumCols(), kSetZero);
-      vis_bias_corr_.Resize(vis_bias_.Dim(), kSetZero);
-      hid_bias_corr_.Resize(hid_bias_.Dim(), kSetZero);
-    }
-
-    // ANTI-WEIGHT-EXPLOSION PROTECTION (Gaussian-Bernoulli RBM)
-    //
-    // in the following section we detect that the weights in
-    // Gaussian-Bernoulli RBM are almost exploding. The weight
-    // explosion is caused by large variance of the reconstructed data,
-    // which causes a feed-back loop that keeps increasing the weights.
-    //
-    // To avoid explosion, the standard-deviation of the visible-data
-    // and reconstructed-data should be about the same.
-    // The model is particularly sensitive at the very
-    // beginning of the CD-1 training.
-    //
-    // We compute the standard deviations on
-    // * 'A' : input mini-batch
-    // * 'B' : reconstruction.
-    // When 'B > 2*A', we stabilize the training in this way:
-    // 1. we scale down the weights and biases by 'A/B',
-    // 2. we shrink learning rate by 0.9x,
-    // 3. we reset the momentum buffer,
-    //
-    // A warning message is put to the log. In later stage
-    // the learning-rate returns back to its original value.
-    //
-    // To avoid the issue, we make sure that the weight-matrix
-    // is sensibly initialized.
-    //
-    if (vis_type_ == RbmBase::Gaussian) {
-      // check the data have no nan/inf:
-      CheckNanInf(pos_vis, "pos_vis");
-      CheckNanInf(pos_hid, "pos_hid");
-      CheckNanInf(neg_vis, "neg_vis");
-      CheckNanInf(neg_hid, "pos_hid");
-
-      // get standard deviations of pos_vis and neg_vis:
-      BaseFloat pos_vis_std = ComputeStdDev(pos_vis);
-      BaseFloat neg_vis_std = ComputeStdDev(neg_vis);
-
-      // monitor the standard deviation mismatch : data vs. reconstruction
-      if (pos_vis_std * 2 < neg_vis_std) {
-        // 1) scale-down the weights and biases
-        BaseFloat scale = pos_vis_std / neg_vis_std;
-        vis_hid_.Scale(scale);
-        vis_bias_.Scale(scale);
-        hid_bias_.Scale(scale);
-        // 2) reduce the learning rate
-        rbm_opts_.learn_rate *= 0.9;
-        // 3) reset the momentum buffers
-        vis_hid_corr_.SetZero();
-        vis_bias_corr_.SetZero();
-        hid_bias_corr_.SetZero();
-
-        KALDI_WARN << "Mismatch between pos_vis and neg_vis variances, "
-                   << "danger of weight explosion."
-                   << " a) Reducing weights with scale " << scale
-                   << " b) Lowering learning rate to " << rbm_opts_.learn_rate
-                   << " [pos_vis_std:" << pos_vis_std
-                   << ",neg_vis_std:" << neg_vis_std << "]";
-        return; /* i.e. don't update now, the update would be too BIG */
-      }
-    }
-    //
-    // End of weight-explosion check
-
-
-    //  We use these training hyper-parameters
-    //
-    const BaseFloat lr = rbm_opts_.learn_rate;
-    const BaseFloat mmt = rbm_opts_.momentum;
-    const BaseFloat l2 = rbm_opts_.l2_penalty;
-
-    //  UPDATE vishid matrix
-    //
-    //  vishidinc = momentum*vishidinc + ...
-    //              epsilonw*( (posprods-negprods)/numcases - weightcost*vishid)
-    //
-    //  vishidinc[t] = -(epsilonw/numcases)*negprods + momentum*vishidinc[t-1]
-    //                 +(epsilonw/numcases)*posprods
-    //                 -(epsilonw*weightcost)*vishid[t-1]
-    //
-    BaseFloat N = static_cast<BaseFloat>(pos_vis.NumRows());
-    vis_hid_corr_.AddMatMat(-lr/N, neg_hid, kTrans, neg_vis, kNoTrans, mmt);
-    vis_hid_corr_.AddMatMat(+lr/N, pos_hid, kTrans, pos_vis, kNoTrans, 1.0);
-    vis_hid_corr_.AddMat(-lr*l2, vis_hid_);
-    vis_hid_.AddMat(1.0, vis_hid_corr_);
-
-    //  UPDATE visbias vector
-    //
-    //  visbiasinc = momentum*visbiasinc +
-    //               (epsilonvb/numcases)*(posvisact-negvisact);
-    //
-    vis_bias_corr_.AddRowSumMat(-lr/N, neg_vis, mmt);
-    vis_bias_corr_.AddRowSumMat(+lr/N, pos_vis, 1.0);
-    vis_bias_.AddVec(1.0, vis_bias_corr_, 1.0);
-
-    //  UPDATE hidbias vector
-    //
-    // hidbiasinc = momentum*hidbiasinc +
-    //              (epsilonhb/numcases)*(poshidact-neghidact);
-    //
-    hid_bias_corr_.AddRowSumMat(-lr/N, neg_hid, mmt);
-    hid_bias_corr_.AddRowSumMat(+lr/N, pos_hid, 1.0);
-    hid_bias_.AddVec(1.0, hid_bias_corr_, 1.0);
-  }
-
-  RbmNodeType VisType() const {
-    return vis_type_;
-  }
-
-  RbmNodeType HidType() const {
-    return hid_type_;
-  }
-
-  void WriteAsNnet(std::ostream& os, bool binary) const {
-    // header,
-    WriteToken(os, binary, Component::TypeToMarker(Component::kAffineTransform));
-    WriteBasicType(os, binary, OutputDim());
-    WriteBasicType(os, binary, InputDim());
-    if (!binary) os << "\n";
-    // data,
-    vis_hid_.Write(os, binary);
-    hid_bias_.Write(os, binary);
-    // sigmoid activation,
-    if (HidType() == Bernoulli) {
-      WriteToken(os, binary, Component::TypeToMarker(Component::kSigmoid));
-      WriteBasicType(os, binary, OutputDim());
-      WriteBasicType(os, binary, OutputDim());
-    }
-    if (!binary) os << "\n";
-  }
-
- protected:
-  CuMatrix<BaseFloat> vis_hid_;        ///< Matrix with neuron weights
-  CuVector<BaseFloat> vis_bias_;       ///< Vector with biases
-  CuVector<BaseFloat> hid_bias_;       ///< Vector with biases
-
-  CuMatrix<BaseFloat> vis_hid_corr_;   ///< Matrix for linearity updates
-  CuVector<BaseFloat> vis_bias_corr_;  ///< Vector for bias updates
-  CuVector<BaseFloat> hid_bias_corr_;  ///< Vector for bias updates
-
-  RbmNodeType vis_type_;
-  RbmNodeType hid_type_;
-};
-
-
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_RBM_H_
diff --git a/src/nnet/nnet-recurrent.h b/src/nnet/nnet-recurrent.h
deleted file mode 100644
index ef251f70f10..00000000000
--- a/src/nnet/nnet-recurrent.h
+++ /dev/null
@@ -1,346 +0,0 @@
-// nnet/nnet-lstm-projected-streams.h
-
-// Copyright 2016  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-
-#ifndef KALDI_NNET_NNET_RECURRENT_STREAMS_H_
-#define KALDI_NNET_NNET_RECURRENT_STREAMS_H_
-
-#include <string>
-#include <vector>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-utils.h"
-#include "cudamatrix/cu-math.h"
-
-
-namespace kaldi {
-namespace nnet1 {
-
-
-/**
- * Component with recurrent connections, 'tanh' non-linearity.
- * No internal state preserved, starting each sequence from zero vector.
- *
- * Can be used in 'per-sentence' training and multi-stream training.
- */
-class RecurrentComponent : public MultistreamComponent {
- public:
-  RecurrentComponent(int32 input_dim, int32 output_dim):
-    MultistreamComponent(input_dim, output_dim)
-  { }
-
-  ~RecurrentComponent()
-  { }
-
-  Component* Copy() const { return new RecurrentComponent(*this); }
-  ComponentType GetType() const { return kRecurrentComponent; }
-
-  void InitData(std::istream &is) {
-    // define options,
-    float param_scale = 0.02;
-    // parse the line from prototype,
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<GradClip>") ReadBasicType(is, false, &grad_clip_);
-      else if (token == "<DiffClip>") ReadBasicType(is, false, &diff_clip_);
-      else if (token == "<LearnRateCoef>") ReadBasicType(is, false, &learn_rate_coef_);
-      else if (token == "<BiasLearnRateCoef>") ReadBasicType(is, false, &bias_learn_rate_coef_);
-      else if (token == "<ParamScale>") ReadBasicType(is, false, &param_scale);
-      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-                     << " (GradClip|DiffClip|LearnRateCoef|BiasLearnRateCoef|ParamScale)";
-    }
-
-    // init the weights and biases (from uniform dist.),
-    w_forward_.Resize(output_dim_, input_dim_);
-    w_recurrent_.Resize(output_dim_, output_dim_);
-    bias_.Resize(output_dim_);
-
-    RandUniform(0.0, 2.0 * param_scale, &w_forward_);
-    RandUniform(0.0, 2.0 * param_scale, &w_recurrent_);
-    RandUniform(0.0, 2.0 * param_scale, &bias_);
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    // Read all the '<Tokens>' in arbitrary order,
-    while ('<' == Peek(is, binary)) {
-      std::string token;
-      int first_char = PeekToken(is, binary);
-      switch (first_char) {
-        case 'G': ExpectToken(is, binary, "<GradClip>");
-          ReadBasicType(is, binary, &grad_clip_);
-          break;
-        case 'D': ExpectToken(is, binary, "<DiffClip>");
-          ReadBasicType(is, binary, &diff_clip_);
-          break;
-        case 'L': ExpectToken(is, binary, "<LearnRateCoef>");
-          ReadBasicType(is, binary, &learn_rate_coef_);
-          break;
-        case 'B': ExpectToken(is, binary, "<BiasLearnRateCoef>");
-          ReadBasicType(is, binary, &bias_learn_rate_coef_);
-          break;
-        default: ReadToken(is, false, &token);
-          KALDI_ERR << "Unknown token: " << token;
-      }
-    }
-
-    // Read the data (data follow the tokens),
-    w_forward_.Read(is, binary);
-    w_recurrent_.Read(is, binary);
-    bias_.Read(is, binary);
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    WriteToken(os, binary, "<GradClip>");
-    WriteBasicType(os, binary, grad_clip_);
-    WriteToken(os, binary, "<DiffClip>");
-    WriteBasicType(os, binary, diff_clip_);
-
-    WriteToken(os, binary, "<LearnRateCoef>");
-    WriteBasicType(os, binary, learn_rate_coef_);
-    WriteToken(os, binary, "<BiasLearnRateCoef>");
-    WriteBasicType(os, binary, bias_learn_rate_coef_);
-
-    if (!binary) os << "\n";
-    w_forward_.Write(os, binary);
-    w_recurrent_.Write(os, binary);
-    bias_.Write(os, binary);
-  }
-
-  int32 NumParams() const {
-    return w_forward_.NumRows() * w_forward_.NumCols() +
-      w_recurrent_.NumRows() * w_recurrent_.NumCols() +
-      bias_.Dim();
-  }
-
-  void GetGradient(VectorBase<BaseFloat>* gradient) const {
-    KALDI_ASSERT(gradient->Dim() == NumParams());
-    int32 offset, len;
-
-    offset = 0;    len = w_forward_corr_.NumRows() * w_forward_corr_.NumCols();
-    gradient->Range(offset, len).CopyRowsFromMat(w_forward_corr_);
-
-    offset += len; len = w_recurrent_corr_.NumRows() * w_recurrent_corr_.NumCols();
-    gradient->Range(offset, len).CopyRowsFromMat(w_recurrent_corr_);
-
-    offset += len; len = bias_corr_.Dim();
-    gradient->Range(offset, len).CopyFromVec(bias_corr_);
-
-    offset += len;
-    KALDI_ASSERT(offset == NumParams());
-  }
-
-  void GetParams(VectorBase<BaseFloat>* params) const {
-    KALDI_ASSERT(params->Dim() == NumParams());
-    int32 offset, len;
-
-    offset = 0;    len = w_forward_.NumRows() * w_forward_.NumCols();
-    params->Range(offset, len).CopyRowsFromMat(w_forward_);
-
-    offset += len; len = w_recurrent_.NumRows() * w_recurrent_.NumCols();
-    params->Range(offset, len).CopyRowsFromMat(w_recurrent_);
-
-    offset += len; len = bias_.Dim();
-    params->Range(offset, len).CopyFromVec(bias_);
-
-    offset += len;
-    KALDI_ASSERT(offset == NumParams());
-  }
-
-  void SetParams(const VectorBase<BaseFloat>& params) {
-    KALDI_ASSERT(params.Dim() == NumParams());
-    int32 offset, len;
-
-    offset = 0;    len = w_forward_.NumRows() * w_forward_.NumCols();
-    w_forward_.CopyRowsFromVec(params.Range(offset, len));
-
-    offset += len; len = w_recurrent_.NumRows() * w_recurrent_.NumCols();
-    w_recurrent_.CopyRowsFromVec(params.Range(offset, len));
-
-    offset += len; len = bias_.Dim();
-    bias_.CopyFromVec(params.Range(offset, len));
-
-    offset += len;
-    KALDI_ASSERT(offset == NumParams());
-  }
-
-  std::string Info() const {
-    return std::string("  ") +
-      "\n  w_forward_  "   + MomentStatistics(w_forward_) +
-      "\n  w_recurrent_  " + MomentStatistics(w_recurrent_) +
-      "\n  bias_  "        + MomentStatistics(bias_);
-  }
-
-  std::string InfoGradient() const {
-    return std::string("") +
-      "( learn_rate_coef " + ToString(learn_rate_coef_) +
-      ", bias_learn_rate_coef " + ToString(bias_learn_rate_coef_) +
-      ", grad-clip " + ToString(grad_clip_) +
-      ", diff-clip " + ToString(diff_clip_) + " )" +
-      "\n  Gradients:" +
-      "\n  w_forward_corr_  "   + MomentStatistics(w_forward_corr_) +
-      "\n  w_recurrent_corr_  "   + MomentStatistics(w_recurrent_corr_) +
-      "\n  bias_corr_  "     + MomentStatistics(bias_corr_) +
-      "\n  Forward-pass:" +
-      "\n  out_  " + MomentStatistics(out_) +
-      "\n  Backward-pass:" +
-      "\n  out_diff_bptt_  " + MomentStatistics(out_diff_bptt_);
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-
-
-    KALDI_ASSERT(in.NumRows() % NumStreams() == 0);
-    int32 T = in.NumRows() / NumStreams();
-    int32 S = NumStreams();
-
-    // Precopy bias,
-    out->AddVecToRows(1.0, bias_, 0.0);
-    // Apply 'forward' connections,
-    out->AddMatMat(1.0, in, kNoTrans, w_forward_, kTrans, 1.0);
-
-    // First line of 'out' w/o recurrent signal, apply 'tanh' directly,
-    out->RowRange(0, S).Tanh(out->RowRange(0, S));
-
-    // Apply 'recurrent' connections,
-    for (int32 t = 1; t < T; t++) {
-      out->RowRange(t*S, S).AddMatMat(1.0, out->RowRange((t-1)*S, S), kNoTrans, w_recurrent_, kTrans, 1.0);
-      out->RowRange(t*S, S).Tanh(out->RowRange(t*S, S));
-      // Zero output for padded frames,
-      if (sequence_lengths_.size() == S) {
-        for (int32 s = 0; s < S; s++) {
-          if (t >= sequence_lengths_[s]) {
-            out->Row(t*S + s).SetZero();
-          }
-        }
-      }
-      //
-    }
-
-    out_ = (*out);  // We'll need a copy for updating the recurrent weights!
-
-    // We are DONE ;)
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-
-    int32 T = in.NumRows() / NumStreams();
-    int32 S = NumStreams();
-
-    // Apply BPTT on 'out_diff',
-    out_diff_bptt_ = out_diff;
-    for (int32 t = T-1; t >= 1; t--) {
-      // buffers,
-      CuSubMatrix<BaseFloat> d_t = out_diff_bptt_.RowRange(t*S, S);
-      CuSubMatrix<BaseFloat> d_t1 = out_diff_bptt_.RowRange((t-1)*S, S);
-      const CuSubMatrix<BaseFloat> y_t = out.RowRange(t*S, S);
-
-      // BPTT,
-      d_t.DiffTanh(y_t, d_t);
-      d_t1.AddMatMat(1.0, d_t, kNoTrans, w_recurrent_, kNoTrans, 1.0);
-
-      // clipping,
-      if (diff_clip_ > 0.0) {
-        d_t1.ApplyFloor(-diff_clip_);
-        d_t1.ApplyCeiling(diff_clip_);
-      }
-
-      // Zero diff for padded frames,
-      if (sequence_lengths_.size() == S) {
-        for (int32 s = 0; s < S; s++) {
-          if (t >= sequence_lengths_[s]) {
-            out_diff_bptt_.Row(t*S + s).SetZero();
-          }
-        }
-      }
-    }
-
-    // Apply 'DiffTanh' on first block,
-    CuSubMatrix<BaseFloat> d_t = out_diff_bptt_.RowRange(0, S);
-    const CuSubMatrix<BaseFloat> y_t = out.RowRange(0, S);
-    d_t.DiffTanh(y_t, d_t);
-
-    // Transform diffs to 'in_diff',
-    in_diff->AddMatMat(1.0, out_diff_bptt_, kNoTrans, w_forward_, kNoTrans, 0.0);
-
-    // We are DONE ;)
-  }
-
-  void Update(const CuMatrixBase<BaseFloat> &input,
-              const CuMatrixBase<BaseFloat> &diff) {
-    int32 T = input.NumRows() / NumStreams();
-    int32 S = NumStreams();
-
-    // getting the learning rate,
-    const BaseFloat lr  = opts_.learn_rate;
-    const BaseFloat mmt = opts_.momentum;
-
-    if (bias_corr_.Dim() != OutputDim()) {
-      w_forward_corr_.Resize(w_forward_.NumRows(), w_forward_.NumCols(), kSetZero);
-      w_recurrent_corr_.Resize(w_recurrent_.NumRows(), w_recurrent_.NumCols(), kSetZero);
-      bias_corr_.Resize(OutputDim(), kSetZero);
-    }
-
-    // getting the gradients,
-    w_forward_corr_.AddMatMat(1.0, out_diff_bptt_, kTrans, input, kNoTrans, mmt);
-
-
-    w_recurrent_corr_.AddMatMat(1.0, out_diff_bptt_.RowRange(S, (T-1)*S), kTrans,
-                                               out_.RowRange(0, (T-1)*S), kNoTrans, mmt);
-
-    bias_corr_.AddRowSumMat(1.0, out_diff_bptt_, mmt);
-
-    // updating,
-    w_forward_.AddMat(-lr * learn_rate_coef_, w_forward_corr_);
-    w_recurrent_.AddMat(-lr * learn_rate_coef_, w_recurrent_corr_);
-    bias_.AddVec(-lr * bias_learn_rate_coef_, bias_corr_);
-  }
-
- private:
-
-  BaseFloat grad_clip_;  ///< Clipping of the update,
-  BaseFloat diff_clip_;  ///< Clipping in the BPTT loop,
-
-  // trainable parameters,
-  CuMatrix<BaseFloat> w_forward_;
-  CuMatrix<BaseFloat> w_recurrent_;
-  CuVector<BaseFloat> bias_;
-
-  // udpate buffers,
-  CuMatrix<BaseFloat> w_forward_corr_;
-  CuMatrix<BaseFloat> w_recurrent_corr_;
-  CuVector<BaseFloat> bias_corr_;
-
-  // forward propagation buffer,
-  CuMatrix<BaseFloat> out_;
-
-  // back-propagate buffer,
-  CuMatrix<BaseFloat> out_diff_bptt_;
-
-};  // class RecurrentComponent
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_RECURRENT_STREAMS_H_
diff --git a/src/nnet/nnet-sentence-averaging-component.h b/src/nnet/nnet-sentence-averaging-component.h
deleted file mode 100644
index 129b54890a7..00000000000
--- a/src/nnet/nnet-sentence-averaging-component.h
+++ /dev/null
@@ -1,314 +0,0 @@
-// nnet/nnet-sentence-averaging-component.h
-
-// Copyright 2013-2016  Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_SENTENCE_AVERAGING_COMPONENT_H_
-#define KALDI_NNET_NNET_SENTENCE_AVERAGING_COMPONENT_H_
-
-#include <string>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-utils.h"
-#include "cudamatrix/cu-math.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-
-/**
- * SimpleSentenceAveragingComponent does not have nested network,
- * it is intended to be used inside of a <ParallelComponent>.
- * For training use 'nnet-train-perutt'.
- *
- * The sentence-averaging typically leads to small gradients, so we boost it 100x
- * by default (boost = multiply, it's equivalent to applying learning-rate factor).
- */
-class SimpleSentenceAveragingComponent : public Component {
- public:
-  SimpleSentenceAveragingComponent(int32 dim_in, int32 dim_out):
-    Component(dim_in, dim_out),
-    gradient_boost_(100.0),
-    shrinkage_(0.0),
-    only_summing_(false)
-  { }
-
-  ~SimpleSentenceAveragingComponent()
-  { }
-
-  Component* Copy() const {
-    return new SimpleSentenceAveragingComponent(*this);
-  }
-
-  ComponentType GetType() const {
-    return kSimpleSentenceAveragingComponent;
-  }
-
-  void InitData(std::istream &is) {
-    // parse config
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      if (token == "<GradientBoost>") ReadBasicType(is, false, &gradient_boost_);
-      else if (token == "<Shrinkage>") ReadBasicType(is, false, &shrinkage_);
-      else if (token == "<OnlySumming>") ReadBasicType(is, false, &only_summing_);
-      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-                     << " (GradientBoost|Shrinkage|OnlySumming)";
-    }
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    bool end_loop = false;
-    while (!end_loop && '<' == Peek(is, binary)) {
-      int first_char = PeekToken(is, binary);
-      switch (first_char) {
-        case 'G': ExpectToken(is, binary, "<GradientBoost>");
-          ReadBasicType(is, binary, &gradient_boost_);
-          break;
-        case 'S': ExpectToken(is, binary, "<Shrinkage>");
-          ReadBasicType(is, binary, &shrinkage_);
-          break;
-        case 'O': ExpectToken(is, binary, "<OnlySumming>");
-          // compatibility trick,
-          // in some models 'only_summing_' was float '0.0',
-          // from now 'only_summing_' is 'bool':
-          try {
-            ReadBasicType(is, binary, &only_summing_);
-          } catch(const std::exception &e) {
-            KALDI_WARN << "ERROR was handled by exception!";
-            BaseFloat dummy_float;
-            ReadBasicType(is, binary, &dummy_float);
-          }
-          break;
-        case '!':
-          ExpectToken(is, binary, "<!EndOfComponent>");
-        default:
-          end_loop = true;
-      }
-    }
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    WriteToken(os, binary, "<GradientBoost>");
-    WriteBasicType(os, binary, gradient_boost_);
-    WriteToken(os, binary, "<Shrinkage>");
-    WriteBasicType(os, binary, shrinkage_);
-    WriteToken(os, binary, "<OnlySumming>");
-    WriteBasicType(os, binary, only_summing_);
-  }
-
-  std::string Info() const {
-    return std::string("\n  gradient-boost ") + ToString(gradient_boost_) +
-      ", shrinkage: " + ToString(shrinkage_) +
-      ", only summing: " + ToString(only_summing_);
-  }
-  std::string InfoGradient() const {
-    return Info();
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // get the average row-vector,
-    average_row_.Resize(InputDim());
-    if (only_summing_) {
-      average_row_.AddRowSumMat(1.0, in, 0.0);
-    } else {
-      average_row_.AddRowSumMat(1.0/(in.NumRows()+shrinkage_), in, 0.0);
-    }
-    // copy it on the output,
-    out->AddVecToRows(1.0, average_row_, 0.0);
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // When averaging, a single frame from input influenced all the frames
-    // on the output. So the derivative w.r.t. single input frame is a sum
-    // of the output derivatives, scaled by the averaging constant 1/K.
-    //
-    // In the same time all the input frames of the average influenced
-    // all the output frames. So the loss derivarive is same for all
-    // the input frames coming to the averaging.
-    //
-    // getting the average output diff,
-    average_diff_.Resize(OutputDim());
-    if (only_summing_) {
-      average_diff_.AddRowSumMat(1.0, out_diff, 0.0);
-    } else {
-      average_diff_.AddRowSumMat(1.0/(out_diff.NumRows()+shrinkage_), out_diff, 0.0);
-    }
-    // copy the derivative into the input diff, (applying gradient-boost!!)
-    in_diff->AddVecToRows(gradient_boost_, average_diff_, 0.0);
-  }
-
- private:
-  /// Auxiliary buffer for forward propagation (for average vector),
-  CuVector<BaseFloat> average_row_;
-
-  /// Auxiliary buffer for back-propagation (for average vector),
-  CuVector<BaseFloat> average_diff_;
-
-  /// Scalar applied on gradient in backpropagation,
-  BaseFloat gradient_boost_;
-
-  /// Number of 'imaginary' zero-vectors in the average
-  /// (shrinks the average vector for short sentences),
-  BaseFloat shrinkage_;
-
-  /// Removes normalization term from arithmetic mean (when true).
-  bool only_summing_;
-};
-
-
-/** Deprecated!!!, keeping it as Katka Zmolikova used it in JSALT 2015 */
-class SentenceAveragingComponent : public UpdatableComponent {
- public:
-  SentenceAveragingComponent(int32 dim_in, int32 dim_out):
-    UpdatableComponent(dim_in, dim_out), learn_rate_factor_(100.0)
-  { }
-  ~SentenceAveragingComponent()
-  { }
-
-  Component* Copy() const { return new SentenceAveragingComponent(*this); }
-  ComponentType GetType() const { return kSentenceAveragingComponent; }
-
-  void InitData(std::istream &is) {
-    // define options
-    std::string nested_nnet_filename;
-    std::string nested_nnet_proto;
-    // parse config
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<NestedNnetFilename>") ReadToken(is, false, &nested_nnet_filename);
-      else if (token == "<NestedNnetProto>") ReadToken(is, false, &nested_nnet_proto);
-      else if (token == "<LearnRateFactor>") ReadBasicType(is, false, &learn_rate_factor_);
-      else KALDI_ERR << "Unknown token " << token << " Typo in config?";
-    }
-    // initialize (read already prepared nnet from file)
-    KALDI_ASSERT((nested_nnet_proto != "") ^ (nested_nnet_filename != ""));  // xor,
-    if (nested_nnet_filename != "") nnet_.Read(nested_nnet_filename);
-    if (nested_nnet_proto != "") nnet_.Init(nested_nnet_proto);
-    // check dims of nested nnet
-    KALDI_ASSERT(InputDim() == nnet_.InputDim());
-    KALDI_ASSERT(OutputDim() == nnet_.OutputDim() + InputDim());
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    nnet_.Read(is, binary);
-    KALDI_ASSERT(nnet_.InputDim() == InputDim());
-    KALDI_ASSERT(nnet_.OutputDim() + InputDim() == OutputDim());
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    nnet_.Write(os, binary);
-  }
-
-  int32 NumParams() const { return nnet_.NumParams(); }
-
-  void GetGradient(VectorBase<BaseFloat>* gradient) const {
-    KALDI_ERR << "Unimplemented!";
-  }
-
-  void GetParams(VectorBase<BaseFloat>* params) const {
-    KALDI_ASSERT(params->Dim() == NumParams());
-    Vector<BaseFloat> params_aux;
-    nnet_.GetParams(&params_aux);
-    params->CopyFromVec(params_aux);
-  }
-
-  void SetParams(const VectorBase<BaseFloat>& params) {
-    KALDI_ERR << "Unimplemented!";
-  }
-
-  std::string Info() const {
-    return std::string("nested_network {\n") + nnet_.Info() + "}\n";
-  }
-
-  std::string InfoGradient() const {
-    return std::string("nested_gradient {\n") + nnet_.InfoGradient() + "}\n";
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // Get NN output
-    CuMatrix<BaseFloat> out_nnet;
-    nnet_.Propagate(in, &out_nnet);
-    // Get the average row (averaging over the time axis):
-    // averaging corresponds to extraction of a 'constant vector'
-    // code for single sentence,
-    int32 num_inputs = in.NumCols(),
-      nnet_outputs = nnet_.OutputDim(),
-      num_frames = out_nnet.NumRows();
-
-    CuVector<BaseFloat> average_row(nnet_outputs);
-    average_row.AddRowSumMat(1.0/num_frames, out_nnet, 0.0);
-    // Forwarding sentence codes along with input features
-    out->ColRange(0, nnet_outputs).AddVecToRows(1.0, average_row, 0.0);
-    out->ColRange(nnet_outputs, num_inputs).CopyFromMat(in);
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    if (in_diff == NULL) return;
-    int32 num_inputs = in.NumCols(),
-      nnet_outputs = nnet_.OutputDim();
-    in_diff->CopyFromMat(out_diff.ColRange(nnet_outputs, num_inputs));
-  }
-
-  void Update(const CuMatrixBase<BaseFloat> &input,
-              const CuMatrixBase<BaseFloat> &diff) {
-    // get useful dims,
-    int32 nnet_outputs = nnet_.OutputDim(),
-      num_frames = diff.NumRows();
-    // Passing the derivative into the nested network. The loss derivative is averaged:
-    // single frame from nested network influenced all the frames in the main network,
-    // so to get the derivative w.r.t. single frame from nested network we sum derivatives
-    // of all frames from main network (and scale by 1/Nframes constant).
-    //
-    // In fact all the frames from nested network influenced all the input frames to main nnet,
-    // so the loss derivarive w.r.t. nested network output is same for all frames in sentence.
-    CuVector<BaseFloat> average_diff(nnet_outputs);
-    average_diff.AddRowSumMat(1.0 / num_frames, diff.ColRange(0, nnet_outputs), 0.0);
-    CuMatrix<BaseFloat> nnet_out_diff(num_frames, nnet_outputs);
-    nnet_out_diff.AddVecToRows(1.0, average_diff, 0.0);
-    //
-    nnet_.Backpropagate(nnet_out_diff, NULL);
-  }
-
-  void SetTrainOptions(const NnetTrainOptions &opts) {
-    UpdatableComponent::SetTrainOptions(opts_);
-    // Pass the train options to the nnet
-    NnetTrainOptions o(opts);
-    o.learn_rate *= learn_rate_factor_;
-    nnet_.SetTrainOptions(opts_);
-  }
-
- private:
-  Nnet nnet_;
-  float learn_rate_factor_;
-};
-/* Deprecated */
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_SENTENCE_AVERAGING_COMPONENT_H_
diff --git a/src/nnet/nnet-trnopts.h b/src/nnet/nnet-trnopts.h
deleted file mode 100644
index 0a064e17fd4..00000000000
--- a/src/nnet/nnet-trnopts.h
+++ /dev/null
@@ -1,118 +0,0 @@
-// nnet/nnet-trnopts.h
-
-// Copyright 2013  Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET_NNET_TRNOPTS_H_
-#define KALDI_NNET_NNET_TRNOPTS_H_
-
-#include "base/kaldi-common.h"
-#include "itf/options-itf.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-
-struct NnetTrainOptions {
-  // option declaration
-  BaseFloat learn_rate;
-  BaseFloat momentum;
-  BaseFloat l2_penalty;
-  BaseFloat l1_penalty;
-
-  // default values
-  NnetTrainOptions():
-    learn_rate(0.008),
-    momentum(0.0),
-    l2_penalty(0.0),
-    l1_penalty(0.0)
-  { }
-
-  // register options
-  void Register(OptionsItf *opts) {
-    opts->Register("learn-rate", &learn_rate, "Learning rate");
-    opts->Register("momentum", &momentum, "Momentum");
-    opts->Register("l2-penalty", &l2_penalty, "L2 penalty (weight decay)");
-    opts->Register("l1-penalty", &l1_penalty, "L1 penalty (promote sparsity)");
-  }
-
-  // print for debug purposes
-  friend std::ostream& operator<<(std::ostream& os, const NnetTrainOptions& opts) {
-    os << "NnetTrainOptions : "
-       << "learn_rate" << opts.learn_rate << ", "
-       << "momentum" << opts.momentum << ", "
-       << "l2_penalty" << opts.l2_penalty << ", "
-       << "l1_penalty" << opts.l1_penalty;
-    return os;
-  }
-};
-
-
-struct RbmTrainOptions {
-  // option declaration
-  BaseFloat learn_rate;
-  BaseFloat momentum;
-  BaseFloat momentum_max;
-  int32 momentum_steps;
-  int32 momentum_step_period;
-  BaseFloat l2_penalty;
-
-  // default values
-  RbmTrainOptions():
-    learn_rate(0.4),
-    momentum(0.5),
-    momentum_max(0.9),
-    momentum_steps(40),
-    momentum_step_period(500000),
-    // 500000 * 40 = 55h of linear increase of momentum
-    l2_penalty(0.0002)
-  { }
-
-  // register options
-  void Register(OptionsItf *opts) {
-    opts->Register("learn-rate", &learn_rate, "Learning rate");
-
-    opts->Register("momentum", &momentum,
-                   "Initial momentum for linear scheduling");
-    opts->Register("momentum-max", &momentum_max,
-                   "Final momentum for linear scheduling");
-    opts->Register("momentum-steps", &momentum_steps,
-                   "Number of steps of linear momentum scheduling");
-    opts->Register("momentum-step-period", &momentum_step_period,
-                   "Number of datapoints per single momentum increase step");
-
-    opts->Register("l2-penalty", &l2_penalty,
-                   "L2 penalty (weight decay, increases mixing-rate)");
-  }
-
-  // print for debug purposes
-  friend std::ostream& operator<<(std::ostream& os, const RbmTrainOptions& opts) {
-    os << "RbmTrainOptions : "
-       << "learn_rate" << opts.learn_rate << ", "
-       << "momentum" << opts.momentum << ", "
-       << "momentum_max" << opts.momentum_max << ", "
-       << "momentum_steps" << opts.momentum_steps << ", "
-       << "momentum_step_period" << opts.momentum_step_period << ", "
-       << "l2_penalty" << opts.l2_penalty;
-    return os;
-  }
-};  // struct RbmTrainOptions
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_TRNOPTS_H_
diff --git a/src/nnet/nnet-utils.h b/src/nnet/nnet-utils.h
deleted file mode 100644
index 8b1afbbed3b..00000000000
--- a/src/nnet/nnet-utils.h
+++ /dev/null
@@ -1,317 +0,0 @@
-// nnet/nnet-utils.h
-
-// Copyright 2015  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_UTILS_H_
-#define KALDI_NNET_NNET_UTILS_H_
-
-#include <string>
-#include <vector>
-#include <iterator>
-#include <algorithm>
-
-#include "base/kaldi-common.h"
-#include "cudamatrix/cu-matrix.h"
-#include "cudamatrix/cu-array.h"
-#include "hmm/posterior.h"
-#include "hmm/transition-model.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-
-/**
- * Define stream insertion opeartor for 'std::vector', useful for log-prints,
- */
-template <typename T>
-std::ostream& operator<<(std::ostream& os, const std::vector<T>& v) {
-  std::copy(v.begin(), v.end(), std::ostream_iterator<T>(os, " "));
-  return os;
-}
-
-/**
- * Convert basic type to a string (please don't overuse),
- */
-template <typename T>
-std::string ToString(const T& t) {
-  std::ostringstream os;
-  os << t;
-  return os.str();
-}
-
-/**
- * Get a string with statistics of the data in a vector,
- * so we can print them easily.
- */
-template <typename Real>
-std::string MomentStatistics(const VectorBase<Real> &vec) {
-  // we use an auxiliary vector for the higher order powers
-  Vector<Real> vec_aux(vec);
-  Vector<Real> vec_no_mean(vec);  // vec with mean subtracted
-  // mean
-  Real mean = vec.Sum() / vec.Dim();
-  // variance
-  vec_aux.Add(-mean);
-  vec_no_mean = vec_aux;
-  vec_aux.MulElements(vec_no_mean);  // (vec-mean)^2
-  Real variance = vec_aux.Sum() / vec.Dim();
-  // skewness
-  // - negative : left tail is longer,
-  // - positive : right tail is longer,
-  // - zero : symmetric
-  vec_aux.MulElements(vec_no_mean);  // (vec-mean)^3
-  Real skewness = vec_aux.Sum() / pow(variance, 3.0/2.0) / vec.Dim();
-  // kurtosis (peakedness)
-  // - makes sense for symmetric distributions (skewness is zero)
-  // - positive : 'sharper peak' than Normal distribution
-  // - negative : 'heavier tails' than Normal distribution
-  // - zero : same peakedness as the Normal distribution
-  vec_aux.MulElements(vec_no_mean);  // (vec-mean)^4
-  Real kurtosis = vec_aux.Sum() / (variance * variance) / vec.Dim() - 3.0;
-  // send the statistics to stream,
-  std::ostringstream ostr;
-  ostr << " ( min " << vec.Min() << ", max " << vec.Max()
-       << ", mean " << mean
-       << ", stddev " << sqrt(variance)
-       << ", skewness " << skewness
-       << ", kurtosis " << kurtosis
-       << " ) ";
-  return ostr.str();
-}
-
-/**
- * Overload MomentStatistics to MatrixBase<Real>
- */
-template <typename Real>
-std::string MomentStatistics(const MatrixBase<Real> &mat) {
-  Vector<Real> vec(mat.NumRows()*mat.NumCols());
-  vec.CopyRowsFromMat(mat);
-  return MomentStatistics(vec);
-}
-
-/**
- * Overload MomentStatistics to CuVectorBase<Real>
- */
-template <typename Real>
-std::string MomentStatistics(const CuVectorBase<Real> &vec) {
-  Vector<Real> vec_host(vec.Dim());
-  vec.CopyToVec(&vec_host);
-  return MomentStatistics(vec_host);
-}
-
-/**
- * Overload MomentStatistics to CuMatrix<Real>
- */
-template <typename Real>
-std::string MomentStatistics(const CuMatrixBase<Real> &mat) {
-  Matrix<Real> mat_host(mat.NumRows(), mat.NumCols());
-  mat.CopyToMat(&mat_host);
-  return MomentStatistics(mat_host);
-}
-
-/**
- * Check that matrix contains no nan or inf
- */
-template <typename Real>
-void CheckNanInf(const CuMatrixBase<Real> &mat, const char *msg = "") {
-  Real sum = mat.Sum();
-  if (KALDI_ISINF(sum)) { KALDI_ERR << "'inf' in " << msg; }
-  if (KALDI_ISNAN(sum)) { KALDI_ERR << "'nan' in " << msg; }
-}
-
-/**
- * Get the standard deviation of values in the matrix
- */
-template <typename Real>
-Real ComputeStdDev(const CuMatrixBase<Real> &mat) {
-  int32 N = mat.NumRows() * mat.NumCols();
-  Real mean = mat.Sum() / N;
-  CuMatrix<Real> pow_2(mat);
-  pow_2.MulElements(mat);
-  Real var = pow_2.Sum() / N - mean * mean;
-  if (var < 0.0) {
-    KALDI_WARN << "Forcing the variance to be non-negative! " << var << "->0.0";
-    var = 0.0;
-  }
-  return sqrt(var);
-}
-
-
-/**
- * Fill CuMatrix with random numbers (Gaussian distribution):
- * mu = the mean value,
- * sigma = standard deviation,
- *
- * Using the CPU random generator.
- */
-template <typename Real>
-void RandGauss(BaseFloat mu, BaseFloat sigma, CuMatrixBase<Real>* mat,
-               struct RandomState* state = NULL) {
-  // fill temporary matrix with 'Normal' samples,
-  Matrix<Real> m(mat->NumRows(), mat->NumCols(), kUndefined);
-  for (int32 r = 0; r < m.NumRows(); r++) {
-    for (int32 c = 0; c < m.NumCols(); c++) {
-      m(r, c) = RandGauss(state);
-    }
-  }
-  // re-shape the distrbution,
-  m.Scale(sigma);
-  m.Add(mu);
-  // export,
-  mat->CopyFromMat(m);
-}
-
-/**
- * Fill CuMatrix with random numbers (Uniform distribution):
- * mu = the mean value,
- * range = the 'width' of the uniform PDF (spanning mu-range/2 .. mu+range/2)
- *
- * Using the CPU random generator.
- */
-template <typename Real>
-void RandUniform(BaseFloat mu, BaseFloat range, CuMatrixBase<Real>* mat,
-                 struct RandomState* state = NULL) {
-  // fill temporary matrix with '0..1' samples,
-  Matrix<Real> m(mat->NumRows(), mat->NumCols(), kUndefined);
-  for (int32 r = 0; r < m.NumRows(); r++) {
-    for (int32 c = 0; c < m.NumCols(); c++) {
-      m(r, c) = Rand(state) / static_cast<Real>(RAND_MAX);
-    }
-  }
-  // re-shape the distrbution,
-  m.Scale(range);  // 0..range,
-  m.Add(mu - (range / 2.0));  // mu-range/2 .. mu+range/2,
-  // export,
-  mat->CopyFromMat(m);
-}
-
-/**
- * Fill CuVector with random numbers (Uniform distribution):
- * mu = the mean value,
- * range = the 'width' of the uniform PDF (spanning mu-range/2 .. mu+range/2)
- *
- * Using the CPU random generator.
- */
-template <typename Real>
-void RandUniform(BaseFloat mu, BaseFloat range, CuVectorBase<Real>* vec,
-                 struct RandomState* state = NULL) {
-  // fill temporary vector with '0..1' samples,
-  Vector<Real> v(vec->Dim(), kUndefined);
-  for (int32 i = 0; i < v.Dim(); i++) {
-    v(i) = Rand(state) / static_cast<Real>(RAND_MAX);
-  }
-  // re-shape the distrbution,
-  v.Scale(range);  // 0..range,
-  v.Add(mu - (range / 2.0));  // mu-range/2 .. mu+range/2,
-  // export,
-  vec->CopyFromVec(v);
-}
-
-
-/**
- * Build 'integer vector' out of vector of 'matlab-like' representation:
- * 'b, b:e, b:s:e'
- *
- * b,e,s are integers, where:
- * b = beginning
- * e = end
- * s = step
- *
- * The sequence includes 'end', 1:3 => [ 1 2 3 ].
- * The 'step' has to be positive.
- */
-inline void BuildIntegerVector(const std::vector<std::vector<int32> >& in,
-                               std::vector<int32>* out) {
-  // start with empty vector,
-  out->clear();
-  // loop over records,
-  for (int32 i = 0; i < in.size(); i++) {
-    // process i'th record,
-    int32 beg = 0, end = 0, step = 1;
-    switch (in[i].size()) {
-      case 1:
-        beg  = in[i][0];
-        end  = in[i][0];
-        step = 1;
-        break;
-      case 2:
-        beg  = in[i][0];
-        end  = in[i][1];
-        step = 1;
-        break;
-      case 3:
-        beg  = in[i][0];
-        end  = in[i][2];
-        step = in[i][1];
-        break;
-      default:
-        KALDI_ERR << "Something is wrong! (should be 1-3) : "
-                  << in[i].size();
-    }
-    // check the inputs,
-    KALDI_ASSERT(beg <= end);
-    KALDI_ASSERT(step > 0);  // positive,
-    // append values to vector,
-    for (int32 j = beg; j <= end; j += step) {
-      out->push_back(j);
-    }
-  }
-}
-
-/**
- * Wrapper with 'CuArray<int32>' output.
- */
-inline void BuildIntegerVector(const std::vector<std::vector<int32> >& in,
-                               CuArray<int32>* out) {
-  std::vector<int32> v;
-  BuildIntegerVector(in, &v);
-  (*out) = v;
-}
-
-
-/**
- * Wrapper of PosteriorToMatrix with CuMatrix argument.
- */
-template <typename Real>
-void PosteriorToMatrix(const Posterior &post,
-                       const int32 post_dim, CuMatrix<Real> *mat) {
-  Matrix<Real> m;
-  PosteriorToMatrix(post, post_dim, &m);
-  (*mat) = m;
-}
-
-
-/**
- * Wrapper of PosteriorToMatrixMapped with CuMatrix argument.
- */
-template <typename Real>
-void PosteriorToPdfMatrix(const Posterior &post,
-                          const TransitionModel &model,
-                          CuMatrix<Real> *mat) {
-  Matrix<BaseFloat> m;
-  PosteriorToPdfMatrix(post, model, &m);
-  // Copy to output GPU matrix,
-  (*mat) = m;
-}
-
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_UTILS_H_
diff --git a/src/nnet/nnet-various.h b/src/nnet/nnet-various.h
deleted file mode 100644
index eeef9bc25bf..00000000000
--- a/src/nnet/nnet-various.h
+++ /dev/null
@@ -1,518 +0,0 @@
-// nnet/nnet-various.h
-
-// Copyright 2012-2016  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_NNET_NNET_VARIOUS_H_
-#define KALDI_NNET_NNET_VARIOUS_H_
-
-#include <string>
-#include <vector>
-#include <algorithm>
-#include <sstream>
-
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-utils.h"
-#include "cudamatrix/cu-math.h"
-#include "util/text-utils.h"
-
-namespace kaldi {
-namespace nnet1 {
-
-/**
- * Splices the time context of the input features
- * in N, out k*N, FrameOffset o_1,o_2,...,o_k
- * FrameOffset example 11frames: -5 -4 -3 -2 -1 0 1 2 3 4 5
- */
-class Splice: public Component {
- public:
-  Splice(int32 dim_in, int32 dim_out):
-    Component(dim_in, dim_out)
-  { }
-
-  ~Splice()
-  { }
-
-  Component* Copy() const { return new Splice(*this); }
-  ComponentType GetType() const { return kSplice; }
-
-  void InitData(std::istream &is) {
-    // define options,
-    std::vector<std::vector<int32> > build_vector;
-    // parse config,
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<ReadVector>") {
-        frame_offsets_.Read(is, false);
-      } else if (token == "<BuildVector>") {
-        // Parse the list of 'matlab-like' indices:
-        // <BuildVector> 1:1:1000 1 2 3 1:10 </BuildVector>
-        while (is >> std::ws, !is.eof()) {
-          std::string colon_sep_list_or_end;
-          ReadToken(is, false, &colon_sep_list_or_end);
-          if (colon_sep_list_or_end == "</BuildVector>") break;
-          std::vector<int32> v;
-          SplitStringToIntegers(colon_sep_list_or_end, ":", false, &v);
-          build_vector.push_back(v);
-        }
-      } else {
-        KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-                  << " (ReadVector|BuildVector)";
-      }
-    }
-
-    if (build_vector.size() > 0) {
-      // build the vector, using <BuildVector> ... </BuildVector> inputs,
-      BuildIntegerVector(build_vector, &frame_offsets_);
-    }
-
-    // check dim
-    KALDI_ASSERT(frame_offsets_.Dim()*InputDim() == OutputDim());
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    frame_offsets_.Read(is, binary);
-    KALDI_ASSERT(frame_offsets_.Dim() * InputDim() == OutputDim());
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    frame_offsets_.Write(os, binary);
-  }
-
-  std::string Info() const {
-    std::ostringstream ostr;
-    ostr << "\n  frame_offsets " << frame_offsets_;
-    std::string str = ostr.str();
-    str.erase(str.end()-1);
-    return str;
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    cu::Splice(in, frame_offsets_, out);
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // WARNING!!! WARNING!!! WARNING!!!
-    // THIS BACKPROPAGATION CAN BE USED ONLY WITH 'PER-UTTERANCE' TRAINING!
-    // IN MINI-BATCH TRAINING, THIS <Splice> COMPONENT HAS TO BE PART OF THE
-    // 'feature_transform' SO WE DON'T BACKPROPAGATE THROUGH IT...
-
-    // dims,
-    int32 input_dim = in.NumCols(),
-          num_frames = out_diff.NumRows();
-    // Copy offsets to 'host',
-    std::vector<int32> offsets(frame_offsets_.Dim());
-    frame_offsets_.CopyToVec(&offsets);
-    // loop over the offsets,
-    for (int32 i = 0; i < offsets.size(); i++) {
-      int32 o_i = offsets.at(i);
-      int32 n_rows = num_frames - abs(o_i),
-            src_row = std::max(-o_i, 0),
-            tgt_row = std::max(o_i, 0);
-      const CuSubMatrix<BaseFloat> src = out_diff.Range(src_row, n_rows, i*input_dim, input_dim);
-      CuSubMatrix<BaseFloat> tgt = in_diff->RowRange(tgt_row, n_rows);
-      tgt.AddMat(1.0, src, kNoTrans);
-    }
-  }
-
- protected:
-  CuArray<int32> frame_offsets_;
-};
-
-
-/**
- * Rearrange the matrix columns according to the indices in copy_from_indices_
- */
-class CopyComponent: public Component {
- public:
-  CopyComponent(int32 dim_in, int32 dim_out):
-    Component(dim_in, dim_out)
-  { }
-
-  ~CopyComponent()
-  { }
-
-  Component* Copy() const { return new CopyComponent(*this); }
-  ComponentType GetType() const { return kCopy; }
-
-  void InitData(std::istream &is) {
-    // define options,
-    std::vector<std::vector<int32> > build_vector;
-    // parse config,
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<ReadVector>") {
-        copy_from_indices_.Read(is, false);
-      } else if (token == "<BuildVector>") {
-        // <BuildVector> 1:1:1000 1:1:1000 1 2 3 1:10 </BuildVector>
-        // 'matlab-line' indexing, read the colon-separated-lists:
-        while (is >> std::ws, !is.eof()) {
-          std::string colon_sep_list_or_end;
-          ReadToken(is, false, &colon_sep_list_or_end);
-          if (colon_sep_list_or_end == "</BuildVector>") break;
-          std::vector<int32> v;
-          SplitStringToIntegers(colon_sep_list_or_end, ":", false, &v);
-          build_vector.push_back(v);
-        }
-      } else {
-        KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-                  << " (ReadVector|BuildVector)";
-      }
-    }
-
-    if (build_vector.size() > 0) {
-      // build the vector, using <BuildVector> ... </BuildVector> inputs,
-      BuildIntegerVector(build_vector, &copy_from_indices_);
-    }
-
-    // decrease by 1,
-    copy_from_indices_.Add(-1);
-
-    // check range,
-    KALDI_ASSERT(copy_from_indices_.Min() >= 0);
-    KALDI_ASSERT(copy_from_indices_.Max() < InputDim());
-    // check dim,
-    KALDI_ASSERT(copy_from_indices_.Dim() == OutputDim());
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    copy_from_indices_.Read(is, binary);
-    KALDI_ASSERT(copy_from_indices_.Dim() == OutputDim());
-    copy_from_indices_.Add(-1);  // -1 from each element,
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    CuArray<int32> tmp(copy_from_indices_);
-    tmp.Add(1);  // +1 to each element,
-    tmp.Write(os, binary);
-  }
-
-  std::string Info() const {
-    return std::string("\n  min ") + ToString(copy_from_indices_.Min()) +
-                         ", max "  + ToString(copy_from_indices_.Max());
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    cu::Copy(in, copy_from_indices_,out);
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    static bool warning_displayed = false;
-    if (!warning_displayed) {
-      KALDI_WARN << Component::TypeToMarker(GetType()) << " : "
-                 << __func__ << "() Not implemented!";
-
-      warning_displayed = true;
-    }
-    in_diff->SetZero();
-  }
-
- protected:
-  CuArray<int32> copy_from_indices_;
-};
-
-
-
-/**
- * Rescale the matrix-rows to have unit length (L2-norm).
- */
-class LengthNormComponent: public Component {
- public:
-  LengthNormComponent(int32 dim_in, int32 dim_out):
-    Component(dim_in, dim_out)
-  { }
-
-  ~LengthNormComponent()
-  { }
-
-  Component* Copy() const { return new LengthNormComponent(*this); }
-  ComponentType GetType() const { return kLengthNormComponent; }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // resize vector when needed,
-    if (row_scales_.Dim() != in.NumRows()) {
-      row_scales_.Resize(in.NumRows());
-    }
-    // get the normalization scalars,
-    l2_aux_ = in;
-    l2_aux_.MulElements(l2_aux_);  // x^2,
-    row_scales_.AddColSumMat(1.0, l2_aux_, 0.0);  // sum_of_cols(x^2),
-    row_scales_.ApplyPow(0.5);  // L2norm = sqrt(sum_of_cols(x^2)),
-    row_scales_.InvertElements();  // 1/L2norm,
-    // compute the output,
-    out->CopyFromMat(in);
-    out->MulRowsVec(row_scales_);  // re-normalize,
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    in_diff->CopyFromMat(out_diff);
-    in_diff->MulRowsVec(row_scales_);  // diff_by_x(s * x) = s,
-  }
-
- private:
-  CuMatrix<BaseFloat> l2_aux_;  ///< auxiliary matrix for L2 norm computation,
-  CuVector<BaseFloat> row_scales_;  ///< normalization scale of each row,
-};
-
-
-/**
- * Adds shift to all the lines of the matrix
- * (can be used for global mean normalization)
- */
-class AddShift : public UpdatableComponent {
- public:
-  AddShift(int32 dim_in, int32 dim_out):
-    UpdatableComponent(dim_in, dim_out),
-    shift_data_(dim_in)
-  { }
-
-  ~AddShift()
-  { }
-
-  Component* Copy() const { return new AddShift(*this); }
-  ComponentType GetType() const { return kAddShift; }
-
-  void InitData(std::istream &is) {
-    // define options
-    float init_param = 0.0;
-    // parse config
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<InitParam>") ReadBasicType(is, false, &init_param);
-      else if (token == "<LearnRateCoef>") ReadBasicType(is, false, &learn_rate_coef_);
-      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-                     << " (InitParam)";
-    }
-    // initialize
-    shift_data_.Resize(InputDim(), kSetZero);  // set to zero
-    shift_data_.Set(init_param);
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    // optional learning-rate coef,
-    if ('<' == Peek(is, binary)) {
-      ExpectToken(is, binary, "<LearnRateCoef>");
-      ReadBasicType(is, binary, &learn_rate_coef_);
-    }
-    // read the shift data
-    shift_data_.Read(is, binary);
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    WriteToken(os, binary, "<LearnRateCoef>");
-    WriteBasicType(os, binary, learn_rate_coef_);
-    shift_data_.Write(os, binary);
-  }
-
-  int32 NumParams() const { return shift_data_.Dim(); }
-
-  void GetGradient(VectorBase<BaseFloat>* gradient) const {
-    KALDI_ASSERT(gradient->Dim() == NumParams());
-    shift_data_grad_.CopyToVec(gradient);
-  }
-
-  void GetParams(VectorBase<BaseFloat>* params) const {
-    KALDI_ASSERT(params->Dim() == NumParams());
-    shift_data_.CopyToVec(params);
-  }
-
-  void SetParams(const VectorBase<BaseFloat>& params) {
-    KALDI_ASSERT(params.Dim() == NumParams());
-    shift_data_.CopyFromVec(params);
-  }
-
-  std::string Info() const {
-    return std::string("\n  shift_data") +
-      MomentStatistics(shift_data_) +
-      ", lr-coef " + ToString(learn_rate_coef_);
-  }
-
-  std::string InfoGradient() const {
-    return std::string("\n  shift_data_grad") +
-      MomentStatistics(shift_data_grad_) +
-      ", lr-coef " + ToString(learn_rate_coef_);
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // copy, add the shift,
-    out->CopyFromMat(in);
-    out->AddVecToRows(1.0, shift_data_, 1.0);
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // the derivative of additive constant is zero...
-    in_diff->CopyFromMat(out_diff);
-  }
-
-  void Update(const CuMatrixBase<BaseFloat> &input,
-              const CuMatrixBase<BaseFloat> &diff) {
-    // we use following hyperparameters from the option class,
-    const BaseFloat lr = opts_.learn_rate;
-    // gradient,
-    shift_data_grad_.Resize(InputDim(), kSetZero);  // reset to zero,
-    shift_data_grad_.AddRowSumMat(1.0, diff, 0.0);
-    // update,
-    shift_data_.AddVec(-lr * learn_rate_coef_, shift_data_grad_);
-  }
-
-  void SetLearnRateCoef(BaseFloat c) { learn_rate_coef_ = c; }
-
- protected:
-  CuVector<BaseFloat> shift_data_;
-  CuVector<BaseFloat> shift_data_grad_;
-};
-
-
-/**
- * Rescale the data column-wise by a vector
- * (can be used for global variance normalization)
- */
-class Rescale : public UpdatableComponent {
- public:
-  Rescale(int32 dim_in, int32 dim_out):
-    UpdatableComponent(dim_in, dim_out),
-    scale_data_(dim_in)
-  { }
-
-  ~Rescale()
-  { }
-
-  Component* Copy() const { return new Rescale(*this); }
-  ComponentType GetType() const { return kRescale; }
-
-  void InitData(std::istream &is) {
-    // define options
-    float init_param = 0.0;
-    // parse config
-    std::string token;
-    while (is >> std::ws, !is.eof()) {
-      ReadToken(is, false, &token);
-      /**/ if (token == "<InitParam>") ReadBasicType(is, false, &init_param);
-      else if (token == "<LearnRateCoef>") ReadBasicType(is, false, &learn_rate_coef_);
-      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
-                     << " (InitParam)";
-    }
-    // initialize
-    scale_data_.Resize(InputDim(), kSetZero);
-    scale_data_.Set(init_param);
-  }
-
-  void ReadData(std::istream &is, bool binary) {
-    // optional learning-rate coef,
-    if ('<' == Peek(is, binary)) {
-      ExpectToken(is, binary, "<LearnRateCoef>");
-      ReadBasicType(is, binary, &learn_rate_coef_);
-    }
-    // read the shift data
-    scale_data_.Read(is, binary);
-  }
-
-  void WriteData(std::ostream &os, bool binary) const {
-    WriteToken(os, binary, "<LearnRateCoef>");
-    WriteBasicType(os, binary, learn_rate_coef_);
-    scale_data_.Write(os, binary);
-  }
-
-  int32 NumParams() const { return scale_data_.Dim(); }
-
-  void GetGradient(VectorBase<BaseFloat>* gradient) const {
-    KALDI_ASSERT(gradient->Dim() == NumParams());
-    scale_data_grad_.CopyToVec(gradient);
-  }
-
-  void GetParams(VectorBase<BaseFloat>* params) const {
-    KALDI_ASSERT(params->Dim() == NumParams());
-    scale_data_.CopyToVec(params);
-  }
-
-  void SetParams(const VectorBase<BaseFloat>& params) {
-    KALDI_ASSERT(params.Dim() == NumParams());
-    scale_data_.CopyFromVec(params);
-  }
-
-  std::string Info() const {
-    return std::string("\n  scale_data") +
-      MomentStatistics(scale_data_) +
-      ", lr-coef " + ToString(learn_rate_coef_);
-  }
-
-  std::string InfoGradient() const {
-    return std::string("\n  scale_data_grad") +
-      MomentStatistics(scale_data_grad_) +
-      ", lr-coef " + ToString(learn_rate_coef_);
-  }
-
-  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                    CuMatrixBase<BaseFloat> *out) {
-    // copy, rescale the data,
-    out->CopyFromMat(in);
-    out->MulColsVec(scale_data_);
-  }
-
-  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
-                        const CuMatrixBase<BaseFloat> &out,
-                        const CuMatrixBase<BaseFloat> &out_diff,
-                        CuMatrixBase<BaseFloat> *in_diff) {
-    // derivatives are scaled with the scale_data_,
-    in_diff->CopyFromMat(out_diff);
-    in_diff->MulColsVec(scale_data_);
-  }
-
-  void Update(const CuMatrixBase<BaseFloat> &input,
-              const CuMatrixBase<BaseFloat> &diff) {
-    // we use following hyperparameters from the option class,
-    const BaseFloat lr = opts_.learn_rate;
-    // gradient,
-    scale_data_grad_.Resize(InputDim(), kSetZero);  // reset,
-    CuMatrix<BaseFloat> gradient_aux(diff);
-    gradient_aux.MulElements(input);
-    scale_data_grad_.AddRowSumMat(1.0, gradient_aux, 0.0);
-    // update,
-    scale_data_.AddVec(-lr * learn_rate_coef_, scale_data_grad_);
-  }
-
-  void SetLearnRateCoef(BaseFloat c) { learn_rate_coef_ = c; }
-
- protected:
-  CuVector<BaseFloat> scale_data_;
-  CuVector<BaseFloat> scale_data_grad_;
-};
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-#endif  // KALDI_NNET_NNET_VARIOUS_H_
diff --git a/src/nnet2/Makefile b/src/nnet2/Makefile
deleted file mode 100644
index 7c19ec2603c..00000000000
--- a/src/nnet2/Makefile
+++ /dev/null
@@ -1,33 +0,0 @@
-
-
-all:
-
-include ../kaldi.mk
-
-LDFLAGS += $(CUDA_LDFLAGS)
-LDLIBS += $(CUDA_LDLIBS)
-
-
-TESTFILES = nnet-component-test nnet-precondition-test \
-	nnet-precondition-online-test nnet-example-functions-test \
-    nnet-nnet-test am-nnet-test online-nnet2-decodable-test \
-    nnet-compute-test
-
-OBJFILES = nnet-component.o nnet-nnet.o train-nnet.o train-nnet-ensemble.o nnet-update.o \
-     nnet-compute.o am-nnet.o nnet-functions.o  \
-     nnet-precondition.o combine-nnet.o \
-     mixup-nnet.o nnet-update-parallel.o combine-nnet-fast.o \
-     nnet-fix.o nnet-stats.o rescale-nnet.o nnet-limit-rank.o nnet-example.o \
-     get-feature-transform.o widen-nnet.o nnet-precondition-online.o \
-     nnet-example-functions.o nnet-compute-discriminative.o \
-     nnet-compute-discriminative-parallel.o online-nnet2-decodable.o \
-     nnet-compute-online.o
-
-LIBNAME = kaldi-nnet2
-
-ADDLIBS = ../cudamatrix/kaldi-cudamatrix.a ../lat/kaldi-lat.a \
-          ../hmm/kaldi-hmm.a ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
-          ../base/kaldi-base.a 
-
-include ../makefiles/default_rules.mk
diff --git a/src/nnet2/am-nnet-test.cc b/src/nnet2/am-nnet-test.cc
deleted file mode 100644
index ce864320700..00000000000
--- a/src/nnet2/am-nnet-test.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-// nnet2/am-nnet-test.cc
-
-// Copyright 2014  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hmm/transition-model.h"
-#include "hmm/hmm-test-utils.h"
-#include "nnet2/am-nnet.h"
-
-
-namespace kaldi {
-namespace nnet2 {
-
-
-void UnitTestAmNnet() {
-  std::vector<int32> phones;
-  phones.push_back(1);
-  for (int32 i = 2; i < 20; i++)
-    if (rand() % 2 == 0)
-      phones.push_back(i);
-  int32 N = 2 + rand() % 2, // context-size N is 2 or 3.
-      P = rand() % N;  // Central-phone is random on [0, N)
-
-  std::vector<int32> num_pdf_classes;
-
-  ContextDependency *ctx_dep =
-      GenRandContextDependencyLarge(phones, N, P,
-                                    true, &num_pdf_classes);
-
-  HmmTopology topo = GetDefaultTopology(phones);
-
-  TransitionModel trans_model(*ctx_dep, topo);
-
-  delete ctx_dep; // We won't need this further.
-  ctx_dep = NULL;
-
-  int32 input_dim = 40, output_dim = trans_model.NumPdfs();
-  Nnet *nnet = GenRandomNnet(input_dim, output_dim);
-
-  AmNnet am_nnet(*nnet);
-  delete nnet;
-  nnet = NULL;
-  Vector<BaseFloat> priors(output_dim);
-  priors.SetRandn();
-  priors.ApplyExp();
-  priors.Scale(1.0 / priors.Sum());
-
-  am_nnet.SetPriors(priors);
-
-  bool binary = (rand() % 2 == 0);
-  std::ostringstream os;
-  am_nnet.Write(os, binary);
-  AmNnet am_nnet2;
-  std::istringstream is(os.str());
-  am_nnet2.Read(is, binary);
-
-  std::ostringstream os2;
-  am_nnet2.Write(os2, binary);
-
-  KALDI_ASSERT(os2.str() == os.str());
-}
-
-} // namespace nnet2
-} // namespace kaldi
-
-
-int main() {
-  using namespace kaldi;
-  using namespace kaldi::nnet2;
-
-  UnitTestAmNnet();
-  return 0;
-}
-
diff --git a/src/nnet2/am-nnet.cc b/src/nnet2/am-nnet.cc
deleted file mode 100644
index 60d65de31a7..00000000000
--- a/src/nnet2/am-nnet.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-// nnet2/am-nnet.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/am-nnet.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-void AmNnet::Init(std::istream &config_is) {
-  nnet_.Init(config_is);
-}
-
-
-void AmNnet::Write(std::ostream &os, bool binary) const {
-  // We don't write any header or footer like <AmNnet> and </AmNnet> -- we just
-  // write the neural net and then the priors.  Who knows, there might be some
-  // situation where we want to just read the neural net.
-  nnet_.Write(os, binary);
-  priors_.Write(os, binary);
-}
-
-void AmNnet::Read(std::istream &is, bool binary) {
-  nnet_.Read(is, binary);
-  priors_.Read(is, binary);
-}
-
-void AmNnet::SetPriors(const VectorBase<BaseFloat> &priors) {
-  priors_ = priors;
-  if (priors_.Dim() > NumPdfs())    
-    KALDI_ERR << "Dimension of priors cannot exceed number of pdfs.";
-
-  if (priors_.Dim() > 0 && priors_.Dim() < NumPdfs()) {
-    KALDI_WARN << "Dimension of priors is " << priors_.Dim() << " < "
-               << NumPdfs() << ": extending with zeros, in case you had "
-               << "unseen pdf's, but this possibly indicates a serious problem.";
-    priors_.Resize(NumPdfs(), kCopyData);
-  }
-}
-
-std::string AmNnet::Info() const {
-  std::ostringstream ostr;
-  ostr << "prior dimension: " << priors_.Dim();
-  if (priors_.Dim() != 0) {
-    ostr << ", prior sum: " << priors_.Sum() << ", prior min: " << priors_.Min()
-         << "\n";
-  }
-  return nnet_.Info() + ostr.str();
-}
-
-void AmNnet::Init(const Nnet &nnet) {
-  nnet_ = nnet;
-  if (priors_.Dim() != 0 && priors_.Dim() != nnet.OutputDim()) {
-    KALDI_WARN << "Initializing neural net: prior dimension mismatch, "
-               << "discarding old priors.";
-    priors_.Resize(0);
-  }
-}
-
-void AmNnet::ResizeOutputLayer(int32 new_num_pdfs) {
-  nnet_.ResizeOutputLayer(new_num_pdfs);
-  priors_.Resize(new_num_pdfs);
-  priors_.Set(1.0 / new_num_pdfs);
-}
-
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/am-nnet.h b/src/nnet2/am-nnet.h
deleted file mode 100644
index a5ea512e3d8..00000000000
--- a/src/nnet2/am-nnet.h
+++ /dev/null
@@ -1,86 +0,0 @@
-// nnet2/am-nnet.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_AM_NNET_H_
-#define KALDI_NNET2_AM_NNET_H_
-
-#include "base/kaldi-common.h"
-#include "matrix/matrix-lib.h"
-#include "nnet2/nnet-nnet.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-/*
-  The class AmNnet (AM stands for "acoustic model") has the job of taking the
-  "Nnet" class, which is a quite general neural network, and giving it an
-  interface that's suitable for acoustic modeling; it deals with storing, and
-  dividing by, the prior of each context-dependent state.
-*/
-
-
-class AmNnet {
- public:
-  AmNnet() { }
-
-  AmNnet(const AmNnet &other): nnet_(other.nnet_), priors_(other.priors_) { }
-
-  explicit AmNnet(const Nnet &nnet): nnet_(nnet) { }
-  
-  /// Initialize the neural network based acoustic model from a config file.
-  /// At this point the priors won't be initialized; you'd have to do
-  /// SetPriors for that.
-  void Init(std::istream &config_is);
-
-  /// Initialize from a neural network that's already been set up.
-  /// Again, the priors will be empty at this point.
-  void Init(const Nnet &nnet);
-
-  int32 NumPdfs() const { return nnet_.OutputDim(); }
-  
-  void Write(std::ostream &os, bool binary) const;
-  
-  void Read(std::istream &is, bool binary);
-
-  const Nnet &GetNnet() const { return nnet_; }
-  
-  Nnet &GetNnet() { return nnet_; }
-
-  void SetPriors(const VectorBase<BaseFloat> &priors);
-  
-  const VectorBase<BaseFloat> &Priors() const { return priors_; }
-
-  std::string Info() const;
-
-  /// This function is used when doing transfer learning to a new system.
-  /// It will set the priors to be all the same. 
-  void ResizeOutputLayer(int32 new_num_pdfs);
-  
- private:
-  const AmNnet &operator = (const AmNnet &other); // Disallow.
-  Nnet nnet_;
-  Vector<BaseFloat> priors_;
-};
-
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif // KALDI_NNET2_AM_NNET_H_
diff --git a/src/nnet2/combine-nnet-a.cc b/src/nnet2/combine-nnet-a.cc
deleted file mode 100644
index 6208ca77770..00000000000
--- a/src/nnet2/combine-nnet-a.cc
+++ /dev/null
@@ -1,230 +0,0 @@
-// nnet2/combine-nnet-a.cc
-
-// Copyright 2012   Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/combine-nnet-a.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-/*
-  This function gets the "update direction".  The vector "nnets" is
-  interpreted as (old-nnet new-nnet1 net-nnet2 ... new-nnetN), and
-  the "update direction" is the average of the new nnets, minus the
-  old nnet.
-*/
-static void GetUpdateDirection(const std::vector<Nnet> &nnets,
-                               Nnet *direction) {
-  KALDI_ASSERT(nnets.size() > 1);
-  int32 num_new_nnets = nnets.size() - 1;
-  Vector<BaseFloat> scales(nnets[0].NumUpdatableComponents());
-
-  scales.Set(1.0 / num_new_nnets);
-  
-  *direction = nnets[1];
-  direction->ScaleComponents(scales); // first of the new nnets.
-  for (int32 n = 2; n < 1 + num_new_nnets; n++)
-    direction->AddNnet(scales, nnets[n]);
-  // now "direction" is the average of the new nnets.  Subtract
-  // the old nnet's parameters.
-  scales.Set(-1.0);
-  direction->AddNnet(scales, nnets[0]);
-}
-
-/// Sets "dest" to orig_nnet plus "direction", with
-/// each updatable component of "direction" first scaled by
-/// the appropriate scale.
-static void AddDirection(const Nnet &orig_nnet,
-                         const Nnet &direction,
-                         const VectorBase<BaseFloat> &scales,
-                         Nnet *dest) {
-  *dest = orig_nnet;
-  dest->AddNnet(scales, direction);
-}
-
-
-static BaseFloat ComputeObjfAndGradient(
-    const std::vector<NnetExample> &validation_set,
-    const Vector<double> &scale_params,
-    const Nnet &orig_nnet,
-    const Nnet &direction,
-    Vector<double> *gradient) {
-  
-  Vector<BaseFloat> scale_params_float(scale_params);
-
-  Nnet nnet_combined;
-  AddDirection(orig_nnet, direction, scale_params_float, &nnet_combined);
-  
-  Nnet nnet_gradient(nnet_combined);
-  bool is_gradient = true;
-  nnet_gradient.SetZero(is_gradient);
-  
-  // note: "ans" is normalized by the total weight of validation frames.
-  int32 batch_size = 1024;
-  BaseFloat ans = ComputeNnetGradient(nnet_combined,
-                                      validation_set,
-                                      batch_size,
-                                      &nnet_gradient);
-
-  BaseFloat tot_count = validation_set.size();
-  int32 i = 0; // index into scale_params.
-  for (int32 j = 0; j < nnet_combined.NumComponents(); j++) {
-    const UpdatableComponent *uc_direction =
-        dynamic_cast<const UpdatableComponent*>(&(direction.GetComponent(j))),
-        *uc_gradient =
-        dynamic_cast<const UpdatableComponent*>(&(nnet_gradient.GetComponent(j)));
-    if (uc_direction != NULL) {
-      BaseFloat dotprod = uc_direction->DotProduct(*uc_gradient) / tot_count;
-      (*gradient)(i) = dotprod; 
-      i++;
-    }
-  }
-  KALDI_ASSERT(i == scale_params.Dim());
-  return ans;
-}
-                                   
-
-void CombineNnetsA(const NnetCombineAconfig &config,
-                   const std::vector<NnetExample> &validation_set,
-                   const std::vector<Nnet> &nnets,
-                   Nnet *nnet_out) {
-
-  Nnet direction; // the update direction = avg(nnets[1 ... N]) - nnets[0].
-  GetUpdateDirection(nnets, &direction);
-  
-  Vector<double> scale_params(nnets[0].NumUpdatableComponents()); // initial
-  // scale on "direction".
-
-  int32 dim = scale_params.Dim();
-  KALDI_ASSERT(dim > 0);
-  Vector<double> gradient(dim);
-  
-  double objf, initial_objf, zero_objf;
-
-  // Compute objf at zero; we don't actually need this gradient.
-  zero_objf = ComputeObjfAndGradient(validation_set,
-                                     scale_params,
-                                     nnets[0],
-                                     direction,
-                                     &gradient);
-  KALDI_LOG << "Objective function at old parameters is "
-            << zero_objf;
-  
-  scale_params.Set(1.0); // start optimization from the average of the parameters.
-
-  LbfgsOptions lbfgs_options;
-  lbfgs_options.minimize = false; // We're maximizing.
-  lbfgs_options.m = dim; // Store the same number of vectors as the dimension
-  // itself, so this is BFGS.
-  lbfgs_options.first_step_length = config.initial_step;
-  
-  OptimizeLbfgs<double> lbfgs(scale_params,
-                              lbfgs_options);
-  
-  for (int32 i = 0; i < config.num_bfgs_iters; i++) {    
-    scale_params.CopyFromVec(lbfgs.GetProposedValue());
-    objf = ComputeObjfAndGradient(validation_set,
-                                  scale_params,
-                                  nnets[0],
-                                  direction,
-                                  &gradient);
-
-    KALDI_VLOG(2) << "Iteration " << i << " scale-params = " << scale_params
-                  << ", objf = " << objf << ", gradient = " << gradient;
-    
-    if (i == 0) initial_objf = objf;    
-    lbfgs.DoStep(objf, gradient);
-  }
-
-  scale_params.CopyFromVec(lbfgs.GetValue(&objf));
-
-  KALDI_LOG << "Combining nnets, after BFGS, validation objf per frame changed from "
-            << zero_objf << " (no change), or " << initial_objf << " (default change), "
-            << " to " << objf << "; scale factors on update direction are "
-            << scale_params;
-
-  BaseFloat objf_change = objf - zero_objf;
-  KALDI_ASSERT(objf_change >= 0.0); // This is guaranteed by the L-BFGS code.
-
-  if (objf_change < config.valid_impr_thresh) {
-    // We'll overshoot.  To have a smooth transition between the two regimes, if
-    // objf_change is close to valid_impr_thresh we don't overshoot as far.
-    BaseFloat overshoot = config.overshoot,
-        overshoot_max = config.valid_impr_thresh / objf_change; // >= 1.0.
-    if (overshoot_max < overshoot) {
-      KALDI_LOG << "Limiting overshoot from " << overshoot << " to " << overshoot_max
-                << " since the objf-impr " << objf_change << " is close to "
-                << "--valid-impr-thresh=" << config.valid_impr_thresh;
-      overshoot = overshoot_max;
-    }
-    KALDI_ASSERT(overshoot < 2.0 && "--valid-impr-thresh must be < 2.0 or "
-                 "it will lead to instability.");
-    scale_params.Scale(overshoot);
-
-    BaseFloat optimized_objf = objf;
-    objf = ComputeObjfAndGradient(validation_set,
-                                  scale_params,
-                                  nnets[0],
-                                  direction,
-                                  &gradient);
-
-    KALDI_LOG << "Combining nnets, after overshooting, validation objf changed "
-              << "to " << objf << ".  Note: (zero, start, optimized) objfs were "
-              << zero_objf << ", " << initial_objf << ", " << optimized_objf;
-    if (objf < zero_objf) {
-      // Note: this should not happen according to a quadratic approximation, and we
-      // expect this branch to be taken only rarely if at all.
-      KALDI_WARN << "After overshooting, objf was worse than not updating; not doing the "
-                 << "overshoot. ";
-     scale_params.Scale(1.0 / overshoot);
-    }
-  } // Else don't do the "overshoot" stuff.
-  
-  Vector<BaseFloat> scale_params_float(scale_params);
-  // Output to "nnet_out":
-  AddDirection(nnets[0], direction, scale_params_float, nnet_out);
-
-  // Now update the neural net learning rates.
-  int32 i = 0;
-  for (int32 j = 0; j < nnet_out->NumComponents(); j++) {
-    UpdatableComponent *uc =
-        dynamic_cast<UpdatableComponent*>(&(nnet_out->GetComponent(j)));
-    if (uc != NULL) {
-      BaseFloat step_length = scale_params(i), factor = step_length;
-      // Our basic rule is to update the learning rate by multiplying it
-      // by "step_lenght", but this is subject to certain limits.
-      if (factor < config.min_learning_rate_factor)
-        factor = config.min_learning_rate_factor;
-      if (factor > config.max_learning_rate_factor)
-        factor = config.max_learning_rate_factor;
-      BaseFloat new_learning_rate = factor * uc->LearningRate();
-      if (new_learning_rate < config.min_learning_rate)
-        new_learning_rate = config.min_learning_rate;
-      KALDI_LOG << "For component " << j << ", step length was " << step_length
-                << ", updating learning rate by factor " << factor << ", changing "
-                << "learning rate from " << uc->LearningRate() << " to "
-                << new_learning_rate;
-      uc->SetLearningRate(new_learning_rate);
-      i++;
-    }
-  }
-}
- 
-  
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/combine-nnet-a.h b/src/nnet2/combine-nnet-a.h
deleted file mode 100644
index 2624993bbcf..00000000000
--- a/src/nnet2/combine-nnet-a.h
+++ /dev/null
@@ -1,85 +0,0 @@
-// nnet2/combine-nnet-a.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_COMBINE_NNET_A_H_
-#define KALDI_NNET2_COMBINE_NNET_A_H_
-
-#include "nnet2/nnet-update.h"
-#include "nnet2/nnet-compute.h"
-#include "util/parse-options.h"
-#include "itf/options-itf.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-struct NnetCombineAconfig {
-  int32 num_bfgs_iters; // The dimension is small (the number of layers)
-  // so we do BFGS.  Note: this num-iters is really the number of function
-  // evaluations.
-  
-  BaseFloat initial_step;
-
-  BaseFloat valid_impr_thresh;
-  BaseFloat overshoot;
-
-  BaseFloat min_learning_rate_factor; // 0.5 by default;
-  BaseFloat max_learning_rate_factor; // 2.0 by default.
-  BaseFloat min_learning_rate; // 0.0001 by default; we don't allow learning rate to go below
-  // this, mainly because it would lead to roundoff problems.
-  
-  NnetCombineAconfig(): num_bfgs_iters(15), initial_step(0.1),
-                        valid_impr_thresh(0.5), overshoot(1.8),
-                        min_learning_rate_factor(0.5),
-                        max_learning_rate_factor(2.0),
-                        min_learning_rate(0.0001) { }
-  
-  void Register(OptionsItf *opts) {
-    opts->Register("num-bfgs-iters", &num_bfgs_iters, "Maximum number of function "
-                   "evaluations for BFGS to use when optimizing combination weights");
-    opts->Register("initial-step", &initial_step, "Parameter in the optimization, "
-                   "used to set the initial step length; the default value should be "
-                   "suitable.");
-    opts->Register("num-bfgs-iters", &num_bfgs_iters, "Maximum number of function "
-                   "evaluations for BFGS to use when optimizing combination weights");
-    opts->Register("valid-impr-thresh", &valid_impr_thresh, "Threshold of improvement "
-                   "in validation-set objective function for one iteratin; below this, "
-                   "we start using the \"overshoot\" mechanism to keep learning rates high.");
-    opts->Register("overshoot", &overshoot, "Factor by which we overshoot the step "
-                   "size obtained by BFGS; only applies when validation set impr is less "
-                   "than valid-impr-thresh.");
-    opts->Register("max-learning-rate-factor", &max_learning_rate_factor,
-                   "Maximum factor by which to increase the learning rate for any layer.");
-    opts->Register("min-learning-rate-factor", &min_learning_rate_factor,
-                   "Minimum factor by which to increase the learning rate for any layer.");
-    opts->Register("min-learning-rate", &min_learning_rate,
-                   "Floor on the automatically updated learning rates");
-  }  
-};
-
-void CombineNnetsA(const NnetCombineAconfig &combine_config,
-                   const std::vector<NnetExample> &validation_set,
-                   const std::vector<Nnet> &nnets_in,
-                   Nnet *nnet_out);
-  
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif
diff --git a/src/nnet2/combine-nnet-fast.cc b/src/nnet2/combine-nnet-fast.cc
deleted file mode 100644
index 02265a5f6ac..00000000000
--- a/src/nnet2/combine-nnet-fast.cc
+++ /dev/null
@@ -1,443 +0,0 @@
-// nnet2/combine-nnet-fast.cc
-
-// Copyright 2012   Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/combine-nnet-fast.h"
-#include "nnet2/nnet-update-parallel.h"
-#include "util/kaldi-thread.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-/*
-  This class is responsible for computing a Fisher matrix which is a kind of
-  scatter of gradients on subsets; it's used for preconditioning the update in
-  class FastNnetCombiner.  */
-class FisherComputationClass: public MultiThreadable {
- public:
-  FisherComputationClass(const Nnet &nnet,
-                         const std::vector<Nnet> &nnets,
-                         const std::vector<NnetExample> &egs,
-                         int32 minibatch_size,
-                         SpMatrix<double> *scatter):
-      nnet_(nnet), nnets_(nnets), egs_(egs), minibatch_size_(minibatch_size),
-      scatter_ptr_(scatter) { } // This initializer is only used to create a
-  // temporary version of the object; the next initializer is used to
-  // create the separate versions for the parallel jobs.
-
-  FisherComputationClass(const FisherComputationClass &other):
-      MultiThreadable(other),
-      nnet_(other.nnet_), nnets_(other.nnets_), egs_(other.egs_),
-      minibatch_size_(other.minibatch_size_), scatter_ptr_(other.scatter_ptr_) {
-    scatter_.Resize(nnets_.size() * nnet_.NumUpdatableComponents());  }
-
-  void operator () () {
-    // b is the "minibatch id."
-    int32 num_egs = static_cast<int32>(egs_.size());
-    Nnet nnet_gradient(nnet_);
-    for (int32 b = 0; b * minibatch_size_ < num_egs; b++) {
-      if (b % num_threads_ != thread_id_)
-        continue; // We're not responsible for this minibatch.
-      int32 offset = b * minibatch_size_,
-          length = std::min(minibatch_size_,
-                       num_egs - offset);
-      bool is_gradient = true;
-      nnet_gradient.SetZero(is_gradient);
-      std::vector<NnetExample> minibatch(egs_.begin() + offset,
-                                                 egs_.begin() + offset + length);
-      DoBackprop(nnet_, minibatch, &nnet_gradient);
-      Vector<double> gradient(nnets_.size() * nnet_.NumUpdatableComponents());
-      int32 i = 0;
-      for (int32 n = 0; n < static_cast<int32>(nnets_.size()); n++) {
-        for (int32 c = 0; c < nnet_.NumComponents(); c++) {
-          const UpdatableComponent *uc = dynamic_cast<const UpdatableComponent*>(
-              &(nnet_gradient.GetComponent(c))),
-              *uc_other = dynamic_cast<const UpdatableComponent*>(
-                  &(nnets_[n].GetComponent(c)));
-          if (uc != NULL) {
-            gradient(i) = uc->DotProduct(*uc_other);
-            i++;
-          }
-        }
-      }
-      KALDI_ASSERT(i == gradient.Dim());
-      scatter_.AddVec2(1.0, gradient);
-    }
-  }
-  ~FisherComputationClass() {
-    if (scatter_.NumRows() != 0) {
-      if (scatter_ptr_->NumRows() == 0)
-        scatter_ptr_->Resize(scatter_.NumRows());
-      scatter_ptr_->AddSp(1.0, scatter_);
-    }
-  }
-
- private:
-  const Nnet &nnet_; // point at which we compute the parameter gradients.
-  const std::vector<Nnet> &nnets_; // The dot-product  of each of these with the parameter gradients,
-  // are the actual gradients that go into "scatter".
-  const std::vector<NnetExample> &egs_;
-  int32 minibatch_size_; // equals config --fisher-minbatch-size e.g. 64 (smaller than
-                         // regular minibatch size.)
-  SpMatrix<double> *scatter_ptr_;
-  SpMatrix<double> scatter_; // Local accumulation of the scatter.
-};
-
-
-class FastNnetCombiner {
- public:
-  FastNnetCombiner(const NnetCombineFastConfig &combine_config,
-                   const std::vector<NnetExample> &validation_set,
-                   const std::vector<Nnet> &nnets_in,
-                   Nnet *nnet_out):
-      config_(combine_config), egs_(validation_set),
-      nnets_(nnets_in), nnet_out_(nnet_out) {
-
-    GetInitialParams();
-    ComputePreconditioner();
-
-    int32 dim = params_.Dim();
-    KALDI_ASSERT(dim > 0);
-    Vector<double> gradient(dim);
-
-    double regularizer_objf, initial_regularizer_objf; // for diagnostics
-    double objf, initial_objf;
-
-    LbfgsOptions lbfgs_options;
-    lbfgs_options.minimize = false; // We're maximizing.
-    lbfgs_options.m = std::min(dim, config_.max_lbfgs_dim);
-    lbfgs_options.first_step_impr = config_.initial_impr;
-
-    OptimizeLbfgs<double> lbfgs(params_,
-                                lbfgs_options);
-
-    for (int32 i = 0; i < config_.num_lbfgs_iters; i++) {
-      params_.CopyFromVec(lbfgs.GetProposedValue());
-      objf = ComputeObjfAndGradient(&gradient, &regularizer_objf);
-      // Note: there is debug printout in ComputeObjfAndGradient
-      // (at verbose-level 2).
-      if (i == 0) {
-        initial_objf = objf;
-        initial_regularizer_objf = regularizer_objf;
-      }
-      lbfgs.DoStep(objf, gradient);
-    }
-    params_ = lbfgs.GetValue(&objf);
-
-    ComputeCurrentNnet(nnet_out_, true); // create the output neural net, and
-                                         // print out the scaling factors.
-    if (config_.regularizer != 0.0) {
-      double initial_part = initial_objf - initial_regularizer_objf,
-          part = objf - regularizer_objf;
-      KALDI_LOG << "Combining nnets, objf/frame + regularizer changed from "
-                << initial_part << " + " << initial_regularizer_objf
-                << " = " << initial_objf << " to " << part << " + "
-                << regularizer_objf << " = " << objf;
-    } else {
-      KALDI_LOG << "Combining nnets, objf per frame changed from "
-                << initial_objf << " to " << objf;
-    }
-  }
-
- private:
-  int32 GetInitialModel(
-      const std::vector<NnetExample> &validation_set,
-      const std::vector<Nnet> &nnets) const;
-
-  void GetInitialParams();
-
-  void ComputePreconditioner();
-
-  // Computes and returns objective function per frame, including
-  // regularizer term if applicable.  Also puts just the regularizer
-  // term in *regularizer_objf.
-  double ComputeObjfAndGradient(
-      Vector<double> *gradient,
-      double *regularizer_objf);
-
-  void ComputeCurrentNnet(
-      Nnet *dest, bool debug = false);
-
-  static void CombineNnets(const Vector<double> &scale_params,
-                           const std::vector<Nnet> &nnets,
-                           Nnet *dest);
-
-
-  // C_ is the cholesky of the smoothed Fisher matrix F.
-  // Let F = C C^T.
-  // Preconditioned gradient is \hat{g} = C^{-1} g
-  // Note: preconditioned parameter is \hat{p} = C^T p,
-  // so p = C^{-T} \hat{p}.
-  TpMatrix<double> C_;
-  TpMatrix<double> C_inv_;
-  Vector<double> params_; // the parameters we're optimizing-- in the
-                          // preconditioned space.  These are the same dimension
-                          // as the number of nnets we're combining times the
-                          // number of updatable layers.
-
-  const NnetCombineFastConfig &config_;
-  const std::vector<NnetExample> &egs_;
-  const std::vector<Nnet> &nnets_;
-  Nnet *nnet_out_;
-};
-
-
-// static
-void FastNnetCombiner::CombineNnets(const Vector<double> &scale_params,
-                                    const std::vector<Nnet> &nnets,
-                                    Nnet *dest) {
-  int32 num_nnets = nnets.size();
-  KALDI_ASSERT(num_nnets >= 1);
-  int32 num_uc = nnets[0].NumUpdatableComponents();
-  KALDI_ASSERT(num_nnets * nnets[0].NumUpdatableComponents());
-
-
-  *dest = nnets[0];
-  SubVector<double> scale_params0(scale_params, 0, num_uc);
-  dest->ScaleComponents(Vector<BaseFloat>(scale_params0));
-  for (int32 n = 1; n < num_nnets; n++) {
-    SubVector<double> scale_params_n(scale_params, n * num_uc, num_uc);
-    dest->AddNnet(Vector<BaseFloat>(scale_params_n), nnets[n]);
-  }
-}
-
-
-void FastNnetCombiner::ComputePreconditioner() {
-  SpMatrix<double> F; // Fisher matrix.
-  Nnet nnet;
-  ComputeCurrentNnet(&nnet); // will be at initial value of neural net.
-
-  { // This block does the multi-threaded computation.
-    // The next line just initializes an "example" object.
-    FisherComputationClass fc(nnet, nnets_, egs_,
-                              config_.fisher_minibatch_size,
-                              &F);
-
-    // Setting num_threads to zero if config_.num_threads == 1
-    // is a signal to the MultiThreader class to run without creating
-    // any extra threads in this case; it helps support GPUs.
-    int32 num_threads = config_.num_threads == 1 ? 0 : config_.num_threads;
-    // The work gets done in the initializer and destructor of
-    // the class below.
-    MultiThreader<FisherComputationClass> m(num_threads, fc);
-  }
-
-  // The scale of F is irrelevant but it might be quite
-  // large at this point, so we just normalize it.
-  KALDI_ASSERT(F.Trace() > 0);
-  F.Scale(F.NumRows() / F.Trace()); // same scale as unit matrix.
-  // Make zero diagonal elements of F non-zero.  Relates to updatable
-  // components that have no effect, e.g. MixtureProbComponents that have
-  // no real free parameters.
-  KALDI_ASSERT(config_.fisher_floor > 0.0);
-  for (int32 i = 0; i < F.NumRows(); i++)
-    F(i, i) = std::max<BaseFloat>(F(i, i), config_.fisher_floor);
-  // We next smooth the diagonal elements of F by a small amount.
-  // This is mainly necessary in case the number of minibatches is
-  // smaller than the dimension of F; we want to ensure F is full rank.
-  for (int32 i = 0; i < F.NumRows(); i++)
-    F(i, i) *= (1.0 + config_.alpha);
-
-  C_.Resize(F.NumRows());
-  C_.Cholesky(F);
-  C_inv_ = C_;
-  C_inv_.Invert();
-
-  // Transform the params_ data-member to be in the preconditioned space.
-  Vector<double> raw_params(params_);
-  params_.AddTpVec(1.0, C_, kTrans, raw_params, 0.0);
-}
-
-// Note, we ignore the regularizer in selecting the best one.  It shouldn't
-// really matter.
-void FastNnetCombiner::GetInitialParams() {
-  int32 initial_model = config_.initial_model,
-      num_nnets = static_cast<int32>(nnets_.size());
-  if (initial_model > num_nnets)
-    initial_model = num_nnets;
-  if (initial_model < 0)
-    initial_model = GetInitialModel(egs_, nnets_);
-
-  KALDI_ASSERT(initial_model >= 0 && initial_model <= num_nnets);
-  int32 num_uc = nnets_[0].NumUpdatableComponents();
-
-  Vector<double> raw_params(num_uc * num_nnets); // parameters in
-                                                 // non-preconditioned space.
-  if (initial_model < num_nnets) {
-    KALDI_LOG << "Initializing with neural net with index " << initial_model;
-    // At this point we're using the best of the individual neural nets.
-    raw_params.Set(0.0);
-
-    // Set the block of parameters corresponding to the "best" of the
-    // source neural nets to
-    SubVector<double> best_block(raw_params, num_uc * initial_model, num_uc);
-    best_block.Set(1.0);
-  } else { // initial_model == num_nnets
-    KALDI_LOG << "Initializing with all neural nets averaged.";
-    raw_params.Set(1.0 / num_nnets);
-  }
-  KALDI_ASSERT(C_.NumRows() == 0); // Assume this not set up yet.
-  params_ = raw_params; // this is in non-preconditioned space.
-}
-
-/// Computes objf at point "params_".
-double FastNnetCombiner::ComputeObjfAndGradient(
-    Vector<double> *gradient,
-    double *regularizer_objf_ptr) {
-  Nnet nnet;
-  ComputeCurrentNnet(&nnet); // compute it at the value "params_".
-
-  Nnet nnet_gradient(nnet);
-  bool is_gradient = true;
-  nnet_gradient.SetZero(is_gradient);
-  double tot_weight = 0.0;
-  double objf = DoBackpropParallel(nnet, config_.minibatch_size, config_.num_threads,
-                                   egs_, &tot_weight, &nnet_gradient) / egs_.size();
-
-  // raw_gradient is gradient in non-preconditioned space.
-  Vector<double> raw_gradient(params_.Dim());
-
-  double regularizer_objf = 0.0; // sum of -0.5 * config_.regularizer * params-squared.
-  int32 i = 0; // index into raw_gradient
-  int32 num_nnets = nnets_.size();
-  for (int32 n = 0; n < num_nnets; n++) {
-    for (int32 j = 0; j < nnet.NumComponents(); j++) {
-      const UpdatableComponent *uc =
-          dynamic_cast<const UpdatableComponent*>(&(nnets_[n].GetComponent(j))),
-          *uc_gradient =
-          dynamic_cast<const UpdatableComponent*>(&(nnet_gradient.GetComponent(j))),
-          *uc_params =
-          dynamic_cast<const UpdatableComponent*>(&(nnet.GetComponent(j)));
-      if (uc != NULL) {
-        double gradient = uc->DotProduct(*uc_gradient) / tot_weight;
-        // "gradient" is the derivative of the objective function w.r.t. this
-        // element of the parameters (i.e. this weight, which gets applied to
-        // the j'th component of the n'th source neural net).
-        if (config_.regularizer != 0.0) {
-          gradient -= config_.regularizer * uc->DotProduct(*uc_params);
-          if (n == 0) // only add this once...
-            regularizer_objf +=
-                -0.5 * config_.regularizer * uc_params->DotProduct(*uc_params);
-        }
-        raw_gradient(i) = gradient;
-        i++;
-      }
-    }
-  }
-  if (config_.regularizer != 0.0) {
-    KALDI_VLOG(2) << "Objf is " << objf << " + regularizer " << regularizer_objf
-                  << " = " << (objf + regularizer_objf) << ", raw gradient is "
-                  << raw_gradient;
-  } else {
-    KALDI_VLOG(2) << "Objf is " << objf << ", raw gradient is " << raw_gradient;
-  }
-  KALDI_ASSERT(i == raw_gradient.Dim());
-  // \hat{g} = C^{-1} g.
-  gradient->AddTpVec(1.0, C_inv_, kNoTrans, raw_gradient, 0.0);
-  *regularizer_objf_ptr = regularizer_objf;
-  return objf + regularizer_objf;
-}
-
-void FastNnetCombiner::ComputeCurrentNnet(
-    Nnet *dest, bool debug) {
-  int32 num_nnets = nnets_.size();
-  KALDI_ASSERT(num_nnets >= 1);
-  KALDI_ASSERT(params_.Dim() == num_nnets * nnets_[0].NumUpdatableComponents());
-  Vector<double> raw_params(params_.Dim()); // Weights in non-preconditioned space:
-  // p = C^{-T} \hat{p}.  Here, raw_params is p, params_, is \hat{p}.
-
-  if (C_inv_.NumRows() > 0)
-    raw_params.AddTpVec(1.0, C_inv_, kTrans, params_, 0.0);
-  else
-    raw_params = params_; // C not set up yet: interpret params_ as raw parameters.
-
-  if (debug) {
-    Matrix<double> params_mat(num_nnets,
-                              nnets_[0].NumUpdatableComponents());
-    params_mat.CopyRowsFromVec(raw_params);
-    KALDI_LOG << "Scale parameters are " << params_mat;
-  }
-  CombineNnets(raw_params, nnets_, dest);
-}
-
-/// Returns an integer saying which model to use:
-/// either 0 ... num-models - 1 for the best individual model,
-/// or (#models) for the average of all of them.
-int32 FastNnetCombiner::GetInitialModel(
-    const std::vector<NnetExample> &validation_set,
-    const std::vector<Nnet> &nnets) const {
-  int32 num_nnets = static_cast<int32>(nnets.size());
-  KALDI_ASSERT(!nnets.empty());
-  int32 best_n = -1;
-  double best_objf = -std::numeric_limits<double>::infinity();
-  Vector<double> objfs(nnets.size());
-  for (int32 n = 0; n < num_nnets; n++) {
-    double num_frames;
-    double objf = ComputeNnetObjfParallel(nnets[n], config_.minibatch_size,
-                                          config_.num_threads, validation_set,
-                                          &num_frames);
-    KALDI_ASSERT(num_frames != 0);
-    objf /= num_frames;
-
-    if (n == 0 || objf > best_objf) {
-      best_objf = objf;
-      best_n = n;
-    }
-    objfs(n) = objf;
-  }
-  KALDI_LOG << "Objective functions for the source neural nets are " << objfs;
-
-  int32 num_uc = nnets[0].NumUpdatableComponents();
-
-  if (num_nnets > 1) { // Now try a version where all the neural nets have the
-                       // same weight.  Don't do this if num_nnets == 1 as
-                       // it would be a waste of time (identical to n == 0).
-    Vector<double> scale_params(num_uc * num_nnets);
-    scale_params.Set(1.0 / num_nnets);
-    Nnet average_nnet;
-    CombineNnets(scale_params, nnets, &average_nnet);
-    double num_frames;
-    double objf = ComputeNnetObjfParallel(average_nnet, config_.minibatch_size,
-                                          config_.num_threads, validation_set,
-                                          &num_frames);
-    objf /= num_frames;
-    KALDI_LOG << "Objf with all neural nets averaged is " << objf;
-    if (objf > best_objf) {
-      return num_nnets;
-    } else {
-      return best_n;
-    }
-  } else {
-    return best_n;
-  }
-}
-
-void CombineNnetsFast(const NnetCombineFastConfig &combine_config,
-                      const std::vector<NnetExample> &validation_set,
-                      const std::vector<Nnet> &nnets_in,
-                      Nnet *nnet_out) {
-  // Everything happens in the initializer.
-  FastNnetCombiner combiner(combine_config,
-                            validation_set,
-                            nnets_in,
-                            nnet_out);
-}
-
-
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/combine-nnet-fast.h b/src/nnet2/combine-nnet-fast.h
deleted file mode 100644
index 2d75586bf1f..00000000000
--- a/src/nnet2/combine-nnet-fast.h
+++ /dev/null
@@ -1,112 +0,0 @@
-// nnet2/combine-nnet-fast.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_COMBINE_NNET_FAST_H_
-#define KALDI_NNET2_COMBINE_NNET_FAST_H_
-
-#include "nnet2/nnet-update.h"
-#include "nnet2/nnet-compute.h"
-#include "util/parse-options.h"
-#include "itf/options-itf.h"
-
-
-// Compare with combine-nnet.h.  What we're doing is taking
-// a set of neural nets, and combining them with combination weights
-// (separate weights for each updatable layer), and optimizing
-// these weights using a validation set,
-
-// This is a faster implementation
-// with multi-threading and more careful preconditioning.
-// To get the pre-conditioning, we divide the validation subset
-// up into small-ish batches (e.g. 100 frames), and compute the
-// neural net gradient for each one.  We then compute the parameter
-// gradient (i.e. the gradient w.r.t. the combination weights we're
-// optimizing) for each batch, and use the scatter of these as a
-// kind of Fisher matrix for preconditioning.
-
-namespace kaldi {
-namespace nnet2 {
-
-/** Configuration class that controls neural net combination, where we combine a
-    number of neural nets, trying to find for each layer the optimal weighted
-    combination of the different neural-net parameters.
- */
-struct NnetCombineFastConfig {
-  int32 initial_model; // If provided, the index of the initial model to start
-  // the optimization from.
-  int32 num_lbfgs_iters; 
-  int32 num_threads;
-  BaseFloat initial_impr;
-  BaseFloat fisher_floor; // Flooring value we use for Fisher matrix (mainly
-                          // makes a difference in pnorm systems, where there
-                          // are don't-care directions in parameter space.
-  BaseFloat alpha; // A smoothing value we use in getting the Fisher matrix.
-  int32 fisher_minibatch_size; // e.g. 64; a relatively small minibatch size we
-  // use in the Fisher matrix computation (smaller will generally mean more accurate
-  // preconditioning but will slow down the computation).
-  int32 minibatch_size; // e.g. 1028; a larger minibatch size we use in
-  // the gradient computation.
-  int32 max_lbfgs_dim;
-  BaseFloat regularizer;
-  
-  NnetCombineFastConfig(): initial_model(-1), num_lbfgs_iters(10),
-                           num_threads(1), initial_impr(0.01), fisher_floor(1.0e-20),
-                           alpha(0.01), fisher_minibatch_size(64), minibatch_size(1024),
-                           max_lbfgs_dim(10), regularizer(0.0) {}
-  
-  void Register(OptionsItf *opts) {
-    opts->Register("initial-model", &initial_model, "Specifies where to start the "
-                   "optimization from.  If 0 ... #models-1, then specifies the model; "
-                   "if >= #models, then the average of all inputs; if <0, chosen "
-                   "automatically from the previous options.");
-    opts->Register("num-lbfgs-iters", &num_lbfgs_iters, "Maximum number of function "
-                   "evaluations for L-BFGS to use when optimizing combination weights");
-    opts->Register("initial-impr", &initial_impr, "Amount of objective-function change "
-                   "We aim for on the first iteration.");
-    opts->Register("num-threads", &num_threads, "Number of threads to use in "
-                   "multi-core computation");
-    opts->Register("fisher-floor", &fisher_floor,
-                   "Floor for diagonal of Fisher matrix (used in preconditioning)");
-    opts->Register("alpha", &alpha, "Value we use in smoothing the Fisher matrix "
-                   "with its diagonal, in preconditioning the update.");
-    opts->Register("fisher-minibatch-size", &fisher_minibatch_size, "Size of minibatch "
-                   "used in computation of Fisher matrix (smaller -> better "
-                   "preconditioning");
-    opts->Register("minibatch-size", &minibatch_size, "Minibatch size used in computing "
-                   "gradients (only affects speed)");
-    opts->Register("max-lbfgs-dim", &max_lbfgs_dim, "Maximum dimension to use in "
-                   "L-BFGS (will not get higher than this even if the dimension "
-                   "of the space gets higher.)");
-    opts->Register("regularizer", &regularizer, "Add to the objective "
-                   "function (which is average log-like per frame), -0.5 * "
-                   "regularizer * square of parameters.");
-  }  
-};
-
-void CombineNnetsFast(const NnetCombineFastConfig &combine_config,
-                      const std::vector<NnetExample> &validation_set,
-                      const std::vector<Nnet> &nnets_in,
-                      Nnet *nnet_out);
-  
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif
diff --git a/src/nnet2/combine-nnet.cc b/src/nnet2/combine-nnet.cc
deleted file mode 100644
index 417db1b84c4..00000000000
--- a/src/nnet2/combine-nnet.cc
+++ /dev/null
@@ -1,253 +0,0 @@
-// nnet2/combine-nnet.cc
-
-// Copyright 2012   Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/combine-nnet.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-// Here, "scale_params" is in blocks, with the first block
-// corresponding to nnets[0].
-static void CombineNnets(const Vector<BaseFloat> &scale_params,
-                         const std::vector<Nnet> &nnets,
-                         Nnet *dest) {
-  int32 num_nnets = nnets.size();
-  KALDI_ASSERT(num_nnets >= 1);
-  int32 num_uc = nnets[0].NumUpdatableComponents();
-  KALDI_ASSERT(num_nnets * nnets[0].NumUpdatableComponents());
-  
-  
-  *dest = nnets[0];
-  SubVector<BaseFloat> scale_params0(scale_params, 0, num_uc);
-  dest->ScaleComponents(scale_params0);
-  for (int32 n = 1; n < num_nnets; n++) {
-    SubVector<BaseFloat> scale_params_n(scale_params, n * num_uc, num_uc);
-    dest->AddNnet(scale_params_n, nnets[n]);
-  }
-}
-
-/// Returns an integer saying which model to use:
-/// either 0 ... num-models - 1 for the best individual model,
-/// or (#models) for the average of all of them.
-static int32 GetInitialModel(
-    const std::vector<NnetExample> &validation_set,
-    const std::vector<Nnet> &nnets) {
-  int32 minibatch_size = 1024;
-  int32 num_nnets = static_cast<int32>(nnets.size());
-  KALDI_ASSERT(!nnets.empty());
-  BaseFloat tot_frames = validation_set.size();
-  int32 best_n = -1;
-  BaseFloat best_objf = -std::numeric_limits<BaseFloat>::infinity();
-  Vector<BaseFloat> objfs(nnets.size());
-  for (int32 n = 0; n < num_nnets; n++) {
-    BaseFloat objf = ComputeNnetObjf(nnets[n], validation_set,
-                                     minibatch_size) / tot_frames;
-    
-    if (n == 0 || objf > best_objf) {
-      best_objf = objf;
-      best_n = n;
-    }
-    objfs(n) = objf;
-  }
-  KALDI_LOG << "Objective functions for the source neural nets are " << objfs;
-
-  int32 num_uc = nnets[0].NumUpdatableComponents();
-
-  { // Now try a version where all the neural nets have the same weight.
-    Vector<BaseFloat> scale_params(num_uc * num_nnets);
-    scale_params.Set(1.0 / num_nnets);
-    Nnet average_nnet;
-    CombineNnets(scale_params, nnets, &average_nnet);
-    BaseFloat objf = ComputeNnetObjf(average_nnet, validation_set,
-                                     minibatch_size) / tot_frames;
-    KALDI_LOG << "Objf with all neural nets averaged is " << objf;
-    if (objf > best_objf) {
-      return num_nnets;
-    } else {
-      return best_n;
-    }
-  }
-}
-
-// This function chooses from among the neural nets, the one
-// which has the best validation set objective function.
-static void GetInitialScaleParams(
-    const NnetCombineConfig &combine_config,
-    const std::vector<NnetExample> &validation_set,
-    const std::vector<Nnet> &nnets,
-    Vector<double> *scale_params) {
-
-  int32 initial_model = combine_config.initial_model,
-      num_nnets = static_cast<int32>(nnets.size());
-  if (initial_model < 0 || initial_model > num_nnets)
-    initial_model = GetInitialModel(validation_set, nnets);
-  
-  KALDI_ASSERT(initial_model >= 0 && initial_model <= num_nnets);
-  int32 num_uc = nnets[0].NumUpdatableComponents();
-
-  scale_params->Resize(num_uc * num_nnets);
-  if (initial_model < num_nnets) {
-    KALDI_LOG << "Initializing with neural net with index " << initial_model;
-    // At this point we're using the best of the individual neural nets.
-    scale_params->Set(0.0);
-    
-    // Set the block of parameters corresponding to the "best" of the
-    // source neural nets to
-    SubVector<double> best_block(*scale_params, num_uc * initial_model, num_uc);
-    best_block.Set(1.0);
-  } else { // initial_model == num_nnets
-    KALDI_LOG << "Initializing with all neural nets averaged.";
-    scale_params->Set(1.0 / num_nnets);
-  }
-}
-
-
-
-
-static double ComputeObjfAndGradient(
-    const std::vector<NnetExample> &validation_set,
-    const Vector<double> &scale_params,
-    const std::vector<Nnet> &nnets,
-    bool debug,
-    Vector<double> *gradient) {
-
-  Vector<BaseFloat> scale_params_float(scale_params);
-  
-  Nnet nnet_combined;
-  CombineNnets(scale_params_float, nnets, &nnet_combined);
-  
-  Nnet nnet_gradient(nnet_combined);
-  bool is_gradient = true;
-  nnet_gradient.SetZero(is_gradient);
-  
-  // note: "ans" is normalized by the total weight of validation frames.
-  int32 batch_size = 1024;
-  double ans = ComputeNnetGradient(nnet_combined,
-                                   validation_set,
-                                   batch_size,
-                                   &nnet_gradient);
-
-  double tot_frames = validation_set.size();
-  if (gradient != NULL) {
-    int32 i = 0; // index into scale_params.  
-    for (int32 n = 0; n < static_cast<int32>(nnets.size()); n++) {
-      for (int32 j = 0; j < nnet_combined.NumComponents(); j++) {
-        const UpdatableComponent *uc =
-            dynamic_cast<const UpdatableComponent*>(&(nnets[n].GetComponent(j))),
-            *uc_gradient =
-            dynamic_cast<const UpdatableComponent*>(&(nnet_gradient.GetComponent(j)));
-        if (uc != NULL) {
-          double dotprod = uc->DotProduct(*uc_gradient) / tot_frames;
-          (*gradient)(i) = dotprod; 
-          i++;
-        }
-      }
-    }
-    KALDI_ASSERT(i == scale_params.Dim());
-  }
-
-  if (debug) {
-    KALDI_LOG << "Double-checking gradient computation";
-    
-    Vector<BaseFloat> manual_gradient(scale_params.Dim());
-    for (int32 i = 0; i < scale_params.Dim(); i++) {
-      double delta = 1.0e-04, fg = fabs((*gradient)(i));
-      if (fg < 1.0e-07) fg = 1.0e-07;
-      if (fg * delta < 1.0e-05)
-        delta = 1.0e-05 / fg;
-      
-      Vector<double> scale_params_temp(scale_params);
-      scale_params_temp(i) += delta;
-      double new_ans = ComputeObjfAndGradient(validation_set,
-                                              scale_params_temp,
-                                              nnets,
-                                              false,
-                                              NULL);
-      manual_gradient(i) = (new_ans - ans) / delta;
-    }
-    KALDI_LOG << "Manually computed gradient is " << manual_gradient;
-    KALDI_LOG << "Gradient we computed is " << *gradient;
-  }
-  
-  return ans;
-}
-                                   
-
-void CombineNnets(const NnetCombineConfig &combine_config,
-                  const std::vector<NnetExample> &validation_set,
-                  const std::vector<Nnet> &nnets,
-                  Nnet *nnet_out) {
-
-  Vector<double> scale_params;
-
-  GetInitialScaleParams(combine_config,
-                        validation_set,
-                        nnets,
-                        &scale_params);
-
-  int32 dim = scale_params.Dim();
-  KALDI_ASSERT(dim > 0);
-  Vector<double> gradient(dim);
-  
-  double objf, initial_objf;
-
-  LbfgsOptions lbfgs_options;
-  lbfgs_options.minimize = false; // We're maximizing.
-  lbfgs_options.m = dim; // Store the same number of vectors as the dimension
-  // itself, so this is BFGS.
-  lbfgs_options.first_step_impr = combine_config.initial_impr;
-  
-  OptimizeLbfgs<double> lbfgs(scale_params,
-                              lbfgs_options);
-  
-  for (int32 i = 0; i < combine_config.num_bfgs_iters; i++) {    
-    scale_params.CopyFromVec(lbfgs.GetProposedValue());
-    objf = ComputeObjfAndGradient(validation_set,
-                                  scale_params,
-                                  nnets,
-                                  combine_config.test_gradient,
-                                  &gradient);
-
-    KALDI_VLOG(2) << "Iteration " << i << " scale-params = " << scale_params
-                  << ", objf = " << objf << ", gradient = " << gradient;
-    
-    if (i == 0) initial_objf = objf;
-    
-    lbfgs.DoStep(objf, gradient);
-  }
-
-  scale_params.CopyFromVec(lbfgs.GetValue(&objf));
-
-  Vector<BaseFloat> scale_params_float(scale_params);
-
-  KALDI_LOG << "Combining nnets, validation objf per frame changed from "
-            << initial_objf << " to " << objf;
-
-  Matrix<BaseFloat> scale_params_mat(nnets.size(),
-                                     nnets[0].NumUpdatableComponents());
-  scale_params_mat.CopyRowsFromVec(scale_params_float);
-  KALDI_LOG << "Final scale factors are " << scale_params_mat;
-  
-  CombineNnets(scale_params_float, nnets, nnet_out);
-}
- 
-  
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/combine-nnet.h b/src/nnet2/combine-nnet.h
deleted file mode 100644
index 83a15e41dce..00000000000
--- a/src/nnet2/combine-nnet.h
+++ /dev/null
@@ -1,74 +0,0 @@
-// nnet2/combine-nnet.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_COMBINE_NNET_H_
-#define KALDI_NNET2_COMBINE_NNET_H_
-
-#include "nnet2/nnet-update.h"
-#include "nnet2/nnet-compute.h"
-#include "util/parse-options.h"
-#include "itf/options-itf.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-/** Configuration class that controls neural net combination, where we combine a
-    number of neural nets, trying to find for each layer the optimal weighted
-    combination of the different neural-net parameters.
- */
-struct NnetCombineConfig {
-  int32 initial_model; // If provided, the index of the initial model to start
-  // the optimization from.
-  int32 num_bfgs_iters; // The dimension is small (e.g. 3 to 5 times the
-  // number of neural nets we were given, e.g. 10) so we do
-  // BFGS.  We actually implement this as L-BFGS but setting the number of
-  // vectors to be the same as the dimension of the space.  Note: this
-  // num-iters is in reality the number of function evaluations.
-  
-  BaseFloat initial_impr;
-  bool test_gradient;
-  NnetCombineConfig(): initial_model(-1), num_bfgs_iters(30),
-                       initial_impr(0.01),
-                       test_gradient(false) { }
-  
-  void Register(OptionsItf *opts) {
-    opts->Register("initial-model", &initial_model, "Specifies where to start the "
-                   "optimization from.  If 0 ... #models-1, then specifies the model; "
-                   "if #models, then the average of all inputs; otherwise, chosen "
-                   "automatically from the previous options.");
-    opts->Register("num-bfgs-iters", &num_bfgs_iters, "Maximum number of function "
-                   "evaluations for BFGS to use when optimizing combination weights");
-    opts->Register("initial-impr", &initial_impr, "Amount of objective-function change "
-                   "we aim for on the first iteration.");
-    opts->Register("test-gradient", &test_gradient, "If true, activate code that "
-                   "tests the gradient is accurate.");
-  }  
-};
-
-void CombineNnets(const NnetCombineConfig &combine_config,
-                  const std::vector<NnetExample> &validation_set,
-                  const std::vector<Nnet> &nnets_in,
-                  Nnet *nnet_out);
-  
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif
diff --git a/src/nnet2/decodable-am-nnet.h b/src/nnet2/decodable-am-nnet.h
deleted file mode 100644
index 6c40b11bf9d..00000000000
--- a/src/nnet2/decodable-am-nnet.h
+++ /dev/null
@@ -1,187 +0,0 @@
-// nnet2/decodable-am-nnet.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_DECODABLE_AM_NNET_H_
-#define KALDI_NNET2_DECODABLE_AM_NNET_H_
-
-#include <vector>
-#include "base/kaldi-common.h"
-#include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
-#include "itf/decodable-itf.h"
-#include "nnet2/am-nnet.h"
-#include "nnet2/nnet-compute.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-/// DecodableAmNnet is a decodable object that decodes
-/// with a neural net acoustic model of type AmNnet.
-
-class DecodableAmNnet: public DecodableInterface {
- public:
-  DecodableAmNnet(const TransitionModel &trans_model,
-                  const AmNnet &am_nnet,
-                  const CuMatrixBase<BaseFloat> &feats,
-                  bool pad_input = true, // if !pad_input, the NumIndices()
-                                         // will be < feats.NumRows().
-                  BaseFloat prob_scale = 1.0):
-      trans_model_(trans_model) {
-    // Note: we could make this more memory-efficient by doing the
-    // computation in smaller chunks than the whole utterance, and not
-    // storing the whole thing.  We'll leave this for later.
-    int32 num_rows = feats.NumRows() -
-        (pad_input ? 0 : am_nnet.GetNnet().LeftContext() +
-                         am_nnet.GetNnet().RightContext());
-    if (num_rows <= 0) {
-      KALDI_WARN << "Input with " << feats.NumRows()  << " rows will produce "
-                 << "empty output.";
-      return;
-    }
-    CuMatrix<BaseFloat> log_probs(num_rows, trans_model.NumPdfs());
-    // the following function is declared in nnet-compute.h
-    NnetComputation(am_nnet.GetNnet(), feats, pad_input, &log_probs);
-    log_probs.ApplyFloor(1.0e-20); // Avoid log of zero which leads to NaN.
-    log_probs.ApplyLog();
-    CuVector<BaseFloat> priors(am_nnet.Priors());
-    KALDI_ASSERT(priors.Dim() == trans_model.NumPdfs() &&
-                 "Priors in neural network not set up.");
-    priors.ApplyLog();
-    // subtract log-prior (divide by prior)
-    log_probs.AddVecToRows(-1.0, priors);
-    // apply probability scale.
-    log_probs.Scale(prob_scale);
-    // Transfer the log-probs to the CPU for faster access by the
-    // decoding process.
-    log_probs_.Swap(&log_probs);
-  }
-
-  // Note, frames are numbered from zero.  But transition_id is numbered
-  // from one (this routine is called by FSTs).
-  virtual BaseFloat LogLikelihood(int32 frame, int32 transition_id) {
-    return log_probs_(frame,
-                      trans_model_.TransitionIdToPdfFast(transition_id));
-  }
-
-  virtual int32 NumFramesReady() const { return log_probs_.NumRows(); }
-
-  // Indices are one-based!  This is for compatibility with OpenFst.
-  virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); }
-
-  virtual bool IsLastFrame(int32 frame) const {
-    KALDI_ASSERT(frame < NumFramesReady());
-    return (frame == NumFramesReady() - 1);
-  }
-
- protected:
-  const TransitionModel &trans_model_;
-  Matrix<BaseFloat> log_probs_; // actually not really probabilities, since we divide
-  // by the prior -> they won't sum to one.
-
-  KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmNnet);
-};
-
-/// This version of DecodableAmNnet is intended for a version of the decoder
-/// that processes different utterances with multiple threads.  It needs to do
-/// the computation in a different place than the initializer, since the
-/// initializer gets called in the main thread of the program.
-
-class DecodableAmNnetParallel: public DecodableInterface {
- public:
-  DecodableAmNnetParallel(
-      const TransitionModel &trans_model,
-      const AmNnet &am_nnet,
-      const CuMatrix<BaseFloat> *feats,
-      bool pad_input = true,
-      BaseFloat prob_scale = 1.0):
-      trans_model_(trans_model), am_nnet_(am_nnet), feats_(feats),
-      pad_input_(pad_input), prob_scale_(prob_scale) {
-    KALDI_ASSERT(feats_ != NULL);
-  }
-
-  void Compute() {
-    log_probs_.Resize(feats_->NumRows(), trans_model_.NumPdfs());
-    // the following function is declared in nnet-compute.h
-    NnetComputation(am_nnet_.GetNnet(), *feats_,
-                    pad_input_, &log_probs_);
-    log_probs_.ApplyFloor(1.0e-20); // Avoid log of zero which leads to NaN.
-    log_probs_.ApplyLog();
-    CuVector<BaseFloat> priors(am_nnet_.Priors());
-    KALDI_ASSERT(priors.Dim() == trans_model_.NumPdfs() &&
-                 "Priors in neural network not set up.");
-    priors.ApplyLog();
-    // subtract log-prior (divide by prior)
-    log_probs_.AddVecToRows(-1.0, priors);
-    // apply probability scale.
-    log_probs_.Scale(prob_scale_);
-    delete feats_;
-    feats_ = NULL;
-  }
-
-  // Note, frames are numbered from zero.  But state_index is numbered
-  // from one (this routine is called by FSTs).
-  virtual BaseFloat LogLikelihood(int32 frame, int32 transition_id) {
-    if (feats_) Compute(); // this function sets feats_ to NULL.
-    return log_probs_(frame,
-                      trans_model_.TransitionIdToPdfFast(transition_id));
-  }
-
-  int32 NumFramesReady() const {
-    if (feats_) {
-      if (pad_input_) return feats_->NumRows();
-      else {
-        int32 ans = feats_->NumRows() - am_nnet_.GetNnet().LeftContext() -
-            am_nnet_.GetNnet().RightContext();
-        if (ans < 0) ans = 0;
-        return ans;
-      }
-    } else {
-      return log_probs_.NumRows();
-    }
-  }
-
-  // Indices are one-based!  This is for compatibility with OpenFst.
-  virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); }
-
-  virtual bool IsLastFrame(int32 frame) const {
-    KALDI_ASSERT(frame < NumFramesReady());
-    return (frame == NumFramesReady() - 1);
-  }
-  ~DecodableAmNnetParallel() {
-    delete feats_;
-  }
- protected:
-  const TransitionModel &trans_model_;
-  const AmNnet &am_nnet_;
-  CuMatrix<BaseFloat> log_probs_; // actually not really probabilities, since we divide
-  // by the prior -> they won't sum to one.
-  const CuMatrix<BaseFloat> *feats_;
-  bool pad_input_;
-  BaseFloat prob_scale_;
-  KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmNnetParallel);
-};
-
-
-
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif  // KALDI_NNET2_DECODABLE_AM_NNET_H_
diff --git a/src/nnet2/get-feature-transform.cc b/src/nnet2/get-feature-transform.cc
deleted file mode 100644
index 38ec9bc3da9..00000000000
--- a/src/nnet2/get-feature-transform.cc
+++ /dev/null
@@ -1,203 +0,0 @@
-// nnet2/get-feature-transform.cc
-
-// Copyright 2009-2011  Jan Silovsky
-//                2013  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "nnet2/get-feature-transform.h"
-
-namespace kaldi {
-
-
-
-void FeatureTransformEstimate::Estimate(const FeatureTransformEstimateOptions &opts,
-                                        Matrix<BaseFloat> *M,
-                                        TpMatrix<BaseFloat> *C) const { 
-  double count;
-  Vector<double> total_mean;
-  SpMatrix<double> total_covar, between_covar;
-  GetStats(&total_covar, &between_covar, &total_mean, &count);
-  KALDI_LOG << "Data count is " << count;
-  EstimateInternal(opts, total_covar, between_covar, total_mean, M, C);
-}
-
-// static
-void FeatureTransformEstimate::EstimateInternal(
-    const FeatureTransformEstimateOptions &opts,
-    const SpMatrix<double> &total_covar,
-    const SpMatrix<double> &between_covar,
-    const Vector<double> &total_mean,
-    Matrix<BaseFloat> *M,
-    TpMatrix<BaseFloat> *C) {
-  
-  int32 target_dim = opts.dim, dim = total_covar.NumRows();
-  // Interpret zero or negative target_dim as the full dim
-  if (target_dim <= 0)
-    target_dim = dim;
-  // between-class covar is of most rank C-1
-  KALDI_ASSERT(target_dim <= dim);
-  
-  // within-class covariance
-  SpMatrix<double> wc_covar(total_covar);
-  wc_covar.AddSp(-1.0, between_covar);
-  TpMatrix<double> wc_covar_sqrt(dim);
-  try {
-    wc_covar_sqrt.Cholesky(wc_covar);
-    if (C != NULL) {
-      C->Resize(dim);
-      C->CopyFromTp(wc_covar_sqrt);
-    }
-  } catch (...) {
-    BaseFloat smooth = 1.0e-03 * wc_covar.Trace() / wc_covar.NumRows();
-    KALDI_LOG << "Cholesky failed (possibly not +ve definite), so adding " << smooth
-              << " to diagonal and trying again.\n";
-    for (int32 i = 0; i < wc_covar.NumRows(); i++)
-      wc_covar(i, i) += smooth;
-    wc_covar_sqrt.Cholesky(wc_covar);    
-  }
-  Matrix<double> wc_covar_sqrt_mat(wc_covar_sqrt);
-  wc_covar_sqrt_mat.Invert();
-
-  SpMatrix<double> tmp_sp(dim);
-  tmp_sp.AddMat2Sp(1.0, wc_covar_sqrt_mat, kNoTrans, between_covar, 0.0);
-  Matrix<double> tmp_mat(tmp_sp);
-  Matrix<double> svd_u(dim, dim), svd_vt(dim, dim);
-  Vector<double> svd_d(dim);
-  tmp_mat.Svd(&svd_d, &svd_u, &svd_vt);
-  SortSvd(&svd_d, &svd_u);
-
-  KALDI_LOG << "LDA singular values are " << svd_d;
-
-  KALDI_LOG << "Sum of all singular values is " << svd_d.Sum();
-  KALDI_LOG << "Sum of selected singular values is " <<
-      SubVector<double>(svd_d, 0, target_dim).Sum();
-  
-  Matrix<double> lda_mat(dim, dim);
-  lda_mat.AddMatMat(1.0, svd_u, kTrans, wc_covar_sqrt_mat, kNoTrans, 0.0);
-
-  // finally, copy first target_dim rows to m
-  M->Resize(target_dim, dim);
-  M->CopyFromMat(lda_mat.Range(0, target_dim, 0, dim));
-  
-  if (opts.within_class_factor != 1.0) {
-    for (int32 i = 0; i < svd_d.Dim(); i++) {
-      BaseFloat old_var = 1.0 + svd_d(i), // the total variance of that dim..
-          new_var = opts.within_class_factor + svd_d(i), // the variance we want..
-          scale = sqrt(new_var / old_var);
-      if (i < M->NumRows())
-        M->Row(i).Scale(scale);
-    }
-  }
-
-  if (opts.max_singular_value > 0.0) {
-    int32 rows = M->NumRows(), cols = M->NumCols(),
-        min_dim = std::min(rows, cols);
-    Matrix<BaseFloat> U(rows, min_dim), Vt(min_dim, cols);
-    Vector<BaseFloat> s(min_dim);
-    M->Svd(&s, &U, &Vt); // decompose m = U diag(s) Vt.
-    BaseFloat max_s = s.Max();
-    int32 n;
-    s.ApplyCeiling(opts.max_singular_value, &n);
-    if (n > 0) {
-      KALDI_LOG << "Applied ceiling to " << n << " out of " << s.Dim()
-                << " singular values of transform using ceiling "
-                << opts.max_singular_value << ", max is " << max_s;
-      Vt.MulRowsVec(s);
-      // reconstruct m with the modified singular values:
-      M->AddMatMat(1.0, U, kNoTrans, Vt, kNoTrans, 0.0);
-    }
-  }
-
-  if (opts.remove_offset)
-    AddMeanOffset(total_mean, M);
-}
-
-void FeatureTransformEstimateMulti::EstimateTransformPart(
-    const FeatureTransformEstimateOptions &opts,
-    const std::vector<int32> &indexes,
-    const SpMatrix<double> &total_covar,
-    const SpMatrix<double> &between_covar,
-    const Vector<double> &mean,
-    Matrix<BaseFloat> *M) const {
-
-  int32 full_dim = Dim(), proj_dim = indexes.size();
-  Matrix<double> transform(proj_dim, full_dim); // projects from full to projected dim.
-  for (int32 i = 0; i < proj_dim; i++)
-    transform(i, indexes[i]) = 1.0;
-
-  SpMatrix<double> total_covar_proj(proj_dim), between_covar_proj(proj_dim);
-  Vector<double> mean_proj(proj_dim);
-  total_covar_proj.AddMat2Sp(1.0, transform, kNoTrans, total_covar, 0.0);
-  between_covar_proj.AddMat2Sp(1.0, transform, kNoTrans, between_covar, 0.0);
-  mean_proj.AddMatVec(1.0, transform, kNoTrans, mean, 0.0);
-
-  Matrix<BaseFloat> M_proj;
-  FeatureTransformEstimateOptions opts_tmp(opts);
-  opts_tmp.dim = proj_dim;
-  EstimateInternal(opts_tmp, total_covar_proj, between_covar_proj, mean_proj,
-                   &M_proj, NULL);
-  if (M_proj.NumCols() == proj_dim + 1) { // Extend transform to add the extra "1" that we
-                                          // use to handle mean shifts..
-    transform.Resize(proj_dim + 1, full_dim + 1, kCopyData);
-    transform(proj_dim, full_dim) = 1.0;
-  }
-  M->Resize(proj_dim, transform.NumCols());
-  // Produce output..
-  M->AddMatMat(1.0, M_proj, kNoTrans, Matrix<BaseFloat>(transform),
-               kNoTrans, 0.0);
-}
-
-void FeatureTransformEstimateMulti::Estimate(
-    const FeatureTransformEstimateOptions &opts,
-    const std::vector<std::vector<int32> > &indexes,
-    Matrix<BaseFloat> *M) const {
-
-  int32 input_dim = Dim(), output_dim = 0, num_transforms = indexes.size();
-  for (int32 i = 0; i < num_transforms; i++) { // some input-checking.
-    KALDI_ASSERT(indexes[i].size() > 0);
-    std::vector<int32> this_indexes(indexes[i]);
-    std::sort(this_indexes.begin(), this_indexes.end());
-    KALDI_ASSERT(IsSortedAndUniq(this_indexes)); // check for duplicates.
-    KALDI_ASSERT(this_indexes.front() >= 0);
-    KALDI_ASSERT(this_indexes.back() < input_dim);
-    output_dim += this_indexes.size();
-  }
-
-  int32 input_dim_ext = (opts.remove_offset ? input_dim + 1 : input_dim);
-  M->Resize(output_dim, input_dim_ext);
-  
-  double count;
-  Vector<double> total_mean;
-  SpMatrix<double> total_covar, between_covar;
-  GetStats(&total_covar, &between_covar, &total_mean, &count);
-
-  int32 cur_output_index = 0;
-  for (int32 i = 0; i < num_transforms; i++) {
-    Matrix<BaseFloat> M_tmp;
-    EstimateTransformPart(opts, indexes[i], total_covar, between_covar,
-                          total_mean, &M_tmp);
-    int32 this_output_dim = indexes[i].size();
-    M->Range(cur_output_index, this_output_dim, 0, M->NumCols()).
-        CopyFromMat(M_tmp);
-    cur_output_index += this_output_dim;
-  }
-  
-}
-
-
-}  // End of namespace kaldi
diff --git a/src/nnet2/get-feature-transform.h b/src/nnet2/get-feature-transform.h
deleted file mode 100644
index a1ba31a36bf..00000000000
--- a/src/nnet2/get-feature-transform.h
+++ /dev/null
@@ -1,180 +0,0 @@
-// nnet2/get-feature-transform.h
-
-// Copyright 2009-2011  Jan Silovsky
-//                2013  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_GET_FEATURE_TRANSFORM_H_
-#define KALDI_NNET2_GET_FEATURE_TRANSFORM_H_
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "matrix/matrix-lib.h"
-#include "transform/lda-estimate.h"
-
-namespace kaldi {
-
-/**
-   @file
-   This file is modified from transform/lda-estimate.h
-   It contains a class intended to be used in preconditioning
-   data for neural network training.  See the documentation for class
-   FeatureTransformEstimate for more details.
-*/
-
-struct FeatureTransformEstimateOptions {
-  bool remove_offset;
-  int32 dim;
-  BaseFloat within_class_factor;
-  BaseFloat max_singular_value;
-  FeatureTransformEstimateOptions(): remove_offset(true), dim(-1),
-                                     within_class_factor(0.001), max_singular_value(5.0) { }
-  
-  void Register(OptionsItf *opts) {
-    opts->Register("remove-offset", &remove_offset, "If true, output an affine "
-                   "transform that makes the projected data mean equal to zero.");
-    opts->Register("dim", &dim, "Dimension to project to with LDA");
-    opts->Register("within-class-factor", &within_class_factor, "If 1.0, do "
-                   "conventional LDA where the within-class variance will be "
-                   "unit in the projected space.  May be set to less than 1.0, "
-                   "which scales the features to have less variance, particularly "
-                   "for dimensions where between-class variance is small. ");
-    opts->Register("max-singular-value", &max_singular_value, "If >0, maximum "
-                   "allowed singular value of final transform (they are floored "
-                   "to this)");
-  }    
-};
-
-/**
-     Class for computing a feature transform used for preconditioning of the
-     training data in neural-networks.
-
-     By preconditioning here, all we really mean is an affine transform of the
-     input data-- say if we set up the classification as going from vectors x_i
-     to labels y_i, then this would be a linear transform on X, so we replace
-     x_i with x'_i = A x_i + b.  The statistics we use to obtain this transform
-     are the within-class and between class variance statistics, and the global
-     data mean, that we would use to estimate LDA.  When designing this, we had
-     a few principles in mind:
-        - We want to remove the global mean of the input features (this is
-          well established, I think there is a paper by LeCun explaining why
-          this is a good thing).
-        - We would like the transform to make the training process roughly
-          invariant to linear transformations of the input features, meaning
-          that whatever linear transformation you apply prior to this transform,
-          it should 'undo' it.
-        - We want directions in which there is a lot of between-class variance
-          to be given a higher variance than directions that have mostly
-          within-class variance-- it has been our experience that these
-          'nuisance directions' will interfere with the training if they are
-          given too large a scaling.
-     It is essential to our method that the number of classes is higher than
-     the dimension of the input feature space, which is normal for speech
-     recognition tasks (~5000 > ~250).
-
-     Basically our method is as follows:
-
-       - First subtract the mean.
-       - Get the within-class and between-class stats, as for LDA.
-       - Normalize the space as for LDA, so that the within-class covariance
-         matrix is unit and the between-class covariance matrix is diagonalized
-       - At this stage, if the user asked for dimension reduction then
-         reduce the dimension by taking out dimensions with least between-class
-         variance [note: the current scripts do not do this by default]
-       - Apply a transform that reduces the variance of dimensions
-         with low between-class variance, as we'll describe below.
-       - Finally, do an SVD of the resulting transform, A = U S V^T, apply a
-         maximum to the diagonal elements of the matrix S (e.g. 5.0), and
-         reconstruct A' = U S' V^T; this is the final transform.  The point of
-         this stage is to stop the transform from 'blowing up' any dimensions of
-         the space excessively; this stage was introduced in response to a
-         problem we encountered at one point, and I think normally not very many
-         dimensions of S end up getting floored.
-
-      We need to explain the step that applies the dimension-specific scaling,
-      which we described above as, "Apply a transform that reduces the variance
-      of dimensions with low between-class variance".  For a particular
-      dimension, let the between-class diagonal covariance element be \lambda_i,
-      and the within-class diagonal covariance is 1 at this point (since we
-      have normalized the within-class covariance to unity); hence, the total
-      variance is \lambda_i + 1.
-      Below, "within-class-factor" is a constant that we set by default to
-      0.001.  We scale the i'th dimension of the features by:
-      
-         \f$  sqrt( (within-class-factor + \lambda_i) / (1 + \lambda_i) ) \f$
-           
-      If \lambda_i >> 1, this scaling factor approaches 1 (we don't need to
-      scale up dimensions with high between-class variance as they already
-      naturally have a higher variance than other dimensions.  As \lambda_i
-      becomes small, this scaling factor approaches sqrt(within-class-factor),
-      so dimensions with very small between-class variance get assigned a small
-      variance equal to within-class-factor, and for dimensions with
-      intermediate between-class variance, they end up with a variance roughly
-      equal to \lambda_i: consider that the variance was originally (1 +
-      \lambda_i), so by scaling the features by approximately sqrt((\lambda_i) /
-      (1 + \lambda_i)), the variance becomes approximately \lambda_i [this is
-      clear after noting that the variance gets scaled by the square of the
-      feature scale].      
- */
-class FeatureTransformEstimate: public LdaEstimate {
- public:
-  /// Estimates the LDA transform matrix m.  If Mfull != NULL, it also outputs
-  /// the full matrix (without dimensionality reduction), which is useful for
-  /// some purposes.  If opts.remove_offset == true, it will output both matrices
-  /// with an extra column which corresponds to mean-offset removal (the matrix
-  /// should be multiplied by the feature with a 1 appended to give the correct
-  /// result, as with other Kaldi transforms.)
-  /// "within_cholesky" is a pointer to an SpMatrix that, if non-NULL, will
-  /// be set to the Cholesky factor of the within-class covariance matrix.
-  /// This is used for perturbing features.
-  void Estimate(const FeatureTransformEstimateOptions &opts,
-                Matrix<BaseFloat> *M,
-                TpMatrix<BaseFloat> *within_cholesky) const;
- protected:
-  static void EstimateInternal(const FeatureTransformEstimateOptions &opts,
-                               const SpMatrix<double> &total_covar,
-                               const SpMatrix<double> &between_covar,
-                               const Vector<double> &mean,
-                               Matrix<BaseFloat> *M,
-                               TpMatrix<BaseFloat> *C);
-};
-
-
-class FeatureTransformEstimateMulti: public FeatureTransformEstimate {
- public:
-  /// This is as FeatureTransformEstimate, but for use in
-  /// nnet-get-feature-transform-multi.cc, see the usage message
-  /// of that program for a description of what it does.
-  void Estimate(const FeatureTransformEstimateOptions &opts,
-                const std::vector<std::vector<int32> > &indexes,
-                Matrix<BaseFloat> *M) const;
-
- private:
-  void EstimateTransformPart(const FeatureTransformEstimateOptions &opts,
-                             const std::vector<int32> &indexes,
-                             const SpMatrix<double> &total_covar,
-                             const SpMatrix<double> &between_covar,
-                             const Vector<double> &mean,
-                             Matrix<BaseFloat> *M) const;
-};
-
-
-
-}  // End namespace kaldi
-
-#endif  // KALDI_NNET2_GET_FEATURE_TRANSFORM_H_
-
diff --git a/src/nnet2/mixup-nnet.cc b/src/nnet2/mixup-nnet.cc
deleted file mode 100644
index a7d3723d08e..00000000000
--- a/src/nnet2/mixup-nnet.cc
+++ /dev/null
@@ -1,222 +0,0 @@
-// nnet2/mixup-nnet.cc
-
-// Copyright 2012   Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/mixup-nnet.h"
-#include "gmm/model-common.h" // for GetSplitTargets()
-#include <numeric> // for std::accumulate
-
-namespace kaldi {
-namespace nnet2 {
-
-
-/** This function makes sure the neural net ends with a
-    SumGroupComponent.  If it doesn't, it adds one
-    (with a single mixture/matrix corresponding to each
-    output element.)  [Before doing so, it makes sure
-    that the last layer is a SoftmaxLayer, which is what
-    we expect.  You can remove this check if there is some
-    use-case that makes sense where the type of the previous
-    layer is different.
- */
-static void GiveNnetCorrectTopology(Nnet *nnet,
-                                    AffineComponent **affine_component,
-                                    SoftmaxComponent **softmax_component,
-                                    SumGroupComponent **sum_group_component) {
-  int32 nc = nnet->NumComponents();
-  KALDI_ASSERT(nc > 0);
-  Component* component = &(nnet->GetComponent(nc - 1));
-  if ((*sum_group_component =
-       dynamic_cast<SumGroupComponent*>(component)) == NULL) {
-    KALDI_LOG << "Adding SumGroupComponent to neural net.";
-    int32 dim = component->OutputDim();
-    // Give it the same learning rate as the first updatable layer we have.
-    std::vector<int32> sizes(dim, 1); // a vector of all ones, of dimension "dim".
-  
-    *sum_group_component = new SumGroupComponent();
-    (*sum_group_component)->Init(sizes);
-    nnet->Append(*sum_group_component);
-    nc++;
-  }
-  component = &(nnet->GetComponent(nc - 2));    
-  if ((*softmax_component = dynamic_cast<SoftmaxComponent*>(component)) == NULL)
-    KALDI_ERR << "Neural net has wrong topology: expected second-to-last "
-              << "component to be SoftmaxComponent, type is "
-              << component->Type();
-  component = &(nnet->GetComponent(nc - 3));
-  if ((*affine_component = dynamic_cast<AffineComponent*>(component)) == NULL)
-    KALDI_ERR << "Neural net has wrong topology: expected third-to-last "
-              << "component to be AffineComponent, type is "
-              << component->Type();
-}
-
-
-/**
-   This function works as follows.
-   We first make sure the neural net has the correct topology, so its
-   last component is a SumGroupComponent.
-
-   We then get the counts for each matrix in the SumGroupComponent (these
-   will either correspond to leaves in the decision tree, or level-1 leaves, if
-   we have a 2-level-tree system).  We work out the total count for each of these
-   matrices, by getting the count from the SoftmaxComponent.
-
-   We then increase, if necessary, the dimensions that the SumGroupComponent sums
-   over increase the dimension of the SoftmaxComponent if necessary, and duplicate
-   and then perturb the relevant rows of the AffineComponent.
- */
-
-
-
-void MixupNnet(const NnetMixupConfig &mixup_config,
-               Nnet *nnet) {
-  AffineComponent *affine_component = NULL;
-  SoftmaxComponent *softmax_component = NULL;
-  SumGroupComponent *sum_group_component = NULL;
-  GiveNnetCorrectTopology(nnet,
-                          &affine_component,
-                          &softmax_component,
-                          &sum_group_component); // Adds a SumGroupComponent if needed.
-  
-  softmax_component->MixUp(mixup_config.num_mixtures,
-                           mixup_config.power,
-                           mixup_config.min_count,
-                           mixup_config.perturb_stddev,
-                           affine_component,
-                           sum_group_component);
-  nnet->Check(); // Checks that dimensions all match up.
-}
-
-
-/// Allocate mixtures to states via a power rule, and add any new mixtures.
-void SoftmaxComponent::MixUp(int32 num_mixtures,
-                             BaseFloat power,
-                             BaseFloat min_count,
-                             BaseFloat perturb_stddev,
-                             AffineComponent *ac,
-                             SumGroupComponent *sc) {
-  // "counts" is derived from this->counts_ by summing.
-  std::vector<int32> old_sizes;
-  sc->GetSizes(&old_sizes);
-  Vector<BaseFloat> counts(old_sizes.size());
-  int32 old_dim = 0;
-  for (size_t i = 0; i < old_sizes.size(); i++) {
-    int32 this_input_dim = old_sizes[i];
-    BaseFloat this_tot_count = 0.0; /// Total the count out of
-    /// all the output dims of the softmax layer that correspond
-    /// to this mixture.  We'll use this total to allocate new quasi-Gaussians.
-    for (int32 d = 0; d < this_input_dim; d++, old_dim++)
-      this_tot_count += this->value_sum_(old_dim);
-    counts(i) = this_tot_count;
-  }
-  KALDI_ASSERT(old_dim == value_sum_.Dim());
-  KALDI_ASSERT(counts.Sum() > 0 && "Cannot do mixing up without counts.");
-
-  std::vector<int32> targets; // #mixtures for each state.
-
-
-  // Get the target number of mixtures for each state.
-  GetSplitTargets(counts, num_mixtures, power, min_count, &targets);
-  KALDI_ASSERT(targets.size() == old_sizes.size());
-  std::vector<int32> new_sizes(old_sizes.size());
-  for (size_t i = 0; i < targets.size(); i++)
-    new_sizes[i] = std::max(targets[i], old_sizes[i]);
-  int32 new_dim = std::accumulate(new_sizes.begin(), new_sizes.end(),
-                                  static_cast<int32>(0)),
-      affine_input_dim = ac->InputDim();
-  KALDI_ASSERT(new_dim >= old_dim);
-  sc->Init(new_sizes);
-  
-  // bias and linear terms from affine component:
-  Vector<BaseFloat> old_bias_term(ac->bias_params_);
-  Matrix<BaseFloat> old_linear_term(ac->linear_params_);
-  
-  Vector<BaseFloat> new_bias_term(new_dim);
-  Matrix<BaseFloat> new_linear_term(new_dim, affine_input_dim);
-  Vector<BaseFloat> new_counts(new_dim);
-
-  // old_offset and new_offset are offsets into the dimension at the
-  // input/output of the softmax component, before and after mixing up
-  // respectively.  They get incremented in the following loop.
-  int32 old_offset = 0, new_offset = 0;
-  Vector<BaseFloat> old_counts(this->value_sum_);
-  for (size_t i = 0; i < old_sizes.size(); i++) {
-    int32 this_old_dim = old_sizes[i],
-          this_new_dim = new_sizes[i],
-          this_cur_dim = this_old_dim; // this_cur_dim is loop variable.
-    
-    SubMatrix<BaseFloat> this_old_linear_term(old_linear_term,
-                                              old_offset, this_old_dim,
-                                              0, affine_input_dim),
-        this_new_linear_term(new_linear_term,
-                             new_offset, this_new_dim,
-                             0, affine_input_dim);
-    SubVector<BaseFloat> this_old_bias_term(old_bias_term,
-                                            old_offset, this_old_dim),
-        this_new_bias_term(new_bias_term, new_offset, this_new_dim),
-        this_old_counts(old_counts,
-                        old_offset, this_old_dim),
-        this_new_counts(new_counts,
-                        new_offset, this_new_dim);
-    
-    // Copy the same-dimensional part of the parameters and counts.
-    this_new_linear_term.Range(0, this_old_dim, 0, affine_input_dim).
-        CopyFromMat(this_old_linear_term);
-    this_new_bias_term.Range(0, this_old_dim).
-        CopyFromVec(this_old_bias_term);
-    this_new_counts.Range(0, this_old_dim).
-        CopyFromVec(this_old_counts);
-    // this_new_params is the mixture weights.
-    // Add the new components...
-    for (; this_cur_dim < this_new_dim; this_cur_dim++) {
-      BaseFloat *count_begin = this_new_counts.Data(),
-          *count_end  = count_begin + this_cur_dim,
-          *count_max = std::max_element(count_begin, count_end);
-      KALDI_ASSERT(*count_max > 0.0);
-      *count_max *= 0.5;
-      *count_end = *count_max; // count for the element we're adding.
-      int32 max_index = static_cast<int32>(count_max - count_begin),
-          new_index = this_cur_dim;
-      SubVector<BaseFloat> cur_vec(this_new_linear_term, max_index),
-          new_vec(this_new_linear_term, new_index);
-      new_vec.CopyFromVec(cur_vec);
-      Vector<BaseFloat> rand(affine_input_dim);
-      rand.SetRandn();
-      cur_vec.AddVec(perturb_stddev, rand);
-      new_vec.AddVec(-perturb_stddev, rand);
-      this_new_bias_term(max_index) += Log(0.5);
-      this_new_bias_term(new_index) = this_new_bias_term(max_index);
-    }
-    old_offset += this_old_dim;
-    new_offset += this_new_dim;
-  }
-  KALDI_ASSERT(old_offset == old_dim && new_offset == new_dim);
-  ac->SetParams(new_bias_term, new_linear_term);
-  this->value_sum_.Resize(new_counts.Dim());
-  this->value_sum_.CopyFromVec(new_counts);
-  this->count_ = this->value_sum_.Sum();
-  this->dim_ = new_dim;
-  KALDI_LOG << "Mixed up from dimension of " << old_dim << " to " << new_dim
-            << " in the softmax layer.";
-}
-
-
-
-  
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/mixup-nnet.h b/src/nnet2/mixup-nnet.h
deleted file mode 100644
index 79dfa074e96..00000000000
--- a/src/nnet2/mixup-nnet.h
+++ /dev/null
@@ -1,69 +0,0 @@
-// nnet2/mixup-nnet.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_MIXUP_NNET_H_
-#define KALDI_NNET2_MIXUP_NNET_H_
-
-#include "nnet2/nnet-update.h"
-#include "nnet2/nnet-compute.h"
-#include "itf/options-itf.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-struct NnetMixupConfig {
-  BaseFloat power;
-  BaseFloat min_count;
-  int32 num_mixtures;
-  BaseFloat perturb_stddev;
-  
-  
-  NnetMixupConfig(): power(0.25), min_count(1000.0),
-                     num_mixtures(-1), perturb_stddev(0.01) { }
-  
-  void Register(OptionsItf *opts) {
-    opts->Register("power", &power, "Scaling factor used in determining the "
-                   "number of mixture components to use for each HMM state "
-                   "(or group of HMM states)");
-    opts->Register("min-count", &min_count, "Minimum count for a quasi-Gaussian, "
-                   "enforced while allocating mixtures (obscure parameter).");
-    opts->Register("num-mixtures", &num_mixtures, "If specified, total number of "
-                   "mixture components to mix up to (should be at least the "
-                   "#leaves in the system");
-    opts->Register("perturb-stddev", &perturb_stddev, "Standard deviation used "
-                   "when perturbing parameters during mixing up");
-  }  
-};
-
-/**
-  This function does something similar to Gaussian mixture splitting for
-  GMMs, except applied to the output layer of the neural network.
-  We create additional outputs, which will be summed over using a
-  SumGroupComponent.
-*/
-
-void MixupNnet(const NnetMixupConfig &mixup_config,
-               Nnet *nnet);
-  
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif
diff --git a/src/nnet2/nnet-component-test.cc b/src/nnet2/nnet-component-test.cc
deleted file mode 100644
index 04e476c01bd..00000000000
--- a/src/nnet2/nnet-component-test.cc
+++ /dev/null
@@ -1,915 +0,0 @@
-// nnet2/nnet-component-test.cc
-
-// Copyright 2012-2014  Johns Hopkins University (author:  Daniel Povey)
-//                2015  Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-component.h"
-#include "util/common-utils.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-void UnitTestGenericComponentInternal(const Component &component,
-                                      const ChunkInfo in_info,
-                                      const ChunkInfo out_info)  {
-
-  CuMatrix<BaseFloat> input(in_info.NumRows(), in_info.NumCols()),
-      output(1, out_info.NumRows() * out_info.NumCols());
-  input.SetRandn();
-  CuVector<BaseFloat> objf_vec(out_info.NumCols()); // objective function is linear function of output.
-  objf_vec.SetRandn(); // set to Gaussian noise.
-
-  int32 rand_seed = Rand();
-
-  RandomComponent *rand_component =
-      const_cast<RandomComponent*>(dynamic_cast<const RandomComponent*>(&component));
-  if (rand_component != NULL) {
-    srand(rand_seed);
-    rand_component->ResetGenerator();
-  }
-  component.Propagate(in_info, out_info, input, &output);
-  {
-    bool binary = (Rand() % 2 == 0);
-    Output ko("tmpf", binary);
-    component.Write(ko.Stream(), binary);
-  }
-  Component *component_copy;
-  {
-    bool binary_in;
-    Input ki("tmpf", &binary_in);
-    component_copy = Component::ReadNew(ki.Stream(), binary_in);
-  }
-  unlink("tmpf");
-
-  { // Test backward derivative is correct.
-    CuVector<BaseFloat> output_objfs(out_info.NumRows());
-    output_objfs.AddMatVec(1.0, output, kNoTrans, objf_vec, 0.0);
-    BaseFloat objf = output_objfs.Sum();
-
-
-    CuMatrix<BaseFloat> output_deriv(output.NumRows(), output.NumCols());
-    for (int32 i = 0; i < output_deriv.NumRows(); i++)
-      output_deriv.Row(i).CopyFromVec(objf_vec);
-
-    CuMatrix<BaseFloat> input_deriv(input.NumRows(), input.NumCols());
-
-
-    CuMatrix<BaseFloat> empty_mat;
-    CuMatrix<BaseFloat> &input_ref =
-        (component_copy->BackpropNeedsInput() ? input : empty_mat),
-        &output_ref =
-        (component_copy->BackpropNeedsOutput() ? output : empty_mat);
-
-    component_copy->Backprop(in_info, out_info, input_ref, output_ref,
-                             output_deriv, NULL, &input_deriv);
-
-    int32 num_ok = 0, num_bad = 0, num_tries = 10;
-    KALDI_LOG << "Comparing feature gradients " << num_tries << " times.";
-    for (int32 i = 0; i < num_tries; i++) {
-      CuMatrix<BaseFloat> perturbed_input(input.NumRows(), input.NumCols());
-      {
-        RandomComponent *rand_component =
-            const_cast<RandomComponent*>(dynamic_cast<const RandomComponent*>(&component));
-        if (rand_component != NULL) {
-          srand(rand_seed);
-          rand_component->ResetGenerator();
-        }
-      }
-      perturbed_input.SetRandn();
-      perturbed_input.Scale(1.0e-04); // scale by a small amount so it's like a delta.
-      BaseFloat predicted_difference = TraceMatMat(perturbed_input,
-                                                   input_deriv, kTrans);
-      perturbed_input.AddMat(1.0, input); // now it's the input + a delta.
-      { // Compute objf with perturbed input and make sure it matches
-        // prediction.
-        CuMatrix<BaseFloat> perturbed_output(output.NumRows(), output.NumCols());
-        {
-          RandomComponent *rand_component =
-              const_cast<RandomComponent*>(dynamic_cast<const RandomComponent*>(&component));
-          if (rand_component != NULL) {
-            srand(rand_seed);
-            rand_component->ResetGenerator();
-          }
-        }
-        component.Propagate(in_info, out_info, perturbed_input, &perturbed_output);
-        CuVector<BaseFloat> perturbed_output_objfs(out_info.NumRows());
-        perturbed_output_objfs.AddMatVec(1.0, perturbed_output, kNoTrans,
-                                         objf_vec, 0.0);
-        BaseFloat perturbed_objf = perturbed_output_objfs.Sum(),
-             observed_difference = perturbed_objf - objf;
-        KALDI_LOG << "Input gradients: comparing " << predicted_difference
-                  << " and " << observed_difference;
-        if (fabs(predicted_difference - observed_difference) >
-            0.15 * fabs((predicted_difference + observed_difference)/2) &&
-            fabs(predicted_difference - observed_difference) > 1.0e-06) {
-          KALDI_WARN << "Bad difference!";
-          num_bad++;
-        } else {
-          num_ok++;
-        }
-      }
-    }
-    KALDI_LOG << "Succeeded for " << num_ok << " out of " << num_tries
-              << " tries.";
-    if (num_ok <= num_bad) {
-      delete component_copy;
-      KALDI_ERR << "Feature-derivative check failed";
-    }
-  }
-
-  UpdatableComponent *ucomponent =
-      dynamic_cast<UpdatableComponent*>(component_copy);
-
-  if (ucomponent != NULL) { // Test parameter derivative is correct.
-
-    int32 num_ok = 0, num_bad = 0, num_tries = 10;
-    KALDI_LOG << "Comparing model gradients " << num_tries << " times.";
-    for (int32 i = 0; i < num_tries; i++) {
-      UpdatableComponent *perturbed_ucomponent =
-          dynamic_cast<UpdatableComponent*>(ucomponent->Copy()),
-          *gradient_ucomponent =
-          dynamic_cast<UpdatableComponent*>(ucomponent->Copy());
-      KALDI_ASSERT(perturbed_ucomponent != NULL);
-      gradient_ucomponent->SetZero(true); // set params to zero and treat as gradient.
-      BaseFloat perturb_stddev = 5.0e-04;
-      perturbed_ucomponent->PerturbParams(perturb_stddev);
-
-      CuVector<BaseFloat> output_objfs(out_info.NumRows());
-      output_objfs.AddMatVec(1.0, output, kNoTrans, objf_vec, 0.0);
-      BaseFloat objf = output_objfs.Sum();
-
-      CuMatrix<BaseFloat> output_deriv(output.NumRows(), output.NumCols());
-      for (int32 i = 0; i < output_deriv.NumRows(); i++)
-        output_deriv.Row(i).CopyFromVec(objf_vec);
-      CuMatrix<BaseFloat> input_deriv; // (input.NumRows(), input.NumCols());
-
-      // This will compute the parameter gradient.
-      ucomponent->Backprop(in_info, out_info, input, output, output_deriv,
-                           gradient_ucomponent, &input_deriv);
-
-      // Now compute the perturbed objf.
-      BaseFloat objf_perturbed;
-      {
-        CuMatrix<BaseFloat> output_perturbed; // (num_egs, output_dim);
-        {
-          RandomComponent *rand_component =
-              const_cast<RandomComponent*>(dynamic_cast<const RandomComponent*>(&component));
-          if (rand_component != NULL) {
-            srand(rand_seed);
-            rand_component->ResetGenerator();
-          }
-        }
-        perturbed_ucomponent->Propagate(in_info, out_info, input, &output_perturbed);
-        CuVector<BaseFloat> output_objfs_perturbed(out_info.NumRows());
-        output_objfs_perturbed.AddMatVec(1.0, output_perturbed,
-                                         kNoTrans, objf_vec, 0.0);
-        objf_perturbed = output_objfs_perturbed.Sum();
-      }
-
-      BaseFloat delta_objf_observed = objf_perturbed - objf,
-          delta_objf_predicted = (perturbed_ucomponent->DotProduct(*gradient_ucomponent) -
-                                  ucomponent->DotProduct(*gradient_ucomponent));
-
-      KALDI_LOG << "Model gradients: comparing " << delta_objf_observed
-                << " and " << delta_objf_predicted;
-      if (fabs(delta_objf_predicted - delta_objf_observed) >
-          0.05 * (fabs(delta_objf_predicted + delta_objf_observed)/2) &&
-          fabs(delta_objf_predicted - delta_objf_observed) > 1.0e-06) {
-        KALDI_WARN << "Bad difference!";
-        num_bad++;
-      } else {
-        num_ok++;
-      }
-      delete perturbed_ucomponent;
-      delete gradient_ucomponent;
-    }
-    if (num_ok < num_bad) {
-      delete component_copy;
-      KALDI_ERR << "model-derivative check failed";
-    }
-  }
-  delete component_copy; // No longer needed.
-}
-
-void UnitTestGenericComponentInternal(const Component &component) {
-  int32 input_dim = component.InputDim(),
-      output_dim = component.OutputDim();
-
-  KALDI_LOG << component.Info();
-  int32 num_egs = 10 + Rand() % 5;
-  int32 num_chunks = 1,
-        first_offset = 0,
-        last_offset = num_egs-1;
-
-  ChunkInfo in_info(input_dim, num_chunks, first_offset, last_offset);
-  ChunkInfo out_info(output_dim, num_chunks, first_offset, last_offset);
-  UnitTestGenericComponentInternal(component, in_info, out_info);
-}
-
-
-
-void UnitTestSigmoidComponent() {
-  // We're testing that the gradients are computed correctly:
-  // the input gradients and the model gradients.
-
-  int32 input_dim = 10 + Rand() % 50;
-  {
-    SigmoidComponent sigmoid_component(input_dim);
-    UnitTestGenericComponentInternal(sigmoid_component);
-  }
-  {
-    SigmoidComponent sigmoid_component;
-    sigmoid_component.InitFromString("dim=15");
-    UnitTestGenericComponentInternal(sigmoid_component);
-  }
-}
-
-template<class T>
-void UnitTestGenericComponent(std::string extra_str = "") {
-  // works if it has an initializer from int,
-  // e.g. tanh, sigmoid.
-
-  // We're testing that the gradients are computed correctly:
-  // the input gradients and the model gradients.
-
-  int32 input_dim = 10 + Rand() % 50;
-  {
-    T component(input_dim);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    T component;
-    component.InitFromString(static_cast<std::string>("dim=15 ") + extra_str);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-void UnitTestMaxoutComponent() {
-  // works if it has an initializer from int,
-  // e.g. tanh, sigmoid.
-
-  // We're testing that the gradients are computed correctly:
-  // the input gradients and the model gradients.
-
-  for (int32 i = 0; i < 5; i++) {
-    int32 output_dim = 10 + Rand() % 20,
-        group_size = 1 + Rand() % 10,
-        input_dim = output_dim * group_size;
-
-    MaxoutComponent component(input_dim, output_dim);
-    UnitTestGenericComponentInternal(component);
-  }
-
-  {
-    MaxoutComponent component;
-    component.InitFromString("input-dim=15 output-dim=5");
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-void UnitTestPnormComponent() {
-  // We're testing that the gradients are computed correctly:
-  // the input gradients and the model gradients.
-
-  int32 num_fail = 0, num_tries = 4;
-  for (int32 i = 0; i < num_tries; i++) {
-    try {
-      int32 output_dim = 10 + Rand() % 20,
-          group_size = 1 + Rand() % 10,
-          input_dim = output_dim * group_size;
-      BaseFloat p = 1.0 + 0.1 * (Rand() % 20);
-
-      PnormComponent component(input_dim, output_dim, p);
-      UnitTestGenericComponentInternal(component);
-    } catch (...) {
-      KALDI_WARN << "Ignoring test failure in UnitTestPnormComponent().";
-      num_fail++;
-    }
-  }
-  if (num_fail >= num_tries/2) {
-    KALDI_ERR << "Too many test failures.";
-  }
-}
-
-void UnitTestMaxpoolingComponent() {
-  // works if it has an initializer from int,
-  // e.g. tanh, sigmoid.
-  // We're testing that the gradients are computed correctly:
-  // the input gradients and the model gradients.
-
-  for (int32 i = 0; i < 5; i++) {
-    int32 pool_stride = 5 + Rand() % 10,
-          pool_size = 2 + Rand() % 3,
-          num_pools = 1 + Rand() % 10;
-    int32 output_dim = num_pools * pool_stride;
-    int32 num_patches = num_pools * pool_size;
-    int32 input_dim = pool_stride * num_patches;
-
-    MaxpoolingComponent component(input_dim, output_dim,
-                                  pool_size, pool_stride);
-    UnitTestGenericComponentInternal(component);
-  }
-
-  {
-    MaxpoolingComponent component;
-    component.InitFromString("input-dim=192 output-dim=64 pool-size=3 pool-stride=16");
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-
-void UnitTestAffineComponent() {
-  BaseFloat learning_rate = 0.01,
-      param_stddev = 0.1, bias_stddev = 1.0;
-  int32 input_dim = 5 + Rand() % 10, output_dim = 5 + Rand() % 10;
-  {
-    AffineComponent component;
-    if (Rand() % 2 == 0) {
-      component.Init(learning_rate, input_dim, output_dim,
-                     param_stddev, bias_stddev);
-    } else {
-      Matrix<BaseFloat> mat(output_dim + 1, input_dim);
-      mat.SetRandn();
-      mat.Scale(param_stddev);
-      WriteKaldiObject(mat, "tmpf", true);
-      Sleep(0.5);
-      component.Init(learning_rate, "tmpf");
-      unlink("tmpf");
-    }
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "learning-rate=0.01 input-dim=10 output-dim=15 param-stddev=0.1";
-    AffineComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-void UnitTestConvolutional1dComponent() {
-  BaseFloat learning_rate = 0.01,
-            param_stddev = 0.1, bias_stddev = 1.0;
-  int32 patch_stride = 10, patch_step = 1, patch_dim = 4;
-  int32 num_patches = 1 + (patch_stride - patch_dim) / patch_step;
-  int32 num_splice = 5 + Rand() % 10, num_filters = 5 + Rand() % 10;
-  int32 input_dim = patch_stride * num_splice;
-  int32 filter_dim = patch_dim * num_splice;
-  int32 output_dim = num_patches * num_filters;
-  {
-    Convolutional1dComponent component;
-    if (Rand() % 2 == 0) {
-      component.Init(learning_rate, input_dim, output_dim,
-                     patch_dim, patch_step, patch_stride,
-                     param_stddev, bias_stddev, true);
-    } else {
-      Matrix<BaseFloat> mat(num_filters, filter_dim + 1);
-      mat.SetRandn();
-      mat.Scale(param_stddev);
-      WriteKaldiObject(mat, "tmpf", true);
-      Sleep(0.5);
-      component.Init(learning_rate, patch_dim,
-                     patch_step, patch_stride, "tmpf", false);
-      unlink("tmpf");
-    }
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    // appended-conv is false by default
-    const char *str = "learning-rate=0.01 input-dim=100 output-dim=70 param-stddev=0.1 patch-dim=4 patch-step=1 patch-stride=10";
-    Convolutional1dComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "learning-rate=0.01 input-dim=100 output-dim=70 param-stddev=0.1 patch-dim=4 patch-step=1 patch-stride=10 appended-conv=true";
-    Convolutional1dComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-void UnitTestDropoutComponent() {
-  // We're testing that the gradients are computed correctly:
-  // the input gradients and the model gradients.
-
-  int32 num_fail = 0, num_tries = 4;
-  for (int32 i = 0; i < num_tries; i++) {
-    try {
-      int32 input_dim = 10 + Rand() % 50;
-      {
-        DropoutComponent dropout_component(input_dim, 0.5, 0.3);
-        UnitTestGenericComponentInternal(dropout_component);
-      }
-      {
-        DropoutComponent dropout_component;
-        dropout_component.InitFromString("dim=15 dropout-proportion=0.6 dropout-scale=0.1");
-        UnitTestGenericComponentInternal(dropout_component);
-      }
-    } catch (...) {
-      KALDI_WARN << "Ignoring test failure in UnitTestDropoutComponent().";
-      num_fail++;
-    }
-  }
-  if (num_fail >= num_tries/2) {
-    KALDI_ERR << "Too many test failures.";
-  }
-}
-
-void UnitTestAdditiveNoiseComponent() {
-  // We're testing that the gradients are computed correctly:
-  // the input gradients and the model gradients.
-
-  int32 num_fail = 0, num_tries = 4;
-  for (int32 i = 0; i < num_tries; i++) {
-    try {
-      int32 input_dim = 10 + Rand() % 50;
-      {
-        AdditiveNoiseComponent additive_noise_component(input_dim, 0.1);
-        UnitTestGenericComponentInternal(additive_noise_component);
-      }
-      {
-        AdditiveNoiseComponent additive_noise_component;
-        additive_noise_component.InitFromString("dim=15 stddev=0.2");
-        UnitTestGenericComponentInternal(additive_noise_component);
-      }
-    } catch (...) {
-      KALDI_WARN << "Ignoring failure in AdditiveNoiseComponent test";
-      num_fail++;
-    }
-  }
-  if (num_fail >= num_tries/2) {
-    KALDI_ERR << "Too many test failures.";
-  }
-}
-
-void UnitTestScaleComponent() {
-  int32 dim = 1 + Rand() % 10;
-  BaseFloat scale = 0.1 + Rand() % 3;
-  {
-    ScaleComponent component;
-    if (Rand() % 2 == 0) {
-      component.Init(dim, scale);
-    } else {
-      std::ostringstream str;
-      str << "dim=" << dim << " scale=" << scale;
-      component.InitFromString(str.str());
-    }
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-
-void UnitTestAffineComponentPreconditioned() {
-  BaseFloat learning_rate = 0.01,
-      param_stddev = 0.1, bias_stddev = 1.0, alpha = 0.01,
-      max_change = 100.0;
-  int32 input_dim = 5 + Rand() % 10, output_dim = 5 + Rand() % 10;
-  {
-    AffineComponentPreconditioned component;
-    if (Rand() % 2 == 0) {
-      component.Init(learning_rate, input_dim, output_dim,
-                     param_stddev, bias_stddev,
-                     alpha, max_change);
-    } else {
-      Matrix<BaseFloat> mat(output_dim + 1, input_dim);
-      mat.SetRandn();
-      mat.Scale(param_stddev);
-      WriteKaldiObject(mat, "tmpf", true);
-      Sleep(0.5);
-      component.Init(learning_rate, alpha, max_change, "tmpf");
-      unlink("tmpf");
-    }
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "learning-rate=0.01 input-dim=16 output-dim=15 param-stddev=0.1 alpha=0.01";
-    AffineComponentPreconditioned component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-
-void UnitTestAffineComponentPreconditionedOnline() {
-  BaseFloat learning_rate = 0.01,
-      param_stddev = 0.1, bias_stddev = 1.0, num_samples_history = 2000.0, alpha = 4.0,
-      max_change_per_sample = 0.1, update_period = 1;
-  int32 input_dim = 5 + Rand() % 10, output_dim = 5 + Rand() % 10,
-      rank_in = 1 + Rand() % 5, rank_out = 1 + Rand() % 5;
-  {
-    AffineComponentPreconditionedOnline component;
-    if (Rand() % 2 == 0) {
-      component.Init(learning_rate, input_dim, output_dim,
-                     param_stddev, bias_stddev,
-                     rank_in, rank_out, update_period,
-                     num_samples_history, alpha,
-                     max_change_per_sample);
-    } else {
-      Matrix<BaseFloat> mat(output_dim + 1, input_dim);
-      mat.SetRandn();
-      mat.Scale(param_stddev);
-      WriteKaldiObject(mat, "tmpf", true);
-      Sleep(0.5);
-      component.Init(learning_rate, rank_in, rank_out,
-                     update_period, num_samples_history, alpha,
-                     max_change_per_sample, "tmpf");
-      unlink("tmpf");
-    }
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "learning-rate=0.01 input-dim=16 output-dim=15 param-stddev=0.1 num-samples-history=3000 alpha=2.0 update-period=1 rank-in=5 rank-out=6";
-    AffineComponentPreconditionedOnline component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-void UnitTestBlockAffineComponent() {
-  BaseFloat learning_rate = 0.01,
-      param_stddev = 0.1, bias_stddev = 0.1;
-  int32 num_blocks = 1 + Rand() % 3,
-         input_dim = num_blocks * (2 + Rand() % 4),
-        output_dim = num_blocks * (2 + Rand() % 4);
-
-  {
-    BlockAffineComponent component;
-    component.Init(learning_rate, input_dim, output_dim,
-                   param_stddev, bias_stddev, num_blocks);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "learning-rate=0.01 input-dim=10 output-dim=15 param-stddev=0.1 num-blocks=5";
-    BlockAffineComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-void UnitTestBlockAffineComponentPreconditioned() {
-  BaseFloat learning_rate = 0.01,
-      param_stddev = 0.1, bias_stddev = 1.0, alpha = 3.0;
-  int32 num_blocks = 1 + Rand() % 3,
-         input_dim = num_blocks * (2 + Rand() % 4),
-        output_dim = num_blocks * (2 + Rand() % 4);
-
-  {
-    BlockAffineComponentPreconditioned component;
-    component.Init(learning_rate, input_dim, output_dim,
-                   param_stddev, bias_stddev, num_blocks, alpha);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "learning-rate=0.01 input-dim=10 output-dim=15 param-stddev=0.1 num-blocks=5 alpha=3.0";
-    BlockAffineComponentPreconditioned component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-
-void UnitTestSumGroupComponent() {
-  std::vector<int32> sizes;
-  int32 num_sizes = 1 + Rand() % 5;
-  for (int32 i = 0; i < num_sizes; i++)
-    sizes.push_back(1 + Rand() % 5);
-
-  {
-    SumGroupComponent component;
-    component.Init(sizes);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "sizes=3:4:5";
-    SumGroupComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-
-void UnitTestDctComponent() {
-  int32 m = 1 + Rand() % 4, n = 1 + Rand() % 4,
-  dct_dim = m, dim = m * n;
-  bool reorder = (Rand() % 2 == 0);
-  {
-    DctComponent component;
-    component.Init(dim, dct_dim, reorder);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "dim=10 dct-dim=5 reorder=true";
-    DctComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "dim=10 dct-dim=5 reorder=true dct-keep-dim=1";
-    DctComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "dim=10 dct-dim=5 reorder=true dct-keep-dim=2";
-    DctComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "dim=10 dct-dim=5 reorder=true dct-keep-dim=3";
-    DctComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "dim=10 dct-dim=5 reorder=true dct-keep-dim=4";
-    DctComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-
-void UnitTestFixedLinearComponent() {
-  int32 m = 1 + Rand() % 4, n = 1 + Rand() % 4;
-  {
-    CuMatrix<BaseFloat> mat(m, n);
-    mat.SetRandn();
-    FixedLinearComponent component;
-    component.Init(mat);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-
-void UnitTestFixedAffineComponent() {
-  int32 m = 15 + Rand() % 4, n = 15 + Rand() % 4;
-  {
-    CuMatrix<BaseFloat> mat(m, n);
-    mat.SetRandn();
-    FixedAffineComponent component;
-    component.Init(mat);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-void UnitTestFixedScaleComponent() {
-  int32 m = 1 + Rand() % 20;
-  {
-    CuVector<BaseFloat> vec(m);
-    vec.SetRandn();
-    FixedScaleComponent component;
-    component.Init(vec);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-void UnitTestFixedBiasComponent() {
-  int32 m = 1 + Rand() % 20;
-  {
-    CuVector<BaseFloat> vec(m);
-    vec.SetRandn();
-    FixedBiasComponent component;
-    component.Init(vec);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-
-
-void UnitTestParsing() {
-  int32 i;
-  BaseFloat f;
-  bool b;
-  std::vector<int32> v;
-  std::string s = "x=y";
-  KALDI_ASSERT(ParseFromString("foo", &s, &i) == false
-               && s == "x=y");
-  KALDI_ASSERT(ParseFromString("foo", &s, &f) == false
-               && s == "x=y");
-  KALDI_ASSERT(ParseFromString("foo", &s, &v) == false
-               && s == "x=y");
-  KALDI_ASSERT(ParseFromString("foo", &s, &b) == false
-               && s == "x=y");
-  {
-    std::string s = "x=1";
-    KALDI_ASSERT(ParseFromString("x", &s, &i) == true
-                 && i == 1 && s == "");
-    s = "a=b x=1";
-    KALDI_ASSERT(ParseFromString("x", &s, &i) == true
-                 && i == 1 && s == "a=b");
-  }
-  {
-    std::string s = "foo=false";
-    KALDI_ASSERT(ParseFromString("foo", &s, &b) == true
-                 && b == false && s == "");
-    s = "x=y foo=true a=b";
-    KALDI_ASSERT(ParseFromString("foo", &s, &b) == true
-                 && b == true && s == "x=y a=b");
-  }
-
-  {
-    std::string s = "foobar x=1";
-    KALDI_ASSERT(ParseFromString("x", &s, &f) == true
-                 && f == 1.0 && s == "foobar");
-    s = "a=b x=1 bxy";
-    KALDI_ASSERT(ParseFromString("x", &s, &f) == true
-                 && f == 1.0 && s == "a=b bxy");
-  }
-  {
-    std::string s = "x=1:2:3";
-    KALDI_ASSERT(ParseFromString("x", &s, &v) == true
-                 && v.size() == 3 && v[0] == 1 && v[1] == 2 && v[2] == 3
-                 && s == "");
-    s = "a=b x=1:2:3 c=d";
-    KALDI_ASSERT(ParseFromString("x", &s, &v) == true
-                 && f == 1.0 && s == "a=b c=d");
-  }
-
-}
-
-void UnitTestSpliceComponent() {
-  int32 feat_dim = RandInt(1, 20),
-      const_dim =  RandInt(0, 10),
-      left_context = RandInt(-5, 0),
-      right_context = RandInt(0, 5),
-      num_chunks = RandInt(1, 20);
-        // multiple chunks are required as splice component
-        // has separate index computation logic for more than one chunks
-  KALDI_LOG << " Feat_dim :" << feat_dim << " const_dim: " << const_dim  ;
-  std::vector<bool> contiguous(2);
-  contiguous[0] = true;
-  contiguous[1] = false;
-  for (int32 i = 0; i < contiguous.size(); i++) {
-    std::vector<int32> splice_indexes;
-    if (contiguous[i]) {
-      // create contiguous set of splice indexes in the range
-      // (-left_context, right_context)
-      KALDI_LOG << "Testing contiguous splice component";
-      splice_indexes.reserve(right_context - left_context + 1);
-      for (int32 i = left_context; i <= right_context; i++)
-        splice_indexes.push_back(i);
-    } else  {
-      // generate random splice indexes in range (-left_context, right_context)
-      KALDI_LOG << "Testing non-contiguous splice component";
-      int32 num_left_splice_indexes = RandInt(0, -left_context) + 1;
-      int32 num_right_splice_indexes = RandInt(0, right_context);
-      splice_indexes.reserve(num_left_splice_indexes + num_right_splice_indexes);
-      while (splice_indexes.size() < num_left_splice_indexes)  {
-        int32 new_index = RandInt(left_context, 0);
-        // check if the index already exists in the vector
-        if (std::find(splice_indexes.begin(), splice_indexes.end(), new_index)
-            == splice_indexes.end())  {
-          splice_indexes.push_back(new_index);
-        }
-      }
-      while (splice_indexes.size() < num_left_splice_indexes + num_right_splice_indexes)  {
-        int32 new_index = RandInt(0, right_context);
-        // check if the index already exists in the vector
-        if (std::find(splice_indexes.begin(), splice_indexes.end(), new_index)
-            == splice_indexes.end())  {
-          splice_indexes.push_back(new_index);
-        }
-      }
-      sort(splice_indexes.begin(), splice_indexes.end());
-      if (splice_indexes.back() < 0) // will fail assertion in init of component
-        splice_indexes.push_back(0);
-    }
-    std::vector<int32> input_offsets;
-    for (int32 i = 0; i < splice_indexes.size(); i++) {
-      input_offsets.push_back(splice_indexes[i] - splice_indexes.front());
-      KALDI_LOG << i << " : " << splice_indexes[i] << " : " << input_offsets[i] ;
-    }
-    int32 output_offset = -splice_indexes.front();
-    SpliceComponent *component = new SpliceComponent();
-    component->Init(feat_dim + const_dim, splice_indexes, const_dim);
-    ChunkInfo in_info = ChunkInfo(feat_dim + const_dim, num_chunks,
-                                  input_offsets),
-              out_info = ChunkInfo(feat_dim * splice_indexes.size() + const_dim,
-                                   num_chunks, output_offset, output_offset);
-    UnitTestGenericComponentInternal(*component, in_info, out_info);
-    delete component;
-  }
-}
-
-void BasicDebugTestForSpliceMax(bool output=false) {
-  int32 C=5,
-        context_len=2,
-        R= 3 + 2*context_len;
-
-  SpliceMaxComponent *c = new SpliceMaxComponent();
-  std::vector<int32> context(2 * context_len + 1);
-  for (int32 i = -1 * context_len; i <= context_len; i++)
-    context[i + context_len] = i;
-  c->Init(C, context);
-  CuMatrix<BaseFloat> in(R, C), in_deriv(R, C);
-  CuMatrix<BaseFloat> out(R, c->OutputDim());
-  ChunkInfo in_info = ChunkInfo(C, 1, 0, R - 1),
-            out_info = ChunkInfo(C, 1, context_len, R - 1 - context_len);
-
-  in.SetRandn();
-  if (output)
-    KALDI_LOG << in;
-
-  c->Propagate(in_info, out_info, in, &out);
-
-  if (output)
-    KALDI_LOG << out;
-
-  out.Set(5.0);
-
-  if (output)
-    KALDI_LOG << out;
-
-  c->Backprop(in_info, out_info, in, in, out, c, &in_deriv);
-
-  if (output)
-    KALDI_LOG << in_deriv;
-
-  delete c;
-}
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#include "matrix/matrix-functions.h"
-
-
-int main() {
-  using namespace kaldi;
-  using namespace kaldi::nnet2;
-
-  int32 loop = 0;
-#if HAVE_CUDA == 1
-  for (loop = 0; loop < 2; loop++) {
-    //// Uncomment the following line to expose the bug in UnitTestDropoutComponent
-    //CuDevice::Instantiate().SetDebugStrideMode(true);
-    if (loop == 0)
-      CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
-    else
-      CuDevice::Instantiate().SelectGpuId("optional"); // -2 .. automatic selection
-#endif
-
-    BasicDebugTestForSpliceMax(true);
-    // We used to test this 3 times, but now that nnet2 is rarely changed,
-    // reducing it to once.
-    for (int32 i = 0; i < 1; i++) {
-      UnitTestGenericComponent<SigmoidComponent>();
-      UnitTestGenericComponent<TanhComponent>();
-      UnitTestGenericComponent<PowerComponent>("power=1.5");
-      UnitTestGenericComponent<PowerComponent>("power=1.0");
-      UnitTestGenericComponent<PermuteComponent>();
-      UnitTestGenericComponent<SoftmaxComponent>();
-      UnitTestGenericComponent<LogSoftmaxComponent>();
-      UnitTestGenericComponent<RectifiedLinearComponent>();
-      UnitTestGenericComponent<SoftHingeComponent>();
-      UnitTestSpliceComponent();
-      UnitTestMaxoutComponent();
-      UnitTestPnormComponent();
-      UnitTestMaxpoolingComponent();
-      UnitTestGenericComponent<NormalizeComponent>();
-      UnitTestSigmoidComponent();
-      UnitTestAffineComponent();
-      UnitTestScaleComponent();
-      UnitTestBlockAffineComponent();
-      UnitTestBlockAffineComponentPreconditioned();
-      UnitTestSumGroupComponent();
-      UnitTestDctComponent();
-      UnitTestFixedLinearComponent();
-      UnitTestFixedAffineComponent();
-      UnitTestFixedScaleComponent();
-      UnitTestFixedBiasComponent();
-      UnitTestAffineComponentPreconditioned();
-      UnitTestAffineComponentPreconditionedOnline();
-      UnitTestConvolutional1dComponent();
-      UnitTestDropoutComponent();
-      UnitTestAdditiveNoiseComponent();
-      UnitTestParsing();
-      if (loop == 0)
-        KALDI_LOG << "Tests without GPU use succeeded.";
-      else
-        KALDI_LOG << "Tests with GPU use (if available) succeeded.";
-    }
-#if HAVE_CUDA == 1
-  } // No for loop if 'HAVE_CUDA != 1',
-  CuDevice::Instantiate().PrintProfile();
-#endif
-  return 0;
-}
diff --git a/src/nnet2/nnet-component.cc b/src/nnet2/nnet-component.cc
deleted file mode 100644
index eafeaceb9fe..00000000000
--- a/src/nnet2/nnet-component.cc
+++ /dev/null
@@ -1,4390 +0,0 @@
-// nnet2/nnet-component.cc
-
-// Copyright 2011-2012  Karel Vesely
-//           2013-2014  Johns Hopkins University (author: Daniel Povey)
-//                2013  Xiaohui Zhang
-//                2014  Vijayaditya Peddinti
-//           2014-2015  Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <iterator>
-#include <sstream>
-#include "nnet2/nnet-component.h"
-#include "nnet2/nnet-precondition.h"
-#include "nnet2/nnet-precondition-online.h"
-#include "util/stl-utils.h"
-#include "util/text-utils.h"
-#include "util/kaldi-io.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-// static
-Component* Component::ReadNew(std::istream &is, bool binary) {
-  std::string token;
-  ReadToken(is, binary, &token); // e.g. "<SigmoidComponent>".
-  token.erase(0, 1); // erase "<".
-  token.erase(token.length()-1); // erase ">".
-  Component *ans = NewComponentOfType(token);
-  if (!ans)
-    KALDI_ERR << "Unknown component type " << token;
-  ans->Read(is, binary);
-  return ans;
-}
-
-
-// static
-Component* Component::NewComponentOfType(const std::string &component_type) {
-  Component *ans = NULL;
-  if (component_type == "SigmoidComponent") {
-    ans = new SigmoidComponent();
-  } else if (component_type == "TanhComponent") {
-    ans = new TanhComponent();
-  } else if (component_type == "PowerComponent") {
-    ans = new PowerComponent();
-  } else if (component_type == "SoftmaxComponent") {
-    ans = new SoftmaxComponent();
-  } else if (component_type == "LogSoftmaxComponent") {
-    ans = new LogSoftmaxComponent();
-  } else if (component_type == "RectifiedLinearComponent") {
-    ans = new RectifiedLinearComponent();
-  } else if (component_type == "NormalizeComponent") {
-    ans = new NormalizeComponent();
-  } else if (component_type == "SoftHingeComponent") {
-    ans = new SoftHingeComponent();
-  } else if (component_type == "PnormComponent") {
-    ans = new PnormComponent();
-  } else if (component_type == "MaxoutComponent") {
-    ans = new MaxoutComponent();
-  } else if (component_type == "ScaleComponent") {
-    ans = new ScaleComponent();
-  } else if (component_type == "AffineComponent") {
-    ans = new AffineComponent();
-  } else if (component_type == "AffineComponentPreconditioned") {
-    ans = new AffineComponentPreconditioned();
-  } else if (component_type == "AffineComponentPreconditionedOnline") {
-    ans = new AffineComponentPreconditionedOnline();
-  } else if (component_type == "SumGroupComponent") {
-    ans = new SumGroupComponent();
-  } else if (component_type == "BlockAffineComponent") {
-    ans = new BlockAffineComponent();
-  } else if (component_type == "BlockAffineComponentPreconditioned") {
-    ans = new BlockAffineComponentPreconditioned();
-  } else if (component_type == "PermuteComponent") {
-    ans = new PermuteComponent();
-  } else if (component_type == "DctComponent") {
-    ans = new DctComponent();
-  } else if (component_type == "FixedLinearComponent") {
-    ans = new FixedLinearComponent();
-  } else if (component_type == "FixedAffineComponent") {
-    ans = new FixedAffineComponent();
-  } else if (component_type == "FixedScaleComponent") {
-    ans = new FixedScaleComponent();
-  } else if (component_type == "FixedBiasComponent") {
-    ans = new FixedBiasComponent();
-  } else if (component_type == "SpliceComponent") {
-    ans = new SpliceComponent();
-  } else if (component_type == "SpliceMaxComponent") {
-    ans = new SpliceMaxComponent();
-  } else if (component_type == "DropoutComponent") {
-    ans = new DropoutComponent();
-  } else if (component_type == "AdditiveNoiseComponent") {
-    ans = new AdditiveNoiseComponent();
-  } else if (component_type == "Convolutional1dComponent") {
-    ans = new Convolutional1dComponent();
-  } else if (component_type == "MaxpoolingComponent") {
-    ans = new MaxpoolingComponent();
-  }
-  return ans;
-}
-
-// static
-Component* Component::NewFromString(const std::string &initializer_line) {
-  std::istringstream istr(initializer_line);
-  std::string component_type; // e.g. "SigmoidComponent".
-  istr >> component_type >> std::ws;
-  std::string rest_of_line;
-  getline(istr, rest_of_line);
-  Component *ans = NewComponentOfType(component_type);
-  if (ans == NULL)
-    KALDI_ERR << "Bad initializer line (no such type of Component): "
-              << initializer_line;
-  ans->InitFromString(rest_of_line);
-  return ans;
-}
-
-
-// This is like ExpectToken but for two tokens, and it
-// will either accept token1 and then token2, or just token2.
-// This is useful in Read functions where the first token
-// may already have been consumed.
-static void ExpectOneOrTwoTokens(std::istream &is, bool binary,
-                                 const std::string &token1,
-                                 const std::string &token2) {
-  KALDI_ASSERT(token1 != token2);
-  std::string temp;
-  ReadToken(is, binary, &temp);
-  if (temp == token1) {
-    ExpectToken(is, binary, token2);
-  } else {
-    if (temp != token2) {
-      KALDI_ERR << "Expecting token " << token1 << " or " << token2
-                << " but got " << temp;
-    }
-  }
-}
-
-
-// static
-bool ParseFromString(const std::string &name, std::string *string,
-                     int32 *param) {
-  std::vector<std::string> split_string;
-  SplitStringToVector(*string, " \t", true,
-                      &split_string);
-  std::string name_equals = name + "="; // the name and then the equals sign.
-  size_t len = name_equals.length();
-
-  for (size_t i = 0; i < split_string.size(); i++) {
-    if (split_string[i].compare(0, len, name_equals) == 0) {
-      if (!ConvertStringToInteger(split_string[i].substr(len), param))
-        KALDI_ERR << "Bad option " << split_string[i];
-      *string = "";
-      // Set "string" to all the pieces but the one we used.
-      for (size_t j = 0; j < split_string.size(); j++) {
-        if (j != i) {
-          if (!string->empty()) *string += " ";
-          *string += split_string[j];
-        }
-      }
-      return true;
-    }
-  }
-  return false;
-}
-
-bool ParseFromString(const std::string &name, std::string *string,
-                     bool *param) {
-  std::vector<std::string> split_string;
-  SplitStringToVector(*string, " \t", true,
-                      &split_string);
-  std::string name_equals = name + "="; // the name and then the equals sign.
-  size_t len = name_equals.length();
-
-  for (size_t i = 0; i < split_string.size(); i++) {
-    if (split_string[i].compare(0, len, name_equals) == 0) {
-      std::string b = split_string[i].substr(len);
-      if (b.empty())
-        KALDI_ERR << "Bad option " << split_string[i];
-      if (b[0] == 'f' || b[0] == 'F') *param = false;
-      else if (b[0] == 't' || b[0] == 'T') *param = true;
-      else
-        KALDI_ERR << "Bad option " << split_string[i];
-      *string = "";
-      // Set "string" to all the pieces but the one we used.
-      for (size_t j = 0; j < split_string.size(); j++) {
-        if (j != i) {
-          if (!string->empty()) *string += " ";
-          *string += split_string[j];
-        }
-      }
-      return true;
-    }
-  }
-  return false;
-}
-
-bool ParseFromString(const std::string &name, std::string *string,
-                     BaseFloat *param) {
-  std::vector<std::string> split_string;
-  SplitStringToVector(*string, " \t", true,
-                      &split_string);
-  std::string name_equals = name + "="; // the name and then the equals sign.
-  size_t len = name_equals.length();
-
-  for (size_t i = 0; i < split_string.size(); i++) {
-    if (split_string[i].compare(0, len, name_equals) == 0) {
-      if (!ConvertStringToReal(split_string[i].substr(len), param))
-        KALDI_ERR << "Bad option " << split_string[i];
-      *string = "";
-      // Set "string" to all the pieces but the one we used.
-      for (size_t j = 0; j < split_string.size(); j++) {
-        if (j != i) {
-          if (!string->empty()) *string += " ";
-          *string += split_string[j];
-        }
-      }
-      return true;
-    }
-  }
-  return false;
-}
-
-bool ParseFromString(const std::string &name, std::string *string,
-                     std::string *param) {
-  std::vector<std::string> split_string;
-  SplitStringToVector(*string, " \t", true,
-                      &split_string);
-  std::string name_equals = name + "="; // the name and then the equals sign.
-  size_t len = name_equals.length();
-
-  for (size_t i = 0; i < split_string.size(); i++) {
-    if (split_string[i].compare(0, len, name_equals) == 0) {
-      *param = split_string[i].substr(len);
-
-      // Set "string" to all the pieces but the one we used.
-      *string = "";
-      for (size_t j = 0; j < split_string.size(); j++) {
-        if (j != i) {
-          if (!string->empty()) *string += " ";
-          *string += split_string[j];
-        }
-      }
-      return true;
-    }
-  }
-  return false;
-}
-
-bool ParseFromString(const std::string &name, std::string *string,
-                     std::vector<int32> *param) {
-  std::vector<std::string> split_string;
-  SplitStringToVector(*string, " \t", true,
-                      &split_string);
-  std::string name_equals = name + "="; // the name and then the equals sign.
-  size_t len = name_equals.length();
-
-  for (size_t i = 0; i < split_string.size(); i++) {
-    if (split_string[i].compare(0, len, name_equals) == 0) {
-      if (!SplitStringToIntegers(split_string[i].substr(len), ":",
-                                 false, param))
-        KALDI_ERR << "Bad option " << split_string[i];
-      *string = "";
-      // Set "string" to all the pieces but the one we used.
-      for (size_t j = 0; j < split_string.size(); j++) {
-        if (j != i) {
-          if (!string->empty()) *string += " ";
-          *string += split_string[j];
-        }
-      }
-      return true;
-    }
-  }
-  return false;
-}
-
-
-Component *PermuteComponent::Copy() const {
-  PermuteComponent *ans = new PermuteComponent();
-  ans->reorder_ = reorder_;
-  return ans;
-}
-void PermuteComponent::Init(const std::vector<int32> &reorder) {
-  reorder_ = reorder;
-  KALDI_ASSERT(!reorder.empty());
-  std::vector<int32> indexes(reorder);
-  std::sort(indexes.begin(), indexes.end());
-  for (int32 i = 0; i < static_cast<int32>(indexes.size()); i++)
-    KALDI_ASSERT(i == indexes[i] && "Not a permutation");
-}
-
-
-std::string Component::Info() const {
-  std::stringstream stream;
-  stream << Type() << ", input-dim=" << InputDim()
-         << ", output-dim=" << OutputDim();
-  return stream.str();
-}
-
-std::string UpdatableComponent::Info() const {
-  std::stringstream stream;
-  stream << Type() << ", input-dim=" << InputDim()
-         << ", output-dim=" << OutputDim() << ", learning-rate="
-         << LearningRate();
-  return stream.str();
-}
-
-
-void NonlinearComponent::SetDim(int32 dim) {
-  KALDI_ASSERT(dim > 0);
-  dim_ = dim;
-  value_sum_.Resize(dim);
-  deriv_sum_.Resize(dim);
-  count_ = 0.0;
-}
-
-void NonlinearComponent::UpdateStats(const CuMatrixBase<BaseFloat> &out_value,
-                                     const CuMatrixBase<BaseFloat> *deriv) {
-  KALDI_ASSERT(out_value.NumCols() == InputDim());
-  // Check we have the correct dimensions.
-  if (value_sum_.Dim() != InputDim() ||
-      (deriv != NULL && deriv_sum_.Dim() != InputDim())) {
-    std::lock_guard<std::mutex> lock(mutex_);
-    if (value_sum_.Dim() != InputDim()) {
-      value_sum_.Resize(InputDim());
-      count_ = 0.0;
-    }
-    if (deriv != NULL && deriv_sum_.Dim() != InputDim()) {
-      deriv_sum_.Resize(InputDim());
-      count_ = 0.0;
-      value_sum_.SetZero();
-    }
-  }
-  count_ += out_value.NumRows();
-  CuVector<BaseFloat> temp(InputDim());
-  temp.AddRowSumMat(1.0, out_value, 0.0);
-  value_sum_.AddVec(1.0, temp);
-  if (deriv != NULL) {
-    temp.AddRowSumMat(1.0, *deriv, 0.0);
-    deriv_sum_.AddVec(1.0, temp);
-  }
-}
-
-void NonlinearComponent::Scale(BaseFloat scale) {
-  value_sum_.Scale(scale);
-  deriv_sum_.Scale(scale);
-  count_ *= scale;
-}
-
-void NonlinearComponent::Add(BaseFloat alpha, const NonlinearComponent &other) {
-  if (value_sum_.Dim() == 0 && other.value_sum_.Dim() != 0)
-    value_sum_.Resize(other.value_sum_.Dim());
-  if (deriv_sum_.Dim() == 0 && other.deriv_sum_.Dim() != 0)
-    deriv_sum_.Resize(other.deriv_sum_.Dim());
-  if (other.value_sum_.Dim() != 0)
-    value_sum_.AddVec(alpha, other.value_sum_);
-  if (other.deriv_sum_.Dim() != 0)
-    deriv_sum_.AddVec(alpha, other.deriv_sum_);
-  count_ += alpha * other.count_;
-}
-
-void NonlinearComponent::Read(std::istream &is, bool binary) {
-  std::ostringstream ostr_beg, ostr_end;
-  ostr_beg << "<" << Type() << ">"; // e.g. "<SigmoidComponent>"
-  ostr_end << "</" << Type() << ">"; // e.g. "</SigmoidComponent>"
-  ExpectOneOrTwoTokens(is, binary, ostr_beg.str(), "<Dim>");
-  ReadBasicType(is, binary, &dim_); // Read dimension.
-  ExpectToken(is, binary, "<ValueSum>");
-  value_sum_.Read(is, binary);
-  ExpectToken(is, binary, "<DerivSum>");
-  deriv_sum_.Read(is, binary);
-  ExpectToken(is, binary, "<Count>");
-  ReadBasicType(is, binary, &count_);
-  ExpectToken(is, binary, ostr_end.str());
-}
-
-void NonlinearComponent::Write(std::ostream &os, bool binary) const {
-  std::ostringstream ostr_beg, ostr_end;
-  ostr_beg << "<" << Type() << ">"; // e.g. "<SigmoidComponent>"
-  ostr_end << "</" << Type() << ">"; // e.g. "</SigmoidComponent>"
-  WriteToken(os, binary, ostr_beg.str());
-  WriteToken(os, binary, "<Dim>");
-  WriteBasicType(os, binary, dim_);
-  WriteToken(os, binary, "<ValueSum>");
-  value_sum_.Write(os, binary);
-  WriteToken(os, binary, "<DerivSum>");
-  deriv_sum_.Write(os, binary);
-  WriteToken(os, binary, "<Count>");
-  WriteBasicType(os, binary, count_);
-  WriteToken(os, binary, ostr_end.str());
-}
-
-NonlinearComponent::NonlinearComponent(const NonlinearComponent &other):
-    dim_(other.dim_), value_sum_(other.value_sum_), deriv_sum_(other.deriv_sum_),
-    count_(other.count_) { }
-
-void NonlinearComponent::InitFromString(std::string args) {
-  std::string orig_args(args);
-  int32 dim;
-  bool ok = ParseFromString("dim", &args, &dim);
-  if (!ok || !args.empty() || dim <= 0)
-    KALDI_ERR << "Invalid initializer for layer of type "
-              << Type() << ": \"" << orig_args << "\"";
-  Init(dim);
-}
-
-void MaxoutComponent::Init(int32 input_dim, int32 output_dim)  {
-  input_dim_ = input_dim;
-  output_dim_ = output_dim;
-  if (input_dim_ == 0)
-    input_dim_ = 10 * output_dim_; // default group size : 10
-  KALDI_ASSERT(input_dim_ > 0 && output_dim_ >= 0);
-  KALDI_ASSERT(input_dim_ % output_dim_ == 0);
-}
-
-void MaxoutComponent::InitFromString(std::string args) {
-  std::string orig_args(args);
-  int32 input_dim = 0;
-  int32 output_dim = 0;
-  bool ok = ParseFromString("output-dim", &args, &output_dim) &&
-      ParseFromString("input-dim", &args, &input_dim);
-  KALDI_LOG << output_dim << " " << input_dim << " " << ok;
-  if (!ok || !args.empty() || output_dim <= 0)
-    KALDI_ERR << "Invalid initializer for layer of type "
-              << Type() << ": \"" << orig_args << "\"";
-  Init(input_dim, output_dim);
-}
-
-
-void MaxoutComponent::Propagate(const ChunkInfo &in_info,
-                                const ChunkInfo &out_info,
-                                const CuMatrixBase<BaseFloat> &in,
-                                CuMatrixBase<BaseFloat> *out) const  {
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-  out->GroupMax(in);
-}
-
-void MaxoutComponent::Backprop(const ChunkInfo &, // in_info,
-                               const ChunkInfo &, // out_info,
-                               const CuMatrixBase<BaseFloat> &in_value,
-                               const CuMatrixBase<BaseFloat> &out_value,
-                               const CuMatrixBase<BaseFloat> &out_deriv,
-                               Component *to_update,
-                               CuMatrix<BaseFloat> *in_deriv) const {
-  in_deriv->Resize(in_value.NumRows(), in_value.NumCols(), kSetZero);
-  in_deriv->GroupMaxDeriv(in_value, out_value);
-  in_deriv->MulRowsGroupMat(out_deriv);
-}
-
-void MaxoutComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<MaxoutComponent>", "<InputDim>");
-  ReadBasicType(is, binary, &input_dim_);
-  ExpectToken(is, binary, "<OutputDim>");
-  ReadBasicType(is, binary, &output_dim_);
-  ExpectToken(is, binary, "</MaxoutComponent>");
-}
-
-void MaxoutComponent::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<MaxoutComponent>");
-  WriteToken(os, binary, "<InputDim>");
-  WriteBasicType(os, binary, input_dim_);
-  WriteToken(os, binary, "<OutputDim>");
-  WriteBasicType(os, binary, output_dim_);
-  WriteToken(os, binary, "</MaxoutComponent>");
-}
-
-std::string MaxoutComponent::Info() const {
-  std::stringstream stream;
-  stream << Type() << ", input-dim = " << input_dim_
-         << ", output-dim = " << output_dim_;
-  return stream.str();
-}
-
-void PnormComponent::Init(int32 input_dim, int32 output_dim, BaseFloat p)  {
-  input_dim_ = input_dim;
-  output_dim_ = output_dim;
-  if (input_dim_ == 0)
-    input_dim_ = 10 * output_dim_; // default group size : 10
-  p_ = p;
-  KALDI_ASSERT(input_dim_ > 0 && output_dim_ >= 0 && p_ >= 0);
-  KALDI_ASSERT(input_dim_ % output_dim_ == 0);
-}
-
-void PnormComponent::InitFromString(std::string args) {
-  std::string orig_args(args);
-  int32 input_dim = 0;
-  int32 output_dim = 0;
-  BaseFloat p = 2;
-  bool ok = ParseFromString("output-dim", &args, &output_dim) &&
-      ParseFromString("input-dim", &args, &input_dim);
-  ParseFromString("p", &args, &p);
-  if (!ok || !args.empty() || output_dim <= 0)
-    KALDI_ERR << "Invalid initializer for layer of type "
-              << Type() << ": \"" << orig_args << "\"";
-  Init(input_dim, output_dim, p);
-}
-
-
-void PnormComponent::Propagate(const ChunkInfo &in_info,
-                               const ChunkInfo &out_info,
-                               const CuMatrixBase<BaseFloat> &in,
-                               CuMatrixBase<BaseFloat> *out) const  {
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-
-  out->GroupPnorm(in, p_);
-}
-
-void PnormComponent::Backprop(const ChunkInfo &,  // in_info,
-                              const ChunkInfo &,  // out_info,
-                              const CuMatrixBase<BaseFloat> &in_value,
-                              const CuMatrixBase<BaseFloat> &out_value,
-                              const CuMatrixBase<BaseFloat> &out_deriv,
-                              Component *to_update,
-                                // may be identical to "this".
-                              CuMatrix<BaseFloat> *in_deriv) const  {
-  in_deriv->Resize(in_value.NumRows(), in_value.NumCols(), kSetZero);
-  in_deriv->DiffGroupPnorm(in_value, out_value, out_deriv, p_);
-}
-
-void PnormComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<PnormComponent>", "<InputDim>");
-  ReadBasicType(is, binary, &input_dim_);
-  ExpectToken(is, binary, "<OutputDim>");
-  ReadBasicType(is, binary, &output_dim_);
-  ExpectToken(is, binary, "<P>");
-  ReadBasicType(is, binary, &p_);
-  ExpectToken(is, binary, "</PnormComponent>");
-}
-
-void PnormComponent::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<PnormComponent>");
-  WriteToken(os, binary, "<InputDim>");
-  WriteBasicType(os, binary, input_dim_);
-  WriteToken(os, binary, "<OutputDim>");
-  WriteBasicType(os, binary, output_dim_);
-  WriteToken(os, binary, "<P>");
-  WriteBasicType(os, binary, p_);
-  WriteToken(os, binary, "</PnormComponent>");
-}
-
-std::string PnormComponent::Info() const {
-  std::stringstream stream;
-  stream << Type() << ", input-dim = " << input_dim_
-         << ", output-dim = " << output_dim_
-     << ", p = " << p_;
-  return stream.str();
-}
-
-
-const BaseFloat NormalizeComponent::kNormFloor = pow(2.0, -66);
-// This component modifies the vector of activations by scaling it so that the
-// root-mean-square equals 1.0.
-
-void NormalizeComponent::Propagate(const ChunkInfo &in_info,
-                                   const ChunkInfo &out_info,
-                                   const CuMatrixBase<BaseFloat> &in,
-                                   CuMatrixBase<BaseFloat> *out) const  {
-  cu::NormalizePerRow(in, BaseFloat(1), false, out);
-}
-
-/*
-  A note on the derivative of NormalizeComponent...
-  let both row_in and row_out be vectors of dimension D.
-  Let p = row_in^T row_in / D, and let
-      f = 1 / sqrt(max(kNormFloor, p)), and we compute row_out as:
-row_out = f row_in.
-  Suppose we have a quantity deriv_out which is the derivative
-  of the objective function w.r.t. row_out.  We want to compute
-  deriv_in which is the derivative of the objective function w.r.t.
-  row_in.  Let the objective function be F.  One term is obvious: we have
-     deriv_in = f deriv_out + ....
-  next we have to take into account the derivative that gets back-propagated
-  through f.  Obviously, dF/df = deriv_out^T row_in.
-  And df/dp = (p <= kNormFloor ? 0.0 : -0.5 p^{-1.5}) = (f == 1 / sqrt(kNormFloor) ? 0.0 : -0.5 f^3),
-  and dp/d(row_in) = 2/D row_in. [it's vector_valued].
-  So this term in dF/d(row_in) equals:
-    dF/df df/dp dp/d(row_in)   =    2/D (f == 1 / sqrt(kNormFloor)  ? 0.0 : -0.5 f^3) (deriv_out^T row_in) row_in
-  So
-     deriv_in = f deriv_out + (f == 1.0 ? 0.0 : -f^3 / D) (deriv_out^T row_in) row_in
-
-*/
-
-void NormalizeComponent::Backprop(
-    const ChunkInfo &,  // in_info,
-    const ChunkInfo &,  // out_info,
-    const CuMatrixBase<BaseFloat> &in_value,
-    const CuMatrixBase<BaseFloat> &out_value,
-    const CuMatrixBase<BaseFloat> &out_deriv, Component *to_update,
-    // may be identical to "this".
-    CuMatrix<BaseFloat> *in_deriv) const {
-  in_deriv->Resize(out_deriv.NumRows(), out_deriv.NumCols());
-  cu::DiffNormalizePerRow(in_value, out_deriv, BaseFloat(1), false, in_deriv);
-}
-
-void SigmoidComponent::Propagate(const ChunkInfo &in_info,
-                                 const ChunkInfo &out_info,
-                                 const CuMatrixBase<BaseFloat> &in,
-                                 CuMatrixBase<BaseFloat> *out) const  {
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-
-  out->Sigmoid(in);
-}
-
-void SigmoidComponent::Backprop(const ChunkInfo &,  //in_info,
-                                const ChunkInfo &,  //out_info,
-                                const CuMatrixBase<BaseFloat> &,  //in_value,
-                                const CuMatrixBase<BaseFloat> &out_value,
-                                const CuMatrixBase<BaseFloat> &out_deriv,
-                                Component *to_update, // may be identical to "this".
-                                CuMatrix<BaseFloat> *in_deriv) const  {
-  // we ignore in_value and to_update.
-
-  // The element by element equation would be:
-  // in_deriv = out_deriv * out_value * (1.0 - out_value);
-  // We can accomplish this via calls to the matrix library.
-
-  in_deriv->Resize(out_deriv.NumRows(), out_deriv.NumCols());
-  in_deriv->Set(1.0);
-  in_deriv->AddMat(-1.0, out_value);
-  // now in_deriv = 1.0 - out_value [element by element]
-  in_deriv->MulElements(out_value);
-  // now in_deriv = out_value * (1.0 - out_value) [element by element], i.e.
-  // it contains the element-by-element derivative of the nonlinearity.
-  if (to_update != NULL)
-    dynamic_cast<NonlinearComponent*>(to_update)->UpdateStats(out_value,
-                                                              in_deriv);
-  in_deriv->MulElements(out_deriv);
-  // now in_deriv = out_deriv * out_value * (1.0 - out_value) [element by element]
-}
-
-
-void TanhComponent::Propagate(const ChunkInfo &in_info,
-                              const ChunkInfo &out_info,
-                              const CuMatrixBase<BaseFloat> &in,
-                              CuMatrixBase<BaseFloat> *out) const  {
-  // Apply tanh function to each element of the output...
-  // the tanh function may be written as -1 + ( 2 / (1 + e^{-2 x})),
-  // which is a scaled and shifted sigmoid.
-
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-  out->Tanh(in);
-}
-
-void TanhComponent::Backprop(const ChunkInfo &, //in_info,
-                             const ChunkInfo &, //out_info,
-                             const CuMatrixBase<BaseFloat> &, //in_value,
-                             const CuMatrixBase<BaseFloat> &out_value,
-                             const CuMatrixBase<BaseFloat> &out_deriv,
-                             Component *to_update, // may be identical to "this".
-                             CuMatrix<BaseFloat> *in_deriv) const {
-  /*
-    Note on the derivative of the tanh function:
-    tanh'(x) = sech^2(x) = -(tanh(x)+1) (tanh(x)-1) = 1 - tanh^2(x)
-
-    The element by element equation of what we're doing would be:
-    in_deriv = out_deriv * (1.0 - out_value^2).
-    We can accomplish this via calls to the matrix library. */
-
-  in_deriv->Resize(out_deriv.NumRows(), out_deriv.NumCols());
-  in_deriv->CopyFromMat(out_value);
-  in_deriv->ApplyPow(2.0);
-  in_deriv->Scale(-1.0);
-  in_deriv->Add(1.0);
-  // now in_deriv = (1.0 - out_value^2), the element-by-element derivative of
-  // the nonlinearity.
-  if (to_update != NULL)
-    dynamic_cast<NonlinearComponent*>(to_update)->UpdateStats(out_value,
-                                                              in_deriv);
-  in_deriv->MulElements(out_deriv);
-}
-
-void PowerComponent::Init(int32 dim, BaseFloat power) {
-  dim_ = dim;
-  power_ = power;
-  KALDI_ASSERT(dim > 0 && power >= 0);
-}
-
-void PowerComponent::InitFromString(std::string args) {
-  std::string orig_args(args);
-  int32 dim;
-  BaseFloat power = 2.0;
-  ParseFromString("power", &args, &power); // Optional.
-  // Accept either "dim" or "input-dim" to specify the input dim.
-  // "input-dim" is the canonical one; "dim" simplifies the testing code.
-  bool ok = (ParseFromString("dim", &args, &dim) ||
-             ParseFromString("input-dim", &args, &dim));
-  if (!ok || !args.empty() || dim <= 0)
-    KALDI_ERR << "Invalid initializer for layer of type "
-              << Type() << ": \"" << orig_args << "\"";
-  Init(dim, power);
-}
-
-void PowerComponent::Propagate(const ChunkInfo &in_info,
-                               const ChunkInfo &out_info,
-                               const CuMatrixBase<BaseFloat> &in,
-                               CuMatrixBase<BaseFloat> *out) const  {
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-
-  // Apply power operation to each element of the input...
-  out->CopyFromMat(in);
-  out->ApplyPowAbs(power_);
-}
-
-void PowerComponent::Backprop(const ChunkInfo &,  //in_info,
-                              const ChunkInfo &,  //out_info,
-                              const CuMatrixBase<BaseFloat> &in_value,
-                              const CuMatrixBase<BaseFloat> &out_value,
-                              const CuMatrixBase<BaseFloat> &out_deriv,
-                              Component *to_update, // may be identical to "this".
-                              CuMatrix<BaseFloat> *in_deriv) const  {
-  in_deriv->Resize(in_value.NumRows(), in_value.NumCols());
-  // in scalar terms: in_deriv += p * in_value^(p-1) * out_deriv
-  in_deriv->CopyFromMat(in_value);
-  in_deriv->ApplyPowAbs(power_ - 1.0, true);
-  in_deriv->Scale(power_);
-  in_deriv->MulElements(out_deriv);
-}
-
-void PowerComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<PowerComponent>", "<InputDim>");
-  ReadBasicType(is, binary, &dim_);
-  ExpectToken(is, binary, "<OutputDim>");
-  ReadBasicType(is, binary, &dim_);
-  ExpectToken(is, binary, "<Power>");
-  ReadBasicType(is, binary, &power_);
-  ExpectToken(is, binary, "</PowerComponent>");
-}
-
-void PowerComponent::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<PowerComponent>");
-  WriteToken(os, binary, "<InputDim>");
-  WriteBasicType(os, binary, dim_);
-  WriteToken(os, binary, "<OutputDim>");
-  WriteBasicType(os, binary, dim_);
-  WriteToken(os, binary, "<Power>");
-  WriteBasicType(os, binary, power_);
-  WriteToken(os, binary, "</PowerComponent>");
-}
-
-std::string PowerComponent::Info() const {
-  std::stringstream stream;
-  stream << Type() << ", dim = " << dim_
-     << ", power = " << power_;
-  return stream.str();
-}
-
-void RectifiedLinearComponent::Propagate(const ChunkInfo &in_info,
-                                         const ChunkInfo &out_info,
-                                         const CuMatrixBase<BaseFloat> &in,
-                                         CuMatrixBase<BaseFloat> *out) const  {
-  // Apply rectified linear function (x >= 0 ? 1.0 : 0.0)
-  out->CopyFromMat(in);
-  out->ApplyFloor(0.0);
-}
-
-void RectifiedLinearComponent::Backprop(const ChunkInfo &,  //in_info,
-                                        const ChunkInfo &,  //out_info,
-                                        const CuMatrixBase<BaseFloat> &,  //in_value,
-                                        const CuMatrixBase<BaseFloat> &out_value,
-                                        const CuMatrixBase<BaseFloat> &out_deriv,
-                                        Component *to_update, // may be identical to "this".
-                                        CuMatrix<BaseFloat> *in_deriv) const  {
-
-  in_deriv->Resize(out_deriv.NumRows(), out_deriv.NumCols(),
-                   kUndefined);
-  in_deriv->CopyFromMat(out_value);
-  in_deriv->ApplyHeaviside();
-  // Now in_deriv(i, j) equals (out_value(i, j) > 0.0 ? 1.0 : 0.0),
-  // which is the derivative of the nonlinearity (well, except at zero
-  // where it's undefined).
-  if (to_update != NULL)
-    dynamic_cast<NonlinearComponent*>(to_update)->UpdateStats(out_value,
-                                                              in_deriv);
-  in_deriv->MulElements(out_deriv);
-}
-
-void SoftHingeComponent::Propagate(const ChunkInfo &in_info,
-                                   const ChunkInfo &out_info,
-                                   const CuMatrixBase<BaseFloat> &in,
-                                   CuMatrixBase<BaseFloat> *out) const  {
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-  // Apply function x = log(1 + exp(x))
-  out->SoftHinge(in);
-}
-
-void SoftHingeComponent::Backprop(const ChunkInfo &,  //in_info,
-                                  const ChunkInfo &,  //out_info,
-                                  const CuMatrixBase<BaseFloat> &in_value,
-                                  const CuMatrixBase<BaseFloat> &out_value,
-                                  const CuMatrixBase<BaseFloat> &out_deriv,
-                                  Component *to_update, // may be identical to "this".
-                                  CuMatrix<BaseFloat> *in_deriv) const  {
-
-  in_deriv->Resize(out_deriv.NumRows(), out_deriv.NumCols(),
-                   kUndefined);
-  // note: d/dx: log(1 + exp(x)) = (exp(x) / (1 + exp(x)) = 1 / (1 + exp(-x)),
-  // which is the sigmoid function.
-
-  // if the output is y, then dy/dx =  (exp(x) / (1 + exp(x)),
-  // and using y = log(1 + exp(x)) -> exp(x) = exp(y) - 1, we have
-  // dy/dx = (exp(y) - 1) / exp(y)
-
-
-  in_deriv->Sigmoid(in_value);
-
-  if (to_update != NULL)
-    dynamic_cast<NonlinearComponent*>(to_update)->UpdateStats(out_value,
-                                                              in_deriv);
-  in_deriv->MulElements(out_deriv);
-}
-
-
-void ScaleComponent::Propagate(const ChunkInfo &in_info,
-                               const ChunkInfo &out_info,
-                               const CuMatrixBase<BaseFloat> &in,
-                               CuMatrixBase<BaseFloat> *out) const  {
-  out->CopyFromMat(in);
-  out->Scale(scale_);
-}
-
-void ScaleComponent::Backprop(const ChunkInfo &,  //in_info,
-                              const ChunkInfo &,  //out_info,
-                              const CuMatrixBase<BaseFloat> &,  //in_value,
-                              const CuMatrixBase<BaseFloat> &,  //out_value,
-                              const CuMatrixBase<BaseFloat> &out_deriv,
-                              Component *, //to_update, // may be identical to "this".
-                              CuMatrix<BaseFloat> *in_deriv) const  {
-
-  in_deriv->Resize(out_deriv.NumRows(), out_deriv.NumCols(),
-                   kUndefined);
-  in_deriv->CopyFromMat(out_deriv);
-  in_deriv->Scale(scale_);
-}
-
-void ScaleComponent::Init(int32 dim, BaseFloat scale) {
-  dim_ = dim;
-  scale_ = scale;
-  KALDI_ASSERT(dim_ > 0);
-  KALDI_ASSERT(scale_ != 0.0);
-}
-
-void ScaleComponent::InitFromString(std::string args) {
-  std::string orig_args(args);
-  int32 dim;
-  BaseFloat scale;
-  if (!ParseFromString("dim", &args, &dim))
-    KALDI_ERR << "Dimension not specified for ScaleComponent in config file";
-  if (!ParseFromString("scale", &args, &scale))
-    KALDI_ERR << "Scale not specified for ScaleComponent in config file";
-  Init(dim, scale);
-}
-
-void ScaleComponent::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<ScaleComponent>");
-  WriteToken(os, binary, "<Dim>");
-  WriteBasicType(os, binary, dim_);
-  WriteToken(os, binary, "<Scale>");
-  WriteBasicType(os, binary, scale_);
-  WriteToken(os, binary, "</ScaleComponent>");
-}
-
-void ScaleComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<ScaleComponent>", "<Dim>");
-  ReadBasicType(is, binary, &dim_);
-  ExpectToken(is, binary, "<Scale>");
-  ReadBasicType(is, binary, &scale_);
-  ExpectToken(is, binary, "</ScaleComponent>");
-}
-
-std::string ScaleComponent::Info() const {
-  std::stringstream stream;
-  stream << Type() << ", dim=" << dim_ << ", scale=" << scale_;
-  return stream.str();
-}
-
-void SoftmaxComponent::Propagate(const ChunkInfo &in_info,
-                                 const ChunkInfo &out_info,
-                                 const CuMatrixBase<BaseFloat> &in,
-                                 CuMatrixBase<BaseFloat> *out) const  {
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-
-  // Apply softmax function to each row of the output...
-  // for that row, we do
-  // x_i = exp(x_i) / sum_j exp(x_j).
-
-  out->ApplySoftMaxPerRow(in);
-
-  // This floor on the output helps us deal with
-  // almost-zeros in a way that doesn't lead to overflow.
-  out->ApplyFloor(1.0e-20);
-}
-
-void SoftmaxComponent::Backprop(const ChunkInfo &in_info,
-                                const ChunkInfo &out_info,
-                                const CuMatrixBase<BaseFloat> &,  //in_value,
-                                const CuMatrixBase<BaseFloat> &out_value,
-                                const CuMatrixBase<BaseFloat> &out_deriv,
-                                Component *to_update, // only thing updated is counts_.
-                                CuMatrix<BaseFloat> *in_deriv) const  {
-  /*
-    Note on the derivative of the softmax function: let it be
-    p_i = exp(x_i) / sum_i exp_i
-    The [matrix-valued] Jacobian of this function is
-    diag(p) - p p^T
-    Let the derivative vector at the output be e, and at the input be
-    d.  We have
-    d = diag(p) e - p (p^T e).
-    d_i = p_i e_i - p_i (p^T e).
-  */
-  in_deriv->Resize(out_deriv.NumRows(), out_deriv.NumCols());
-  in_deriv->DiffSoftmaxPerRow(out_value, out_deriv);
-
-  // The SoftmaxComponent does not have any real trainable parameters, but
-  // during the backprop we store some statistics on the average counts;
-  // these may be used in mixing-up.
-  if (to_update != NULL) {
-    NonlinearComponent *to_update_nonlinear =
-        dynamic_cast<NonlinearComponent*>(to_update);
-    to_update_nonlinear->UpdateStats(out_value);
-  }
-}
-
-void LogSoftmaxComponent::Propagate(const ChunkInfo &in_info,
-                                    const ChunkInfo &out_info,
-                                    const CuMatrixBase<BaseFloat> &in,
-                                    CuMatrixBase<BaseFloat> *out) const  {
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-
-  // Applies log softmax function to each row of the output. For each row, we do
-  // x_i = x_i - log(sum_j exp(x_j))
-  out->ApplyLogSoftMaxPerRow(in);
-
-  // Just to be consistent with SoftmaxComponent::Propagate()
-  out->ApplyFloor(Log(1.0e-20));
-}
-
-void LogSoftmaxComponent::Backprop(const ChunkInfo &in_info,
-                                   const ChunkInfo &out_info,
-                                   const CuMatrixBase<BaseFloat> &,  //in_value,
-                                   const CuMatrixBase<BaseFloat> &out_value,
-                                   const CuMatrixBase<BaseFloat> &out_deriv,
-                                   Component *to_update,
-                                   CuMatrix<BaseFloat> *in_deriv) const  {
-  /*
-    Let the output be y, then
-      y_i = x_i - log(sum_i exp(x_i))
-    where x_i is the input to the component. The Jacobian matrix of this
-    function is
-      J = I - 1 exp(y^T)
-    where 1 is a vector of ones. Let the derivative vector at the output be e,
-    and at the input be d, then we have
-      d = e - exp(y) Sum(e)
-      d_i = e_i - exp(y_i) Sum(e)
-  */
-  in_deriv->Resize(out_deriv.NumRows(), out_deriv.NumCols());
-  KALDI_ASSERT(SameDim(out_value, out_deriv) && SameDim(out_value, *in_deriv));
-
-  in_deriv->DiffLogSoftmaxPerRow(out_value, out_deriv);
-
-  // Updates stats.
-  if (to_update != NULL) {
-    NonlinearComponent *to_update_nonlinear =
-        dynamic_cast<NonlinearComponent*>(to_update);
-    to_update_nonlinear->UpdateStats(out_value);
-  }
-}
-
-
-void AffineComponent::Scale(BaseFloat scale) {
-  linear_params_.Scale(scale);
-  bias_params_.Scale(scale);
-}
-
-// virtual
-void AffineComponent::Resize(int32 input_dim, int32 output_dim) {
-  KALDI_ASSERT(input_dim > 0 && output_dim > 0);
-  bias_params_.Resize(output_dim);
-  linear_params_.Resize(output_dim, input_dim);
-}
-
-void AffineComponent::Add(BaseFloat alpha, const UpdatableComponent &other_in) {
-  const AffineComponent *other =
-      dynamic_cast<const AffineComponent*>(&other_in);
-  KALDI_ASSERT(other != NULL);
-  linear_params_.AddMat(alpha, other->linear_params_);
-  bias_params_.AddVec(alpha, other->bias_params_);
-}
-
-AffineComponent::AffineComponent(const AffineComponent &component):
-    UpdatableComponent(component),
-    linear_params_(component.linear_params_),
-    bias_params_(component.bias_params_),
-    is_gradient_(component.is_gradient_) { }
-
-AffineComponent::AffineComponent(const CuMatrixBase<BaseFloat> &linear_params,
-                                 const CuVectorBase<BaseFloat> &bias_params,
-                                 BaseFloat learning_rate):
-    UpdatableComponent(learning_rate),
-    linear_params_(linear_params),
-    bias_params_(bias_params) {
-  KALDI_ASSERT(linear_params.NumRows() == bias_params.Dim()&&
-               bias_params.Dim() != 0);
-  is_gradient_ = false;
-}
-
-
-
-void AffineComponent::SetZero(bool treat_as_gradient) {
-  if (treat_as_gradient) {
-    SetLearningRate(1.0);
-  }
-  linear_params_.SetZero();
-  bias_params_.SetZero();
-  if (treat_as_gradient)
-    is_gradient_ = true;
-}
-
-void AffineComponent::SetParams(const VectorBase<BaseFloat> &bias,
-                                const MatrixBase<BaseFloat> &linear) {
-  bias_params_ = bias;
-  linear_params_ = linear;
-  KALDI_ASSERT(bias_params_.Dim() == linear_params_.NumRows());
-}
-
-void AffineComponent::PerturbParams(BaseFloat stddev) {
-  CuMatrix<BaseFloat> temp_linear_params(linear_params_);
-  temp_linear_params.SetRandn();
-  linear_params_.AddMat(stddev, temp_linear_params);
-
-  CuVector<BaseFloat> temp_bias_params(bias_params_);
-  temp_bias_params.SetRandn();
-  bias_params_.AddVec(stddev, temp_bias_params);
-}
-
-std::string AffineComponent::Info() const {
-  std::stringstream stream;
-  BaseFloat linear_params_size = static_cast<BaseFloat>(linear_params_.NumRows())
-      * static_cast<BaseFloat>(linear_params_.NumCols());
-  BaseFloat linear_stddev =
-      std::sqrt(TraceMatMat(linear_params_, linear_params_, kTrans) /
-                linear_params_size),
-      bias_stddev = std::sqrt(VecVec(bias_params_, bias_params_) /
-                              bias_params_.Dim());
-  stream << Type() << ", input-dim=" << InputDim()
-         << ", output-dim=" << OutputDim()
-         << ", linear-params-stddev=" << linear_stddev
-         << ", bias-params-stddev=" << bias_stddev
-         << ", learning-rate=" << LearningRate();
-  return stream.str();
-}
-
-Component* AffineComponent::Copy() const {
-  AffineComponent *ans = new AffineComponent();
-  ans->learning_rate_ = learning_rate_;
-  ans->linear_params_ = linear_params_;
-  ans->bias_params_ = bias_params_;
-  ans->is_gradient_ = is_gradient_;
-  return ans;
-}
-
-BaseFloat AffineComponent::DotProduct(const UpdatableComponent &other_in) const {
-  const AffineComponent *other =
-      dynamic_cast<const AffineComponent*>(&other_in);
-  return TraceMatMat(linear_params_, other->linear_params_, kTrans)
-      + VecVec(bias_params_, other->bias_params_);
-}
-
-void AffineComponent::Init(BaseFloat learning_rate,
-                           int32 input_dim, int32 output_dim,
-                           BaseFloat param_stddev, BaseFloat bias_stddev) {
-  UpdatableComponent::Init(learning_rate);
-  linear_params_.Resize(output_dim, input_dim);
-  bias_params_.Resize(output_dim);
-  KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0);
-  linear_params_.SetRandn(); // sets to random normally distributed noise.
-  linear_params_.Scale(param_stddev);
-  bias_params_.SetRandn();
-  bias_params_.Scale(bias_stddev);
-}
-
-void AffineComponent::Init(BaseFloat learning_rate,
-                           std::string matrix_filename) {
-  UpdatableComponent::Init(learning_rate);
-  CuMatrix<BaseFloat> mat;
-  ReadKaldiObject(matrix_filename, &mat); // will abort on failure.
-  KALDI_ASSERT(mat.NumCols() >= 2);
-  int32 input_dim = mat.NumCols() - 1, output_dim = mat.NumRows();
-  linear_params_.Resize(output_dim, input_dim);
-  bias_params_.Resize(output_dim);
-  linear_params_.CopyFromMat(mat.Range(0, output_dim, 0, input_dim));
-  bias_params_.CopyColFromMat(mat, input_dim);
-}
-
-void AffineComponent::InitFromString(std::string args) {
-  std::string orig_args(args);
-  bool ok = true;
-  BaseFloat learning_rate = learning_rate_;
-  std::string matrix_filename;
-  int32 input_dim = -1, output_dim = -1;
-  ParseFromString("learning-rate", &args, &learning_rate); // optional.
-  if (ParseFromString("matrix", &args, &matrix_filename)) {
-    Init(learning_rate, matrix_filename);
-    if (ParseFromString("input-dim", &args, &input_dim))
-      KALDI_ASSERT(input_dim == InputDim() &&
-                   "input-dim mismatch vs. matrix.");
-    if (ParseFromString("output-dim", &args, &output_dim))
-      KALDI_ASSERT(output_dim == OutputDim() &&
-                   "output-dim mismatch vs. matrix.");
-  } else {
-    ok = ok && ParseFromString("input-dim", &args, &input_dim);
-    ok = ok && ParseFromString("output-dim", &args, &output_dim);
-    BaseFloat param_stddev = 1.0 / std::sqrt(input_dim),
-        bias_stddev = 1.0;
-    ParseFromString("param-stddev", &args, &param_stddev);
-    ParseFromString("bias-stddev", &args, &bias_stddev);
-    Init(learning_rate, input_dim, output_dim,
-         param_stddev, bias_stddev);
-  }
-  if (!args.empty())
-    KALDI_ERR << "Could not process these elements in initializer: "
-              << args;
-  if (!ok)
-    KALDI_ERR << "Bad initializer " << orig_args;
-}
-
-
-void AffineComponent::Propagate(const ChunkInfo &in_info,
-                                const ChunkInfo &out_info,
-                                const CuMatrixBase<BaseFloat> &in,
-                                CuMatrixBase<BaseFloat> *out) const  {
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-
-  // No need for asserts as they'll happen within the matrix operations.
-  out->CopyRowsFromVec(bias_params_); // copies bias_params_ to each row
-  // of *out.
-  out->AddMatMat(1.0, in, kNoTrans, linear_params_, kTrans, 1.0);
-}
-
-void AffineComponent::UpdateSimple(const CuMatrixBase<BaseFloat> &in_value,
-                                   const CuMatrixBase<BaseFloat> &out_deriv) {
-  bias_params_.AddRowSumMat(learning_rate_, out_deriv, 1.0);
-  linear_params_.AddMatMat(learning_rate_, out_deriv, kTrans,
-                           in_value, kNoTrans, 1.0);
-}
-
-void AffineComponent::Backprop(const ChunkInfo &, //in_info,
-                               const ChunkInfo &, //out_info,
-                               const CuMatrixBase<BaseFloat> &in_value,
-                               const CuMatrixBase<BaseFloat> &, //out_value,
-                               const CuMatrixBase<BaseFloat> &out_deriv,
-                               Component *to_update_in, // may be identical to "this".
-                               CuMatrix<BaseFloat> *in_deriv) const {
-  AffineComponent *to_update = dynamic_cast<AffineComponent*>(to_update_in);
-  in_deriv->Resize(out_deriv.NumRows(), InputDim());
-  // Propagate the derivative back to the input.
-  in_deriv->AddMatMat(1.0, out_deriv, kNoTrans, linear_params_, kNoTrans,
-                      0.0);
-
-  if (to_update != NULL) {
-    // Next update the model (must do this 2nd so the derivatives we propagate
-    // are accurate, in case this == to_update_in.)
-    if (to_update->is_gradient_)
-      to_update->UpdateSimple(in_value, out_deriv);
-    else  // the call below is to a virtual function that may be re-implemented
-      to_update->Update(in_value, out_deriv);  // by child classes.
-  }
-}
-
-void AffineComponent::Read(std::istream &is, bool binary) {
-  std::ostringstream ostr_beg, ostr_end;
-  ostr_beg << "<" << Type() << ">"; // e.g. "<AffineComponent>"
-  ostr_end << "</" << Type() << ">"; // e.g. "</AffineComponent>"
-  // might not see the "<AffineComponent>" part because
-  // of how ReadNew() works.
-  ExpectOneOrTwoTokens(is, binary, ostr_beg.str(), "<LearningRate>");
-  ReadBasicType(is, binary, &learning_rate_);
-  ExpectToken(is, binary, "<LinearParams>");
-  linear_params_.Read(is, binary);
-  ExpectToken(is, binary, "<BiasParams>");
-  bias_params_.Read(is, binary);
-  std::string tok;
-  // back-compatibility code.  TODO: re-do this later.
-  ReadToken(is, binary, &tok);
-  if (tok == "<AvgInput>") { // discard the following.
-    CuVector<BaseFloat> avg_input;
-    avg_input.Read(is, binary);
-    BaseFloat avg_input_count;
-    ExpectToken(is, binary, "<AvgInputCount>");
-    ReadBasicType(is, binary, &avg_input_count);
-    ReadToken(is, binary, &tok);
-  }
-  if (tok == "<IsGradient>") {
-    ReadBasicType(is, binary, &is_gradient_);
-    ExpectToken(is, binary, ostr_end.str());
-  } else {
-    is_gradient_ = false;
-    KALDI_ASSERT(tok == ostr_end.str());
-  }
-}
-
-void AffineComponent::Write(std::ostream &os, bool binary) const {
-  std::ostringstream ostr_beg, ostr_end;
-  ostr_beg << "<" << Type() << ">"; // e.g. "<AffineComponent>"
-  ostr_end << "</" << Type() << ">"; // e.g. "</AffineComponent>"
-  WriteToken(os, binary, ostr_beg.str());
-  WriteToken(os, binary, "<LearningRate>");
-  WriteBasicType(os, binary, learning_rate_);
-  WriteToken(os, binary, "<LinearParams>");
-  linear_params_.Write(os, binary);
-  WriteToken(os, binary, "<BiasParams>");
-  bias_params_.Write(os, binary);
-  WriteToken(os, binary, "<IsGradient>");
-  WriteBasicType(os, binary, is_gradient_);
-  WriteToken(os, binary, ostr_end.str());
-}
-
-int32 AffineComponent::GetParameterDim() const {
-  return (InputDim() + 1) * OutputDim();
-}
-void AffineComponent::Vectorize(VectorBase<BaseFloat> *params) const {
-  params->Range(0, InputDim() * OutputDim()).CopyRowsFromMat(linear_params_);
-  params->Range(InputDim() * OutputDim(),
-                OutputDim()).CopyFromVec(bias_params_);
-}
-void AffineComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
-  linear_params_.CopyRowsFromVec(params.Range(0, InputDim() * OutputDim()));
-  bias_params_.CopyFromVec(params.Range(InputDim() * OutputDim(),
-                                        OutputDim()));
-}
-
-void AffineComponent::LimitRank(int32 d,
-                                AffineComponent **a, AffineComponent **b) const {
-  KALDI_ASSERT(d <= InputDim());
-
-  // We'll limit the rank of just the linear part, keeping the bias vector full.
-  Matrix<BaseFloat> M (linear_params_);
-  int32 rows = M.NumRows(), cols = M.NumCols(), rc_min = std::min(rows, cols);
-  Vector<BaseFloat> s(rc_min);
-  Matrix<BaseFloat> U(rows, rc_min), Vt(rc_min, cols);
-  // Do the destructive svd M = U diag(s) V^T.  It actually outputs the transpose of V.
-  M.DestructiveSvd(&s, &U, &Vt);
-  SortSvd(&s, &U, &Vt); // Sort the singular values from largest to smallest.
-  BaseFloat old_svd_sum = s.Sum();
-  U.Resize(rows, d, kCopyData);
-  s.Resize(d, kCopyData);
-  Vt.Resize(d, cols, kCopyData);
-  BaseFloat new_svd_sum = s.Sum();
-  KALDI_LOG << "Reduced rank from "
-            << rc_min <<  " to " << d << ", SVD sum reduced from "
-            << old_svd_sum << " to " << new_svd_sum;
-
-  // U.MulColsVec(s); // U <-- U diag(s)
-  Vt.MulRowsVec(s); // Vt <-- diag(s) Vt.
-
-  *a = dynamic_cast<AffineComponent*>(this->Copy());
-  *b = dynamic_cast<AffineComponent*>(this->Copy());
-
-  (*a)->bias_params_.Resize(d, kSetZero);
-  (*a)->linear_params_ = Vt;
-
-  (*b)->bias_params_ = this->bias_params_;
-  (*b)->linear_params_ = U;
-}
-
-// Returns a new component equivalent to applying this component followed by
-// next_component (another AffineComponent).
-Component *AffineComponent::CollapseWithNext(
-    const AffineComponent &next_component) const {
-  // Start from a copy of *this.  The copy may really be of a derived type such
-  // as AffineComponentPreconditioned; that is fine, and it means things like
-  // learning rates and the "alpha" of the preconditioned component carry over.
-  AffineComponent *merged = dynamic_cast<AffineComponent*>(this->Copy());
-  KALDI_ASSERT(merged != NULL);
-
-  // Linear part: next.linear * this.linear.
-  merged->linear_params_.Resize(next_component.OutputDim(), InputDim());
-  merged->linear_params_.AddMatMat(1.0, next_component.linear_params_,
-                                   kNoTrans, this->linear_params_, kNoTrans,
-                                   0.0);
-
-  // Bias part: next.bias + next.linear * this.bias.
-  merged->bias_params_ = next_component.bias_params_;
-  merged->bias_params_.AddMatVec(1.0, next_component.linear_params_, kNoTrans,
-                                 this->bias_params_, 1.0);
-  return merged;
-}
-
-// Returns a new component equivalent to applying this component followed by
-// the fixed (non-updatable) affine component next_component.
-Component *AffineComponent::CollapseWithNext(
-    const FixedAffineComponent &next_component) const {
-  // Because one of the two inputs is non-updatable, the result is made
-  // non-updatable too: it starts life as a copy of next_component.
-  FixedAffineComponent *merged =
-      dynamic_cast<FixedAffineComponent*>(next_component.Copy());
-  KALDI_ASSERT(merged != NULL);
-
-  // Linear part: next.linear * this.linear.
-  merged->linear_params_.Resize(next_component.OutputDim(), InputDim());
-  merged->linear_params_.AddMatMat(1.0, next_component.linear_params_,
-                                   kNoTrans, this->linear_params_, kNoTrans,
-                                   0.0);
-
-  // Bias part: next.bias + next.linear * this.bias.
-  merged->bias_params_ = next_component.bias_params_;
-  merged->bias_params_.AddMatVec(1.0, next_component.linear_params_, kNoTrans,
-                                 this->bias_params_, 1.0);
-  return merged;
-}
-
-// Returns a copy of this component with the per-dimension scales of the
-// following FixedScaleComponent folded into its parameters.
-Component *AffineComponent::CollapseWithNext(
-    const FixedScaleComponent &next_component) const {
-  KALDI_ASSERT(this->OutputDim() == next_component.InputDim());
-  AffineComponent *scaled = dynamic_cast<AffineComponent*>(this->Copy());
-  KALDI_ASSERT(scaled != NULL);
-  // Each output dimension (row of the linear part, element of the bias) is
-  // multiplied by the corresponding scale.
-  scaled->bias_params_.MulElements(next_component.scales_);
-  scaled->linear_params_.MulRowsVec(next_component.scales_);
-  return scaled;
-}
-
-
-
-// Returns a new component equivalent to applying the fixed affine component
-// prev_component followed by this component.
-Component *AffineComponent::CollapseWithPrevious(
-    const FixedAffineComponent &prev_component) const {
-  // Because one of the two inputs is non-updatable, the result is made
-  // non-updatable too: it starts life as a copy of prev_component.
-  FixedAffineComponent *merged =
-      dynamic_cast<FixedAffineComponent*>(prev_component.Copy());
-  KALDI_ASSERT(merged != NULL);
-
-  // Linear part: this.linear * prev.linear.
-  merged->linear_params_.Resize(this->OutputDim(), prev_component.InputDim());
-  merged->linear_params_.AddMatMat(1.0, this->linear_params_, kNoTrans,
-                                   prev_component.linear_params_, kNoTrans,
-                                   0.0);
-
-  // Bias part: this.bias + this.linear * prev.bias.
-  merged->bias_params_ = this->bias_params_;
-  merged->bias_params_.AddMatVec(1.0, this->linear_params_, kNoTrans,
-                                 prev_component.bias_params_, 1.0);
-  return merged;
-}
-
-// Reads the component from the stream "is" (binary or text, per "binary").
-// Fields are read in this order: <LearningRate>, <LinearParams>,
-// <BiasParams>, <Alpha>, then an optional <MaxChange>, then the closing
-// "</Type()>" token.  When <MaxChange> is absent, max_change_ is set to 0.0.
-void AffineComponentPreconditioned::Read(std::istream &is, bool binary) {
-  std::ostringstream ostr_beg, ostr_end;
-  ostr_beg << "<" << Type() << ">"; // e.g. "<AffineComponentPreconditioned>"
-  ostr_end << "</" << Type() << ">"; // e.g. "</AffineComponentPreconditioned>"
-  // might not see the "<AffineComponentPreconditioned>" part because
-  // of how ReadNew() works.
-  ExpectOneOrTwoTokens(is, binary, ostr_beg.str(), "<LearningRate>");
-  ReadBasicType(is, binary, &learning_rate_);
-  ExpectToken(is, binary, "<LinearParams>");
-  linear_params_.Read(is, binary);
-  ExpectToken(is, binary, "<BiasParams>");
-  bias_params_.Read(is, binary);
-  ExpectToken(is, binary, "<Alpha>");
-  ReadBasicType(is, binary, &alpha_);
-  // todo: remove back-compat code.  Will just be:
-  // ExpectToken(is, binary, "<MaxChange>");
-  // ReadBasicType(is, binary, &max_change_);
-  // ExpectToken(is, binary, ostr_end);
-  // [end of function]
-  // Back-compat: the next token is either "<MaxChange>" or, for input that
-  // lacks that field, already the closing token.
-  std::string tok;
-  ReadToken(is, binary, &tok);
-  if (tok == "<MaxChange>") {
-    ReadBasicType(is, binary, &max_change_);
-    ExpectToken(is, binary, ostr_end.str());
-  } else {
-    max_change_ = 0.0;
-    KALDI_ASSERT(tok == ostr_end.str());
-  }
-}
-
-// Initializes the component from a configuration string.  Recognized keys:
-// learning-rate, alpha, max-change, and either "matrix" (file to read the
-// parameters from; input-dim / output-dim are then optional consistency
-// checks) or input-dim and output-dim (both required) with optional
-// param-stddev and bias-stddev.  Anything left over in "args" is an error.
-void AffineComponentPreconditioned::InitFromString(std::string args) {
-  const std::string original_args(args);  // kept for the error message.
-  BaseFloat lrate = learning_rate_;
-  BaseFloat alpha = 0.1, max_change = 0.0;
-  ParseFromString("learning-rate", &args, &lrate); // optional.
-  ParseFromString("alpha", &args, &alpha);
-  ParseFromString("max-change", &args, &max_change);
-
-  int32 input_dim = -1, output_dim = -1;
-  std::string matrix_filename;
-  const bool from_matrix = ParseFromString("matrix", &args, &matrix_filename);
-  if (!from_matrix) {
-    // Random initialization: both dimensions are mandatory.
-    bool have_dims = ParseFromString("input-dim", &args, &input_dim);
-    have_dims = have_dims && ParseFromString("output-dim", &args, &output_dim);
-    BaseFloat param_stddev = 1.0 / std::sqrt(input_dim),
-        bias_stddev = 1.0;
-    ParseFromString("param-stddev", &args, &param_stddev);
-    ParseFromString("bias-stddev", &args, &bias_stddev);
-    if (!have_dims) {
-      KALDI_ERR << "Bad initializer " << original_args;
-    }
-    Init(lrate, input_dim, output_dim, param_stddev,
-         bias_stddev, alpha, max_change);
-  } else {
-    // Parameters come from the matrix file; any dimensions given must agree
-    // with it.
-    Init(lrate, alpha, max_change, matrix_filename);
-    if (ParseFromString("input-dim", &args, &input_dim)) {
-      KALDI_ASSERT(input_dim == InputDim() &&
-                   "input-dim mismatch vs. matrix.");
-    }
-    if (ParseFromString("output-dim", &args, &output_dim)) {
-      KALDI_ASSERT(output_dim == OutputDim() &&
-                   "output-dim mismatch vs. matrix.");
-    }
-  }
-  if (!args.empty()) {
-    KALDI_ERR << "Could not process these elements in initializer: "
-              << args;
-  }
-}
-
-// Initializes the component from a matrix read from "matrix_filename".  The
-// matrix has one row per output dimension; its last column becomes the bias
-// and the remaining columns become the linear parameters.
-void AffineComponentPreconditioned::Init(BaseFloat learning_rate,
-                                         BaseFloat alpha, BaseFloat max_change,
-                                         std::string matrix_filename) {
-  UpdatableComponent::Init(learning_rate);
-  alpha_ = alpha;
-  max_change_ = max_change;
-
-  CuMatrix<BaseFloat> affine_mat;
-  ReadKaldiObject(matrix_filename, &affine_mat); // will abort on failure.
-  KALDI_ASSERT(affine_mat.NumCols() >= 2);
-  const int32 num_linear_cols = affine_mat.NumCols() - 1,
-      num_rows = affine_mat.NumRows();
-
-  linear_params_.Resize(num_rows, num_linear_cols);
-  bias_params_.Resize(num_rows);
-  linear_params_.CopyFromMat(
-      affine_mat.Range(0, num_rows, 0, num_linear_cols));
-  bias_params_.CopyColFromMat(affine_mat, num_linear_cols);
-}
-
-// Initializes the component with random parameters.
-//   learning_rate            passed on to UpdatableComponent::Init().
-//   input_dim, output_dim    dimensions of the linear part; both must be > 0.
-//   param_stddev             scale applied to the random linear parameters;
-//                            must be >= 0.
-//   bias_stddev              scale applied to the random bias.
-//   alpha                    stored in alpha_; must be > 0.
-//   max_change               stored in max_change_.
-void AffineComponentPreconditioned::Init(
-    BaseFloat learning_rate,
-    int32 input_dim, int32 output_dim,
-    BaseFloat param_stddev, BaseFloat bias_stddev,
-    BaseFloat alpha, BaseFloat max_change) {
-  UpdatableComponent::Init(learning_rate);
-  // Validate every argument up front, before anything is allocated or stored.
-  KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0);
-  KALDI_ASSERT(alpha > 0.0);
-  linear_params_.Resize(output_dim, input_dim);
-  bias_params_.Resize(output_dim);
-  linear_params_.SetRandn(); // sets to random normally distributed noise.
-  linear_params_.Scale(param_stddev);
-  bias_params_.SetRandn();
-  bias_params_.Scale(bias_stddev);
-  alpha_ = alpha;
-  max_change_ = max_change; // Note: any value of max_change_ is valid, but
-  // only values > 0.0 will actually activate the code.
-}
-
-
-// Writes the component to "os": an opening "<Type()>" token, then the
-// learning rate, linear parameters, bias parameters, alpha and max-change
-// (each preceded by its own token), then the closing "</Type()>" token.
-void AffineComponentPreconditioned::Write(std::ostream &os, bool binary) const {
-  // e.g. "<AffineComponentPreconditioned>" and
-  // "</AffineComponentPreconditioned>".
-  const std::string open_tag = std::string("<") + Type() + ">";
-  const std::string close_tag = std::string("</") + Type() + ">";
-
-  WriteToken(os, binary, open_tag);
-
-  WriteToken(os, binary, "<LearningRate>");
-  WriteBasicType(os, binary, learning_rate_);
-
-  WriteToken(os, binary, "<LinearParams>");
-  linear_params_.Write(os, binary);
-
-  WriteToken(os, binary, "<BiasParams>");
-  bias_params_.Write(os, binary);
-
-  WriteToken(os, binary, "<Alpha>");
-  WriteBasicType(os, binary, alpha_);
-
-  WriteToken(os, binary, "<MaxChange>");
-  WriteBasicType(os, binary, max_change_);
-
-  WriteToken(os, binary, close_tag);
-}
-
-// Returns a one-line, human-readable summary: type, dimensions, root-mean-
-// square of the linear and bias parameters, learning rate, alpha and
-// max-change.
-std::string AffineComponentPreconditioned::Info() const {
-  // Element count is computed in floating point.
-  const BaseFloat num_linear_elems =
-      static_cast<BaseFloat>(linear_params_.NumRows())
-      * static_cast<BaseFloat>(linear_params_.NumCols());
-  const BaseFloat linear_stddev = std::sqrt(
-      TraceMatMat(linear_params_, linear_params_, kTrans) / num_linear_elems);
-  const BaseFloat bias_stddev = std::sqrt(
-      VecVec(bias_params_, bias_params_) / bias_params_.Dim());
-
-  std::ostringstream summary;
-  summary << Type() << ", input-dim=" << InputDim()
-          << ", output-dim=" << OutputDim()
-          << ", linear-params-stddev=" << linear_stddev
-          << ", bias-params-stddev=" << bias_stddev
-          << ", learning-rate=" << LearningRate()
-          << ", alpha=" << alpha_
-          << ", max-change=" << max_change_;
-  return summary.str();
-}
-
-// Returns a newly allocated AffineComponentPreconditioned carrying the same
-// parameters and settings as this one.
-Component* AffineComponentPreconditioned::Copy() const {
-  AffineComponentPreconditioned *clone = new AffineComponentPreconditioned();
-  // Parameters.
-  clone->linear_params_ = linear_params_;
-  clone->bias_params_ = bias_params_;
-  // Scalar settings.
-  clone->learning_rate_ = learning_rate_;
-  clone->alpha_ = alpha_;
-  clone->max_change_ = max_change_;
-  clone->is_gradient_ = is_gradient_;
-  return clone;
-}
-
-
-// Returns the factor (<= 1.0) by which the update for this minibatch should
-// be scaled so that learning_rate_ times the sum over rows of
-// (l2 norm of input row) * (l2 norm of output-derivative row) does not exceed
-// max_change_.  Logs the first few times a factor below 1.0 is returned.
-BaseFloat AffineComponentPreconditioned::GetScalingFactor(
-    const CuMatrix<BaseFloat> &in_value_precon,
-    const CuMatrix<BaseFloat> &out_deriv_precon) {
-  static int scaling_factor_printed = 0;
-
-  const int32 num_rows = in_value_precon.NumRows();
-  KALDI_ASSERT(num_rows == out_deriv_precon.NumRows());
-
-  // Per-row squared l2 norms, then square-rooted to get the actual norms.
-  CuVector<BaseFloat> in_norm(num_rows), out_deriv_norm(num_rows);
-  in_norm.AddDiagMat2(1.0, in_value_precon, kNoTrans, 0.0);
-  out_deriv_norm.AddDiagMat2(1.0, out_deriv_precon, kNoTrans, 0.0);
-  in_norm.ApplyPow(0.5);
-  out_deriv_norm.ApplyPow(0.5);
-
-  // "sum" is the product of norms that we are trying to limit to max_change_.
-  const BaseFloat sum = learning_rate_ * VecVec(in_norm, out_deriv_norm);
-  KALDI_ASSERT(sum == sum && sum - sum == 0.0 &&
-               "NaN in backprop");
-  KALDI_ASSERT(sum >= 0.0);
-
-  if (sum <= max_change_)
-    return 1.0;
-
-  const BaseFloat ans = max_change_ / sum;
-  if (scaling_factor_printed < 10) {
-    KALDI_LOG << "Limiting step size to " << max_change_
-              << " using scaling factor " << ans << ", for component index "
-              << Index();
-    scaling_factor_printed++;
-  }
-  return ans;
-}
-
-// Updates linear_params_ and bias_params_ from one minibatch, using
-// preconditioned versions of the inputs and output derivatives.
-//   in_value   input to the component, one row per frame.
-//   out_deriv  derivative at the output of the component, one row per frame.
-// The input is extended with a column of ones so that the bias is handled
-// like an extra input dimension.  When max_change_ > 0.0 the step is scaled
-// down by GetScalingFactor().
-void AffineComponentPreconditioned::Update(
-    const CuMatrixBase<BaseFloat> &in_value,
-    const CuMatrixBase<BaseFloat> &out_deriv) {
-  CuMatrix<BaseFloat> in_value_temp;
-
-  in_value_temp.Resize(in_value.NumRows(),
-                       in_value.NumCols() + 1, kUndefined);
-  in_value_temp.Range(0, in_value.NumRows(),
-                      0, in_value.NumCols()).CopyFromMat(in_value);
-
-  // Add the 1.0 at the end of each row "in_value_temp"
-  in_value_temp.Range(0, in_value.NumRows(),
-                      in_value.NumCols(), 1).Set(1.0);
-
-  CuMatrix<BaseFloat> in_value_precon(in_value_temp.NumRows(),
-                                      in_value_temp.NumCols(), kUndefined),
-      out_deriv_precon(out_deriv.NumRows(),
-                       out_deriv.NumCols(), kUndefined);
-  // each row of in_value_precon will be that same row of
-  // in_value, but multiplied by the inverse of a Fisher
-  // matrix that has been estimated from all the other rows,
-  // smoothed by some appropriate amount times the identity
-  // matrix (this amount is proportional to \alpha).
-  PreconditionDirectionsAlphaRescaled(in_value_temp, alpha_, &in_value_precon);
-  PreconditionDirectionsAlphaRescaled(out_deriv, alpha_, &out_deriv_precon);
-
-  BaseFloat minibatch_scale = 1.0;
-
-  // The step-size limit is only active for positive max_change_.
-  if (max_change_ > 0.0)
-    minibatch_scale = GetScalingFactor(in_value_precon, out_deriv_precon);
-
-
-  // View of the preconditioned input without its last (appended) column.
-  CuSubMatrix<BaseFloat> in_value_precon_part(in_value_precon,
-                                            0, in_value_precon.NumRows(),
-                                            0, in_value_precon.NumCols() - 1);
-  // this "precon_ones" is what happens to the vector of 1's representing
-  // offsets, after multiplication by the preconditioner.
-  CuVector<BaseFloat> precon_ones(in_value_precon.NumRows());
-
-  precon_ones.CopyColFromMat(in_value_precon, in_value_precon.NumCols() - 1);
-
-  // Effective learning rate for this minibatch, then the actual updates:
-  // bias += lrate * out_deriv_precon^T * precon_ones,
-  // linear += lrate * out_deriv_precon^T * in_value_precon_part.
-  BaseFloat local_lrate = minibatch_scale * learning_rate_;
-  bias_params_.AddMatVec(local_lrate, out_deriv_precon, kTrans,
-                         precon_ones, 1.0);
-  linear_params_.AddMatMat(local_lrate, out_deriv_precon, kTrans,
-                           in_value_precon_part, kNoTrans, 1.0);
-}
-
-
-// virtual
-// Resizes the parameters to the given dimensions (both must be > 1), clamps
-// the preconditioner ranks to be strictly below the corresponding dimension,
-// and replaces both preconditioners with freshly constructed ones.
-void AffineComponentPreconditionedOnline::Resize(
-    int32 input_dim, int32 output_dim) {
-  KALDI_ASSERT(input_dim > 1 && output_dim > 1);
-  if (rank_in_ >= input_dim)
-    rank_in_ = input_dim - 1;
-  if (rank_out_ >= output_dim)
-    rank_out_ = output_dim - 1;
-
-  bias_params_.Resize(output_dim);
-  linear_params_.Resize(output_dim, input_dim);
-
-  // Reset both preconditioners to the default-constructed state, then
-  // re-apply the current configuration values to them.
-  OnlinePreconditioner fresh_preconditioner;
-  preconditioner_in_ = fresh_preconditioner;
-  preconditioner_out_ = fresh_preconditioner;
-  SetPreconditionerConfigs();
-}
-
-
-// Reads the component from the stream "is" (binary or text, per "binary").
-// Fields are read in this order: <LearningRate>, <LinearParams>,
-// <BiasParams>, then either <Rank> (older form) or <RankIn> + <RankOut>,
-// an optional <UpdatePeriod>, <NumSamplesHistory>, <Alpha>,
-// <MaxChangePerSample> and the closing "</Type()>" token.  Finishes by
-// pushing the values read into the two preconditioners.
-void AffineComponentPreconditionedOnline::Read(std::istream &is, bool binary) {
-  std::ostringstream ostr_beg, ostr_end;
-  ostr_beg << "<" << Type() << ">";
-  ostr_end << "</" << Type() << ">";
-  // might not see the "<AffineComponentPreconditionedOnline>" part because
-  // of how ReadNew() works.
-  ExpectOneOrTwoTokens(is, binary, ostr_beg.str(), "<LearningRate>");
-  ReadBasicType(is, binary, &learning_rate_);
-  ExpectToken(is, binary, "<LinearParams>");
-  linear_params_.Read(is, binary);
-  ExpectToken(is, binary, "<BiasParams>");
-  bias_params_.Read(is, binary);
-  std::string tok;
-  ReadToken(is, binary, &tok);
-  if (tok == "<Rank>") {  // back-compatibility (temporary)
-    // A single rank is used for both the input and output side.
-    ReadBasicType(is, binary, &rank_in_);
-    rank_out_ = rank_in_;
-  } else {
-    KALDI_ASSERT(tok == "<RankIn>");
-    ReadBasicType(is, binary, &rank_in_);
-    ExpectToken(is, binary, "<RankOut>");
-    ReadBasicType(is, binary, &rank_out_);
-  }
-  // <UpdatePeriod> is optional; when it is absent the token just read must
-  // already be <NumSamplesHistory> and update_period_ is set to 1.
-  ReadToken(is, binary, &tok);
-  if (tok == "<UpdatePeriod>") {
-    ReadBasicType(is, binary, &update_period_);
-    ExpectToken(is, binary, "<NumSamplesHistory>");
-  } else {
-    update_period_ = 1;
-    KALDI_ASSERT(tok == "<NumSamplesHistory>");
-  }
-  ReadBasicType(is, binary, &num_samples_history_);
-  ExpectToken(is, binary, "<Alpha>");
-  ReadBasicType(is, binary, &alpha_);
-  ExpectToken(is, binary, "<MaxChangePerSample>");
-  ReadBasicType(is, binary, &max_change_per_sample_);
-  ExpectToken(is, binary, ostr_end.str());
-  // Propagate rank / history / alpha / update-period to the preconditioners.
-  SetPreconditionerConfigs();
-}
-
-// Initializes the component from a configuration string.  Recognized keys:
-// learning-rate, num-samples-history, alpha, max-change-per-sample, rank-in,
-// rank-out, update-period, and either "matrix" (file to read the parameters
-// from; input-dim / output-dim are then optional consistency checks) or
-// input-dim and output-dim (both required) with optional param-stddev and
-// bias-stddev.  Raises an error via KALDI_ERR if a required dimension is
-// missing or if any part of "args" is left unprocessed.
-void AffineComponentPreconditionedOnline::InitFromString(std::string args) {
-  std::string orig_args(args);
-  bool ok = true;
-  std::string matrix_filename;
-  BaseFloat learning_rate = learning_rate_;
-  BaseFloat num_samples_history = 2000.0, alpha = 4.0,
-      max_change_per_sample = 0.1;
-  int32 input_dim = -1, output_dim = -1, rank_in = 30, rank_out = 80,
-      update_period = 1;
-  ParseFromString("learning-rate", &args, &learning_rate); // optional.
-  ParseFromString("num-samples-history", &args, &num_samples_history);
-  ParseFromString("alpha", &args, &alpha);
-  ParseFromString("max-change-per-sample", &args, &max_change_per_sample);
-  ParseFromString("rank-in", &args, &rank_in);
-  ParseFromString("rank-out", &args, &rank_out);
-  ParseFromString("update-period", &args, &update_period);
-
-  if (ParseFromString("matrix", &args, &matrix_filename)) {
-    Init(learning_rate, rank_in, rank_out, update_period,
-         num_samples_history, alpha, max_change_per_sample,
-         matrix_filename);
-    if (ParseFromString("input-dim", &args, &input_dim))
-      KALDI_ASSERT(input_dim == InputDim() &&
-                   "input-dim mismatch vs. matrix.");
-    if (ParseFromString("output-dim", &args, &output_dim))
-      KALDI_ASSERT(output_dim == OutputDim() &&
-                   "output-dim mismatch vs. matrix.");
-  } else {
-    ok = ok && ParseFromString("input-dim", &args, &input_dim);
-    ok = ok && ParseFromString("output-dim", &args, &output_dim);
-    // Report a missing dimension here, before Init() is called: otherwise the
-    // placeholder value -1 would be handed to Init() and used to size the
-    // parameters, and this error message would never be reached.
-    if (!ok)
-      KALDI_ERR << "Bad initializer " << orig_args;
-    BaseFloat param_stddev = 1.0 / std::sqrt(input_dim),
-        bias_stddev = 1.0;
-    ParseFromString("param-stddev", &args, &param_stddev);
-    ParseFromString("bias-stddev", &args, &bias_stddev);
-    Init(learning_rate, input_dim, output_dim, param_stddev,
-         bias_stddev, rank_in, rank_out, update_period,
-         num_samples_history, alpha, max_change_per_sample);
-  }
-  if (!args.empty())
-    KALDI_ERR << "Could not process these elements in initializer: "
-              << args;
-}
-
-// Copies this component's configuration values into both preconditioners.
-// The two share everything except the rank (rank_in_ vs. rank_out_).
-void AffineComponentPreconditionedOnline::SetPreconditionerConfigs() {
-  // Rank differs between the input and output side.
-  preconditioner_in_.SetRank(rank_in_);
-  preconditioner_out_.SetRank(rank_out_);
-
-  // The remaining settings are common to both.
-  preconditioner_in_.SetNumSamplesHistory(num_samples_history_);
-  preconditioner_out_.SetNumSamplesHistory(num_samples_history_);
-
-  preconditioner_in_.SetAlpha(alpha_);
-  preconditioner_out_.SetAlpha(alpha_);
-
-  preconditioner_in_.SetUpdatePeriod(update_period_);
-  preconditioner_out_.SetUpdatePeriod(update_period_);
-}
-
-// Initializes the component from a matrix read from "matrix_filename".  The
-// matrix has one row per output dimension; its last column becomes the bias
-// and the remaining columns become the linear parameters.  The other
-// arguments are stored in the corresponding members and pushed to the
-// preconditioners; max_change_per_sample must be >= 0.
-void AffineComponentPreconditionedOnline::Init(
-    BaseFloat learning_rate, int32 rank_in, int32 rank_out,
-    int32 update_period, BaseFloat num_samples_history, BaseFloat alpha,
-    BaseFloat max_change_per_sample,
-    std::string matrix_filename) {
-  UpdatableComponent::Init(learning_rate);
-
-  // Preconditioner-related settings.
-  rank_in_ = rank_in;
-  rank_out_ = rank_out;
-  update_period_ = update_period;
-  num_samples_history_ = num_samples_history;
-  alpha_ = alpha;
-  SetPreconditionerConfigs();
-
-  KALDI_ASSERT(max_change_per_sample >= 0.0);
-  max_change_per_sample_ = max_change_per_sample;
-
-  // Parameters: [ linear | bias ] packed into one matrix on disk.
-  CuMatrix<BaseFloat> affine_mat;
-  ReadKaldiObject(matrix_filename, &affine_mat); // will abort on failure.
-  KALDI_ASSERT(affine_mat.NumCols() >= 2);
-  const int32 num_linear_cols = affine_mat.NumCols() - 1,
-      num_rows = affine_mat.NumRows();
-  linear_params_.Resize(num_rows, num_linear_cols);
-  bias_params_.Resize(num_rows);
-  linear_params_.CopyFromMat(
-      affine_mat.Range(0, num_rows, 0, num_linear_cols));
-  bias_params_.CopyColFromMat(affine_mat, num_linear_cols);
-}
-
-// Builds an online-preconditioned component from an existing AffineComponent:
-// parameters, learning rate and the is_gradient_ flag are taken from "orig",
-// the preconditioner settings from the remaining arguments, and
-// max_change_per_sample_ is fixed at 0.1.
-AffineComponentPreconditionedOnline::AffineComponentPreconditionedOnline(
-    const AffineComponent &orig,
-    int32 rank_in, int32 rank_out, int32 update_period,
-    BaseFloat num_samples_history, BaseFloat alpha):
-    max_change_per_sample_(0.1) {
-  // Taken over from the original component.
-  linear_params_ = orig.linear_params_;
-  bias_params_ = orig.bias_params_;
-  learning_rate_ = orig.learning_rate_;
-  is_gradient_ = orig.is_gradient_;
-
-  // Preconditioner settings supplied by the caller.
-  rank_in_ = rank_in;
-  rank_out_ = rank_out;
-  update_period_ = update_period;
-  num_samples_history_ = num_samples_history;
-  alpha_ = alpha;
-  SetPreconditionerConfigs();
-}
-
-// Initializes the component with random parameters.
-//   learning_rate            passed on to UpdatableComponent::Init().
-//   input_dim, output_dim    dimensions of the linear part; both must be > 0.
-//   param_stddev             scale applied to the random linear parameters;
-//                            must be >= 0.
-//   bias_stddev              scale applied to the random bias; must be >= 0.
-//   rank_in, rank_out, update_period, num_samples_history, alpha
-//                            stored in the corresponding members and pushed
-//                            to the preconditioners.
-//   max_change_per_sample    stored in max_change_per_sample_; must be >= 0.
-void AffineComponentPreconditionedOnline::Init(
-    BaseFloat learning_rate,
-    int32 input_dim, int32 output_dim,
-    BaseFloat param_stddev, BaseFloat bias_stddev,
-    int32 rank_in, int32 rank_out, int32 update_period,
-    BaseFloat num_samples_history, BaseFloat alpha,
-    BaseFloat max_change_per_sample) {
-  UpdatableComponent::Init(learning_rate);
-  // Check the arguments before they are used to size the parameters, so that
-  // a bad dimension is reported by this assertion rather than by Resize().
-  KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0 &&
-               bias_stddev >= 0.0);
-  KALDI_ASSERT(max_change_per_sample >= 0.0);
-  linear_params_.Resize(output_dim, input_dim);
-  bias_params_.Resize(output_dim);
-  linear_params_.SetRandn(); // sets to random normally distributed noise.
-  linear_params_.Scale(param_stddev);
-  bias_params_.SetRandn();
-  bias_params_.Scale(bias_stddev);
-  rank_in_ = rank_in;
-  rank_out_ = rank_out;
-  update_period_ = update_period;
-  num_samples_history_ = num_samples_history;
-  alpha_ = alpha;
-  SetPreconditionerConfigs();
-  max_change_per_sample_ = max_change_per_sample;
-}
-
-
-// Writes the component to "os": an opening "<Type()>" token, then the
-// learning rate, linear parameters, bias parameters, input/output ranks,
-// update period, num-samples-history, alpha and max-change-per-sample (each
-// preceded by its own token), then the closing "</Type()>" token.
-void AffineComponentPreconditionedOnline::Write(std::ostream &os, bool binary) const {
-  // e.g. "<AffineComponentPreconditionedOnline>" and
-  // "</AffineComponentPreconditionedOnline>".
-  const std::string open_tag = std::string("<") + Type() + ">";
-  const std::string close_tag = std::string("</") + Type() + ">";
-
-  WriteToken(os, binary, open_tag);
-
-  WriteToken(os, binary, "<LearningRate>");
-  WriteBasicType(os, binary, learning_rate_);
-
-  WriteToken(os, binary, "<LinearParams>");
-  linear_params_.Write(os, binary);
-
-  WriteToken(os, binary, "<BiasParams>");
-  bias_params_.Write(os, binary);
-
-  WriteToken(os, binary, "<RankIn>");
-  WriteBasicType(os, binary, rank_in_);
-
-  WriteToken(os, binary, "<RankOut>");
-  WriteBasicType(os, binary, rank_out_);
-
-  WriteToken(os, binary, "<UpdatePeriod>");
-  WriteBasicType(os, binary, update_period_);
-
-  WriteToken(os, binary, "<NumSamplesHistory>");
-  WriteBasicType(os, binary, num_samples_history_);
-
-  WriteToken(os, binary, "<Alpha>");
-  WriteBasicType(os, binary, alpha_);
-
-  WriteToken(os, binary, "<MaxChangePerSample>");
-  WriteBasicType(os, binary, max_change_per_sample_);
-
-  WriteToken(os, binary, close_tag);
-}
-
-// Returns a one-line, human-readable summary: type, dimensions, root-mean-
-// square of the linear and bias parameters, learning rate and the
-// preconditioner-related settings.
-std::string AffineComponentPreconditionedOnline::Info() const {
-  // Element count is computed in floating point.
-  const BaseFloat num_linear_elems =
-      static_cast<BaseFloat>(linear_params_.NumRows())
-      * static_cast<BaseFloat>(linear_params_.NumCols());
-  const BaseFloat linear_stddev = std::sqrt(
-      TraceMatMat(linear_params_, linear_params_, kTrans) / num_linear_elems);
-  const BaseFloat bias_stddev = std::sqrt(
-      VecVec(bias_params_, bias_params_) / bias_params_.Dim());
-
-  std::ostringstream summary;
-  summary << Type() << ", input-dim=" << InputDim()
-          << ", output-dim=" << OutputDim()
-          << ", linear-params-stddev=" << linear_stddev
-          << ", bias-params-stddev=" << bias_stddev
-          << ", learning-rate=" << LearningRate()
-          << ", rank-in=" << rank_in_
-          << ", rank-out=" << rank_out_
-          << ", num_samples_history=" << num_samples_history_
-          << ", update_period=" << update_period_
-          << ", alpha=" << alpha_
-          << ", max-change-per-sample=" << max_change_per_sample_;
-  return summary.str();
-}
-
-// Returns a newly allocated AffineComponentPreconditionedOnline carrying the
-// same parameters, settings and preconditioner objects as this one.
-Component* AffineComponentPreconditionedOnline::Copy() const {
-  AffineComponentPreconditionedOnline *clone =
-      new AffineComponentPreconditionedOnline();
-  // Parameters.
-  clone->linear_params_ = linear_params_;
-  clone->bias_params_ = bias_params_;
-  // Scalar settings.
-  clone->learning_rate_ = learning_rate_;
-  clone->is_gradient_ = is_gradient_;
-  clone->rank_in_ = rank_in_;
-  clone->rank_out_ = rank_out_;
-  clone->update_period_ = update_period_;
-  clone->num_samples_history_ = num_samples_history_;
-  clone->alpha_ = alpha_;
-  clone->max_change_per_sample_ = max_change_per_sample_;
-  // Preconditioners are copied, then have the settings above re-applied.
-  clone->preconditioner_in_ = preconditioner_in_;
-  clone->preconditioner_out_ = preconditioner_out_;
-  clone->SetPreconditionerConfigs();
-  return clone;
-}
-
-
-
-// Returns the factor (<= 1.0) by which the update for this minibatch should
-// be scaled so that the total change does not exceed max_change_per_sample_
-// times the minibatch size.
-//   in_products          one value per row of the minibatch.
-//   learning_rate_scale  extra factor multiplied into learning_rate_.
-//   out_products         one value per row; overwritten in place with
-//                        sqrt(out_products * in_products), element-wise.
-// Logs the first few times a factor below 1.0 is returned.
-BaseFloat AffineComponentPreconditionedOnline::GetScalingFactor(
-    const CuVectorBase<BaseFloat> &in_products,
-    BaseFloat learning_rate_scale,
-    CuVectorBase<BaseFloat> *out_products) {
-  static int scaling_factor_printed = 0;
-  const int32 minibatch_size = in_products.Dim();
-
-  out_products->MulElements(in_products);
-  out_products->ApplyPow(0.5);
-  const BaseFloat prod_sum = out_products->Sum();
-
-  // tot_change_norm is the quantity we are trying to limit to
-  // max_change_norm.
-  const BaseFloat tot_change_norm =
-      learning_rate_scale * learning_rate_ * prod_sum;
-  const BaseFloat max_change_norm = max_change_per_sample_ * minibatch_size;
-  KALDI_ASSERT(tot_change_norm - tot_change_norm == 0.0 && "NaN in backprop");
-  KALDI_ASSERT(tot_change_norm >= 0.0);
-
-  if (tot_change_norm <= max_change_norm)
-    return 1.0;
-
-  const BaseFloat factor = max_change_norm / tot_change_norm;
-  if (scaling_factor_printed < 10) {
-    KALDI_LOG << "Limiting step size using scaling factor "
-              << factor << ", for component index " << Index();
-    scaling_factor_printed++;
-  }
-  return factor;
-}
-
-// Updates linear_params_ and bias_params_ from one minibatch, after passing
-// the inputs and output derivatives through the two online preconditioners.
-//   in_value   input to the component, one row per frame.
-//   out_deriv  derivative at the output of the component, one row per frame.
-// The input is extended with a column of ones so that the bias is handled
-// like an extra input dimension.  When max_change_per_sample_ > 0.0 the step
-// is scaled down by GetScalingFactor().
-void AffineComponentPreconditionedOnline::Update(
-    const CuMatrixBase<BaseFloat> &in_value,
-    const CuMatrixBase<BaseFloat> &out_deriv) {
-  CuMatrix<BaseFloat> in_value_temp;
-
-  in_value_temp.Resize(in_value.NumRows(),
-                       in_value.NumCols() + 1, kUndefined);
-  in_value_temp.Range(0, in_value.NumRows(),
-                      0, in_value.NumCols()).CopyFromMat(in_value);
-
-  // Add the 1.0 at the end of each row "in_value_temp"
-  in_value_temp.Range(0, in_value.NumRows(),
-                      in_value.NumCols(), 1).Set(1.0);
-
-  // Working copy of out_deriv; a pointer to it is handed to the
-  // preconditioner below.
-  CuMatrix<BaseFloat> out_deriv_temp(out_deriv);
-
-  // Two rows of per-frame values: row 0 for the input side, row 1 for the
-  // output side.
-  CuMatrix<BaseFloat> row_products(2,
-                                   in_value.NumRows());
-  CuSubVector<BaseFloat> in_row_products(row_products, 0),
-      out_row_products(row_products, 1);
-
-  // These "scale" values will get multiplied into the learning rate (faster
-  // than having the matrices scaled inside the preconditioning code).
-  BaseFloat in_scale, out_scale;
-
-  preconditioner_in_.PreconditionDirections(&in_value_temp, &in_row_products,
-                                            &in_scale);
-  preconditioner_out_.PreconditionDirections(&out_deriv_temp, &out_row_products,
-                                             &out_scale);
-
-  // "scale" is a scaling factor coming from the PreconditionDirections calls
-  // (it's faster to have them output a scaling factor than to have them scale
-  // their outputs).
-  BaseFloat scale = in_scale * out_scale;
-  BaseFloat minibatch_scale = 1.0;
-
-  // The step-size limit is only active for positive max_change_per_sample_.
-  // Note that GetScalingFactor() overwrites out_row_products.
-  if (max_change_per_sample_ > 0.0)
-    minibatch_scale = GetScalingFactor(in_row_products, scale,
-                                       &out_row_products);
-
-  // View of in_value_temp without its last (appended) column.
-  CuSubMatrix<BaseFloat> in_value_precon_part(in_value_temp,
-                                              0, in_value_temp.NumRows(),
-                                              0, in_value_temp.NumCols() - 1);
-  // this "precon_ones" is what happens to the vector of 1's representing
-  // offsets, after multiplication by the preconditioner.
-  CuVector<BaseFloat> precon_ones(in_value_temp.NumRows());
-
-  precon_ones.CopyColFromMat(in_value_temp, in_value_temp.NumCols() - 1);
-
-  // Effective learning rate for this minibatch, then the actual updates:
-  // bias += lrate * out_deriv_temp^T * precon_ones,
-  // linear += lrate * out_deriv_temp^T * in_value_precon_part.
-  BaseFloat local_lrate = scale * minibatch_scale * learning_rate_;
-  bias_params_.AddMatVec(local_lrate, out_deriv_temp, kTrans,
-                         precon_ones, 1.0);
-  linear_params_.AddMatMat(local_lrate, out_deriv_temp, kTrans,
-                           in_value_precon_part, kNoTrans, 1.0);
-}
-
-// Zeroes all parameters.  If treat_as_gradient is true the learning rate is
-// additionally set to 1.0.
-void BlockAffineComponent::SetZero(bool treat_as_gradient) {
-  if (treat_as_gradient)
-    SetLearningRate(1.0);
-  bias_params_.SetZero();
-  linear_params_.SetZero();
-}
-
-// Adds random normally distributed noise, scaled by "stddev", to the linear
-// and bias parameters.
-void BlockAffineComponent::PerturbParams(BaseFloat stddev) {
-  // The noise objects are constructed as copies of the parameters and then
-  // overwritten with random values.
-  CuMatrix<BaseFloat> linear_noise(linear_params_);
-  linear_noise.SetRandn();
-  linear_params_.AddMat(stddev, linear_noise);
-
-  CuVector<BaseFloat> bias_noise(bias_params_);
-  bias_noise.SetRandn();
-  bias_params_.AddVec(stddev, bias_noise);
-}
-
-// Returns the dot product between this component's parameters and those of
-// "other_in" (linear and bias parts summed).  "other_in" must actually be a
-// BlockAffineComponent.
-BaseFloat BlockAffineComponent::DotProduct(
-    const UpdatableComponent &other_in) const {
-  const BlockAffineComponent *other =
-      dynamic_cast<const BlockAffineComponent*>(&other_in);
-  // Same check as in Add(): fail cleanly instead of dereferencing a NULL
-  // pointer when the argument is of a different component type.
-  KALDI_ASSERT(other != NULL);
-  return TraceMatMat(linear_params_, other->linear_params_, kTrans)
-      + VecVec(bias_params_, other->bias_params_);
-}
-
-// Returns a newly allocated BlockAffineComponent with the same learning rate,
-// parameters and number of blocks as this one.
-Component* BlockAffineComponent::Copy() const {
-  BlockAffineComponent *clone = new BlockAffineComponent();
-  clone->num_blocks_ = num_blocks_;
-  clone->learning_rate_ = learning_rate_;
-  clone->bias_params_ = bias_params_;
-  clone->linear_params_ = linear_params_;
-  return clone;
-}
-
-// Multiplies all parameters (linear and bias) by "scale".
-void BlockAffineComponent::Scale(BaseFloat scale) {
-  bias_params_.Scale(scale);
-  linear_params_.Scale(scale);
-}
-
-// Adds alpha times the parameters of "other_in" to this component's
-// parameters.  "other_in" must actually be a BlockAffineComponent.
-void BlockAffineComponent::Add(BaseFloat alpha,
-                               const UpdatableComponent &other_in) {
-  const BlockAffineComponent *rhs =
-      dynamic_cast<const BlockAffineComponent*>(&other_in);
-  KALDI_ASSERT(rhs != NULL);
-  bias_params_.AddVec(alpha, rhs->bias_params_);
-  linear_params_.AddMat(alpha, rhs->linear_params_);
-}
-
-// Forward computation: out = in * W^T + bias, where W is block-diagonal.
-//
-// The blocks are stored stacked vertically in linear_params_ as
-//   [ M
-//     N
-//     O ]
-// (each block has linear_params_.NumCols() columns and
-// linear_params_.NumRows() / num_blocks_ rows), but are applied as if the
-// matrix were
-//   [ M 0 0
-//     0 N 0
-//     0 0 O ].
-void BlockAffineComponent::Propagate(const ChunkInfo &in_info,
-                                     const ChunkInfo &out_info,
-                                     const CuMatrixBase<BaseFloat> &in,
-                                     CuMatrixBase<BaseFloat> *out) const  {
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-
-  const int32 in_block_dim = linear_params_.NumCols(),
-      out_block_dim = linear_params_.NumRows() / num_blocks_,
-      num_frames = in.NumRows();
-  KALDI_ASSERT(in.NumCols() == in_block_dim * num_blocks_);
-  KALDI_ASSERT(out->NumCols() == out_block_dim * num_blocks_);
-  KALDI_ASSERT(in.NumRows() == out->NumRows());
-
-  // Start every output row off as a copy of the bias.
-  out->CopyRowsFromVec(bias_params_);
-
-  for (int32 b = 0; b < num_blocks_; b++) {
-    const int32 in_offset = b * in_block_dim,
-        out_offset = b * out_block_dim;
-    CuSubMatrix<BaseFloat> in_block(in, 0, num_frames,
-                                    in_offset, in_block_dim);
-    CuSubMatrix<BaseFloat> out_block(*out, 0, num_frames,
-                                     out_offset, out_block_dim);
-    CuSubMatrix<BaseFloat> param_block(linear_params_,
-                                       out_offset, out_block_dim,
-                                       0, in_block_dim);
-    // out_block += in_block * param_block^T.
-    out_block.AddMatMat(1.0, in_block, kNoTrans, param_block, kTrans, 1.0);
-  }
-}
-
-// Plain gradient step on the parameters, block by block:
-//   bias   += learning_rate_ * (sum of the rows of out_deriv)
-//   W_b    += learning_rate_ * out_deriv_b^T * in_value_b   for each block b.
-void BlockAffineComponent::UpdateSimple(
-    const CuMatrixBase<BaseFloat> &in_value,
-    const CuMatrixBase<BaseFloat> &out_deriv) {
-  const int32 in_block_dim = linear_params_.NumCols(),
-      out_block_dim = linear_params_.NumRows() / num_blocks_,
-      num_frames = in_value.NumRows();
-
-  bias_params_.AddRowSumMat(learning_rate_, out_deriv, 1.0);
-
-  for (int32 b = 0; b < num_blocks_; b++) {
-    const int32 in_offset = b * in_block_dim,
-        out_offset = b * out_block_dim;
-    CuSubMatrix<BaseFloat> in_value_block(in_value, 0, num_frames,
-                                          in_offset, in_block_dim);
-    CuSubMatrix<BaseFloat> out_deriv_block(out_deriv, 0, num_frames,
-                                           out_offset, out_block_dim);
-    CuSubMatrix<BaseFloat> param_block(linear_params_,
-                                       out_offset, out_block_dim,
-                                       0, in_block_dim);
-    // Update the parameters of this block.
-    param_block.AddMatMat(learning_rate_, out_deriv_block, kTrans,
-                          in_value_block, kNoTrans, 1.0);
-  }
-}
-
-// Backward computation; mirrors the code in Propagate().
-//   in_value      input that was given to Propagate(), one row per frame.
-//   out_deriv     derivative w.r.t. the output, one row per frame.
-//   to_update_in  component whose parameters should be updated; no update is
-//                 done if this is NULL or is not a BlockAffineComponent.
-//   in_deriv      output: resized to (num frames) x InputDim() and set, block
-//                 by block, to out_deriv_block * param_block.
-// The ChunkInfo arguments and out_value are unused.
-void BlockAffineComponent::Backprop(const ChunkInfo &,  //in_info,
-                                    const ChunkInfo &,  //out_info,
-                                    const CuMatrixBase<BaseFloat> &in_value,
-                                    const CuMatrixBase<BaseFloat> &,  //out_value,
-                                    const CuMatrixBase<BaseFloat> &out_deriv,
-                                    Component *to_update_in,
-                                    CuMatrix<BaseFloat> *in_deriv) const  {
-
-  int32 num_frames = in_value.NumRows();
-  // num_frames is used below to slice both out_deriv and in_deriv (which is
-  // sized from out_deriv), so the two row counts have to agree.
-  KALDI_ASSERT(out_deriv.NumRows() == num_frames);
-  BlockAffineComponent *to_update = dynamic_cast<BlockAffineComponent*>(
-      to_update_in);
-  in_deriv->Resize(out_deriv.NumRows(), InputDim());
-  int32 input_block_dim = linear_params_.NumCols(),
-       output_block_dim = linear_params_.NumRows() / num_blocks_;
-  KALDI_ASSERT(in_value.NumCols() == input_block_dim * num_blocks_);
-  KALDI_ASSERT(out_deriv.NumCols() == output_block_dim * num_blocks_);
-
-  for (int32 b = 0; b < num_blocks_; b++) {
-    CuSubMatrix<BaseFloat> in_deriv_block(*in_deriv, 0, num_frames,
-                                          b * input_block_dim, input_block_dim),
-        out_deriv_block(out_deriv, 0, num_frames,
-                        b * output_block_dim, output_block_dim),
-        param_block(linear_params_,
-                    b * output_block_dim, output_block_dim,
-                    0, input_block_dim);
-
-    // Propagate the derivative back to the input.
-    in_deriv_block.AddMatMat(1.0, out_deriv_block, kNoTrans,
-                             param_block, kNoTrans, 0.0);
-  }
-  if (to_update != NULL)
-    to_update->Update(in_value, out_deriv);
-}
-
-
-void BlockAffineComponent::Init(BaseFloat learning_rate,
-                                int32 input_dim, int32 output_dim,
-                                BaseFloat param_stddev,
-                                BaseFloat bias_stddev,
-                                int32 num_blocks) {
-  UpdatableComponent::Init(learning_rate);
-  KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0);
-  KALDI_ASSERT(input_dim % num_blocks == 0 && output_dim % num_blocks == 0);
-
-  linear_params_.Resize(output_dim, input_dim / num_blocks);
-  bias_params_.Resize(output_dim);
-
-  linear_params_.SetRandn(); // sets to random normally distributed noise.
-  linear_params_.Scale(param_stddev);
-  bias_params_.SetRandn();
-  bias_params_.Scale(bias_stddev);
-  num_blocks_ = num_blocks;
-}
-
-void BlockAffineComponent::InitFromString(std::string args) {
-  std::string orig_args(args);
-  bool ok = true;
-  BaseFloat learning_rate = learning_rate_;
-  int32 input_dim = -1, output_dim = -1, num_blocks = 1;
-  ParseFromString("learning-rate", &args, &learning_rate); // optional.
-  ok = ok && ParseFromString("input-dim", &args, &input_dim);
-  ok = ok && ParseFromString("output-dim", &args, &output_dim);
-  ok = ok && ParseFromString("num-blocks", &args, &num_blocks);
-  BaseFloat param_stddev = 1.0 / std::sqrt(input_dim),
-      bias_stddev = 1.0;
-  ParseFromString("param-stddev", &args, &param_stddev);
-  ParseFromString("bias-stddev", &args, &bias_stddev);
-  if (!args.empty())
-    KALDI_ERR << "Could not process these elements in initializer: "
-              << args;
-  if (!ok)
-    KALDI_ERR << "Bad initializer " << orig_args;
-  Init(learning_rate, input_dim, output_dim,
-       param_stddev, bias_stddev, num_blocks);
-}
-
-
-void BlockAffineComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<BlockAffineComponent>", "<LearningRate>");
-  ReadBasicType(is, binary, &learning_rate_);
-  ExpectToken(is, binary, "<NumBlocks>");
-  ReadBasicType(is, binary, &num_blocks_);
-  ExpectToken(is, binary, "<LinearParams>");
-  linear_params_.Read(is, binary);
-  ExpectToken(is, binary, "<BiasParams>");
-  bias_params_.Read(is, binary);
-  ExpectToken(is, binary, "</BlockAffineComponent>");
-}
-
-void BlockAffineComponent::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<BlockAffineComponent>");
-  WriteToken(os, binary, "<LearningRate>");
-  WriteBasicType(os, binary, learning_rate_);
-  WriteToken(os, binary, "<NumBlocks>");
-  WriteBasicType(os, binary, num_blocks_);
-  WriteToken(os, binary, "<LinearParams>");
-  linear_params_.Write(os, binary);
-  WriteToken(os, binary, "<BiasParams>");
-  bias_params_.Write(os, binary);
-  WriteToken(os, binary, "</BlockAffineComponent>");
-}
-
-
-int32 BlockAffineComponent::GetParameterDim() const {
-  // Note: num_blocks_ should divide both InputDim() and OutputDim().
-  return InputDim() * OutputDim() / num_blocks_;
-}
-
-void BlockAffineComponent::Vectorize(VectorBase<BaseFloat> *params) const {
-  int32 l = linear_params_.NumRows() * linear_params_.NumCols(),
-      b = bias_params_.Dim();
-  params->Range(0, l).CopyRowsFromMat(linear_params_);
-  params->Range(l, b).CopyFromVec(bias_params_);
-}
-void BlockAffineComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
-  int32 l = linear_params_.NumRows() * linear_params_.NumCols(),
-      b = bias_params_.Dim();
-  linear_params_.CopyRowsFromVec(params.Range(0, l));
-  bias_params_.CopyFromVec(params.Range(l, b));
-}
-
-
-void BlockAffineComponentPreconditioned::Init(BaseFloat learning_rate,
-                                              int32 input_dim, int32 output_dim,
-                                              BaseFloat param_stddev,
-                                              BaseFloat bias_stddev,
-                                              int32 num_blocks,
-                                              BaseFloat alpha) {
-  BlockAffineComponent::Init(learning_rate, input_dim, output_dim,
-                             param_stddev, bias_stddev, num_blocks);
-  is_gradient_ = false;
-  KALDI_ASSERT(alpha > 0.0);
-  alpha_ = alpha;
-}
-
-void BlockAffineComponentPreconditioned::InitFromString(std::string args) {
-  std::string orig_args(args);
-  bool ok = true;
-  BaseFloat learning_rate = learning_rate_;
-  BaseFloat alpha = 4.0;
-  int32 input_dim = -1, output_dim = -1, num_blocks = 1;
-  ParseFromString("learning-rate", &args, &learning_rate); // optional.
-  ParseFromString("alpha", &args, &alpha);
-  ok = ok && ParseFromString("input-dim", &args, &input_dim);
-  ok = ok && ParseFromString("output-dim", &args, &output_dim);
-  ok = ok && ParseFromString("num-blocks", &args, &num_blocks);
-
-  BaseFloat param_stddev = 1.0 / std::sqrt(input_dim),
-      bias_stddev = 1.0;
-  ParseFromString("param-stddev", &args, &param_stddev);
-  ParseFromString("bias-stddev", &args, &bias_stddev);
-  if (!args.empty())
-    KALDI_ERR << "Could not process these elements in initializer: "
-              << args;
-  if (!ok)
-    KALDI_ERR << "Bad initializer " << orig_args;
-  Init(learning_rate, input_dim, output_dim,
-       param_stddev, bias_stddev, num_blocks,
-       alpha);
-}
-
-void BlockAffineComponentPreconditioned::SetZero(bool treat_as_gradient) {
-  if (treat_as_gradient)
-    is_gradient_ = true;
-  BlockAffineComponent::SetZero(treat_as_gradient);
-}
-
-void BlockAffineComponentPreconditioned::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<BlockAffineComponentPreconditioned>",
-                       "<LearningRate>");
-  ReadBasicType(is, binary, &learning_rate_);
-  ExpectToken(is, binary, "<NumBlocks>");
-  ReadBasicType(is, binary, &num_blocks_);
-  ExpectToken(is, binary, "<LinearParams>");
-  linear_params_.Read(is, binary);
-  ExpectToken(is, binary, "<BiasParams>");
-  bias_params_.Read(is, binary);
-  ExpectToken(is, binary, "<Alpha>");
-  ReadBasicType(is, binary, &alpha_);
-  ExpectToken(is, binary, "<IsGradient>");
-  ReadBasicType(is, binary, &is_gradient_);
-  ExpectToken(is, binary, "</BlockAffineComponentPreconditioned>");
-}
-
-void BlockAffineComponentPreconditioned::Write(std::ostream &os,
-                                               bool binary) const {
-  WriteToken(os, binary, "<BlockAffineComponentPreconditioned>");
-  WriteToken(os, binary, "<LearningRate>");
-  WriteBasicType(os, binary, learning_rate_);
-  WriteToken(os, binary, "<NumBlocks>");
-  WriteBasicType(os, binary, num_blocks_);
-  WriteToken(os, binary, "<LinearParams>");
-  linear_params_.Write(os, binary);
-  WriteToken(os, binary, "<BiasParams>");
-  bias_params_.Write(os, binary);
-  WriteToken(os, binary, "<Alpha>");
-  WriteBasicType(os, binary, alpha_);
-  WriteToken(os, binary, "<IsGradient>");
-  WriteBasicType(os, binary, is_gradient_);
-  WriteToken(os, binary, "</BlockAffineComponentPreconditioned>");
-}
-
-Component* BlockAffineComponentPreconditioned::Copy() const {
-  BlockAffineComponentPreconditioned *ans = new
-      BlockAffineComponentPreconditioned();
-  ans->learning_rate_ = learning_rate_;
-  ans->linear_params_ = linear_params_;
-  ans->bias_params_ = bias_params_;
-  ans->num_blocks_ = num_blocks_;
-  ans->alpha_ = alpha_;
-  ans->is_gradient_ = is_gradient_;
-  return ans;
-}
-
-void BlockAffineComponentPreconditioned::Update(
-    const CuMatrixBase<BaseFloat> &in_value,
-    const CuMatrixBase<BaseFloat> &out_deriv) {
-  if (is_gradient_) {
-    UpdateSimple(in_value, out_deriv);
-    // does the baseline update with no preconditioning.
-    return;
-  }
-  int32 input_block_dim = linear_params_.NumCols(),
-      output_block_dim = linear_params_.NumRows() / num_blocks_,
-      num_frames = in_value.NumRows();
-
-  CuMatrix<BaseFloat> in_value_temp(num_frames, input_block_dim + 1, kUndefined),
-      in_value_precon(num_frames, input_block_dim + 1, kUndefined);
-  in_value_temp.Set(1.0); // so last row will have value 1.0.
-  CuSubMatrix<BaseFloat> in_value_temp_part(in_value_temp, 0, num_frames,
-                                            0, input_block_dim); // all but last 1.0
-  CuSubMatrix<BaseFloat> in_value_precon_part(in_value_precon, 0, num_frames,
-                                            0, input_block_dim);
-  CuVector<BaseFloat> precon_ones(num_frames);
-  CuMatrix<BaseFloat> out_deriv_precon(num_frames, output_block_dim, kUndefined);
-
-  for (int32 b = 0; b < num_blocks_; b++) {
-    CuSubMatrix<BaseFloat> in_value_block(in_value, 0, num_frames,
-                                        b * input_block_dim,
-                                        input_block_dim),
-        out_deriv_block(out_deriv, 0, num_frames,
-                        b * output_block_dim, output_block_dim),
-        param_block(linear_params_,
-                    b * output_block_dim, output_block_dim,
-                    0, input_block_dim);
-    in_value_temp_part.CopyFromMat(in_value_block);
-
-    PreconditionDirectionsAlphaRescaled(in_value_temp, alpha_,
-                                        &in_value_precon);
-    PreconditionDirectionsAlphaRescaled(out_deriv_block, alpha_,
-                                        &out_deriv_precon);
-
-
-    // Update the parameters.
-    param_block.AddMatMat(learning_rate_, out_deriv_precon, kTrans,
-                          in_value_precon_part, kNoTrans, 1.0);
-    precon_ones.CopyColFromMat(in_value_precon, input_block_dim);
-    bias_params_.Range(b * output_block_dim, output_block_dim).
-        AddMatVec(learning_rate_, out_deriv_precon, kTrans,
-                  precon_ones, 1.0);
-  }
-}
-
-
-void PermuteComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<PermuteComponent>", "<Reorder>");
-  ReadIntegerVector(is, binary, &reorder_);
-  ExpectToken(is, binary, "</PermuteComponent>");
-}
-
-void PermuteComponent::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<PermuteComponent>");
-  WriteToken(os, binary, "<Reorder>");
-  WriteIntegerVector(os, binary, reorder_);
-  WriteToken(os, binary, "</PermuteComponent>");
-}
-
-void PermuteComponent::Init(int32 dim) {
-  KALDI_ASSERT(dim > 0);
-  reorder_.resize(dim);
-  for (int32 i = 0; i < dim; i++) reorder_[i] = i;
-  std::random_shuffle(reorder_.begin(), reorder_.end());
-}
-
-void PermuteComponent::InitFromString(std::string args) {
-  std::string orig_args(args);
-  int32 dim;
-  bool ok = ParseFromString("dim", &args, &dim);
-  if (!ok || !args.empty() || dim <= 0)
-    KALDI_ERR << "Invalid initializer for layer of type "
-              << Type() << ": \"" << orig_args << "\"";
-  Init(dim);
-}
-
-void PermuteComponent::Propagate(const ChunkInfo &in_info,
-                                 const ChunkInfo &out_info,
-                                 const CuMatrixBase<BaseFloat> &in,
-                                 CuMatrixBase<BaseFloat> *out) const  {
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-
-  std::vector<int32> reverse_reorder(reorder_.size());
-  for (size_t i = 0; i < reorder_.size(); i++)
-    reverse_reorder[reorder_[i]] = i;
-  // Note: if we were actually using this component type we could make the
-  // CuArray a member variable for efficiency.
-  CuArray<int32> cu_reverse_reorder(reverse_reorder);
-  out->CopyCols(in, cu_reverse_reorder);
-}
-
-void PermuteComponent::Backprop(const ChunkInfo &,  //in_info,
-                                const ChunkInfo &,  //out_info,
-                                const CuMatrixBase<BaseFloat> &in_value,
-                                const CuMatrixBase<BaseFloat> &out_value,
-                                const CuMatrixBase<BaseFloat> &out_deriv,
-                                Component *to_update,
-                                CuMatrix<BaseFloat> *in_deriv) const  {
-  in_deriv->Resize(out_deriv.NumRows(), out_deriv.NumCols());
-  KALDI_ASSERT(out_deriv.NumCols() == OutputDim());
-  // Note: if we were actually using this component type we could make the
-  // CuArray a member variable for efficiency.
-  CuArray<int32> cu_reorder(reorder_);
-  in_deriv->CopyCols(out_deriv, cu_reorder);
-}
-
-void SumGroupComponent::Init(const std::vector<int32> &sizes) {
-  KALDI_ASSERT(!sizes.empty());
-  std::vector<Int32Pair> cpu_vec(sizes.size());
-  std::vector<int32> reverse_cpu_vec;
-  int32 cur_index = 0;
-  for (size_t i = 0; i < sizes.size(); i++) {
-    KALDI_ASSERT(sizes[i] > 0);
-    cpu_vec[i].first = cur_index;
-    cpu_vec[i].second = cur_index + sizes[i];
-    cur_index += sizes[i];
-    for (int32 j = cpu_vec[i].first; j < cpu_vec[i].second; j++)
-      reverse_cpu_vec.push_back(i);
-  }
-  this->indexes_ = cpu_vec;
-  this->reverse_indexes_ = reverse_cpu_vec;
-  this->input_dim_ = cur_index;
-  this->output_dim_ = sizes.size();
-}
-
-void SumGroupComponent::InitFromString(std::string args) {
-  std::string orig_args(args);
-  std::vector<int32> sizes;
-  bool ok = ParseFromString("sizes", &args, &sizes);
-
-  if (!ok || !args.empty() || sizes.empty())
-    KALDI_ERR << "Invalid initializer for layer of type "
-              << Type() << ": \"" << orig_args << "\"";
-  this->Init(sizes);
-}
-
-Component* SumGroupComponent::Copy() const {
-  SumGroupComponent *ans = new SumGroupComponent();
-  ans->indexes_ = indexes_;
-  ans->reverse_indexes_ = reverse_indexes_;
-  ans->input_dim_ = input_dim_;
-  ans->output_dim_ = output_dim_;
-  return ans;
-}
-
-void SumGroupComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<SumGroupComponent>", "<Sizes>");
-  std::vector<int32> sizes;
-  ReadIntegerVector(is, binary, &sizes);
-
-  std::string token;
-  ReadToken(is, binary, &token);
-  if (!(token == "<SumGroupComponent>" ||
-        token == "</SumGroupComponent>")) {
-    KALDI_ERR << "Expected </SumGroupComponent>, got " << token;
-  }
-  this->Init(sizes);
-}
-
-void SumGroupComponent::GetSizes(std::vector<int32> *sizes) const {
-  std::vector<Int32Pair> indexes;
-  indexes_.CopyToVec(&indexes);
-  sizes->resize(indexes.size());
-  for (size_t i = 0; i < indexes.size(); i++) {
-    (*sizes)[i] = indexes[i].second - indexes[i].first;
-    if (i == 0) { KALDI_ASSERT(indexes[i].first == 0); }
-    else { KALDI_ASSERT(indexes[i].first == indexes[i-1].second); }
-    KALDI_ASSERT(indexes[i].second > indexes[i].first);
-    (*sizes)[i] = indexes[i].second - indexes[i].first;
-  }
-}
-
-void SumGroupComponent::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<SumGroupComponent>");
-  WriteToken(os, binary, "<Sizes>");
-  std::vector<int32> sizes;
-  this->GetSizes(&sizes);
-  WriteIntegerVector(os, binary, sizes);
-  WriteToken(os, binary, "</SumGroupComponent>");
-}
-
-void SumGroupComponent::Propagate(const ChunkInfo &in_info,
-                                  const ChunkInfo &out_info,
-                                  const CuMatrixBase<BaseFloat> &in,
-                                  CuMatrixBase<BaseFloat> *out) const  {
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-
-  out->SumColumnRanges(in, indexes_);
-}
-
-void SumGroupComponent::Backprop(const ChunkInfo &in_info,
-                                 const ChunkInfo &out_info,
-                                 const CuMatrixBase<BaseFloat> &, //in_value,
-                                 const CuMatrixBase<BaseFloat> &, //out_value,
-                                 const CuMatrixBase<BaseFloat> &out_deriv,
-                                 Component *to_update, // may be identical to "this".
-                                 CuMatrix<BaseFloat> *in_deriv) const {
-  in_deriv->Resize(out_deriv.NumRows(), InputDim());
-  in_deriv->CopyCols(out_deriv, reverse_indexes_);
-}
-
-
-std::string SpliceComponent::Info() const {
-  std::stringstream stream;
-  std::ostringstream os;
-  std::copy(context_.begin(), context_.end(),
-            std::ostream_iterator<int32>(os, " "));
-  stream << Component::Info() << ", context=" << os.str();
-  if (const_component_dim_ != 0)
-    stream << ", const_component_dim=" << const_component_dim_;
-
-  return stream.str();
-}
-
-void SpliceComponent::Init(int32 input_dim, std::vector<int32> context,
-                           int32 const_component_dim) {
-  input_dim_ = input_dim;
-  const_component_dim_ = const_component_dim;
-  context_ = context;
-  KALDI_ASSERT(context_.size() > 0);
-  KALDI_ASSERT(input_dim_ > 0 && context_.front() <= 0 && context_.back() >= 0);
-  KALDI_ASSERT(IsSortedAndUniq(context));
-  KALDI_ASSERT(const_component_dim_ >= 0 && const_component_dim_ < input_dim_);
-}
-
-
-// e.g. args == "input-dim=10 left-context=2 right-context=2
-void SpliceComponent::InitFromString(std::string args) {
-  std::string orig_args(args);
-  int32 input_dim, left_context, right_context;
-  std::vector <int32> context;
-  bool in_dim_ok = ParseFromString("input-dim", &args, &input_dim);
-  bool context_ok = ParseFromString("context", &args, &context);
-  bool left_right_context_ok = ParseFromString("left-context", &args,
-                                               &left_context) &&
-                               ParseFromString("right-context", &args,
-                                               &right_context);
-  int32 const_component_dim = 0;
-  ParseFromString("const-component-dim", &args, &const_component_dim);
-
-  if (!(in_dim_ok && (context_ok || left_right_context_ok)) ||
-      !args.empty() || input_dim <= 0)
-    KALDI_ERR << "Invalid initializer for layer of type "
-              << Type() << ": \"" << orig_args << "\"";
-  if (left_right_context_ok)  {
-    KALDI_ASSERT(context.size() == 0);
-    for (int32 i = -left_context; i <= right_context; i++)
-      context.push_back(i);
-  }
-  Init(input_dim, context, const_component_dim);
-}
-
-int32 SpliceComponent::OutputDim() const {
-  return (input_dim_  - const_component_dim_)
-      * (context_.size())
-      + const_component_dim_;
-}
-
-int32 ChunkInfo::GetIndex(int32 offset) const  {
-  if (offsets_.empty()) {  // if data is contiguous
-    KALDI_ASSERT((offset <= last_offset_) && (offset >= first_offset_));
-    return offset - first_offset_;
-  } else  {
-    std::vector<int32>::const_iterator iter =
-        std::lower_bound(offsets_.begin(), offsets_.end(), offset);
-    // make sure offset is present in the vector
-    KALDI_ASSERT(iter != offsets_.end() && *iter == offset);
-    return static_cast<int32>(iter - offsets_.begin());
-  }
-}
-
-int32 ChunkInfo::GetOffset(int32 index) const {
-  if (offsets_.empty()) { // if data is contiguous
-    int32 offset = index + first_offset_;  // just offset by the first_offset_
-    KALDI_ASSERT((offset <= last_offset_) && (offset >= first_offset_));
-    return offset;
-  } else  {
-    KALDI_ASSERT((index >= 0) && (index < offsets_.size()));
-    return offsets_[index];
-  }
-}
-
-void ChunkInfo::Check() const {
-  // Checking sanity of the ChunkInfo object
-  KALDI_ASSERT((feat_dim_ > 0) && (num_chunks_ > 0));
-
-  if (! offsets_.empty()) {
-    KALDI_ASSERT((first_offset_ == offsets_.front()) &&
-                 (last_offset_ == offsets_.back()));
-  } else  {
-    KALDI_ASSERT((first_offset_ >= 0) && (last_offset_ >= first_offset_));
-    // asserting the chunk is not contiguous, as offsets is not empty
-    KALDI_ASSERT ( last_offset_ - first_offset_ + 1 > offsets_.size() );
-  }
-  KALDI_ASSERT(NumRows() % num_chunks_ == 0);
-
-}
-
-void ChunkInfo::CheckSize(const CuMatrixBase<BaseFloat> &mat) const {
-  KALDI_ASSERT((mat.NumRows()  ==  NumRows()) && (mat.NumCols() == NumCols()));
-}
-
-/*
- * This method was used for debugging, make changes in nnet-component.h to
- * expose it
-void ChunkInfo::ToString() const  {
-    KALDI_LOG << "feat_dim  " << feat_dim_;
-    KALDI_LOG << "num_chunks  " << num_chunks_;
-    KALDI_LOG << "first_index  " << first_offset_;
-    KALDI_LOG << "last_index  " << last_offset_;
-    for (size_t i = 0; i < offsets_.size(); i++)
-      KALDI_LOG << offsets_[i];
-}
-*/
-
-
-void SpliceComponent::Propagate(const ChunkInfo &in_info,
-                                const ChunkInfo &out_info,
-                                const CuMatrixBase<BaseFloat> &in,
-                                CuMatrixBase<BaseFloat> *out) const  {
-
-  // Check the inputs are correct and resize output
-  in_info.Check();
-  out_info.Check();
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-
-  int32 in_chunk_size  = in_info.ChunkSize(),
-        out_chunk_size = out_info.ChunkSize(),
-        input_dim = in_info.NumCols();
-
-  if (out_chunk_size <= 0)
-    KALDI_ERR << "Splicing features: output will have zero dimension. "
-              << "Probably a code error.";
-
-  // 'indexes' is, for each index from 0 to context_.size() - 1,
-  // then for each row of "out", the corresponding row of "in" that we copy from
-  int32 num_splice = context_.size();
-  std::vector<std::vector<int32> > indexes(num_splice);
-  for (int32 c = 0; c < num_splice; c++)
-    indexes[c].resize(out->NumRows());
-  // const_component_dim_ != 0, "const_indexes" will be used to determine which
-  // row of "in" we copy the last part of each row of "out" from (this part is
-  // not subject to splicing, it's assumed constant for each frame of "input".
-  int32 const_dim = const_component_dim_;
-  std::vector<int32> const_indexes(const_dim == 0 ? 0 : out->NumRows());
-
-  for (int32 chunk = 0; chunk < in_info.NumChunks(); chunk++) {
-    if (chunk == 0) {
-      // this branch could be used for all chunks in the matrix,
-      // but is restricted to chunk 0 for efficiency reasons
-      for (int32 c = 0; c < num_splice; c++) {
-        for (int32 out_index = 0; out_index < out_chunk_size; out_index++) {
-          int32 out_offset = out_info.GetOffset(out_index);
-          int32 in_index = in_info.GetIndex(out_offset + context_[c]);
-          indexes[c][chunk * out_chunk_size + out_index] =
-              chunk * in_chunk_size + in_index;
-        }
-      }
-    } else {  // just copy the indices from the previous chunk
-              // and offset these by input chunk size
-     for (int32 c = 0; c < num_splice; c++) {
-       for (int32 out_index = 0; out_index < out_chunk_size; out_index++) {
-         int32 last_value = indexes[c][(chunk-1) * out_chunk_size + out_index];
-         indexes[c][chunk * out_chunk_size + out_index] =
-             (last_value == -1 ? -1 : last_value + in_chunk_size);
-       }
-     }
-   }
-    if (const_dim != 0) {
-      for (int32 out_index = 0; out_index < out_chunk_size; out_index++)
-        const_indexes[chunk * out_chunk_size + out_index] =
-            chunk * in_chunk_size + out_index;  // there is
-      // an arbitrariness here; since we assume the const_component
-      // is constant within a chunk, it doesn't matter from where we copy.
-    }
-  }
-
-
-  for (int32 c = 0; c < num_splice; c++) {
-    int32 dim = input_dim - const_dim;  // dimension we
-    // are splicing
-    CuSubMatrix<BaseFloat> in_part(in, 0, in.NumRows(),
-                                   0, dim),
-        out_part(*out, 0, out->NumRows(),
-                 c * dim, dim);
-    CuArray<int32> cu_indexes(indexes[c]);
-    out_part.CopyRows(in_part, cu_indexes);
-  }
-  if (const_dim != 0) {
-    CuSubMatrix<BaseFloat> in_part(in, 0, in.NumRows(),
-                                   in.NumCols() - const_dim, const_dim),
-        out_part(*out, 0, out->NumRows(),
-                 out->NumCols() - const_dim, const_dim);
-
-    CuArray<int32> cu_const_indexes(const_indexes);
-    out_part.CopyRows(in_part, cu_const_indexes);
-  }
-}
-
-void SpliceComponent::Backprop(const ChunkInfo &in_info,
-                               const ChunkInfo &out_info,
-                               const CuMatrixBase<BaseFloat> &,  // in_value,
-                               const CuMatrixBase<BaseFloat> &,  // out_value,
-                               const CuMatrixBase<BaseFloat> &out_deriv,
-                               Component *to_update,
-                               CuMatrix<BaseFloat> *in_deriv) const {
-  in_info.Check();
-  out_info.Check();
-  out_info.CheckSize(out_deriv);
-  in_deriv->Resize(in_info.NumRows(), in_info.NumCols(), kUndefined);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-  int32 num_chunks = in_info.NumChunks();
-  // rewrite backpropagate
-
-  int32 out_chunk_size = out_info.ChunkSize(),
-         in_chunk_size = in_info.ChunkSize(),
-            output_dim = out_deriv.NumCols(),
-             input_dim = InputDim();
-
-  KALDI_ASSERT(OutputDim() == output_dim);
-
-  int32 num_splice = context_.size(),
-      const_dim = const_component_dim_;
-  // 'indexes' is, for each index from 0 to num_splice - 1,
-  // then for each row of "in_deriv", the corresponding row of "out_deriv" that
-  // we add, or -1 if.
-
-  std::vector<std::vector<int32> > indexes(num_splice);
-  // const_dim != 0, "const_indexes" will be used to determine which
-  // row of "in" we copy the last part of each row of "out" from (this part is
-  // not subject to splicing, it's assumed constant for each frame of "input".
-  std::vector<int32> const_indexes(const_dim == 0 ? 0 : in_deriv->NumRows(), -1);
-
-  for (int32 c = 0; c < indexes.size(); c++)
-    indexes[c].resize(in_deriv->NumRows(), -1);  // set to -1 by default,
-  // this gets interpreted by the CopyRows() code
-  // as a signal to zero the output...
-
-  int32 dim = input_dim - const_dim;  // dimension we are splicing
-  for (int32 chunk = 0; chunk < num_chunks; chunk++) {
-    if (chunk == 0) { // this branch can be taken for all chunks, but is not
-                      // taken for efficiency reasons
-      for (int32 c = 0; c < num_splice; c++)  {
-        for (int32 out_index = 0; out_index < out_chunk_size; out_index++) {
-          int32 out_offset = out_info.GetOffset(out_index);
-          int32 in_index = in_info.GetIndex(out_offset + context_[c]);
-          indexes[c][chunk * in_chunk_size + in_index] =
-              chunk * out_chunk_size + out_index;
-        }
-      }
-    } else {  // just copy the indexes from the previous chunk
-      for (int32 c = 0; c < num_splice; c++)  {
-        for (int32 in_index = 0; in_index < in_chunk_size; in_index++) {
-          int32 last_value = indexes[c][(chunk-1) * in_chunk_size + in_index];
-          indexes[c][chunk * in_chunk_size + in_index] =
-              (last_value == -1 ? -1 : last_value + out_chunk_size);
-        }
-      }
-    }
-    // this code corresponds to the way the forward propagation works; see
-    // comments there.
-    if (const_dim != 0) {
-      for (int32 out_index = 0; out_index < out_chunk_size; out_index++)  {
-        const_indexes[chunk * in_chunk_size + out_index] =
-            chunk * out_chunk_size + out_index;
-      }
-    }
-  }
-
-  CuMatrix<BaseFloat> temp_mat(in_deriv->NumRows(), dim, kUndefined);
-
-  for (int32 c = 0; c < num_splice; c++) {
-    CuArray<int32> cu_indexes(indexes[c]);
-    int32 dim = input_dim - const_dim;  // dimension we
-    // are splicing
-    CuSubMatrix<BaseFloat> out_deriv_part(out_deriv, 0, out_deriv.NumRows(),
-                                          c * dim, dim),
-        in_deriv_part(*in_deriv, 0, in_deriv->NumRows(),
-                      0, dim);
-    if (c == 0) {
-      in_deriv_part.CopyRows(out_deriv_part, cu_indexes);
-    } else {
-      temp_mat.CopyRows(out_deriv_part, cu_indexes);
-      in_deriv_part.AddMat(1.0, temp_mat);
-    }
-  }
-  if (const_dim != 0) {
-    CuSubMatrix<BaseFloat> out_deriv_part(out_deriv, 0, out_deriv.NumRows(),
-                                          out_deriv.NumCols() - const_dim,
-                                          const_dim),
-        in_deriv_part(*in_deriv, 0, in_deriv->NumRows(),
-                      in_deriv->NumCols() - const_dim, const_dim);
-    CuArray<int32> cu_const_indexes(const_indexes);
-    in_deriv_part.CopyRows(out_deriv_part, cu_const_indexes);
-  }
-}
-
-Component *SpliceComponent::Copy() const {
-  SpliceComponent *ans = new SpliceComponent();
-  ans->input_dim_ = input_dim_;
-  ans->context_ = context_;
-  ans->const_component_dim_ = const_component_dim_;
-  return ans;
-}
-
-void SpliceComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<SpliceComponent>", "<InputDim>");
-  ReadBasicType(is, binary, &input_dim_);
-  std::string token;
-  ReadToken(is, false, &token);
-  if (token == "<LeftContext>") {
-    int32 left_context=0, right_context=0;
-    std::vector<int32> context;
-    ReadBasicType(is, binary, &left_context);
-    ExpectToken(is, binary, "<RightContext>");
-    ReadBasicType(is, binary, &right_context);
-    for (int32 i = -1 * left_context; i <= right_context; i++)
-      context.push_back(i);
-    context_ = context;
-  } else  if (token == "<Context>") {
-    ReadIntegerVector(is, binary, &context_);
-  } else  {
-    KALDI_ERR << "Unknown token" << token
-              << ", the model might be corrupted";
-  }
-  ExpectToken(is, binary, "<ConstComponentDim>");
-  ReadBasicType(is, binary, &const_component_dim_);
-  ExpectToken(is, binary, "</SpliceComponent>");
-}
-
-void SpliceComponent::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<SpliceComponent>");
-  WriteToken(os, binary, "<InputDim>");
-  WriteBasicType(os, binary, input_dim_);
-  WriteToken(os, binary, "<Context>");
-  WriteIntegerVector(os, binary, context_);
-  WriteToken(os, binary, "<ConstComponentDim>");
-  WriteBasicType(os, binary, const_component_dim_);
-  WriteToken(os, binary, "</SpliceComponent>");
-}
-
-
-std::string SpliceMaxComponent::Info() const {
-  std::stringstream stream;
-  std::ostringstream os;
-  std::copy(context_.begin(), context_.end(),
-            std::ostream_iterator<int32>(os, " "));
-  stream << Component::Info() << ", context=" << os.str();
-  return stream.str();
-}
-
-void SpliceMaxComponent::Init(int32 dim,
-                              std::vector<int32> context)  {
-  dim_ = dim;
-  context_ = context;
-  KALDI_ASSERT(dim_ > 0 && context_.front() <= 0 && context_.back() >= 0);
-}
-
-
-// e.g. args == "dim=10 left-context=2 right-context=2
-void SpliceMaxComponent::InitFromString(std::string args) {
-  std::string orig_args(args);
-  int32 dim, left_context, right_context;
-  std::vector <int32> context;
-  bool dim_ok = ParseFromString("dim", &args, &dim);
-  bool context_ok = ParseFromString("context", &args, &context);
-  bool left_right_context_ok = ParseFromString("left-context",
-                                               &args, &left_context) &&
-                               ParseFromString("right-context", &args,
-                                               &right_context);
-
-  if (!(dim_ok && (context_ok || left_right_context_ok)) ||
-      !args.empty() || dim <= 0)
-    KALDI_ERR << "Invalid initializer for layer of type "
-              << Type() << ": \"" << orig_args << "\"";
-  if (left_right_context_ok)  {
-    KALDI_ASSERT(context.size() == 0);
-    for (int32 i = -1 * left_context; i <= right_context; i++)
-      context.push_back(i);
-  }
-  Init(dim, context);
-}
-
-
-void SpliceMaxComponent::Propagate(const ChunkInfo &in_info,
-                                   const ChunkInfo &out_info,
-                                   const CuMatrixBase<BaseFloat> &in,
-                                   CuMatrixBase<BaseFloat> *out) const  {
-  in_info.Check();
-  out_info.Check();
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-  int32 in_chunk_size  = in_info.ChunkSize(),
-        out_chunk_size = out_info.ChunkSize(),
-        dim = in_info.NumCols();
-
-  CuMatrix<BaseFloat> input_chunk_part(out_chunk_size, dim);
-  for (int32 chunk = 0; chunk < in_info.NumChunks(); chunk++) {
-    CuSubMatrix<BaseFloat> input_chunk(in,
-                                     chunk * in_chunk_size, in_chunk_size,
-                                     0, dim),
-                        output_chunk(*out,
-                                     chunk * out_chunk_size,
-                                     out_chunk_size, 0, dim);
-    for (int32 offset = 0; offset < context_.size(); offset++) {
-      // computing the indices to copy into input_chunk_part from input_chunk
-      // copy the rows of the input matrix which correspond to the current
-      // context index
-      std::vector<int32> input_chunk_inds(out_chunk_size);
-      for (int32 i = 0; i < out_chunk_size; i++) {
-        int32 out_chunk_ind  = i;
-        int32 out_chunk_offset =
-            out_info.GetOffset(out_chunk_ind);
-        input_chunk_inds[i] =
-            in_info.GetIndex(out_chunk_offset + context_[offset]);
-      }
-      CuArray<int32> cu_chunk_inds(input_chunk_inds);
-      input_chunk_part.CopyRows(input_chunk, cu_chunk_inds);
-      if (offset == 0)  {
-        output_chunk.CopyFromMat(input_chunk_part);
-      } else {
-        output_chunk.Max(input_chunk_part);
-      }
-    }
-  }
-}
-
-void SpliceMaxComponent::Backprop(const ChunkInfo &in_info,
-                                  const ChunkInfo &out_info,
-                                  const CuMatrixBase<BaseFloat> &in_value,
-                                  const CuMatrixBase<BaseFloat> &,  // out_value
-                                  const CuMatrixBase<BaseFloat> &out_deriv,
-                                  Component *to_update,
-                                  CuMatrix<BaseFloat> *in_deriv) const  {
-  in_info.Check();
-  out_info.Check();
-  in_info.CheckSize(in_value);
-  out_info.CheckSize(out_deriv);
-  in_deriv->Resize(in_info.NumRows(), in_info.NumCols());
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-
-  int32 out_chunk_size = out_info.ChunkSize(),
-         in_chunk_size = in_info.ChunkSize(),
-                      dim = out_deriv.NumCols();
-
-  KALDI_ASSERT(dim == InputDim());
-
-  for (int32 chunk = 0; chunk < in_info.NumChunks(); chunk++) {
-    CuSubMatrix<BaseFloat> in_deriv_chunk(*in_deriv,
-                                        chunk * in_chunk_size,
-                                        in_chunk_size,
-                                        0, dim),
-                         in_value_chunk(in_value,
-                                        chunk * in_chunk_size,
-                                        in_chunk_size,
-                                        0, dim),
-                        out_deriv_chunk(out_deriv,
-                                        chunk * out_chunk_size,
-                                        out_chunk_size,
-                                        0, dim);
-    for (int32 r = 0; r < out_deriv_chunk.NumRows(); r++) {
-      int32 out_chunk_ind = r;
-      int32 out_chunk_offset =
-          out_info.GetOffset(out_chunk_ind);
-
-      for (int32 c = 0; c < dim; c++) {
-        int32 in_r_max = -1;
-        BaseFloat max_input = -std::numeric_limits<BaseFloat>::infinity();
-        for (int32 context_ind = 0;
-             context_ind < context_.size(); context_ind++) {
-          int32 in_r =
-              in_info.GetIndex(out_chunk_offset + context_[context_ind]);
-          BaseFloat input = in_value_chunk(in_r, c);
-          if (input > max_input) {
-            max_input = input;
-            in_r_max = in_r;
-          }
-        }
-        KALDI_ASSERT(in_r_max != -1);
-        (*in_deriv)(in_r_max, c) += out_deriv_chunk(r, c);
-      }
-    }
-  }
-}
-
-Component *SpliceMaxComponent::Copy() const {
-  SpliceMaxComponent *ans = new SpliceMaxComponent();
-  ans->Init(dim_, context_);
-  return ans;
-}
-
-void SpliceMaxComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<SpliceMaxComponent>", "<Dim>");
-  ReadBasicType(is, binary, &dim_);
-  std::string token;
-  ReadToken(is, false, &token);
-  if (token == "<LeftContext>") {
-    int32 left_context = 0, right_context = 0;
-    std::vector<int32> context;
-    ReadBasicType(is, binary, &left_context);
-    ExpectToken(is, binary, "<RightContext>");
-    ReadBasicType(is, binary, &right_context);
-    for (int32 i = -1 * left_context; i <= right_context; i++)
-      context.push_back(i);
-    context_ = context;
-  } else  if (token == "<Context>") {
-    ReadIntegerVector(is, binary, &context_);
-  } else  {
-    KALDI_ERR << "Unknown token" << token << ", the model might be corrupted";
-  }
-  ExpectToken(is, binary, "</SpliceMaxComponent>");
-}
-
-void SpliceMaxComponent::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<SpliceMaxComponent>");
-  WriteToken(os, binary, "<Dim>");
-  WriteBasicType(os, binary, dim_);
-  WriteToken(os, binary, "<Context>");
-  WriteIntegerVector(os, binary, context_);
-  WriteToken(os, binary, "</SpliceMaxComponent>");
-}
-
-std::string DctComponent::Info() const {
-  std::stringstream stream;
-  stream << Component::Info() << ", dct_dim=" << dct_mat_.NumCols();
-  if (dct_mat_.NumCols() != dct_mat_.NumRows())
-    stream << ", dct_keep_dim=" << dct_mat_.NumRows();
-
-  return stream.str();
-}
-
-void DctComponent::Init(int32 dim, int32 dct_dim, bool reorder, int32 dct_keep_dim) {
-  int dct_keep_dim_ = (dct_keep_dim > 0) ? dct_keep_dim : dct_dim;
-
-  KALDI_ASSERT(dim > 0 && dct_dim > 0);
-  KALDI_ASSERT(dim % dct_dim == 0); // dct_dim must divide dim.
-  KALDI_ASSERT(dct_dim >= dct_keep_dim_);
-  dim_ = dim;
-  dct_mat_.Resize(dct_keep_dim_, dct_dim);
-  reorder_ = reorder;
-  Matrix<BaseFloat> dct_mat(dct_keep_dim_, dct_dim);
-  ComputeDctMatrix(&dct_mat);
-  dct_mat_ = dct_mat;
-}
-
-
-
-void DctComponent::InitFromString(std::string args) {
-  std::string orig_args(args);
-  int32 dim, dct_dim, dct_keep_dim = 0;
-  bool reorder = false;
-
-  bool ok = ParseFromString("dim", &args, &dim);
-  ok = ParseFromString("dct-dim", &args, &dct_dim) && ok;
-  ok = ParseFromString("reorder", &args, &reorder) && ok;
-  ParseFromString("dct-keep-dim", &args, &dct_keep_dim);
-
-  if (!ok || !args.empty() || dim <= 0 || dct_dim <= 0 || dct_keep_dim < 0)
-    KALDI_ERR << "Invalid initializer for layer of type "
-              << Type() << ": \"" << orig_args << "\"";
-  Init(dim, dct_dim, reorder, dct_keep_dim);
-}
-
-void DctComponent::Reorder(CuMatrixBase<BaseFloat> *mat, bool reverse) const {
-  // reorders into contiguous blocks of dize "dct_dim_", assuming that
-  // such blocks were interlaced before.  if reverse==true, does the
-  // reverse.
-  int32 dct_dim = dct_mat_.NumCols(),
-      dct_keep_dim = dct_mat_.NumRows(),
-      block_size_in = dim_ / dct_dim,
-      block_size_out = dct_keep_dim;
-
-  //This does not necesarily needs to be true anymore -- output must be reordered as well, but the dimension differs...
-  //KALDI_ASSERT(mat->NumCols() == dim_);
-  if (reverse) std::swap(block_size_in, block_size_out);
-
-  CuVector<BaseFloat> temp(mat->NumCols());
-  for (int32 i = 0; i < mat->NumRows(); i++) {
-    CuSubVector<BaseFloat> row(*mat, i);
-    int32 num_blocks_in = block_size_out;
-    for (int32 b = 0; b < num_blocks_in; b++) {
-      for (int32 j = 0; j < block_size_in; j++) {
-        temp(j * block_size_out + b) = row(b * block_size_in + j);
-      }
-    }
-    row.CopyFromVec(temp);
-  }
-}
-
-void DctComponent::Propagate(const ChunkInfo &in_info,
-                             const ChunkInfo &out_info,
-                             const CuMatrixBase<BaseFloat> &in,
-                             CuMatrixBase<BaseFloat> *out) const  {
-  KALDI_ASSERT(in.NumCols() == InputDim());
-  int32 dct_dim = dct_mat_.NumCols(),
-        dct_keep_dim = dct_mat_.NumRows(),
-        num_rows = in.NumRows(),
-        num_chunks = dim_ / dct_dim;
-
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(num_rows == out_info.NumRows());
-  KALDI_ASSERT(num_chunks * dct_keep_dim == out_info.NumCols());
-
-  CuMatrix<BaseFloat> in_tmp;
-  if (reorder_) {
-    in_tmp = in;
-    Reorder(&in_tmp, false);
-  }
-
-  for (int32 chunk = 0; chunk < num_chunks; chunk++) {
-    CuSubMatrix<BaseFloat> in_mat(reorder_ ? in_tmp : in,
-                                0, num_rows, dct_dim * chunk, dct_dim),
-                        out_mat(*out,
-                                0, num_rows, dct_keep_dim * chunk, dct_keep_dim);
-
-    out_mat.AddMatMat(1.0, in_mat, kNoTrans, dct_mat_, kTrans, 0.0);
-  }
-  if (reorder_)
-    Reorder(out, true);
-}
-
-void DctComponent::Backprop(const ChunkInfo &,  //in_info,
-                            const ChunkInfo &,  //out_info,
-                            const CuMatrixBase<BaseFloat> &,  //in_value,
-                            const CuMatrixBase<BaseFloat> &,  //out_value,
-                            const CuMatrixBase<BaseFloat> &out_deriv,
-                            Component *,  //to_update,
-                            CuMatrix<BaseFloat> *in_deriv) const  {
-  KALDI_ASSERT(out_deriv.NumCols() == OutputDim());
-
-  int32 dct_dim = dct_mat_.NumCols(),
-        dct_keep_dim = dct_mat_.NumRows(),
-        num_chunks = dim_ / dct_dim,
-        num_rows = out_deriv.NumRows();
-
-  in_deriv->Resize(num_rows, dim_);
-
-  CuMatrix<BaseFloat> out_deriv_tmp;
-  if (reorder_) {
-    out_deriv_tmp = out_deriv;
-    Reorder(&out_deriv_tmp, false);
-  }
-  for (int32 chunk = 0; chunk < num_chunks; chunk++) {
-    CuSubMatrix<BaseFloat> in_deriv_mat(*in_deriv,
-                                      0, num_rows, dct_dim * chunk, dct_dim),
-                        out_deriv_mat(reorder_ ? out_deriv_tmp : out_deriv,
-                                      0, num_rows, dct_keep_dim * chunk, dct_keep_dim);
-
-    // Note: in the reverse direction the DCT matrix is transposed.  This is
-    // normal when computing derivatives; the necessity for the transpose is
-    // obvious if you consider what happens when the input and output dims
-    // differ.
-    in_deriv_mat.AddMatMat(1.0, out_deriv_mat, kNoTrans,
-                           dct_mat_, kNoTrans, 0.0);
-  }
-  if (reorder_)
-    Reorder(in_deriv, true);
-}
-
-Component* DctComponent::Copy() const {
-  DctComponent *ans = new DctComponent();
-  ans->dct_mat_ = dct_mat_;
-  ans->dim_ = dim_;
-  ans->reorder_ = reorder_;
-  return ans;
-}
-
-void DctComponent::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<DctComponent>");
-  WriteToken(os, binary, "<Dim>");
-  WriteBasicType(os, binary, dim_);
-  WriteToken(os, binary, "<DctDim>");
-  int32 dct_dim = dct_mat_.NumCols();
-  WriteBasicType(os, binary, dct_dim);
-  WriteToken(os, binary, "<Reorder>");
-  WriteBasicType(os, binary, reorder_);
-  WriteToken(os, binary, "<DctKeepDim>");
-  int32 dct_keep_dim = dct_mat_.NumRows();
-  WriteBasicType(os, binary, dct_keep_dim);
-  WriteToken(os, binary, "</DctComponent>");
-}
-
-void DctComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<DctComponent>", "<Dim>");
-  ReadBasicType(is, binary, &dim_);
-
-  ExpectToken(is, binary, "<DctDim>");
-  int32 dct_dim;
-  ReadBasicType(is, binary, &dct_dim);
-
-  ExpectToken(is, binary, "<Reorder>");
-  ReadBasicType(is, binary, &reorder_);
-
-  int32 dct_keep_dim = dct_dim;
-  std::string token;
-  ReadToken(is, binary, &token);
-  if (token == "<DctKeepDim>") {
-    ReadBasicType(is, binary, &dct_keep_dim);
-    ExpectToken(is, binary, "</DctComponent>");
-  } else if (token != "</DctComponent>") {
-    KALDI_ERR << "Expected token \"</DctComponent>\", got instead \""
-              << token << "\".";
-  }
-
-  KALDI_ASSERT(dct_dim > 0 && dim_ > 0 && dim_ % dct_dim == 0);
-  Init(dim_, dct_dim, reorder_, dct_keep_dim);
-  //idct_mat_.Resize(dct_keep_dim, dct_dim);
-  //ComputeDctMatrix(&dct_mat_);
-}
-
-void FixedLinearComponent::InitFromString(std::string args) {
-  std::string orig_args = args;
-  std::string filename;
-  bool ok = ParseFromString("matrix", &args, &filename);
-
-  if (!ok || !args.empty())
-    KALDI_ERR << "Invalid initializer for layer of type "
-              << Type() << ": \"" << orig_args << "\"";
-
-  bool binary;
-  Input ki(filename, &binary);
-  CuMatrix<BaseFloat> mat;
-  mat.Read(ki.Stream(), binary);
-  KALDI_ASSERT(mat.NumRows() != 0);
-  Init(mat);
-}
-
-
-std::string FixedLinearComponent::Info() const {
-  std::stringstream stream;
-  BaseFloat mat_size = static_cast<BaseFloat>(mat_.NumRows())
-      * static_cast<BaseFloat>(mat_.NumCols()),
-      mat_stddev = std::sqrt(TraceMatMat(mat_, mat_, kTrans) /
-                         mat_size);
-  stream << Component::Info() << ", params-stddev=" << mat_stddev;
-  return stream.str();
-}
-
-void FixedLinearComponent::Propagate(const ChunkInfo &in_info,
-                                     const ChunkInfo &out_info,
-                                     const CuMatrixBase<BaseFloat> &in,
-                                     CuMatrixBase<BaseFloat> *out) const  {
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-
-  out->AddMatMat(1.0, in, kNoTrans, mat_, kTrans, 0.0);
-}
-
-void FixedLinearComponent::Backprop(const ChunkInfo &,  //in_info,
-                                    const ChunkInfo &,  //out_info,
-                                    const CuMatrixBase<BaseFloat> &,  //in_value,
-                                    const CuMatrixBase<BaseFloat> &,  //out_value,
-                                    const CuMatrixBase<BaseFloat> &out_deriv,
-                                    Component *,  //to_update, // may be identical to "this".
-                                    CuMatrix<BaseFloat> *in_deriv) const  {
-  in_deriv->Resize(out_deriv.NumRows(), mat_.NumCols());
-  in_deriv->AddMatMat(1.0, out_deriv, kNoTrans, mat_, kNoTrans, 0.0);
-}
-
-Component* FixedLinearComponent::Copy() const {
-  FixedLinearComponent *ans = new FixedLinearComponent();
-  ans->Init(mat_);
-  return ans;
-}
-
-
-void FixedLinearComponent::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<FixedLinearComponent>");
-  WriteToken(os, binary, "<CuMatrix>");
-  mat_.Write(os, binary);
-  WriteToken(os, binary, "</FixedLinearComponent>");
-}
-
-void FixedLinearComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<FixedLinearComponent>", "<CuMatrix>");
-  mat_.Read(is, binary);
-  ExpectToken(is, binary, "</FixedLinearComponent>");
-}
-
-void FixedAffineComponent::Init(const CuMatrixBase<BaseFloat> &mat) {
-  KALDI_ASSERT(mat.NumCols() > 1);
-  linear_params_ = mat.Range(0, mat.NumRows(),
-                             0, mat.NumCols() - 1);
-  bias_params_.Resize(mat.NumRows());
-  bias_params_.CopyColFromMat(mat, mat.NumCols() - 1);
-}
-
-
-void FixedAffineComponent::InitFromString(std::string args) {
-  std::string orig_args = args;
-  std::string filename;
-  bool ok = ParseFromString("matrix", &args, &filename);
-
-  if (!ok || !args.empty())
-    KALDI_ERR << "Invalid initializer for layer of type "
-              << Type() << ": \"" << orig_args << "\"";
-
-  bool binary;
-  Input ki(filename, &binary);
-  CuMatrix<BaseFloat> mat;
-  mat.Read(ki.Stream(), binary);
-  KALDI_ASSERT(mat.NumRows() != 0);
-  Init(mat);
-}
-
-
-std::string FixedAffineComponent::Info() const {
-  std::stringstream stream;
-  BaseFloat linear_params_size = static_cast<BaseFloat>(linear_params_.NumRows())
-      * static_cast<BaseFloat>(linear_params_.NumCols()),
-      linear_params_stddev =
-      std::sqrt(TraceMatMat(linear_params_,
-                            linear_params_, kTrans) /
-                linear_params_size),
-      bias_params_stddev = std::sqrt(VecVec(bias_params_, bias_params_) /
-                                     bias_params_.Dim());
-
-  stream << Component::Info() << ", linear-params-stddev=" << linear_params_stddev
-         << ", bias-params-stddev=" << bias_params_stddev;
-  return stream.str();
-}
-
-void FixedAffineComponent::Propagate(const ChunkInfo &in_info,
-                                     const ChunkInfo &out_info,
-                                     const CuMatrixBase<BaseFloat> &in,
-                                     CuMatrixBase<BaseFloat> *out) const  {
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-
-  out->AddMatMat(1.0, in, kNoTrans, linear_params_, kTrans, 0.0);
-  out->AddVecToRows(1.0, bias_params_);
-}
-
-void FixedAffineComponent::Backprop(const ChunkInfo &,  //in_info,
-                                    const ChunkInfo &,  //out_info,
-                                    const CuMatrixBase<BaseFloat> &,  //in_value,
-                                    const CuMatrixBase<BaseFloat> &,  //out_value,
-                                    const CuMatrixBase<BaseFloat> &out_deriv,
-                                    Component *,  //to_update, // may be identical to "this".
-                                    CuMatrix<BaseFloat> *in_deriv) const  {
-  in_deriv->Resize(out_deriv.NumRows(), linear_params_.NumCols());
-  in_deriv->AddMatMat(1.0, out_deriv, kNoTrans, linear_params_, kNoTrans, 0.0);
-}
-
-Component* FixedAffineComponent::Copy() const {
-  FixedAffineComponent *ans = new FixedAffineComponent();
-  ans->linear_params_ = linear_params_;
-  ans->bias_params_ = bias_params_;
-  return ans;
-}
-
-
-void FixedAffineComponent::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<FixedAffineComponent>");
-  WriteToken(os, binary, "<LinearParams>");
-  linear_params_.Write(os, binary);
-  WriteToken(os, binary, "<BiasParams>");
-  bias_params_.Write(os, binary);
-  WriteToken(os, binary, "</FixedAffineComponent>");
-}
-
-void FixedAffineComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<FixedAffineComponent>", "<LinearParams>");
-  linear_params_.Read(is, binary);
-  ExpectToken(is, binary, "<BiasParams>");
-  bias_params_.Read(is, binary);
-  ExpectToken(is, binary, "</FixedAffineComponent>");
-}
-
-
-void FixedScaleComponent::Init(const CuVectorBase<BaseFloat> &scales) {
-  KALDI_ASSERT(scales.Dim() != 0);
-  scales_ = scales;
-}
-
-void FixedScaleComponent::InitFromString(std::string args) {
-  std::string orig_args = args;
-  std::string filename;
-  bool ok = ParseFromString("scales", &args, &filename);
-
-  if (!ok || !args.empty())
-    KALDI_ERR << "Invalid initializer for layer of type "
-              << Type() << ": \"" << orig_args << "\"";
-
-  CuVector<BaseFloat> vec;
-  ReadKaldiObject(filename, &vec);
-  Init(vec);
-}
-
-
-std::string FixedScaleComponent::Info() const {
-  std::stringstream stream;
-  BaseFloat scales_size = static_cast<BaseFloat>(scales_.Dim()),
-      scales_mean = scales_.Sum() / scales_size,
-      scales_stddev = std::sqrt(VecVec(scales_, scales_) / scales_size
-       - (scales_mean * scales_mean));
-  stream << Component::Info() << ", scales-mean=" << scales_mean
-         << ", scales-stddev=" << scales_stddev;
-  return stream.str();
-}
-
-void FixedScaleComponent::Propagate(const ChunkInfo &in_info,
-                                    const ChunkInfo &out_info,
-                                    const CuMatrixBase<BaseFloat> &in,
-                                    CuMatrixBase<BaseFloat> *out) const  {
-  out->CopyFromMat(in);
-  out->MulColsVec(scales_);
-}
-
-void FixedScaleComponent::Backprop(const ChunkInfo &, //in_info,
-                                   const ChunkInfo &, //out_info,
-                                   const CuMatrixBase<BaseFloat> &, //in_value,
-                                   const CuMatrixBase<BaseFloat> &, //out_value,
-                                   const CuMatrixBase<BaseFloat> &out_deriv,
-                                   Component *, //to_update, // may be identical to "this".
-                                   CuMatrix<BaseFloat> *in_deriv) const {
-  *in_deriv = out_deriv;
-  in_deriv->MulColsVec(scales_);
-}
-
-Component* FixedScaleComponent::Copy() const {
-  FixedScaleComponent *ans = new FixedScaleComponent();
-  ans->scales_ = scales_;
-  return ans;
-}
-
-
-void FixedScaleComponent::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<FixedScaleComponent>");
-  WriteToken(os, binary, "<Scales>");
-  scales_.Write(os, binary);
-  WriteToken(os, binary, "</FixedScaleComponent>");
-}
-
-void FixedScaleComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<FixedScaleComponent>", "<Scales>");
-  scales_.Read(is, binary);
-  ExpectToken(is, binary, "</FixedScaleComponent>");
-}
-
-void FixedBiasComponent::Init(const CuVectorBase<BaseFloat> &bias) {
-  KALDI_ASSERT(bias.Dim() != 0);
-  bias_ = bias;
-}
-
-void FixedBiasComponent::InitFromString(std::string args) {
-  std::string orig_args = args;
-  std::string filename;
-  bool ok = ParseFromString("bias", &args, &filename);
-
-  if (!ok || !args.empty())
-    KALDI_ERR << "Invalid initializer for layer of type "
-              << Type() << ": \"" << orig_args << "\"";
-
-  CuVector<BaseFloat> vec;
-  ReadKaldiObject(filename, &vec);
-  Init(vec);
-}
-
-
-std::string FixedBiasComponent::Info() const {
-  std::stringstream stream;
-  BaseFloat bias_size = static_cast<BaseFloat>(bias_.Dim()),
-      bias_mean = bias_.Sum() / bias_size,
-      bias_stddev = std::sqrt(VecVec(bias_, bias_) / bias_size)
-       - (bias_mean * bias_mean);
-  stream << Component::Info() << ", bias-mean=" << bias_mean
-         << ", bias-stddev=" << bias_stddev;
-  return stream.str();
-}
-
-void FixedBiasComponent::Propagate(const ChunkInfo &in_info,
-                                   const ChunkInfo &out_info,
-                                   const CuMatrixBase<BaseFloat> &in,
-                                   CuMatrixBase<BaseFloat> *out) const  {
-  out->CopyFromMat(in);
-  out->AddVecToRows(1.0, bias_, 1.0);
-}
-
-void FixedBiasComponent::Backprop(const ChunkInfo &, //in_info,
-                                  const ChunkInfo &, //out_info,
-                                  const CuMatrixBase<BaseFloat> &,  //in_value,
-                                  const CuMatrixBase<BaseFloat> &,  //out_value,
-                                  const CuMatrixBase<BaseFloat> &out_deriv,
-                                  Component *,  //to_update,
-                                  CuMatrix<BaseFloat> *in_deriv) const  {
-  *in_deriv = out_deriv;
-}
-
-Component* FixedBiasComponent::Copy() const {
-  FixedBiasComponent *ans = new FixedBiasComponent();
-  ans->bias_ = bias_;
-  return ans;
-}
-
-
-void FixedBiasComponent::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<FixedBiasComponent>");
-  WriteToken(os, binary, "<Bias>");
-  bias_.Write(os, binary);
-  WriteToken(os, binary, "</FixedBiasComponent>");
-}
-
-void FixedBiasComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<FixedBiasComponent>", "<Bias>");
-  bias_.Read(is, binary);
-  ExpectToken(is, binary, "</FixedBiasComponent>");
-}
-
-
-
-
-std::string DropoutComponent::Info() const {
-  std::stringstream stream;
-  stream << Component::Info() << ", dropout_proportion = "
-         << dropout_proportion_ << ", dropout_scale = "
-         << dropout_scale_;
-  return stream.str();
-}
-
-void DropoutComponent::InitFromString(std::string args) {
-  std::string orig_args(args);
-  int32 dim;
-  BaseFloat dropout_proportion = 0.5, dropout_scale = 0.0;
-  bool ok = ParseFromString("dim", &args, &dim);
-  ParseFromString("dropout-proportion", &args, &dropout_proportion);
-  ParseFromString("dropout-scale", &args, &dropout_scale);
-
-  if (!ok || !args.empty() || dim <= 0)
-    KALDI_ERR << "Invalid initializer for layer of type DropoutComponent: \""
-              << orig_args << "\"";
-  Init(dim, dropout_proportion, dropout_scale);
-}
-
-void DropoutComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<DropoutComponent>", "<Dim>");
-  ReadBasicType(is, binary, &dim_);
-  ExpectToken(is, binary, "<DropoutScale>");
-  ReadBasicType(is, binary, &dropout_scale_);
-  ExpectToken(is, binary, "<DropoutProportion>");
-  ReadBasicType(is, binary, &dropout_proportion_);
-  ExpectToken(is, binary, "</DropoutComponent>");
-}
-
-void DropoutComponent::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<DropoutComponent>");
-  WriteToken(os, binary, "<Dim>");
-  WriteBasicType(os, binary, dim_);
-  WriteToken(os, binary, "<DropoutScale>");
-  WriteBasicType(os, binary, dropout_scale_);
-  WriteToken(os, binary, "<DropoutProportion>");
-  WriteBasicType(os, binary, dropout_proportion_);
-  WriteToken(os, binary, "</DropoutComponent>");
-}
-
-
-void DropoutComponent::Init(int32 dim,
-                            BaseFloat dropout_proportion,
-                            BaseFloat dropout_scale){
-  dim_ = dim;
-  dropout_proportion_ = dropout_proportion;
-  dropout_scale_ = dropout_scale;
-}
-
-void DropoutComponent::Propagate(const ChunkInfo &in_info,
-                                 const ChunkInfo &out_info,
-                                 const CuMatrixBase<BaseFloat> &in,
-                                 CuMatrixBase<BaseFloat> *out) const  {
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-  KALDI_ASSERT(in.NumCols() == this->InputDim());
-
-  BaseFloat dp = dropout_proportion_;
-  KALDI_ASSERT(dp < 1.0 && dp >= 0.0);
-  KALDI_ASSERT(dropout_scale_ <= 1.0 && dropout_scale_ >= 0.0);
-
-  BaseFloat low_scale = dropout_scale_,
-      high_scale = (1.0 - (dp * low_scale)) / (1.0 - dp),
-      average = (low_scale * dp) +
-                (high_scale * (1.0 - dp));
-  KALDI_ASSERT(fabs(average - 1.0) < 0.01);
-
-  // This const_cast is only safe assuming you don't attempt
-  // to use multi-threaded code with the GPU.
-  const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
-
-
-  out->Add(-dp); // now, a proportion "dp" will be <0.0
-  out->ApplyHeaviside(); // apply the function (x>0?1:0).  Now, a proportion "dp" will
-                         // be zero and (1-dp) will be 1.0.
-  if ((high_scale - low_scale) != 1.0)
-    out->Scale(high_scale - low_scale); // now, "dp" are 0 and (1-dp) are "high_scale-low_scale".
-  if (low_scale != 0.0)
-    out->Add(low_scale); // now "dp" equal "low_scale" and (1.0-dp) equal "high_scale".
-
-  out->MulElements(in);
-}
-
-void DropoutComponent::Backprop(const ChunkInfo &,  //in_info,
-                                const ChunkInfo &,  //out_info,
-                                const CuMatrixBase<BaseFloat> &in_value,
-                                const CuMatrixBase<BaseFloat> &out_value,
-                                const CuMatrixBase<BaseFloat> &out_deriv,
-                                Component *,  //to_update
-                                CuMatrix<BaseFloat> *in_deriv) const  {
-  KALDI_ASSERT(SameDim(in_value, out_value) && SameDim(in_value, out_deriv));
-  in_deriv->Resize(out_deriv.NumRows(), out_deriv.NumCols());
-  in_deriv->SetMatMatDivMat(out_deriv, out_value, in_value);
-}
-
-Component* DropoutComponent::Copy() const {
-  return new DropoutComponent(dim_,
-                              dropout_proportion_,
-                              dropout_scale_);
-}
-
-void AdditiveNoiseComponent::InitFromString(std::string args) {
-  std::string orig_args(args);
-  int32 dim;
-  BaseFloat stddev = 1.0;
-  bool ok = ParseFromString("dim", &args, &dim);
-  ParseFromString("stddev", &args, &stddev);
-
-  if (!ok || !args.empty() || dim <= 0)
-    KALDI_ERR << "Invalid initializer for layer of type AdditiveNoiseComponent: \""
-              << orig_args << "\"";
-  Init(dim, stddev);
-}
-
-void AdditiveNoiseComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<AdditiveNoiseComponent>", "<Dim>");
-  ReadBasicType(is, binary, &dim_);
-  ExpectToken(is, binary, "<Stddev>");
-  ReadBasicType(is, binary, &stddev_);
-  ExpectToken(is, binary, "</AdditiveNoiseComponent>");
-}
-
-void AdditiveNoiseComponent::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<AdditiveNoiseComponent>");
-  WriteToken(os, binary, "<Dim>");
-  WriteBasicType(os, binary, dim_);
-  WriteToken(os, binary, "<Stddev>");
-  WriteBasicType(os, binary, stddev_);
-  WriteToken(os, binary, "</AdditiveNoiseComponent>");
-}
-
-void AdditiveNoiseComponent::Init(int32 dim, BaseFloat stddev) {
-  dim_ = dim;
-  stddev_ = stddev;
-}
-
-void AdditiveNoiseComponent::Propagate(const ChunkInfo &in_info,
-                                       const ChunkInfo &out_info,
-                                       const CuMatrixBase<BaseFloat> &in,
-                                       CuMatrixBase<BaseFloat> *out) const  {
-  KALDI_ASSERT(in.NumCols() == this->InputDim());
-  out->CopyFromMat(in);
-  CuMatrix<BaseFloat> rand(in.NumRows(), in.NumCols());
-  const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(&rand);
-  out->AddMat(stddev_, rand);
-}
-
-Convolutional1dComponent::Convolutional1dComponent():
-    UpdatableComponent(),
-    patch_dim_(0), patch_step_(0), patch_stride_(0),
-    appended_conv_(false), is_gradient_(false) {}
-
-Convolutional1dComponent::Convolutional1dComponent(const Convolutional1dComponent &component):
-    UpdatableComponent(component),
-    filter_params_(component.filter_params_),
-    bias_params_(component.bias_params_),
-    appended_conv_(component.appended_conv_),
-    is_gradient_(component.is_gradient_) {}
-
-Convolutional1dComponent::Convolutional1dComponent(const CuMatrixBase<BaseFloat> &filter_params,
-                                                   const CuVectorBase<BaseFloat> &bias_params,
-                                                   BaseFloat learning_rate):
-    UpdatableComponent(learning_rate),
-    filter_params_(filter_params),
-    bias_params_(bias_params) {
-  KALDI_ASSERT(filter_params.NumRows() == bias_params.Dim() &&
-               bias_params.Dim() != 0);
-  appended_conv_ = false;
-  is_gradient_ = false;
-}
-
-// aquire input dim
-int32 Convolutional1dComponent::InputDim() const {
-  int32 filter_dim = filter_params_.NumCols();
-  int32 num_splice = filter_dim / patch_dim_;
-  return patch_stride_ * num_splice;
-}
-
-// aquire output dim
-int32 Convolutional1dComponent::OutputDim() const {
-  int32 num_filters = filter_params_.NumRows();
-  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-  return num_patches * num_filters;
-}
-
-// initialize the component using hyperparameters
-void Convolutional1dComponent::Init(BaseFloat learning_rate,
-                                    int32 input_dim, int32 output_dim,
-                                    int32 patch_dim, int32 patch_step,
-                                    int32 patch_stride, BaseFloat param_stddev,
-                                    BaseFloat bias_stddev, bool appended_conv) {
-  UpdatableComponent::Init(learning_rate);
-  patch_dim_ = patch_dim;
-  patch_step_ = patch_step;
-  patch_stride_ = patch_stride;
-  appended_conv_ = appended_conv;
-  int32 num_splice = input_dim / patch_stride;
-  int32 filter_dim = num_splice * patch_dim;
-  int32 num_patches = 1 + (patch_stride - patch_dim) / patch_step;
-  int32 num_filters = output_dim / num_patches;
-  KALDI_ASSERT(input_dim % patch_stride == 0);
-  KALDI_ASSERT((patch_stride - patch_dim) % patch_step == 0);
-  KALDI_ASSERT(output_dim % num_patches == 0);
-
-  filter_params_.Resize(num_filters, filter_dim);
-  bias_params_.Resize(num_filters);
-  KALDI_ASSERT(param_stddev >= 0.0 && bias_stddev >= 0.0);
-  filter_params_.SetRandn();
-  filter_params_.Scale(param_stddev);
-  bias_params_.SetRandn();
-  bias_params_.Scale(bias_stddev);
-}
-
-// initialize the component using predefined matrix file
-void Convolutional1dComponent::Init(BaseFloat learning_rate, int32 patch_dim,
-                                    int32 patch_step, int32 patch_stride,
-                                    std::string matrix_filename,
-                                    bool appended_conv) {
-  UpdatableComponent::Init(learning_rate);
-  patch_dim_ = patch_dim;
-  patch_step_ = patch_step;
-  patch_stride_ = patch_stride;
-  appended_conv_ = appended_conv;
-  CuMatrix<BaseFloat> mat;
-  ReadKaldiObject(matrix_filename, &mat);
-  KALDI_ASSERT(mat.NumCols() >= 2);
-  int32 filter_dim = mat.NumCols() - 1, num_filters = mat.NumRows();
-  filter_params_.Resize(num_filters, filter_dim);
-  bias_params_.Resize(num_filters);
-  filter_params_.CopyFromMat(mat.Range(0, num_filters, 0, filter_dim));
-  bias_params_.CopyColFromMat(mat, filter_dim);
-}
-
-// resize the component, setting the parameters to zero, while
-// leaving any other configuration values the same
-void Convolutional1dComponent::Resize(int32 input_dim, int32 output_dim) {
-  KALDI_ASSERT(input_dim > 0 && output_dim > 0);
-  int32 num_splice = input_dim / patch_stride_;
-  int32 filter_dim = num_splice * patch_dim_;
-  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-  int32 num_filters = output_dim / num_patches;
-  KALDI_ASSERT(input_dim % patch_stride_ == 0);
-  KALDI_ASSERT((patch_stride_ - patch_dim_) % patch_step_ == 0);
-  KALDI_ASSERT(output_dim % num_patches == 0);
-  filter_params_.Resize(num_filters, filter_dim);
-  bias_params_.Resize(num_filters);
-}
-
-// display information about component
-std::string Convolutional1dComponent::Info() const {
-  std::stringstream stream;
-  BaseFloat filter_params_size = static_cast<BaseFloat>(filter_params_.NumRows())
-                                 * static_cast<BaseFloat>(filter_params_.NumCols());
-  BaseFloat filter_stddev =
-            std::sqrt(TraceMatMat(filter_params_, filter_params_, kTrans) /
-                      filter_params_size),
-            bias_stddev = std::sqrt(VecVec(bias_params_, bias_params_) /
-                                    bias_params_.Dim());
-
-  int32 num_splice = InputDim() / patch_stride_;
-  int32 filter_dim = num_splice * patch_dim_;
-  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-  int32 num_filters = OutputDim() / num_patches;
-
-  stream << Type() << ", input-dim=" << InputDim()
-         << ", output-dim=" << OutputDim()
-         << ", num-splice=" << num_splice
-         << ", num-patches=" << num_patches
-         << ", num-filters=" << num_filters
-         << ", filter-dim=" << filter_dim
-         << ", filter-params-stddev=" << filter_stddev
-         << ", bias-params-stddev=" << bias_stddev
-         << ", appended-conv=" << appended_conv_
-         << ", learning-rate=" << LearningRate();
-  return stream.str();
-}
-
-// initialize the component using configuration file
-void Convolutional1dComponent::InitFromString(std::string args) {
-  std::string orig_args(args);
-  bool ok = true, appended_conv = false;
-  BaseFloat learning_rate = learning_rate_;
-  std::string matrix_filename;
-  int32 input_dim = -1, output_dim = -1;
-  int32 patch_dim = -1, patch_step = -1, patch_stride = -1;
-  ParseFromString("learning-rate", &args, &learning_rate);
-  ParseFromString("appended-conv", &args, &appended_conv);
-  ok = ok && ParseFromString("patch-dim", &args, &patch_dim);
-  ok = ok && ParseFromString("patch-step", &args, &patch_step);
-  ok = ok && ParseFromString("patch-stride", &args, &patch_stride);
-  if (ParseFromString("matrix", &args, &matrix_filename)) {
-    // initialize from prefined parameter matrix
-    Init(learning_rate, patch_dim, patch_step, patch_stride,
-         matrix_filename, appended_conv);
-    if (ParseFromString("input-dim", &args, &input_dim))
-      KALDI_ASSERT(input_dim == InputDim() &&
-               "input-dim mismatch vs. matrix.");
-    if (ParseFromString("output-dim", &args, &output_dim))
-            KALDI_ASSERT(output_dim == OutputDim() &&
-                     "output-dim mismatch vs. matrix.");
-  } else {
-    // initialize from configuration
-    ok = ok && ParseFromString("input-dim", &args, &input_dim);
-    ok = ok && ParseFromString("output-dim", &args, &output_dim);
-    BaseFloat param_stddev = 1.0 / std::sqrt(input_dim), bias_stddev = 1.0;
-    ParseFromString("param-stddev", &args, &param_stddev);
-    ParseFromString("bias-stddev", &args, &bias_stddev);
-    Init(learning_rate, input_dim, output_dim, patch_dim,
-         patch_step, patch_stride, param_stddev, bias_stddev, appended_conv);
-  }
-  if (!args.empty())
-    KALDI_ERR << "Could not process these elements in initializer: " << args;
-  if (!ok)
-    KALDI_ERR << "Bad initializer " << orig_args;
-}
-
-// propagation function
-
-/*
-   In Convolution1dComponent, filter is defined $num-filters x $filter-dim,
-   and bias vector B is defined by length $num-filters. The propatation is
-   Y = X o A' + B
-   where "o" is executing matrix-matrix convolution, which consists of a group
-   of vector-matrix convolutions.
-   For instance, the convolution of X(t) and the i-th filter A(i) is
-   Y(t,i) = X(t) o A'(i) + B(i)
-   The convolution used here is valid convolution. Meaning that the
-   output of M o N is of dim |M| - |N| + 1, assuming M is not shorter then N.
-
-   By default, input is arranged by
-   x (time), y (channel), z(frequency)
-   and output is arranged by
-   x (time), y (frequency), z(channel).
-   When appending convolutional1dcomponent, appended_conv_ should be
-   set ture for the appended convolutional1dcomponent.
-*/
-void Convolutional1dComponent::Propagate(const ChunkInfo &in_info,
-                                         const ChunkInfo &out_info,
-                                         const CuMatrixBase<BaseFloat> &in,
-                                         CuMatrixBase<BaseFloat> *out) const {
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-
-  // dims
-  int32 num_splice = InputDim() / patch_stride_;
-  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-  int32 num_filters = filter_params_.NumRows();
-  int32 num_frames = in.NumRows();
-  int32 filter_dim = filter_params_.NumCols();
-
-  /** Buffer of reshaped inputs:
-   *  1row = vectorized rectangular feature patches
-   *  1col = dim over speech frames,
-   */
-  CuMatrix<BaseFloat> patches(num_frames, filter_dim * num_patches, kUndefined);
-  // column_map is indexed by the column-index of "patches",
-  // and the value is the corresponding column-index of "in".
-  std::vector<int32> column_map(filter_dim * num_patches);
-
-  // build-up a column selection map
-  for (int32 patch = 0, index = 0; patch < num_patches; patch++) {
-    int32 fstride = patch * patch_step_;
-    for (int32 splice = 0; splice < num_splice; splice++) {
-      int32 cstride = splice * patch_stride_;
-      for (int32 d = 0; d < patch_dim_; d++, index++) {
-        if (appended_conv_)
-          column_map[index] = (fstride + d) * num_splice + splice;
-        else
-          column_map[index] = fstride + cstride + d;
-      }
-    }
-  }
-  CuArray<int32> cu_cols(column_map);
-  patches.CopyCols(in, cu_cols);
-
-  //
-  // compute filter activations
-  //
-
-  std::vector<CuSubMatrix<BaseFloat>* > tgt_batch, patch_batch, filter_params_batch;
-
-  CuSubMatrix<BaseFloat>* filter_params_elem = new CuSubMatrix<BaseFloat>(
-      filter_params_, 0, filter_params_.NumRows(), 0, filter_params_.NumCols());
-
-  // form batch in vector container
-  for (int32 p = 0; p < num_patches; p++) {
-    // form batch in vector container. for filter_params_batch, all elements
-    // point to the same copy filter_params_elem
-    tgt_batch.push_back(new CuSubMatrix<BaseFloat>(out->ColRange(p * num_filters,
-                                                                 num_filters)));
-    patch_batch.push_back(new CuSubMatrix<BaseFloat>(
-        patches.ColRange(p * filter_dim, filter_dim)));
-    filter_params_batch.push_back(filter_params_elem);
-
-    tgt_batch[p]->AddVecToRows(1.0, bias_params_, 0.0); // add bias
-  }
-
-  // apply all filters
-  AddMatMatBatched<BaseFloat>(1.0, tgt_batch, patch_batch, kNoTrans,
-                              filter_params_batch, kTrans, 1.0);
-
-  // release memory
-  delete filter_params_elem;
-  for (int32 p = 0; p < num_patches; p++) {
-    delete tgt_batch[p];
-    delete patch_batch[p];
-  }
-}
-
-// scale the parameters
-void Convolutional1dComponent::Scale(BaseFloat scale) {
-  filter_params_.Scale(scale);
-  bias_params_.Scale(scale);
-}
-
-// add another convolution component
-void Convolutional1dComponent::Add(BaseFloat alpha, const UpdatableComponent &other_in) {
-  const Convolutional1dComponent *other =
-      dynamic_cast<const Convolutional1dComponent*>(&other_in);
-  KALDI_ASSERT(other != NULL);
-  filter_params_.AddMat(alpha, other->filter_params_);
-  bias_params_.AddVec(alpha, other->bias_params_);
-}
-
-/*
- This function does an operation similar to reversing a map,
- except it handles maps that are not one-to-one by outputting
- the reversed map as a vector of lists.
- @param[in] forward_indexes is a vector of int32, each of whose
-            elements is between 0 and input_dim - 1.
- @param[in] input_dim. See definitions of forward_indexes and
-            backward_indexes.
- @param[out] backward_indexes is a vector of dimension input_dim
-            of lists, The list at (backward_indexes[i]) is a list
-            of all indexes j such that forward_indexes[j] = i.
-*/
-void Convolutional1dComponent::ReverseIndexes(const std::vector<int32> &forward_indexes,
-                                              int32 input_dim,
-                                              std::vector<std::vector<int32> > *backward_indexes) {
-  int32 i, size = forward_indexes.size();
-  int32 reserve_size = 2 + size / input_dim;
-  backward_indexes->resize(input_dim);
-  std::vector<std::vector<int32> >::iterator iter = backward_indexes->begin(),
-    end = backward_indexes->end();
-  for (; iter != end; ++iter)
-    iter->reserve(reserve_size);
-  for (int32 j = 0; j < forward_indexes.size(); j++) {
-    i = forward_indexes[j];
-    KALDI_ASSERT(i < input_dim);
-    (*backward_indexes)[i].push_back(j);
-  }
-}
-
-/*
- This function transforms a vector of lists into a list of vectors,
- padded with -1.
- @param[in] The input vector of lists. Let in.size() be D, and let
-            the longest list length (i.e. the max of in[i].size()) be L.
- @param[out] The output list of vectors. The length of the list will
-            be L, each vector-dimension will be D (i.e. out[i].size() == D),
-            and if in[i] == j, then for some k we will have that
-            out[k][j] = i. The output vectors are padded with -1
-            where necessary if not all the input lists have the same side.
-*/
-void Convolutional1dComponent::RearrangeIndexes(const std::vector<std::vector<int32> > &in,
-                                                std::vector<std::vector<int32> > *out) {
-  int32 D = in.size();
-  int32 L = 0;
-  for (int32 i = 0; i < D; i++)
-    if (in[i].size() > L)
-      L = in[i].size();
-  out->resize(L);
-  for (int32 i = 0; i < L; i++)
-    (*out)[i].resize(D, -1);
-  for (int32 i = 0; i < D; i++) {
-    for (int32 j = 0; j < in[i].size(); j++) {
-      (*out)[j][i] = in[i][j];
-    }
-  }
-}
-
-// back propagation function
-void Convolutional1dComponent::Backprop(const ChunkInfo &in_info,
-                                        const ChunkInfo &out_info,
-                                        const CuMatrixBase<BaseFloat> &in_value,
-                                        const CuMatrixBase<BaseFloat> &out_value,
-                                        const CuMatrixBase<BaseFloat> &out_deriv,
-                                        Component *to_update_in,
-                                        CuMatrix<BaseFloat> *in_deriv) const {
-  in_deriv->Resize(out_deriv.NumRows(), InputDim());
-  Convolutional1dComponent *to_update = dynamic_cast<Convolutional1dComponent*>(to_update_in);
-  int32 num_splice = InputDim() / patch_stride_;
-  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-  int32 num_filters = filter_params_.NumRows();
-  int32 num_frames = out_deriv.NumRows();
-  int32 filter_dim = filter_params_.NumCols();
-
-  /** Buffer for backpropagation:
-   *  derivatives in the domain of 'patches_',
-   *  1row = vectorized rectangular feature patches,
-   *  1col = dim over speech frames,
-   */
-  CuMatrix<BaseFloat> patches_deriv(num_frames, filter_dim * num_patches, kSetZero);
-
-  //
-  // backpropagate to vector of matrices
-  // (corresponding to position of a filter)
-  //
-  std::vector<CuSubMatrix<BaseFloat>* > patch_deriv_batch, out_deriv_batch,
-      filter_params_batch;
-
-  CuSubMatrix<BaseFloat>* filter_params_elem = new CuSubMatrix<BaseFloat>(
-      filter_params_, 0, filter_params_.NumRows(), 0, filter_params_.NumCols());
-
-  // form batch in vector container
-  for (int32 p = 0; p < num_patches; p++) {
-    // form batch in vector container. for filter_params_batch, all elements
-    // point to the same copy filter_params_elem
-    patch_deriv_batch.push_back(new CuSubMatrix<BaseFloat>(patches_deriv.ColRange(
-        p * filter_dim, filter_dim)));
-    out_deriv_batch.push_back(new CuSubMatrix<BaseFloat>(out_deriv.ColRange(
-        p * num_filters, num_filters)));
-    filter_params_batch.push_back(filter_params_elem);
-  }
-  AddMatMatBatched<BaseFloat>(1.0, patch_deriv_batch, out_deriv_batch, kNoTrans,
-                              filter_params_batch, kNoTrans, 0.0);
-
-  // release memory
-  delete filter_params_elem;
-  for (int32 p = 0; p < num_patches; p++) {
-    delete patch_deriv_batch[p];
-    delete out_deriv_batch[p];
-  }
-
-  // sum the derivatives into in_deriv
-  std::vector<int32> column_map(filter_dim * num_patches);
-  for (int32 patch = 0, index = 0; patch < num_patches; patch++) {
-    int32 fstride = patch * patch_step_;
-    for (int32 splice = 0; splice < num_splice; splice++) {
-      int32 cstride = splice * patch_stride_;
-      for (int32 d = 0; d < patch_dim_; d++, index++) {
-        if (appended_conv_)
-          column_map[index] = (fstride + d) * num_splice + splice;
-        else
-          column_map[index] = fstride + cstride + d;
-      }
-    }
-  }
-  std::vector<std::vector<int32> > reversed_column_map;
-  ReverseIndexes(column_map, InputDim(), &reversed_column_map);
-  std::vector<std::vector<int32> > rearranged_column_map;
-  RearrangeIndexes(reversed_column_map, &rearranged_column_map);
-  for (int32 p = 0; p < rearranged_column_map.size(); p++) {
-    CuArray<int32> cu_cols(rearranged_column_map[p]);
-    in_deriv->AddCols(patches_deriv, cu_cols);
-  }
-
-  if (to_update != NULL) {
-    // Next update the model (must do this 2nd so the derivatives we propagate
-    // are accurate, in case this == to_update_in.)
-    to_update->Update(in_value, out_deriv);
-  }
-}
-
-void Convolutional1dComponent::SetZero(bool treat_as_gradient) {
-  if (treat_as_gradient) {
-    SetLearningRate(1.0);
-  }
-  filter_params_.SetZero();
-  bias_params_.SetZero();
-  if (treat_as_gradient) {
-    is_gradient_ = true;
-  }
-}
-
-void Convolutional1dComponent::Read(std::istream &is, bool binary) {
-  std::ostringstream ostr_beg, ostr_end;
-  ostr_beg << "<" << Type() << ">"; // e.g. "<Convolutional1dComponent>"
-  ostr_end << "</" << Type() << ">"; // e.g. "</Convolutional1dComponent>"
-  // might not see the "<Convolutional1dComponent>" part because
-  // of how ReadNew() works.
-  ExpectOneOrTwoTokens(is, binary, ostr_beg.str(), "<LearningRate>");
-  ReadBasicType(is, binary, &learning_rate_);
-  ExpectToken(is, binary, "<PatchDim>");
-  ReadBasicType(is, binary, &patch_dim_);
-  ExpectToken(is, binary, "<PatchStep>");
-  ReadBasicType(is, binary, &patch_step_);
-  ExpectToken(is, binary, "<PatchStride>");
-  ReadBasicType(is, binary, &patch_stride_);
-  // back-compatibility
-  std::string tok;
-  ReadToken(is, binary, &tok);
-  if (tok == "<AppendedConv>") {
-    ReadBasicType(is, binary, &appended_conv_);
-    ExpectToken(is, binary, "<FilterParams>");
-  } else {
-    appended_conv_ = false;
-    KALDI_ASSERT(tok == "<FilterParams>");
-  }
-  filter_params_.Read(is, binary);
-  ExpectToken(is, binary, "<BiasParams>");
-  bias_params_.Read(is, binary);
-  ReadToken(is, binary, &tok);
-  if (tok == "<IsGradient>") {
-    ReadBasicType(is, binary, &is_gradient_);
-    ExpectToken(is, binary, ostr_end.str());
-  } else {
-    is_gradient_ = false;
-    KALDI_ASSERT(tok == ostr_end.str());
-  }
-}
-
-void Convolutional1dComponent::Write(std::ostream &os, bool binary) const {
-  std::ostringstream ostr_beg, ostr_end;
-  ostr_beg << "<" << Type() << ">"; // e.g. "<Convolutional1dComponent>"
-  ostr_end << "</" << Type() << ">"; // e.g. "</Convolutional1dComponent>"
-  WriteToken(os, binary, ostr_beg.str());
-  WriteToken(os, binary, "<LearningRate>");
-  WriteBasicType(os, binary, learning_rate_);
-  WriteToken(os, binary, "<PatchDim>");
-  WriteBasicType(os, binary, patch_dim_);
-  WriteToken(os, binary, "<PatchStep>");
-  WriteBasicType(os, binary, patch_step_);
-  WriteToken(os, binary, "<PatchStride>");
-  WriteBasicType(os, binary, patch_stride_);
-  WriteToken(os, binary, "<AppendedConv>");
-  WriteBasicType(os, binary, appended_conv_);
-  WriteToken(os, binary, "<FilterParams>");
-  filter_params_.Write(os, binary);
-  WriteToken(os, binary, "<BiasParams>");
-  bias_params_.Write(os, binary);
-  WriteToken(os, binary, "<IsGradient>");
-  WriteBasicType(os, binary, is_gradient_);
-  WriteToken(os, binary, ostr_end.str());
-}
-
-BaseFloat Convolutional1dComponent::DotProduct(const UpdatableComponent &other_in) const {
-  const Convolutional1dComponent *other =
-      dynamic_cast<const Convolutional1dComponent*>(&other_in);
-  return TraceMatMat(filter_params_, other->filter_params_, kTrans)
-         + VecVec(bias_params_, other->bias_params_);
-}
-
-Component* Convolutional1dComponent::Copy() const {
-  Convolutional1dComponent *ans = new Convolutional1dComponent();
-  ans->learning_rate_ = learning_rate_;
-  ans->patch_dim_ = patch_dim_;
-  ans->patch_step_ = patch_step_;
-  ans->patch_stride_ = patch_stride_;
-  ans->filter_params_ = filter_params_;
-  ans->bias_params_ = bias_params_;
-  ans->appended_conv_ = appended_conv_;
-  ans->is_gradient_ = is_gradient_;
-  return ans;
-}
-
-void Convolutional1dComponent::PerturbParams(BaseFloat stddev) {
-  CuMatrix<BaseFloat> temp_filter_params(filter_params_);
-  temp_filter_params.SetRandn();
-  filter_params_.AddMat(stddev, temp_filter_params);
-
-  CuVector<BaseFloat> temp_bias_params(bias_params_);
-  temp_bias_params.SetRandn();
-  bias_params_.AddVec(stddev, temp_bias_params);
-}
-
-void Convolutional1dComponent::SetParams(const VectorBase<BaseFloat> &bias,
-                                         const MatrixBase<BaseFloat> &filter) {
-  bias_params_ = bias;
-  filter_params_ = filter;
-  KALDI_ASSERT(bias_params_.Dim() == filter_params_.NumRows());
-}
-
-int32 Convolutional1dComponent::GetParameterDim() const {
-  return (filter_params_.NumCols() + 1) * filter_params_.NumRows();
-}
-
-// update parameters
-void Convolutional1dComponent::Update(const CuMatrixBase<BaseFloat> &in_value,
-                                      const CuMatrixBase<BaseFloat> &out_deriv) {
-  // useful dims
-  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-  int32 num_filters = filter_params_.NumRows();
-  int32 filter_dim = filter_params_.NumCols();
-  int32 num_frames = in_value.NumRows();
-  int32 num_splice = InputDim() / patch_stride_;
-  CuMatrix<BaseFloat> filters_grad;
-  CuVector<BaseFloat> bias_grad;
-
-  /** Buffer of reshaped inputs:
-   *  1row = vectorized rectangular feature patches
-   *  1col = dim over speech frames,
-   */
-  CuMatrix<BaseFloat> patches(num_frames, filter_dim * num_patches, kUndefined);
-  std::vector<int32> column_map(filter_dim * num_patches);
-  for (int32 patch = 0, index = 0; patch < num_patches; patch++) {
-    int32 fstride = patch * patch_step_;
-    for (int32 splice = 0; splice < num_splice; splice++) {
-      int32 cstride = splice * patch_stride_;
-      for (int32 d = 0; d < patch_dim_; d++, index++) {
-        if (appended_conv_)
-          column_map[index] = (fstride + d) * num_splice + splice;
-        else
-          column_map[index] = fstride + cstride + d;
-      }
-    }
-  }
-  CuArray<int32> cu_cols(column_map);
-  patches.CopyCols(in_value, cu_cols);
-
-  //
-  // calculate the gradient
-  //
-  filters_grad.Resize(num_filters, filter_dim, kSetZero); // reset
-  bias_grad.Resize(num_filters, kSetZero); // reset
-
-  //
-  // use all the patches
-  //
-
-  // create a single large matrix holding the smaller matrices
-  // from the vector container filters_grad_batch along the rows
-  CuMatrix<BaseFloat> filters_grad_blocks_batch(
-      num_patches * filters_grad.NumRows(), filters_grad.NumCols());
-
-  std::vector<CuSubMatrix<BaseFloat>* > filters_grad_batch, diff_patch_batch,
-      patch_batch;
-  for (int32 p = 0; p < num_patches; p++) {
-    // form batch in vector container
-    filters_grad_batch.push_back(new CuSubMatrix<BaseFloat>(
-        filters_grad_blocks_batch.RowRange(
-            p * filters_grad.NumRows(),
-            filters_grad.NumRows())));
-    diff_patch_batch.push_back(new CuSubMatrix<BaseFloat>(out_deriv.ColRange(
-        p * num_filters, num_filters)));
-    patch_batch.push_back(new CuSubMatrix<BaseFloat>(patches.ColRange(
-        p * filter_dim, filter_dim)));
-  }
-
-  AddMatMatBatched<BaseFloat>(1.0, filters_grad_batch, diff_patch_batch,
-                              kTrans, patch_batch, kNoTrans, 1.0);
-
-  // add the row blocks together to filters_grad
-  filters_grad.AddMatBlocks(1.0, filters_grad_blocks_batch);
-
-  // create a matrix holding the col blocks sum of out_deriv
-  CuMatrix<BaseFloat> out_deriv_col_blocks_sum(out_deriv.NumRows(), num_filters);
-
-  // add the col blocks together to out_deriv_col_blocks_sum
-  out_deriv_col_blocks_sum.AddMatBlocks(1.0, out_deriv);
-
-  bias_grad.AddRowSumMat(1.0, out_deriv_col_blocks_sum, 1.0);
-
-  // release memory
-  for (int32 p = 0; p < num_patches; p++) {
-    delete filters_grad_batch[p];
-    delete diff_patch_batch[p];
-    delete patch_batch[p];
-  }
-
-  //
-  // update
-  //
-  filter_params_.AddMat(learning_rate_, filters_grad);
-  bias_params_.AddVec(learning_rate_, bias_grad);
-}
-
-void MaxpoolingComponent::Init(int32 input_dim, int32 output_dim,
-                               int32 pool_size, int32 pool_stride)  {
-  input_dim_ = input_dim;
-  output_dim_ = output_dim;
-  pool_size_ = pool_size;
-  pool_stride_ = pool_stride;
-
-  // sanity check
-  // number of patches
-  KALDI_ASSERT(input_dim_ % pool_stride_ == 0);
-  int32 num_patches = input_dim_ / pool_stride_;
-  // number of pools
-  KALDI_ASSERT(num_patches % pool_size_ == 0);
-  int32 num_pools = num_patches / pool_size_;
-  // check output dim
-  KALDI_ASSERT(output_dim_ == num_pools * pool_stride_);
-}
-
-void MaxpoolingComponent::InitFromString(std::string args) {
-  std::string orig_args(args);
-  int32 input_dim = 0;
-  int32 output_dim = 0;
-  int32 pool_size = -1, pool_stride = -1;
-  bool ok = true;
-
-  ok = ok && ParseFromString("input-dim", &args, &input_dim);
-  ok = ok && ParseFromString("output-dim", &args, &output_dim);
-  ok = ok && ParseFromString("pool-size", &args, &pool_size);
-  ok = ok && ParseFromString("pool-stride", &args, &pool_stride);
-
-  KALDI_LOG << output_dim << " " << input_dim << " " << ok;
-  KALDI_LOG << "Pool: " << pool_size << " "
-            << pool_stride << " " << ok;
-  if (!ok || !args.empty() || output_dim <= 0)
-    KALDI_ERR << "Invalid initializer for layer of type "
-              << Type() << ": \"" << orig_args << "\"";
-  Init(input_dim, output_dim, pool_size, pool_stride);
-}
-
-/*
-   Input and output of maxpooling component is arranged as
-   x (time), y (frequency), z (channel)
-   for efficient pooling.
- */
-void MaxpoolingComponent::Propagate(const ChunkInfo &in_info,
-                                    const ChunkInfo &out_info,
-                                    const CuMatrixBase<BaseFloat> &in,
-                                    CuMatrixBase<BaseFloat> *out) const  {
-  in_info.CheckSize(in);
-  out_info.CheckSize(*out);
-  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
-  int32 num_patches = input_dim_ / pool_stride_;
-  int32 num_pools = num_patches / pool_size_;
-
-  // do the max-pooling
-  for (int32 q = 0; q < num_pools; q++) {
-    // get output buffer of the pool
-    CuSubMatrix<BaseFloat> pool(out->ColRange(q * pool_stride_, pool_stride_));
-    pool.Set(-1e20); // reset a large negative value
-    for (int32 r = 0; r < pool_size_; r++) {
-      // col-by-col block comparison pool
-      int32 p = r + q * pool_size_;
-      pool.Max(in.ColRange(p * pool_stride_, pool_stride_));
-    }
-  }
-}
-
-void MaxpoolingComponent::Backprop(const ChunkInfo &, // in_info,
-                                   const ChunkInfo &, // out_info,
-                                   const CuMatrixBase<BaseFloat> &in_value,
-                                   const CuMatrixBase<BaseFloat> &out_value,
-                                   const CuMatrixBase<BaseFloat> &out_deriv,
-                                   Component *to_update,
-                                   CuMatrix<BaseFloat> *in_deriv) const {
-  int32 num_patches = input_dim_ / pool_stride_;
-  int32 num_pools = num_patches / pool_size_;
-  std::vector<int32> patch_summands(num_patches, 0);
-  in_deriv->Resize(in_value.NumRows(), in_value.NumCols(), kSetZero);
-
-  for(int32 q = 0; q < num_pools; q++) {
-    for(int32 r = 0; r < pool_size_; r++) {
-      int32 p = r + q * pool_size_;
-      CuSubMatrix<BaseFloat> in_p(in_value.ColRange(p * pool_stride_, pool_stride_));
-      CuSubMatrix<BaseFloat> out_q(out_value.ColRange(q * pool_stride_, pool_stride_));
-      CuSubMatrix<BaseFloat> tgt(in_deriv->ColRange(p * pool_stride_, pool_stride_));
-      CuMatrix<BaseFloat> src(out_deriv.ColRange(q * pool_stride_, pool_stride_));
-      // zero-out mask
-      CuMatrix<BaseFloat> mask;
-      in_p.EqualElementMask(out_q, &mask);
-      src.MulElements(mask);
-      tgt.AddMat(1.0, src);
-      // summed deriv info
-      patch_summands[p] += 1;
-    }
-  }
-
-  // scale in_deriv of overlaped pools
-  for(int32 p = 0; p < num_patches; p++) {
-    CuSubMatrix<BaseFloat> tgt(in_deriv->ColRange(p * pool_stride_, pool_stride_));
-    KALDI_ASSERT(patch_summands[p] > 0);
-    tgt.Scale(1.0 / patch_summands[p]);
-  }
-}
-
-void MaxpoolingComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<MaxpoolingComponent>", "<InputDim>");
-  ReadBasicType(is, binary, &input_dim_);
-  ExpectToken(is, binary, "<OutputDim>");
-  ReadBasicType(is, binary, &output_dim_);
-  ExpectToken(is, binary, "<PoolSize>");
-  ReadBasicType(is, binary, &pool_size_);
-  ExpectToken(is, binary, "<PoolStride>");
-  ReadBasicType(is, binary, &pool_stride_);
-  ExpectToken(is, binary, "</MaxpoolingComponent>");
-}
-
-void MaxpoolingComponent::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<MaxpoolingComponent>");
-  WriteToken(os, binary, "<InputDim>");
-  WriteBasicType(os, binary, input_dim_);
-  WriteToken(os, binary, "<OutputDim>");
-  WriteBasicType(os, binary, output_dim_);
-  WriteToken(os, binary, "<PoolSize>");
-  WriteBasicType(os, binary, pool_size_);
-  WriteToken(os, binary, "<PoolStride>");
-  WriteBasicType(os, binary, pool_stride_);
-  WriteToken(os, binary, "</MaxpoolingComponent>");
-}
-
-std::string MaxpoolingComponent::Info() const {
-  std::stringstream stream;
-  stream << Type() << ", input-dim = " << input_dim_
-         << ", output-dim = " << output_dim_
-         << ", pool-size = " << pool_size_
-         << ", pool-stride = " << pool_stride_;
-  return stream.str();
-}
-
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/nnet-component.h b/src/nnet2/nnet-component.h
deleted file mode 100644
index 673467eeee4..00000000000
--- a/src/nnet2/nnet-component.h
+++ /dev/null
@@ -1,1816 +0,0 @@
-// nnet2/nnet-component.h
-
-// Copyright 2011-2013  Karel Vesely
-//           2012-2014  Johns Hopkins University (author: Daniel Povey)
-//                2013  Xiaohui Zhang
-//                2014  Vijayaditya Peddinti
-//           2014-2015  Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_NNET_COMPONENT_H_
-#define KALDI_NNET2_NNET_COMPONENT_H_
-
-#include <mutex>
-#include "base/kaldi-common.h"
-#include "itf/options-itf.h"
-#include "matrix/matrix-lib.h"
-#include "cudamatrix/cu-matrix-lib.h"
-#include "nnet2/nnet-precondition-online.h"
-
-#include <iostream>
-
-namespace kaldi {
-namespace nnet2 {
-
-
-/**
-   ChunkInfo is a class whose purpose is to describe the structure of matrices
-   holding features.  This is useful mostly in training time.
-   The main reason why we have this is to support efficient
-   training for networks which we have splicing components that splice in a
-   non-contiguous way, e.g. frames -5, 0 and 5.  We also have in mind future
-   extensibility to convnets which might have similar issues.  This class
-   describes the structure of a minibatch of features, or of a single
-   contiguous block of features.
-   Examples are as follows, and offsets is empty if not mentioned:
-     When decoding, at input to the network:
-       feat_dim = 13, num_chunks = 1, first_offset = 0, last_offset = 691
-      and in the middle of the network (assuming splicing is +-7):
-       feat_dim = 1024, num_chunks = 1, first_offset = 7, last_offset = 684
-    When training, at input to the network:
-      feat_dim = 13, num_chunks = 512, first_offset = 0, last_offset= 14
-     and in the middle of the network:
-      feat_dim = 1024, num_chunks = 512, first_offset = 7, last_offset = 7
-   The only situation where offsets would be nonempty would be if we do
-   splicing with gaps in.  E.g. suppose at network input we splice +-2 frames
-   (contiguous) and somewhere in the middle we splice frames {-5, 0, 5}, then
-   we would have the following while training
-     At input to the network:
-      feat_dim = 13, num_chunks = 512, first_offset = 0, last_offset = 14
-     After the first hidden layer:
-      feat_dim = 1024, num_chunks = 512, first_offset = 2, last_offset = 12,
-       offsets = {2, 10, 12}
-     At the output of the last hidden layer (after the {-5, 0, 5} splice):
-      feat_dim = 1024, num_chunks = 512, first_offset = 7, last_offset = 7
-   (the decoding setup would still look pretty normal, so we don't give an example).
-
-*/
-class ChunkInfo {
- public:
-  ChunkInfo()  // default constructor we assume this object will not be used
-      : feat_dim_(0), num_chunks_(0),
-        first_offset_(0), last_offset_(0),
-        offsets_() { }
-
-  ChunkInfo(int32 feat_dim, int32 num_chunks,
-            int32 first_offset, int32 last_offset )
-      : feat_dim_(feat_dim), num_chunks_(num_chunks),
-        first_offset_(first_offset), last_offset_(last_offset),
-        offsets_() { Check(); }
-
-  ChunkInfo(int32 feat_dim, int32 num_chunks,
-            const std::vector<int32> offsets)
-      : feat_dim_(feat_dim), num_chunks_(num_chunks),
-        first_offset_(offsets.front()), last_offset_(offsets.back()),
-        offsets_(offsets) { if (last_offset_ - first_offset_ + 1 == offsets_.size())
-                              offsets_.clear();
-          Check(); }
-
-  // index : actual row index in the current chunk
-  // offset : the time offset of feature frame at current row in the chunk
-  // As described above offsets can take a variety of values, we see the indices
-  // corresponding to the offsets in each case
-  // 1) if first_offset = 0 & last_offset = 691, then chunk has data
-  // corresponding to time offsets 0:691, so index = offset
-  // 2) if first_offset = 7 & last_offset = 684,
-  //      then index = offset - first offset
-  // 3) if offsets = {2, 10, 12} then indices for these offsets are 0, 1 and 2
-
-  // Returns the chunk row index corresponding to given time offset
-  int32 GetIndex (int32 offset) const;
-
-  // Returns time offset at the current row index in the chunk
-  int32 GetOffset (int32 index) const;
-
-  // Makes the offsets vector empty, to ensure that the chunk is processed as a
-  // contiguous chunk with the given first_offset and last_offset
-  void MakeOffsetsContiguous () { offsets_.clear(); Check(); }
-
-  // Returns chunk size, meaning the number of distinct frame-offsets we
-  // have for each chunk (they don't have to be contiguous).
-  inline int32 ChunkSize() const { return NumRows() / num_chunks_; }
-
-  // Returns number of chunks we expect the feature matrix to have
-  inline int32 NumChunks() const { return num_chunks_; }
-
-  /// Returns the number of rows that we expect the feature matrix to have.
-  int32 NumRows() const {
-    return num_chunks_ * (!offsets_.empty() ? offsets_.size() :
-                                         last_offset_ - first_offset_ + 1); }
-
-  /// Returns the number of columns that we expect the feature matrix to have.
-  int32 NumCols() const { return feat_dim_; }
-
-  /// Checks that the matrix has the size we expect, and die if not.
-  void CheckSize(const CuMatrixBase<BaseFloat> &mat) const;
-
-  /// Checks that the data in the ChunkInfo is valid, and die if not.
-  void Check() const;
-
- private:
-  int32 feat_dim_;  // Feature dimension.
-  int32 num_chunks_;  // Number of separate equal-sized chunks of features
-  int32 first_offset_;  // Start time offset within each chunk, numbered so that at
-                      // the input to the network, the first_offset of the first
-                      // feature would always be zero.
-  int32 last_offset_;  // End time offset within each chunk.
-  std::vector<int32> offsets_; // offsets is only nonempty if the chunk contains
-                             // a non-contiguous sequence.  If nonempty, it must
-                             // be sorted, and offsets.front() == first_offset,
-                             // offsets.back() == last_offset.
-
-};
-
-/**
- * Abstract class, basic element of the network,
- * it is a box with defined inputs, outputs,
- * and tranformation functions interface.
- *
- * It is able to propagate and backpropagate
- * exact implementation is to be implemented in descendants.
- *
- */
-class Component {
- public:
-  Component(): index_(-1) { }
-
-  virtual std::string Type() const = 0; // each type should return a string such as
-  // "SigmoidComponent".
-
-  /// Returns the index in the sequence of layers in the neural net; intended only
-  /// to be used in debugging information.
-  virtual int32 Index() const { return index_; }
-
-  virtual void SetIndex(int32 index) { index_ = index; }
-
-  /// Initialize, typically from a line of a config file.  The "args" will
-  /// contain any parameters that need to be passed to the Component, e.g.
-  /// dimensions.
-  virtual void InitFromString(std::string args) = 0;
-
-  /// Get size of input vectors
-  virtual int32 InputDim() const = 0;
-
-  /// Get size of output vectors
-  virtual int32 OutputDim() const = 0;
-
-  /// Return a vector describing the temporal context this component requires
-  /// for each frame of output, as a sorted list.  The default implementation
-  /// returns a vector ( 0 ), but a splicing layer might return e.g. (-2, -1, 0,
-  /// 1, 2), but it doesn't have to be contiguous.  Note : The context needed by
-  /// the entire network is a function of the contexts needed by all the
-  /// components.  It is required that Context().front() <= 0 and
-  /// Context().back() >= 0.
-  virtual std::vector<int32> Context() const { return std::vector<int32>(1, 0); }
-
-  /// Perform forward pass propagation Input->Output.  Each row is
-  /// one frame or training example.  Interpreted as "num_chunks"
-  /// equally sized chunks of frames; this only matters for layers
-  /// that do things like context splicing.  Typically this variable
-  /// will either be 1 (when we're processing a single contiguous
-  /// chunk of data) or will be the same as in.NumFrames(), but
-  /// other values are possible if some layers do splicing.
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const = 0;
-
-  /// A non-virtual propagate function that first resizes output if necessary.
-  void Propagate(const ChunkInfo &in_info,
-                 const ChunkInfo &out_info,
-                 const CuMatrixBase<BaseFloat> &in,
-                 CuMatrix<BaseFloat> *out) const {
-    if (out->NumRows() != out_info.NumRows() ||
-        out->NumCols() != out_info.NumCols()) {
-      out->Resize(out_info.NumRows(), out_info.NumCols());
-    }
-
-    // Cast to CuMatrixBase to use the virtual version of propagate function.
-    Propagate(in_info, out_info, in,
-              static_cast<CuMatrixBase<BaseFloat>*>(out));
-  }
-
-  /// Perform backward pass propagation of the derivative, and
-  /// also either update the model (if to_update == this) or
-  /// update another model or compute the model derivative (otherwise).
-  /// Note: in_value and out_value are the values of the input and output
-  /// of the component, and these may be dummy variables if respectively
-  /// BackpropNeedsInput() or BackpropNeedsOutput() return false for
-  /// that component (not all components need these).
-  ///
-  /// num_chunks lets us treat the input matrix as contiguous-in-time
-  /// chunks of equal size; it only matters if splicing is involved.
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const = 0;
-
-  virtual bool BackpropNeedsInput() const { return true; } // if this returns false,
-  // the "in_value" to Backprop may be a dummy variable.
-  virtual bool BackpropNeedsOutput() const { return true; } // if this returns false,
-  // the "out_value" to Backprop may be a dummy variable.
-
-  /// Read component from stream
-  static Component* ReadNew(std::istream &is, bool binary);
-
-  /// Copy component (deep copy).
-  virtual Component* Copy() const = 0;
-
-  /// Initialize the Component from one line that will contain
-  /// first the type, e.g. SigmoidComponent, and then
-  /// a number of tokens (typically integers or floats) that will
-  /// be used to initialize the component.
-  static Component *NewFromString(const std::string &initializer_line);
-
-  /// Return a new Component of the given type e.g. "SoftmaxComponent",
-  /// or NULL if no such type exists.
-  static Component *NewComponentOfType(const std::string &type);
-
-  virtual void Read(std::istream &is, bool binary) = 0; // This Read function
-  // requires that the Component has the correct type.
-
-  /// Write component to stream
-  virtual void Write(std::ostream &os, bool binary) const = 0;
-
-  virtual std::string Info() const;
-
-  virtual ~Component() { }
-
- private:
-  int32 index_;
-  KALDI_DISALLOW_COPY_AND_ASSIGN(Component);
-};
-
-
-/**
- * Class UpdatableComponent is a Component which has
- * trainable parameters and contains some global
- * parameters for stochastic gradient descent
- * (learning rate, L2 regularization constant).
- * This is a base-class for Components with parameters.
- */
-class UpdatableComponent: public Component {
- public:
-  UpdatableComponent(const UpdatableComponent &other):
-      learning_rate_(other.learning_rate_){ }
-
-  void Init(BaseFloat learning_rate) {
-    learning_rate_ = learning_rate;
-  }
-  UpdatableComponent(BaseFloat learning_rate) {
-    Init(learning_rate);
-  }
-
-  /// Set parameters to zero, and if treat_as_gradient is true, we'll be
-  /// treating this as a gradient so set the learning rate to 1 and make any
-  /// other changes necessary (there's a variable we have to set for the
-  /// MixtureProbComponent).
-  virtual void SetZero(bool treat_as_gradient) = 0;
-
-  UpdatableComponent(): learning_rate_(0.001) { }
-
-  virtual ~UpdatableComponent() { }
-
-  /// Here, "other" is a component of the same specific type.  This
-  /// function computes the dot product in parameters, and is computed while
-  /// automatically adjusting learning rates; typically, one of the two will
-  /// actually contain the gradient.
-  virtual BaseFloat DotProduct(const UpdatableComponent &other) const = 0;
-
-  /// We introduce a new virtual function that only applies to
-  /// class UpdatableComponent.  This is used in testing.
-  virtual void PerturbParams(BaseFloat stddev) = 0;
-
-  /// This new virtual function scales the parameters
-  /// by this amount.
-  virtual void Scale(BaseFloat scale) = 0;
-
-  /// This new virtual function adds the parameters of another
-  /// updatable component, times some constant, to the current
-  /// parameters.
-  virtual void Add(BaseFloat alpha, const UpdatableComponent &other) = 0;
-
-  /// Sets the learning rate of gradient descent
-  void SetLearningRate(BaseFloat lrate) {  learning_rate_ = lrate; }
-  /// Gets the learning rate of gradient descent
-  BaseFloat LearningRate() const { return learning_rate_; }
-
-  virtual std::string Info() const;
-
-  // The next few functions are not implemented everywhere; they are
-  // intended for use by L-BFGS code, and we won't implement them
-  // for all child classes.
-
-  /// The following new virtual function returns the total dimension of
-  /// the parameters in this class.  E.g. used for L-BFGS update
-  virtual int32 GetParameterDim() const { KALDI_ASSERT(0); return 0; }
-
-  /// Turns the parameters into vector form.  We put the vector form on the CPU,
-  /// because in the kinds of situations where we do this, we'll tend to use
-  /// too much memory for the GPU.
-  virtual void Vectorize(VectorBase<BaseFloat> *params) const { KALDI_ASSERT(0); }
-  /// Converts the parameters from vector form.
-  virtual void UnVectorize(const VectorBase<BaseFloat> &params) {
-    KALDI_ASSERT(0);
-  }
-
- protected:
-  BaseFloat learning_rate_; ///< learning rate (0.0..0.01)
- private:
-  const UpdatableComponent &operator = (const UpdatableComponent &other); // Disallow.
-};
-
-/// This kind of Component is a base-class for things like
-/// sigmoid and softmax.
-class NonlinearComponent: public Component {
- public:
-  void Init(int32 dim) { dim_ = dim; count_ = 0.0; }
-  explicit NonlinearComponent(int32 dim) { Init(dim); }
-  NonlinearComponent(): dim_(0) { } // e.g. prior to Read().
-  explicit NonlinearComponent(const NonlinearComponent &other);
-
-  virtual int32 InputDim() const { return dim_; }
-  virtual int32 OutputDim() const { return dim_; }
-
-  /// We implement InitFromString at this level.
-  virtual void InitFromString(std::string args);
-
-  /// We implement Read at this level as it just needs the Type().
-  virtual void Read(std::istream &is, bool binary);
-
-  /// Write component to stream.
-  virtual void Write(std::ostream &os, bool binary) const;
-
-  void Scale(BaseFloat scale); // relates to scaling stats, not parameters.
-  void Add(BaseFloat alpha, const NonlinearComponent &other); // relates to
-                                                              // adding stats
-
-  // The following functions are unique to NonlinearComponent.
-  // They mostly relate to diagnostics.
-  const CuVector<double> &ValueSum() const { return value_sum_; }
-  const CuVector<double> &DerivSum() const { return deriv_sum_; }
-  double Count() const { return count_; }
-
-  // The following function is used when "widening" neural networks.
-  void SetDim(int32 dim);
-
- protected:
-  friend class NormalizationComponent;
-  friend class SigmoidComponent;
-  friend class TanhComponent;
-  friend class SoftmaxComponent;
-  friend class LogSoftmaxComponent;
-  friend class RectifiedLinearComponent;
-  friend class SoftHingeComponent;
-
-
-  // This function updates the stats "value_sum_", "deriv_sum_", and
-  // count_. (If deriv == NULL, it won't update "deriv_sum_").
-  // It will be called from the Backprop function of child classes.
-  void UpdateStats(const CuMatrixBase<BaseFloat> &out_value,
-                   const CuMatrixBase<BaseFloat> *deriv = NULL);
-
-
-  const NonlinearComponent &operator = (const NonlinearComponent &other); // Disallow.
-  int32 dim_;
-  CuVector<double> value_sum_; // stats at the output.
-  CuVector<double> deriv_sum_; // stats of the derivative of the nonlinearity (only
-  // applicable to element-by-element nonlinearities, not Softmax.
-  double count_;
-  // The mutex is used in UpdateStats, only for resizing vectors.
-  std::mutex mutex_;
-};
-
-class MaxoutComponent: public Component {
- public:
-  void Init(int32 input_dim, int32 output_dim);
-  explicit MaxoutComponent(int32 input_dim, int32 output_dim) {
-    Init(input_dim, output_dim);
-  }
-  MaxoutComponent(): input_dim_(0), output_dim_(0) { }
-  virtual std::string Type() const { return "MaxoutComponent"; }
-  virtual void InitFromString(std::string args);
-  virtual int32 InputDim() const { return input_dim_; }
-  virtual int32 OutputDim() const { return output_dim_; }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &,  //out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
-  virtual bool BackpropNeedsInput() const { return true; }
-  virtual bool BackpropNeedsOutput() const { return true; }
-  virtual Component* Copy() const { return new MaxoutComponent(input_dim_,
-                                                              output_dim_); }
-
-  virtual void Read(std::istream &is, bool binary); // This Read function
-  // requires that the Component has the correct type.
-
-  /// Write component to stream
-  virtual void Write(std::ostream &os, bool binary) const;
-
-  virtual std::string Info() const;
- protected:
-  int32 input_dim_;
-  int32 output_dim_;
-};
-
-/**
- * MaxPoolingComponent :
- * Maxpooling component was firstly used in ConvNet for selecting an representative
- * activation in an area. It inspired Maxout nonlinearity.
- *
- * The input/output matrices are split to submatrices with width 'pool_stride_'.
- * For instance, a minibatch of 512 frames is propagated by a convolutional
- * layer, resulting in a 512 x 3840 input matrix for MaxpoolingComponent,
- * which is composed of 128 feature maps for each frame (128 x 30). If you want
- * a 3-to-1 maxpooling on each feature map, set 'pool_stride_' and 'pool_size_'
- * as 128 and 3 respectively. Maxpooling component would create an output
- * matrix of 512 x 1280. The 30 input neurons are grouped by a group size of 3, and
- * the maximum in a group is selected, creating a smaller feature map of 10.
- *
- * Our pooling does not supports overlaps, which simplifies the
- * implementation (and was not helpful for Ossama).
- */
-class MaxpoolingComponent: public Component {
- public:
-  void Init(int32 input_dim, int32 output_dim,
-            int32 pool_size, int32 pool_stride);
-  explicit MaxpoolingComponent(int32 input_dim, int32 output_dim,
-                               int32 pool_size, int32 pool_stride) {
-    Init(input_dim, output_dim, pool_size, pool_stride);
-  }
-  MaxpoolingComponent(): input_dim_(0), output_dim_(0),
-    pool_size_(0), pool_stride_(0) { }
-  virtual std::string Type() const { return "MaxpoolingComponent"; }
-  virtual void InitFromString(std::string args);
-  virtual int32 InputDim() const { return input_dim_; }
-  virtual int32 OutputDim() const { return output_dim_; }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &,  //out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
-  virtual bool BackpropNeedsInput() const { return true; }
-  virtual bool BackpropNeedsOutput() const { return true; }
-  virtual Component* Copy() const {
-    return new MaxpoolingComponent(input_dim_, output_dim_,
-                               pool_size_, pool_stride_); }
-
-  virtual void Read(std::istream &is, bool binary); // This Read function
-  // requires that the Component has the correct type.
-
-  /// Write component to stream
-  virtual void Write(std::ostream &os, bool binary) const;
-
-  virtual std::string Info() const;
- protected:
-  int32 input_dim_;
-  int32 output_dim_;
-  int32 pool_size_;
-  int32 pool_stride_;
-};
-
-class PnormComponent: public Component {
- public:
-  void Init(int32 input_dim, int32 output_dim, BaseFloat p);
-  explicit PnormComponent(int32 input_dim, int32 output_dim, BaseFloat p) {
-    Init(input_dim, output_dim, p);
-  }
-  PnormComponent(): input_dim_(0), output_dim_(0), p_(0) { }
-  virtual std::string Type() const { return "PnormComponent"; }
-  virtual void InitFromString(std::string args);
-  virtual int32 InputDim() const { return input_dim_; }
-  virtual int32 OutputDim() const { return output_dim_; }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &,  //out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
-  virtual bool BackpropNeedsInput() const { return true; }
-  virtual bool BackpropNeedsOutput() const { return true; }
-  virtual Component* Copy() const { return new PnormComponent(input_dim_,
-                                                              output_dim_, p_); }
-
-  virtual void Read(std::istream &is, bool binary); // This Read function
-  // requires that the Component has the correct type.
-
-  /// Write component to stream
-  virtual void Write(std::ostream &os, bool binary) const;
-
-  virtual std::string Info() const;
- protected:
-  int32 input_dim_;
-  int32 output_dim_;
-  BaseFloat p_;
-};
-
-class NormalizeComponent: public NonlinearComponent {
- public:
-  explicit NormalizeComponent(int32 dim): NonlinearComponent(dim) { }
-  explicit NormalizeComponent(const NormalizeComponent &other): NonlinearComponent(other) { }
-  NormalizeComponent() { }
-  virtual std::string Type() const { return "NormalizeComponent"; }
-  virtual Component* Copy() const { return new NormalizeComponent(*this); }
-  virtual bool BackpropNeedsInput() const { return true; }
-  virtual bool BackpropNeedsOutput() const { return false; }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
- private:
-  NormalizeComponent &operator = (const NormalizeComponent &other); // Disallow.
-  static const BaseFloat kNormFloor;
-  // about 0.7e-20.  We need a value that's exactly representable in
-  // float and whose inverse square root is also exactly representable
-  // in float (hence, an even power of two).
-};
-
-
-class SigmoidComponent: public NonlinearComponent {
- public:
-  explicit SigmoidComponent(int32 dim): NonlinearComponent(dim) { }
-  explicit SigmoidComponent(const SigmoidComponent &other): NonlinearComponent(other) { }
-  SigmoidComponent() { }
-  virtual std::string Type() const { return "SigmoidComponent"; }
-  virtual bool BackpropNeedsInput() const { return false; }
-  virtual bool BackpropNeedsOutput() const { return true; }
-  virtual Component* Copy() const { return new SigmoidComponent(*this); }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
- private:
-  SigmoidComponent &operator = (const SigmoidComponent &other); // Disallow.
-};
-
-class TanhComponent: public NonlinearComponent {
- public:
-  explicit TanhComponent(int32 dim): NonlinearComponent(dim) { }
-  explicit TanhComponent(const TanhComponent &other): NonlinearComponent(other) { }
-  TanhComponent() { }
-  virtual std::string Type() const { return "TanhComponent"; }
-  virtual Component* Copy() const { return new TanhComponent(*this); }
-  virtual bool BackpropNeedsInput() const { return false; }
-  virtual bool BackpropNeedsOutput() const { return true; }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
- private:
-  TanhComponent &operator = (const TanhComponent &other); // Disallow.
-};
-
-/// Take the absoute values of an input vector to a power.
-/// The derivative for zero input will be treated as zero.
-class PowerComponent: public NonlinearComponent {
- public:
-  void Init(int32 dim, BaseFloat power = 2);
-  explicit PowerComponent(int32 dim, BaseFloat power = 2) {
-    Init(dim, power);
-  }
-  PowerComponent(): dim_(0), power_(2) { }
-  virtual std::string Type() const { return "PowerComponent"; }
-  virtual void InitFromString(std::string args);
-  virtual int32 InputDim() const { return dim_; }
-  virtual int32 OutputDim() const { return dim_; }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
-  virtual bool BackpropNeedsInput() const { return true; }
-  virtual bool BackpropNeedsOutput() const { return true; }
-  virtual Component* Copy() const { return new PowerComponent(dim_, power_); }
-  virtual void Read(std::istream &is, bool binary); // This Read function
-  // requires that the Component has the correct type.
-
-  /// Write component to stream
-  virtual void Write(std::ostream &os, bool binary) const;
-
-  virtual std::string Info() const;
-
- private:
-  int32 dim_;
-  BaseFloat power_;
-};
-
-class RectifiedLinearComponent: public NonlinearComponent {
- public:
-  explicit RectifiedLinearComponent(int32 dim): NonlinearComponent(dim) { }
-  explicit RectifiedLinearComponent(const RectifiedLinearComponent &other): NonlinearComponent(other) { }
-  RectifiedLinearComponent() { }
-  virtual std::string Type() const { return "RectifiedLinearComponent"; }
-  virtual Component* Copy() const { return new RectifiedLinearComponent(*this); }
-  virtual bool BackpropNeedsInput() const { return false; }
-  virtual bool BackpropNeedsOutput() const { return true; }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
- private:
-  RectifiedLinearComponent &operator = (const RectifiedLinearComponent &other); // Disallow.
-};
-
-class SoftHingeComponent: public NonlinearComponent {
- public:
-  explicit SoftHingeComponent(int32 dim): NonlinearComponent(dim) { }
-  explicit SoftHingeComponent(const SoftHingeComponent &other): NonlinearComponent(other) { }
-  SoftHingeComponent() { }
-  virtual std::string Type() const { return "SoftHingeComponent"; }
-  virtual Component* Copy() const { return new SoftHingeComponent(*this); }
-  virtual bool BackpropNeedsInput() const { return true; }
-  virtual bool BackpropNeedsOutput() const { return true; }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
- private:
-  SoftHingeComponent &operator = (const SoftHingeComponent &other); // Disallow.
-};
-
-
-// This class scales the input by a specified constant.  This is, of course,
-// useless, but we use it when we want to change how fast the next layer learns.
-// (e.g. a smaller scale will make the next layer learn slower.)
-class ScaleComponent: public Component {
- public:
-  explicit ScaleComponent(int32 dim, BaseFloat scale): dim_(dim), scale_(scale) { }
-  explicit ScaleComponent(const ScaleComponent &other):
-      dim_(other.dim_), scale_(other.scale_) { }
-  ScaleComponent(): dim_(0), scale_(0.0) { }
-  virtual std::string Type() const { return "ScaleComponent"; }
-  virtual Component* Copy() const { return new ScaleComponent(*this); }
-  virtual bool BackpropNeedsInput() const { return false; }
-  virtual bool BackpropNeedsOutput() const { return false; }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
-
-  virtual int32 InputDim() const { return dim_; }
-  virtual int32 OutputDim() const { return dim_; }
-  virtual void Read(std::istream &is, bool binary);
-
-  virtual void Write(std::ostream &os, bool binary) const;
-
-  void Init(int32 dim, BaseFloat scale);
-
-  virtual void InitFromString(std::string args);
-
-  virtual std::string Info() const;
-
- private:
-  int32 dim_;
-  BaseFloat scale_;
-  ScaleComponent &operator = (const ScaleComponent &other); // Disallow.
-};
-
-
-
-class SumGroupComponent;  // Forward declaration.
-class AffineComponent;  // Forward declaration.
-class FixedScaleComponent;  // Forward declaration.
-
-class SoftmaxComponent: public NonlinearComponent {
- public:
-  explicit SoftmaxComponent(int32 dim): NonlinearComponent(dim) { }
-  explicit SoftmaxComponent(const SoftmaxComponent &other): NonlinearComponent(other) { }
-  SoftmaxComponent() { }
-  virtual std::string Type() const { return "SoftmaxComponent"; }
-  virtual bool BackpropNeedsInput() const { return false; }
-  virtual bool BackpropNeedsOutput() const { return true; }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
-
-  void MixUp(int32 num_mixtures,
-             BaseFloat power,
-             BaseFloat min_count,
-             BaseFloat perturb_stddev,
-             AffineComponent *ac,
-             SumGroupComponent *sc);
-
-  virtual Component* Copy() const { return new SoftmaxComponent(*this); }
- private:
-  SoftmaxComponent &operator = (const SoftmaxComponent &other); // Disallow.
-};
-
-class LogSoftmaxComponent: public NonlinearComponent {
- public:
-  explicit LogSoftmaxComponent(int32 dim): NonlinearComponent(dim) { }
-  explicit LogSoftmaxComponent(const LogSoftmaxComponent &other): NonlinearComponent(other) { }
-  LogSoftmaxComponent() { }
-  virtual std::string Type() const { return "LogSoftmaxComponent"; }
-  virtual bool BackpropNeedsInput() const { return false; }
-  virtual bool BackpropNeedsOutput() const { return true; }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
-
-  virtual Component* Copy() const { return new LogSoftmaxComponent(*this); }
- private:
-  LogSoftmaxComponent &operator = (const LogSoftmaxComponent &other); // Disallow.
-};
-
-
-class FixedAffineComponent;
-
-// Affine means a linear function plus an offset.
-// Note: although this class can be instantiated, it also
-// functions as a base-class for more specialized versions of
-// AffineComponent.
-class AffineComponent: public UpdatableComponent {
-  friend class SoftmaxComponent; // Friend declaration relates to mixing up.
- public:
-  AffineComponent(const AffineComponent &other);
-  // The next constructor is used in converting from nnet1.
-  AffineComponent(const CuMatrixBase<BaseFloat> &linear_params,
-                  const CuVectorBase<BaseFloat> &bias_params,
-                  BaseFloat learning_rate);
-
-  virtual int32 InputDim() const { return linear_params_.NumCols(); }
-  virtual int32 OutputDim() const { return linear_params_.NumRows(); }
-  void Init(BaseFloat learning_rate,
-            int32 input_dim, int32 output_dim,
-            BaseFloat param_stddev, BaseFloat bias_stddev);
-  void Init(BaseFloat learning_rate,
-            std::string matrix_filename);
-
-  // This function resizes the dimensions of the component, setting the
-  // parameters to zero, while leaving any other configuration values the same.
-  virtual void Resize(int32 input_dim, int32 output_dim);
-
-  // The following functions are used for collapsing multiple layers
-  // together.  They return a pointer to a new Component equivalent to
-  // the sequence of two components.  We haven't implemented this for
-  // FixedLinearComponent yet.
-  Component *CollapseWithNext(const AffineComponent &next) const ;
-  Component *CollapseWithNext(const FixedAffineComponent &next) const;
-  Component *CollapseWithNext(const FixedScaleComponent &next) const;
-  Component *CollapseWithPrevious(const FixedAffineComponent &prev) const;
-
-  virtual std::string Info() const;
-  virtual void InitFromString(std::string args);
-
-  AffineComponent(): is_gradient_(false) { } // use Init to really initialize.
-  virtual std::string Type() const { return "AffineComponent"; }
-  virtual bool BackpropNeedsInput() const { return true; }
-  virtual bool BackpropNeedsOutput() const { return false; }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Scale(BaseFloat scale);
-  virtual void Add(BaseFloat alpha, const UpdatableComponent &other);
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
-  virtual void SetZero(bool treat_as_gradient);
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
-  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
-  virtual Component* Copy() const;
-  virtual void PerturbParams(BaseFloat stddev);
-  // This new function is used when mixing up:
-  virtual void SetParams(const VectorBase<BaseFloat> &bias,
-                         const MatrixBase<BaseFloat> &linear);
-  const CuVector<BaseFloat> &BiasParams() { return bias_params_; }
-  const CuMatrix<BaseFloat> &LinearParams() { return linear_params_; }
-
-  virtual int32 GetParameterDim() const;
-  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
-  virtual void UnVectorize(const VectorBase<BaseFloat> &params);
-
-  /// This function is for getting a low-rank approximations of this
-  /// AffineComponent by two AffineComponents.
-  virtual void LimitRank(int32 dimension,
-                         AffineComponent **a, AffineComponent **b) const;
-
-  /// This function is implemented in widen-nnet.cc
-  void Widen(int32 new_dimension,
-             BaseFloat param_stddev,
-             BaseFloat bias_stddev,
-             std::vector<NonlinearComponent*> c2, // will usually have just one
-                                                  // element.
-             AffineComponent *c3);
- protected:
-  friend class AffineComponentPreconditionedOnline;
-  // This function Update() is for extensibility; child classes may override this.
-  virtual void Update(
-      const CuMatrixBase<BaseFloat> &in_value,
-      const CuMatrixBase<BaseFloat> &out_deriv) {
-    UpdateSimple(in_value, out_deriv);
-  }
-  // UpdateSimple is used when *this is a gradient.  Child classes may
-  // or may not override this.
-  virtual void UpdateSimple(
-      const CuMatrixBase<BaseFloat> &in_value,
-      const CuMatrixBase<BaseFloat> &out_deriv);
-
-  const AffineComponent &operator = (const AffineComponent &other); // Disallow.
-  CuMatrix<BaseFloat> linear_params_;
-  CuVector<BaseFloat> bias_params_;
-
-  bool is_gradient_; // If true, treat this as just a gradient.
-};
-
-
-// This is an idea Dan is trying out, a little bit like
-// preconditioning the update with the Fisher matrix, but the
-// Fisher matrix has a special structure.
-// [note: it is currently used in the standard recipe].
-class AffineComponentPreconditioned: public AffineComponent {
- public:
-  virtual std::string Type() const { return "AffineComponentPreconditioned"; }
-
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
-  void Init(BaseFloat learning_rate,
-            int32 input_dim, int32 output_dim,
-            BaseFloat param_stddev, BaseFloat bias_stddev,
-            BaseFloat alpha, BaseFloat max_change);
-  void Init(BaseFloat learning_rate, BaseFloat alpha,
-            BaseFloat max_change, std::string matrix_filename);
-
-  virtual void InitFromString(std::string args);
-  virtual std::string Info() const;
-  virtual Component* Copy() const;
-  AffineComponentPreconditioned(): alpha_(1.0), max_change_(0.0) { }
-  void SetMaxChange(BaseFloat max_change) { max_change_ = max_change; }
- protected:
-  KALDI_DISALLOW_COPY_AND_ASSIGN(AffineComponentPreconditioned);
-  BaseFloat alpha_;
-  BaseFloat max_change_; // If > 0, this is the maximum amount of parameter change (in L2 norm)
-                         // that we allow per minibatch.  This was introduced in order to
-                         // control instability.  Instead of the exact L2 parameter change,
-                         // for efficiency purposes we limit a bound on the exact change.
-                         // The limit is applied via a constant <= 1.0 for each minibatch,
-                         // A suitable value might be, for example, 10 or so; larger if there are
-                         // more parameters.
-
-  /// The following function is only called if max_change_ > 0.  It returns the
-  /// greatest value alpha <= 1.0 such that (alpha times the sum over the
-  /// row-index of the two matrices of the product the l2 norms of the two rows
-  /// times learning_rate_)
-  /// is <= max_change.
-  BaseFloat GetScalingFactor(const CuMatrix<BaseFloat> &in_value_precon,
-                             const CuMatrix<BaseFloat> &out_deriv_precon);
-
-  virtual void Update(
-      const CuMatrixBase<BaseFloat> &in_value,
-      const CuMatrixBase<BaseFloat> &out_deriv);
-};
-
-
-/// Keywords: natural gradient descent, NG-SGD, naturalgradient.  For
-/// the top-level of the natural gradient code look here, and also in
-/// nnet-precondition-online.h.
-/// AffineComponentPreconditionedOnline is, like AffineComponentPreconditioned,
-/// a version of AffineComponent that has a non-(multiple of unit) learning-rate
-/// matrix.  See nnet-precondition-online.h for a description of the technique.
-class AffineComponentPreconditionedOnline: public AffineComponent {
- public:
-  virtual std::string Type() const {
-    return "AffineComponentPreconditionedOnline";
-  }
-
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
-  void Init(BaseFloat learning_rate,
-            int32 input_dim, int32 output_dim,
-            BaseFloat param_stddev, BaseFloat bias_stddev,
-            int32 rank_in, int32 rank_out, int32 update_period,
-            BaseFloat num_samples_history, BaseFloat alpha,
-            BaseFloat max_change_per_sample);
-  void Init(BaseFloat learning_rate, int32 rank_in,
-            int32 rank_out, int32 update_period,
-            BaseFloat num_samples_history,
-            BaseFloat alpha, BaseFloat max_change_per_sample,
-            std::string matrix_filename);
-
-  virtual void Resize(int32 input_dim, int32 output_dim);
-
-  // This constructor is used when converting neural networks partway through
-  // training, from AffineComponent or AffineComponentPreconditioned to
-  // AffineComponentPreconditionedOnline.
-  AffineComponentPreconditionedOnline(const AffineComponent &orig,
-                                      int32 rank_in, int32 rank_out,
-                                      int32 update_period,
-                                      BaseFloat eta, BaseFloat alpha);
-
-  virtual void InitFromString(std::string args);
-  virtual std::string Info() const;
-  virtual Component* Copy() const;
-  AffineComponentPreconditionedOnline(): max_change_per_sample_(0.0) { }
-
- private:
-  KALDI_DISALLOW_COPY_AND_ASSIGN(AffineComponentPreconditionedOnline);
-
-
-  // Configs for preconditioner.  The input side tends to be better conditioned ->
-  // smaller rank needed, so make them separately configurable.
-  int32 rank_in_;
-  int32 rank_out_;
-  int32 update_period_;
-  BaseFloat num_samples_history_;
-  BaseFloat alpha_;
-
-  OnlinePreconditioner preconditioner_in_;
-
-  OnlinePreconditioner preconditioner_out_;
-
-  BaseFloat max_change_per_sample_;
-  // If > 0, max_change_per_sample_ this is the maximum amount of parameter
-  // change (in L2 norm) that we allow per sample, averaged over the minibatch.
-  // This was introduced in order to control instability.
-  // Instead of the exact L2 parameter change, for
-  // efficiency purposes we limit a bound on the exact
-  // change.  The limit is applied via a constant <= 1.0
-  // for each minibatch, A suitable value might be, for
-  // example, 10 or so; larger if there are more
-  // parameters.
-
-  /// The following function is only called if max_change_per_sample_ > 0, it returns a
-  /// scaling factor alpha <= 1.0 (1.0 in the normal case) that enforces the
-  /// "max-change" constraint.  "in_products" is the inner product with itself
-  /// of each row of the matrix of preconditioned input features; "out_products"
-  /// is the same for the output derivatives.  gamma_prod is a product of two
-  /// scalars that are output by the preconditioning code (for the input and
-  /// output), which we will need to multiply into the learning rate.
-  /// out_products is a pointer because we modify it in-place.
-  BaseFloat GetScalingFactor(const CuVectorBase<BaseFloat> &in_products,
-                             BaseFloat gamma_prod,
-                             CuVectorBase<BaseFloat> *out_products);
-
-  // Sets the configs rank, alpha and eta in the preconditioner objects,
-  // from the class variables.
-  void SetPreconditionerConfigs();
-
-  virtual void Update(
-      const CuMatrixBase<BaseFloat> &in_value,
-      const CuMatrixBase<BaseFloat> &out_deriv);
-};
-
-class RandomComponent: public Component {
- public:
-  // This function is required in testing code and in other places we need
-  // consistency in the random number generation (e.g. when optimizing
-  // validation-set performance), but check where else we call sRand().  You'll
-  // need to call srand as well as making this call.
-  void ResetGenerator() { random_generator_.SeedGpu(); }
- protected:
-  CuRand<BaseFloat> random_generator_;
-};
-
-/// Splices a context window of frames together [over time]
-class SpliceComponent: public Component {
- public:
-  SpliceComponent() { }  // called only prior to Read() or Init().
-  // Note: it is required that the elements of "context" be in
-  // strictly increasing order, that the lowest element of component
-  // be nonpositive, and the highest element be nonnegative.
-  void Init(int32 input_dim,
-            std::vector<int32> context,
-            int32 const_component_dim=0);
-  virtual std::string Type() const { return "SpliceComponent"; }
-  virtual std::string Info() const;
-  virtual void InitFromString(std::string args);
-  virtual int32 InputDim() const { return input_dim_; }
-  virtual int32 OutputDim() const;
-  virtual std::vector<int32> Context() const { return context_; }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
-  virtual bool BackpropNeedsInput() const { return false; }
-  virtual bool BackpropNeedsOutput() const { return false; }
-  virtual Component* Copy() const;
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
- private:
-  KALDI_DISALLOW_COPY_AND_ASSIGN(SpliceComponent);
-  int32 input_dim_;
-  std::vector<int32> context_;
-  int32 const_component_dim_;
-};
-
-/// This is as SpliceComponent but outputs the max of
-/// any of the inputs (taking the max across time).
-class SpliceMaxComponent: public Component {
- public:
-  SpliceMaxComponent() { }  // called only prior to Read() or Init().
-  void Init(int32 dim,
-            std::vector<int32> context);
-  virtual std::string Type() const { return "SpliceMaxComponent"; }
-  virtual std::string Info() const;
-  virtual void InitFromString(std::string args);
-  virtual int32 InputDim() const { return dim_; }
-  virtual int32 OutputDim() const { return dim_; }
-  virtual std::vector<int32> Context() const  { return context_; }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
-  virtual bool BackpropNeedsInput() const { return true; }
-  virtual bool BackpropNeedsOutput() const { return false; }
-  virtual Component* Copy() const;
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
- private:
-  KALDI_DISALLOW_COPY_AND_ASSIGN(SpliceMaxComponent);
-  int32 dim_;
-  std::vector<int32> context_;
-};
-
-
-// Affine means a linear function plus an offset.  "Block" means
-// here that we support a number of equal-sized blocks of parameters,
-// in the linear part, so e.g. 2 x 500 would mean 2 blocks of 500 each.
-class BlockAffineComponent: public UpdatableComponent {
- public:
-  virtual int32 InputDim() const { return linear_params_.NumCols() * num_blocks_; }
-  virtual int32 OutputDim() const { return linear_params_.NumRows(); }
-  virtual int32 GetParameterDim() const;
-  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
-  virtual void UnVectorize(const VectorBase<BaseFloat> &params);
-
-  // Note: num_blocks must divide input_dim.
-  void Init(BaseFloat learning_rate,
-                    int32 input_dim, int32 output_dim,
-                    BaseFloat param_stddev, BaseFloat bias_stddev,
-                    int32 num_blocks);
-  virtual void InitFromString(std::string args);
-
-  BlockAffineComponent() { } // use Init to really initialize.
-  virtual std::string Type() const { return "BlockAffineComponent"; }
-  virtual bool BackpropNeedsInput() const { return true; }
-  virtual bool BackpropNeedsOutput() const { return false; }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
-  virtual void SetZero(bool treat_as_gradient);
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
-  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
-  virtual Component* Copy() const;
-  virtual void PerturbParams(BaseFloat stddev);
-  virtual void Scale(BaseFloat scale);
-  virtual void Add(BaseFloat alpha, const UpdatableComponent &other);
- protected:
-  virtual void Update(
-      const CuMatrixBase<BaseFloat> &in_value,
-      const CuMatrixBase<BaseFloat> &out_deriv) {
-    UpdateSimple(in_value, out_deriv);
-  }
-  // UpdateSimple is used when *this is a gradient.  Child classes may
-  // override this.
-  virtual void UpdateSimple(
-      const CuMatrixBase<BaseFloat> &in_value,
-      const CuMatrixBase<BaseFloat> &out_deriv);
-
-  // The matrix linear_params_ has a block structure, with num_blocks_ blocks of
-  // equal size.  The blocks are stored in linear_params_ as
-  // [ M
-  //   N
-  //   O ] but we actually treat it as the matrix:
-  // [ M 0 0
-  //   0 N 0
-  //   0 0 O ]
-  CuMatrix<BaseFloat> linear_params_;
-  CuVector<BaseFloat> bias_params_;
-  int32 num_blocks_;
- private:
-  KALDI_DISALLOW_COPY_AND_ASSIGN(BlockAffineComponent);
-
-};
-
-
-// Affine means a linear function plus an offset.  "Block" means
-// here that we support a number of equal-sized blocks of parameters,
-// in the linear part, so e.g. 2 x 500 would mean 2 blocks of 500 each.
-class BlockAffineComponentPreconditioned: public BlockAffineComponent {
- public:
-  // Note: num_blocks must divide input_dim.
-  void Init(BaseFloat learning_rate,
-            int32 input_dim, int32 output_dim,
-            BaseFloat param_stddev, BaseFloat bias_stddev,
-            int32 num_blocks, BaseFloat alpha);
-
-  virtual void InitFromString(std::string args);
-
-  BlockAffineComponentPreconditioned() { } // use Init to really initialize.
-  virtual std::string Type() const { return "BlockAffineComponentPreconditioned"; }
-  virtual void SetZero(bool treat_as_gradient);
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
-  virtual Component* Copy() const;
- private:
-  KALDI_DISALLOW_COPY_AND_ASSIGN(BlockAffineComponentPreconditioned);
-  virtual void Update(
-      const CuMatrixBase<BaseFloat> &in_value,
-      const CuMatrixBase<BaseFloat> &out_deriv);
-
-  bool is_gradient_;
-  BaseFloat alpha_;
-};
-
-// SumGroupComponent is used to sum up groups of posteriors.
-// It's used to introduce a kind of Gaussian-mixture-model-like
-// idea into neural nets.  This is basically a degenerate case of
-// MixtureProbComponent; we had to implement it separately to
-// be efficient for CUDA (we can use this one regardless whether
-// we have CUDA or not; it's the normal case we want anyway).
-class SumGroupComponent: public Component {
-public:
-  virtual int32 InputDim() const { return input_dim_; }
-  virtual int32 OutputDim() const { return output_dim_; }
-  void Init(const std::vector<int32> &sizes); // the vector is of the input dim
-                                              // (>= 1) for each output dim.
-  void GetSizes(std::vector<int32> *sizes) const; // Get a vector saying, for
-                                                  // each output-dim, how many
-                                                  // inputs were summed over.
-  virtual void InitFromString(std::string args);
-  SumGroupComponent() { }
-  virtual std::string Type() const { return "SumGroupComponent"; }
-  virtual bool BackpropNeedsInput() const { return false; }
-  virtual bool BackpropNeedsOutput() const { return false; }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  // Note: in_value and out_value are both dummy variables.
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
-  virtual Component* Copy() const;
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
-
-private:
-  KALDI_DISALLOW_COPY_AND_ASSIGN(SumGroupComponent);
-  // Note: Int32Pair is just struct{ int32 first; int32 second }; it's defined
-  // in cu-matrixdim.h as extern "C" which is needed for the CUDA interface.
-  CuArray<Int32Pair> indexes_; // for each output index, the (start, end) input
-                               // index.
-  CuArray<int32> reverse_indexes_; // for each input index, the output index.
-  int32 input_dim_;
-  int32 output_dim_;
-};
-
-
-/// PermuteComponent does a permutation of the dimensions (by default, a fixed
-/// random permutation, but it may be specified).  Useful in conjunction with
-/// block-diagonal transforms.
-class PermuteComponent: public Component {
- public:
-  void Init(int32 dim);
-  void Init(const std::vector<int32> &reorder);
-  PermuteComponent(int32 dim) { Init(dim); }
-  PermuteComponent(const std::vector<int32> &reorder) { Init(reorder); }
-
-  PermuteComponent() { } // e.g. prior to Read() or Init()
-
-  virtual int32 InputDim() const { return reorder_.size(); }
-  virtual int32 OutputDim() const { return reorder_.size(); }
-  virtual Component *Copy() const;
-
-  virtual void InitFromString(std::string args);
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
-  virtual std::string Type() const { return "PermuteComponent"; }
-  virtual bool BackpropNeedsInput() const { return false; }
-  virtual bool BackpropNeedsOutput() const { return false; }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
-
- private:
-  KALDI_DISALLOW_COPY_AND_ASSIGN(PermuteComponent);
-  std::vector<int32> reorder_; // This class sends input dimension i to
-                               // output dimension reorder_[i].
-};
-
-
-/// Discrete cosine transform.
-/// TODO: modify this Component so that it supports only keeping a subset
-class DctComponent: public Component {
- public:
-  DctComponent() { dim_ = 0; }
-  virtual std::string Type() const { return "DctComponent"; }
-  virtual std::string Info() const;
-  //dim = dimension of vector being processed
-  //dct_dim = effective lenght of DCT, i.e. how many compoments will be kept
-  void Init(int32 dim, int32 dct_dim, bool reorder, int32 keep_dct_dim=0);
-  // InitFromString takes numeric options
-  // dim, dct-dim, and (optionally) reorder={true,false}, keep-dct-dim
-  // Note: reorder defaults to false. keep-dct-dim defaults to dct-dim
-  virtual void InitFromString(std::string args);
-  virtual int32 InputDim() const { return dim_; }
-  virtual int32 OutputDim() const { return dct_mat_.NumRows() * (dim_ / dct_mat_.NumCols()); }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
-  virtual bool BackpropNeedsInput() const { return false; }
-  virtual bool BackpropNeedsOutput() const { return false; }
-  virtual Component* Copy() const;
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
- private:
-  void Reorder(CuMatrixBase<BaseFloat> *mat, bool reverse) const;
-  int32 dim_; // The input dimension of the (sub)vector.
-
-  bool reorder_; // If true, transformation matrix we use is not
-  // block diagonal but is block diagonal after reordering-- so
-  // effectively we transform with the Kronecker product D x I,
-  // rather than a matrix with D's on the diagonal (i.e. I x D,
-  // where x is the Kronecker product).  We'll set reorder_ to
-  // true if we want to use this to transform in the time domain,
-  // because the SpliceComponent splices blocks of e.g. MFCCs
-  // together so each time is a dimension of the block.
-
-  CuMatrix<BaseFloat> dct_mat_;
-
-  KALDI_DISALLOW_COPY_AND_ASSIGN(DctComponent);
-};
-
-
-/// FixedLinearComponent is a linear transform that is supplied
-/// at network initialization time and is not trainable.
-class FixedLinearComponent: public Component {
- public:
-  FixedLinearComponent() { }
-  virtual std::string Type() const { return "FixedLinearComponent"; }
-  virtual std::string Info() const;
-
-  void Init(const CuMatrixBase<BaseFloat> &matrix) { mat_ = matrix; }
-
-  // InitFromString takes only the option matrix=<string>,
-  // where the string is the filename of a Kaldi-format matrix to read.
-  virtual void InitFromString(std::string args);
-
-  virtual int32 InputDim() const { return mat_.NumCols(); }
-  virtual int32 OutputDim() const { return mat_.NumRows(); }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
-  virtual bool BackpropNeedsInput() const { return false; }
-  virtual bool BackpropNeedsOutput() const { return false; }
-  virtual Component* Copy() const;
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
- protected:
-  friend class AffineComponent;
-  CuMatrix<BaseFloat> mat_;
-
-  KALDI_DISALLOW_COPY_AND_ASSIGN(FixedLinearComponent);
-};
-
-
-/// FixedAffineComponent is an affine transform that is supplied
-/// at network initialization time and is not trainable.
-class FixedAffineComponent: public Component {
- public:
-  FixedAffineComponent() { }
-  virtual std::string Type() const { return "FixedAffineComponent"; }
-  virtual std::string Info() const;
-
-  /// matrix should be of size input-dim+1 to output-dim, last col is offset
-  void Init(const CuMatrixBase<BaseFloat> &matrix);
-
-  // InitFromString takes only the option matrix=<string>,
-  // where the string is the filename of a Kaldi-format matrix to read.
-  virtual void InitFromString(std::string args);
-
-  virtual int32 InputDim() const { return linear_params_.NumCols(); }
-  virtual int32 OutputDim() const { return linear_params_.NumRows(); }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
-  virtual bool BackpropNeedsInput() const { return false; }
-  virtual bool BackpropNeedsOutput() const { return false; }
-  virtual Component* Copy() const;
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
-
-  // Function to provide access to linear_params_.
-  const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
- protected:
-  friend class AffineComponent;
-  CuMatrix<BaseFloat> linear_params_;
-  CuVector<BaseFloat> bias_params_;
-
-  KALDI_DISALLOW_COPY_AND_ASSIGN(FixedAffineComponent);
-};
-
-
-/// FixedScaleComponent applies a fixed per-element scale; it's similar
-/// to the Rescale component in the nnet1 setup (and only needed for nnet1
-/// model conversion).
-class FixedScaleComponent: public Component {
- public:
-  FixedScaleComponent() { }
-  virtual std::string Type() const { return "FixedScaleComponent"; }
-  virtual std::string Info() const;
-
-  void Init(const CuVectorBase<BaseFloat> &scales);
-
-  // InitFromString takes only the option scales=<string>,
-  // where the string is the filename of a Kaldi-format matrix to read.
-  virtual void InitFromString(std::string args);
-
-  virtual int32 InputDim() const { return scales_.Dim(); }
-  virtual int32 OutputDim() const { return scales_.Dim(); }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
-  virtual bool BackpropNeedsInput() const { return false; }
-  virtual bool BackpropNeedsOutput() const { return false; }
-  virtual Component* Copy() const;
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
-
- protected:
-  friend class AffineComponent;  // necessary for collapse
-  CuVector<BaseFloat> scales_;
-  KALDI_DISALLOW_COPY_AND_ASSIGN(FixedScaleComponent);
-};
-
-/// FixedBiasComponent applies a fixed per-element bias; it's similar
-/// to the AddShift component in the nnet1 setup (and only needed for nnet1
-/// model conversion.
-class FixedBiasComponent: public Component {
- public:
-  FixedBiasComponent() { }
-  virtual std::string Type() const { return "FixedBiasComponent"; }
-  virtual std::string Info() const;
-
-  void Init(const CuVectorBase<BaseFloat> &scales);
-
-  // InitFromString takes only the option bias=<string>,
-  // where the string is the filename of a Kaldi-format matrix to read.
-  virtual void InitFromString(std::string args);
-
-  virtual int32 InputDim() const { return bias_.Dim(); }
-  virtual int32 OutputDim() const { return bias_.Dim(); }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const ;
-  virtual bool BackpropNeedsInput() const { return false; }
-  virtual bool BackpropNeedsOutput() const { return false; }
-  virtual Component* Copy() const;
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
-
- protected:
-  CuVector<BaseFloat> bias_;
-  KALDI_DISALLOW_COPY_AND_ASSIGN(FixedBiasComponent);
-};
-
-
-/// This Component, if present, randomly zeroes half of
-/// the inputs and multiplies the other half by two.
-/// Typically you would use this in training but not in
-/// test or when computing validation-set objective functions.
-class DropoutComponent: public RandomComponent {
- public:
-  /// dropout-proportion is the proportion that is dropped out,
-  /// e.g. if 0.1, we set 10% to a low value.  [note, in
-  /// some older code it was interpreted as the value not dropped
-  /// out, so be careful.]  The low scale-value
-  /// is equal to dropout_scale.  The high scale-value is chosen
-  /// such that the expected scale-value is one.
-  void Init(int32 dim,
-            BaseFloat dropout_proportion = 0.5,
-            BaseFloat dropout_scale = 0.0);
-  DropoutComponent(int32 dim, BaseFloat dp = 0.5, BaseFloat sc = 0.0) {
-    Init(dim, dp, sc);
-  }
-  DropoutComponent(): dim_(0), dropout_proportion_(0.5) { }
-  virtual int32 InputDim() const { return dim_; }
-  virtual int32 OutputDim() const { return dim_; }
-  virtual void InitFromString(std::string args);
-
-  virtual void Read(std::istream &is, bool binary);
-
-  virtual void Write(std::ostream &os, bool binary) const;
-
-  virtual std::string Type() const { return "DropoutComponent"; }
-
-  void SetDropoutScale(BaseFloat scale) { dropout_scale_ = scale; }
-  virtual bool BackpropNeedsInput() const { return true; }
-  virtual bool BackpropNeedsOutput() const { return true; }
-  virtual Component* Copy() const;
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const;
-  virtual std::string Info() const;
- private:
-  int32 dim_;
-  BaseFloat dropout_proportion_;
-  BaseFloat dropout_scale_; // Set the scale that we scale "dropout_proportion_"
-  // of the neurons by (default 0.0, but can be set arbitrarily close to 1.0).
-};
-
-/// This is a bit similar to dropout but adding (not multiplying) Gaussian
-/// noise with a given standard deviation.
-class AdditiveNoiseComponent: public RandomComponent {
- public:
-  void Init(int32 dim, BaseFloat noise_stddev);
-  AdditiveNoiseComponent(int32 dim, BaseFloat stddev) { Init(dim, stddev); }
-  AdditiveNoiseComponent(): dim_(0), stddev_(1.0) { }
-  virtual int32 InputDim() const { return dim_; }
-  virtual int32 OutputDim() const { return dim_; }
-  virtual void InitFromString(std::string args);
-
-  virtual void Read(std::istream &is, bool binary);
-
-  virtual void Write(std::ostream &os, bool binary) const;
-
-  virtual std::string Type() const { return "AdditiveNoiseComponent"; }
-
-  virtual bool BackpropNeedsInput() const { return false; }
-  virtual bool BackpropNeedsOutput() const { return false; }
-  virtual Component* Copy() const {
-    return new AdditiveNoiseComponent(dim_, stddev_);
-  }
-  using Component::Propagate; // to avoid name hiding
-  virtual void Propagate(const ChunkInfo &in_info,
-                         const ChunkInfo &out_info,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update, // may be identical to "this".
-                        CuMatrix<BaseFloat> *in_deriv) const { *in_deriv = out_deriv; }
- private:
-  int32 dim_;
-  BaseFloat stddev_;
-};
-
-/**
- * Convolutional1dComponent implements convolution over frequency axis.
- * We assume the input featrues are spliced, i.e. each frame is in
- * fact a set of stacked frames, where we can form patches which span
- * over several frequency bands and whole time axis. A patch is the
- * instance of a filter on a group of frequency bands and whole time
- * axis. Shifts of the filter generate patches.
- *
- * The convolution is done over whole axis with same filter
- * coefficients, i.e. we don't use separate filters for different
- * 'regions' of frequency axis. Due to convolution, same weights are
- * used repeateadly, the final gradient is a sum of all
- * position-specific gradients (the sum was found better than
- * averaging).
- *
- * In order to have a fast implementations, the filters are
- * represented in vectorized form, where each rectangular filter
- * corresponds to a row in a matrix, where all the filters are
- * stored. The features are then re-shaped to a set of matrices, where
- * one matrix corresponds to single patch-position, where all the
- * filters get applied.
- *
- * The type of convolution is controled by hyperparameters:
- * patch_dim_     ... frequency axis size of the patch
- * patch_step_    ... size of shift in the convolution
- * patch_stride_  ... shift for 2nd dim of a patch
- *                    (i.e. frame length before splicing)
- * For instance, for a convolutional component after raw input,
- * if the input is 36-dim fbank feature with delta of order 2
- * and spliced using +/- 5 frames of contexts, the convolutional
- * component takes the input as a 36 x 33 image. The patch_stride_
- * should be configured 36. If patch_step_ and patch_dim_ are
- * configured 1 and 7, the Convolutional1dComponent creates a
- * 2D filter of 7 x 33, such that the convolution is actually done
- * only along the frequency axis. Specifically, the convolutional
- * output along the frequency axis is (36 - 7) / 1 + 1 = 30, and
- * the convolutional output along the temporal axis is 33 - 33 + 1 = 1,
- * resulting in an output image of 30 x 1, which is called a feature map
- * in ConvNet. Then if the output-dim is set 3840, the constructor
- * would know there should be 3840 / 30 = 128 distinct filters,
- * which will create 128 feature maps of 30 x 1 for one frame of
- * input. The feature maps are vectorized as a 3840-dim row vector
- * in the output matrix of this component. For details on progatation
- * of Convolutional1dComponent, check the function definition.
- *
- */
-class Convolutional1dComponent: public UpdatableComponent {
- public:
-  Convolutional1dComponent();
-  // constructor using another component
-  Convolutional1dComponent(const Convolutional1dComponent &component);
-  // constructor using parameters
-  Convolutional1dComponent(const CuMatrixBase<BaseFloat> &filter_params,
-                           const CuVectorBase<BaseFloat> &bias_params,
-                           BaseFloat learning_rate);
-
-  int32 InputDim() const;
-  int32 OutputDim() const;
-  void Init(BaseFloat learning_rate, int32 input_dim, int32 output_dim,
-            int32 patch_dim, int32 patch_step, int32 patch_stride,
-            BaseFloat param_stddev, BaseFloat bias_stddev, bool appended_conv);
-  void Init(BaseFloat learning_rate,
-            int32 patch_dim, int32 patch_step, int32 patch_stride,
-            std::string matrix_filename, bool appended_conv);
-
-  // resize the component, setting the parameters to zero, while
-  // leaving any other configuration values the same
-  void Resize(int32 input_dim, int32 output_dim);
-  std::string Info() const;
-  void InitFromString(std::string args);
-  std::string Type() const { return "Convolutional1dComponent"; }
-  bool BackpropNeedsInput() const { return true; }
-  bool BackpropNeedsOutput() const { return false; }
-  using Component::Propagate; // to avoid name hiding
-  void Propagate(const ChunkInfo &in_info,
-                 const ChunkInfo &out_info,
-                 const CuMatrixBase<BaseFloat> &in,
-                 CuMatrixBase<BaseFloat> *out) const;
-  void Scale(BaseFloat scale);
-  virtual void Add(BaseFloat alpha, const UpdatableComponent &other);
-  virtual void Backprop(const ChunkInfo &in_info,
-                        const ChunkInfo &out_info,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update_in,
-                        CuMatrix<BaseFloat> *in_deriv) const;
-  void SetZero(bool treat_as_gradient);
-  void Read(std::istream &is, bool binary);
-  void Write(std::ostream &os, bool binary) const;
-  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
-  Component* Copy() const;
-  void PerturbParams(BaseFloat stddev);
-  void SetParams(const VectorBase<BaseFloat> &bias,
-                 const MatrixBase<BaseFloat> &filter);
-  const CuVector<BaseFloat> &BiasParams() { return bias_params_; }
-  const CuMatrix<BaseFloat> &LinearParams() { return filter_params_; }
-  int32 GetParameterDim() const;
-  void Update(const CuMatrixBase<BaseFloat> &in_value,
-              const CuMatrixBase<BaseFloat> &out_deriv);
-
- private:
-  int32 patch_dim_;
-  int32 patch_step_;
-  int32 patch_stride_;
-
-  static void ReverseIndexes(const std::vector<int32> &forward_indexes,
-                             int32 input_dim,
-                             std::vector<std::vector<int32> > *backward_indexes);
-  static void RearrangeIndexes(const std::vector<std::vector<int32> > &in,
-                               std::vector<std::vector<int32> > *out);
-
-  const Convolutional1dComponent &operator = (const Convolutional1dComponent &other); // Disallow.
-  CuMatrix<BaseFloat> filter_params_;
-  CuVector<BaseFloat> bias_params_;
-  // When appending convolutional1dcomponents, appended_conv_ should be
-  // set ture for the appended convolutional1dcomponents.
-  bool appended_conv_;
-  bool is_gradient_;
-};
-
-
-/// Functions used in Init routines.  Suppose name=="foo", if "string" has a
-/// field like foo=12, this function will set "param" to 12 and remove that
-/// element from "string".  It returns true if the parameter was read.
-bool ParseFromString(const std::string &name, std::string *string,
-                     int32 *param);
-/// This version is for parameters of type BaseFloat.
-bool ParseFromString(const std::string &name, std::string *string,
-                     BaseFloat *param);
-/// This version is for parameters of type std::vector<int32>; it expects
-/// them as a colon-separated list, without spaces.
-bool ParseFromString(const std::string &name, std::string *string,
-                     std::vector<int32> *param);
-/// This version is for parameters of type bool, which can appear
-/// as any string beginning with f, F, t or T.
-bool ParseFromString(const std::string &name, std::string *string,
-                     bool *param);
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-
-#endif
diff --git a/src/nnet2/nnet-compute-discriminative-parallel.cc b/src/nnet2/nnet-compute-discriminative-parallel.cc
deleted file mode 100644
index 0ffd73f45bb..00000000000
--- a/src/nnet2/nnet-compute-discriminative-parallel.cc
+++ /dev/null
@@ -1,222 +0,0 @@
-// nnet2/nnet-compute-discriminative-parallel.cc
-
-// Copyright 2012-2013   Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <deque>
-#include <mutex>
-#include "nnet2/nnet-compute-discriminative-parallel.h"
-#include "hmm/posterior.h"
-#include "lat/lattice-functions.h"
-#include "util/kaldi-semaphore.h"
-#include "util/kaldi-thread.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-/** This struct stores neural net training examples to be used in
-    multi-threaded training.  */
-class DiscriminativeExamplesRepository {
- public:
-  /// The following function is called by the code that reads in the examples.
-  void AcceptExample(const DiscriminativeNnetExample &example);
-
-  /// The following function is called by the code that reads in the examples,
-  /// when we're done reading examples; it signals this way to this class
-  /// that the stream is now empty
-  void ExamplesDone();
-
-  /// This function is called by the code that does the training.  If there is
-  /// an example available it will provide it, or it will sleep till one is
-  /// available.  It returns NULL when there are no examples left and
-  /// ExamplesDone() has been called.
-  DiscriminativeNnetExample *ProvideExample();
-
-  DiscriminativeExamplesRepository(): buffer_size_(4),
-                                      empty_semaphore_(buffer_size_),
-                                      done_(false) { }
- private:
-  int32 buffer_size_;
-  Semaphore full_semaphore_;
-  Semaphore empty_semaphore_;
-  std::mutex examples_mutex_; // mutex we lock to modify examples_.
-
-  std::deque<DiscriminativeNnetExample*> examples_;
-  bool done_;
-  KALDI_DISALLOW_COPY_AND_ASSIGN(DiscriminativeExamplesRepository);
-};
-
-
-void DiscriminativeExamplesRepository::AcceptExample(
-    const DiscriminativeNnetExample &example) {
-  empty_semaphore_.Wait();
-  examples_mutex_.lock();
-  examples_.push_back(new DiscriminativeNnetExample(example));
-  examples_mutex_.unlock();
-  full_semaphore_.Signal();
-}
-
-void DiscriminativeExamplesRepository::ExamplesDone() {
-  for (int32 i = 0; i < buffer_size_; i++)
-    empty_semaphore_.Wait();
-  examples_mutex_.lock();
-  KALDI_ASSERT(examples_.empty());
-  examples_mutex_.unlock();
-  done_ = true;
-  full_semaphore_.Signal();
-}
-
-DiscriminativeNnetExample*
-DiscriminativeExamplesRepository::ProvideExample() {
-  full_semaphore_.Wait();
-  if (done_) {
-    KALDI_ASSERT(examples_.empty());
-    full_semaphore_.Signal(); // Increment the semaphore so
-    // the call by the next thread will not block.
-    return NULL; // no examples to return-- all finished.
-  } else {
-    examples_mutex_.lock();
-    KALDI_ASSERT(!examples_.empty());
-    DiscriminativeNnetExample *ans = examples_.front();
-    examples_.pop_front();
-    examples_mutex_.unlock();
-    empty_semaphore_.Signal();
-    return ans;
-  }
-}
-
-
-class DiscTrainParallelClass: public MultiThreadable {
- public:
-  // This constructor is only called for a temporary object
-  // that we pass to the RunMultiThreaded function.
-  DiscTrainParallelClass(const AmNnet &am_nnet,
-                         const TransitionModel &tmodel,
-                         const NnetDiscriminativeUpdateOptions &opts,
-                         bool store_separate_gradients,
-                         DiscriminativeExamplesRepository *repository,
-                         Nnet *nnet_to_update,
-                         NnetDiscriminativeStats *stats):
-      am_nnet_(am_nnet), tmodel_(tmodel), opts_(opts),
-      store_separate_gradients_(store_separate_gradients),
-      repository_(repository),
-      nnet_to_update_(nnet_to_update),
-      nnet_to_update_orig_(nnet_to_update),
-      stats_ptr_(stats) { }
-
-  // The following constructor is called multiple times within
-  // the RunMultiThreaded template function.
-  DiscTrainParallelClass(const DiscTrainParallelClass &other):
-  MultiThreadable(other),
-  am_nnet_(other.am_nnet_), tmodel_(other.tmodel_), opts_(other.opts_),
-  store_separate_gradients_(other.store_separate_gradients_),
-  repository_(other.repository_), nnet_to_update_(other.nnet_to_update_),
-  nnet_to_update_orig_(other.nnet_to_update_orig_),
-  stats_ptr_(other.stats_ptr_) {
-    if (store_separate_gradients_) {
-      // To ensure correctness, we work on separate copies of the gradient
-      // object, which we'll sum at the end.  This is used for exact gradient
-      // computation.
-      if (other.nnet_to_update_ != NULL) {
-        nnet_to_update_ = new Nnet(*(other.nnet_to_update_));
-        // our "nnet_to_update_" variable is a copy of the neural network
-        // we are to update (presumably a gradient).  If we don't set these
-        // to zero we would end up adding multiple copies of the any initial
-        // gradient that "nnet_to_update_" contained when we initialize
-        // the first instance of the class.
-        nnet_to_update_->SetZero(true);
-      } else { // support case where we don't really need a gradient.
-        nnet_to_update_ = NULL;
-      }
-    }
-  }
-  // This does the main function of the class.
-  void operator () () {
-    DiscriminativeNnetExample *example;
-    while ((example = repository_->ProvideExample()) != NULL) {
-      // This is a function call to a function defined in
-      // nnet-compute-discriminative.h
-      NnetDiscriminativeUpdate(am_nnet_, tmodel_, opts_,
-                               *example, nnet_to_update_, &stats_);
-      delete example;
-
-      if (GetVerboseLevel() > 3) {
-        KALDI_VLOG(3) << "Printing local stats for thread " << thread_id_;
-        stats_.Print(opts_.criterion);
-      }
-    }
-  }
-
-  ~DiscTrainParallelClass() {
-    if (nnet_to_update_orig_ != nnet_to_update_) {
-      // This branch is only taken if this instance of the class is
-      // one of the multiple instances allocated inside the RunMultiThreaded
-      // template function, *and* store_separate_gradients_ has been set to true.
-      // In the typical hogwild case, we don't do this.
-      nnet_to_update_orig_->AddNnet(1.0, *nnet_to_update_);
-      delete nnet_to_update_;
-    }
-    stats_ptr_->Add(stats_);
-  }
- private:
-  const AmNnet &am_nnet_;
-  const TransitionModel &tmodel_;
-  const NnetDiscriminativeUpdateOptions &opts_;
-  bool store_separate_gradients_;
-  DiscriminativeExamplesRepository *repository_;
-  Nnet *nnet_to_update_;
-  Nnet *nnet_to_update_orig_;
-  NnetDiscriminativeStats *stats_ptr_;
-  NnetDiscriminativeStats stats_;
-};
-
-
-
-void NnetDiscriminativeUpdateParallel(
-    const AmNnet &am_nnet,
-    const TransitionModel &tmodel,
-    const NnetDiscriminativeUpdateOptions &opts,
-    int32 num_threads,
-    SequentialDiscriminativeNnetExampleReader *example_reader,
-    Nnet *nnet_to_update,
-    NnetDiscriminativeStats *stats) {
-
-  DiscriminativeExamplesRepository repository;
-
-  const bool store_separate_gradients = (nnet_to_update != &(am_nnet.GetNnet()));
-
-  DiscTrainParallelClass c(am_nnet, tmodel, opts,
-                           store_separate_gradients,
-                           &repository, nnet_to_update, stats);
-
-  {
-    // The initialization of the following class spawns the threads that
-    // process the examples.  They get re-joined in its destructor.
-    MultiThreader<DiscTrainParallelClass> m(num_threads, c);
-
-    for (; !example_reader->Done(); example_reader->Next()) {
-      repository.AcceptExample(example_reader->Value());
-    }
-    repository.ExamplesDone();
-  }
-  stats->Print(opts.criterion);
-}
-
-
-
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/nnet-compute-discriminative-parallel.h b/src/nnet2/nnet-compute-discriminative-parallel.h
deleted file mode 100644
index 5427126be17..00000000000
--- a/src/nnet2/nnet-compute-discriminative-parallel.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// nnet2/nnet-compute-discriminative-parallel.h
-
-// Copyright 2012-2013  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_NNET_COMPUTE_DISCRIMINATIVE_PARALLEL_H_
-#define KALDI_NNET2_NNET_COMPUTE_DISCRIMINATIVE_PARALLEL_H_
-
-#include "nnet2/am-nnet.h"
-#include "nnet2/nnet-example.h"
-#include "hmm/transition-model.h"
-#include "nnet2/nnet-compute-discriminative.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-/* This header provides a multi-threaded version of the discriminative training
-   code (this is for a CPU-based, instead of GPU-based, setup).
-   Note: we expect that "nnet_to_update" will be the same as "&(am_nnet.GetNnet())"
-*/
-
-void NnetDiscriminativeUpdateParallel(
-    const AmNnet &am_nnet,
-    const TransitionModel &tmodel,
-    const NnetDiscriminativeUpdateOptions &opts,
-    int32 num_threads,
-    SequentialDiscriminativeNnetExampleReader *example_reader,
-    Nnet *nnet_to_update,
-    NnetDiscriminativeStats *stats);
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif //  KALDI_NNET2_NNET_COMPUTE_DISCRIMINATIVE_PARALLEL_H_
diff --git a/src/nnet2/nnet-compute-discriminative.cc b/src/nnet2/nnet-compute-discriminative.cc
deleted file mode 100644
index 72a579d608f..00000000000
--- a/src/nnet2/nnet-compute-discriminative.cc
+++ /dev/null
@@ -1,416 +0,0 @@
-// nnet2/nnet-compute-discriminative.cc
-
-// Copyright 2012-2013   Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-compute-discriminative.h"
-#include "hmm/posterior.h"
-#include "lat/lattice-functions.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-/*
-  This class does the forward and possibly backward computation for (typically)
-  a whole utterance of contiguous features.  You'll instantiate one of
-  these classes each time you want to do this computation.
-*/
-class NnetDiscriminativeUpdater {
- public:
-
-  NnetDiscriminativeUpdater(const AmNnet &am_nnet,
-                            const TransitionModel &tmodel,
-                            const NnetDiscriminativeUpdateOptions &opts,
-                            const DiscriminativeNnetExample &eg,
-                            Nnet *nnet_to_update,
-                            NnetDiscriminativeStats *stats);
-
-  void Update() {
-    Propagate();
-    LatticeComputations();
-    if (nnet_to_update_ != NULL)
-      Backprop();
-  }
-  
-  /// The forward-through-the-layers part of the computation.
-  void Propagate();  
-
-  /// Does the parts between Propagate() and Backprop(), that
-  /// involve forward-backward over the lattice.
-  void LatticeComputations();
-  
-  void Backprop();
-
-  /// Assuming the lattice already has the correct scores in
-  /// it, this function does the MPE or MMI forward-backward
-  /// and puts the resulting discriminative posteriors (which
-  /// may have positive or negative weight) into "post".
-  /// It returns, for MPFE/SMBR, the objective function, or
-  /// for MMI, the negative of the denominator-lattice log-likelihood.
-  double GetDiscriminativePosteriors(Posterior *post);
-  
-  SubMatrix<BaseFloat> GetInputFeatures() const;
-  
-  CuMatrixBase<BaseFloat> &GetOutput() { return forward_data_.back(); }
-
-  static inline Int32Pair MakePair(int32 first, int32 second) {
-    Int32Pair ans;
-    ans.first = first;
-    ans.second = second;
-    return ans;
-  }
-  
- private:
-  typedef LatticeArc Arc;
-  typedef Arc::StateId StateId;
-
-  
-  const AmNnet &am_nnet_;
-  const TransitionModel &tmodel_;
-  const NnetDiscriminativeUpdateOptions &opts_;
-  const DiscriminativeNnetExample &eg_;
-  Nnet *nnet_to_update_; // will equal am_nnet_.GetNnet(), in SGD case, or
-                         // another Nnet, in gradient-computation case, or
-                         // NULL if we just need the objective function.
-  NnetDiscriminativeStats *stats_; // the objective function, etc.
-  std::vector<ChunkInfo> chunk_info_out_; 
-  // forward_data_[i] is the input of the i'th component and (if i > 0)
-  // the output of the i-1'th component.
-  std::vector<CuMatrix<BaseFloat> > forward_data_; 
-  Lattice lat_; // we convert the CompactLattice in the eg, into Lattice form.
-  CuMatrix<BaseFloat> backward_data_;
-  std::vector<int32> silence_phones_; // derived from opts_.silence_phones_str
-};
-
-
-
-NnetDiscriminativeUpdater::NnetDiscriminativeUpdater(
-    const AmNnet &am_nnet,
-    const TransitionModel &tmodel,
-    const NnetDiscriminativeUpdateOptions &opts,
-    const DiscriminativeNnetExample &eg,
-    Nnet *nnet_to_update,
-    NnetDiscriminativeStats *stats):
-    am_nnet_(am_nnet), tmodel_(tmodel), opts_(opts), eg_(eg),
-    nnet_to_update_(nnet_to_update), stats_(stats) {
-  if (!SplitStringToIntegers(opts_.silence_phones_str, ":", false,
-                             &silence_phones_)) {
-    KALDI_ERR << "Bad value for --silence-phones option: "
-              << opts_.silence_phones_str;
-  }
-  const Nnet &nnet = am_nnet_.GetNnet();
-  nnet.ComputeChunkInfo(eg_.input_frames.NumRows(), 1, &chunk_info_out_);
-}
-
-
-
-SubMatrix<BaseFloat> NnetDiscriminativeUpdater::GetInputFeatures() const {
-  int32 num_frames_output = eg_.num_ali.size();
-  int32 eg_left_context = eg_.left_context,
-      eg_right_context = eg_.input_frames.NumRows() -
-      num_frames_output - eg_left_context;
-  KALDI_ASSERT(eg_right_context >= 0);
-  const Nnet &nnet = am_nnet_.GetNnet();
-  // Make sure the example has enough acoustic left and right
-  // context... normally we'll use examples generated using the same model,
-  // which will have the exact context, but we enable a mismatch in context as
-  // long as it is more, not less.
-  KALDI_ASSERT(eg_left_context >= nnet.LeftContext() &&
-               eg_right_context >= nnet.RightContext());
-  int32 offset = eg_left_context - nnet.LeftContext(),
-      num_output_frames =
-      num_frames_output + nnet.LeftContext() + nnet.RightContext();
-  SubMatrix<BaseFloat> ans(eg_.input_frames, offset, num_output_frames,
-                           0, eg_.input_frames.NumCols());
-  return ans;
-}
-
-void NnetDiscriminativeUpdater::Propagate() {
-  const Nnet &nnet = am_nnet_.GetNnet();
-  forward_data_.resize(nnet.NumComponents() + 1);
-  
-  SubMatrix<BaseFloat> input_feats = GetInputFeatures();
-  int32 spk_dim = eg_.spk_info.Dim();
-  if (spk_dim == 0) {
-    forward_data_[0] = input_feats;
-  } else {
-    forward_data_[0].Resize(input_feats.NumRows(),
-                            input_feats.NumCols() + eg_.spk_info.Dim());
-    forward_data_[0].Range(0, input_feats.NumRows(),
-                           0, input_feats.NumCols()).CopyFromMat(input_feats);
-    forward_data_[0].Range(0, input_feats.NumRows(),
-                           input_feats.NumCols(), spk_dim).CopyRowsFromVec(
-                               eg_.spk_info);
-  }
-
-  for (int32 c = 0; c < nnet.NumComponents(); c++) {
-    const Component &component = nnet.GetComponent(c);
-    CuMatrix<BaseFloat> &input = forward_data_[c],
-        &output = forward_data_[c+1];
-    component.Propagate(chunk_info_out_[c] , chunk_info_out_[c+1], input, &output);
-    const Component *prev_component = (c == 0 ? NULL :
-                                       &(nnet.GetComponent(c-1)));
-    bool will_do_backprop = (nnet_to_update_ != NULL),
-        keep_last_output = will_do_backprop &&
-        ((c>0 && prev_component->BackpropNeedsOutput()) ||
-         component.BackpropNeedsInput());
-    if (!keep_last_output)
-      forward_data_[c].Resize(0, 0); // We won't need this data; save memory.
-  }
-}
-
-
-
-void NnetDiscriminativeUpdater::LatticeComputations() {
-  ConvertLattice(eg_.den_lat, &lat_); // convert to Lattice.
-  TopSort(&lat_); // Topologically sort (required by forward-backward algorithms)
-
-  if (opts_.criterion == "mmi" && opts_.boost != 0.0) {
-    BaseFloat max_silence_error = 0.0;
-    LatticeBoost(tmodel_, eg_.num_ali, silence_phones_,
-                 opts_.boost, max_silence_error, &lat_);
-  }
-  
-  int32 num_frames = static_cast<int32>(eg_.num_ali.size());
-
-  stats_->tot_t += num_frames;
-  stats_->tot_t_weighted += num_frames * eg_.weight;
-  
-  const VectorBase<BaseFloat> &priors = am_nnet_.Priors();
-  const CuMatrix<BaseFloat> &posteriors = forward_data_.back();
-
-  KALDI_ASSERT(posteriors.NumRows() == num_frames);
-  int32 num_pdfs = posteriors.NumCols();
-  KALDI_ASSERT(num_pdfs == priors.Dim());
-  
-  // We need to look up the posteriors of some pdf-ids in the matrix
-  // "posteriors".  Rather than looking them all up using operator (), which is
-  // very slow because each lookup involves a separate CUDA call with
-  // communication over PciExpress, we look them up all at once using
-  // CuMatrix::Lookup().
-  // Note: regardless of the criterion, we evaluate the likelihoods in
-  // the numerator alignment.  Even though they may be irrelevant to
-  // the optimization, they will affect the value of the objective function.
-  
-  std::vector<Int32Pair> requested_indexes;
-  BaseFloat wiggle_room = 1.3; // value not critical.. it's just 'reserve'
-  requested_indexes.reserve(num_frames + wiggle_room * lat_.NumStates());
-
-  if (opts_.criterion == "mmi") { // need numerator probabilities...
-    for (int32 t = 0; t < num_frames; t++) {
-      int32 tid = eg_.num_ali[t], pdf_id = tmodel_.TransitionIdToPdf(tid);
-      KALDI_ASSERT(pdf_id >= 0 && pdf_id < num_pdfs);
-      requested_indexes.push_back(MakePair(t, pdf_id));
-    }
-  }
-
-  std::vector<int32> state_times;
-  int32 T = LatticeStateTimes(lat_, &state_times);
-  KALDI_ASSERT(T == num_frames);
-  
-  StateId num_states = lat_.NumStates();
-  for (StateId s = 0; s < num_states; s++) {
-    StateId t = state_times[s];
-    for (fst::ArcIterator<Lattice> aiter(lat_, s); !aiter.Done(); aiter.Next()) {
-      const Arc &arc = aiter.Value();
-      if (arc.ilabel != 0) { // input-side has transition-ids, output-side empty
-        int32 tid = arc.ilabel, pdf_id = tmodel_.TransitionIdToPdf(tid);
-        requested_indexes.push_back(MakePair(t, pdf_id));
-      }
-    }
-  }
-
-  std::vector<BaseFloat> answers;
-  CuArray<Int32Pair> cu_requested_indexes(requested_indexes);
-  answers.resize(requested_indexes.size());
-  posteriors.Lookup(cu_requested_indexes, &(answers[0]));
-
-  int32 num_floored = 0;
-
-  BaseFloat floor_val = 1.0e-20; // floor for posteriors.
-  size_t index;
-
-  // Replace "answers" with the vector of scaled log-probs.  If this step takes
-  // too much time, we can look at other ways to do it, using the CUDA card.
-  for (index = 0; index < answers.size(); index++) {
-    BaseFloat post = answers[index];
-    if (post < floor_val) {
-      post = floor_val;
-      num_floored++;
-    }
-    int32 pdf_id = requested_indexes[index].second;
-    BaseFloat pseudo_loglike = Log(post / priors(pdf_id)) * opts_.acoustic_scale;
-    KALDI_ASSERT(!KALDI_ISINF(pseudo_loglike) && !KALDI_ISNAN(pseudo_loglike));
-    answers[index] = pseudo_loglike;
-  }
-  if (num_floored > 0) {
-    KALDI_WARN << "Floored " << num_floored << " probabilities from nnet.";
-  }
-  
-  index = 0;
-  
-  if (opts_.criterion == "mmi") {
-    double tot_num_like = 0.0;
-    for (; index < eg_.num_ali.size(); index++)
-      tot_num_like += answers[index];
-    stats_->tot_num_objf += eg_.weight * tot_num_like;
-  }
-
-  // Now put the (scaled) acoustic log-likelihoods in the lattice.
-  for (StateId s = 0; s < num_states; s++) {
-    for (fst::MutableArcIterator<Lattice> aiter(&lat_, s);
-         !aiter.Done(); aiter.Next()) {
-      Arc arc = aiter.Value();
-      if (arc.ilabel != 0) { // input-side has transition-ids, output-side empty
-        arc.weight.SetValue2(-answers[index]);
-        index++;
-        aiter.SetValue(arc);
-      }
-    }
-    LatticeWeight final = lat_.Final(s);
-    if (final != LatticeWeight::Zero()) {
-      final.SetValue2(0.0); // make sure no acoustic term in final-prob.
-      lat_.SetFinal(s, final);
-    }
-  }
-  KALDI_ASSERT(index == answers.size());
-  
-  // Get the MPE or MMI posteriors.
-  Posterior post;
-  stats_->tot_den_objf += eg_.weight * GetDiscriminativePosteriors(&post);
-
-  ScalePosterior(eg_.weight, &post);
-
-  double tot_num_post = 0.0, tot_den_post = 0.0;
-  std::vector<MatrixElement<BaseFloat> > sv_labels;
-  sv_labels.reserve(answers.size());
-  for (int32 t = 0; t < post.size(); t++) {
-    for (int32 i = 0; i < post[t].size(); i++) {
-      int32 pdf_id = post[t][i].first;
-      BaseFloat weight = post[t][i].second;
-      if (weight > 0.0) { tot_num_post += weight; }
-      else { tot_den_post -= weight; }
-      MatrixElement<BaseFloat> elem = {t, pdf_id, weight};
-      sv_labels.push_back(elem);
-    }
-  }
-  stats_->tot_num_count += tot_num_post;
-  int32 num_components = am_nnet_.GetNnet().NumComponents();
-  const CuMatrix<BaseFloat> &output(forward_data_[num_components]);
-  backward_data_.Resize(output.NumRows(), output.NumCols()); // zeroes it.
-  
-  { // We don't actually need tot_objf and tot_weight; we have already
-    // computed the objective function.
-    BaseFloat tot_objf, tot_weight;
-    backward_data_.CompObjfAndDeriv(sv_labels, output, &tot_objf, &tot_weight);
-    // Now backward_data_ will contan the derivative at the output.
-    // Our work here is done..
-  }
-}
-
-
-double NnetDiscriminativeUpdater::GetDiscriminativePosteriors(Posterior *post) {
-  if (opts_.criterion == "mpfe" || opts_.criterion == "smbr") {
-    Posterior tid_post;
-    double ans;
-    ans = LatticeForwardBackwardMpeVariants(tmodel_, silence_phones_, lat_,
-                                            eg_.num_ali, opts_.criterion,
-                                            opts_.one_silence_class,
-                                            &tid_post);
-    ConvertPosteriorToPdfs(tmodel_, tid_post, post);
-    return ans; // returns the objective function.
-  } else {
-    KALDI_ASSERT(opts_.criterion == "mmi");
-    bool convert_to_pdfs = true, cancel = true;
-    // we'll return the denominator-lattice forward backward likelihood,
-    // which is one term in the objective function.
-    return LatticeForwardBackwardMmi(tmodel_, lat_, eg_.num_ali,
-                                     opts_.drop_frames, convert_to_pdfs,
-                                     cancel, post);
-  }
-}
-
-
-
-void NnetDiscriminativeUpdater::Backprop() {
-  const Nnet &nnet = am_nnet_.GetNnet();
-  for (int32 c = nnet.NumComponents() - 1; c >= 0; c--) {
-    const Component &component = nnet.GetComponent(c);
-    Component *component_to_update = &(nnet_to_update_->GetComponent(c));
-    const CuMatrix<BaseFloat>  &input = forward_data_[c],
-                            &output = forward_data_[c+1],
-                      &output_deriv = backward_data_;
-    CuMatrix<BaseFloat> input_deriv;
-    component.Backprop(chunk_info_out_[c], chunk_info_out_[c+1], input, output, output_deriv,
-                       component_to_update, &input_deriv);
-    backward_data_.Swap(&input_deriv); // backward_data_ = input_deriv.
-  }
-}
-
-
-void NnetDiscriminativeUpdate(const AmNnet &am_nnet,
-                              const TransitionModel &tmodel,
-                              const NnetDiscriminativeUpdateOptions &opts,
-                              const DiscriminativeNnetExample &eg,
-                              Nnet *nnet_to_update,
-                              NnetDiscriminativeStats *stats) {
-  NnetDiscriminativeUpdater updater(am_nnet, tmodel, opts, eg,
-                                    nnet_to_update, stats);
-  updater.Update();
-}
-
-void NnetDiscriminativeStats::Add(const NnetDiscriminativeStats &other) {
-  tot_t += other.tot_t;
-  tot_t_weighted += other.tot_t_weighted;
-  tot_num_count += other.tot_num_count;
-  tot_num_objf += other.tot_num_objf;
-  tot_den_objf += other.tot_den_objf;
-}
-
-void NnetDiscriminativeStats::Print(std::string criterion) {
-  KALDI_ASSERT(criterion == "mmi" || criterion == "smbr" ||
-               criterion == "mpfe");
-
-  double avg_post_per_frame = tot_num_count / tot_t_weighted;
-  KALDI_LOG << "Number of frames is " << tot_t
-            << " (weighted: " << tot_t_weighted
-            << "), average (num or den) posterior per frame is "
-            << avg_post_per_frame;
-  
-  if (criterion == "mmi") {
-    double num_objf = tot_num_objf / tot_t_weighted,
-        den_objf = tot_den_objf / tot_t_weighted,
-        objf = num_objf - den_objf;
-    KALDI_LOG << "MMI objective function is " << num_objf << " - "
-              << den_objf << " = " << objf << " per frame, over "
-              << tot_t_weighted << " frames.";
-  } else if (criterion == "mpfe") {
-    double objf = tot_den_objf / tot_t_weighted; // this contains the actual
-                                                 // summed objf
-    KALDI_LOG << "MPFE objective function is " << objf
-              << " per frame, over " << tot_t_weighted << " frames.";
-  } else {
-    double objf = tot_den_objf / tot_t_weighted; // this contains the actual
-                                                 // summed objf
-    KALDI_LOG << "SMBR objective function is " << objf
-              << " per frame, over " << tot_t_weighted << " frames.";
-  }
-}
-
-
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/nnet-compute-discriminative.h b/src/nnet2/nnet-compute-discriminative.h
deleted file mode 100644
index 219ebb45008..00000000000
--- a/src/nnet2/nnet-compute-discriminative.h
+++ /dev/null
@@ -1,115 +0,0 @@
-// nnet2/nnet-compute-discriminative.h
-
-// Copyright 2012-2013  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_NNET_COMPUTE_DISCRIMINATIVE_H_
-#define KALDI_NNET2_NNET_COMPUTE_DISCRIMINATIVE_H_
-
-#include "nnet2/am-nnet.h"
-#include "nnet2/nnet-example.h"
-#include "hmm/transition-model.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-/* This header provides functionality for doing model updates, and computing
-   gradients, using discriminative objective functions (MPFE, SMBR, MMI).
-   We use the DiscriminativeNnetExample defined in nnet-example.h.
-*/
-
-struct NnetDiscriminativeUpdateOptions {
-  std::string criterion; // "mmi" or "mpfe" or "smbr"
-  BaseFloat acoustic_scale; // e.g. 0.1
-  bool drop_frames; // for MMI, true if we ignore frames where alignment
-                    // pdf-id is not in the lattice.
-  bool one_silence_class;  // Affects MPE/SMBR>
-  BaseFloat boost; // for MMI, boosting factor (would be Boosted MMI)... e.g. 0.1.
-
-  std::string silence_phones_str; // colon-separated list of integer ids of silence phones,
-                                  // for MPE/SMBR only.
-
-  NnetDiscriminativeUpdateOptions(): criterion("smbr"), acoustic_scale(0.1),
-                                     drop_frames(false),
-                                     one_silence_class(false),
-                                     boost(0.0) { }
-  
-  void Register(OptionsItf *opts) {
-    opts->Register("criterion", &criterion, "Criterion, 'mmi'|'mpfe'|'smbr', "
-                   "determines the objective function to use.  Should match "
-                   "option used when we created the examples.");
-    opts->Register("acoustic-scale", &acoustic_scale, "Weighting factor to "
-                   "apply to acoustic likelihoods.");
-    opts->Register("drop-frames", &drop_frames, "For MMI, if true we drop frames "
-                   "with no overlap of num and den frames");
-    opts->Register("boost", &boost, "Boosting factor for boosted MMI (e.g. 0.1)");
-    opts->Register("one-silence-class", &one_silence_class, "If true, newer "
-                   "behavior which will tend to reduce insertions.");
-    opts->Register("silence-phones", &silence_phones_str,
-                   "For MPFE or SMBR, colon-separated list of integer ids of "
-                   "silence phones, e.g. 1:2:3");
-    
-  }
-};
-
-
-struct NnetDiscriminativeStats {
-  double tot_t; // total number of frames
-  double tot_t_weighted; // total number of frames times weight.
-  double tot_num_count; // total count of numerator posterior (should be
-                        // identical to denominator-posterior count, so we don't
-                        // separately compute that).
-  double tot_num_objf;  // for MMI, the (weighted) numerator likelihood; for
-                        // SMBR/MPFE, 0.
-  double tot_den_objf;  // for MMI, the (weighted) denominator likelihood; for
-                        // SMBR/MPFE, the objective function.
-  NnetDiscriminativeStats() { std::memset(this, 0, sizeof(*this)); }
-  void Print(std::string criterion); // const NnetDiscriminativeUpdateOptions &opts);
-  void Add(const NnetDiscriminativeStats &other);
-};
-
-/** Does the neural net computation, lattice forward-backward, and backprop,
-    for either the MMI, MPFE or SMBR objective functions.
-    If nnet_to_update == &(am_nnet.GetNnet()), then this does stochastic
-    gradient descent, otherwise (assuming you have called SetZero(true)
-    on *nnet_to_update) it will compute the gradient on this data.
-    If nnet_to_update_ == NULL, no backpropagation is done.
-    
-    Note: we ignore any existing acoustic score in the lattice of "eg".
-
-    For display purposes you should normalize the sum of this return value by
-    dividing by the sum over the examples, of the number of frames
-    (num_ali.size()) times the weight.
-
-    Something you need to be careful with is that the occupation counts and the
-    derivative are, following tradition, missing a factor equal to the acoustic
-    scale.  So you need to multiply them by that scale if you plan to do
-    something like L-BFGS in which you look at both the derivatives and function
-    values.  */
-
-void NnetDiscriminativeUpdate(const AmNnet &am_nnet,
-                              const TransitionModel &tmodel,
-                              const NnetDiscriminativeUpdateOptions &opts,
-                              const DiscriminativeNnetExample &eg,
-                              Nnet *nnet_to_update,
-                              NnetDiscriminativeStats *stats);
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif // KALDI_NNET2_NNET_COMPUTE_DISCRIMINATIVE_H_
diff --git a/src/nnet2/nnet-compute-online.cc b/src/nnet2/nnet-compute-online.cc
deleted file mode 100644
index 18fc48b6c78..00000000000
--- a/src/nnet2/nnet-compute-online.cc
+++ /dev/null
@@ -1,215 +0,0 @@
-// nnet2/nnet-compute-online.cc
-
-// Copyright 2014   Johns Hopkins University (author: Daniel Povey)
-//                  Guoguo Chen
-//                  Vijayaditya Peddinti
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-compute-online.h"
-#include <vector>
-
-namespace kaldi {
-namespace nnet2 {
-
-NnetOnlineComputer::NnetOnlineComputer(const Nnet &nnet, bool pad_input)
-    : nnet_(nnet), pad_input_(pad_input),
-      is_first_chunk_(true), finished_(false) {
-  data_.resize(nnet_.NumComponents() + 1);
-  reusable_component_inputs_.resize(nnet_.NumComponents()+1);
-}
-
-void NnetOnlineComputer::Compute(const CuMatrixBase<BaseFloat> &input,
-                                 CuMatrix<BaseFloat> *output) {
-  KALDI_ASSERT(output != NULL);
-  KALDI_ASSERT(!finished_);
-  int32 dim = input.NumCols();
-
-  // If input is empty, we also set output to zero size.
-  if (input.NumRows() == 0) {
-    output->Resize(0, 0);
-    return;
-  } else {
-    // store the last frame as it might be needed for padding when Flush() is
-    // called.
-    if (last_seen_input_frame_.Dim() != input.NumCols())
-      last_seen_input_frame_.Resize(input.NumCols());
-    last_seen_input_frame_.CopyFromVec(input.Row(input.NumRows() - 1));
-  }
-
-  // Checking if feature dimension matches that required by the neural network.
-  if (dim != nnet_.InputDim()) {
-    KALDI_ERR << "Feature dimension is " << dim << ", but network expects "
-        << nnet_.InputDim();
-  }
-  // num_effective_input_rows is the effective number of input rows we have, for
-  // purposes of computing how much output we will get.  It is the number of
-  // actual input rows plus the amount of context stored at intermediate layers
-  // of the network (which if we have previously done the computation, will
-  // equal nnet_.LeftContext() + nnet_.RightContext()).
-  int32 num_effective_input_rows = 0;
-  // Initialize the first element of data_, with input
-  CuMatrix<BaseFloat> &input_data(data_[0]);
-  if (is_first_chunk_)  {
-    is_first_chunk_ = false;
-    // assert that all the component-wise input buffers are empty
-    for (int32 i = 0; i < reusable_component_inputs_.size(); i++)
-      KALDI_ASSERT(reusable_component_inputs_[0].NumRows() == 0);
-    // Pad at the start of the file if necessary.
-    if ((pad_input_) && (nnet_.LeftContext() > 0))  {
-        input_data.Resize(nnet_.LeftContext() + input.NumRows(), dim);
-        input_data.Range(0, nnet_.LeftContext(), 0,
-                    dim).CopyRowsFromVec(input.Row(0));
-        input_data.Range(nnet_.LeftContext(), input.NumRows(),
-                    0, dim).CopyFromMat(input);
-    } else {
-      input_data.Resize(input.NumRows(), input.NumCols());
-      input_data.CopyFromMat(input);
-    }
-    num_effective_input_rows = input_data.NumRows();
-  } else {
-    int32 extra_input_rows = 0;
-    // checking if we did forward pass for any chunks before.
-    // if we did a forward pass, component input buffers would be non-empty
-    // these buffers store information equivalent to having an nnet_input
-    // buffer of (nnet_.LeftContext() + nnet_.RightContext())
-    for (int32 i = 0; i < reusable_component_inputs_.size(); i++)  {
-      if (reusable_component_inputs_[i].NumRows() > 0) {
-        extra_input_rows = nnet_.LeftContext() + nnet_.RightContext();
-        break;
-      }
-    }
-    // add unprocessed input from the previous calls
-    input_data.Resize(input.NumRows() + unprocessed_buffer_.NumRows(), dim);
-    if (unprocessed_buffer_.NumRows() > 0)
-      input_data.Range(0, unprocessed_buffer_.NumRows(),
-                       0, dim).CopyFromMat(unprocessed_buffer_);
-    input_data.Range(unprocessed_buffer_.NumRows(), input.NumRows(),
-                     0, dim).CopyFromMat(input);
-    unprocessed_buffer_.Resize(0, 0); // clearing the unprocessed buffer
-    num_effective_input_rows = input_data.NumRows() + extra_input_rows;
-  }
-  if (num_effective_input_rows >=
-      nnet_.LeftContext() + nnet_.RightContext() + 1) {
-    // we have sufficient frames to compute at least one nnet output
-    nnet_.ComputeChunkInfo(num_effective_input_rows, 1, &chunk_info_);
-    Propagate();
-    *output = data_.back();
-  } else {
-    // store the input in the unprocessed_buffer_
-    unprocessed_buffer_ = input_data;
-    // not enough input context so just return an empty array
-    output->Resize(0, 0);
-  }
-
-}
-
-void NnetOnlineComputer::Flush(CuMatrix<BaseFloat> *output) {
-  KALDI_ASSERT(!finished_ && !is_first_chunk_);
-  int32 num_frames_padding = (pad_input_ ? nnet_.RightContext() : 0);
-  int32 num_stored_frames = nnet_.LeftContext() + nnet_.RightContext();
-  int32 num_effective_input_rows =  num_stored_frames + num_frames_padding;
-  // If the amount of output would be empty return at this point.
-  if (num_effective_input_rows < nnet_.LeftContext() + nnet_.RightContext() + 1) {
-    output->Resize(0, 0);
-    finished_ = true;
-    return;
-  }
-
-  int32 dim = nnet_.InputDim();
-  CuMatrix<BaseFloat> &input_data(data_[0]);
-  KALDI_ASSERT(num_frames_padding > 0);  // else we would have returned above.
-  input_data.Resize(num_frames_padding, dim);
-  input_data.CopyRowsFromVec(last_seen_input_frame_);
-
-  // Note, we later modify this chunk-info, it isn't quite correct right now
-  // because we add extra data at intermediate layers, and the actual number of
-  // input rows doesn't equal num_effective_input_rows.
-  nnet_.ComputeChunkInfo(num_effective_input_rows, 1,
-                         &chunk_info_);
-  Propagate();
-  *output = data_.back();
-  finished_ = true;
-}
-
-void NnetOnlineComputer::Propagate() {
-  // This method is like the normal nnet propagate, but we reuse the frames
-  // computed from the previous chunk, at each component.
-
-  for (int32 c = 0; c < nnet_.NumComponents(); c++) {
-    // we assume that the chunks are always contiguous
-    chunk_info_[c].MakeOffsetsContiguous();
-    chunk_info_[c + 1].MakeOffsetsContiguous();
-
-    const Component &component = nnet_.GetComponent(c);
-    CuMatrix<BaseFloat> &input_data = data_[c], &output_data = data_[c + 1];
-    CuMatrix<BaseFloat> input_data_temp;
-
-    if (component.Context().size() > 1)  {
-      int32 dim = component.InputDim();
-      if (reusable_component_inputs_[c].NumRows() > 0) {
-        // concatenate any frames computed by previous component
-        // in the last call, to the input of the current component
-        input_data_temp.Resize(reusable_component_inputs_[c].NumRows()
-                               + input_data.NumRows(), dim);
-        input_data_temp.Range(0, reusable_component_inputs_[c].NumRows(),
-                       0, dim).CopyFromMat(reusable_component_inputs_[c]);
-        input_data_temp.Range(reusable_component_inputs_[c].NumRows(),
-                              input_data.NumRows(), 0, dim).CopyFromMat(
-                                  input_data);
-        input_data = input_data_temp;
-      }
-      // store any frames which can be reused in the next call
-      reusable_component_inputs_[c].Resize(component.Context().back() -
-                                component.Context().front(), dim);
-      reusable_component_inputs_[c].CopyFromMat(
-          input_data.RowRange(input_data.NumRows() -
-                              reusable_component_inputs_[c].NumRows(),
-                              reusable_component_inputs_[c].NumRows()));
-    }
-
-    // chunk_info objects provided assume that we added all the reusable
-    // context at the input of the nnet. However we are reusing hidden
-    // activations computed in the previous call.
-    // Hence we manipulate the chunk_info objects to reflect the state of the
-    // actual chunk, each component is computing, in the current Propagate.
-    // As before we always assume the chunks are contiguous.
-
-    // modifying the input chunk_info
-    int32 chunk_size_assumed = chunk_info_[c].ChunkSize();
-    int32 last_offset = chunk_info_[c].GetOffset(chunk_size_assumed - 1);
-    int32 first_offset = last_offset - input_data.NumRows() + 1;
-    ChunkInfo input_chunk_info(chunk_info_[c].NumCols(),
-                               chunk_info_[c].NumChunks(),
-                               first_offset,
-                               last_offset);
-    // modifying the output chunk_info
-    chunk_size_assumed = chunk_info_[c + 1].ChunkSize();
-    last_offset = chunk_info_[c + 1].GetOffset(chunk_size_assumed - 1);
-    first_offset = last_offset - (input_data.NumRows() -
-                                  (component.Context().back() -
-                                   component.Context().front())) + 1;
-    ChunkInfo output_chunk_info(chunk_info_[c + 1].NumCols(),
-                                chunk_info_[c + 1].NumChunks(),
-                                first_offset,
-                                last_offset);
-    component.Propagate(input_chunk_info, output_chunk_info,
-                        input_data, &output_data);
-  }
-}
-
-}  // namespace nnet2
-}  // namespace kaldi
diff --git a/src/nnet2/nnet-compute-online.h b/src/nnet2/nnet-compute-online.h
deleted file mode 100644
index 66a456d5538..00000000000
--- a/src/nnet2/nnet-compute-online.h
+++ /dev/null
@@ -1,110 +0,0 @@
-// nnet2/nnet-compute-online.h
-
-// Copyright 2014  Johns Hopkins University (author: Daniel Povey)
-//                 Guoguo Chen
-//                 Vijayaditya Peddinti
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_NNET_COMPUTE_ONLINE_H_
-#define KALDI_NNET2_NNET_COMPUTE_ONLINE_H_
-
-#include "nnet2/nnet-nnet.h"
-#include <vector>
-
-namespace kaldi {
-namespace nnet2 {
-
-/* This header provides functionality for doing forward computation in a situation
-   where you want to start from the beginning of a file and progressively compute
-   more, while re-using the hidden parts that (due to context) may be shared.
-   (note: this sharing is more of an issue in multi-splice networks where there is
-   splicing over time in the middle layers of the network).
-   Note: this doesn't do the final taking-the-log and correcting for the prior.
-   The current implementation is just an inefficient placeholder implementation;
-   later we'll modify it to properly use previously computed activations.
-*/
-
-class NnetOnlineComputer {
-
- public:
-  // All the inputs and outputs are of type CuMatrix, in case we're doing the
-  // computation on the GPU (of course, if there is no GPU, it backs off to
-  // using the CPU).
-  // You should initialize an object of this type for each utterance you want
-  // to decode.
-  
-  // Note: pad_input will normally be true; it means that at the start and end
-  // of the file, we pad with repeats of the first/last frame, so that the total
-  // number of frames it outputs is the same as the number of input frames.
-  NnetOnlineComputer(const Nnet &nnet,
-                     bool pad_input);
-
-  // This function works as follows: given a chunk of input (interpreted
-  // as following in time any previously supplied data), do the computation
-  // and produce all the frames of output we can.  In the middle of the
-  // file, the dimensions of input and output will be the same, but at
-  // the beginning of the file, output will have fewer frames than input
-  // due to required context.
-  // It is the responsibility of the user to keep track of frame indices, if
-  // required.  This class won't output any frame twice.
-  void Compute(const CuMatrixBase<BaseFloat> &input,
-               CuMatrix<BaseFloat> *output);
-  
-  // This flushes out the last frames of output; you call this when all
-  // input has finished.  It's invalid to call Compute or Flush after
-  // calling Flush.  It's valid to call Flush if no frames have been
-  // input or if no frames have been output; this produces empty output.
-  void Flush(CuMatrix<BaseFloat> *output);
-
- private:
-  void Propagate();
-
-  const Nnet &nnet_;
-
-  // data_ contains the intermediate stages and the output of the most recent
-  // computation.
-  std::vector<CuMatrix<BaseFloat> > data_;
-  
-  std::vector<ChunkInfo> chunk_info_;  // contains chunk_info(s) for the
-  // components
-
-  std::vector<CuMatrix<BaseFloat> > reusable_component_inputs_;  
-        // reusable data from previous chunk, this is a buffer to
-        // store the hidden activations before splice type components
-
-  CuMatrix<BaseFloat> unprocessed_buffer_;  // buffer to store unprocessed input
-  // from previous chunks (as we can have several chunks with insufficient
-  // context)
-  
-  CuVector<BaseFloat> last_seen_input_frame_;  // stores the last seen frame
-  // for the sake of right padding the input. This is useful to deal with the
-  // scenario where the initial component is not a splice component.
-
-  bool pad_input_;  // pad input at the beginning of the decode
-
-  bool is_first_chunk_;
-
-  bool finished_;  // forward-pass is complete
-
-  KALDI_DISALLOW_COPY_AND_ASSIGN(NnetOnlineComputer);
-};
-
-
-}  // namespace nnet2
-}  // namespace kaldi
-
-#endif  // KALDI_NNET2_NNET_COMPUTE_ONLINE_H_
diff --git a/src/nnet2/nnet-compute-test.cc b/src/nnet2/nnet-compute-test.cc
deleted file mode 100644
index 6f1ff5e2a9b..00000000000
--- a/src/nnet2/nnet-compute-test.cc
+++ /dev/null
@@ -1,134 +0,0 @@
-// nnet2/nnet-compute-test.cc
-
-// Copyright 2014  Johns Hopkins University (author:  Daniel Povey)
-// Copyright 2015  David Snyder
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-nnet.h"
-#include "nnet2/nnet-compute.h"
-#include "nnet2/nnet-compute-online.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-void UnitTestNnetCompute() {
-  int32 input_dim = 10 + rand() % 40, output_dim = 100 + rand() % 500;
-  bool pad_input = (rand() % 2 == 0);
-  
-  Nnet *nnet = GenRandomNnet(input_dim, output_dim);
-  KALDI_LOG << "Left context = " << nnet->LeftContext() << ", right context = "
-            << nnet->RightContext() << ", pad-input = " << pad_input;
-  KALDI_LOG << "NNet info is " << nnet->Info();
-  int32 num_feats = 5 + rand() % 1000;
-  CuMatrix<BaseFloat> input(num_feats, input_dim);
-  input.SetRandn();
-
-  int32 num_output_rows = num_feats -
-      (pad_input ? 0 : nnet->LeftContext() + nnet->RightContext());
-  if (num_output_rows <= 0)
-    return;
-  CuMatrix<BaseFloat> output1(num_output_rows, output_dim);
-  NnetComputation(*nnet, input, pad_input, &output1);
-
-  CuMatrix<BaseFloat> output2(output1.NumRows(), output1.NumCols());
-  int32 cur_input_pos = 0, cur_output_pos = 0;
-
-  NnetOnlineComputer computer(*nnet, pad_input);
-  while (cur_input_pos <= num_feats) {
-    int32 feats_left = num_feats - cur_input_pos;
-    CuMatrix<BaseFloat> output_part;
-    if (feats_left > 0) {
-      int32 chunk_size = std::min<int32>(1 + rand() % 10, feats_left);
-      CuSubMatrix<BaseFloat> input_part(input, cur_input_pos, chunk_size,
-                                        0, input_dim);
-      computer.Compute(input_part, &output_part);
-      cur_input_pos += chunk_size;
-    } else {
-      computer.Flush(&output_part);
-      cur_input_pos++; // will terminate the loop.
-    }
-    if (output_part.NumRows() != 0) {
-      output2.Range(cur_output_pos, output_part.NumRows(),
-                    0, output_dim).CopyFromMat(output_part);
-      cur_output_pos += output_part.NumRows();
-    }
-  }  
-  AssertEqual(output1, output2);
-  for (int32 i = 0; i < output1.NumRows(); i++) {
-    // just double-check that the frames near the end are right, in case
-    // the test above somehow passed despite that.
-    if (i < 10 || output1.NumRows() - i < 10) {
-      CuSubVector<BaseFloat> vec1(output1, i), vec2(output2, i);
-      AssertEqual(vec1, vec2);
-    }
-  }
-  KALDI_LOG << "OK";
-  delete nnet;
-}
-
-void UnitTestNnetComputeChunked() {
-  int32 input_dim = 10 + rand() % 40, output_dim = 100 + rand() % 500;
-  bool pad_input = true;
-  
-  Nnet *nnet = GenRandomNnet(input_dim, output_dim);
-  int32 num_feats = 100 + rand() % 500;
-  int32 chunk_size = num_feats / (2 + rand() % 10);
-  CuMatrix<BaseFloat> input(num_feats, input_dim);
-  input.SetRandn();
-
-  KALDI_LOG << "Left context = " << nnet->LeftContext() 
-            << ", right context = " << nnet->RightContext() 
-            << ", chunk size = " << chunk_size;
-  KALDI_LOG << "NNet info is " << nnet->Info();
-
-  int32 num_output_rows = num_feats;
-  CuMatrix<BaseFloat> cu_output1(num_output_rows, output_dim);
-  Matrix<BaseFloat> output2(num_output_rows, output_dim);
-  NnetComputation(*nnet, input, pad_input, &cu_output1);
-  NnetComputationChunked(*nnet, Matrix<BaseFloat>(input), chunk_size, 
-                         &output2);
-  Matrix<BaseFloat> output1(cu_output1);
-  AssertEqual(output1, output2);
-  for (int32 i = 0; i < output1.NumRows(); i++) {
-    // just double-check that the frames near the end are right, in case
-    // the test above somehow passed despite that.
-    if (i < 10 || output1.NumRows() - i < 10) {
-      SubVector<BaseFloat> vec1(output1, i), vec2(output2, i);
-      AssertEqual(vec1, vec2);
-    }
-  }
-  KALDI_LOG << "OK";
-  delete nnet;
-}
-
-}  // namespace nnet2
-}  // namespace kaldi
-
-#include "matrix/matrix-functions.h"
-
-
-int main() {
-  using namespace kaldi;
-  using namespace kaldi::nnet2;
-
-  for (int32 i = 0; i < 10; i++) 
-    UnitTestNnetCompute();
-    UnitTestNnetComputeChunked();
-  return 0;
-}
-  
diff --git a/src/nnet2/nnet-compute.cc b/src/nnet2/nnet-compute.cc
deleted file mode 100644
index 9f2fe1ebcc8..00000000000
--- a/src/nnet2/nnet-compute.cc
+++ /dev/null
@@ -1,224 +0,0 @@
-// nnet2/nnet-compute.cc
-
-// Copyright 2012   Johns Hopkins University (author: Daniel Povey)
-// Copyright 2015   David Snyder
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-compute.h"
-#include "hmm/posterior.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-/*
-  This class does the forward and possibly backward computation for (typically)
-  a whole utterance of contiguous features.  You'll instantiate one of
-  these classes each time you want to do this computation.
-*/
-class NnetComputer {
- public:
-  /* Initializer.  If pad == true, pad input with nnet.LeftContext() frames on
-     the left and nnet.RightContext() frames on the right (duplicate the first
-     and last frames.) */
-  NnetComputer(const Nnet &nnet,
-               const CuMatrixBase<BaseFloat> &input_feats,
-               bool pad, 
-               Nnet *nnet_to_update = NULL);
-  
-  /// The forward-through-the-layers part of the computation.
-  void Propagate();
-  
-  void Backprop(CuMatrix<BaseFloat> *tmp_deriv);
-                
-  
-  /// Computes objf derivative at last layer, and returns objective
-  /// function summed over labels and multiplied by utterance_weight.
-  /// [Note: utterance_weight will normally be 1.0].
-  BaseFloat ComputeLastLayerDeriv(const Posterior &pdf_post,
-                                  CuMatrix<BaseFloat> *deriv) const;
-  
-  CuMatrixBase<BaseFloat> &GetOutput() { return forward_data_.back(); }
-  
- private:  
-  const Nnet &nnet_;
-  std::vector<CuMatrix<BaseFloat> > forward_data_;
-  Nnet *nnet_to_update_; // May be NULL, if just want objective function
-  // but no gradient info or SGD.
-  std::vector <ChunkInfo> chunk_info_;
-};
-
-NnetComputer::NnetComputer(const Nnet &nnet,
-                           const CuMatrixBase<BaseFloat> &input_feats,
-                           bool pad,
-                           Nnet *nnet_to_update):
-    nnet_(nnet), nnet_to_update_(nnet_to_update) {
-  int32 dim = input_feats.NumCols();
-  if (dim != nnet.InputDim()) {
-    KALDI_ERR << "Feature dimension is " << dim << " but network expects "
-              << nnet.InputDim();
-  }
-  forward_data_.resize(nnet.NumComponents() + 1);
-
-  int32 left_context = (pad ? nnet_.LeftContext() : 0),
-       right_context = (pad ? nnet_.RightContext() : 0);
-
-  int32 num_rows = left_context + input_feats.NumRows() + right_context;
-  nnet.ComputeChunkInfo(num_rows, 1, &chunk_info_);
-
-  CuMatrix<BaseFloat> &input(forward_data_[0]);
-  input.Resize(num_rows, dim);
-  input.Range(left_context, input_feats.NumRows(),
-              0, dim).CopyFromMat(input_feats);
-  for (int32 i = 0; i < left_context; i++)
-    input.Row(i).CopyFromVec(input_feats.Row(0));
-  int32 last_row = input_feats.NumRows() - 1;
-  for (int32 i = 0; i < right_context; i++)
-    input.Row(num_rows - i - 1).CopyFromVec(input_feats.Row(last_row));
-}
-
-
-/// This is the forward part of the computation.
-void NnetComputer::Propagate() {
-  for (int32 c = 0; c < nnet_.NumComponents(); c++) {
-    const Component &component = nnet_.GetComponent(c);
-    CuMatrix<BaseFloat> &input = forward_data_[c],
-                     &output = forward_data_[c+1];
-    component.Propagate(chunk_info_[c], chunk_info_[c+1], input, &output);
-    const Component *prev_component = (c == 0 ? NULL : &(nnet_.GetComponent(c-1)));
-    bool will_do_backprop = (nnet_to_update_ != NULL),
-         keep_last_output = will_do_backprop &&
-                             ((c>0 && prev_component->BackpropNeedsOutput()) ||
-                              component.BackpropNeedsInput());
-    if (!keep_last_output)
-      forward_data_[c].Resize(0, 0); // We won't need this data; save memory.
-  }
-}
-
-BaseFloat NnetComputer::ComputeLastLayerDeriv(const Posterior &pdf_post,
-                                              CuMatrix<BaseFloat> *deriv) const {
-  // TODO: convert this to proper CUDA code, c.f. ComputeObjfAndDeriv
-  // in nnet-update.cc (I'm not sure, though, that this code is ever reached.)
-  int32 num_components = nnet_.NumComponents();
-  double tot_objf = 0.0, tot_weight = 0.0;
-  const CuMatrix<BaseFloat> &last_layer_output = forward_data_[num_components];
-  int32 num_frames = last_layer_output.NumRows(),
-          num_pdfs = last_layer_output.NumCols();
-  KALDI_ASSERT(pdf_post.size() == static_cast<size_t>(num_frames));
-  deriv->Resize(num_frames, num_pdfs); // will zero it.
-  for (int32 i = 0; i < deriv->NumRows(); i++) {
-    for (size_t j = 0; j < pdf_post[i].size(); j++) {
-      int32 label = pdf_post[i][j].first;
-      BaseFloat weight = pdf_post[i][j].second;
-      KALDI_ASSERT(label >= 0 && label < num_pdfs);
-      BaseFloat this_prob = last_layer_output(i, label);
-      KALDI_ASSERT(this_prob > 0.99e-20); // We floored to 1.0e-20 in SoftmaxLayer.
-      tot_objf += weight * Log(this_prob);
-      tot_weight += weight;
-      (*deriv)(i, label) += weight / this_prob; // could be "=", assuming the
-      // labels are all distinct.
-    }
-  }
-  KALDI_VLOG(4) << "Objective function is " << (tot_objf/tot_weight) <<
-      " per frame over " << tot_weight << " samples.";
-  return tot_objf;  
-}
-
-
-void NnetComputer::Backprop(CuMatrix<BaseFloat> *tmp_deriv) {
-  KALDI_ASSERT(nnet_to_update_ != NULL); // Or why do backprop?
-  // If later this reasoning changes, we can change this
-  // statement and add logic to make component_to_update, below,
-  // NULL if necessary.
-  
-  for (int32 c = nnet_.NumComponents() - 1; c >= 0; c--) {
-    const Component &component = nnet_.GetComponent(c);
-    Component *component_to_update = &(nnet_to_update_->GetComponent(c));
-    const CuMatrix<BaseFloat>  &input = forward_data_[c],
-                            &output = forward_data_[c+1],
-                      &output_deriv = *tmp_deriv;
-    CuMatrix<BaseFloat> input_deriv;
-    component.Backprop(chunk_info_[c], chunk_info_[c+1], input, output, output_deriv, 
-                       component_to_update, &input_deriv);
-    *tmp_deriv = input_deriv;
-  }
-}
-
-void NnetComputation(const Nnet &nnet,
-                     const CuMatrixBase<BaseFloat> &input,  // features
-                     bool pad_input,
-                     CuMatrixBase<BaseFloat> *output) {
-  NnetComputer nnet_computer(nnet, input, pad_input, NULL);
-  nnet_computer.Propagate();
-  output->CopyFromMat(nnet_computer.GetOutput());
-}
-
-void NnetComputationChunked(const Nnet &nnet,
-                     const Matrix<BaseFloat> &input,  // features
-                     int32 chunk_size,
-                     Matrix<BaseFloat> *output) {
-  int32 num_rows,
-       num_chunks = ceil((BaseFloat)input.NumRows() / chunk_size),
-       dim = input.NumCols(),
-       left_context = nnet.LeftContext(),
-       right_context = nnet.RightContext();
-  Matrix<BaseFloat> full_input;
-  num_rows = left_context + input.NumRows() + right_context;
-  full_input.Resize(num_rows, dim);
-  full_input.Range(left_context, input.NumRows(),
-            0, dim).CopyFromMat(input);
-  for (int32 i = 0; i < left_context; i++)
-    full_input.Row(i).CopyFromVec(input.Row(0));
-  int32 last_row = input.NumRows() - 1;
-  for (int32 i = 0; i < right_context; i++)
-    full_input.Row(num_rows - i - 1).CopyFromVec(input.Row(last_row));
-
-  for (int32 i = 0; i < num_chunks; i++) {
-    int32 index = i * chunk_size,
-          offset = std::min(num_rows - chunk_size * i, 
-                            left_context + chunk_size + right_context);
-    SubMatrix<BaseFloat> chunk_input(full_input, index, offset, 0, dim);
-    CuMatrix<BaseFloat> cu_chunk_input(chunk_input);
-
-    // Note: we have already accounted for input padding, so we pass
-    // pad_input==false to the NnetComputer.
-    NnetComputer nnet_computer(nnet, cu_chunk_input, false, NULL);
-    nnet_computer.Propagate();
-    CuMatrix<BaseFloat> cu_chunk_output(nnet_computer.GetOutput());
-    SubMatrix<BaseFloat> chunk_out(*output, i * chunk_size, 
-                           cu_chunk_output.NumRows(), 0, 
-                           cu_chunk_output.NumCols());
-    chunk_out.CopyFromMat(cu_chunk_output);
-  }
-}
-
-BaseFloat NnetGradientComputation(const Nnet &nnet,
-                                  const CuMatrixBase<BaseFloat> &input,
-                                  bool pad_input,
-                                  const Posterior &pdf_post,
-                                  Nnet *nnet_to_update) {
-  NnetComputer nnet_computer(nnet, input, pad_input, nnet_to_update);
-  nnet_computer.Propagate();
-  CuMatrix<BaseFloat> deriv;
-  BaseFloat ans;
-  ans = nnet_computer.ComputeLastLayerDeriv(pdf_post, &deriv);  
-  nnet_computer.Backprop(&deriv);
-  return ans;
-}
-
-
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/nnet-compute.h b/src/nnet2/nnet-compute.h
deleted file mode 100644
index 875252fd260..00000000000
--- a/src/nnet2/nnet-compute.h
+++ /dev/null
@@ -1,85 +0,0 @@
-// nnet2/nnet-compute.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-// Copyright 2015  David Snyder
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_NNET_COMPUTE_H_
-#define KALDI_NNET2_NNET_COMPUTE_H_
-
-#include "nnet2/nnet-nnet.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-/* This header provides functionality for doing forward computation and
-   backpropagation for whole chunks of features, e.g. whole utterances.  The
-   code in nnet-update.h is designed for sample-by-sample computation.
-*/
-
-
-/**
-  Does the basic neural net computation, on a sequence of data (e.g.
-  an utterance).  If pad_input==true we'll pad the input with enough
-  frames of context, and the output will be a matrix of #frames by
-  the output-dim of the network, typically representing state-level
-  posteriors.   If pad_input==false we won't do this and the
-  output will have a lower #frames than the input; we lose
-  nnet.LeftContext() at the left and nnet.RightContext() at the
-  output.
-*/
-void NnetComputation(const Nnet &nnet,
-                     const CuMatrixBase<BaseFloat> &input,  // features
-                     bool pad_input,
-                     CuMatrixBase<BaseFloat> *output); // posteriors.
-/**
-  Does the basic neural net computation, on a sequence of data (e.g.
-  an utterance).  This variant of NnetComputation chunks the input
-  according to chunk_size and does the posterior computation chunk 
-  by chunk.  This allows the computation to be performed on the GPU
-  when the input matrix is very large.  Input is padded with enough
-  frames of context so that the output will be a matrix with 
-  input.NumRows().
-*/
-void NnetComputationChunked(const Nnet &nnet,
-                     const Matrix<BaseFloat> &input,  // features
-                     int32 chunk_size,
-                     Matrix<BaseFloat> *output); // posteriors.
-
-/** Does the neural net computation and backprop, given input and labels.
-    Note: if pad_input==true the number of rows of input should be the
-    same as the number of labels, and if false, you should omit
-    nnet.LeftContext() labels on the left and nnet.RightContext() on
-    the right.  If nnet_to_update == &nnet, then this does stochastic
-    gradient descent, otherwise (assuming you have called SetZero(true)
-    on *nnet_to_update) it will compute the gradient on this data.
-    Returns the total objective function summed over the frames, times
-    the utterance weight.
-*/
-BaseFloat NnetGradientComputation(const Nnet &nnet,
-                                  const MatrixBase<BaseFloat> &input,
-                                  bool pad_input,
-                                  BaseFloat utterance_weight,
-                                  const std::vector<int32> &labels,
-                                  Nnet *nnet_to_update);
-
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif // KALDI_NNET2_NNET_COMPUTE_H_
diff --git a/src/nnet2/nnet-example-functions-test.cc b/src/nnet2/nnet-example-functions-test.cc
deleted file mode 100644
index 03484148f6f..00000000000
--- a/src/nnet2/nnet-example-functions-test.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-// nnet2/nnet-example-functions-test.cc
-
-// Copyright 2013  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-example-functions.h"
-#include "util/common-utils.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-// Note: most of these functions we're testing from the command line,
-// this is just to test the function to solve the packing problem.
-
-void UnitTestSolvePackingProblem() {
-  size_t size = Rand() % 20;
-  std::vector<BaseFloat> item_costs;
-  for (size_t i = 0; i < size; i++) {
-    item_costs.push_back(0.5 * (Rand() % 15));
-  }
-  BaseFloat max_cost = 0.66 + Rand() % 5;
-
-  std::vector<std::vector<size_t> > groups;
-  SolvePackingProblem(max_cost, item_costs, &groups);
-  
-  std::vector<size_t> all_indices;
-  for (size_t i = 0; i < groups.size(); i++) {
-    BaseFloat this_group_cost = 0.0;
-    for (size_t j = 0; j < groups[i].size(); j++) {
-      size_t index = groups[i][j];
-      all_indices.push_back(index);
-      this_group_cost += item_costs[index];
-    }
-    KALDI_ASSERT(!groups[i].empty());
-    KALDI_ASSERT(groups[i].size() == 1 || this_group_cost <= max_cost);
-  }
-  SortAndUniq(&all_indices);
-  KALDI_ASSERT(all_indices.size() == size);
-  if (!all_indices.empty())
-    KALDI_ASSERT(all_indices.back() + 1 == size);
-}
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-
-int main() {
-  using namespace kaldi;
-  using namespace kaldi::nnet2;
-  using kaldi::int32;
-  for (int32 i = 0; i < 10; i++)
-    UnitTestSolvePackingProblem();
-}
-
diff --git a/src/nnet2/nnet-example-functions.cc b/src/nnet2/nnet-example-functions.cc
deleted file mode 100644
index 87184cd16e4..00000000000
--- a/src/nnet2/nnet-example-functions.cc
+++ /dev/null
@@ -1,997 +0,0 @@
-// nnet2/nnet-example-functions.cc
-
-// Copyright 2012-2013  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-example-functions.h"
-#include "lat/lattice-functions.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-bool LatticeToDiscriminativeExample(
-    const std::vector<int32> &alignment,
-    const Matrix<BaseFloat> &feats,
-    const CompactLattice &clat,
-    BaseFloat weight,
-    int32 left_context,
-    int32 right_context,
-    DiscriminativeNnetExample *eg) {
-  KALDI_ASSERT(left_context >= 0 && right_context >= 0);
-  int32 num_frames = alignment.size();
-  if (num_frames == 0) {
-    KALDI_WARN << "Empty alignment";
-    return false;
-  }
-  if (num_frames != feats.NumRows()) {
-    KALDI_WARN << "Dimension mismatch: alignment " << num_frames
-               << " versus feats " << feats.NumRows();
-    return false;
-  }
-  std::vector<int32> times;
-  int32 num_frames_clat = CompactLatticeStateTimes(clat, &times);  
-  if (num_frames_clat != num_frames) {
-    KALDI_WARN << "Numerator/frames versus denlat frames mismatch: "
-               << num_frames << " versus " << num_frames_clat;
-    return false;
-  }
-  eg->weight = weight;
-  eg->num_ali = alignment;
-  eg->den_lat = clat;
-
-  int32 feat_dim = feats.NumCols();
-  eg->input_frames.Resize(left_context + num_frames + right_context,
-                          feat_dim);
-  eg->input_frames.Range(left_context, num_frames,
-                         0, feat_dim).CopyFromMat(feats);
-
-  // Duplicate the first and last frames.
-  for (int32 t = 0; t < left_context; t++)
-    eg->input_frames.Row(t).CopyFromVec(feats.Row(0));
-  for (int32 t = 0; t < right_context; t++)
-    eg->input_frames.Row(left_context + num_frames + t).CopyFromVec(
-        feats.Row(num_frames - 1));
-
-  eg->left_context = left_context;
-  eg->Check();
-  return true;
-}
-
-
-
-
-
-
-/**
-   For each frame, judge:
-     - does it produce a nonzero derivative? [this differs MMI vs MPE]
-     - can it be split here [or what is the penalty for splitting here.]
-         - depends whether lattice has just one path at that point.
-
-   Time taken to process segment of a certain length: [must be sub-linear.]
-      [use quadratic function that's max at specified segment length and zero at zero.]
-
-   No penalty for processing frames we don't need to process (already implicit in
-   segment-processing time above.)
-
-   Penalty for splitting where we should not split.  [Make it propto log(#paths).]
-   
- */
-
-
-
-
-
-class DiscriminativeExampleSplitter {
- public:
-  DiscriminativeExampleSplitter(
-      const SplitDiscriminativeExampleConfig &config,
-      const TransitionModel &tmodel,
-      const DiscriminativeNnetExample &eg,
-      std::vector<DiscriminativeNnetExample> *egs_out):
-      config_(config), tmodel_(tmodel), eg_(eg), egs_out_(egs_out) { }
-
-  void Excise(SplitExampleStats *stats) {
-    eg_.Check();
-    PrepareLattice(false);
-    ComputeFrameInfo();
-    if (!config_.excise) {
-      egs_out_->resize(1);
-      (*egs_out_)[0] = eg_;
-    } else {
-      DoExcise(stats);
-    }
-  }
-  
-  void Split(SplitExampleStats *stats) {
-    if (!config_.split) {
-      egs_out_->resize(1);
-      (*egs_out_)[0] = eg_;
-    } else {
-      eg_.Check();    
-      PrepareLattice(true);
-      ComputeFrameInfo();
-      DoSplit(stats);
-    }
-  }
-
- private:
-  typedef LatticeArc Arc;
-  typedef Arc::StateId StateId;
-  typedef Arc::Label Label;
-
-  // converts compact lattice to lat_.  You should set first_time to true if
-  // this is being called from DoSplit, but false if being called from DoExcise
-  // (this saves some time, since we avoid some preparation steps that we know
-  // are unnecessary because they were done before
-  void PrepareLattice(bool first_time); 
-
-  void CollapseTransitionIds(); // Modifies the transition-ids on lat_ so that
-                                // on each frame, there is just one with any
-                                // given pdf-id.  This allows us to determinize
-                                // and minimize more completely.
-  
-  bool ComputeFrameInfo();
-
-  static void RemoveAllOutputSymbols (Lattice *lat);
-
-  void OutputOneSplit(int32 seg_begin, int32 seg_end);
-  
-  void DoSplit(SplitExampleStats *stats);
-
-  void DoExcise(SplitExampleStats *stats);
-  
-  int32 NumFrames() const { return static_cast<int32>(eg_.num_ali.size()); }
-
-  int32 RightContext() { return eg_.input_frames.NumRows() - NumFrames() - eg_.left_context; }
-  
-
-  // Put in lat_out, a slice of "clat" with first frame at time "seg_begin" and
-  // with last frame at time "seg_end - 1".
-  void CreateOutputLattice(int32 seg_begin, int32 seg_end,
-                           CompactLattice *clat_out);
-
-  // Returns the state-id in this output lattice (creates a
-  // new state if needed).
-  StateId GetOutputStateId(StateId s,
-                           unordered_map<StateId, StateId> *state_map,
-                           Lattice *lat_out);           
-
-  struct FrameInfo {
-    int32 den_state_count;
-    int32 den_pdf_count; // number of distinct pdfs in denominator lattice
-    bool multiple_transition_ids; // true if there are multiple distinct
-                                  // transition-ids in the denominator lattice
-                                  // at this point
-    bool num_den_overlap; // true if num and den overlap.
-
-    bool nonzero_derivative; // True if we need to keep this frame because the
-    // derivative is nonzero on this frame.
-    bool can_excise_frame; // True if the frame, if part of a segment, can be
-    // excised, *but ignoring the effect of acoustic
-    // context*.  I.e. true if the likelihoods and
-    // derivatives from this frame do not matter because
-    // the derivatives are zero and the likelihoods don't
-    // affect lattice posteriors (because pdfs are all
-    // the same on this frame, or if doing mpfe,
-    // transition-ids are all the same.
-
-    // start_state says, for a segment starting at frame t, what is the
-    // earliest state in lat_ that we have to consider including in the split
-    // lattice?  This relates to a kind of optimization for efficiency.
-    StateId start_state;
-
-    // end_state says, for a segment whose final frame is time t (i.e.  whose
-    // "segment end" is time t+1), what is the latest state in lat_ that we have
-    // to consider including in the split lattice?  This relates to a kind of
-    // optimization for efficiency.
-    StateId end_state;  
-    FrameInfo(): den_state_count(0), den_pdf_count(0),
-                 multiple_transition_ids(false),
-                 num_den_overlap(false), nonzero_derivative(false),
-                 can_excise_frame(false),
-                 start_state(std::numeric_limits<int32>::max()), end_state(0) { }
-  };
-  
-  
-  // The following variables are set in the initializer:
-  const SplitDiscriminativeExampleConfig &config_;
-  const TransitionModel &tmodel_;
-  const DiscriminativeNnetExample &eg_;
-  std::vector<DiscriminativeNnetExample> *egs_out_;
-  
-  Lattice lat_; // lattice generated from eg_.den_lat, with epsilons removed etc.
-
-
-  // The other variables are computed by Split() or functions called from it.
-
-  std::vector<FrameInfo> frame_info_;
-  
-  // state_times_ says, for each state in lat_, what its start time is.
-  std::vector<int32> state_times_;
-
-};
-
-// Make sure that for any given pdf-id and any given frame, the den-lat has
-// only one transition-id mapping to that pdf-id, on the same frame.
-// It helps us to more completely minimize the lattice.  Note: we
-// can't do this if the criterion is MPFE, because in that case the
-// objective function will be affected by the phone-identities being
-// different even if the pdf-ids are the same.
-void DiscriminativeExampleSplitter::CollapseTransitionIds() {
-  std::vector<int32> times;
-  TopSort(&lat_); // Topologically sort the lattice (required by
-                  // LatticeStateTimes)
-  int32 num_frames = LatticeStateTimes(lat_, &times);  
-  StateId num_states = lat_.NumStates();
-
-  std::vector<std::map<int32, int32> > pdf_to_tid(num_frames);
-  for (StateId s = 0; s < num_states; s++) {
-    int32 t = times[s];
-    for (fst::MutableArcIterator<Lattice> aiter(&lat_, s);
-         !aiter.Done(); aiter.Next()) {
-      KALDI_ASSERT(t >= 0 && t < num_frames);
-      Arc arc = aiter.Value();
-      KALDI_ASSERT(arc.ilabel != 0 && arc.ilabel == arc.olabel);
-      int32 pdf = tmodel_.TransitionIdToPdf(arc.ilabel);
-      if (pdf_to_tid[t].count(pdf) != 0) {
-        arc.ilabel = arc.olabel = pdf_to_tid[t][pdf];
-        aiter.SetValue(arc);
-      } else {
-        pdf_to_tid[t][pdf] = arc.ilabel;
-      }
-    }
-  }    
-}
-
-
-void DiscriminativeExampleSplitter::PrepareLattice(bool first_time) {
-  ::fst::ConvertLattice(eg_.den_lat, &lat_);
-
-  Project(&lat_, fst::PROJECT_INPUT); // Get rid of the word labels and put the
-                                      // transition-ids on both sides.
-  
-  RmEpsilon(&lat_); // Remove epsilons.. this simplifies
-                    // certain things.
-
-  if (first_time) {
-    if (config_.collapse_transition_ids && config_.criterion != "mpfe")
-      CollapseTransitionIds();
-  
-    if (config_.determinize) {
-      if (!config_.minimize) {
-        Lattice det_lat;
-        Determinize(lat_, &det_lat);
-        lat_ = det_lat;
-      } else {
-        Lattice tmp_lat;
-        Reverse(lat_, &tmp_lat);
-        Determinize(tmp_lat, &lat_);
-        Reverse(lat_, &tmp_lat);
-        Determinize(tmp_lat, &lat_);
-        RmEpsilon(&lat_);
-        // Previously we determinized, then did
-        // Minimize(&lat_);
-        // but this was too slow.
-      }
-    }
-  }
-  TopSort(&lat_); // Topologically sort the lattice.
-}
-
-// this function computes various arrays that say something about
-// this frame of the lattice.
-bool DiscriminativeExampleSplitter::ComputeFrameInfo() {
-  
-  int32 num_frames = NumFrames();
-
-  frame_info_.clear();
-  frame_info_.resize(num_frames + 1);
-  
-  LatticeStateTimes(lat_, &state_times_);
-
-  std::vector<std::set<int32> > pdfs_per_frame(num_frames),
-      tids_per_frame(num_frames);
-  
-  int32 num_states = lat_.NumStates();
-  
-  for (int32 state = 0; state < num_states; state++) {
-    int32 t = state_times_[state];
-    KALDI_ASSERT(t >= 0 && t <= num_frames);
-    frame_info_[t].den_state_count++;
-    for (fst::ArcIterator<Lattice> aiter(lat_, state); !aiter.Done();
-         aiter.Next()) {
-      const LatticeArc &arc = aiter.Value();
-      KALDI_ASSERT(arc.ilabel != 0 && arc.ilabel == arc.olabel); 
-      int32 transition_id = arc.ilabel,
-          pdf_id = tmodel_.TransitionIdToPdf(transition_id);
-      tids_per_frame[t].insert(transition_id);
-      pdfs_per_frame[t].insert(pdf_id);
-    }
-    if (t < num_frames)
-      frame_info_[t+1].start_state = std::min(state,
-                                              frame_info_[t+1].start_state);
-    frame_info_[t].end_state = std::max(state,
-                                        frame_info_[t].end_state);
-  }
-
-  for (int32 i = 1; i <= NumFrames(); i++)
-    frame_info_[i].end_state = std::max(frame_info_[i-1].end_state,
-                                        frame_info_[i].end_state);
-  for (int32 i = NumFrames() - 1; i >= 0; i--)
-    frame_info_[i].start_state = std::min(frame_info_[i+1].start_state,
-                                          frame_info_[i].start_state);
-  
-  for (int32 t = 0; t < num_frames; t++) {
-    FrameInfo &frame_info = frame_info_[t];
-    int32 transition_id = eg_.num_ali[t],
-        pdf_id = tmodel_.TransitionIdToPdf(transition_id);
-    frame_info.num_den_overlap = (pdfs_per_frame[t].count(pdf_id) != 0);
-    frame_info.multiple_transition_ids = (tids_per_frame[t].size() > 1);
-    KALDI_ASSERT(!pdfs_per_frame[t].empty());
-    frame_info.den_pdf_count = pdfs_per_frame[t].size();
-
-    if (config_.criterion == "mpfe" || config_.criterion == "smbr") {
-      frame_info.nonzero_derivative = (frame_info.den_pdf_count > 1);
-    } else {
-      KALDI_ASSERT(config_.criterion == "mmi");
-      if (config_.drop_frames) {
-        // With frame dropping, we'll get nonzero derivative only
-        // if num and den overlap, *and* den has >1 active pdf.
-        frame_info.nonzero_derivative = frame_info.num_den_overlap  &&
-            frame_info.den_state_count > 1;
-      } else {
-        // Without frame dropping, we'll get nonzero derivative if num and den
-        // do not overlap , or den has >1 active pdf.
-        frame_info.nonzero_derivative = !frame_info.num_den_overlap ||
-            frame_info.den_state_count > 1;
-      }
-    }
-    // If a frame is part of a segment, but it's not going to contribute
-    // to the derivative and the den lattice has only one pdf active
-    // at that time, then this frame can be excised from the lattice
-    // because it will not affect the posteriors around it.
-    if (config_.criterion == "mpfe") {
-      frame_info.can_excise_frame =
-          !frame_info.nonzero_derivative && \
-          !frame_info.multiple_transition_ids;
-      // in the mpfe case, if there are multiple transition-ids on a
-      // frame there may be multiple phones on a frame, which could
-      // contribute to the objective function even if they share pdf-ids.
-      // (this was an issue that came up during testing).
-    } else {      
-      frame_info.can_excise_frame =
-          !frame_info.nonzero_derivative && frame_info.den_pdf_count == 1;
-    }
-  }
-  return true;
-}
-
-
-/* Excising a frame means removing a frame from the lattice and removing the
-   corresponding feature.  We can only do this if it would not affect the
-   derivatives because the current frame has zero derivative and also all the
-   den-lat pdfs are the same on this frame (so removing the frame doesn't affect
-   the lattice posteriors).  But we can't remove a frame if doing so would
-   affect the acoustic context.  Generally speaking we must keep all frames
-   that are within LeftContext() to the left and RightContext() to the right
-   of a frame that we can't excise, *but* it's OK at the edges of a segment
-   even if they are that close to other frames, because we anyway keep a few
-   frames of context at the edges, and we can just make sure to keep the
-   *right* few frames of context.
-   */
-void DiscriminativeExampleSplitter::DoExcise(SplitExampleStats *stats) {
-  int32 left_context = eg_.left_context,
-      right_context = RightContext(),
-      num_frames = NumFrames();
-  // Compute, for each frame, whether we can excise it.
-  // 
-  std::vector<bool> can_excise(num_frames, false);
-  
-  bool need_some_frame = false;
-  for (int32 t = 0; t < num_frames; t++) {
-    can_excise[t] = frame_info_[t].can_excise_frame;
-    if (!can_excise[t])
-      need_some_frame = true;
-  }
-  if (!need_some_frame) { // We don't need any frame within this file, so simply
-                          // delete the segment.
-    KALDI_WARN << "Example completely removed when excising."; // unexpected,
-    // as the segment should have been deleted when splitting.
-    egs_out_->clear();
-    return;
-  }
-  egs_out_->resize(1);
-  DiscriminativeNnetExample &eg_out = (*egs_out_)[0];
-
-  // start_t and end_t will be the central part of the segment, excluding any
-  // frames at the edges that we can excise.
-  int32 start_t, end_t;
-  for (start_t = 0; can_excise[start_t]; start_t++);
-  for (end_t = num_frames; can_excise[end_t-1]; end_t--);
-
-  // for frames from start_t to end_t-1, do not excise them if
-  // they are within the context-window of a frame that we need to keep.
-  // Note: we do t2 = t - right_context to t + left_context, because we're
-  // concerned whether frame t2 has frame t in its window... it might
-  // seem a bit backwards.
-  std::vector<bool> will_excise(can_excise);
-  for (int32 t = start_t; t < end_t; t++) {
-    for (int32 t2 = t - right_context; t2 <= t + left_context; t2++)
-      if (t2 >= start_t && t2 < end_t && !can_excise[t2])
-        will_excise[t] = false; // can't excise this frame, it's needed for
-                                // context.
-  }
-
-  // Remove all un-needed frames from the lattice by replacing the
-  // symbols with epsilon and then removing the epsilons.
-  // Note, this operation is destructive (it changes lat_).
-  int32 num_states = lat_.NumStates();
-  for (int32 state = 0; state < num_states; state++) {
-    int32 t = state_times_[state];
-    for (::fst::MutableArcIterator<Lattice> aiter(&lat_, state); !aiter.Done();
-         aiter.Next()) {
-      Arc arc = aiter.Value();
-      if (will_excise[t]) {
-        arc.ilabel = arc.olabel = 0;
-        aiter.SetValue(arc);
-      }
-    }
-  }
-  RmEpsilon(&lat_);
-  RemoveAllOutputSymbols(&lat_);
-  ConvertLattice(lat_, &eg_out.den_lat);
-
-  eg_out.num_ali.clear();
-  int32 num_frames_kept = 0;
-  for (int32 t = 0; t < num_frames; t++) {
-    if (!will_excise[t]) {
-      eg_out.num_ali.push_back(eg_.num_ali[t]);
-      num_frames_kept++;
-    }
-  }
-
-  stats->num_frames_kept_after_excise += num_frames_kept;
-  stats->longest_segment_after_excise = std::max(stats->longest_segment_after_excise,
-                                                 num_frames_kept);
-  
-  int32 num_frames_kept_plus = num_frames_kept + left_context + right_context;
-  eg_out.input_frames.Resize(num_frames_kept_plus,
-                             eg_.input_frames.NumCols());
-
-  // the left-context of the output will be shifted to the right by
-  // start_t.
-  for (int32 i = 0; i < left_context; i++) {
-    SubVector<BaseFloat> dst(eg_out.input_frames, i);
-    SubVector<BaseFloat> src(eg_.input_frames, start_t + i);
-    dst.CopyFromVec(src);
-  }
-  // the right-context will also be shifted, we take the frames
-  // to the right of end_t.
-  for (int32 i = 0; i < right_context; i++) {
-    SubVector<BaseFloat> dst(eg_out.input_frames,
-                             num_frames_kept + left_context + i);
-    SubVector<BaseFloat> src(eg_.input_frames,
-                             end_t + left_context + i);
-    dst.CopyFromVec(src);
-  }
-  // now copy the central frames (those that were not excised).
-  int32 dst_t = 0;
-  for (int32 t = start_t; t < end_t; t++) {
-    if (!will_excise[t]) {
-      SubVector<BaseFloat> dst(eg_out.input_frames,
-                               left_context + dst_t);
-      SubVector<BaseFloat> src(eg_.input_frames,
-                               left_context + t);
-      dst.CopyFromVec(src);
-      dst_t++;
-    }
-  }
-  KALDI_ASSERT(dst_t == num_frames_kept);
-
-
-  eg_out.weight = eg_.weight;
-  eg_out.left_context = eg_.left_context;
-  eg_out.spk_info = eg_.spk_info;
-
-  eg_out.Check();
-}
-
-
-void DiscriminativeExampleSplitter::DoSplit(SplitExampleStats *stats) {
-  std::vector<int32> split_points;
-  int32 num_frames = NumFrames();
-  {
-    // Make the "split points" 0 and num_frames, and
-    // any frame that has just one state on it and the previous
-    // frame had >1 state.  This gives us one split for each
-    // "pinch point" in the lattice.  Later we may move each split
-    // to a more optimal location.
-    split_points.push_back(0);
-    for (int32 t = 1; t < num_frames; t++) {
-      if (frame_info_[t].den_state_count == 1 &&
-          frame_info_[t-1].den_state_count > 1)
-        split_points.push_back(t);
-    }
-    split_points.push_back(num_frames);
-  }
-
-  std::vector<bool> is_kept(split_points.size() - 1);
-  { // A "split" is a pair of successive split points.  Work out for each split
-    // whether we must keep it (we must if it contains at least one frame for
-    // which "nonzero_derivative" == true.)
-    for (size_t s = 0; s < is_kept.size(); s++) {
-      int32 start = split_points[s], end = split_points[s+1];
-      bool keep_this_split = false;
-      for (int32 t = start; t < end; t++)
-        if (frame_info_[t].nonzero_derivative)
-          keep_this_split = true;
-      is_kept[s] = keep_this_split;
-    }
-  }
-
-  egs_out_->clear();
-  egs_out_->reserve(is_kept.size());
-
-  stats->num_lattices++;
-  stats->longest_lattice = std::max(stats->longest_lattice, num_frames);
-  stats->num_segments += is_kept.size();
-  stats->num_frames_orig += num_frames;
-  for (int32 t = 0; t < num_frames; t++)
-    if (frame_info_[t].nonzero_derivative)
-      stats->num_frames_must_keep++;
-  
-  for (size_t s = 0; s < is_kept.size(); s++) {
-    if (is_kept[s]) {
-      stats->num_kept_segments++;
-      OutputOneSplit(split_points[s], split_points[s+1]);
-      int32 segment_len = split_points[s+1] - split_points[s];
-      stats->num_frames_kept_after_split += segment_len;
-      stats->longest_segment_after_split =
-          std::max(stats->longest_segment_after_split, segment_len);
-    }
-  }
-}
-
-
-
-void SplitExampleStats::Print() {
-  KALDI_LOG << "Split " << num_lattices << " lattices.  Stats:";
-  double kept_segs_per_lat = num_kept_segments * 1.0 / num_lattices,
-      segs_per_lat = num_segments * 1.0 / num_lattices;
-      
-  KALDI_LOG << "Made on average " << segs_per_lat << " segments per lattice, "
-            << "of which " << kept_segs_per_lat << " were kept.";
-
-  double percent_needed = num_frames_must_keep * 100.0 / num_frames_orig,
-    percent_after_split = num_frames_kept_after_split * 100.0 / num_frames_orig,
-   percent_after_excise = num_frames_kept_after_excise * 100.0 / num_frames_orig;
-      
-  KALDI_LOG << "Needed to keep " << percent_needed << "% of frames, after split "
-            << "kept " << percent_after_split << "%, after excising frames kept "
-            << percent_after_excise << "%.";
-
-  KALDI_LOG << "Longest lattice had " << longest_lattice
-            << " frames, longest segment after splitting had "
-            << longest_segment_after_split
-            << " frames, longest segment after excising had "
-            << longest_segment_after_excise;
-}
-
-void DiscriminativeExampleSplitter::OutputOneSplit(int32 seg_begin,
-                                                   int32 seg_end) {
-  KALDI_ASSERT(seg_begin >= 0 && seg_end > seg_begin && seg_end <= NumFrames());
-  egs_out_->resize(egs_out_->size() + 1);
-  int32 left_context = eg_.left_context, right_context = RightContext(),
-      tot_context = left_context + right_context;
-  DiscriminativeNnetExample &eg_out = egs_out_->back();
-  eg_out.weight = eg_.weight;
-
-  eg_out.num_ali.insert(eg_out.num_ali.end(),
-                        eg_.num_ali.begin() + seg_begin,
-                        eg_.num_ali.begin() + seg_end);
-
-  CreateOutputLattice(seg_begin, seg_end, &(eg_out.den_lat));
-  
-  eg_out.input_frames = eg_.input_frames.Range(seg_begin, seg_end - seg_begin +
-                                               tot_context,
-                                               0, eg_.input_frames.NumCols());
-
-  eg_out.left_context = eg_.left_context;
-
-  eg_out.spk_info = eg_.spk_info;
-  
-  eg_out.Check();  
-}
-
-// static
-void DiscriminativeExampleSplitter::RemoveAllOutputSymbols(Lattice *lat) {
-  for (StateId s = 0; s < lat->NumStates(); s++) {
-    for (::fst::MutableArcIterator<Lattice> aiter(lat, s); !aiter.Done();
-         aiter.Next()) {
-      Arc arc = aiter.Value();
-      arc.olabel = 0;
-      aiter.SetValue(arc);
-    }
-  }  
-}
-
-DiscriminativeExampleSplitter::StateId
-DiscriminativeExampleSplitter::GetOutputStateId(
-    StateId s, unordered_map<StateId, StateId> *state_map, Lattice *lat_out) {
-  if (state_map->count(s) == 0) {
-    return ((*state_map)[s] = lat_out->AddState());
-  } else {
-    return (*state_map)[s];
-  }
-}
-
-void DiscriminativeExampleSplitter::CreateOutputLattice(
-    int32 seg_begin, int32 seg_end,
-    CompactLattice *clat_out) {
-  Lattice lat_out;
-
-  // Below, state_map will map from states in the original lattice
-  // lat_ to ones in the new lattice lat_out.
-  unordered_map<StateId, StateId> state_map;
-
-  // The range of the loop over s could be made over the
-  // entire lattice, but we limit it for efficiency.
-  
-  for (StateId s = frame_info_[seg_begin].start_state;
-       s <= frame_info_[seg_end].end_state; s++) {
-    int32 t = state_times_[s];
-
-    if (t < seg_begin || t > seg_end) // state out of range.
-      continue;
-
-    int32 this_state = GetOutputStateId(s, &state_map, &lat_out);
-
-    if (t == seg_begin) // note: we only split on frames with just one
-      lat_out.SetStart(this_state); // state, so we reach this only once.
-    
-    if (t == seg_end) { // Make it final and don't process its arcs out.
-      if (seg_end == NumFrames()) {
-        lat_out.SetFinal(this_state, lat_.Final(s));
-      } else {
-        lat_out.SetFinal(this_state, LatticeWeight::One());
-      }
-      continue; // don't process arcs out of this state.
-    }
-    
-    for (fst::ArcIterator<Lattice> aiter(lat_, s); !aiter.Done(); aiter.Next()) {
-      const Arc &arc = aiter.Value();
-      StateId next_state = GetOutputStateId(arc.nextstate,
-                                            &state_map, &lat_out);
-      KALDI_ASSERT(arc.ilabel != 0 && arc.ilabel == arc.olabel); // We expect no epsilons.
-      lat_out.AddArc(this_state, Arc(arc.ilabel, arc.olabel, arc.weight,
-                                      next_state));
-    }
-  }
-  Connect(&lat_out); // this is not really necessary, it's only to make sure
-                     // the assert below fails when it should. TODO: remove it.
-  KALDI_ASSERT(lat_out.NumStates() > 0);
-  RemoveAllOutputSymbols(&lat_out);
-  ConvertLattice(lat_out, clat_out);
-}
-
-/*
-void DiscriminativeExampleSplitter::SelfTest() {
-  bool splits_ok = true; // True iff we split only
-                         // on frames where there was
-                         // one arc crossing.
-
-  // we can't do any of this excising frames if we want to
-  // preserve equivalence.
-  std::fill(can_excise_.begin(), can_excise_.end(), false);
-  
-  std::vector<Lattice*> split_lats;
-
-  int32 cur_t = NumFrames();
-  while (cur_t != 0) {
-    Backtrace this_backtrace = backtrace_[cur_t];
-    int32 prev_t = this_backtrace.prev_frame;
-
-    int32 seg_begin = prev_t, seg_end = cur_t;
-    Lattice *new_lat = new Lattice();
-    CreateOutputLattice(seg_begin, seg_end, new_lat);
-    split_lats.push_back(new_lat);
-
-    if (split_penalty_[cur_t] != 0)
-      splits_ok = false; // we split where there was a penalty so we don't
-                         //  expect equivalence.
-    cur_t = prev_t;
-  }
-  KALDI_ASSERT(!split_lats.empty());
-  std::reverse(split_lats.begin(), split_lats.end());
-  for (size_t i = 1; i < split_lats.size(); i++) {
-    // append split_lats[i] to split_lats[0], putting the
-    // result in split_lats[0].
-    Concat(split_lats[0], *(split_lats[i]));
-  }
-  Connect(split_lats[0]);
-  KALDI_ASSERT(split_lats[0]->NumStates() > 0);
-  
-
-  if (!splits_ok) {
-    KALDI_LOG << "Not self-testing because we split where there were "
-              << "multiple paths.";
-    
-  } else {
-    if (!(RandEquivalent(*(split_lats[0]), lat_, 5, 0.01,
-         Rand(), 100))) {
-      KALDI_WARN << "Lattices were not equivalent (self-test failed).";
-      KALDI_LOG << "Original lattice was: ";
-      WriteLattice(std::cerr, false, lat_);
-      KALDI_LOG << "New lattice is:";
-      WriteLattice(std::cerr, false, *(split_lats[0]));
-      {
-        Lattice best_path_orig;
-        ShortestPath(lat_, &best_path_orig);
-        KALDI_LOG << "Original best path was:";
-        WriteLattice(std::cerr, false, best_path_orig);
-      }
-      {
-        Lattice best_path_new;
-        ShortestPath(*(split_lats[0]), &best_path_new);
-        KALDI_LOG << "New best path was:";
-        WriteLattice(std::cerr, false, best_path_new);
-      }
-    }
-  }
-  for (size_t i = 0; i < split_lats.size(); i++)
-    delete split_lats[i];
-}
-*/
-
-
-
-void SplitDiscriminativeExample(
-    const SplitDiscriminativeExampleConfig &config,
-    const TransitionModel &tmodel,
-    const DiscriminativeNnetExample &eg,
-    std::vector<DiscriminativeNnetExample> *egs_out,
-    SplitExampleStats *stats_out) {
-  DiscriminativeExampleSplitter splitter(config, tmodel, eg, egs_out);
-  splitter.Split(stats_out);
-}
-
-
-void ExciseDiscriminativeExample(
-    const SplitDiscriminativeExampleConfig &config,
-    const TransitionModel &tmodel,
-    const DiscriminativeNnetExample &eg,
-    std::vector<DiscriminativeNnetExample> *egs_out,    
-    SplitExampleStats *stats_out) {
-  DiscriminativeExampleSplitter splitter(config, tmodel, eg, egs_out);
-  splitter.Excise(stats_out);
-}
-
-
-void UpdateHash(
-    const TransitionModel &tmodel,
-    const DiscriminativeNnetExample &eg,
-    std::string criterion,
-    bool drop_frames,
-    bool one_silence_class,
-    Matrix<double> *hash,
-    double *num_weight,
-    double *den_weight,
-    double *tot_t) {
-  int32 feat_dim = eg.input_frames.NumCols(),
-      left_context = eg.left_context,
-      num_frames = eg.num_ali.size(),
-      right_context = eg.input_frames.NumRows() - num_frames - left_context,
-      context_width = left_context + 1 + right_context;
-  *tot_t += num_frames;
-  KALDI_ASSERT(right_context >= 0);
-  KALDI_ASSERT(hash != NULL);
-  if (hash->NumRows() == 0) {
-    hash->Resize(tmodel.NumPdfs(), feat_dim);
-  } else {
-    KALDI_ASSERT(hash->NumRows() == tmodel.NumPdfs() &&
-                 hash->NumCols() == feat_dim);
-  }
-
-  Posterior post;
-  std::vector<int32> silence_phones; // we don't let the user specify this
-                                     // because it's not necessary for testing
-                                     // purposes -> leave it empty
-  ExampleToPdfPost(tmodel, silence_phones, criterion, drop_frames,
-                   one_silence_class, eg, &post);
-
-  Vector<BaseFloat> avg_feat(feat_dim);
-  
-  for (int32 t = 0; t < num_frames; t++) {
-    SubMatrix<BaseFloat> context_window(eg.input_frames,
-                                        t, context_width,
-                                        0, feat_dim);
-    // set avg_feat to average over the context-window for this frame.
-    avg_feat.AddRowSumMat(1.0 / context_width, context_window, 0.0);
-    Vector<double> avg_feat_dbl(avg_feat);
-    for (size_t i = 0; i < post[t].size(); i++) {
-      int32 pdf_id = post[t][i].first;
-      BaseFloat weight = post[t][i].second;
-      hash->Row(pdf_id).AddVec(weight, avg_feat_dbl);
-      if (weight > 0.0) *num_weight += weight;
-      else *den_weight += -weight;
-    }
-  }
-}
-
-
-void ExampleToPdfPost(
-    const TransitionModel &tmodel,
-    const std::vector<int32> &silence_phones,    
-    std::string criterion,
-    bool drop_frames,
-    bool one_silence_class,
-    const DiscriminativeNnetExample &eg,
-    Posterior *post) {
-  KALDI_ASSERT(criterion == "mpfe" || criterion == "smbr" || criterion == "mmi");
-  
-  Lattice lat;
-  ConvertLattice(eg.den_lat, &lat);
-  TopSort(&lat);
-  if (criterion == "mpfe" || criterion == "smbr") {
-    Posterior tid_post;
-    LatticeForwardBackwardMpeVariants(tmodel, silence_phones, lat, eg.num_ali,
-                                      criterion, one_silence_class, &tid_post);
-    
-    ConvertPosteriorToPdfs(tmodel, tid_post, post);
-  } else {
-    bool convert_to_pdf_ids = true, cancel = true;
-    LatticeForwardBackwardMmi(tmodel, lat, eg.num_ali,
-                              drop_frames, convert_to_pdf_ids, cancel,
-                              post);
-  }
-  ScalePosterior(eg.weight, post);
-}
-
-
-void SolvePackingProblem(BaseFloat max_cost,
-                         const std::vector<BaseFloat> &costs,
-                         std::vector<std::vector<size_t> > *groups) {
-  groups->clear();
-  std::vector<BaseFloat> group_costs;
-  for (size_t i = 0; i < costs.size(); i++) {
-    bool found_group = false;
-    BaseFloat this_cost = costs[i];
-    for (size_t j = 0; j < groups->size(); j++) {
-      if (group_costs[j] + this_cost <= max_cost) {
-        (*groups)[j].push_back(i);
-        group_costs[j] += this_cost;
-        found_group = true;
-        break;
-      }
-    }
-    if (!found_group) { // Put this object in a newly created group.
-      groups->resize(groups->size() + 1);
-      groups->back().push_back(i);
-      group_costs.push_back(this_cost);
-    }
-  }
-}
-
-void AppendDiscriminativeExamples(
-    const std::vector<const DiscriminativeNnetExample*> &input,
-    DiscriminativeNnetExample *output) {
-  KALDI_ASSERT(!input.empty());
-  const DiscriminativeNnetExample &eg0 = *(input[0]);
-  
-  int32 dim = eg0.input_frames.NumCols() + eg0.spk_info.Dim(),
-      left_context = eg0.left_context,
-      num_frames = eg0.num_ali.size(),
-      right_context = eg0.input_frames.NumRows() - num_frames - left_context;
-
-  int32 tot_frames = eg0.input_frames.NumRows();  // total frames (appended,
-                                                  // with context)
-  for (size_t i = 1; i < input.size(); i++)
-    tot_frames += input[i]->input_frames.NumRows();
-
-  int32 arbitrary_tid = 1;  // arbitrary transition-id that we use to pad the
-                            // num_ali and den_lat members between segments
-                            // (since they're both the same, and the den-lat in
-                            // those parts is linear, they contribute no
-                            // derivative to the training).
-  
-  output->den_lat = eg0.den_lat;
-  output->num_ali = eg0.num_ali;
-  output->input_frames.Resize(tot_frames, dim, kUndefined);
-  output->input_frames.Range(0, eg0.input_frames.NumRows(),
-                             0, eg0.input_frames.NumCols()).CopyFromMat(eg0.input_frames);
-  if (eg0.spk_info.Dim() != 0) {
-    output->input_frames.Range(0, eg0.input_frames.NumRows(),
-                               eg0.input_frames.NumCols(), eg0.spk_info.Dim()).
-        CopyRowsFromVec(eg0.spk_info);
-  }
-  
-  output->num_ali.reserve(tot_frames - left_context - right_context);
-  output->weight = eg0.weight;
-  output->left_context = eg0.left_context;
-  output->spk_info.Resize(0);
-
-  CompactLattice inter_segment_clat;
-  int32 initial = inter_segment_clat.AddState(); // state 0.
-  inter_segment_clat.SetStart(initial);
-  
-  std::vector<int32> inter_segment_ali(left_context + right_context);
-  std::fill(inter_segment_ali.begin(), inter_segment_ali.end(), arbitrary_tid);
-
-  CompactLatticeWeight final_weight = CompactLatticeWeight::One();
-  final_weight.SetString(inter_segment_ali);
-  inter_segment_clat.SetFinal(initial, final_weight);
-  
-  int32 feat_offset = eg0.input_frames.NumRows();
-  
-  for (size_t i = 1; i < input.size(); i++) {
-    const DiscriminativeNnetExample &eg_i = *(input[i]);
-        
-    output->input_frames.Range(feat_offset, eg_i.input_frames.NumRows(),
-                               0, eg_i.input_frames.NumCols()).CopyFromMat(
-                                   eg_i.input_frames);
-    if (eg_i.spk_info.Dim() != 0) {
-      output->input_frames.Range(feat_offset, eg_i.input_frames.NumRows(),
-                                 eg_i.input_frames.NumCols(),
-                                 eg_i.spk_info.Dim()).CopyRowsFromVec(
-                                     eg_i.spk_info);
-      KALDI_ASSERT(eg_i.input_frames.NumCols() +
-                   eg_i.spk_info.Dim() == dim);
-    }
-    
-    output->num_ali.insert(output->num_ali.end(),
-                           inter_segment_ali.begin(), inter_segment_ali.end());
-    output->num_ali.insert(output->num_ali.end(),
-                           eg_i.num_ali.begin(), eg_i.num_ali.end());
-    Concat(&(output->den_lat), inter_segment_clat);
-    Concat(&(output->den_lat), eg_i.den_lat);
-    KALDI_ASSERT(output->weight == eg_i.weight);
-    KALDI_ASSERT(output->left_context == eg_i.left_context);
-    feat_offset += eg_i.input_frames.NumRows();
-  }
-  KALDI_ASSERT(feat_offset == tot_frames);
-}
-  
-void CombineDiscriminativeExamples(
-    int32 max_length,
-    const std::vector<DiscriminativeNnetExample> &input,
-    std::vector<DiscriminativeNnetExample> *output) {
-  
-  std::vector<BaseFloat> costs(input.size());
-  for (size_t i = 0; i < input.size(); i++)
-    costs[i] = static_cast<BaseFloat>(input[i].input_frames.NumRows());
-  std::vector<std::vector<size_t> > groups;
-  SolvePackingProblem(max_length,
-                      costs,
-                      &groups);
-  output->clear();
-  output->resize(groups.size());
-  for (size_t i = 0; i < groups.size(); i++) {
-    std::vector<const DiscriminativeNnetExample*> group_egs;
-    for (size_t j = 0; j < groups[i].size(); j++) {
-      size_t index = groups[i][j];
-      group_egs.push_back(&(input[index]));
-    }
-    AppendDiscriminativeExamples(group_egs, &((*output)[i]));
-  }
-}
-
-
-
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/nnet-example-functions.h b/src/nnet2/nnet-example-functions.h
deleted file mode 100644
index 82c86dfc046..00000000000
--- a/src/nnet2/nnet-example-functions.h
+++ /dev/null
@@ -1,300 +0,0 @@
-// nnet2/nnet-example-functions.h
-
-// Copyright 2013  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_NNET_EXAMPLE_FUNCTIONS_H_
-#define KALDI_NNET2_NNET_EXAMPLE_FUNCTIONS_H_
-
-/** @file
-    Note on how to parse this filename: it contains functions relatied to
-    neural-net training examples, mostly discriminative neural-net training examples,
-   i.e. type DiscriminativeNnetExample    
-*/
-
-#include "nnet2/nnet-nnet.h"
-#include "util/table-types.h"
-#include "lat/kaldi-lattice.h"
-#include "nnet2/nnet-example.h"
-#include "hmm/transition-model.h"
-#include "hmm/posterior.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-// Glossary: mmi = Maximum Mutual Information,
-//          mpfe = Minimum Phone Frame Error
-//          smbr = State-level Minimum Bayes Risk
-
-
-// This file relates to the creation of examples for discriminative training
-// (see struct DiscriminativeNnetExample, in ./nnet-example.h).
-
-
-/** Config structure for SplitExample, for splitting discriminative
-    training examples.
-*/
-struct SplitDiscriminativeExampleConfig {
-  // This is the maximum length in frames that any example is allowed to have.
-  // We will split training examples to ensure that they are no longer than
-  // this.  Note: if you make this too short it may have bad effects because
-  // the posteriors start to become inaccurate at the edges of the training
-  // example (since they will be based on the acoustic model that was used to
-  // generate the lattices, not the current one).
-  int32 max_length;
-
-  // criterion can be "smbr" or "mpfe" or "mmi".  This info is only needed to
-  // determine which parts of the lattices will not contribute to training and
-  // can be discarded (for mpe/smbr, any part where the den-lat has only one
-  // path or all den-lat paths map to the same pdf can be discareded; for mmi,
-  // any part where the den-lat's pdfs all have the same value as the num-lat
-  // pdf for that frame, can be discarded.
-  std::string criterion;
-
-  bool collapse_transition_ids;
-
-  bool determinize;
-
-  bool minimize; // we'll push and minimize if this is true.
-  
-  bool test;
-
-  bool drop_frames; // For MMI, true if we will eventually drop frames in which
-                    // the numerator does not appear in the denominator lattice.
-                    // (i.e. we won't backpropagate any derivatives on those
-                    // frames).  We may still need to include those frames in
-                    // the computation in order to get correct posteriors for
-                    // other parts of the lattice.
-
-  bool split; // if false, we won't split at all.
-
-  bool excise; // if false, we will skip the "excise" step.
-  
-  SplitDiscriminativeExampleConfig():
-      max_length(1024), criterion("smbr"), collapse_transition_ids(true),
-      determinize(true), minimize(true), test(false), drop_frames(false),
-      split(true), excise(true) { }
-
-  void Register(OptionsItf *opts) {
-
-    opts->Register("max-length", &max_length, "Maximum length allowed for any "
-                   "segment (i.e. max #frames for any example");
-    //opts->Register("target-length", &target_length, "Target length for a "
-    // "segment");
-    opts->Register("criterion", &criterion, "Criterion, 'mmi'|'mpfe'|'smbr'. "
-                   "Determines which frames may be dropped from lattices.");
-    opts->Register("collapse-transition-ids", &collapse_transition_ids,
-                   "This option included for debugging purposes");
-    opts->Register("determinize", &determinize, "If true, we determinize "
-                   "lattices (as Lattice) before splitting and possibly minimize");
-    opts->Register("minimize", &minimize, "If true, we push and "
-                   "minimize lattices (as Lattice) before splitting");
-    opts->Register("test", &test, "If true, activate self-testing code.");
-    // See "Sequence-discriminative training of deep neural networks", Vesely et al,
-    // ICASSP 2013 for explanation of frame dropping.
-    opts->Register("drop-frames", &drop_frames, "For MMI, if true we drop frames "
-                   "with no overlap of num and den pdf-ids");
-    opts->Register("split", &split, "Set to false to disable lattice-splitting.");
-    opts->Register("excise", &excise, "Set to false to disable excising un-needed "
-                   "frames (option included for debug purposes)");
-  }
-};
-
-/// This struct exists only for diagnostic purposes.  Note: the stats assume
-/// that you call SplitDiscriminative and ExciseDiscriminativeExample in the
-/// same program, and the info printed out will be wrong if this is not the
-/// case... this isn't ideal but it was more convenient.
-struct SplitExampleStats {
-  int32 num_lattices;
-  int32 longest_lattice;
-  int32 num_segments;
-  int32 num_kept_segments;
-  int64 num_frames_orig;
-  int64 num_frames_must_keep;
-  int64 num_frames_kept_after_split;
-  int32 longest_segment_after_split;
-  int64 num_frames_kept_after_excise;
-  int32 longest_segment_after_excise;
-  
-  SplitExampleStats() { memset(this, 0, sizeof(*this)); }
-  void Print();
-};
-
-/** Converts lattice to discriminative training example.  returns true on
-    success, false on failure such as mismatched input (will also warn in this
-    case). */
-bool LatticeToDiscriminativeExample(
-    const std::vector<int32> &alignment,
-    const Matrix<BaseFloat> &feats,
-    const CompactLattice &clat,
-    BaseFloat weight,
-    int32 left_context,
-    int32 right_context,
-    DiscriminativeNnetExample *eg);
-
-
-/** Split a "discriminative example" into multiple pieces,
-    splitting where the lattice has "pinch points".
- */
-void SplitDiscriminativeExample(
-    const SplitDiscriminativeExampleConfig &config,
-    const TransitionModel &tmodel,
-    const DiscriminativeNnetExample &eg,
-    std::vector<DiscriminativeNnetExample> *egs_out,
-    SplitExampleStats *stats_out);
-
-/** Remove unnecessary frames from discriminative training
-    example.  The output egs_out will be of size zero or one
-    (usually one) after being called. */
-void ExciseDiscriminativeExample(
-    const SplitDiscriminativeExampleConfig &config,
-    const TransitionModel &tmodel,
-    const DiscriminativeNnetExample &eg,
-    std::vector<DiscriminativeNnetExample> *egs_out,
-    SplitExampleStats *stats_out);
-
-
-/** Appends the given vector of examples (which must be non-empty) into 
-    a single output example (called by CombineExamples, which might be
-    a more convenient interface).
-
-   When combining examples it directly appends the features, and then adds a
-   "fake" segment to the lattice and alignment in between, padding with
-   transition-ids that are all ones.  This is necessary in case the network
-   needs acoustic context, and only because of a kind of limitation in the nnet
-   training code that doesn't support varying 'chunk' sizes within a minibatch.
-
-   Will fail if all the input examples don't have the same weight (this will
-   normally be 1.0 anyway), or if the feature dimension (i.e. basic feature
-   dimension plus spk_info dimension) differs between the examples.
-*/
-void AppendDiscriminativeExamples(
-    const std::vector<const DiscriminativeNnetExample*> &input,
-    DiscriminativeNnetExample *output);
-
-/**
-   This function is used to combine multiple discriminative-training
-   examples (each corresponding to a segment of a lattice), into one.
-   
-   It combines examples into groups such that each group will have a
-   total length (number of rows of the feature matrix) less than or
-   equal to max_length.  However, if individual examples are longer
-   than max_length they will still be processed; they will be given
-   their own group.
-   
-   See also the documentation for AppendDiscriminativeExamples() which
-   gives more details on how we append the examples.
-
-   Will fail if all the input examples don't have the same weight (this will
-   normally be 1.0 anyway).
-
-   If the spk_info variables are non-empty, it will move them into the features
-   of the output, so the spk_info of the output will be empty but the
-   appropriate speaker vectors will be appended to each row of the features.  */
-
-void CombineDiscriminativeExamples(
-    int32 max_length,
-    const std::vector<DiscriminativeNnetExample> &input,
-    std::vector<DiscriminativeNnetExample> *output);
-                     
-/**
-   This function solves the "packing problem" using the "first fit" algorithm.
-   It groups together the indices 0 through sizes.size() - 1, such that the sum
-   of cost within each group does not exceed max_lcost.  [However, if there
-   are single examples that exceed max_cost, it puts them in their own bin].
-   The algorithm is not particularly efficient-- it's more n^2 than n log(n)
-   which it should be.  */
-void SolvePackingProblem(BaseFloat max_cost,
-                         const std::vector<BaseFloat> &costs,
-                         std::vector<std::vector<size_t> > *groups);
-
-
-
-/**
-   Given a discriminative training example, this function works out posteriors
-   at the pdf level (note: these are "discriminative-training posteriors" that
-   may be positive or negative.  The denominator lattice "den_lat" in the
-   example "eg" should already have had acoustic-rescoring done so that its
-   acoustic probs are up to date, and any acoustic scaling should already have
-   been applied.
-
-   "criterion" may be "mmi" or "mpfe" or "smbr".  If criterion
-   is "mmi", "drop_frames" means we don't include derivatives for frames
-   where the numerator pdf is not in the denominator lattice.
-
-   if "one_silence_class" is true you can get a newer behavior for MPE/SMBR
-   which will tend to reduce insertions.
-
-   "silence_phones" is a list of silence phones (this is only relevant for mpfe
-   or smbr, if we want to treat silence specially).
- */
-void ExampleToPdfPost(
-    const TransitionModel &tmodel,
-    const std::vector<int32> &silence_phones,
-    std::string criterion,
-    bool drop_frames,
-    bool one_silence_class,
-    const DiscriminativeNnetExample &eg,
-    Posterior *post);
-
-/**
-   This function is used in code that tests the functionality that we provide
-   here, about splitting and excising nnet examples.  It adds to a "hash
-   function" that is a function of a set of examples; the hash function is of
-   dimension (number of pdf-ids x features dimension).  The hash function
-   consists of the (denominator - numerator) posteriors over pdf-ids, times the
-   average over the context-window (left-context on the left, right-context on
-   the right), of the features.  This is useful because the various
-   manipulations we do are supposed to preserve this, and if there is a bug
-   it will most likely cause the hash function to change.
-
-   This function will resize the matrix if it is empty.
-
-   Any acoustic scaling of the lattice should be done before you call this
-   function.
-
-   'criterion' should be 'mmi', 'mpfe', or 'smbr'.
-   
-   You should set drop_frames to true if you are doing MMI with drop-frames
-   == true.  Then it will not compute the hash for frames where the numerator
-   pdf-id is not in the denominator lattice.
-
-   You can set one_silence_class to true for a newer optional behavior that will
-   reduce insertions in the trained model (or false for the traditional
-   behavior).
-
-   The function will also accumulate the total numerator and denominator weights
-   used as num_weight and den_weight, for an additional diagnostic, and the total
-   number of frames, as tot_t.
-*/
-void UpdateHash(
-    const TransitionModel &tmodel,
-    const DiscriminativeNnetExample &eg,
-    std::string criterion,
-    bool drop_frames,
-    bool one_silence_class,
-    Matrix<double> *hash,
-    double *num_weight,
-    double *den_weight,
-    double *tot_t);
-
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif // KALDI_NNET2_NNET_EXAMPLE_FUNCTIONS_H_
diff --git a/src/nnet2/nnet-example.cc b/src/nnet2/nnet-example.cc
deleted file mode 100644
index 4bca059b76e..00000000000
--- a/src/nnet2/nnet-example.cc
+++ /dev/null
@@ -1,309 +0,0 @@
-// nnet2/nnet-example.cc
-
-// Copyright 2012-2013  Johns Hopkins University (author: Daniel Povey)
-//                2014  Vimal Manohar
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-example.h"
-#include "lat/lattice-functions.h"
-#include "hmm/posterior.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-// This function returns true if the example has labels which, for each frame,
-// have a single element with probability one; and if so, it outputs them to the
-// vector in the associated pointer.  This enables us to write the egs more
-// compactly to disk in this common case.
-bool HasSimpleLabels(
-    const NnetExample &eg,
-    std::vector<int32> *simple_labels) {
-  size_t num_frames = eg.labels.size();
-  for (int32 t = 0; t < num_frames; t++)
-    if (eg.labels[t].size() != 1 || eg.labels[t][0].second != 1.0)
-      return false;
-  simple_labels->resize(num_frames);
-  for (int32 t = 0; t < num_frames; t++)
-    (*simple_labels)[t] = eg.labels[t][0].first;
-  return true;
-}
-
-
-void NnetExample::Write(std::ostream &os, bool binary) const {
-  // Note: weight, label, input_frames and spk_info are members.  This is a
-  // struct.
-  WriteToken(os, binary, "<NnetExample>");
-
-  // At this point, we write <Lab1> if we have "simple" labels, or
-  // <Lab2> in general.  Previous code (when we had only one frame of
-  // labels) just wrote <Labels>.
-  std::vector<int32> simple_labels;
-  if (HasSimpleLabels(*this, &simple_labels)) {
-    WriteToken(os, binary, "<Lab1>");
-    WriteIntegerVector(os, binary, simple_labels);
-  } else {
-    WriteToken(os, binary, "<Lab2>");
-    int32 num_frames = labels.size();
-    WriteBasicType(os, binary, num_frames);
-    for (int32 t = 0; t < num_frames; t++) {
-      int32 size = labels[t].size();
-      WriteBasicType(os, binary, size);
-      for (int32 i = 0; i < size; i++) {
-        WriteBasicType(os, binary, labels[t][i].first);
-        WriteBasicType(os, binary, labels[t][i].second);
-      }
-    }
-  }
-  WriteToken(os, binary, "<InputFrames>");
-  input_frames.Write(os, binary);
-  WriteToken(os, binary, "<LeftContext>");
-  WriteBasicType(os, binary, left_context);
-  WriteToken(os, binary, "<SpkInfo>");
-  spk_info.Write(os, binary);
-  WriteToken(os, binary, "</NnetExample>");
-}
-
-void NnetExample::Read(std::istream &is, bool binary) {
-  // Note: weight, label, input_frames, left_context and spk_info are members.
-  // This is a struct.
-  ExpectToken(is, binary, "<NnetExample>");
-
-  std::string token;
-  ReadToken(is, binary, &token);
-  if (!strcmp(token.c_str(), "<Lab1>")) {  // simple label format
-    std::vector<int32> simple_labels;
-    ReadIntegerVector(is, binary, &simple_labels);
-    labels.resize(simple_labels.size());
-    for (size_t i = 0; i < simple_labels.size(); i++) {
-      labels[i].resize(1);
-      labels[i][0].first = simple_labels[i];
-      labels[i][0].second = 1.0;
-    }
-  } else if (!strcmp(token.c_str(), "<Lab2>")) {  // generic label format
-    int32 num_frames;
-    ReadBasicType(is, binary, &num_frames);
-    KALDI_ASSERT(num_frames > 0);
-    labels.resize(num_frames);
-    for (int32 t = 0; t < num_frames; t++) {
-      int32 size;
-      ReadBasicType(is, binary, &size);
-      KALDI_ASSERT(size >= 0);
-      labels[t].resize(size);
-      for (int32 i = 0; i < size; i++) {
-        ReadBasicType(is, binary, &(labels[t][i].first));
-        ReadBasicType(is, binary, &(labels[t][i].second));
-      }
-    }
-  } else if (!strcmp(token.c_str(), "<Labels>")) {  // back-compatibility
-    labels.resize(1);  // old format had 1 frame of labels.
-    int32 size;
-    ReadBasicType(is, binary, &size);
-    labels[0].resize(size);
-    for (int32 i = 0; i < size; i++) {
-      ReadBasicType(is, binary, &(labels[0][i].first));
-      ReadBasicType(is, binary, &(labels[0][i].second));
-    }
-  } else {
-    KALDI_ERR << "Expected token <Lab1>, <Lab2> or <Labels>, got " << token;
-  }
-  ExpectToken(is, binary, "<InputFrames>");
-  input_frames.Read(is, binary);
-  ExpectToken(is, binary, "<LeftContext>"); // Note: this member is
-  // recently added, but I don't think we'll get too much back-compatibility
-  // problems from not handling the old format.
-  ReadBasicType(is, binary, &left_context);
-  ExpectToken(is, binary, "<SpkInfo>");
-  spk_info.Read(is, binary);
-  ExpectToken(is, binary, "</NnetExample>");
-}
-
-void NnetExample::SetLabelSingle(int32 frame, int32 pdf_id, BaseFloat weight) {
-  KALDI_ASSERT(static_cast<size_t>(frame) < labels.size());
-  labels[frame].clear();
-  labels[frame].push_back(std::make_pair(pdf_id, weight));
-}
-
-int32 NnetExample::GetLabelSingle(int32 frame, BaseFloat *weight) {
-  BaseFloat max = -1.0;
-  int32 pdf_id = -1;
-  KALDI_ASSERT(static_cast<size_t>(frame) < labels.size());
-  for (int32 i = 0; i < labels[frame].size(); i++) {
-    if (labels[frame][i].second > max) {
-      pdf_id = labels[frame][i].first;
-      max = labels[frame][i].second;
-    }
-  }
-  if (weight != NULL) *weight = max;
-  return pdf_id;
-}
-
-
-
-static bool nnet_example_warned_left = false, nnet_example_warned_right = false;
-
-// Self-constructor that can reduce the number of frames and/or context.
-NnetExample::NnetExample(const NnetExample &input,
-                         int32 start_frame,
-                         int32 new_num_frames,
-                         int32 new_left_context,
-                         int32 new_right_context): spk_info(input.spk_info) {
-  int32 num_label_frames = input.labels.size();
-  if (start_frame < 0) start_frame = 0;  // start_frame is offset in the labeled
-                                         // frames.
-  KALDI_ASSERT(start_frame < num_label_frames);
-  if (start_frame + new_num_frames > num_label_frames || new_num_frames == -1)
-    new_num_frames = num_label_frames - start_frame;
-  // compute right-context of input.
-  int32 input_right_context =
-      input.input_frames.NumRows() - input.left_context - num_label_frames;
-  if (new_left_context == -1) new_left_context = input.left_context;
-  if (new_right_context == -1) new_right_context = input_right_context;
-  if (new_left_context > input.left_context) {
-    if (!nnet_example_warned_left) {
-      nnet_example_warned_left = true;
-      KALDI_WARN << "Requested left-context " << new_left_context
-                 << " exceeds input left-context " << input.left_context
-                 << ", will not warn again.";
-    }
-    new_left_context = input.left_context;
-  }
-  if (new_right_context > input_right_context) {
-    if (!nnet_example_warned_right) {
-      nnet_example_warned_right = true;
-      KALDI_WARN << "Requested right-context " << new_right_context
-                 << " exceeds input right-context " << input_right_context
-                 << ", will not warn again.";
-    }
-    new_right_context = input_right_context;
-  }
-
-  int32 new_tot_frames = new_left_context + new_num_frames + new_right_context,
-      left_frames_lost = (input.left_context - new_left_context) + start_frame;
-  
-  CompressedMatrix new_input_frames(input.input_frames,
-                                    left_frames_lost,
-                                    new_tot_frames,
-                                    0, input.input_frames.NumCols());
-  new_input_frames.Swap(&input_frames);  // swap with class-member.
-  left_context = new_left_context;  // set class-member.
-  labels.clear();
-  labels.insert(labels.end(),
-                input.labels.begin() + start_frame,
-                input.labels.begin() + start_frame + new_num_frames);
-}
-
-void ExamplesRepository::AcceptExamples(
-    std::vector<NnetExample> *examples) {
-  KALDI_ASSERT(!examples->empty());
-  empty_semaphore_.Wait();
-  KALDI_ASSERT(examples_.empty());
-  examples_.swap(*examples);
-  full_semaphore_.Signal();
-}
-
-void ExamplesRepository::ExamplesDone() {
-  empty_semaphore_.Wait();
-  KALDI_ASSERT(examples_.empty());
-  done_ = true;
-  full_semaphore_.Signal();
-}
-
-bool ExamplesRepository::ProvideExamples(
-    std::vector<NnetExample> *examples) {
-  full_semaphore_.Wait();
-  if (done_) {
-    KALDI_ASSERT(examples_.empty());
-    full_semaphore_.Signal(); // Increment the semaphore so
-    // the call by the next thread will not block.
-    return false; // no examples to return-- all finished.
-  } else {
-    KALDI_ASSERT(!examples_.empty() && examples->empty());
-    examples->swap(examples_);
-    empty_semaphore_.Signal();
-    return true;
-  }
-}
-
-
-void DiscriminativeNnetExample::Write(std::ostream &os,
-                                              bool binary) const {
-  // Note: weight, num_ali, den_lat, input_frames, left_context and spk_info are
-  // members.  This is a struct.
-  WriteToken(os, binary, "<DiscriminativeNnetExample>");
-  WriteToken(os, binary, "<Weight>");
-  WriteBasicType(os, binary, weight);
-  WriteToken(os, binary, "<NumAli>");
-  WriteIntegerVector(os, binary, num_ali);
-  if (!WriteCompactLattice(os, binary, den_lat)) {
-    // We can't return error status from this function so we
-    // throw an exception. 
-    KALDI_ERR << "Error writing CompactLattice to stream";
-  }
-  WriteToken(os, binary, "<InputFrames>");
-  {
-    CompressedMatrix cm(input_frames); // Note: this can be read as a regular
-                                       // matrix.
-    cm.Write(os, binary);
-  }
-  WriteToken(os, binary, "<LeftContext>");
-  WriteBasicType(os, binary, left_context);
-  WriteToken(os, binary, "<SpkInfo>");
-  spk_info.Write(os, binary);
-  WriteToken(os, binary, "</DiscriminativeNnetExample>");
-}
-
-void DiscriminativeNnetExample::Read(std::istream &is,
-                                             bool binary) {
-  // Note: weight, num_ali, den_lat, input_frames, left_context and spk_info are
-  // members.  This is a struct.
-  ExpectToken(is, binary, "<DiscriminativeNnetExample>");
-  ExpectToken(is, binary, "<Weight>");
-  ReadBasicType(is, binary, &weight);
-  ExpectToken(is, binary, "<NumAli>");
-  ReadIntegerVector(is, binary, &num_ali);
-  CompactLattice *den_lat_tmp = NULL;
-  if (!ReadCompactLattice(is, binary, &den_lat_tmp) || den_lat_tmp == NULL) {
-    // We can't return error status from this function so we
-    // throw an exception. 
-    KALDI_ERR << "Error reading CompactLattice from stream";
-  }
-  den_lat = *den_lat_tmp;
-  delete den_lat_tmp;
-  ExpectToken(is, binary, "<InputFrames>");
-  input_frames.Read(is, binary);
-  ExpectToken(is, binary, "<LeftContext>");
-  ReadBasicType(is, binary, &left_context);
-  ExpectToken(is, binary, "<SpkInfo>");
-  spk_info.Read(is, binary);
-  ExpectToken(is, binary, "</DiscriminativeNnetExample>");
-}
-
-void DiscriminativeNnetExample::Check() const {
-  KALDI_ASSERT(weight > 0.0);
-  KALDI_ASSERT(!num_ali.empty());
-  int32 num_frames = static_cast<int32>(num_ali.size());
-
-
-  std::vector<int32> times;
-  int32 num_frames_den = CompactLatticeStateTimes(den_lat, &times);
-  KALDI_ASSERT(num_frames == num_frames_den);
-  KALDI_ASSERT(input_frames.NumRows() >= left_context + num_frames);
-}
-
-
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/nnet-example.h b/src/nnet2/nnet-example.h
deleted file mode 100644
index 0b5c473f69f..00000000000
--- a/src/nnet2/nnet-example.h
+++ /dev/null
@@ -1,191 +0,0 @@
-// nnet2/nnet-example.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-//           2014  Vimal Manohar
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_NNET_EXAMPLE_H_
-#define KALDI_NNET2_NNET_EXAMPLE_H_
-
-#include "nnet2/nnet-nnet.h"
-#include "util/table-types.h"
-#include "lat/kaldi-lattice.h"
-#include "util/kaldi-semaphore.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-/// NnetExample is the input data and corresponding label (or labels) for one
-/// or more frames of input, used for standard cross-entropy training of neural
-/// nets (and possibly for other objective functions).  In the normal case there
-/// will be just one frame, and one label, with a weight of 1.0.
-struct NnetExample {
-
-  /// The label(s) for each frame in a sequence of frames; in the normal case,
-  /// this will be just [ [ (pdf-id, 1.0) ] ], i.e. one frame with one label.
-  /// Top-level index is the frame index; then for each frame, a list of pdf-ids
-  /// each with its weight.
-  /// In some contexts, we will require that labels.size() == 1.
-  std::vector<std::vector<std::pair<int32, BaseFloat> > > labels;  
-  
-  /// The input data, with NumRows() >= labels.size() + left_context; it
-  /// includes features to the left and right as needed for the temporal context
-  /// of the network.  The features corresponding to labels[0] would be in
-  /// the row with index equal to left_context.
-  CompressedMatrix input_frames; 
-
-  /// The number of frames of left context (we can work out the #frames
-  /// of right context from input_frames.NumRows(), labels.size(), and this).
-  int32 left_context;
-
-  /// The speaker-specific input, if any, or an empty vector if
-  /// we're not using this features.  We'll append this to the
-  /// features for each of the frames.
-  Vector<BaseFloat> spk_info; 
-  
-  void Write(std::ostream &os, bool binary) const;
-  void Read(std::istream &is, bool binary);
-
-  NnetExample() { }
-
-  /// This constructor can be used to extract one or more frames from an example
-  /// that has multiple frames, and possibly truncate the context.  Most of its
-  /// behavior is obvious from the variable names, but note the following: if
-  /// left_context is -1, we use the left-context of the input; the same for
-  /// right_context.  If start_frame < 0 we start the labels from frame 0 of the
-  /// labeled frames of ,input; if num_frames == -1 we go to the end of the
-  /// labeled input from start_frame.  If start_frame + num_frames is greater
-  /// than the number of frames of labels of input, we output as much as we can
-  /// instead of crashing.  The same with left_context and right_context-- if we
-  /// can't provide the requested context we won't crash but will provide as
-  /// much as we can, although in this case we'll print a warning (once).
-  NnetExample(const NnetExample &input,
-              int32 start_frame,
-              int32 num_frames,
-              int32 left_context,
-              int32 right_context);
-  
-  /// Set the label of this frame of this example to the specified pdf_id with
-  /// the specified weight.
-  void SetLabelSingle(int32 frame, int32 pdf_id, BaseFloat weight = 1.0);
-
-  /// Get the maximum weight label (pdf_id and weight) of this frame of this
-  /// example.
-  int32 GetLabelSingle(int32 frame, BaseFloat *weight = NULL);
-};
-
-
-typedef TableWriter<KaldiObjectHolder<NnetExample > > NnetExampleWriter;
-typedef SequentialTableReader<KaldiObjectHolder<NnetExample > > SequentialNnetExampleReader;
-typedef RandomAccessTableReader<KaldiObjectHolder<NnetExample > > RandomAccessNnetExampleReader;
-
-
-/** This class stores neural net training examples to be used in
-    multi-threaded training.  */
-class ExamplesRepository {
- public:
-  /// The following function is called by the code that reads in the examples,
-  /// with a batch of examples.  [It will empty the vector "examples").
-  void AcceptExamples(std::vector<NnetExample> *examples);
-
-  /// The following function is called by the code that reads in the examples,
-  /// when we're done reading examples.
-  void ExamplesDone();
-  
-  /// This function is called by the code that does the training.  It gets the
-  /// training examples, and if they are available, puts them in "examples" and
-  /// returns true.  It returns false when there are no examples left and
-  /// ExamplesDone() has been called.
-  bool ProvideExamples(std::vector<NnetExample> *examples);
-  
-  ExamplesRepository(): empty_semaphore_(1), done_(false) { }
- private:
-  Semaphore full_semaphore_;
-  Semaphore empty_semaphore_;
-
-  std::vector<NnetExample> examples_;
-  bool done_;
-  KALDI_DISALLOW_COPY_AND_ASSIGN(ExamplesRepository);
-};
-
-
-/**
-   This struct is used to store the information we need for discriminative training
-   (MMI or MPE).  Each example corresponds to one chunk of a file (for better randomization
-   and to prevent instability, we may split files in the middle).
-   The example contains the numerator alignment, the denominator lattice, and the
-   input features (extended at the edges according to the left-context and right-context
-   the network needs).  It may also contain a speaker-vector (note: this is
-   not part of any standard recipe right now but is included in case it's useful
-   in the future).
- */
-struct DiscriminativeNnetExample {
-  /// The weight we assign to this example;
-  /// this will typically be one, but we include it
-  /// for the sake of generality.  
-  BaseFloat weight; 
-
-  /// The numerator alignment
-  std::vector<int32> num_ali; 
-
-  /// The denominator lattice.  Note: any acoustic
-  /// likelihoods in the denominator lattice will be
-  /// recomputed at the time we train.
-  CompactLattice den_lat; 
-
-  /// The input data-- typically with a number of frames [NumRows()] larger than
-  /// labels.size(), because it includes features to the left and right as
-  /// needed for the temporal context of the network.  (see also the
-  /// left_context variable).
-  /// Caution: when we write this to disk, we do so as CompressedMatrix.
-  /// Because we do various manipulations on these things in memory, such
-  /// as splitting, we don't want it to be a CompressedMatrix in memory
-  /// as this would be wasteful in time and also would lead to further loss of
-  /// accuracy.
-  Matrix<BaseFloat> input_frames;
-
-  /// The number of frames of left context in the features (we can work out the
-  /// #frames of right context from input_frames.NumRows(), num_ali.size(), and
-  /// this).
-  int32 left_context;
-  
-
-  /// spk_info contains any component of the features that varies slowly or not
-  /// at all with time (and hence, we would lose little by averaging it over
-  /// time and storing the average).  We'll append this to each of the input
-  /// features, if used.
-  Vector<BaseFloat> spk_info; 
-
-  void Check() const; // will crash if invalid.
-  
-  void Write(std::ostream &os, bool binary) const;
-  void Read(std::istream &is, bool binary);
-};
-
-// Yes, the length of typenames is getting out of hand.
-typedef TableWriter<KaldiObjectHolder<DiscriminativeNnetExample > >
-   DiscriminativeNnetExampleWriter;
-typedef SequentialTableReader<KaldiObjectHolder<DiscriminativeNnetExample > >
-   SequentialDiscriminativeNnetExampleReader;
-typedef RandomAccessTableReader<KaldiObjectHolder<DiscriminativeNnetExample > >
-   RandomAccessDiscriminativeNnetExampleReader;
-
-
-}
-} // namespace
-
-#endif // KALDI_NNET2_NNET_EXAMPLE_H_
diff --git a/src/nnet2/nnet-fix.cc b/src/nnet2/nnet-fix.cc
deleted file mode 100644
index 79a10ceccdb..00000000000
--- a/src/nnet2/nnet-fix.cc
+++ /dev/null
@@ -1,111 +0,0 @@
-// nnet2/nnet-fix.cc
-
-// Copyright 2012   Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-fix.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-/* See the header for what we're doing.
-   The pattern we're looking for is AffineComponent followed by
-   a NonlinearComponent of type SigmoidComponent or TanhComponent.
-*/
-
-void FixNnet(const NnetFixConfig &config, Nnet *nnet) {
-  for (int32 c = 0; c + 1 < nnet->NumComponents(); c++) {
-    AffineComponent *ac = dynamic_cast<AffineComponent*>(
-        &(nnet->GetComponent(c)));
-    NonlinearComponent *nc = dynamic_cast<NonlinearComponent*>(
-        &(nnet->GetComponent(c + 1)));
-    if (ac == NULL || nc == NULL) continue;
-    // We only want to process this if it's of type SigmoidComponent
-    // or TanhComponent.
-    BaseFloat max_deriv; // The maximum derivative of this nonlinearity.
-    bool is_relu = false;
-    {
-      SigmoidComponent *sc = dynamic_cast<SigmoidComponent*>(nc);
-      TanhComponent *tc = dynamic_cast<TanhComponent*>(nc);
-      RectifiedLinearComponent *rc = dynamic_cast<RectifiedLinearComponent*>(nc);
-      if (sc != NULL) max_deriv = 0.25;
-      else if (tc != NULL) max_deriv = 1.0;
-      else if (rc != NULL) { max_deriv = 1.0; is_relu = true; }
-      else continue; // E.g. SoftmaxComponent; we don't handle this.
-    }
-    double count = nc->Count();
-    Vector<double> deriv_sum (nc->DerivSum());
-    if (count == 0.0 || deriv_sum.Dim() == 0) {
-      KALDI_WARN << "Cannot fix neural net because no statistics are stored.";
-      continue;
-    }
-    Vector<BaseFloat> bias_params(ac->BiasParams());
-    Matrix<BaseFloat> linear_params(ac->LinearParams());
-    int32 dim = nc->InputDim(), num_small_deriv = 0, num_large_deriv = 0;
-    for (int32 d = 0; d < dim; d++) {
-      // deriv ratio is the ratio of the computed average derivative to the
-      // maximum derivative of that nonlinear function.
-      BaseFloat deriv_ratio = deriv_sum(d) / (count * max_deriv);
-      KALDI_ASSERT(deriv_ratio >= 0.0 && deriv_ratio < 1.01); // Or there is an
-                                                              // error in the
-      // math.
-      if (deriv_ratio < config.min_average_deriv) {
-        // derivative is too small, meaning we've gone off into the "flat part"
-        // of the sigmoid (or for ReLU, we're always-off).
-        if (is_relu) {
-          bias_params(d) += config.relu_bias_change;
-        } else {
-          BaseFloat parameter_factor = std::min(config.min_average_deriv /
-                                                deriv_ratio,
-                                                config.parameter_factor);
-          // we need to reduce the parameters, so multiply by 1/parameter factor.
-          bias_params(d) *= 1.0 / parameter_factor;
-          linear_params.Row(d).Scale(1.0 / parameter_factor);
-        }
-        num_small_deriv++;
-      } else if (deriv_ratio > config.max_average_deriv) {
-        // derivative is too large, meaning we're only in the linear part of the
-        // sigmoid, in the middle.  (or for ReLU, we're always-on.
-        if (is_relu) {
-          bias_params(d) -= config.relu_bias_change;
-        } else {
-          BaseFloat parameter_factor = std::min(deriv_ratio / config.max_average_deriv,
-                                                config.parameter_factor);
-          // we need to increase the factors, so multiply by parameter_factor.
-          bias_params(d) *= parameter_factor;
-          linear_params.Row(d).Scale(parameter_factor);
-        }
-        num_large_deriv++;
-      }
-    }
-    if (is_relu) {
-      KALDI_LOG << "For layer " << c << " (ReLU units), increased bias for "
-                << num_small_deriv << " indexes and decreased it for "
-                << num_large_deriv << ", out of a total of " << dim;
-    } else {
-      KALDI_LOG << "For layer " << c << ", decreased parameters for "
-                << num_small_deriv << " indexes, and increased them for "
-                << num_large_deriv << " out of a total of " << dim;
-    }
-    ac->SetParams(bias_params, linear_params);
-  }
-}
-  
-  
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/nnet-fix.h b/src/nnet2/nnet-fix.h
deleted file mode 100644
index 3da5f97c25f..00000000000
--- a/src/nnet2/nnet-fix.h
+++ /dev/null
@@ -1,74 +0,0 @@
-// nnet2/nnet-fix.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_NNET_FIX_H_
-#define KALDI_NNET2_NNET_FIX_H_
-
-#include "nnet2/nnet-nnet.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-/* This header provides a function FixNnet(), and associated config, which
-   is responsible for fixing certain pathologies in a neural network during
-   training.
-
-   For Sigmoid/Tanh units: it identifies neurons whose parameters are getting so large that
-   they are maxing out the sigmoid, and scales down those parameters by a
-   specified factor.  It also identifies neurons that have the opposite pathology
-   that they are just in the linear part of the sigmoid, and it scales up
-   their parameters.
-
-   For ReLU (rectified linear) units, it identifies neurons that are always zero
-   or close to zero, re-randomizes the corresponding parameters, increasing the bias.
-*/
-
-struct NnetFixConfig {
-  BaseFloat min_average_deriv; // Minimum average derivative that we allow,
-  // as a proportion of the maximum derivative of the nonlinearity (1.0 for tanh, 0.25 for sigmoid).
-  // If average derivative is less, we scale up the parameters.
-  BaseFloat max_average_deriv; // Maximum average derivative that we allow,
-  // also expressed relative to the maximum derivative of the nonlinearity.
-  BaseFloat parameter_factor; // Factor (>1.0) by which we change the parameters if
-  // the exceed the bounds above
-  BaseFloat relu_bias_change; // Change in bias for relus that are usually close to zero.
-
-  NnetFixConfig(): min_average_deriv(0.1), max_average_deriv(0.75),
-                   parameter_factor(2.0), relu_bias_change(1.0) { }
-  void Register(OptionsItf *opts) {
-    opts->Register("min-average-deriv", &min_average_deriv, "Miniumum derivative, "
-                   "averaged over the training data, that we allow for a nonlinearity,"
-                   "expressed relative to the maximum derivative of the nonlinearity,"
-                   "i.e. 1.0 for tanh or 0.25 for sigmoid, 1.0 for rectified linear.");
-    opts->Register("max-average-deriv", &max_average_deriv, "Maximum derivative, "
-                   "averaged over the training data, that we allow for the nonlinearity "
-                   "associated with one neuron.");
-    opts->Register("parameter-factor", &parameter_factor, "Maximum factor by which we change "
-                   "the set of parameters associated with a neuron.");
-    opts->Register("relu-bias-change", &relu_bias_change, "For ReLUs, change in bias when "
-                   "we identify a component that's too frequently on or off.");
-  }
-};
-
-void FixNnet(const NnetFixConfig &config, Nnet *nnet);
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif // KALDI_NNET2_NNET_FIX_H_
diff --git a/src/nnet2/nnet-functions.cc b/src/nnet2/nnet-functions.cc
deleted file mode 100644
index e1a53d668e5..00000000000
--- a/src/nnet2/nnet-functions.cc
+++ /dev/null
@@ -1,78 +0,0 @@
-// nnet2/nnet-functions.cc
-
-// Copyright 2011-2012  Karel Vesely
-//                      Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-nnet.h"
-#include "util/stl-utils.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-int32 IndexOfSoftmaxLayer(const Nnet &nnet) {
-  int32 index = -1, nc = nnet.NumComponents();
-  for (int32 c = 0; c < nc; c++) {
-    const Component *component = &(nnet.GetComponent(c));
-    if (dynamic_cast<const SoftmaxComponent*>(component) != NULL) {
-      if (index != -1) return -1; // >1 softmax components.
-      else index = c;
-    }
-  }
-  return index;
-}
-
-void InsertComponents(const Nnet &src_nnet,
-                      int32 c_to_insert, // component-index before which to insert.
-                      Nnet *dest_nnet) {
-  KALDI_ASSERT(c_to_insert >= 0 && c_to_insert <= dest_nnet->NumComponents());
-  int32 c_tot = dest_nnet->NumComponents() + src_nnet.NumComponents();
-  std::vector<Component*> components(c_tot);
-  for (int32 c = 0; c < c_to_insert; c++)
-    components[c] = dest_nnet->GetComponent(c).Copy();
-  for (int32 c = 0; c < src_nnet.NumComponents(); c++)
-    components[c + c_to_insert] = src_nnet.GetComponent(c).Copy();
-  for (int32 c = c_to_insert; c < dest_nnet->NumComponents(); c++)
-    components[c + src_nnet.NumComponents()] = dest_nnet->GetComponent(c).Copy();
-  // Re-initialize "dest_nnet" from the resulting list of components.
-
-  // The Init method will take ownership of the pointers in the vector:
-  dest_nnet->Init(&components);
-}
-
-
-void ReplaceLastComponents(const Nnet &src_nnet,
-                           int32 num_to_remove,
-                           Nnet *dest_nnet) {
-  KALDI_ASSERT(num_to_remove >= 0 && num_to_remove <= dest_nnet->NumComponents());
-  int32 c_orig = dest_nnet->NumComponents() - num_to_remove;
-
-  std::vector<Component*> components;
-  for (int32 c = 0; c < c_orig; c++)
-    components.push_back(dest_nnet->GetComponent(c).Copy());
-  for (int32 c = 0; c < src_nnet.NumComponents(); c++)
-    components.push_back(src_nnet.GetComponent(c).Copy());
-
-  // Re-initialize "dest_nnet" from the resulting list of components.
-  // The Init method will take ownership of the pointers in the vector:
-  dest_nnet->Init(&components);
-}
-
-
-
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/nnet-functions.h b/src/nnet2/nnet-functions.h
deleted file mode 100644
index f0d2023fca8..00000000000
--- a/src/nnet2/nnet-functions.h
+++ /dev/null
@@ -1,70 +0,0 @@
-// nnet2/nnet-functions.h
-
-// Copyright  2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_NNET_FUNCTIONS_H_
-#define KALDI_NNET2_NNET_FUNCTIONS_H_
-
-#include "base/kaldi-common.h"
-#include "util/kaldi-io.h"
-#include "matrix/matrix-lib.h"
-#include "nnet2/nnet-component.h"
-#include "nnet2/nnet-nnet.h"
-
-#include <iostream>
-#include <sstream>
-#include <vector>
-
-
-namespace kaldi {
-namespace nnet2 {
-
-// Here we declare various functions for manipulating the neural net,
-// such as adding new hidden layers; we'll add things like "mixing up"
-// to here.
-
-
-/// If "nnet" has exactly one softmax layer, this function will return
-/// its index; otherwise it will return -1.
-int32 IndexOfSoftmaxLayer(const Nnet &nnet);
-
-/**
-   Inserts the components of one neural network into a particular place in the
-   other one.  This is useful for adding hidden layers to a neural net.  Inserts
-   the components of "src_nnet" before component index c of "dest_nnet".
-*/
-void InsertComponents(const Nnet &src_nnet,
-                      int32 c,
-                      Nnet *dest_nnet);
-
-/**
-   Removes the last "num_to_remove" components and
-   adds the components from "src_nnet".
- */
-void ReplaceLastComponents(const Nnet &src_nnet,
-                           int32 num_to_remove,
-                           Nnet *dest_nnet);
-
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif
-
-
diff --git a/src/nnet2/nnet-limit-rank.cc b/src/nnet2/nnet-limit-rank.cc
deleted file mode 100644
index 4d303056c91..00000000000
--- a/src/nnet2/nnet-limit-rank.cc
+++ /dev/null
@@ -1,112 +0,0 @@
-// nnet2/nnet-limit-rank.cc
-
-// Copyright 2012   Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-limit-rank.h"
-#include "util/kaldi-thread.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-class LimitRankClass {
- public:
-  LimitRankClass(const NnetLimitRankOpts &opts,
-                 int32 c,
-                 Nnet *nnet): opts_(opts), c_(c), nnet_(nnet) { }
-  void operator () () {
-    AffineComponent *ac = dynamic_cast<AffineComponent*>(
-        &(nnet_->GetComponent(c_)));
-    KALDI_ASSERT(ac != NULL);
-
-    // We'll limit the rank of just the linear part, keeping the bias vector full.
-    Matrix<BaseFloat> M (ac->LinearParams());
-    int32 rows = M.NumRows(), cols = M.NumCols(), rc_min = std::min(rows, cols);
-    Vector<BaseFloat> s(rc_min);
-    Matrix<BaseFloat> U(rows, rc_min), Vt(rc_min, cols);
-    // Do the destructive svd M = U diag(s) V^T.  It actually outputs the transpose of V.
-    M.DestructiveSvd(&s, &U, &Vt);
-    SortSvd(&s, &U, &Vt); // Sort the singular values from largest to smallest.
-
-    int32 d = GetRetainedDim(rows, cols);
-    BaseFloat old_svd_sum = s.Sum();
-    U.Resize(rows, d, kCopyData);
-    s.Resize(d, kCopyData);
-    Vt.Resize(d, cols, kCopyData);
-    BaseFloat new_svd_sum = s.Sum();
-    KALDI_LOG << "For component " << c_ << " of dimension " << rows
-              << " x " << cols << ", reduced rank from "
-              << rc_min <<  " to " << d << ", SVD sum reduced from "
-              << old_svd_sum << " to " << new_svd_sum;
-    Vt.MulRowsVec(s); // Vt <-- diag(s) Vt.
-    M.AddMatMat(1.0, U, kNoTrans, Vt, kNoTrans, 0.0); // Reconstruct with reduced
-    // rank.
-    Vector<BaseFloat> bias_params(ac->BiasParams());
-    ac->SetParams(bias_params, M);
-  }
-
-  int32 GetRetainedDim(int32 rows, int32 cols) {
-    if (opts_.parameter_proportion <= 0.0 || opts_.parameter_proportion > 1.0)
-      KALDI_ERR << "bad --parameter-proportion " << opts_.parameter_proportion;
-    // If we do SVD to dimension d, so that it's U diag(s) V^T where
-    // U is rows * d, s is d, and V is cols * d, then the #params is as follows...
-    //   the first column of U has free parameters (#rows - 1) [the -1 is due to
-    //   the length constraint]; the second has (#rows - 2) [subtract 1 for the
-    //   length constraint and one for orthogonality with the previous row], etc.
-    //   Total is params(U) = (rows * d) - ((d(d+1))/2),
-    //            params(s) = d,
-    //            params(V) = (cols * d) - ((d(d+1))/2),
-    //   So total is (rows + cols) * d - d * d .
-    //   For example, if d = #rows, this equals (#rows * #cols)
-    //   We are solving for:
-    //   (rows * cols) * parameter_proportion = (rows + cols) * d - d * d, or
-    //   d^2 - d * (rows + cols) + (rows*cols)*parameter_proportion
-    //   In quadratic equation
-    //   a = 1.0,
-    //   b = -(rows + cols)
-    //   c = rows * cols * parameter_proportion.
-    //   Take smaller solution.
-    BaseFloat a = 1.0, b = -(rows + cols),
-        c = rows * cols * opts_.parameter_proportion;
-    BaseFloat x = (-b - sqrt(b * b - 4 * a * c)) / (2.0 * a);
-    int32 ans = static_cast<int32>(x);
-    KALDI_ASSERT(ans > 0 && ans <= std::min(rows, cols));
-    return ans;
-  }
-  
-  ~LimitRankClass() { }
- private:
-  const NnetLimitRankOpts &opts_;
-  int32 c_;
-  Nnet *nnet_;
-};
-
-
-void LimitRankParallel(const NnetLimitRankOpts &opts,
-                            Nnet *nnet) {
-  TaskSequencerConfig task_config;
-  task_config.num_threads = opts.num_threads;
-  TaskSequencer<LimitRankClass> tc(task_config);
-  for (int32 c = 0; c < nnet->NumComponents(); c++) {
-    if (dynamic_cast<AffineComponent*>(&(nnet->GetComponent(c))) != NULL)
-      tc.Run(new LimitRankClass(opts, c, nnet));
-  }
-}
-
-
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/nnet-limit-rank.h b/src/nnet2/nnet-limit-rank.h
deleted file mode 100644
index 1628f6ca4f6..00000000000
--- a/src/nnet2/nnet-limit-rank.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// nnet2/nnet-limit-rank.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_NNET_LIMIT_RANK_H_
-#define KALDI_NNET2_NNET_LIMIT_RANK_H_
-
-#include "nnet2/nnet-nnet.h"
-#include "util/table-types.h"
-#include "util/kaldi-semaphore.h"
-#include "util/kaldi-thread.h"
-#include "nnet2/nnet-update.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-struct NnetLimitRankOpts {
-  int32 num_threads;
-  BaseFloat parameter_proportion;
-  
-  NnetLimitRankOpts(): num_threads(1), parameter_proportion(0.75) { }
-
-  void Register(OptionsItf *opts) {
-    opts->Register("num-threads", &num_threads, "Number of threads used for "
-                   "rank-limiting operation; note, will never use more than "
-                   "#layers.");
-    opts->Register("parameter-proportion", &parameter_proportion, "Proportion of "
-                   "dimension of each transform to limit the rank to.");
-  }  
-};
-
-
-/// This function limits the rank of each affine transform in the
-/// neural net, by zeroing out the smallest singular values.  The number of
-/// singular values to zero out is determined on a layer by layer basis, using
-/// "parameter_proportion" to set the proportion of parameters to remove.
-void LimitRankParallel(const NnetLimitRankOpts &opts,
-                       Nnet *nnet);
-
-
-/// Also see the function LimitRankOfLastLayer in class Nnet.                            
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif // KALDI_NNET2_NNET_LIMIT_RANK_H_
diff --git a/src/nnet2/nnet-nnet-test.cc b/src/nnet2/nnet-nnet-test.cc
deleted file mode 100644
index 32c312fe234..00000000000
--- a/src/nnet2/nnet-nnet-test.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-// nnet2/nnet-nnet-test.cc
-
-// Copyright 2014  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-nnet.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-void UnitTestNnet() {
-  int32 input_dim = 40, output_dim = 500;
-  Nnet *nnet = GenRandomNnet(input_dim, output_dim);
-
-  bool binary = (rand() % 2 == 0);
-  std::ostringstream os;
-  nnet->Write(os, binary);
-  Nnet nnet2;
-  std::istringstream is(os.str());
-  nnet2.Read(is, binary);
-
-  std::ostringstream os2;
-  nnet2.Write(os2, binary);
-
-  KALDI_ASSERT(os2.str() == os.str());
-  delete nnet;
-}
-
-} // namespace nnet2
-} // namespace kaldi
-
-#include "matrix/matrix-functions.h"
-
-
-int main() {
-  using namespace kaldi;
-  using namespace kaldi::nnet2;
-
-  UnitTestNnet();
-  return 0;
-}
-  
diff --git a/src/nnet2/nnet-nnet.cc b/src/nnet2/nnet-nnet.cc
deleted file mode 100644
index 9fe10a4f5aa..00000000000
--- a/src/nnet2/nnet-nnet.cc
+++ /dev/null
@@ -1,846 +0,0 @@
-// nnet2/nnet-nnet.cc
-
-// Copyright 2011-2012  Karel Vesely
-//           2012-2014  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <set>
-#include <string>
-#include "nnet2/nnet-nnet.h"
-#include "util/stl-utils.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-int32 Nnet::OutputDim() const {
-  KALDI_ASSERT(!components_.empty());
-  return components_.back()->OutputDim();
-}
-
-int32 Nnet::InputDim() const {
-  KALDI_ASSERT(!components_.empty());
-  return components_.front()->InputDim();
-}
-
-
-int32 Nnet::LeftContext() const {
-  KALDI_ASSERT(!components_.empty());
-  int32 ans = 0;
-  for (size_t i = 0; i < components_.size(); i++) {
-    ans += components_[i]->Context().front();
-  }
-  return -1*ans;
-  // nnet-components return left context as a non-positive integer
-  // however the nnet-update, nnet-compute expect a
-  // non-negative left context. In addition, the NnetExample also stores data
-  // left context as positive integer. To be compatible with these other classes
-  // Nnet::LeftContext() returns a non-negative left context.
-}
-
-int32 Nnet::RightContext() const {
-  KALDI_ASSERT(!components_.empty());
-  int32 ans = 0;
-  for (size_t i = 0; i < components_.size(); i++) {
-    ans += components_[i]->Context().back();
-  }
-  return ans;
-}
-
-void Nnet::ComputeChunkInfo(int32 input_chunk_size,
-                            int32 num_chunks,
-                            std::vector<ChunkInfo> *chunk_info_out) const {
-  // First compute the output-chunk indices for the last component in the
-  // network. we assume that the numbering of the input starts from zero.
-  int32 output_chunk_size = input_chunk_size - LeftContext() - RightContext();
-  KALDI_ASSERT(output_chunk_size > 0);
-  std::vector<int32> current_output_inds;
-  for (int32 i = 0; i < output_chunk_size; i++)
-    current_output_inds.push_back(i + LeftContext());
-
-  (*chunk_info_out).resize(NumComponents() + 1);
-
-  // indexes for last component is empty, since the last component's chunk is
-  // always contiguous
-  // component's output is always contiguous
-  (*chunk_info_out)[NumComponents()] = ChunkInfo(
-      GetComponent(NumComponents() - 1).OutputDim(),
-      num_chunks, current_output_inds.front(),
-      current_output_inds.back());
-
-  std::vector<int32> current_input_inds;
-  for (int32 i = NumComponents() - 1; i >= 0; i--) {
-    std::vector<int32> current_context = GetComponent(i).Context();
-    std::set<int32> current_input_ind_set;
-    for (size_t j = 0; j < current_context.size(); j++)
-      for (size_t k = 0; k < current_output_inds.size(); k++)
-        current_input_ind_set.insert(current_context[j] +
-                                     current_output_inds[k]);
-    current_output_inds.resize(current_input_ind_set.size());
-    std::copy(current_input_ind_set.begin(),
-              current_input_ind_set.end(),
-              current_output_inds.begin());
-
-    // checking if the vector has contiguous data
-    // assign indexes only if the data is not contiguous
-    if (current_output_inds.size() !=
-        current_output_inds.back() - current_output_inds.front() + 1) {
-      (*chunk_info_out)[i] = ChunkInfo(GetComponent(i).InputDim(),
-                                       num_chunks,
-                                       current_output_inds);
-    } else  {
-      (*chunk_info_out)[i] = ChunkInfo(GetComponent(i).InputDim(),
-                                       num_chunks,
-                                       current_output_inds.front(),
-                                       current_output_inds.back());
-    }
-  }
-
-  // TODO: Make a set of components which can deal with data rearrangement.
-  // Define this set in an appropriate place so that
-  // users adding new components can simply update the list.
-  const char *dinit[] = {"SpliceComponent", "SpliceMaxComponent"};
-  std::vector< std::string > data_rearrange_components(dinit, dinit + 2);
-
-  // Ensuring that all components until the first component capable of data
-  // rearrangement (e.g. SpliceComponent|SpliceMaxComponent) operate on
-  // contiguous chunks at the input
-  for (size_t i = 0 ; i < NumComponents() ; i++) {
-      (*chunk_info_out)[i].MakeOffsetsContiguous();
-      // Check if the current component is present in the set of components
-      // capable of data rearrangement.
-      if (std::find(data_rearrange_components.begin(),
-                    data_rearrange_components.end(),
-                    components_[i]->Type())
-          != data_rearrange_components.end())
-          break;
-  }
-
-  // sanity testing for chunk_info_out vector
-  for (size_t i = 0; i < chunk_info_out->size(); i++) {
-    (*chunk_info_out)[i].Check();
-    // (*chunk_info_out)[i].ToString();
-  }
-}
-
-const Component& Nnet::GetComponent(int32 component) const {
-  KALDI_ASSERT(static_cast<size_t>(component) < components_.size());
-  return *(components_[component]);
-}
-
-Component& Nnet::GetComponent(int32 component) {
-  KALDI_ASSERT(static_cast<size_t>(component) < components_.size());
-  return *(components_[component]);
-}
-
-void Nnet::SetZero(bool treat_as_gradient) {
-  for (size_t i = 0; i < components_.size(); i++) {
-    UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(components_[i]);
-    if (uc != NULL) uc->SetZero(treat_as_gradient);
-    NonlinearComponent *nc = dynamic_cast<NonlinearComponent*>(components_[i]);
-    if (nc != NULL) nc->Scale(0.0);
-  }
-}
-
-void Nnet::Write(std::ostream &os, bool binary) const {
-  Check();
-  WriteToken(os, binary, "<Nnet>");
-  int32 num_components = components_.size();
-  WriteToken(os, binary, "<NumComponents>");
-  WriteBasicType(os, binary, num_components);
-  WriteToken(os, binary, "<Components>");
-  for (int32 c = 0; c < num_components; c++) {
-    components_[c]->Write(os, binary);
-    if (!binary) os << std::endl;
-  }
-  WriteToken(os, binary, "</Components>");
-  WriteToken(os, binary, "</Nnet>");
-}
-
-void Nnet::Read(std::istream &is, bool binary) {
-  Destroy();
-  ExpectToken(is, binary, "<Nnet>");
-  int32 num_components;
-  ExpectToken(is, binary, "<NumComponents>");
-  ReadBasicType(is, binary, &num_components);
-  ExpectToken(is, binary, "<Components>");
-  components_.resize(num_components);
-  for (int32 c = 0; c < num_components; c++)
-    components_[c] = Component::ReadNew(is, binary);
-  ExpectToken(is, binary, "</Components>");
-  ExpectToken(is, binary, "</Nnet>");
-  SetIndexes();
-  Check();
-}
-
-
-void Nnet::ZeroStats() {
-  for (size_t i = 0; i < components_.size(); i++) {
-    NonlinearComponent *nonlinear_component =
-        dynamic_cast<NonlinearComponent*>(components_[i]);
-    if (nonlinear_component != NULL)
-      nonlinear_component->Scale(0.0);  // Zero the stats this way.
-  }
-}
-void Nnet::Destroy() {
-  while (!components_.empty()) {
-    delete components_.back();
-    components_.pop_back();
-  }
-}
-
-void Nnet::ComponentDotProducts(
-    const Nnet &other,
-    VectorBase<BaseFloat> *dot_prod) const {
-  KALDI_ASSERT(dot_prod->Dim() == NumUpdatableComponents());
-  int32 index = 0;
-  for (size_t i = 0; i < components_.size(); i++) {
-    UpdatableComponent *uc1 = dynamic_cast<UpdatableComponent*>(components_[i]);
-    const UpdatableComponent *uc2 = dynamic_cast<const UpdatableComponent*>(
-        &(other.GetComponent(i)));
-    KALDI_ASSERT((uc1 != NULL) == (uc2 != NULL));
-    if (uc1 != NULL) {
-      (*dot_prod)(index) = uc1->DotProduct(*uc2);
-      index++;
-    }
-  }
-  KALDI_ASSERT(index == NumUpdatableComponents());
-}
-
-
-Nnet::Nnet(const Nnet &other): components_(other.components_.size()) {
-  for (size_t i = 0; i < other.components_.size(); i++)
-    components_[i] = other.components_[i]->Copy();
-  SetIndexes();
-  Check();
-}
-
-Nnet::Nnet(const Nnet &other1, const Nnet &other2) {
-  int32 dim1 = other1.OutputDim(), dim2 = other2.InputDim();
-  if (dim1 != dim2)
-    KALDI_ERR << "Concatenating neural nets: dimension mismatch "
-              << dim1 << " vs. " << dim2;
-  for (size_t i = 0; i < other1.components_.size(); i++)
-    components_.push_back(other1.components_[i]->Copy());
-  for (size_t i = 0; i < other2.components_.size(); i++)
-    components_.push_back(other2.components_[i]->Copy());
-  SetIndexes();
-  Check();
-}
-
-
-Nnet &Nnet::operator = (const Nnet &other) {
-  Destroy();
-  components_.resize(other.components_.size());
-  for (size_t i = 0; i < other.components_.size(); i++)
-    components_[i] = other.components_[i]->Copy();
-  SetIndexes();
-  Check();
-  return *this;
-}
-
-std::string Nnet::Info() const {
-  std::ostringstream ostr;
-  ostr << "num-components " << NumComponents() << std::endl;
-  ostr << "num-updatable-components " << NumUpdatableComponents() << std::endl;
-  ostr << "left-context " << LeftContext() << std::endl;
-  ostr << "right-context " << RightContext() << std::endl;
-  ostr << "input-dim " << InputDim() << std::endl;
-  ostr << "output-dim " << OutputDim() << std::endl;
-  ostr << "parameter-dim " << GetParameterDim() << std::endl;
-  for (int32 i = 0; i < NumComponents(); i++)
-    ostr << "component " << i << " : " << components_[i]->Info() << std::endl;
-  return ostr.str();
-}
-
-void Nnet::Check() const {
-  for (size_t i = 0; i + 1 < components_.size(); i++) {
-    KALDI_ASSERT(components_[i] != NULL);
-    int32 output_dim = components_[i]->OutputDim(),
-      next_input_dim = components_[i+1]->InputDim();
-    KALDI_ASSERT(output_dim == next_input_dim);
-    KALDI_ASSERT(components_[i]->Index() == static_cast<int32>(i));
-  }
-}
-
-void Nnet::Init(std::istream &is) {
-  Destroy();
-  std::string line;
-  /* example config file as follows.  The things in brackets specify the context
-     splicing for each layer, and after that is the info about the actual layer.
-     Imagine the input dim is 13, and the speaker dim is 40, so (13 x 9) + 40 =  527.
-     The config file might be as follows; the lines beginning with # are comments.
-
-     # layer-type layer-options
-     AffineLayer 0.01 0.001 527 1000 0.04356
-  */
-  components_.clear();
-  while (getline(is, line)) {
-    std::istringstream line_is(line);
-    line_is >> std::ws;  // Eat up whitespace.
-    if (line_is.peek() == '#' || line_is.eof()) continue;  // Comment or empty.
-    Component *c = Component::NewFromString(line);
-    KALDI_ASSERT(c != NULL);
-    components_.push_back(c);
-  }
-  SetIndexes();
-  Check();
-}
-
-void Nnet::Init(std::vector<Component*> *components) {
-  Destroy();
-  components_.swap(*components);
-  SetIndexes();
-  Check();
-}
-
-
-void Nnet::ScaleLearningRates(BaseFloat factor) {
-  std::ostringstream ostr;
-  for (int32 c = 0; c < NumComponents(); c++) {
-    UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(components_[c]);
-    if (uc != NULL) {  // Updatable component...
-      uc->SetLearningRate(uc->LearningRate() * factor);
-      ostr << uc->LearningRate() << " ";
-    }
-  }
-  KALDI_LOG << "Scaled learning rates by " << factor
-            << ", new learning rates are "
-            << ostr.str();
-}
-
-void Nnet::ScaleLearningRates(std::map<std::string, BaseFloat> scale_factors) {
-  std::ostringstream ostr;
-  for (int32 c = 0; c < NumComponents(); c++) {
-    UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(components_[c]);
-    if (uc != NULL) {  // Updatable component...
-      // check if scaling factor was specified for a component of this type
-      std::map<std::string, BaseFloat>::const_iterator lr_iterator =
-        scale_factors.find(uc->Type());
-      if (lr_iterator != scale_factors.end())  {
-        uc->SetLearningRate(uc->LearningRate() * lr_iterator->second);
-        ostr << uc->LearningRate() << " ";
-      }
-    }
-  }
-  KALDI_LOG << "Scaled learning rates by component-type specific factor, "
-            << "new learning rates are "
-            << ostr.str();
-}
-
-void Nnet::SetLearningRates(BaseFloat learning_rate) {
-  for (int32 c = 0; c < NumComponents(); c++) {
-    UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(components_[c]);
-    if (uc != NULL) {  // Updatable component...
-      uc->SetLearningRate(learning_rate);
-    }
-  }
-  KALDI_LOG << "Set learning rates to " << learning_rate;
-}
-
-void Nnet::ResizeOutputLayer(int32 new_num_pdfs) {
-  KALDI_ASSERT(new_num_pdfs > 0);
-  KALDI_ASSERT(NumComponents() > 2);
-  int32 nc = NumComponents();
-  SumGroupComponent *sgc =
-      dynamic_cast<SumGroupComponent*>(components_[nc - 1]);
-  if (sgc != NULL) {
-    // Remove it.  We'll resize things later.
-    delete sgc;
-    components_.erase(components_.begin() + nc - 1,
-                      components_.begin() + nc);
-    nc--;
-  }
-  SoftmaxComponent *sc;
-  if ((sc = dynamic_cast<SoftmaxComponent*>(components_[nc - 1])) == NULL)
-    KALDI_ERR << "Expected last component to be SoftmaxComponent.";
-
-  // check if nc-1 has a FixedScaleComponent
-  bool has_fixed_scale_component = false;
-  int32 fixed_scale_component_index = -1;
-  int32 final_affine_component_index = nc - 2;
-  int32 softmax_component_index = nc - 1;
-  FixedScaleComponent *fsc =
-      dynamic_cast<FixedScaleComponent*>(
-          components_[final_affine_component_index]);
-  if (fsc != NULL)  {
-    has_fixed_scale_component = true;
-    fixed_scale_component_index = nc - 2;
-    final_affine_component_index = nc - 3;
-  }
-  // note: it could be child class of AffineComponent.
-  AffineComponent *ac = dynamic_cast<AffineComponent*>(
-      components_[final_affine_component_index]);
-  if (ac == NULL)
-    KALDI_ERR << "Network doesn't have expected structure (didn't find final "
-              << "AffineComponent).";
-  if (has_fixed_scale_component)  {
-    // collapse the fixed_scale_component with the affine_component before it
-    AffineComponent *ac_new =
-        dynamic_cast<AffineComponent*>(ac->CollapseWithNext(*fsc));
-    KALDI_ASSERT(ac_new != NULL);
-    delete fsc;
-    delete ac;
-    components_.erase(components_.begin() + fixed_scale_component_index,
-                      components_.begin() + (fixed_scale_component_index + 1));
-    components_[final_affine_component_index] = ac_new;
-    ac = ac_new;
-    softmax_component_index = softmax_component_index - 1;
-  }
-  ac->Resize(ac->InputDim(), new_num_pdfs);
-  // Remove the softmax component, and replace it with a new one
-  delete components_[softmax_component_index];
-  components_[softmax_component_index] = new SoftmaxComponent(new_num_pdfs);
-  this->SetIndexes();  // used for debugging
-  this->Check();
-}
-
-int32 Nnet::NumUpdatableComponents() const {
-  int32 ans = 0;
-  for (int32 i = 0; i < NumComponents(); i++)
-    if (dynamic_cast<const UpdatableComponent*>(&(GetComponent(i))) != NULL)
-      ans++;
-  return ans;
-}
-
-void Nnet::ScaleComponents(const VectorBase<BaseFloat> &scale_params) {
-  KALDI_ASSERT(scale_params.Dim() == this->NumUpdatableComponents());
-  int32 i = 0;
-  for (int32 j = 0; j < NumComponents(); j++) {
-    UpdatableComponent *uc =
-        dynamic_cast<UpdatableComponent*>(&(GetComponent(j)));
-    if (uc!= NULL) {
-      uc->Scale(scale_params(i));
-      i++;
-    }
-  }
-  KALDI_ASSERT(i == scale_params.Dim());
-}
-
-// Scales all UpdatableComponents and all NonlinearComponents.
-void Nnet::Scale(BaseFloat scale) {
-  for (int32 i = 0; i < NumComponents(); i++) {
-    UpdatableComponent *uc =
-        dynamic_cast<UpdatableComponent*>(&(GetComponent(i)));
-    if (uc != NULL) uc->Scale(scale);
-    NonlinearComponent *nc =
-        dynamic_cast<NonlinearComponent*>(&(GetComponent(i)));
-    if (nc != NULL) nc->Scale(scale);
-  }
-}
-
-void Nnet::CopyStatsFrom(const Nnet &other) {
-  KALDI_ASSERT(NumComponents() == other.NumComponents());
-  for (int32 i = 0; i < NumComponents(); i++) {
-    NonlinearComponent *nc =
-        dynamic_cast<NonlinearComponent*>(&(GetComponent(i)));
-    const NonlinearComponent *nc_other =
-        dynamic_cast<const NonlinearComponent*>(&(other.GetComponent(i)));
-    if (nc != NULL) {
-      nc->Scale(0.0);
-      nc->Add(1.0, *nc_other);
-    }
-  }
-}
-
-void Nnet::SetLearningRates(const VectorBase<BaseFloat> &learning_rates) {
-  KALDI_ASSERT(learning_rates.Dim() == this->NumUpdatableComponents());
-  KALDI_ASSERT(learning_rates.Min() >= 0.0);  // we allow zero learning rate.
-  int32 i = 0;
-  for (int32 j = 0; j < NumComponents(); j++) {
-    UpdatableComponent *uc =
-        dynamic_cast<UpdatableComponent*>(&(GetComponent(j)));
-    if (uc!= NULL) {
-      uc->SetLearningRate(learning_rates(i));
-      i++;
-    }
-  }
-  KALDI_ASSERT(i == learning_rates.Dim());
-}
-
-void Nnet::GetLearningRates(VectorBase<BaseFloat> *learning_rates) const {
-  KALDI_ASSERT(learning_rates->Dim() == this->NumUpdatableComponents());
-  int32 i = 0;
-  for (int32 j = 0; j < NumComponents(); j++) {
-    const UpdatableComponent *uc =
-        dynamic_cast<const UpdatableComponent*>(&(GetComponent(j)));
-    if (uc!= NULL) {
-      (*learning_rates)(i) = uc->LearningRate();
-      i++;
-    }
-  }
-  KALDI_ASSERT(i == learning_rates->Dim());
-}
-
-void Nnet::Resize(int32 new_size) {
-  KALDI_ASSERT(new_size <= static_cast<int32>(components_.size()));
-  for (size_t i = new_size; i < components_.size(); i++)
-    delete components_[i];
-  components_.resize(new_size);
-}
-
-void Nnet::RemoveDropout() {
-  std::vector<Component*> components;
-  int32 removed = 0;
-  for (size_t i = 0; i < components_.size(); i++) {
-    if (dynamic_cast<DropoutComponent*>(components_[i]) != NULL ||
-        dynamic_cast<AdditiveNoiseComponent*>(components_[i]) != NULL) {
-      delete components_[i];
-      removed++;
-    } else {
-      components.push_back(components_[i]);
-    }
-  }
-  components_ = components;
-  if (removed > 0)
-    KALDI_LOG << "Removed " << removed << " dropout components.";
-  SetIndexes();
-  Check();
-}
-
-void Nnet::SetDropoutScale(BaseFloat scale) {
-  size_t n_set = 0;
-  for (size_t i = 0; i < components_.size(); i++) {
-    DropoutComponent *dc =
-        dynamic_cast<DropoutComponent*>(components_[i]);
-    if (dc != NULL) {
-      dc->SetDropoutScale(scale);
-      n_set++;
-    }
-  }
-  KALDI_LOG << "Set dropout scale to " << scale
-            << " for " << n_set << " components.";
-}
-
-
-void Nnet::RemovePreconditioning() {
-  for (size_t i = 0; i < components_.size(); i++) {
-    if (dynamic_cast<AffineComponentPreconditioned*>(components_[i]) != NULL) {
-      AffineComponent *ac = new AffineComponent(
-          *(dynamic_cast<AffineComponent*>(components_[i])));
-      delete components_[i];
-      components_[i] = ac;
-    } else if (dynamic_cast<AffineComponentPreconditionedOnline*>(
-        components_[i]) != NULL) {
-      AffineComponent *ac = new AffineComponent(
-          *(dynamic_cast<AffineComponent*>(components_[i])));
-      delete components_[i];
-      components_[i] = ac;
-    }
-  }
-  SetIndexes();
-  Check();
-}
-
-
-void Nnet::SwitchToOnlinePreconditioning(int32 rank_in, int32 rank_out,
-                                         int32 update_period,
-                                         BaseFloat num_samples_history,
-                                         BaseFloat alpha) {
-  int32 switched = 0;
-  for (size_t i = 0; i < components_.size(); i++) {
-    if (dynamic_cast<AffineComponent*>(components_[i]) != NULL) {
-      AffineComponentPreconditionedOnline *ac =
-          new AffineComponentPreconditionedOnline(
-              *(dynamic_cast<AffineComponent*>(components_[i])),
-              rank_in, rank_out, update_period, num_samples_history, alpha);
-      delete components_[i];
-      components_[i] = ac;
-      switched++;
-    }
-  }
-  KALDI_LOG << "Switched " << switched << " components to use online "
-            << "preconditioning, with (input, output) rank = "
-            << rank_in << ", " << rank_out << " and num_samples_history = "
-            << num_samples_history;
-  SetIndexes();
-  Check();
-}
-
-
-void Nnet::AddNnet(const VectorBase<BaseFloat> &scale_params,
-                   const Nnet &other) {
-  KALDI_ASSERT(scale_params.Dim() == this->NumUpdatableComponents());
-  int32 i = 0;
-  for (int32 j = 0; j < NumComponents(); j++) {
-    UpdatableComponent *uc =
-        dynamic_cast<UpdatableComponent*>(&(GetComponent(j)));
-    const UpdatableComponent *uc_other =
-        dynamic_cast<const UpdatableComponent*>(&(other.GetComponent(j)));
-    if (uc != NULL) {
-      KALDI_ASSERT(uc_other != NULL);
-      BaseFloat alpha = scale_params(i);
-      uc->Add(alpha, *uc_other);
-      i++;
-    }
-  }
-  KALDI_ASSERT(i == scale_params.Dim());
-}
-
-void Nnet::AddNnet(BaseFloat alpha,
-                   const Nnet &other) {
-  for (int32 i = 0; i < NumComponents(); i++) {
-    UpdatableComponent *uc =
-        dynamic_cast<UpdatableComponent*>(&(GetComponent(i)));
-    const UpdatableComponent *uc_other =
-        dynamic_cast<const UpdatableComponent*>(&(other.GetComponent(i)));
-    if (uc != NULL) {
-      KALDI_ASSERT(uc_other != NULL);
-      uc->Add(alpha, *uc_other);
-    }
-    NonlinearComponent *nc =
-        dynamic_cast<NonlinearComponent*>(&(GetComponent(i)));
-    const NonlinearComponent *nc_other =
-        dynamic_cast<const NonlinearComponent*>(&(other.GetComponent(i)));
-    if (nc != NULL) {
-      KALDI_ASSERT(nc_other != NULL);
-      nc->Add(alpha, *nc_other);
-    }
-  }
-}
-
-void Nnet::AddNnet(BaseFloat alpha,
-                   Nnet *other,
-                   BaseFloat beta) {
-  for (int32 i = 0; i < NumComponents(); i++) {
-    UpdatableComponent *uc =
-        dynamic_cast<UpdatableComponent*>(&(GetComponent(i)));
-    UpdatableComponent *uc_other =
-        dynamic_cast<UpdatableComponent*>(&(other->GetComponent(i)));
-    if (uc != NULL) {
-      KALDI_ASSERT(uc_other != NULL);
-      uc->Add(alpha, *uc_other);
-      uc_other->Scale(beta);
-    }
-    NonlinearComponent *nc =
-        dynamic_cast<NonlinearComponent*>(&(GetComponent(i)));
-    NonlinearComponent *nc_other =
-        dynamic_cast<NonlinearComponent*>(&(other->GetComponent(i)));
-    if (nc != NULL) {
-      KALDI_ASSERT(nc_other != NULL);
-      nc->Add(alpha, *nc_other);
-      nc_other->Scale(beta);
-    }
-  }
-}
-
-
-void Nnet::Append(Component *new_component) {
-  components_.push_back(new_component);
-  SetIndexes();
-  Check();
-}
-
-void Nnet::SetComponent(int32 c, Component *component) {
-  KALDI_ASSERT(static_cast<size_t>(c) < components_.size());
-  delete components_[c];
-  components_[c] = component;
-  SetIndexes();
-  Check();  // Check that all the dimensions still match up.
-}
-
-int32 Nnet::GetParameterDim() const {
-  int32 ans = 0;
-  for (int32 c = 0; c < NumComponents(); c++) {
-    const UpdatableComponent *uc = dynamic_cast<const UpdatableComponent*>(
-        &(GetComponent(c)));
-    if (uc != NULL)
-      ans += uc->GetParameterDim();
-  }
-  return ans;
-}
-
-void Nnet::Vectorize(VectorBase<BaseFloat> *params) const {
-  int32 offset = 0;
-  for (int32 c = 0; c < NumComponents(); c++) {
-    const UpdatableComponent *uc = dynamic_cast<const UpdatableComponent*>(
-        &(GetComponent(c)));
-    if (uc != NULL) {
-      int32 size = uc->GetParameterDim();
-      SubVector<BaseFloat> temp(*params, offset, size);
-      uc->Vectorize(&temp);
-      offset += size;
-    }
-  }
-  KALDI_ASSERT(offset == GetParameterDim());
-}
-
-void Nnet::ResetGenerators() {
-  // resets random-number generators for all random
-  // components.
-  for (int32 c = 0; c < NumComponents(); c++) {
-    RandomComponent *rc = dynamic_cast<RandomComponent*>(
-        &(GetComponent(c)));
-    if (rc != NULL)
-      rc->ResetGenerator();
-  }
-}
-
-void Nnet::UnVectorize(const VectorBase<BaseFloat> &params) {
-  int32 offset = 0;
-  for (int32 c = 0; c < NumComponents(); c++) {
-    UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(
-        &(GetComponent(c)));
-    if (uc != NULL) {
-      int32 size = uc->GetParameterDim();
-      uc->UnVectorize(params.Range(offset, size));
-      offset += size;
-    }
-  }
-  KALDI_ASSERT(offset == GetParameterDim());
-}
-
-void Nnet::LimitRankOfLastLayer(int32 dim) {
-  for (int32 i = components_.size() - 1; i >= 0; i--) {
-    AffineComponent *a = NULL, *b = NULL,
-        *c = dynamic_cast<AffineComponent*>(components_[i]);
-    if (c != NULL) {
-      c->LimitRank(dim, &a, &b);
-      delete c;
-      components_[i] = a;
-      components_.insert(components_.begin() + i + 1, b);
-      this->SetIndexes();
-      this->Check();
-      return;
-    }
-  }
-  KALDI_ERR << "No affine component found in neural net.";
-}
-
-void Nnet::SetIndexes() {
-  for (size_t i = 0; i < components_.size(); i++)
-    components_[i]->SetIndex(i);
-}
-
-void Nnet::Collapse(bool match_updatableness) {
-  int32 num_collapsed = 0;
-  bool changed = true;
-  while (changed) {
-    changed = false;
-    for (size_t i = 0; i + 1 < components_.size(); i++) {
-      AffineComponent *a1 = dynamic_cast<AffineComponent*>(components_[i]),
-          *a2 = dynamic_cast<AffineComponent*>(components_[i + 1]);
-      FixedAffineComponent
-          *f1 = dynamic_cast<FixedAffineComponent*>(components_[i]),
-          *f2 = dynamic_cast<FixedAffineComponent*>(components_[i + 1]);
-      Component *c = NULL;
-      if (a1 != NULL && a2 != NULL) {
-        c = a1->CollapseWithNext(*a2);
-      } else if (a1 != NULL && f2 != NULL && !match_updatableness) {
-        c = a1->CollapseWithNext(*f2);
-      } else if (f1 != NULL && a2 != NULL && !match_updatableness) {
-        c = a2->CollapseWithPrevious(*f1);
-      }
-      if (c != NULL) {
-        delete components_[i];
-        delete components_[i + 1];
-        components_[i] = c;
-        // This was causing valgrind errors, so doing it differently.  Either
-        // a standard-library bug or I misunderstood something.
-        // components_.erase(components_.begin() + i + i,
-        //                   components_.begin() + i + 2);
-        for (size_t j = i + 1; j + 1 < components_.size(); j++)
-          components_[j] = components_[j + 1];
-        components_.pop_back();
-        changed = true;
-        num_collapsed++;
-      }
-    }
-  }
-  this->SetIndexes();
-  this->Check();
-  KALDI_LOG << "Collapsed " << num_collapsed << " components."
-            << (num_collapsed == 0 && match_updatableness == true ?
-                "  Try --match-updatableness=false." : "");
-}
-
-Nnet *GenRandomNnet(int32 input_dim,
-                    int32 output_dim) {
-  std::vector<Component*> components;
-  int32 cur_dim = input_dim;
-  // have up to 10 layers before the final one.
-  for (size_t i = 0; i < 10; i++) {
-    if (rand() % 2 == 0) {
-      // add an affine component.
-      int32 next_dim = 50 + rand() % 100;
-      BaseFloat learning_rate = 0.0001, param_stddev = 0.001,
-          bias_stddev = 0.1;
-      AffineComponent *component = new AffineComponent();
-      component->Init(learning_rate, cur_dim, next_dim,
-                      param_stddev, bias_stddev);
-      components.push_back(component);
-      cur_dim = next_dim;
-    } else if (rand() % 2 == 0) {
-      components.push_back(new SigmoidComponent(cur_dim));
-    } else if (rand() % 2 == 0 && cur_dim < 200) {
-      SpliceComponent *component = new SpliceComponent();
-      std::vector<int32> context;
-      while (true) {
-        context.clear();
-        for (int32 i = -3; i <= 3; i++) {
-          if (rand() % 3 == 0)
-            context.push_back(i);
-        }
-        if (!context.empty() && context.front() <= 0 &&
-            context.back() >= 0)
-          break;
-      }
-      component->Init(cur_dim, context);
-      components.push_back(component);
-      cur_dim = cur_dim * context.size();
-    } else {
-      break;
-    }
-  }
-
-  {
-    AffineComponent *component = new AffineComponent();
-    BaseFloat learning_rate = 0.0001, param_stddev = 0.001,
-        bias_stddev = 0.1;
-    component->Init(learning_rate, cur_dim, output_dim,
-                    param_stddev, bias_stddev);
-    components.push_back(component);
-    cur_dim = output_dim;
-  }
-
-  components.push_back(new SoftmaxComponent(cur_dim));
-
-  Nnet *ans = new Nnet();
-  ans->Init(&components);
-  return ans;
-}
-
-int32 Nnet::FirstUpdatableComponent() const {
-  for (int32 i = 0; i < NumComponents(); i++) {
-    if (dynamic_cast<UpdatableComponent*>(components_[i]) != NULL)
-      return i;
-  }
-  return NumComponents();
-}
-
-
-int32 Nnet::LastUpdatableComponent() const {
-  for (int32 i = NumComponents() - 1; i >= 0; i--)
-    if (dynamic_cast<UpdatableComponent*>(components_[i]) != NULL)
-      return i;
-  return -1;
-}
-
-}  // namespace nnet2
-}  // namespace kaldi
-
diff --git a/src/nnet2/nnet-nnet.h b/src/nnet2/nnet-nnet.h
deleted file mode 100644
index c9c14cf2bae..00000000000
--- a/src/nnet2/nnet-nnet.h
+++ /dev/null
@@ -1,306 +0,0 @@
-// nnet2/nnet-nnet.h
-
-// Copyright 2011-2012  Karel Vesely
-//           2012-2014  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_NNET_NNET_H_
-#define KALDI_NNET2_NNET_NNET_H_
-
-#include "base/kaldi-common.h"
-#include "util/kaldi-io.h"
-#include "matrix/matrix-lib.h"
-#include "nnet2/nnet-component.h"
-
-#include <iostream>
-#include <sstream>
-#include <vector>
-#include <map>
-
-
-namespace kaldi {
-namespace nnet2 {
-
-
-/*
-  This neural net is basically a series of Components, and is a fairly
-  passive object that mainly acts as a store for the Components.  Training
-  is handled by a separate class NnetTrainer(), and extracting likelihoods
-  for decoding is handled by DecodableNnetCpu(). 
-  
-  There are a couple of things that make this class more than just a vector of
-  Components.
-
-   (1) It handles frame splicing (temporal context.)
-   We'd like to encompass the approach described in
-   http://www.fit.vutbr.cz/research/groups/speech/publi/2011/vesely_asru2011_00042.pdf
-   where at a certain point they splice together frames -10, -5, 0, +5 and +10.  It
-   seems that it's not necessarily best to splice together a contiguous sequence
-   of frames.
-
-   (2) It handles situations where the input features have two parts--
-   a "frame-specific" part (the normal features), and a "speaker-specific", or at
-   least utterance-specific part that does not vary with the frame index.
-   These features are provided separately from the frame-specific ones, to avoid
-   redundancy.
-*/
-
-
-class Nnet {
- public:
-  
-  /// Returns number of components-- think of this as similar to # of layers, but
-  /// e.g. the nonlinearity and the linear part count as separate components,
-  /// so the number of components will be more than the number of layers.
-  int32 NumComponents() const { return components_.size(); }
-
-  const Component &GetComponent(int32 c) const;
-  
-  Component &GetComponent(int32 c);
-
-  /// Sets the c'th component to "component", taking ownership of the pointer
-  /// and deleting the corresponding one that we own.
-  void SetComponent(int32 c, Component *component);
-  
-  /// Returns the left-context summed over all the Components... this is the
-  /// entire left-context in frames, that the network requires.
-  int32 LeftContext() const;
-
-  /// Returns the right-context summed over all the Components... this is the
-  /// entire right-context in frames, that the network requires.
-  int32 RightContext() const;
-  
-  /// The output dimension of the network -- typically
-  /// the number of pdfs.
-  int32 OutputDim() const;
-
-  /// Dimension of the input features, e.g. 13 or 40.  Does not
-  /// take account of frame splicing-- that is done with the "chunk"
-  /// mechanism, where you provide chunks of features over time.
-  int32 InputDim() const; 
-  
-  /// Uses the output of the Context() functions of the network, to compute a
-  /// vector of size NumComponents() + 1 indexed by component-index c, of the
-  /// chunk-info at the input of each layer c, where the c+1'th element contains
-  /// the chunk-info at the output of that layer.
-  /// The "input_chunk_size" is the time extent of the input.  If you want to
-  /// produce exactly 1 output frame per chunk, then this should equal 1 +
-  /// LeftContext() + RightContext().
-  void ComputeChunkInfo(int32 input_chunk_size,
-                        int32 num_chunks,
-                        std::vector<ChunkInfo> *chunk_info_out) const;
-
-  void ZeroStats(); // zeroes the stats on the nonlinear layers.
-
-  /// Copies only the statistics in layers of type NonlinearComponewnt, from
-  /// this neural net, leaving everything else fixed.
-  void CopyStatsFrom(const Nnet &nnet);
-
-  /// Returns the index of the lowest-numbered component which is updatable, or
-  /// NumComponents() if none are updatable.
-  int32 FirstUpdatableComponent() const;
-  
-  /// Returns the index of the highest-numbered component which is updatable, or
-  /// -1 if none are updatable.
-  int32 LastUpdatableComponent() const;
-
-  /// Returns the number of updatable components.
-  int32 NumUpdatableComponents() const;
-  
-  /// Scales the parameters of each of the updatable components.
-  /// Here, scale_params is a vector of size equal to
-  /// NumUpdatableComponents()
-  void ScaleComponents(const VectorBase<BaseFloat> &scales);
-
-  /// Excise any components of type DropoutComponent or AdditiveNoiseComponent
-  void RemoveDropout();
-
-  /// Calls SetDropoutScale for all the dropout nodes.
-  void SetDropoutScale(BaseFloat scale);
-  
-  /// Replace any components of type AffineComponentPreconditioned with
-  /// components of type AffineComponent.
-  void RemovePreconditioning();
-
-  /// Replaces any components of type AffineComponent or derived classes, with
-  /// components of type AffineComponentPreconditionedOnline.  E.g. rank_in =
-  /// 20, rank_out = 80, num_samples_history = 2000.0, alpha = 4.0
-  void SwitchToOnlinePreconditioning(int32 rank_in, int32 rank_out,
-                                     int32 update_period,
-                                     BaseFloat num_samples_history,
-                                     BaseFloat alpha);
-  
-  /// For each updatatable component, adds to it
-  /// the corresponding element of "other" times the
-  /// appropriate element of "scales" (which has the
-  /// same format as for ScaleComponents(), i.e.
-  /// one entry for each updatable component).
-  void AddNnet(const VectorBase<BaseFloat> &scales,
-               const Nnet &other);
-
-  /// Scales all the Components with the same scale.  This applies to
-  /// UpdatableComponents, and (unlike the ScaleComponents function) to
-  /// SoftmaxComponents.
-  void Scale(BaseFloat scale);
-
-
-  /// Adds to *this, the other neural net times the scale "alpha".  This applies
-  /// to UpdatableComponents, and (unlike the other AddNnet function) to
-  /// SoftmaxComponents.
-  void AddNnet(BaseFloat alpha,
-               const Nnet &other);
-
-  /// Turns the last affine layer into two layers of the same type, with a
-  /// smaller dimension in between-- we're keeping the top singular values of
-  /// the matrix.
-  void LimitRankOfLastLayer(int32 dimension);
-
-  /// This version of AddNnet adds to *this, alpha times *other, and then scales
-  /// *other by beta.  The reason why we make this a separate function is for
-  /// multithreading reasons (otherwise you could do AddNnet(alpha, *iter) and then
-  /// other->Scale(beta).
-  void AddNnet(BaseFloat alpha,
-               Nnet *other,
-               BaseFloat beta);
-
-  /// Removes final components from the neural network (used for
-  /// debugging).
-  void Resize(int32 num_components);
-
-
-  /// Where possible, collapse multiple affine or linear components in a
-  /// sequence into a single one by composing the transforms.  If
-  /// match_updatableness=true, this will not collapse, say, an
-  /// AffineComponent with a FixedAffineComponent or FixedLinearComponent.
-  /// If false, it will collapse them.  This function won't necessarily
-  /// work for all pairs of such layers.  It currently only works where
-  /// one of each pair is an AffineComponent.
-  void Collapse(bool match_updatableness);
-  
-
-  /// Sets the index_ values of the components.
-  void SetIndexes(); 
-  
-  Nnet(const Nnet &other); // Copy constructor.
-
-  Nnet(const Nnet &nnet1, const Nnet &nnet2); // Constructor that takes two
-  // nnets: it concatenates the layers.
-  
-  Nnet() {}
-
-  Nnet &operator = (const Nnet &other); // assignment operator.
-
-  /// Initialize from config file.
-  /// Each line of the config is either a comment line starting
-  /// with whitespace then #, or it is a line that specifies one
-  /// layer of the network, as accepted by Component::InitFromString().
-  /// An example non-comment line is:
-  /// AffineComponent learning-rate=0.01 l2-penalty=0.001 input-dim=10 output-dim=15 param-stddev=0.1
-  void Init(std::istream &is);
-
-  /// This Init method works from a vector of components.  It will take
-  /// ownership of the pointers and will resize the vector to zero to avoid a
-  /// chance of the caller deallocating them (but the vector itself is not
-  /// deleted).
-  void Init(std::vector<Component*> *components);
-
-  /// Appends this component to the components already in the neural net.
-  /// Takes ownership of the pointer.
-  void Append(Component *new_component);
-  
-  virtual ~Nnet() { Destroy(); }
-
-  std::string Info() const; // some human-readable summary info.
-
-  void Destroy();
-  
-  void Write(std::ostream &os, bool binary) const;
-
-  void Read(std::istream &is, bool binary);
-
-  void SetZero(bool treat_as_gradient); // Sets all parameters to zero and if
-  // treat_as_gradient == true, also tells components to "think of themselves as
-  // gradients" (affects some of the update code).  Also zeroes stats stored
-  // with things of type NonlinearComponent.
-
-
-  /// This function is used when doing transfer learning to a new system.  It
-  /// resizes the final affine and softmax components.  If your system has a
-  /// SumGroupComponent before the final softmax, it will be discarded.
-  void ResizeOutputLayer(int32 new_num_pdfs);
-  
-
-  /// Scale all the learning rates in the neural net by this factor.
-  void ScaleLearningRates(BaseFloat factor);
-
-  /// Scale all the learning rates in the neural net by the factors indexed
-  /// by the type of component.
-  void ScaleLearningRates(std::map<std::string, BaseFloat> scale_factors);
-
-  /// Set all the learning rates in the neural net to this value.
-  void SetLearningRates(BaseFloat learning_rates);
-
-  /// Set all the learning rates in the neural net to these values
-  /// (one for each updatable layer).
-  void SetLearningRates(const VectorBase<BaseFloat> &learning_rates);
-
-  /// Get all the learning rates in the neural net (the output
-  /// must have dim equal to NumUpdatableComponents()).
-  void GetLearningRates(VectorBase<BaseFloat> *learning_rates) const;
-  
-  // This sets *dot_prod to the dot prod of *this . validation_gradient,
-  // separately for each updatable component.  The vector must have size equal
-  // to this->NumUpdatableComponents().  Warning: previously it had to have size
-  // equal to this->NumComponents()).  This is used in updating learning rates
-  // and shrinkage rates.
-  void ComponentDotProducts(
-      const Nnet &other,
-      VectorBase<BaseFloat> *dot_prod) const;
-
-  void Check() const; // Consistency check.
-
-
-  void ResetGenerators(); // resets random-number generators for all
-  // random components.  You must also set sRand() for this to be
-  // effective.
-
-  // The following three functions are used for vectorizing
-  // the parameters-- used, for example, in L-BFGS.
-  virtual int32 GetParameterDim() const;
-  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
-  virtual void UnVectorize(const VectorBase<BaseFloat> &params);
-  
-  friend class NnetUpdater;
-  friend class DecodableNnet;
- private:
-  std::vector<Component*> components_;
-};
-
-
-/// This function generates a random neural net, for testing purposes.
-/// It will contain a random number of SigmoidComponent, AffineComponent
-/// and SpliceComponent, followed by a final AffineComponent and
-/// SoftmaxComponent.  The parameters will all be randomly initialized.
-Nnet *GenRandomNnet(int32 input_dim,
-                    int32 output_dim);
-
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif
diff --git a/src/nnet2/nnet-precondition-online-test.cc b/src/nnet2/nnet-precondition-online-test.cc
deleted file mode 100644
index b0306db72a2..00000000000
--- a/src/nnet2/nnet-precondition-online-test.cc
+++ /dev/null
@@ -1,342 +0,0 @@
-// nnet2/nnet-precondition-online-test.cc
-
-// Copyright 2012-2015  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-precondition-online.h"
-#include "util/common-utils.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-// Simple version of OnlinePreconditioner that we use to make
-// sure it is behaving as advertised.
-class OnlinePreconditionerSimple {
- public:
-  OnlinePreconditionerSimple(): rank_(40), num_samples_history_(2000.0), alpha_(4.0),
-                                epsilon_(1.0e-10), delta_(5.0e-04) { }
-
-  void SetRank(int32 rank) { rank_ = rank; }
-
-  void PreconditionDirections(
-      CuMatrixBase<BaseFloat> *R,
-      CuVectorBase<BaseFloat> *row_prod,
-      BaseFloat *scale);
-
-
- private:
-  BaseFloat Eta(int32 N) const;
-
-  void PreconditionDirectionsCpu(
-      MatrixBase<double> *R,
-      VectorBase<double> *row_prod,
-      BaseFloat *scale);
-
-
-  void Init(const MatrixBase<double> &R0);
-
-  void InitDefault(int32 D);
-
-  int32 rank_;
-  double num_samples_history_;
-  double alpha_;
-  double epsilon_;
-  double delta_;
-
-  // Fisher matrix defined as F_t = R_t^T diag(d_t) R_t + rho_t I.
-  Vector<double> d_t_;
-  Matrix<double> R_t_;
-  double rho_t_;
-};
-
-
-void OnlinePreconditionerSimple::PreconditionDirections(
-      CuMatrixBase<BaseFloat> *R,
-      CuVectorBase<BaseFloat> *row_prod,
-      BaseFloat *scale) {
-  Matrix<BaseFloat> R_cpu(*R);
-  Vector<BaseFloat> row_prod_cpu(*row_prod);
-  Matrix<double> R_cpu_dbl(R_cpu);
-  Vector<double> row_prod_cpu_dbl(row_prod_cpu);
-  PreconditionDirectionsCpu(&R_cpu_dbl,
-                            &row_prod_cpu_dbl,
-                            scale);
-  row_prod_cpu.CopyFromVec(row_prod_cpu_dbl);
-  R_cpu.CopyFromMat(R_cpu_dbl);
-  R->CopyFromMat(R_cpu);
-  row_prod->CopyFromVec(row_prod_cpu);
-}
-
-void OnlinePreconditionerSimple::InitDefault(int32 D) {
-  if (rank_ >= D) {
-    KALDI_WARN << "Rank " << rank_ << " of online preconditioner is >= dim " << D
-               << ", setting it to "
-               << (D - 1) << " (but this is probably still too high)";
-    rank_ = D - 1;
-  }
-  int32 R = rank_;
-  R_t_.Resize(R, D);
-  for (int32 r = 0; r < R; r++) {
-    std::vector<int32> cols;
-    for (int32 c = r; c < D; c += R)
-      cols.push_back(c);
-    for (int32 i = 0; i < cols.size(); i++) {
-      int32 c = cols[i];
-      R_t_(r, c) = (i == 0 ? 1.1 : 1.0) /
-          sqrt(1.1 * 1.1 + cols.size() - 1);
-    }
-  }
-  d_t_.Resize(R);
-  d_t_.Set(epsilon_);
-  rho_t_ = epsilon_;
-}
-
-void OnlinePreconditionerSimple::Init(const MatrixBase<double> &R0) {
-  int32 D = R0.NumCols(), N = R0.NumRows();
-  InitDefault(D);
-  int32 num_init_iters = 3;
-  for (int32 i = 0; i < num_init_iters; i++) {
-    CuMatrix<BaseFloat> R0_copy(R0);
-    CuVector<BaseFloat> row_products(N);
-    BaseFloat scale;
-    PreconditionDirections(&R0_copy, &row_products, &scale);
-  }
-}
-
-BaseFloat OnlinePreconditionerSimple::Eta(int32 N) const {
-  KALDI_ASSERT(num_samples_history_ > 0.0);
-  BaseFloat ans = 1.0 - exp(-N / num_samples_history_);
-  if (ans > 0.9) ans = 0.9;
-  return ans;
-}
-
-
-void OnlinePreconditionerSimple::PreconditionDirectionsCpu(
-    MatrixBase<double> *X_t,
-    VectorBase<double> *row_prod,
-    BaseFloat *scale) {
-  if (R_t_.NumRows() == 0)
-    Init(*X_t);
-  int32 R = R_t_.NumRows(), D = R_t_.NumCols(), N = X_t->NumRows();
-  BaseFloat eta = Eta(N);
-
-  SpMatrix<double> F_t(D);
-  // F_t =(def) R_t^T D_t R_t + \rho_t I
-  F_t.AddToDiag(rho_t_);
-  F_t.AddMat2Vec(1.0, R_t_, kTrans, d_t_, 1.0);
-
-  // Make sure F_t is +ve definite.
-  {
-    KALDI_ASSERT(d_t_.Min() > 0);
-    Vector<double> eigs(D);
-    F_t.Eig(&eigs, NULL);
-    KALDI_ASSERT(eigs.Min() > 0);
-  }
-
-  // S_t =(def) 1/N X_t^T X_t.
-  SpMatrix<double> S_t(D);
-  S_t.AddMat2(1.0 / N, *X_t, kTrans, 0.0);
-
-  // T_t =(def) \eta S_t + (1-\eta) F_t
-  SpMatrix<double> T_t(D);
-  T_t.AddSp(eta, S_t);
-  T_t.AddSp(1.0 - eta, F_t);
-
-  // Y_t =(def) R_t T_t
-  Matrix<double> Y_t(R, D);
-  Y_t.AddMatSp(1.0, R_t_, kNoTrans, T_t, 0.0);
-
-  // Z_t =(def) Y_t Y_t^T
-  SpMatrix<double> Z_t(R);
-  Z_t.AddMat2(1.0, Y_t, kNoTrans, 0.0);
-
-  Matrix<double> U_t(R, R);
-  Vector<double> c_t(R);
-  // decompose Z_t = U_t C_t U_t^T
-  Z_t.Eig(&c_t, &U_t);
-  SortSvd(&c_t, &U_t);
-  double c_t_floor = pow(rho_t_ * (1.0 - eta), 2);
-  int32 nf;
-  c_t.ApplyFloor(c_t_floor, &nf);
-  if (nf > 0) {
-    KALDI_WARN << "Floored " << nf << " elements of c_t.";
-  }
-  // KALDI_LOG << "c_t is " << c_t;
-  // KALDI_LOG << "U_t is " << U_t;
-  // KALDI_LOG << "Z_t is " << Z_t;
-
-  Vector<double> sqrt_c_t(c_t);
-  sqrt_c_t.ApplyPow(0.5);
-  Vector<double> inv_sqrt_c_t(sqrt_c_t);
-  inv_sqrt_c_t.InvertElements();
-  Matrix<double> R_t1(R, D);
-  // R_{t+1} = C_t^{-0.5} U_t^T Y_t
-  R_t1.AddMatMat(1.0, U_t, kTrans, Y_t, kNoTrans, 0.0);
-  R_t1.MulRowsVec(inv_sqrt_c_t);
-
-  double rho_t1 = (1.0 / (D - R)) *
-      (eta * S_t.Trace() + (1.0 - eta) * (D * rho_t_ + d_t_.Sum()) - sqrt_c_t.Sum());
-
-  Vector<double> d_t1(sqrt_c_t);
-  d_t1.Add(-rho_t1);
-
-  double floor_val = std::max(epsilon_, delta_ * sqrt_c_t.Max());
-  if (rho_t1 < floor_val) {
-    KALDI_WARN << "flooring rho_{t+1} to " << floor_val << ", was " << rho_t1;
-    rho_t1 = floor_val;
-  }
-  d_t1.ApplyFloor(floor_val, &nf);
-  if (nf > 0) {
-    KALDI_VLOG(3) << "d_t1 was " << d_t1;
-    KALDI_WARN << "Floored " << nf << " elements of d_{t+1}.";
-  }
-  // a check.
-  if (nf == 0 && rho_t1 > floor_val) {
-    double tr_F_t1 = D * rho_t1 + d_t1.Sum(), tr_T_t = T_t.Trace();
-    AssertEqual(tr_F_t1, tr_T_t);
-  }
-
-  // G_t = F_t + alpha/D tr(F_t)
-  SpMatrix<double> G_t(F_t);
-  G_t.AddToDiag(alpha_ / D * F_t.Trace());
-  SpMatrix<double> G_t_inv(G_t);
-  G_t_inv.Invert();
-
-  double beta_t = rho_t_ + alpha_/D * F_t.Trace();
-  // X_hat_t = beta_t X_t G_t^{-1}.
-  Matrix<double> X_hat_t(N, D);
-  X_hat_t.AddMatSp(beta_t, *X_t, kNoTrans, G_t_inv, 0.0);
-
-  double tr_x_x = TraceMatMat(*X_t, *X_t, kTrans),
-      tr_Xhat_Xhat = TraceMatMat(X_hat_t, X_hat_t, kTrans);
-  double gamma = (tr_Xhat_Xhat == 0 ? 1.0 : sqrt(tr_x_x / tr_Xhat_Xhat));
-
-  X_t->CopyFromMat(X_hat_t);
-  row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0);
-  *scale = gamma;
-
-  // Update the parameters
-  rho_t_ = rho_t1;
-  d_t_.CopyFromVec(d_t1);
-  R_t_.CopyFromMat(R_t1);
-
-  KALDI_VLOG(3) << "rho_t_ = " << rho_t_;
-  KALDI_VLOG(3) << "d_t_ = " << d_t_;
-  KALDI_VLOG(3) << "R_t_ = " << R_t_;
-
-
-  { // check that R_t_ R_t_^T = I.
-    SpMatrix<double> unit(R);
-    unit.AddMat2(1.0, R_t_, kNoTrans, 0.0);
-    if (!unit.IsUnit(1.0e-03)) {
-      KALDI_WARN  << "R is not orthogonal, reorthogonalizing.";
-      for (int32 i = 0; i < R; i++) {
-        SubVector<double> row(R_t_, i);
-        for (int32 j = 0; j < i; j++) {
-          SubVector<double> row_j(R_t_, j);
-          row.AddVec(-VecVec(row_j, row), row_j);
-        }
-        row.Scale(1.0 / row.Norm(2.0));
-      }
-    }
-    unit.AddMat2(1.0, R_t_, kNoTrans, 0.0);
-    KALDI_ASSERT(unit.IsUnit(1.0e-03));
-  }
-}
-
-
-void UnitTestPreconditionDirectionsOnline() {
-  MatrixIndexT R = 1 + Rand() % 30,  // rank of correction
-      N = (2 * R) + Rand() % 30,  // batch size
-      D = R + 1 + Rand() % 20; // problem dimension.  Must be > R.
-
-  // Test sometimes with features that are all-zero or all-one; this will
-  // help to make sure low-rank or zero input doesn't crash the code.
-  bool zero = false;
-  bool one = false;
-  if (Rand() % 3 == 0) zero = true;
-  //else if (Rand() % 2 == 0) one = true;
-
-  CuVector<BaseFloat> row_prod1(N), row_prod2(N);
-  BaseFloat gamma1, gamma2;
-  BaseFloat big_eig_factor = RandInt(1, 20);
-  big_eig_factor = big_eig_factor * big_eig_factor;
-  Vector<BaseFloat> big_eig_vector(D);
-  big_eig_vector.SetRandn();
-  big_eig_vector.Scale(big_eig_factor);
-
-  OnlinePreconditionerSimple preconditioner1;
-  OnlinePreconditioner preconditioner2;
-  preconditioner1.SetRank(R);
-  preconditioner2.SetRank(R);
-  preconditioner2.TurnOnDebug();
-
-  int32 num_iters = 100;
-  for (int32 iter = 0; iter < num_iters; iter++) {
-    Matrix<BaseFloat> M_cpu(N, D);
-    if (one) M_cpu.Set(1.0);
-    else if (!zero) {
-      M_cpu.SetRandn();
-      Vector<BaseFloat> rand_vec(N);
-      rand_vec.SetRandn();
-      M_cpu.AddVecVec(1.0, rand_vec, big_eig_vector);
-    }
-    CuMatrix<BaseFloat> M(M_cpu);
-
-    CuMatrix<BaseFloat> Mcopy1(M), Mcopy2(M);
-
-    preconditioner1.PreconditionDirections(&Mcopy1, &row_prod1, &gamma1);
-
-    preconditioner2.PreconditionDirections(&Mcopy2, &row_prod2, &gamma2);
-
-    BaseFloat trace1 = TraceMatMat(M, M, kTrans),
-        trace2 = TraceMatMat(Mcopy1, Mcopy1, kTrans);
-    AssertEqual(trace1, trace2 * gamma2 * gamma2, 1.0e-02);
-
-    AssertEqual(Mcopy1, Mcopy2);
-    AssertEqual<BaseFloat>(row_prod1, row_prod2, 1.0e-02);
-    AssertEqual(gamma1, gamma2, 1.0e-02);
-
-    // make sure positive definite
-    CuVector<BaseFloat> inner_prods(M.NumRows());
-    inner_prods.AddDiagMatMat(1.0, M, kNoTrans, Mcopy1, kTrans, 0.0);
-    KALDI_ASSERT(inner_prods.Min() >= 0.0);
-  }
-  return;
-}
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-
-int main() {
-  using namespace kaldi;
-  using namespace kaldi::nnet2;
-  for (int32 loop = 0; loop < 2; loop++) {
-#if HAVE_CUDA == 1
-    CuDevice::Instantiate().SetDebugStrideMode(true);
-    if (loop == 0)
-      CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
-    else
-      CuDevice::Instantiate().SelectGpuId("optional"); // -2 .. automatic selection
-#endif
-    for (int32 i = 0; i < 10; i++) {
-      UnitTestPreconditionDirectionsOnline();
-    }
-  }
-}
diff --git a/src/nnet2/nnet-precondition-online.cc b/src/nnet2/nnet-precondition-online.cc
deleted file mode 100644
index 51e7c5b13c6..00000000000
--- a/src/nnet2/nnet-precondition-online.cc
+++ /dev/null
@@ -1,641 +0,0 @@
-// nnet2/nnet-precondition-online.cc
-
-// Copyright 2013-2015   Johns Hopkins University (author: Daniel Povey)
-//                2015   Xiaohui Zhang
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-precondition-online.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-OnlinePreconditioner::OnlinePreconditioner():
-    rank_(40), update_period_(1), num_samples_history_(2000.0), alpha_(4.0),
-    epsilon_(1.0e-10), delta_(5.0e-04), t_(-1),
-    num_updates_skipped_(0), self_debug_(false) { }
-
-
-/**
-  This function creates a matrix with orthonormal rows that is like the
-  following matrix, except with each row normalized to have unit 2-norm:
-  [  1.1 0   1   0   1   0
-     0   1.1 0   1   0   1  ]
-  The reason why the first element in each row is 1.1 and not 1, is for
-  symmetry-breaking... we don't want any weighted sum of all these rows to be
-  all ones, because the derivative in that direction can be zero in some
-  architectures and it causes us to have to do an inefficient CPU-based
-  renormalization.
- */
-// static
-void OnlinePreconditioner::InitOrthonormalSpecial(CuMatrixBase<BaseFloat> *R) {
-  int32 num_rows = R->NumRows(), num_cols = R->NumCols();
-  KALDI_ASSERT(num_cols >= num_rows);
-  R->SetZero();
-  std::vector<MatrixElement<BaseFloat> > elems;
-  elems.reserve(num_cols);
-  BaseFloat first_elem = 1.1;
-  for (int32 r = 0; r < num_rows; r++) {
-    std::vector<int32> cols;  // columns that have an entry for this row
-    for (int32 c = r; c < num_cols; c += num_rows)
-      cols.push_back(c);
-    BaseFloat normalizer = 1.0 / sqrt(first_elem * first_elem +
-                                      cols.size() - 1);
-    for (size_t i = 0; i < cols.size(); i++) {
-      int32 c = cols[i];
-      MatrixElement<BaseFloat> e = { r, c,
-                                     normalizer * (i == 0 ? first_elem :
-                                                   BaseFloat(1.0)) };
-      elems.push_back(e);
-    }
-  }
-  R->AddElements(1.0, elems);
-  { // TODO: remove this testing code.
-    CuMatrix<BaseFloat> prod(num_rows, num_rows);
-    prod.AddMatMat(1.0, *R, kNoTrans, *R, kTrans, 0.0);
-    KALDI_ASSERT(prod.IsUnit());
-  }
-}
-
-
-void OnlinePreconditioner::InitDefault(int32 D) {
-  if (rank_ >= D) {
-    KALDI_WARN << "Rank " << rank_ << " of online preconditioner is >= dim " << D
-               << ", setting it to "
-               << (D - 1) << " (but this is probably still too high)";
-    rank_ = D - 1;
-  }
-  if (rank_ == 0) {
-    // Dimension of input data was 1, so the natural gradient preconditioner
-    // would always be the unit matrix.
-    // We'll handle this as a special case, for generality.
-    return;
-  }
-  KALDI_ASSERT(num_samples_history_ > 0.0 && num_samples_history_ <= 1.0e+6);
-  KALDI_ASSERT(alpha_ >= 0.0);
-  KALDI_ASSERT(rank_ > 0);
-  KALDI_ASSERT(epsilon_ > 0.0 && epsilon_ <= 1.0e-05);  // plausible values.
-  KALDI_ASSERT(delta_ > 0.0 && delta_ <= 1.0e-02);  // plausible values.
-
-  // to initialize, in the equation
-  //   F_t =(def) R_t^T D_t R_t + \rho_t I
-  // we will set the orthogonal R_t to a special orthogonal matrix with no zero
-  // rows or columns (see the function), rho_t to epsilon,
-  // and D_t to epsilon.  But we don't store R_t directly.  Instead, we store
-  //   W_t =(def)  E_t^{0.5} R_t,
-  // where E_t =(def)  1/\beta_t (D_t^{-1} + 1/\beta_t I)^{-1}
-  // from (eqn:tii),
-  //  e_{tii} =   1/(\beta_t/d_{tii} + 1),
-  // where
-  // \beta_t =(def) \rho_t + \alpha/D tr(F_t)
-  //         =      epsilon + alpha/D * (epsilon * D + epsilon * rank)
-  //         =     epsilon * (1 + alpha * (D + rank) / D)
-  // And  d_{tii} is epsilon, so
-  //  e_{tii} =   1/((1 + alpha * (D + rank) / D) + 1)  [for each i.]
-  //          =   1/(2 + alpha * (D + rank) / D)).
-  BaseFloat epsilon = epsilon_;  // we could make this a bit more.
-  rho_t_ = epsilon;
-  d_t_.Resize(rank_, kUndefined);
-  d_t_.Set(epsilon);
-  W_t_.Resize(rank_, D, kUndefined);
-  // after the next line, W_ will store the orthogonal matrix R_t.
-  InitOrthonormalSpecial(&W_t_);
-  BaseFloat E_tii = 1.0 / ( 2.0 + (D + rank_) * alpha_ / D );
-  // W_t =(def) E_t^{0.5} R_t.
-  W_t_.Scale(sqrt(E_tii));
-  t_ = 0;
-}
-
-void OnlinePreconditioner::Init(const CuMatrixBase<BaseFloat> &R0) {
-  int32 D = R0.NumCols();
-  // for locking reasons it's better to use a different object.
-  OnlinePreconditioner this_copy(*this);
-  this_copy.InitDefault(D);
-
-  CuMatrix<BaseFloat> R0_copy(R0.NumRows(), R0.NumCols(), kUndefined);
-  // number of iterations with the same data from a pseudorandom start.
-  // this is a faster way of starting than doing eigenvalue decomposition.
-  int32 num_init_iters = 3;
-  for (int32 i = 0; i < num_init_iters; i++) {
-    BaseFloat scale;
-    R0_copy.CopyFromMat(R0);
-    this_copy.PreconditionDirections(&R0_copy, NULL, &scale);
-  }
-  rank_ = this_copy.rank_;
-  W_t_.Swap(&this_copy.W_t_);
-  d_t_.Swap(&this_copy.d_t_);
-  rho_t_ = this_copy.rho_t_;
-  t_ = 0;
-}
-
-void OnlinePreconditioner::PreconditionDirections(
-    CuMatrixBase<BaseFloat> *X_t,
-    CuVectorBase<BaseFloat> *row_prod,
-    BaseFloat *scale) {
-  if (X_t->NumCols() == 1) {
-    // If the dimension of the space equals one then our natural gradient update
-    // with rescaling becomes a no-op, but the code wouldn't naturally handle it
-    // because rank would be zero.  Support this as a special case.
-    if (row_prod)
-      row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0);
-    *scale = 1.0;
-    return;
-  }
-
-  if (row_prod == NULL) {
-    CuVector<BaseFloat> row_prod_tmp(X_t->NumRows());
-    PreconditionDirections(X_t, &row_prod_tmp, scale);
-    return;
-  }
-
-  read_write_mutex_.lock();
-  if (t_ == -1) // not initialized
-    Init(*X_t);
-
-  // Now t_ >= 0.
-  // We create local copies  of the class variables... this is intended for
-  // multi-threaded safety so we can't read them in an inconsistent state,
-  // but we don't really waste anything here (a copy of W_t is needed anyway,
-  // if we're to update it).
-  int32 t = t_, R = W_t_.NumRows(), D = W_t_.NumCols();
-  // space for W_t, J_t, K_t, L_t.
-  CuMatrix<BaseFloat> WJKL_t(2 * R, D + R);
-  WJKL_t.Range(0, R, 0, D).CopyFromMat(W_t_);
-  BaseFloat rho_t(rho_t_);
-  Vector<BaseFloat> d_t(d_t_);
-  read_write_mutex_.unlock();
-  PreconditionDirectionsInternal(t, rho_t, d_t, &WJKL_t, X_t, row_prod, scale);
-}
-
-void OnlinePreconditioner::ReorthogonalizeXt1(
-    const VectorBase<BaseFloat> &d_t1,
-    BaseFloat rho_t1,
-    CuMatrixBase<BaseFloat> *W_t1,
-    CuMatrixBase<BaseFloat> *temp_W,
-    CuMatrixBase<BaseFloat> *temp_O) {
-  // threshold is a configuration value: a desired threshold on orthogonality,
-  // below which we won't reorthogonalize.
-  const BaseFloat threshold = 1.0e-03;
-
-  int32 R = W_t1->NumRows(), D = W_t1->NumCols();
-  BaseFloat beta_t1 = rho_t1 * (1.0 + alpha_) + alpha_ * d_t1.Sum() / D;
-  Vector<BaseFloat> e_t1(R, kUndefined), sqrt_e_t1(R, kUndefined),
-      inv_sqrt_e_t1(R, kUndefined);
-  ComputeEt(d_t1, beta_t1, &e_t1, &sqrt_e_t1, &inv_sqrt_e_t1);
-
-  temp_O->SymAddMat2(1.0, *W_t1, kNoTrans, 0.0);
-  // O_t =  E_t^{-0.5} W_t W_t^T E_t^{-0.5}
-  Matrix<BaseFloat> O_mat(*temp_O);
-  SpMatrix<BaseFloat> O(O_mat, kTakeLower);
-  for (int32 i = 0; i < R; i++) {
-    BaseFloat i_factor = inv_sqrt_e_t1(i);
-    for (int32 j = 0; j <= i; j++) {
-      BaseFloat j_factor = inv_sqrt_e_t1(j);
-      O(i, j) *= i_factor * j_factor;
-    }
-  }
-  if (O.IsUnit(threshold)) {
-    if (self_debug_) {
-      KALDI_WARN << "Not reorthogonalizing since already orthognoal: " << O;
-    }
-    return;
-  }
-  TpMatrix<BaseFloat> C(R);
-  try {
-    C.Cholesky(O);
-    C.Invert();  // Now it's C^{-1}.
-    if (!(C.Max() < 100.0))
-      KALDI_ERR << "Cholesky out of expected range, "
-                << "reorthogonalizing with Gram-Schmidt";
-  } catch (...) {
-    // We do a Gram-Schmidt orthogonalization, which is a bit less efficient but
-    // more robust than the method using Cholesky.
-    KALDI_WARN << "Cholesky or Invert() failed while re-orthogonalizing R_t. "
-               << "Re-orthogonalizing on CPU.";
-    Matrix<BaseFloat> cpu_W_t1(*W_t1);
-    cpu_W_t1.OrthogonalizeRows();
-    W_t1->CopyFromMat(cpu_W_t1);
-    // at this point cpu_W_t1 represents R_{t+1}- it has orthonormal
-    // rows.  Do: W_{t+1} = E_{t+1}^{0.5} R_{t+1}
-    CuVector<BaseFloat> sqrt_e_t1_gpu(sqrt_e_t1);
-    W_t1->MulRowsVec(sqrt_e_t1_gpu);
-    return;
-  }
-  // Next, compute (E_t^{0.5} C^{-1} E_t^{-0.5})
-  // but it's really t+1, not t.
-  for (int32 i = 0; i < R; i++) {
-    BaseFloat i_factor = sqrt_e_t1(i);
-    for (int32 j = 0; j < i; j++) {
-      // skip j == i because i_factor * j_factor == 1 for j == i.
-      BaseFloat j_factor = inv_sqrt_e_t1(j);
-      C(i, j) *= i_factor * j_factor;
-    }
-  }
-  O_mat.CopyFromTp(C);
-  temp_O->CopyFromMat(O_mat);
-  temp_W->CopyFromMat(*W_t1);
-  W_t1->AddMatMat(1.0, *temp_O, kNoTrans, *temp_W, kNoTrans, 0.0);
-}
-
-// makes sure certain invariants are being preserved
-void OnlinePreconditioner::SelfTest() const {
-  KALDI_ASSERT(rho_t_ >= epsilon_);
-  BaseFloat d_t_max = d_t_.Max(), d_t_min = d_t_.Min();
-  KALDI_ASSERT(d_t_min >= epsilon_);
-  KALDI_ASSERT(d_t_min > 0.9 * delta_ * d_t_max);
-  KALDI_ASSERT(rho_t_ > 0.9 * delta_ * d_t_max);
-
-  int32 D = W_t_.NumCols(), R = W_t_.NumRows();
-  BaseFloat beta_t = rho_t_ * (1.0 + alpha_) + alpha_ * d_t_.Sum() / D;
-  Vector<BaseFloat> e_t(R, kUndefined), sqrt_e_t(R, kUndefined),
-      inv_sqrt_e_t(R, kUndefined);
-  ComputeEt(d_t_, beta_t, &e_t, &sqrt_e_t, &inv_sqrt_e_t);
-
-  CuSpMatrix<BaseFloat> S(R);
-  S.AddMat2(1.0, W_t_, kNoTrans, 0.0);
-  SpMatrix<BaseFloat> O(S);
-  for (int32 i = 0; i < R; i++) {
-    BaseFloat i_factor = inv_sqrt_e_t(i);
-    for (int32 j = 0; j <= i; j++) {
-      BaseFloat j_factor = inv_sqrt_e_t(j);
-      O(i, j) *= i_factor * j_factor;
-    }
-  }
-  if (!O.IsUnit(1.0e-04) || O(0, 0) != O(0, 0)) {
-    BaseFloat worst_error = 0.0;
-    int32 worst_i = 0, worst_j = 0;
-    for (int32 i = 0; i < R; i++) {
-      for (int32 j = 0; j < R; j++) {
-        BaseFloat elem = O(i, j);
-        BaseFloat error = fabs(elem - (i == j ? 1.0 : 0.0));
-        if (error > worst_error || error != error) {
-          worst_error = error;
-          worst_i = i;
-          worst_j = j;
-        }
-      }
-    }
-    if (worst_error > 1.0e-02 || worst_error != worst_error) {
-      KALDI_WARN << "Failed to verify W_t (worst error: O[" << worst_i << ','
-                 << worst_j << "] = " << O(worst_i, worst_j)
-                 << ", d_t = " << d_t_;
-    }
-  }
-}
-
-void OnlinePreconditioner::PreconditionDirectionsInternal(
-    const int32 t,
-    const BaseFloat rho_t,
-    const Vector<BaseFloat> &d_t,
-    CuMatrixBase<BaseFloat> *WJKL_t,
-    CuMatrixBase<BaseFloat> *X_t,
-    CuVectorBase<BaseFloat> *row_prod,
-    BaseFloat *scale) {
-  int32 N = X_t->NumRows(),  // Minibatch size.
-      D = X_t->NumCols(),  // Dimensions of vectors we're preconditioning
-      R = rank_;  // Rank of correction to unit matrix.
-  KALDI_ASSERT(R > 0 && R < D);
-  BaseFloat eta = Eta(N);
-
-  CuMatrix<BaseFloat> H_t(N, R);
-  const CuSubMatrix<BaseFloat> W_t(*WJKL_t, 0, R, 0, D);
-  // Below, WJ_t and LK_t are combinations of two matrices,
-  // which we define in order to combine two separate multiplications into one.
-  CuSubMatrix<BaseFloat> J_t(*WJKL_t, R, R, 0, D),
-      L_t(*WJKL_t, 0, R, D, R),
-      K_t(*WJKL_t, R, R, D, R),
-      WJ_t(*WJKL_t, 0, 2 * R, 0, D),
-      LK_t(*WJKL_t, 0, 2 * R, D, R);
-
-  H_t.AddMatMat(1.0, *X_t, kNoTrans, W_t, kTrans, 0.0);  // H_t = X_t W_t^T
-
-  bool locked = update_mutex_.try_lock();
-  if (locked) {
-    // Just hard-code it here that we do 10 updates before skipping any.
-    const int num_initial_updates = 10;
-    if (t_ > t || (num_updates_skipped_ < update_period_ - 1 &&
-                   t_ >= num_initial_updates)) {
-      update_mutex_.unlock();
-      // We got the lock but we were already beaten to it by another thread, or
-      // we don't want to update yet due to update_period_ > 1 (this saves
-      // compute), so release the lock.
-      locked = false;
-    }
-  }
-
-  if (!locked) {
-    // We're not updating the parameters, either because another thread is
-    // working on updating them, or because another thread already did so from
-    // the same or later starting point (making our update stale), or because
-    // update_period_ > 1.  We just apply the preconditioning and return.
-
-    // note: we don't bother with any locks before incrementing
-    // num_updates_skipped_ below, because the worst that could happen is that,
-    // on very rare occasions, we could skip one or two more updates than we
-    // intended.
-    num_updates_skipped_++;
-
-    BaseFloat tr_Xt_XtT = TraceMatMat(*X_t, *X_t, kTrans);
-    // X_hat_t = X_t - H_t W_t
-    X_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0);
-    // each element i of row_prod will be inner product of row i of X_hat_t with
-    // itself.
-    row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0);
-    BaseFloat tr_Xhat_XhatT = row_prod->Sum();
-    KALDI_ASSERT(tr_Xhat_XhatT == tr_Xhat_XhatT);  // Check for NaN.
-    BaseFloat gamma_t = (tr_Xhat_XhatT == 0.0 ? 1.0 :
-                         sqrt(tr_Xt_XtT / tr_Xhat_XhatT));
-    *scale = gamma_t;
-    return;
-  }
-  J_t.AddMatMat(1.0, H_t, kTrans, *X_t, kNoTrans, 0.0);  // J_t = H_t^T X_t
-
-  bool compute_lk_together = (N > D);
-
-  if (compute_lk_together) {
-    // do the following two multiplies in one operation...
-    // note
-    // L_t = W_t J_t^T
-    // K_t = J_t J_t^T
-    // Note: L_t was defined as L_t = J_t W_t^T, but it's actually symmetric,
-    // so we can compute it as L_t = W_t J_t^T.
-    LK_t.AddMatMat(1.0, WJ_t, kNoTrans, J_t, kTrans, 0.0);
-  } else {
-    K_t.SymAddMat2(1.0, J_t, kNoTrans, 0.0);
-    L_t.SymAddMat2(1.0, H_t, kTrans, 0.0);
-  }
-
-  Matrix<BaseFloat> LK_cpu(LK_t);  // contains L and K on the CPU.
-  SubMatrix<BaseFloat> L_t_cpu(LK_cpu, 0, R, 0, R),
-      K_t_cpu(LK_cpu, R, R, 0, R);
-  if (!compute_lk_together) {
-    // the SymAddMat2 operations only set the lower triangle and diagonal.
-    L_t_cpu.CopyLowerToUpper();
-    K_t_cpu.CopyLowerToUpper();
-  }
-
-  // beta_t = \rho_t(1+\alpha) + \alpha/D tr(D_t)
-  BaseFloat beta_t = rho_t * (1.0 + alpha_) + alpha_ * d_t.Sum() / D;
-  Vector<BaseFloat> e_t(R), sqrt_e_t(R), inv_sqrt_e_t(R);
-  ComputeEt(d_t, beta_t, &e_t, &sqrt_e_t, &inv_sqrt_e_t);
-  KALDI_VLOG(5) << "e_t = " << e_t;
-
-  // The double-precision Z_t here, and the scaling, is to avoid potential
-  // overflow, because Z_t is proportional to the fourth power of data.
-  SpMatrix<double> Z_t_double(R);
-  ComputeZt(N, rho_t, d_t, inv_sqrt_e_t, K_t_cpu, L_t_cpu, &Z_t_double);
-  BaseFloat z_t_scale = std::max<double>(1.0, Z_t_double.Trace());
-  Z_t_double.Scale(1.0 / z_t_scale);
-  SpMatrix<BaseFloat> Z_t_scaled(Z_t_double);
-
-  Matrix<BaseFloat> U_t(R, R);
-  Vector<BaseFloat> c_t(R);
-  // do the symmetric eigenvalue decomposition Z_t = U_t C_t U_t^T.
-  Z_t_scaled.Eig(&c_t, &U_t);
-  SortSvd(&c_t, &U_t);
-  c_t.Scale(z_t_scale);
-
-  const BaseFloat condition_threshold = 1.0e+06;
-  // must_reorthogonalize will be true if the last diagonal element of c_t is
-  // negative, since we don't take the absolute value, but this is the right
-  // thing anyway.
-  bool must_reorthogonalize = (c_t(0) > condition_threshold * c_t(R - 1));
-
-  BaseFloat c_t_floor = pow(rho_t * (1 - eta), 2);
-  int32 nf;
-  c_t.ApplyFloor(c_t_floor, &nf);
-  if (nf > 0)
-    must_reorthogonalize = true;
-  if (nf > 0 && self_debug_) {
-    KALDI_WARN << "Floored " << nf << " elements of C_t.";
-  }
-  BaseFloat tr_Xt_XtT_check;
-  if (self_debug_)
-    tr_Xt_XtT_check = TraceMatMat(*X_t, *X_t, kTrans);
-
-  X_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0);  // X_hat_t = X_t - H_t W_t
-  // set *row_prod to inner products of each row of X_hat_t with itself.
-  row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0);
-
-  BaseFloat tr_Xhat_XhatT = row_prod->Sum();
-  //  tr(X_t X_t^T) = tr(X_hat_t X_hat_t^T) - tr(L_t E_t) + 2 tr(L_t)
-  double tr_Xt_XtT = tr_Xhat_XhatT;
-  for (int32 i = 0; i < R; i++)
-    tr_Xt_XtT += L_t_cpu(i, i) * (2.0 - e_t(i));
-  if (self_debug_) {
-    KALDI_ASSERT(ApproxEqual(tr_Xt_XtT, tr_Xt_XtT_check));
-  }
-  BaseFloat gamma_t = (tr_Xhat_XhatT == 0.0 ? 1.0 :
-                       sqrt(tr_Xt_XtT / tr_Xhat_XhatT));
-  *scale = gamma_t;
-
-  Vector<BaseFloat> sqrt_c_t(c_t);
-  sqrt_c_t.ApplyPow(0.5);
-
-  // \rho_{t+1} = 1/(D - R) (\eta/N tr(X_t X_t^T) + (1-\eta)(D \rho_t + tr(D_t)) - tr(C_t^{0.5})).
-  BaseFloat rho_t1 = 1.0 / (D - R) * (eta / N * tr_Xt_XtT
-                                      + (1-eta)*(D * rho_t + d_t.Sum())
-                                      - sqrt_c_t.Sum());
-  // D_{t+1} = C_t^{0.5} - \rho_{t+1} I
-  Vector<BaseFloat> d_t1(sqrt_c_t);
-  d_t1.Add(-rho_t1);
-  BaseFloat floor_val = std::max(epsilon_, delta_ * sqrt_c_t.Max());
-  if (rho_t1 < floor_val)
-    rho_t1 = floor_val;
-  d_t1.ApplyFloor(floor_val);
-
-  CuMatrix<BaseFloat> W_t1(R, D);  // W_{t+1}
-  ComputeWt1(N, d_t, d_t1, rho_t, rho_t1, U_t, sqrt_c_t, inv_sqrt_e_t,
-             W_t, &J_t, &W_t1);
-
-  if (must_reorthogonalize) {
-    if (self_debug_) {
-      KALDI_WARN << "Reorthogonalizing.";
-    }
-    ReorthogonalizeXt1(d_t1,
-                       rho_t1,
-                       &W_t1,
-                       &J_t,
-                       &L_t);
-  }
-
-  // Commit the new parameters.
-  read_write_mutex_.lock();
-  KALDI_ASSERT(t_ == t);  // we already ensured this.
-  t_ = t + 1;
-  num_updates_skipped_ = 0;
-  W_t_.Swap(&W_t1);
-  d_t_.CopyFromVec(d_t1);
-  rho_t_ = rho_t1;
-
-  if (self_debug_)
-    SelfTest();
-
-  read_write_mutex_.unlock();
-  update_mutex_.unlock();
-}
-
-BaseFloat OnlinePreconditioner::Eta(int32 N) const {
-  KALDI_ASSERT(num_samples_history_ > 0.0);
-  BaseFloat ans = 1.0 - exp(-N / num_samples_history_);
-  // Don't let eta approach 1 too closely, as it can lead to NaN's appearing if
-  // the input is all zero.
-  if (ans > 0.9) ans = 0.9;
-  return ans;
-}
-
-void OnlinePreconditioner::ComputeWt1(int32 N,
-                                       const VectorBase<BaseFloat> &d_t,
-                                       const VectorBase<BaseFloat> &d_t1,
-                                       BaseFloat rho_t,
-                                       BaseFloat rho_t1,
-                                       const MatrixBase<BaseFloat> &U_t,
-                                       const VectorBase<BaseFloat> &sqrt_c_t,
-                                       const VectorBase<BaseFloat> &inv_sqrt_e_t,
-                                       const CuMatrixBase<BaseFloat> &W_t,
-                                       CuMatrixBase<BaseFloat> *J_t,
-                                       CuMatrixBase<BaseFloat> *W_t1) const {
-
-  int32 R = d_t.Dim(), D = W_t.NumCols();
-  BaseFloat eta = Eta(N);
-
-  // \beta_{t+1} = \rho_{t+1} (1+\alpha) + \alpha/D tr(D_{t+1})
-  BaseFloat beta_t1 = rho_t1 * (1.0 + alpha_) + alpha_ * d_t1.Sum() / D;
-  KALDI_ASSERT(beta_t1 > 0.0);
-  Vector<BaseFloat> e_t1(R, kUndefined), sqrt_e_t1(R, kUndefined),
-      inv_sqrt_e_t1(R, kUndefined);
-  ComputeEt(d_t1, beta_t1, &e_t1, &sqrt_e_t1, &inv_sqrt_e_t1);
-  Vector<BaseFloat> inv_sqrt_c_t(sqrt_c_t);
-  inv_sqrt_c_t.InvertElements();
-
-  Vector<BaseFloat> w_t_coeff(R);
-  for (int32 i = 0; i < R; i++)
-    w_t_coeff(i) = (1.0 - eta) / (eta/N) * (d_t(i) + rho_t);
-  CuVector<BaseFloat> w_t_coeff_gpu(w_t_coeff);
-  // B_t = J_t + (1-\eta)/(\eta/N) (D_t + \rho_t I) W_t
-  J_t->AddDiagVecMat(1.0, w_t_coeff_gpu, W_t, kNoTrans, 1.0);
-
-  // A_t = (\eta/N) E_{t+1}^{0.5} C_t^{-0.5} U_t^T E_t^{-0.5} B_t
-  Matrix<BaseFloat> A_t(U_t, kTrans);
-  for (int32 i = 0; i < R; i++) {
-    BaseFloat i_factor = (eta / N) * sqrt_e_t1(i) * inv_sqrt_c_t(i);
-    for (int32 j = 0; j < R; j++) {
-      BaseFloat j_factor = inv_sqrt_e_t(j);
-      A_t(i, j) *= i_factor * j_factor;
-    }
-  }
-  // W_{t+1} = A_t B_t
-  CuMatrix<BaseFloat> A_t_gpu(A_t);
-  W_t1->AddMatMat(1.0, A_t_gpu, kNoTrans, *J_t, kNoTrans, 0.0);
-}
-
-void OnlinePreconditioner::ComputeZt(int32 N,
-                                     BaseFloat rho_t,
-                                     const VectorBase<BaseFloat> &d_t,
-                                     const VectorBase<BaseFloat> &inv_sqrt_e_t,
-                                     const MatrixBase<BaseFloat> &K_t,
-                                     const MatrixBase<BaseFloat> &L_t,
-                                     SpMatrix<double> *Z_t) const {
-  // Use doubles because the range of quantities in Z_t can get large (fourth
-  // power of data), and we want to avoid overflow.  This routine is fast.
-  BaseFloat eta = Eta(N);
-  Vector<BaseFloat> d_t_rho_t(d_t);
-  d_t_rho_t.Add(rho_t);  // now d_t_rho_t is diag(D_t + \rho_t I).
-  double etaN = eta / N, eta1 = 1.0 - eta,
-      etaN_sq = etaN * etaN, eta1_sq = eta1 * eta1,
-      etaN_eta1 = etaN * eta1;
-  int32 R = d_t.Dim();
-  for (int32 i = 0; i < R; i++) {
-    double inv_sqrt_e_t_i = inv_sqrt_e_t(i), d_t_rho_t_i = d_t_rho_t(i);
-    for (int32 j = 0; j <= i; j++) {
-      double inv_sqrt_e_t_j = inv_sqrt_e_t(j), d_t_rho_t_j = d_t_rho_t(j),
-          L_t_i_j = 0.5 * (L_t(i, j) + L_t(j, i)),
-          K_t_i_j = 0.5 * (K_t(i, j) + K_t(j, i));
-      // See (eqn:Zt) in header.
-      (*Z_t)(i, j) = etaN_sq * inv_sqrt_e_t_i * K_t_i_j * inv_sqrt_e_t_j
-          + etaN_eta1 * inv_sqrt_e_t_i * L_t_i_j * inv_sqrt_e_t_j * d_t_rho_t_j
-          + etaN_eta1 * d_t_rho_t_i * inv_sqrt_e_t_i * L_t_i_j * inv_sqrt_e_t_j
-          + (i == j ? eta1_sq * d_t_rho_t_i * d_t_rho_t_i : 0.0);
-    }
-  }
-}
-
-void OnlinePreconditioner::ComputeEt(const VectorBase<BaseFloat> &d_t,
-                                     BaseFloat beta_t,
-                                     VectorBase<BaseFloat> *e_t,
-                                     VectorBase<BaseFloat> *sqrt_e_t,
-                                     VectorBase<BaseFloat> *inv_sqrt_e_t) const {
-  // e_{tii} = 1/(\beta_t/d_{tii} + 1)
-  int32 D = d_t.Dim();
-  const BaseFloat *d = d_t.Data();
-  BaseFloat *e = e_t->Data();
-  for (int32 i = 0; i < D; i++)
-    e[i] = 1.0 / (beta_t / d[i]  +  1);
-  sqrt_e_t->CopyFromVec(*e_t);
-  sqrt_e_t->ApplyPow(0.5);
-  inv_sqrt_e_t->CopyFromVec(*sqrt_e_t);
-  inv_sqrt_e_t->InvertElements();
-}
-
-
-OnlinePreconditioner::OnlinePreconditioner(const OnlinePreconditioner &other):
-    rank_(other.rank_), update_period_(other.update_period_),
-    num_samples_history_(other.num_samples_history_),
-    alpha_(other.alpha_), epsilon_(other.epsilon_), delta_(other.delta_),
-    t_(other.t_), num_updates_skipped_(other.num_updates_skipped_),
-    self_debug_(other.self_debug_), W_t_(other.W_t_),
-    rho_t_(other.rho_t_), d_t_(other.d_t_) {
-  // use default constructor for the mutexes.
-}
-
-OnlinePreconditioner& OnlinePreconditioner::operator = (
-    const OnlinePreconditioner &other) {
-  rank_ = other.rank_;
-  update_period_ = other.update_period_;
-  num_samples_history_ = other.num_samples_history_;
-  alpha_ = other.alpha_;
-  epsilon_ = other.epsilon_;
-  delta_ = other.delta_;
-  t_ = other.t_;
-  self_debug_ = other.self_debug_;
-  W_t_ = other.W_t_;
-  rho_t_ = other.rho_t_;
-  d_t_ = other.d_t_;
-  return *this;
-}
-
-void OnlinePreconditioner::SetRank(int32 rank) {
-  KALDI_ASSERT(rank > 0);
-  rank_ = rank;
-}
-void OnlinePreconditioner::SetUpdatePeriod(int32 update_period) {
-  KALDI_ASSERT(update_period > 0);
-  update_period_ = update_period;
-}
-void OnlinePreconditioner::SetNumSamplesHistory(BaseFloat num_samples_history) {
-  KALDI_ASSERT(num_samples_history > 0.0 &&
-               num_samples_history < 1.0e+6);
-  num_samples_history_ = num_samples_history;
-}
-void OnlinePreconditioner::SetAlpha(BaseFloat alpha) {
-  KALDI_ASSERT(alpha >= 0.0);
-  alpha_ = alpha;
-}
-
-
-}
-}
diff --git a/src/nnet2/nnet-precondition-online.h b/src/nnet2/nnet-precondition-online.h
deleted file mode 100644
index 7758d47831f..00000000000
--- a/src/nnet2/nnet-precondition-online.h
+++ /dev/null
@@ -1,574 +0,0 @@
-// nnet2/nnet-precondition-online.h
-
-// Copyright 2013-2015   Johns Hopkins University (author: Daniel Povey)
-//                2015   Xiaohui Zhang
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_NNET_PRECONDITION_ONLINE_H_
-#define KALDI_NNET2_NNET_PRECONDITION_ONLINE_H_
-
-#include <iostream>
-#include <mutex>
-#include "base/kaldi-common.h"
-#include "matrix/matrix-lib.h"
-#include "cudamatrix/cu-matrix-lib.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-/**
-   Keywords for search: natural gradient, naturalgradient, NG-SGD
-
-   This method is explained in the paper
-   "Parallel training of DNNs with Natural Gradient and Parameter Averaging"
-   by D. Povey, X. Zhang and S. Khudanpur, ICLR Workshop, 2015, where
-   it is referred to as online NG-SGD.  Note that the method exported
-   from this header is just the core of the algorithm, and some outer-level parts
-   of it are implemented in class NaturalGradientAffineComponent.
-
-  The rest of this extended comment describes the way we keep updated an estimate
-  of the inverse of a scatter matrix, in an online way.  This is the same as the
-  estimation of one of the A or B quantities in the paper.  This comment is slightly
-  redundant with the paper- actually it precedes the paper- but we keep it in case it
-  is useful in understanding our method.
-
-  We consider the problem of doing online estimation of a (scaled-identity plus low-rank)
-  approximation of a Fisher matrix... since the Fisher matrix is a scatter of vector-valued derivatives
-  and we will be given the derivatives (or at least terms in a factorization of the derivatives
-  which need not concern us right now), we can just think of the present task as being
-  the online accumulation of a (low-rank plus scaled-identity) approximation to a variance
-  of a distribution with mean zero.
-
-  Later on we'll think about how to get easy access to the inverse of this approximate
-  variance, which is what we really need.
-
-  Our approximation to the Fisher matrix (the scatter of derivatives) will be of the following form
-  (and just think of this as an approximate variance matrix of some arbitrary quantities).
-
-     F_t =(def) R_t^T D_t R_t + \rho_t I
-
-  (t is the minibatch index), where R_t is an R by D matrix with orthonormal
-  rows (1 <= R < D is our chosen rank), D_t is a positive-definite diagonal matrix, and
-  \rho_t > 0.  Suppose the dimension of F_t is D.  Let the vectors whose variance
-  we are approximating be provided in minibatches of size N (N can vary from
-  iteration to iteration, but it won't vary in the normal case, so we omit the
-  subscript t).  The batch of gradients is given as X_t \in Re^{N \times D},
-  i.e. each row is one of the vectors whose scatter we're estimating.  On the
-  t'th iteration, define the scatter S_t of the input vectors X_t as:
-
-     S_t =(def) 1/N X_t^T X_t           (eqn:St)
-
-  (where N is the minibatch size).  Be careful not to confuse the rank R with
-  the input X_t (we would typeface X_t in bold if this were not plain text, to
-  make the distinction clearer).  We want F_t to approach some kind of
-  time-weighted average of the S_t quantities, to the extent permitted by the
-  limitation of the rank R.  We want the F_t quantities to stay "fresh" (since
-  we'll be doing this in a SGD context and the parameters will be slowly
-  changing).  We use a constant 0 < \eta < 1 to control the updating rate.  Our
-  update for R_t is based on the power method.  Define the smoothed scatter
-
-   T_t =(def) \eta S_t + (1-\eta) F_t
-
-  we'll use this in place of the observed scatter S_t, to slow down the update.
-  Defining
-
-   Y_t =(def) R_t T_t
-
-  which can be expanded as follows:
-       Y_t = R_t ( \eta S_t + (1-\eta) F_t )
-           = R_t ( \eta S_t + (1-\eta) (R_t^T D_t R_t + \rho_t I) )
-           = \eta R_t S_t + (1-\eta) (R_t R_t^T D_t R_t + \rho_t R_t)
-           = \eta R_t S_t + (1-\eta) (D_t + \rho_t I) R_t
-
-  It is useful to think of Y_t as having each of the top eigenvectors of the
-  scatter scaled by the corresponding eigenvalue \lambda_i.
-  We compute the following R by R matrix:
-    Z_t =(def) Y_t Y_t^T
-  and do the symmetric eigenvalue decomposition
-    Z_t = U_t C_t U_t^T
-  where C_t is diagonal and U_t orthogonal; the diagonal elements of C_t will be
-  positive (since \rho_t > 0, T_t is positive definite; since R_t has full row rank
-  and T_t is positive definite, Y_t has full row rank; hence Z_t is positive definite).
-  The diagonal elements of C_t can be thought of as corresponding to the squares of
-  our current estimate of the top eigenvalues of the scatter matrix.
-  [we should check that no element of C_t is <= 0.]
-
-  It is easy to show that C_t^{-0.5} U_t^T Z_t U_t C_t^{-0.5} = I, so
-     (C_t^{-0.5} U_t^T Y_t) (Y_t^T U_t C_t^{-0.5}) = I.  Define
-    R_{t+1} =(def) C_t^{-0.5} U_t^T Y_t
-
-  and it's clear that R_{t+1} R_{t+1}^T = I.
-  We will set
-     D_{t+1} =(def) C_t^{0.5} - \rho_{t+1} I             (eqn:dt1)
-
-  which ensures that for each row r of R_{t+1}, the variance of our scatter
-  matrix F_{t+1} will be the square root of the corresponding diagonal element
-  of C_t.  This makes sense because, as we have pointed out, the diagonal
-  elements of C_t can be thought of as corresponding to squared eigenvalues.
-  But a proper treatment of this would require convergence analysis that would
-  get quite complicated.  We will choose \rho_{t+1} in order to ensure that
-  tr(F_{t+1}) = tr(T_t).
-
-  For any t,
-     tr(F_t) = D \rho_t + tr(D_t)
-     tr(T_t) = \eta tr(S_t) + (1-\eta) tr(F_t)
-             = \eta tr(S_t) + (1-\eta) (D \rho_t + tr(D_t))
-  Expanding out D_{t+1} from (eqn:dt1) in the expression for tr(F_{t+1}) below:
-      tr(F_{t+1})  = D \rho_{t+1} +  tr(D_{t+1})
-      tr(F_{t+1})  = D \rho_{t+1} +  tr(C_t^{0.5} - \rho_{t+1} I)
-                   = (D - R) \rho_{t+1} + tr(C_t^{0.5})
-   and equating tr(F_{t+1}) with tr(T_t) (since F_{t+1} is supposed to be a low-rank
-   approximation to T_t), we have
-                          tr(F_{t+1}) = tr(T_t)
-  (D - R) \rho_{t+1} + tr(C_t^{0.5})  = \eta tr(S_t) + (1-\eta) (D \rho_t + tr(D_t))
-
-  Solving for \rho_{t+1},
-       \rho_{t+1} = 1/(D - R) (\eta tr(S_t) + (1-\eta)(D \rho_t + tr(D_t)) - tr(C_t^{0.5})).   (eqn:rhot1)
-
-  Note that it is technically possible that diagonal elements of
-  D_{t+1} may be negative, but we can still show that F_{t+1} is strictly
-  positive definite if F_t was strictly positive definite.
-
-  If the quantities for which we are computing the Fisher matrix are all zero
-  for some reason, the sequence of F_t will geometrically approach zero, which
-  would cause problems with inversion; to prevent this happening, after setting
-  D_{t+1} and \rho_{t+1} as above, we floor \rho_{t+1} to a small value (like
-  1.0e-10).
-
-  OK, we have described the updating of R_t, D_t and \rho_t.  Next, we need to
-  figure out how to efficiently multiply by the inverse of F_t.  Our experience
-  from working with the old preconditioning method was that it's best not to use
-  the inverse of the Fisher matrix itself, but a version of the Fisher matrix
-  that's smoothed with some constant times the identity.  Below, \alpha is a
-  configuration value (e.g. 4.0 seemed to work well).  The following formula is
-  designed to ensure that the smoothing varies proportionally with the scale of F_t:
-
-        G_t =(def) F_t +  \alpha/D tr(F_t) I
-            =     R_t^T D_t R_t + (\rho_t + \alpha/D tr(F_t)) I
-            =     R_t^T D_t R_t + \beta_t I
-  where
-    \beta_t =(def) \rho_t + \alpha/D tr(F_t)
-            =      \rho_t(1+\alpha) + \alpha/D tr(D_t)       (eqn:betat2)
-
-  Define
-     \hat{X}_t =(def)  \beta_t X_t G_t^{-1}.
-  the factor of \beta_t is inserted arbitrarily as it just happens to be convenient
-  to put unit scale on X_t in the formula for \hat{X}_t; it will anyway be canceled out
-  in the next step.  Then our final preconditioned minibatch of vectors is:
-     \bar{X}_t = \gamma_t \hat{X}_t
-  where
-     \gamma_t = sqrt(tr(X_t X_t^T)  / tr(\hat{X}_t \hat{X}_t^T)).
-  The factor of \gamma ensures that \bar{X}_t is scaled to have the same overall
-  2-norm as the input X_t.  We found in previous versions of this method that this
-  rescaling was helpful, as otherwise there are certain situations (e.g. at the
-  start of training) where the preconditioned derivatives can get very large.  Note
-  that this rescaling introduces a small bias into the training, because now the
-  scale applied to a given sample depends on that sample itself, albeit in an
-  increasingly diluted way as the minibatch size gets large.
-
-  To efficiently compute G_t^{-1}, we will use the Woodbury matrix identity.
-  Writing the Woodbury formula for the symmetric case,
-    (A + U D U^T)^{-1} = A^{-1} - A^{-1} U (D^{-1} + U^T A^{-1} U)^{-1} U^T A^{-1}
-  Substituting A = \beta_t I, D = D_t and U = R_t^T, this becomes
-       G_t^{-1} = 1/\beta_t I - 1/\beta_t^2 R_t^T (D_t^{-1} + 1/\beta_t I)^{-1} R_t
-                = 1/\beta_t (I - R_t^T E_t R_t)
-  where
-        E_t =(def)  1/\beta_t (D_t^{-1} + 1/\beta_t I)^{-1},         (eqn:etdef)
-  so
-    e_{tii} =   1/\beta_t * 1/(1/d_{tii} + 1/\beta_t)                (eqn:tii)
-            =   1/(\beta_t/d_{tii} + 1)
-
-  We would like an efficient-to-compute expression for \hat{X}_t, without too many separate
-  invocations of kernels on the GPU.
-     \hat{X}_t = \beta_t X_t G_t^{-1}
-         = X_t - X_t R_t^T E_t R_t
-  For efficient operation on the GPU, we want to reduce the number of high-dimensional
-  operations that we do (defining "high-dimension" as anything involving D or N, but not
-  R, since R is likely small, such as 20).  We define
-     W_t =(def)  E_t^{0.5} R_t.
-  We will actually be storing W_t on the GPU rather than R_t, in order to reduce the
-  number of operations on the GPU.  We can now write:
-
-        \hat{X}_t = X_t - X_t W_t^T W_t       (eqn:pt2)
-
-  The following, which we'll compute on the GPU, are going to be useful in computing
-  quantities like Z_t:
-
-     H_t =(def) X_t W_t^T     (dim is N by R)
-     J_t =(def) H_t^T X_t     (dim is R by D)
-         =      W_t X_t^T X_t
-     K_t =(def) J_t J_t^T     (dim is R by R, symmetric).. transfer this to CPU.
-     L_t =(def) H_t^T H_t     (dim is R by R, symmetric).. transfer this to CPU.
-         =      W_t X_t^T X_t W_t^T
-     Note: L_t may also be computed as
-     L_t = J_t W_t^T
-     which may be more efficient if D < N.
-
-  Note: after we have computed H_t we can directly compute
-     \hat{X}_t = X_t - H_t W_t
-
-  We need to determine how Y_t and Z_t relate to the quantities we just defined.
-  First, we'll expand out H_t, J_t, K_t and L_t in terms of the more fundamental quantities.
-     H_t = X_t R_t^T E_t^{0.5}
-     J_t = E_t^{0.5} R_t X_t^T X_t
-     K_t = E_t^{0.5} R_t X_t^T X_t X_t^T X_t R_t^T E_t^{0.5}
-     L_t = E_t^{0.5} R_t X_t^T X_t R_t^T E_t^{0.5}
-
-  we wrote above that
-      Y_t = \eta R_t S_t + (1-\eta) (D_t + \rho_t I) R_t
-  so
-      Y_t = \eta/N R_t X_t^T X_t   + (1-\eta) (D_t + \rho_t I) R_t
-          = \eta/N E_t^{-0.5} J_t  + (1-\eta) (D_t + \rho_t I) R_t     (eqn:yt)
-  We will expand Z_t using the expression for Y_t in the line above:
-      Z_t = Y_t Y_t^T
-          =  (\eta/N)^2 E_t^{-0.5} J_t J_t^T E_t^{-0.5}
-            +(\eta/N)(1-\eta) E_t^{-0.5} J_t R_t^T (D_t + \rho_t I)
-            +(\eta/N)(1-\eta) (D_t + \rho_t I) R_t J_t^T E_t^{-0.5}
-            +(1-\eta)^2 (D_t + \rho_t I)^2
-          = (\eta/N)^2 E_t^{-0.5} K_t E_t^{-0.5}
-           +(\eta/N)(1-\eta) E_t^{-0.5} L_t E_t^{-0.5} (D_t + \rho_t I)
-           +(\eta/N)(1-\eta) (D_t + \rho_t I) E_t^{-0.5} L_t E_t^{-0.5}
-           +(1-\eta)^2 (D_t + \rho_t I)^2                              (eqn:Zt)
-  We compute Z_t on the CPU using the expression above, and then do the symmetric
-  eigenvalue decomposition (also on the CPU):
-      Z_t = U_t C_t U_t^T.
-  and we make sure the eigenvalues are sorted from largest to smallest, for
-  reasons that will be mentioned later.
-
-  Mathematically, no diagonal element of C_t can be less than (1-\eta)^2
-  \rho_t^2, and since negative or zero elements of C_t would cause us a problem
-  later, we floor C_t to this value.  (see below regarding how we ensure R_{t+1}
-  has orthonormal rows).
-
-  We will continue the discussion below regarding what we do with C_t and U_t.
-  Next, we need to digress briefly and describe how to compute
-  tr(\hat{X}_t \hat{X}_t^T) and tr(X_t X_t^T), since these appear in expressions for
-  \gamma_t (needed to produce the output \bar{X}_t), and for \rho_{t+1}.  It happens
-  that we need, for purposes of applying "max_change" in the neural net code, the
-  squared 2-norm of each row of the output \bar{X}_t.  In order to be able to compute
-  \gamma_t, it's most convenient to compute this squared row-norm for each row
-  of \hat{X}_t, as a vector, to compute tr(\hat{X}_t \hat{X}_t^T) from this vector as its sum, and
-  to then work back to compute tr(X_t X_t^T) from the relation between \hat{X}_t and
-  X_t.  We can then scale the row-norms we computed for \hat{X}_t, so they apply to
-  \bar{X}_t.
-
-  For current purposes, you can imagine that we computed tr(\hat{X}_t \hat{X}_t^T) directly.
-  Using (from eqn:pt2)
-      \hat{X}_t = X_t - X_t W_t^T W_t,
-  we can expand tr(\hat{X}_t \hat{X}_t^T) as:
-   tr(\hat{X}_t \hat{X}_t^T) = tr(X_t X_t^T) + tr(X_t W_t^T W_t W_t^T W_t X_t^T)
-                  - 2 tr(X_t W_t^T W_t X_t^T)
-                 = tr(X_t X_t^T) + tr(W_t X_t^T X_t W_t^T W_t W_t^T)
-                  - 2 tr(W_t X_t^T X_t W_t^T)
-                 = tr(X_t X_t^T) + tr(L_t W_t W_t^T) - 2 tr(L_t)
-                 = tr(X_t X_t^T) + tr(L_t E_t) - 2 tr(L_t)
-  and all quantities have already been computed (or are quick to compute, such as
-  the small traces on the right), except tr(X_t X_t^T), so we can write
-
-    tr(X_t X_t^T) = tr(\hat{X}_t \hat{X}_t^T) - tr(L_t E_t) + 2 tr(L_t)
-  and the above expression can be used to obtain tr(X_t X_t^T).
-  We can then do
-     \gamma_t <-- sqrt(tr(X_t X_t^T)  / tr(\hat{X}_t \hat{X}_t^T)).
-  (or one if the denominator is zero), and then
-      \bar{X}_t <-- \gamma_t \hat{X}_t
-  We can then output the per-row squared-l2-norms of \bar{X}_t by scaling those
-  we computed from \hat{X}_t by \gamma_t^2.
-
-  OK, the digression on how to compute \gamma_t and tr(X_t X_t^T) is over.
-  We now return to the computation of R_{t+1}, W_{t+1}, \rho_{t+1}, D_{t+1} and E_{t+1}.
-
-  We found above in (eqn:rhot1)
-     \rho_{t+1} = 1/(D - R) (\eta tr(S_t) + (1-\eta)(D \rho_t + tr(D_t)) - tr(C_t^{0.5})).
-  Expanding out S_t from its definition in (eqn:St),
-     \rho_{t+1} = 1/(D - R) (\eta/N tr(X_t X_t^T) + (1-\eta)(D \rho_t + tr(D_t)) - tr(C_t^{0.5})).
-  We can compute this directly as all the quantities involved are already known
-  or easy to compute.
-  Next, from (eqn:dt1), we compute
-     D_{t+1} = C_t^{0.5} - \rho_{t+1} I
-  At this point if \rho_{t+1} is smaller than some small value \epsilon, e.g. 1.0e-10, we
-  set it to \epsilon; as mentioned, we do this to stop F_t approaching zero if all inputs
-  are zero.  Next, if any diagonal element D_{t+1,i,i} has absolute value less
-  than \epsilon, we set it to +\epsilon.  This is to ensure that diagonal
-  elements of E are never zero, which would cause problems.
-
-  Next, we compute (from eqn:betat2, eqn:etdef, eqn:tii),
-        \beta_{t+1} = \rho_{t+1} (1+\alpha) + \alpha/D tr(D_{t+1})
-            E_{t+1} = 1/\beta_{t+1} (D_{t+1}^{-1} + 1/\beta_{t+1} I)^{-1},
- i.e.:   e_{t+1,ii} = 1/(\beta_{t+1}/d_{t+1,ii} + 1)
-
- We'll want to store D_{t+1}.  We next want to compute W_{t+1}.
-
-  Before computing W_{t+1}, we need to find an expression for
-     R_{t+1} = C_t^{-0.5} U_t^T Y_t
-   Expanding out Y_t using the expression in (eqn:yt),
-     R_{t+1} = C_t^{-0.5} U_t^T  (\eta/N E_t^{-0.5} J_t  + (1-\eta) (D_t + \rho_t I) R_t)
-             =  (\eta/N C_t^{-0.5} U_t^T E_t^{-0.5})  J_t
-               +((1-\eta) C_t^{-0.5} U_t^T (D_t + \rho_t I) E_t^{-0.5}) W_t
-
-   What we actually want is W_{t+1} = E_{t+1}^{0.5} R_{t+1}:
-     W_{t+1} = (\eta/N E_{t+1}^{0.5} C_t^{-0.5} U_t^T E_t^{-0.5}) J_t
-              +((1-\eta) E_{t+1}^{0.5} C_t^{-0.5} U_t^T (D_t + \rho_t I) E_t^{-0.5}) W_t
-   and to minimize the number of matrix-matrix multiplies we can factorize this as:
-     W_{t+1} = A_t B_t
-        A_t = (\eta/N) E_{t+1}^{0.5} C_t^{-0.5} U_t^T E_t^{-0.5}
-        B_t = J_t + (1-\eta)/(\eta/N) (D_t + \rho_t I) W_t
-   [note: we use the fact that (D_t + \rho_t I) and E_t^{-0.5} commute because
-    they are diagonal].
-
-  A_t is computed on the CPU and transferred from there to the GPU, B_t is
-  computed on the GPU, and the multiplication of A_t with B_t is done on the GPU.
-
-   * Keeping R_t orthogonal *
-
-   Our method requires the R_t matrices to be orthogonal (which we define to
-   mean that R_t R_t^T = I).  If roundoff error causes this equality to be
-   significantly violated, it could cause a problem for the stability of our
-   method.  We now address our method for making sure that the R_t values stay
-   orthogonal.  We do this in the algorithm described above, after creating
-   W_{t+1}.  This extra step is only executed if the condition number of C_t
-   (i.e. the ratio of its largest to smallest diagonal element) exceeds a
-   specified threshold, such as 1.0e+06 [this is tested before applying the
-   floor to C_t].  The threshold was determined empirically by finding the
-   largest value needed to ensure a certain level of orthogonality in R_{t+1}.
-   For purposes of the present discussion, since R_{t+1} is not actually stored,
-   define it as E_{t+1}^{-0.5} W_{t+1}.  Define the following (and we will
-   just use t instead of t+1 below, as all quantities have the same subscript):
-
-      O_t =(def) R_t R_t^T
-          =  E_t^{-0.5} W_t W_t^T E_t^{-0.5}
-
-   (and we would compute this by computing W_t W_t^T on the GPU, transferring
-   it to the CPU, and doing the rest there).  If O_t is not sufficiently close
-   to the unit matrix, we can re-orthogonalize as follows:
-   Do the Cholesky decomposition
-      O_t = C C^T
-   Clearly C^{-1} O_t C^{-T} = I, so if we correct R_t with:
-      R_t <-- C^{-1} R_t
-   we can ensure orthogonality.  If R_t's first k rows are orthogonal, this
-   transform will not affect them, because of its lower-triangular
-   structure... this is good because (thanks to the eigenvalue sorting), the
-   larger eigenvectors are first and it is more critical to keep them pointing
-   in the same direction.  Any loss of orthogonality will be dealt with by
-   modifying the smaller eigenvectors.
-   As a modification to W_t, this would be:
-      W_t <-- (E_t^{0.5} C^{-1} E_t^{-0.5}) W_t,
-   and the matrix in parentheses is computed on the CPU, transferred to the
-   GPU, and the multiplication is done there.
-
-
-   * Initialization *
-
-   Now, a note on what we do on time t = 0, i.e. for the first minibatch.  We
-   initialize R_0 to the top R eigenvectors of 1/N X_0^T X_0, where N is the
-   minibatch size (num-rows of X_0).  If L is the corresponding RxR diagonal
-   matrix of eigenvalues, then we will set D_0 = L - \rho_0 I.  We set \rho_0
-   to ensure that
-                      tr(F_0) = 1/N tr(X_0 X_0^T),
-           tr(D_0) + \rho_0 D = 1/N tr(X_0 X_0^T),
-  tr(L) - \rho_0 R + \rho_0 D = 1/N tr(X_0 X_0^T)
-                       \rho_0 = (1/N tr(X_0 X_0^T) - tr(L)) / (D - R)
-
-   We then floor \rho_0 to \epsilon (e.g. 1.0e-10) and also floor the
-   diagonal elements of D_0 to \epsilon; this ensures that we won't
-   crash for zero inputs.
-
-   A note on multi-threading.  This technique was really designed for use
-   with a GPU, where we won't have multi-threading, but we want it to work
-   also on a CPU, where we may have multiple worker threads.
-   Our approach is as follows (we do this when we're about to start updating
-   the parameters R_t, D_t, \rho_t and derived quantities):
-
-    For time t > 0 (where the matrices are already initialized), before starting
-    the part of the computation that updates the parameters (R_t, D_t, \rho_t and
-    derived quantities), we try to lock a mutex that guards the OnlinePreconditioner.
-    If we can lock it right away, we go ahead and do the update, but if not,
-    we just abandon the attempt to update those quantities.
-
-    We will have another mutex to ensure that when we access quantities like
-    W_t, \rho_t they are all "in sync" (and we don't access them while they are
-    being written by another thread).  This mutex will only be locked for short
-    periods of time.
-
-   Note: it might be a good idea to make sure that the R_t still retain orthonormal
-   rows even in the presence of roundoff, without errors accumulating.  My instinct
-   is that this isn't going to be a problem.
- */
-
-
-class OnlinePreconditioner {
- public:
-  OnlinePreconditioner();
-
-  void SetRank(int32 rank);
-  void SetUpdatePeriod(int32 update_period);
-  // num_samples_history is a time-constant (in samples) that determines eta.
-  void SetNumSamplesHistory(BaseFloat num_samples_history);
-  void SetAlpha(BaseFloat alpha);
-  void TurnOnDebug() { self_debug_ = true; }
-  BaseFloat GetNumSamplesHistory() const { return num_samples_history_; }
-  BaseFloat GetAlpha() const { return alpha_; }
-  int32 GetRank() const { return rank_; }
-  int32 GetUpdatePeriod() const { return update_period_; }
-
-  // The "R" pointer is both the input (X_t in the extended comment above) and
-  // the output (\hat{X}_t there; equal to the preconditioned directions before
-  // scaling by gamma).  If the pointer "row_prod" is supplied, it's set to the
-  // inner product of each row of \hat{X}_t, at output, with itself.
-  // You would need to apply "scale" to R and "scale * scale" to row_prod, to
-  // get the preconditioned directions; we don't do this ourselves, in order to
-  // save CUDA calls.
-  void PreconditionDirections(CuMatrixBase<BaseFloat> *R,
-                              CuVectorBase<BaseFloat> *row_prod,
-                              BaseFloat *scale);
-
-  // Copy constructor.
-  explicit OnlinePreconditioner(const OnlinePreconditioner &other);
-  // Assignment operator
-  OnlinePreconditioner &operator = (const OnlinePreconditioner &other);
- private:
-
-  // This does the work of PreconditionDirections (the top-level
-  // function handles some multithreading issues and then calls this function).
-  // Note: WJKL_t (dimension 2*R by D + R) is [ W_t L_t; J_t K_t ].
-  void PreconditionDirectionsInternal(const int32 t,
-                                      const BaseFloat rho_t,
-                                      const Vector<BaseFloat> &d_t,
-                                      CuMatrixBase<BaseFloat> *WJKL_t,
-                                      CuMatrixBase<BaseFloat> *X_t,
-                                      CuVectorBase<BaseFloat> *row_prod,
-                                      BaseFloat *scale);
-
-  void ComputeEt(const VectorBase<BaseFloat> &d_t,
-                 BaseFloat beta_t,
-                 VectorBase<BaseFloat> *e_t,
-                 VectorBase<BaseFloat> *sqrt_e_t,
-                 VectorBase<BaseFloat> *inv_sqrt_e_t) const;
-
-  void ComputeZt(int32 N,
-                 BaseFloat rho_t,
-                 const VectorBase<BaseFloat> &d_t,
-                 const VectorBase<BaseFloat> &inv_sqrt_e_t,
-                 const MatrixBase<BaseFloat> &K_t,
-                 const MatrixBase<BaseFloat> &L_t,
-                 SpMatrix<double> *Z_t) const;
-  // Computes W_{t+1}.  Overwrites J_t.
-  void ComputeWt1(int32 N,
-                  const VectorBase<BaseFloat> &d_t,
-                  const VectorBase<BaseFloat> &d_t1,
-                  BaseFloat rho_t,
-                  BaseFloat rho_t1,
-                  const MatrixBase<BaseFloat> &U_t,
-                  const VectorBase<BaseFloat> &sqrt_c_t,
-                  const VectorBase<BaseFloat> &inv_sqrt_e_t,
-                  const CuMatrixBase<BaseFloat> &W_t,
-                  CuMatrixBase<BaseFloat> *J_t,
-                  CuMatrixBase<BaseFloat> *W_t1) const;
-
-  // This function is called if C_t has high condition number; it makes sure
-  // that R_{t+1} is orthogonal.  See the section in the extended comment above
-  // on "keeping R_t orthogonal".
-  void ReorthogonalizeXt1(const VectorBase<BaseFloat> &d_t1,
-                          BaseFloat rho_t1,
-                          CuMatrixBase<BaseFloat> *W_t1,
-                          CuMatrixBase<BaseFloat> *temp_W,
-                          CuMatrixBase<BaseFloat> *temp_O);
-
-  void Init(const CuMatrixBase<BaseFloat> &R0);
-
-  // Initialize to some small 'default' values, called from Init().  Init() then
-  // does a few iterations of update with the first batch's data to give more
-  // reasonable values.
-  void InitDefault(int32 D);
-
-  // initializes R, which is assumed to have at least as many columns as rows,
-  // to a specially designed matrix with orthonormal rows, that has no zero rows
-  // or columns.
-  static void InitOrthonormalSpecial(CuMatrixBase<BaseFloat> *R);
-
-  // Returns the learning rate eta as the function of the number of samples
-  // (actually, N is the number of vectors we're preconditioning, which due to
-  // context is not always exactly the same as the number of samples).  The
-  // value returned depends on num_samples_history_.
-  BaseFloat Eta(int32 N) const;
-
-  // called if self_debug_ = true, makes sure the members satisfy certain
-  // properties.
-  void SelfTest() const;
-
-  // Configuration values:
-
-  // The rank of the correction to the unit matrix (e.g. 20).
-  int32 rank_;
-
-  // After a few initial iterations of updating whenever we can, we start only
-  // updating the Fisher-matrix parameters every "update_period_" minibatches;
-  // this saves time.
-  int32 update_period_;
-
-  // num_samples_history_ determines the value of eta, which in turn affects how
-  // fast we update our estimate of the covariance matrix.  We've done it this
-  // way in order to make it easy to have a single configuration value that
-  // doesn't have to be changed when we change the minibatch size.
-  BaseFloat num_samples_history_;
-
-  // alpha controls how much we smooth the Fisher matrix with the unit matrix.
-  // e.g. alpha = 4.0.
-  BaseFloat alpha_;
-
-  // epsilon is an absolute floor on the unit-matrix scaling factor rho_t in our
-  // Fisher estimate, which we set to 1.0e-10.  We don't actually make this
-  // configurable from the command line.  It's needed to avoid crashes on
-  // all-zero inputs.
-  BaseFloat epsilon_;
-
-  // delta is a relative floor on the unit-matrix scaling factor rho_t in our
-  // Fisher estimate, which we set to 1.0e-05: this is relative to the largest
-  // value of D_t.  It's needed to control roundoff error.  We apply the same
-  // floor to the eigenvalues in D_t.
-  BaseFloat delta_;
-
-  // t is a counter that measures how many updates we've done.
-  int32 t_;
-
-  // This keeps track of how many minibatches we've skipped updating the parameters,
-  // since the most recent update; it's used in enforcing "update_period_", which
-  // is a mechanism to avoid spending too much time updating the subspace (which can
-  // be wasteful).
-  int32 num_updates_skipped_;
-
-  // If true, activates certain checks.
-  bool self_debug_;
-
-  CuMatrix<BaseFloat> W_t_;
-  BaseFloat rho_t_;
-  Vector<BaseFloat> d_t_;
-
-
-  // Used to prevent parameters being read or written in an inconsistent state.
-  std::mutex read_write_mutex_;
-
-  // This mutex is used to control which thread gets to update the
-  // parameters, in multi-threaded code.
-  std::mutex update_mutex_;
-};
-
-} // namespace nnet2
-} // namespace kaldi
-
-
-#endif
diff --git a/src/nnet2/nnet-precondition-test.cc b/src/nnet2/nnet-precondition-test.cc
deleted file mode 100644
index b84e6117790..00000000000
--- a/src/nnet2/nnet-precondition-test.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-// nnet2/nnet-precondition-test.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-precondition.h"
-#include "util/common-utils.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-void UnitTestPreconditionDirections() {
-  MatrixIndexT N = 2 + Rand() % 30,
-               D = 1 + Rand() % 20;
-  BaseFloat lambda = 0.1;
-  CuMatrix<BaseFloat> R(N, D), P(N, D);
-  R.SetRandn();
-  P.SetRandn(); // contents should be overwritten.
-
-  PreconditionDirections(R, lambda, &P);
-  // The rest of this function will do the computation the function is doing in
-  // a different, less efficient way and compare with the function call.
-  
-  CuSpMatrix<BaseFloat> G(D);
-  G.SetUnit();
-  G.ScaleDiag(lambda);
-  // G += R^T R.
-  G.AddMat2(1.0/(N-1), R, kTrans, 1.0);
-  
-  for (int32 n = 0; n < N; n++) {
-    CuSubVector<BaseFloat> rn(R, n);
-    CuSpMatrix<BaseFloat> Gn(G);
-    Gn.AddVec2(-1.0/(N-1), rn); // subtract the
-    // outer product of "this" vector.
-    Gn.Invert();
-    CuSubVector<BaseFloat> pn(P, n);
-    CuVector<BaseFloat> pn_compare(D);
-    pn_compare.AddSpVec(1.0, Gn, rn, 0.0);
-    KALDI_ASSERT(pn.ApproxEqual(pn_compare, 0.1));
-  }
-}
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-
-int main() {
-  using namespace kaldi;
-  using namespace kaldi::nnet2;
-  for (int32 i = 0; i < 10; i++)
-    UnitTestPreconditionDirections();
-}
diff --git a/src/nnet2/nnet-precondition.cc b/src/nnet2/nnet-precondition.cc
deleted file mode 100644
index f828418ce8b..00000000000
--- a/src/nnet2/nnet-precondition.cc
+++ /dev/null
@@ -1,352 +0,0 @@
-// nnet2/nnet-precondition.cc
-
-// Copyright 2012   Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-precondition.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-/// See below for comment.
-void PreconditionDirections(const CuMatrixBase<BaseFloat> &R,
-                            double lambda,
-                            CuMatrixBase<BaseFloat> *P) {
-  
-  int32 N = R.NumRows(), D = R.NumCols();
-  KALDI_ASSERT(SameDim(R, *P) && N > 0);
-  if (N == 1) {
-    KALDI_WARN << "Trying to precondition set of only one frames: returning "
-               << "unchanged.  Ignore this warning if infrequent.";
-    P->CopyFromMat(R);
-    return;
-  }
-  CuMatrixBase<BaseFloat> &Q = *P;
-  
-  if (N >= D) {
-    // Compute G = (\lambda I + 1/(N-1) R^T R)^{-1} by direct inversion.
-    // G <-- lambda I.
-    CuMatrix<BaseFloat> G(D, D);
-    G.AddToDiag(lambda);
-    // G += 1.0/(N-1) * R^T R.
-    G.SymAddMat2(1.0 / (N-1), R, kTrans, 1.0);
-    G.CopyLowerToUpper();
-    if (GetVerboseLevel() >= 5 && Rand() % 20 == 0) {
-      CuSpMatrix<BaseFloat> tmp(G, kTakeLower);
-      SpMatrix<BaseFloat> G_cpu(tmp);
-      G_cpu.PrintEigs("G");
-    }
-    G.SymInvertPosDef();
-    // Q <-- R G^T (we just make it transposed as we think
-    // it will be slightly faster; it's symmetric).
-    Q.AddMatMat(1.0, R, kNoTrans, G, kTrans, 0.0);
-  } else {
-    // Through a lot of rearrangements, it turns out that
-    // if we let  S = (\lambda I + 1/(N-1) R R^T)^{-1}
-    // then what we need is
-    // Q <-- S R.
-    // It is curious and (to me) unexpected that the actual code is basically
-    // the same when transposed.
-    CuMatrix<BaseFloat> S(N, N);
-    // S <-- lambda I.
-    S.AddToDiag(lambda);
-    // S += (N-1) R R^T.
-    // the following function only updates the lower triangle.
-    S.SymAddMat2(1.0 / (N-1), R, kNoTrans, 1.0);
-    S.CopyLowerToUpper();
-    // invert S, so now S = (\lambda I + (N-1) R R^T)^{-1}.
-    if (GetVerboseLevel() >= 5 && Rand() % 20 == 0) {
-      CuSpMatrix<BaseFloat> tmp(S, kTakeLower);
-      SpMatrix<BaseFloat> S_cpu(tmp);
-      S_cpu.PrintEigs("S");
-    }
-    S.SymInvertPosDef();
-    Q.AddMatMat(1.0, S, kNoTrans, R, kNoTrans, 0.0);
-  }
-
-#if 0  // Old code before it was optimized for CUDA:
-  for (int32 n = 0; n < N; n++) {
-    CuSubVector<BaseFloat> r(R, n), q(Q, n);
-    BaseFloat gamma = VecVec(r, q), // gamma_n = r_n^T q_n.
-               beta = 1 + gamma / (N - 1 - gamma);
-    if (!(gamma >= 0.0 && beta > 0.0)) {
-      KALDI_ERR << "Bad values encountered in preconditioning: gamma = " << gamma
-                << ", beta = " << beta;
-    }
-    // Q and P share the same memory.  The result of the
-    // scaling below will be output as P.
-    q.Scale(beta);
-  }
-#else
-  CuVector<BaseFloat> gamma(N);
-  gamma.AddDiagMatMat(1.0, R, kNoTrans, Q, kTrans, 0.0);
-  // at this point, gamma(i) equals the i'th row of R dotted with
-  // the i'th row of Q.
-  Vector<BaseFloat> cpu_gamma(gamma), cpu_beta(N, kUndefined);
-  for (int32 n = 0; n < N; n++) {
-    BaseFloat this_gamma = cpu_gamma(n),
-        this_beta = 1.0 + this_gamma / (N - 1 - this_gamma);
-    if (!(this_gamma >= 0.0 && this_beta > 0.0))
-      KALDI_ERR << "Bad values encountered in preconditioning: gamma = "
-                << this_gamma << ", beta = " << this_beta;
-    cpu_beta(n) = this_beta;
-  }
-  CuVector<BaseFloat> beta(cpu_beta);
-  P->MulRowsVec(beta);
-#endif
-}
-
-
-void PreconditionDirectionsAlpha(
-    const CuMatrixBase<BaseFloat> &R,
-    double alpha,
-    CuMatrixBase<BaseFloat> *P) {
-  KALDI_ASSERT(alpha > 0.0);
-  // probably does not really make sense.
-  double t = TraceMatMat(R, R, kTrans), floor = 1.0e-20;
-  if (t < floor) {
-    KALDI_WARN << "Flooring trace from " << t
-               << " to " << floor;
-    t = floor;
-  }
-  double lambda = t * alpha / R.NumRows() / R.NumCols();
-  // see the extended comment below for an explanation of this.
-  if (lambda <= 0.0) {
-    // This should never really happen, it would probably indicate a bug
-    // in the calling code.
-    KALDI_WARN << "Zero or negative lambda in PreconditionDirectionsAlpha.";
-    lambda = 1.0e-10;
-  }
-  PreconditionDirections(R, lambda, P);
-}
-
-
-void PreconditionDirectionsAlphaRescaled(
-    const CuMatrixBase<BaseFloat> &R,
-    double alpha,
-    CuMatrixBase<BaseFloat> *P) {
-  KALDI_ASSERT(alpha > 0.0); // alpha > 1.0
-  // probably does not really make sense.
-  double t = TraceMatMat(R, R, kTrans), floor = 1.0e-20;
-  if (t == 0.0) {
-    P->CopyFromMat(R);
-    return;
-  }
-  if (t < floor) {
-    KALDI_WARN << "Flooring trace from " << t
-               << " to " << floor;
-    t = floor;
-  }
-  double lambda = t * alpha / R.NumRows() / R.NumCols();
-  // see the extended comment below for an explanation of this.
-  KALDI_ASSERT(lambda != 0.0);
-  PreconditionDirections(R, lambda, P);
-  double p_trace = TraceMatMat(*P, *P, kTrans),
-      rescale = sqrt(t / p_trace);
-  KALDI_ASSERT(p_trace != 0.0);
-  P->Scale(rescale);
-}
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-/*
-  Notes for an idea on preconditioning.
-  update is of form:
-     params += learning_rate * input_row * output_deriv'
-  want to precondition by fisher-like matrix in each of (the input dim and the
-  output dim).
-  [note: in this method we'll pretend the chunk-weights are all one.
-   It shouldn't really matter, it's only preconditioning.]
-
-   The first observation is, if we do:
-
-    params += learning_rate * S * input_row * output_deriv' * T
-
-   for any positive definite S and T that we choose (well, perhaps we have
-   to ensure their eigenvalues are bounded in some way, but we'll bother with
-   that later),  then we'll still get convergence.  But S and T cannot be
-   functions of the current sample, the one that creates "input_row" and
-   "output_deriv", or this introduces a bias.
-
-   We can view it as a preconditioning of the vectorized form of the
-   transformation matrix.
-
-   For a Fisher-like preconditioning, we can precondition using
-   the inverse of the scatter of the other features in the batch.
-   For the input_row, call this r_j.
-
-   Let the total scatter be
-
-    S =  \sum_n r_n r_n^T
-  where the sum is taken over the minibatch, and
-   S_n = S - r_n  r_n^T
-  i.e. the scatter with this sample removed.
-  Let F_n be the normalized version of this, dividing by the #samples.
-   F_n = 1/(N-1) S_n
-  where N is the minibatch size (so N-1 is excluding the current sample).
- We're going to want to invert F_n, so we need to make it positive definite.
-
-  We're going to define G_n as a smoothed form of the estimated Fisher matrix
-  for this batch:
-   G_n = F_n + \lambda_n I
-  where I is the identity.  A suitable formula for \lambda_n is to define
-  a small constant \alpha (say, \alpha=0.1), and let
-  
-   \lambda_n =  (\alpha/dim(F)) trace(F_n) .
-
-  In practice (although we lost strict convergence guarantees) it will be easier
-  to set a global \lambda, to:
-
-   \lambda  =  (\alpha/dim(S)) trace(S)
-            = (\alpha/(R.NumRows()*R.NumCols()) * trace(R^T R)).
-  
-  This is an easy way to set it.  Let's define P_n as the inverse of G_n.  This
-  is what we'll be multiplying the input values by:
-
-    P_n = G_n^{-1} = (F_n + \lambda_n I)^{-1}
-
-  First, let's define an uncorrected "global" Fisher matrix
-    F = (1/(N-1)) S_n,
-  and G = F^{-1}.
-  If we let R be the matrix each of whose rows is one of the r_n,
-  then
-    S = R^T R, and
-   F = 1/(N-1) R^T R
-
-           G = (F + \lambda I)^{-1}
-             = (1/(N-1) R^T R + \lambda I)^{-1}
-Using the Woodbury formula,
-     G  = (1/\lambda) I  - (1/\lambda^2) R^T M R
-where
-  M = ((N-1) I + 1/\lambda R R^T)^{-1}
-(and this inversion for M is actually done as an inversion, in a lower
- dimension such as 250, versus the actual dimension which might be 1000).
-
-Let's assume \lambda is a constant, i.e. there is no \lambda_n.
-We can get it from the previous minibatch.
-
- We want to compute
-
-    G_n = F_n^{-1} = (F - 1/(N-1) r_n r_n^T)^{-1}
-
- and using the Sherman-Morrison formula, this may be written as:
-
-   G_n = G  +  \alpha_n q_n q_n^T  # Caution: \alpha_n has nothing to do with \alpha.
-
- where q_n = G r_n, and
-
- \alpha_n =  1/( (N-1) (1 - 1/(N-1) r_n^T q_n) )
-          =  1 / (N - 1 - r_n^T q_n)
-
-  We'll want to compute this efficiently.  For each r_n we'll want to compute
-
- p_n =  G_n r_n
-
- which will correspond to the direction we update in.
- We'll use
-
-  p_n = G r_n + \alpha_n q_n q_n^T r_n
-
-  and since q_n = G r_n, both terms in this equation point in
-  the same direction, and we can write this as:
-
-  p_n = \beta_n q_n,
-
-  where, defining \gamma_n = r_n^T q_n, we have
-
-  \beta_n = 1 + \gamma_n \alpha_n 
-          = 1  +  \gamma_n / ((N-1) (1 - \gamma_n/(N-1)))
-          = 1  +  \gamma_n / (N - 1 - \gamma_n)
-  
-*/
-
-/*
-
-  SUMMARY:
-   let the input features (we can extend these with a 1 for the bias term) be
-   a matrix R, each row of which corresponds to a training example r_n
-
-   The dimension of R is N x D, where N is the minibatch size and D is the
-   dimension of the input to this layer of the network.
-
-   We'll be computing a matrix P, each row p_n of which will be the corresponding
-   row r_n of R, multiplied by a positive definite preconditioning matrix G_n.
-   [we can check that for each i, p_n^T r_n >= 0].
-   The following computation obtains P:
-
-   \lambda <-- (\alpha/N) \trace(R R^T).   # 0 < \alpha <= 1 is a global constant, e.g.
-                                           # \alpha = 0.1, but should try different
-                                           # values, this will be important (note: if the
-                                           # minibatch size is >= the dimension (N >= D),
-                                           # then we can let \alpha be quite small, e.g.
-                                           # 0.001.
-
-   if N >= D, then
-     # compute G by direct inversion.
-     G <-- (\lambda I  +  1/(N-1) R^T R)^{-1}
-     Q <-- R G.
-   else   # number of samples is less than dimension, use
-          # morrison-Woodbury formula, it's more efficient.
-      # We'd first compute:
-      # L <-- ((N-1) I + 1/\lambda R R^T)
-      # (note: L is something that appears in the morrison-Woodbury expansion of G)
-      # M <-- L^{-1}
-      # Note: G is  1/\lambda I  -  (1/\lambda^2) R^T M R
-      # We're doing Q <-- R G, which is:
-      # Q <-- 1/\lambda R - (1/\lambda^2) R (R^T M R)
-      # It's more efficient in this case to left-multiply R
-      # by something, i.e. bracket as:
-      # Q <-- 1/\lambda R - (1/\lambda^2) (R R^T M) R
-      # so let's write it as
-      # Q <-- S R, with
-      # S = 1/\lambda I - 1/\lambda^2 R R^T M
-      #   = 1/\lambda (I - 1/\lambda R R^T M)
-      # Now, -1/\lambda R R^T = (N-1) I - L, and L M = I, so
-      # S = 1/\lambda (I  + ((N-1) I - L) M)
-      #   = (N-1)/\lambda M
-      # and we can get rid of that scalar earlier on:
-      # if we let L' = \lambda/(N-1) L, so that
-      # L' = (lambda I + 1/(N-1) R R^T)
-      # then
-      # S = (\lambda I + 1/(N-1) R R^T)^{-1}. 
-
-      S <-- (\lambda I + 1/(N-1) R R^T)^{-1}.
-      Q <-- S R
-   fi
-
-   Here, we're right multiplying each row r_n of r by the symmetric matrix G, to get
-   the corresponding row q_n of q.  Note: in practice Q will be the same memory as P.
-   Next we work out for each n:
-     \gamma_n = r_n^T q_n     # This should be nonnegative!  Check this.
-      \beta_n = 1  +  \gamma_n / (N - 1 - \gamma_n)  # This should be positive; check this.
-  For each n, we'll do (for the corresponding rows of P and Q):
-     p_n <-- \beta_n q_n.
-  In practice, we'd do this computation in-place, with P and Q using the
-  same memory.
-
-  If we're being paranoid, we should verify that
-
-   p_n = (\lambda I  +  1/(N-1) \sum_{m != n} r_n r_n^T)^{-1} r_n.
-
-  This is exact mathematically, but there could be differences due to roundoff,
-  and if \alpha is quite small, these differences could be substantial.
-  
- */
-    
-
diff --git a/src/nnet2/nnet-precondition.h b/src/nnet2/nnet-precondition.h
deleted file mode 100644
index cf930cd855e..00000000000
--- a/src/nnet2/nnet-precondition.h
+++ /dev/null
@@ -1,88 +0,0 @@
-// nnet2/nnet-precondition.h
-
-// Copyright 2012   Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_NNET_PRECONDITION_H_
-#define KALDI_NNET2_NNET_PRECONDITION_H_
-
-#include "base/kaldi-common.h"
-#include "matrix/matrix-lib.h"
-#include "cudamatrix/cu-matrix-lib.h"
-
-#include <iostream>
-
-namespace kaldi {
-namespace nnet2 {
-
-
-/**
-  The function PreconditionDirections views the input R as
-  a set of directions or gradients, each row r_i being one of the
-  directions.  For each i it constructs a preconditioning matrix
-  G_i formed from the *other* i's, using the formula:
-
-  G_i = (\lambda I + (1/(N-1)) \sum_{j \neq i} r_j r_j^T)^{-1},
-
-  where N is the number of rows in R.  This can be seen as a kind
-  of estimated Fisher matrix that has been smoothed with the
-  identity to make it invertible.  We recommend that you set
-  \lambda using:
-    \lambda = \alpha/(N D) trace(R^T, R)
-  for some small \alpha such as \alpha = 0.1.  However, we leave
-  this to the caller because there are reasons relating to
-  unbiased-ness of the resulting stochastic gradient descent, why you
-  might want to set \lambda using "other" data, e.g. a previous
-  minibatch.
-
-  The output of this function is a matrix P, each row p_i of
-  which is related to r_i by:
-    p_i = G_i r_i
-  Here, p_i is preconditioned by an estimated Fisher matrix
-  in such a way that it's suitable to be used as an update direction.
-
- */
-void PreconditionDirections(const CuMatrixBase<BaseFloat> &R,
-                            double lambda,
-                            CuMatrixBase<BaseFloat> *P);
-
-/**
-   This wrapper for PreconditionDirections computes lambda
-   using \lambda = \alpha/(N D) trace(R^T, R), and calls
-   PreconditionDirections. */
-void PreconditionDirectionsAlpha(
-    const CuMatrixBase<BaseFloat> &R,
-    double alpha,
-    CuMatrixBase<BaseFloat> *P);
-
-/**
-   This wrapper for PreconditionDirections computes lambda
-   using \lambda = \alpha/(N D) trace(R^T, R), and calls
-   PreconditionDirections.  It then rescales *P so that
-   its 2-norm is the same as that of R. */
-void PreconditionDirectionsAlphaRescaled(
-    const CuMatrixBase<BaseFloat> &R,
-    double alpha,
-    CuMatrixBase<BaseFloat> *P);
-  
-                           
-
-} // namespace nnet2
-} // namespace kaldi
-
-
-#endif
diff --git a/src/nnet2/nnet-stats.cc b/src/nnet2/nnet-stats.cc
deleted file mode 100644
index 9293613ee2b..00000000000
--- a/src/nnet2/nnet-stats.cc
+++ /dev/null
@@ -1,122 +0,0 @@
-// nnet2/nnet-stats.cc
-
-// Copyright 2012   Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-stats.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-void NnetStats::StatsElement::PrintStats(std::ostream &os) {
-  BaseFloat c = (count == 0 ? 1 : count), // prevent division by zero.
-      deriv_mean = deriv_sum/c,
-      deriv_stddev = std::sqrt(deriv_sumsq/c - deriv_mean*deriv_mean),
-      abs_value_mean = abs_value_sum/c,
-      abs_value_stddev = std::sqrt(abs_value_sumsq/c -
-                                   abs_value_mean*abs_value_mean);
-
-  os << '[' << deriv_begin << ':' << deriv_end << "] count=" << count
-     << ", deriv mean,stddev=" << deriv_mean << ',' << deriv_stddev
-     << ", abs-avg-value mean,stddev=" << abs_value_mean << ','
-     << abs_value_stddev;
-}
-  
-void NnetStats::StatsElement::AddStats(BaseFloat avg_deriv, BaseFloat avg_value) {
-  count++;
-  deriv_sum += avg_deriv;
-  deriv_sumsq += avg_deriv * avg_deriv;
-  abs_value_sum += std::abs(avg_value);
-  abs_value_sumsq += avg_value * avg_value;
-}
-
-int32 NnetStats::BucketFor(BaseFloat avg_deriv) {
-  KALDI_ASSERT(avg_deriv >= 0.0);
-  KALDI_ASSERT(bucket_width_ > 0.0);
-  // cast ratio to int.  Since we do +0.5, this rounds down.
-  int32 index = static_cast<int32>(avg_deriv / bucket_width_ + 0.5);
-  while (index >= static_cast<int32>(buckets_.size()))
-    buckets_.push_back(StatsElement(buckets_.size() * bucket_width_,
-                                    (buckets_.size() + 1) * bucket_width_));
-  return index;
-}
-
-void NnetStats::AddStats(BaseFloat avg_deriv, BaseFloat avg_value) {
-  global_.AddStats(avg_deriv, avg_value);
-  buckets_[BucketFor(avg_deriv)].AddStats(avg_deriv, avg_value);
-}
-
-void NnetStats::AddStatsFromNnet(const Nnet &nnet) {
-  const AffineComponent *ac = dynamic_cast<const AffineComponent*>(
-      &(nnet.GetComponent(affine_component_index_)));
-  KALDI_ASSERT(ac != NULL); // would be an error in calling code.
-  const NonlinearComponent *nc = dynamic_cast<const NonlinearComponent*>(
-      &(nnet.GetComponent(affine_component_index_ + 1)));
-  KALDI_ASSERT(nc != NULL); // would be an error in calling code.
-
-  double count = nc->Count();
-  if (count == 0) {
-    KALDI_WARN << "No stats stored with nonlinear component";
-    return;
-  }
-  const CuVector<double> &value_sum = nc->ValueSum();
-  const CuVector<double> &deriv_sum = nc->DerivSum();
-  if (value_sum.Dim() != deriv_sum.Dim())
-    KALDI_ERR << "Error computing nnet stats: probably you are "
-              << "trying to compute stats for a sigmoid layer.";
-  for (int32 i = 0; i < value_sum.Dim(); i++) {
-    BaseFloat avg_value = value_sum(i) / count,
-        avg_deriv = deriv_sum(i) / count;
-    AddStats(avg_deriv, avg_value);
-  }
-}
-
-void NnetStats::PrintStats(std::ostream &os) {
-  os << "Stats for buckets:" << std::endl;
-  for (size_t i = 0; i < buckets_.size(); i++) {
-    buckets_[i].PrintStats(os);
-    os << std::endl;
-  }
-  os << "Global stats: ";
-  global_.PrintStats(os);
-  os << std::endl;
-}
-
-void GetNnetStats(const NnetStatsConfig &config,
-                  const Nnet &nnet,
-                  std::vector<NnetStats> *stats) {
-  KALDI_ASSERT(stats->size() == 0);
-  for (int32 c = 0; c + 1 < nnet.NumComponents(); c++) {
-    const AffineComponent *ac = dynamic_cast<const AffineComponent*>(
-        &(nnet.GetComponent(c)));
-    if (ac == NULL) continue;
-    const NonlinearComponent *nc = dynamic_cast<const NonlinearComponent*>(
-        &(nnet.GetComponent(c + 1)));
-    if (nc == NULL) continue;
-    // exclude softmax.
-    const SoftmaxComponent *sc = dynamic_cast<const SoftmaxComponent*>(
-        &(nnet.GetComponent(c + 1)));
-    if (sc != NULL) continue;
-    stats->push_back(NnetStats(c, config.bucket_width));
-    stats->back().AddStatsFromNnet(nnet);
-  }
-}
-
-
-
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/nnet-stats.h b/src/nnet2/nnet-stats.h
deleted file mode 100644
index 3a4d6db2e99..00000000000
--- a/src/nnet2/nnet-stats.h
+++ /dev/null
@@ -1,97 +0,0 @@
-// nnet2/nnet-stats.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_NNET_STATS_H_
-#define KALDI_NNET2_NNET_STATS_H_
-
-#include "nnet2/nnet-nnet.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-/* This program computes various statistics from a neural net.  These are
-   summaries of certain quantities already present in the network as
-   stored on disk, especially regarding certain average values and
-   derivatives of the sigmoids.   
-*/
-
-struct NnetStatsConfig {  
-  BaseFloat bucket_width;
-  NnetStatsConfig(): bucket_width(0.025) { }
-  
-  void Register(OptionsItf *opts) {
-    opts->Register("bucket-width", &bucket_width, "Width of bucket in average-derivative "
-                   "stats for analysis.");
-  }
-};
-
-class NnetStats {
- public:
-  NnetStats(int32 affine_component_index, BaseFloat bucket_width):
-      affine_component_index_(affine_component_index),
-      bucket_width_(bucket_width), global_(0, -1) { }
-  
-  // Use default copy constructor and assignment operator.
-  
-  void AddStats(BaseFloat avg_deriv, BaseFloat avg_value);
-
-  void AddStatsFromNnet(const Nnet &nnet);
-  
-  void PrintStats(std::ostream &os);  
- private:
-
-  struct StatsElement {
-    BaseFloat deriv_begin; // avg-deriv, beginning of bucket.
-    BaseFloat deriv_end;   // avg-deriv, end of bucket.
-    BaseFloat deriv_sum;   // sum of avg-deriv within bucket.
-    BaseFloat deriv_sumsq;   // Sum-squared of avg-deriv within bucket.
-    BaseFloat abs_value_sum; // Sum of abs(avg-value).  Tells us whether it's
-    // saturating at one or both ends.
-    BaseFloat abs_value_sumsq; // Sum-squared of abs(avg-value).
-    int32 count;      // Number of nonlinearities in this bucket.
-
-    StatsElement(BaseFloat deriv_begin,
-                 BaseFloat deriv_end):
-        deriv_begin(deriv_begin), deriv_end(deriv_end), deriv_sum(0.0),
-        deriv_sumsq(0.0), abs_value_sum(0.0), abs_value_sumsq(0.0), count(0) { }
-    void AddStats(BaseFloat avg_deriv, BaseFloat avg_value);
-    // Outputs stats for this bucket; no newline
-    void PrintStats(std::ostream &os); 
-  };
-  int32 BucketFor(BaseFloat avg_deriv); // returns the bucket
-  // for this avg-derivative value, and makes sure it is allocated.
-  
-  int32 affine_component_index_; // Component index of the affine component
-                                // associated with this nonlinearity.
-  BaseFloat bucket_width_; // width of buckets of stats we store (in derivative values).
-  
-  std::vector<StatsElement> buckets_; // Stats divided into buckets by avg_deriv.
-  StatsElement global_; // All the stats.
-  
-};
-
-void GetNnetStats(const NnetStatsConfig &config,
-                  const Nnet &nnet,
-                  std::vector<NnetStats> *stats);
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif // KALDI_NNET2_NNET_STATS_H_
diff --git a/src/nnet2/nnet-update-parallel.cc b/src/nnet2/nnet-update-parallel.cc
deleted file mode 100644
index b23ea500efc..00000000000
--- a/src/nnet2/nnet-update-parallel.cc
+++ /dev/null
@@ -1,271 +0,0 @@
-// nnet2/nnet-update-parallel.cc
-
-// Copyright 2012   Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <numeric>
-#include "nnet2/nnet-update-parallel.h"
-#include "nnet2/nnet-update.h"
-#include "util/kaldi-thread.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-class DoBackpropParallelClass: public MultiThreadable {
- public:
-  // This constructor is only called for a temporary object
-  // that we pass to the RunMultiThreaded function.
-  DoBackpropParallelClass(const Nnet &nnet,
-                          ExamplesRepository *repository,
-                          double *tot_weight_ptr,
-                          double *log_prob_ptr,
-                          Nnet *nnet_to_update,
-                          bool store_separate_gradients):
-      nnet_(nnet), repository_(repository),
-      nnet_to_update_(nnet_to_update),
-      nnet_to_update_orig_(nnet_to_update),
-      store_separate_gradients_(store_separate_gradients),
-      tot_weight_ptr_(tot_weight_ptr),
-      log_prob_ptr_(log_prob_ptr),
-      tot_weight_(0.0),
-      log_prob_(0.0) { }
-
-  // The following constructor is called multiple times within
-  // the RunMultiThreaded template function.
-  DoBackpropParallelClass(const DoBackpropParallelClass &other):
-      MultiThreadable(other),
-      nnet_(other.nnet_),
-      repository_(other.repository_),
-      nnet_to_update_(other.nnet_to_update_),
-      nnet_to_update_orig_(other.nnet_to_update_orig_),
-      store_separate_gradients_(other.store_separate_gradients_),
-      tot_weight_ptr_(other.tot_weight_ptr_),
-      log_prob_ptr_(other.log_prob_ptr_),
-      tot_weight_(0),
-      log_prob_(0.0) {
-    if (store_separate_gradients_) {
-      // To ensure correctness, we work on separate copies of the gradient
-      // object, which we'll sum at the end.  This is used for exact gradient
-      // computation.
-      if (other.nnet_to_update_ != NULL) {
-        nnet_to_update_ = new Nnet(*(other.nnet_to_update_));
-        // our "nnet_to_update_" variable is a copy of the neural network
-        // we are to update (presumably a gradient).  If we don't set these
-        // to zero we would end up adding multiple copies of the any initial
-        // gradient that "nnet_to_update_" contained when we initialize
-        // the first instance of the class.
-        nnet_to_update_->SetZero(true);
-      } else { // support case where we don't really need a gradient.
-        nnet_to_update_ = NULL;
-      }
-    }
-  }
-  // This does the main function of the class.
-  void operator () () {
-    std::vector<NnetExample> examples;
-    while (repository_->ProvideExamples(&examples)) {
-      // This is a function call to a function defined in
-      // nnet-update.h
-      double tot_loglike;
-      if (nnet_to_update_ != NULL)
-        tot_loglike = DoBackprop(nnet_, examples, nnet_to_update_);
-      else
-        tot_loglike = ComputeNnetObjf(nnet_, examples);
-      tot_weight_ += TotalNnetTrainingWeight(examples);
-      log_prob_ += tot_loglike;
-      KALDI_VLOG(4) << "Thread " << thread_id_ << " saw "
-                    << tot_weight_ << " frames so far (weighted); likelihood "
-                    << "per frame so far is " << (log_prob_ / tot_weight_);
-      examples.clear();
-    }
-  }
-
-  ~DoBackpropParallelClass() {
-    if (nnet_to_update_orig_ != nnet_to_update_) {
-      // This branch is only taken if this instance of the class is
-      // one of the multiple instances allocated inside the RunMultiThreaded
-      // template function, *and* store_separate_gradients_ has been set to true.
-      // In the typical hogwild case, we don't do this.
-      nnet_to_update_orig_->AddNnet(1.0, *nnet_to_update_);
-      delete nnet_to_update_;
-    }
-    *log_prob_ptr_ += log_prob_;
-    *tot_weight_ptr_ += tot_weight_;
-  }
- private:
-  const Nnet &nnet_;
-  ExamplesRepository *repository_;
-  Nnet *nnet_to_update_;
-  Nnet *nnet_to_update_orig_;
-  bool store_separate_gradients_;
-  double *tot_weight_ptr_;
-  double *log_prob_ptr_;
-  double tot_weight_;
-  double log_prob_; // log-like times num frames.
-};
-
-
-#if HAVE_CUDA == 1
-double DoBackpropSingleThreaded(const Nnet &nnet,
-                                int32 minibatch_size,
-                                SequentialNnetExampleReader *examples_reader,
-                                double *tot_weight_out,
-                                Nnet *nnet_to_update) {
-  double ans = 0.0, tot_weight = 0.0;
-  KALDI_ASSERT(minibatch_size > 0);
-  while (!examples_reader->Done()) {
-    std::vector<NnetExample> egs;
-    egs.reserve(minibatch_size);
-    while (egs.size() < minibatch_size && examples_reader->Done()) {
-      egs.push_back(examples_reader->Value());
-      examples_reader->Next();
-    }
-    ans += DoBackprop(nnet, egs, nnet_to_update);
-    tot_weight += TotalNnetTrainingWeight(egs);
-  }
-  *tot_weight_out = tot_weight;
-  return ans;
-}
-#endif
-
-
-double DoBackpropParallel(const Nnet &nnet,
-                          int32 minibatch_size,
-                          SequentialNnetExampleReader *examples_reader,
-                          double *tot_weight,
-                          Nnet *nnet_to_update) {
-#if HAVE_CUDA == 1
-  // Our GPU code won't work with multithreading; we do this
-  // to enable it to work with this code in the single-threaded
-  // case.
-  if (CuDevice::Instantiate().Enabled())
-    return DoBackpropSingleThreaded(nnet, minibatch_size, examples_reader,
-                                    tot_weight, nnet_to_update);
-#endif
-
-  ExamplesRepository repository; // handles parallel programming issues regarding
-  // the "examples" of data.
-  double tot_log_prob = 0.0;
-  *tot_weight = 0.0;
-
-  // This function assumes you want the exact gradient, if
-  // nnet_to_update != &nnet.
-  const bool store_separate_gradients = (nnet_to_update != &nnet);
-
-  DoBackpropParallelClass c(nnet, &repository, tot_weight,
-                            &tot_log_prob, nnet_to_update,
-                            store_separate_gradients);
-
-  {
-    // The initialization of the following class spawns the threads that
-    // process the examples.  They get re-joined in its destructor.
-    MultiThreader<DoBackpropParallelClass> m(g_num_threads, c);
-
-    std::vector<NnetExample> examples;
-    for (; !examples_reader->Done(); examples_reader->Next()) {
-      examples.push_back(examples_reader->Value());
-      if (examples.size() == minibatch_size)
-        repository.AcceptExamples(&examples);
-    }
-    if (!examples.empty()) // partial minibatch.
-      repository.AcceptExamples(&examples);
-    // Here, the destructor of "m" re-joins the threads, and
-    // does the summing of the gradients if we're doing gradient
-    // computation (i.e. &nnet != nnet_to_update).  This gets
-    // done in the destructors of the objects of type
-    // DoBackpropParallelClass.
-    repository.ExamplesDone();
-  }
-  KALDI_LOG << "Did backprop on " << *tot_weight << " examples, average log-prob "
-            << "per frame is " << (tot_log_prob / *tot_weight);
-  KALDI_LOG << "[this line is to be parsed by a script:] log-prob-per-frame="
-            << (tot_log_prob / *tot_weight);
-  return tot_log_prob;
-}
-
-
-double DoBackpropSingleThreaded(const Nnet &nnet,
-                                int32 minibatch_size,
-                                const std::vector<NnetExample> &egs,
-                                double *tot_weight,
-                                Nnet *nnet_to_update) {
-  double ans = 0.0;
-  *tot_weight = TotalNnetTrainingWeight(egs);
-  for (size_t i = 0; i < egs.size(); i += minibatch_size) {
-    std::vector<NnetExample>::const_iterator end_iter =
-      (i + minibatch_size > egs.size() ? egs.end() :
-       egs.begin() + i + minibatch_size);
-    std::vector<NnetExample> this_egs(egs.begin() + i,
-                                              end_iter);
-    ans += DoBackprop(nnet, this_egs, nnet_to_update);
-  }
-  return ans;
-}
-
-
-double DoBackpropParallel(const Nnet &nnet,
-                          int32 minibatch_size,
-                          int32 num_threads,
-                          const std::vector<NnetExample> &egs,
-                          double *tot_weight,
-                          Nnet *nnet_to_update) {
-  if (num_threads == 1) // support GPUs: special case for 1 thread.
-    return DoBackpropSingleThreaded(nnet, minibatch_size, egs,
-                                    tot_weight, nnet_to_update);
-
-  ExamplesRepository repository; // handles parallel programming issues regarding
-  // the "examples" of data.
-  double tot_log_prob = 0.0;
-  *tot_weight = 0;
-  const bool store_separate_gradients = (nnet_to_update != &nnet);
-
-  DoBackpropParallelClass c(nnet, &repository, tot_weight,
-                            &tot_log_prob, nnet_to_update,
-                            store_separate_gradients);
-
-  {
-    // The initialization of the following class spawns the threads that
-    // process the examples.  They get re-joined in its destructor.
-    MultiThreader<DoBackpropParallelClass> m(num_threads, c);
-
-    int32 num_egs = egs.size();
-    for (int32 offset = 0; offset < num_egs; offset += minibatch_size) {
-      int32 this_minibatch_size = std::min(minibatch_size, num_egs - offset);
-
-      // We waste a little time copying the examples here, but it's very minor.
-      std::vector<NnetExample> examples(egs.begin() + offset,
-                                                egs.begin() + offset + this_minibatch_size);
-
-      repository.AcceptExamples(&examples);
-    }
-
-    // Here, the destructor of "m" re-joins the threads, and
-    // does the summing of the gradients if we're doing gradient
-    // computation (i.e. &nnet != nnet_to_update).  This gets
-    // done in the destructors of the objects of type
-    // DoBackpropParallelClass.
-    repository.ExamplesDone();
-  }
-  KALDI_VLOG(2) << "Did backprop on " << *tot_weight << " examples, average log-prob "
-                << "per frame is " << (tot_log_prob / *tot_weight);
-  return tot_log_prob;
-}
-
-
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/nnet-update-parallel.h b/src/nnet2/nnet-update-parallel.h
deleted file mode 100644
index b1478877660..00000000000
--- a/src/nnet2/nnet-update-parallel.h
+++ /dev/null
@@ -1,88 +0,0 @@
-// nnet2/nnet-update-parallel.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_NNET_UPDATE_PARALLEL_H_
-#define KALDI_NNET2_NNET_UPDATE_PARALLEL_H_
-
-#include "nnet2/nnet-nnet.h"
-#include "util/table-types.h"
-#include "util/kaldi-semaphore.h"
-#include "util/kaldi-thread.h"
-#include "itf/options-itf.h"
-#include "nnet2/nnet-update.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-/// This function is similar to "DoBackprop" in nnet-update.h
-/// This function computes the objective function and either updates the model
-/// or computes parameter gradients.  It returns the cross-entropy objective
-/// function summed over all samples, weighted, and the total weight of
-/// the samples (typically the same as the #frames) into total_weight.
-/// It is mostly a wrapper for
-/// a class NnetUpdater that's defined in nnet-update.cc, but we
-/// don't want to expose that complexity at this level.
-/// Note: this function 
-/// If &nnet == nnet_to_update, it assumes we're doing SGD and does
-/// something like Hogwild; otherwise it assumes we're computing a
-/// gradient and it sums up the gradients.
-/// The return value is the total log-prob summed over the #frames. It also
-/// outputs the #frames into "num_frames".
-double DoBackpropParallel(const Nnet &nnet,
-                          int32 minibatch_size,
-                          SequentialNnetExampleReader *example_reader,
-                          double *tot_weight,
-                          Nnet *nnet_to_update);
-
-
-/// This version of DoBackpropParallel takes a vector of examples, and will
-/// typically be used to compute the exact gradient. 
-double DoBackpropParallel(const Nnet &nnet,
-                          int32 minibatch_size,
-                          int32 num_threads,
-                          const std::vector<NnetExample> &examples,
-                          double *num_frames,
-                          Nnet *nnet_to_update);
-
-
-
-/// This is basically to clarify the fact that DoBackpropParallel will
-/// also work with nnet_to_update == NULL, and will compute the objf.
-/// Both versions of the function will support it, but this
-/// version (that takes a vector) is currently the only one we need
-/// to do this with.
-inline double ComputeNnetObjfParallel(
-    const Nnet &nnet,
-    int32 minibatch_size,
-    int32 num_threads,
-    const std::vector<NnetExample> &examples,
-    double *num_frames) {
-  return DoBackpropParallel(nnet, minibatch_size, num_threads,
-                            examples, num_frames, NULL);
-}
-
-
-
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif // KALDI_NNET2_NNET_UPDATE_PARALLEL_H_
diff --git a/src/nnet2/nnet-update.cc b/src/nnet2/nnet-update.cc
deleted file mode 100644
index c2c628ebaac..00000000000
--- a/src/nnet2/nnet-update.cc
+++ /dev/null
@@ -1,361 +0,0 @@
-// nnet2/nnet-update.cc
-
-// Copyright 2012   Johns Hopkins University (author: Daniel Povey)
-//           2014   Xiaohui Zhang
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-update.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-
-NnetUpdater::NnetUpdater(const Nnet &nnet,
-                         Nnet *nnet_to_update):
-    nnet_(nnet), nnet_to_update_(nnet_to_update) {
-}
- 
-
-
-void NnetUpdater::FormatInput(const std::vector<NnetExample> &data) {
-
-  forward_data_.resize(nnet_.NumComponents() + 1);
-  Matrix<BaseFloat> input;
-  FormatNnetInput(nnet_, data, &input);
-  forward_data_[0].Resize(0, 0);  // avoids the next command ever copying GPU->CPU
-  forward_data_[0].Swap(&input); // Copy to GPU, if being used.
-  nnet_.ComputeChunkInfo(1 + nnet_.LeftContext() + nnet_.RightContext(),
-                         data.size(), &chunk_info_out_);
-}
-
-double NnetUpdater::ComputeForMinibatch(
-    const std::vector<NnetExample> &data,
-    double *tot_accuracy) {
-
-  FormatInput(data);
-  Propagate();
-  CuMatrix<BaseFloat> tmp_deriv;
-  double ans = ComputeObjfAndDeriv(data, &tmp_deriv, tot_accuracy);
-  if (nnet_to_update_ != NULL)
-    Backprop(&tmp_deriv); // this is summed (after weighting), not
-                          // averaged.
-  return ans;
-}
-
-
-// form of ComputeForMinibatch for when the input data has
-// already been formatted as a single matrix.
-double NnetUpdater::ComputeForMinibatch(const std::vector<NnetExample> &data,
-                                        Matrix<BaseFloat> *formatted_data,
-                                        double *tot_accuracy) {
-  { // accept the formatted input.  This replaces the call to FormatInput().
-    int32 num_chunks = data.size();
-    KALDI_ASSERT(formatted_data->NumRows() ==
-                 num_chunks * (1 + nnet_.LeftContext() + nnet_.RightContext()) &&
-                 formatted_data->NumCols() == nnet_.InputDim());
-
-    forward_data_.resize(nnet_.NumComponents() + 1);
-    // the next command avoids the Swap() command ever copying GPU->CPU in case
-    // an instance of this class is used more than once (which it isn't in
-    // practice).
-    forward_data_[0].Resize(0, 0);  
-    forward_data_[0].Swap(formatted_data); // Copy to GPU, if being used.
-    nnet_.ComputeChunkInfo(1 + nnet_.LeftContext() + nnet_.RightContext(),
-                           data.size(), &chunk_info_out_);
-  }
-  Propagate();
-  CuMatrix<BaseFloat> tmp_deriv;
-  double ans = ComputeObjfAndDeriv(data, &tmp_deriv, tot_accuracy);
-  if (nnet_to_update_ != NULL)
-    Backprop(&tmp_deriv); // this is summed (after weighting), not
-                          // averaged.
-  return ans;
-}
-
-
-void NnetUpdater::GetOutput(CuMatrix<BaseFloat> *output) {
-  int32 num_components = nnet_.NumComponents(); 
-  KALDI_ASSERT(forward_data_.size() == nnet_.NumComponents() + 1); 
-  *output = forward_data_[num_components];
-}
-
-void NnetUpdater::Propagate() {
-  static int32 num_times_printed = 0;
-        
-  int32 num_components = nnet_.NumComponents();
-  for (int32 c = 0; c < num_components; c++) {
-    const Component &component = nnet_.GetComponent(c);
-    const CuMatrix<BaseFloat> &input = forward_data_[c];
-    CuMatrix<BaseFloat> &output = forward_data_[c+1];
-    // Note: the Propagate function will automatically resize the
-    // output.
-    component.Propagate(chunk_info_out_[c], chunk_info_out_[c+1], input, &output);
-    // If we won't need the output of the previous layer for
-    // backprop, delete it to save memory.
-    bool need_last_output =
-        (c>0 && nnet_.GetComponent(c-1).BackpropNeedsOutput()) ||
-        component.BackpropNeedsInput();
-    if (g_kaldi_verbose_level >= 3 && num_times_printed < 100) {
-      KALDI_VLOG(3) << "Stddev of data for component " << c
-                    << " for this minibatch is "
-                    << (TraceMatMat(forward_data_[c], forward_data_[c], kTrans) /
-                        (forward_data_[c].NumRows() * forward_data_[c].NumCols()));
-      num_times_printed++;
-    }
-    if (!need_last_output)
-      forward_data_[c].Resize(0, 0); // We won't need this data.
-  }
-}
-
-double NnetUpdater::ComputeObjfAndDeriv(
-    const std::vector<NnetExample> &data,
-    CuMatrix<BaseFloat> *deriv,
-    double *tot_accuracy) const {
-  BaseFloat tot_objf = 0.0, tot_weight = 0.0;
-  int32 num_components = nnet_.NumComponents();
-  int32 num_chunks = data.size();
-  deriv->Resize(num_chunks, nnet_.OutputDim()); // sets to zero.
-  const CuMatrix<BaseFloat> &output(forward_data_[num_components]);
-  KALDI_ASSERT(SameDim(output, *deriv));
-
-  std::vector<MatrixElement<BaseFloat> > sv_labels;
-  sv_labels.reserve(num_chunks); // We must have at least this many labels.
-  for (int32 m = 0; m < num_chunks; m++) {
-    KALDI_ASSERT(data[m].labels.size() == 1 &&
-                 "Training code currently does not support multi-frame egs");
-    const std::vector<std::pair<int32,BaseFloat> > &labels = data[m].labels[0];
-    for (size_t i = 0; i < labels.size(); i++) {
-      KALDI_ASSERT(labels[i].first < nnet_.OutputDim() &&
-                        "Possibly egs come from alignments from mismatching model");
-      MatrixElement<BaseFloat> elem = {m, labels[i].first, labels[i].second};
-      sv_labels.push_back(elem);
-    }
-  }
-
-  if (tot_accuracy != NULL)
-    *tot_accuracy = ComputeTotAccuracy(data);
-  
-  deriv->CompObjfAndDeriv(sv_labels, output, &tot_objf, &tot_weight);
-  
-  KALDI_VLOG(4) << "Objective function is " << (tot_objf/tot_weight) << " over "
-                << tot_weight << " samples (weighted).";
-  return tot_objf;
-}
-
-
-double NnetUpdater::ComputeTotAccuracy(
-    const std::vector<NnetExample> &data) const {
-  BaseFloat tot_accuracy = 0.0;
-  int32 num_components = nnet_.NumComponents();
-  const CuMatrix<BaseFloat> &output(forward_data_[num_components]);
-  KALDI_ASSERT(output.NumRows() == static_cast<int32>(data.size()));
-  CuArray<int32> best_pdf(output.NumRows());
-  std::vector<int32> best_pdf_cpu;
-  
-  output.FindRowMaxId(&best_pdf);
-  best_pdf.CopyToVec(&best_pdf_cpu);
-
-  for (int32 i = 0; i < output.NumRows(); i++) {
-    KALDI_ASSERT(data[i].labels.size() == 1 &&
-                 "Training code currently does not support multi-frame egs");
-    const std::vector<std::pair<int32,BaseFloat> > &labels = data[i].labels[0];
-    for (size_t j = 0; j < labels.size(); j++) {
-      int32 ref_pdf_id = labels[j].first,
-          hyp_pdf_id = best_pdf_cpu[i];
-      BaseFloat weight = labels[j].second;
-      tot_accuracy += weight * (hyp_pdf_id == ref_pdf_id ? 1.0 : 0.0);
-    }
-  }
-  return tot_accuracy;
-}
-
-
-void NnetUpdater::Backprop(CuMatrix<BaseFloat> *deriv) const {
-  // We assume ComputeObjfAndDeriv has already been called.
-  for (int32 c = nnet_.NumComponents() - 1;
-       c >= nnet_.FirstUpdatableComponent(); c--) {
-    const Component &component = nnet_.GetComponent(c);
-    Component *component_to_update = (nnet_to_update_ == NULL ? NULL :
-                                      &(nnet_to_update_->GetComponent(c)));
-    const CuMatrix<BaseFloat> &input = forward_data_[c],
-        &output = forward_data_[c+1];
-    CuMatrix<BaseFloat> input_deriv(input.NumRows(), input.NumCols());
-    const CuMatrix<BaseFloat> &output_deriv(*deriv);
-    component.Backprop(chunk_info_out_[c], chunk_info_out_[c+1], input, output,                       
-                       output_deriv, component_to_update,
-                       &input_deriv);
-    input_deriv.Swap(deriv);
-  }
-}
-
-
-void FormatNnetInput(const Nnet &nnet,
-                     const std::vector<NnetExample> &data,
-                     Matrix<BaseFloat> *input_mat) {
-  KALDI_ASSERT(data.size() > 0);
-  int32 num_splice = 1 + nnet.RightContext() + nnet.LeftContext();
-  KALDI_ASSERT(data[0].input_frames.NumRows() >= num_splice);
-  
-  int32 feat_dim = data[0].input_frames.NumCols(),
-         spk_dim = data[0].spk_info.Dim(),
-         tot_dim = feat_dim + spk_dim; // we append these at the neural net
-                                       // input... note, spk_dim might be 0.
-  KALDI_ASSERT(tot_dim == nnet.InputDim());
-  KALDI_ASSERT(data[0].left_context >= nnet.LeftContext());
-  int32 ignore_frames = data[0].left_context - nnet.LeftContext(); // If
-  // the NnetExample has more left-context than we need, ignore some.
-  // this may happen in settings where we increase the amount of context during
-  // training, e.g. by adding layers that require more context.  
-
-  int32 num_chunks = data.size();
-  
-  input_mat->Resize(num_splice * num_chunks,
-                    tot_dim, kUndefined);
-  
-  for (int32 chunk = 0; chunk < num_chunks; chunk++) {
-    SubMatrix<BaseFloat> dest(*input_mat,
-                              chunk * num_splice, num_splice,
-                              0, feat_dim);
-
-    Matrix<BaseFloat> full_src(data[chunk].input_frames);
-    SubMatrix<BaseFloat> src(full_src, ignore_frames, num_splice, 0, feat_dim);
-                             
-    dest.CopyFromMat(src);
-    if (spk_dim != 0) {
-      SubMatrix<BaseFloat> spk_dest(*input_mat,
-                                    chunk * num_splice, num_splice,
-                                    feat_dim, spk_dim);
-      spk_dest.CopyRowsFromVec(data[chunk].spk_info);
-    }
-  }
-}
-
-BaseFloat TotalNnetTrainingWeight(const std::vector<NnetExample> &egs) {
-  double ans = 0.0;
-  for (size_t i = 0; i < egs.size(); i++)
-    for (size_t j = 0; j < egs[i].labels.size(); j++) // for each labeled frame
-      for (size_t k = 0; k < egs[i].labels[j].size(); k++)
-        ans += egs[i].labels[j][k].second;
-  return ans;
-}
-
-
-double ComputeNnetObjf(const Nnet &nnet,
-                       const std::vector<NnetExample> &examples,
-                       double *tot_accuracy) {
-  NnetUpdater updater(nnet, NULL);
-  return updater.ComputeForMinibatch(examples, tot_accuracy);
-}
-
-double DoBackprop(const Nnet &nnet,
-                  const std::vector<NnetExample> &examples,
-                  Nnet *nnet_to_update,
-                  double *tot_accuracy) {
-  if (nnet_to_update == NULL)
-    return ComputeNnetObjf(nnet, examples, tot_accuracy);
-  try {
-    NnetUpdater updater(nnet, nnet_to_update);
-    return updater.ComputeForMinibatch(examples, tot_accuracy);
-  } catch (...) {
-    KALDI_LOG << "Error doing backprop, nnet info is: " << nnet.Info();
-    throw;
-  }
-}
-
-// version of DoBackprop that takes already-formatted examples.
-double DoBackprop(const Nnet &nnet,
-                  const std::vector<NnetExample> &examples,
-                  Matrix<BaseFloat> *examples_formatted,
-                  Nnet *nnet_to_update,
-                  double *tot_accuracy) {
-  if (nnet_to_update == NULL) {
-    KALDI_WARN << "Was not expecting to reach this code path "
-               << "(wastefully formatting data twice)";
-    return ComputeNnetObjf(nnet, examples, tot_accuracy);
- } try {
-    NnetUpdater updater(nnet, nnet_to_update);
-    return updater.ComputeForMinibatch(examples,
-                                       examples_formatted,
-                                       tot_accuracy);
-  } catch (...) {
-    KALDI_LOG << "Error doing backprop, nnet info is: " << nnet.Info();
-    throw;
-  }
-}
-
-
-double ComputeNnetGradient(
-    const Nnet &nnet,
-    const std::vector<NnetExample> &validation_set,
-    int32 batch_size,
-    Nnet *gradient) {
-  bool treat_as_gradient = true;
-  gradient->SetZero(treat_as_gradient);
-  std::vector<NnetExample> batch;
-  batch.reserve(batch_size);
-  double tot_objf = 0.0;
-  for (int32 start_pos = 0;
-       start_pos < static_cast<int32>(validation_set.size());
-       start_pos += batch_size) {
-    batch.clear();
-    for (int32 i = start_pos;
-         i < std::min(start_pos + batch_size,
-                      static_cast<int32>(validation_set.size()));
-         i++) {
-      batch.push_back(validation_set[i]);
-    }
-    tot_objf += DoBackprop(nnet,
-                           batch,
-                           gradient);
-  }
-  return tot_objf / validation_set.size();
-}
-
-double ComputeNnetObjf(
-    const Nnet &nnet,
-    const std::vector<NnetExample> &validation_set,
-    int32 batch_size,
-    double *tot_accuracy) {
-  double tot_accuracy_tmp;
-  if (tot_accuracy)
-    *tot_accuracy = 0.0;
-  std::vector<NnetExample> batch;
-  batch.reserve(batch_size);
-  double tot_objf = 0.0;
-  for (int32 start_pos = 0;
-       start_pos < static_cast<int32>(validation_set.size());
-       start_pos += batch_size) {
-    batch.clear();
-    for (int32 i = start_pos;
-         i < std::min(start_pos + batch_size,
-                      static_cast<int32>(validation_set.size()));
-         i++) {
-      batch.push_back(validation_set[i]);
-    }
-    tot_objf += ComputeNnetObjf(nnet, batch,
-                                tot_accuracy != NULL ? &tot_accuracy_tmp : NULL);
-    if (tot_accuracy)
-      *tot_accuracy += tot_accuracy_tmp;
-  }
-  return tot_objf;
-}
-
-  
-  
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/nnet-update.h b/src/nnet2/nnet-update.h
deleted file mode 100644
index c21ca31c61e..00000000000
--- a/src/nnet2/nnet-update.h
+++ /dev/null
@@ -1,191 +0,0 @@
-// nnet2/nnet-update.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-//           2014  Xiaohui Zhang
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_NNET_UPDATE_H_
-#define KALDI_NNET2_NNET_UPDATE_H_
-
-#include "nnet2/nnet-nnet.h"
-#include "nnet2/nnet-example.h"
-#include "util/table-types.h"
-
-
-namespace kaldi {
-namespace nnet2 {
-
-/** @file
-   This header provides functionality for sample-by-sample stochastic
-   gradient descent and gradient computation with a neural net.
-   See also \ref nnet-compute.h which is the same thing but for
-   whole utterances.
-*/
-
-class NnetEnsembleTrainer;
-
-// This class NnetUpdater contains functions for updating the neural net or
-// computing its gradient, given a set of NnetExamples. We
-// define it in the header file becaused it's needed by the ensemble training.
-// But in normal cases its functionality should be used by calling DoBackprop(),
-// and by ComputeNnetObjf()
-class NnetUpdater {
- public:
-  // Note: in the case of training with SGD, "nnet" and "nnet_to_update" will
-  // be identical.  They'll be different if we're accumulating the gradient
-  // for a held-out set and don't want to update the model.  Note: nnet_to_update
-  // may be NULL if you don't want do do backprop.
-  NnetUpdater(const Nnet &nnet,
-              Nnet *nnet_to_update);
-  
-  /// Does the entire forward and backward computation for this minbatch.
-  /// Returns total objective function over this minibatch.  If tot_accuracy != NULL,
-  /// outputs to that pointer the total accuracy.
-  double ComputeForMinibatch(const std::vector<NnetExample> &data,
-                             double *tot_accuracy);
-
-  /// This version of ComputeForMinibatch is used when you have already called
-  /// the function FormatNnetInput (defined below) to format your data as a
-  /// single matrix.  This interface is provided because it can be more
-  /// efficient to do this non-trivial CPU-based computation in a separate
-  /// thread.  formatted_data is an input but this function will destroy it,
-  /// which is why it's a pointer.
-  double ComputeForMinibatch(const std::vector<NnetExample> &data,
-                             Matrix<BaseFloat> *formatted_data,
-                             double *tot_accuracy);
-  
-  void GetOutput(CuMatrix<BaseFloat> *output);
- protected:
-
-  void Propagate();
-
-  /// Formats the input as a single matrix and sets the size of forward_data_,
-  /// and sets up chunk_info_out_.
-  void FormatInput(const std::vector<NnetExample> &data);
-
-  /// Computes objective function and derivative at output layer, but does not
-  /// do the backprop [for that, see Backprop()].  Returns objf summed over all
-  /// samples (with their weights).
-  /// If tot_accuracy != NULL, it will output to tot_accuracy the sum over all labels
-  /// of all examples, of (correctly classified ? 0 : 1) * weight-of-label.  This
-  /// involves extra computation.
-  double ComputeObjfAndDeriv(const std::vector<NnetExample> &data,
-                             CuMatrix<BaseFloat> *deriv,
-                             double *tot_accuracy = NULL) const;
-  
-
-  /// Backprop must be called after ComputeObjfAndDeriv.  Does the
-  /// backpropagation; "nnet_to_update_" is updated.  Note: "deriv" will
-  /// contain, at input, the derivative w.r.t. the output layer (as computed by
-  /// ComputeObjfAndDeriv), but will be used as a temporary variable by this
-  /// function.
-  void Backprop(CuMatrix<BaseFloat> *deriv) const;
-
-  friend class NnetEnsembleTrainer;
- private:
-  // Must be called after Propagate().
-  double ComputeTotAccuracy(const std::vector<NnetExample> &data) const;
-
-  const Nnet &nnet_;
-  Nnet *nnet_to_update_;
-  int32 num_chunks_; // same as the minibatch size.
-  std::vector<ChunkInfo> chunk_info_out_; 
-  
-  std::vector<CuMatrix<BaseFloat> > forward_data_; // The forward data
-  // for the outputs of each of the components.
-
-};
-
-
-/// Takes the input to the nnet for a minibatch of examples, and formats as a
-/// single matrix.  data.size() must be > 0.  Note: you will probably want to
-/// copy this to CuMatrix after you call this function.
-/// The num-rows of the output will, at exit, equal 
-/// (1 + nnet.LeftContext() + nnet.RightContext()) * data.size().
-/// The nnet is only needed so we can call LeftContext(), RightContext()
-/// and InputDim() on it.
-void FormatNnetInput(const Nnet &nnet,
-                     const std::vector<NnetExample> &data,
-                     Matrix<BaseFloat> *mat);
-
-
-/// This function computes the objective function and either updates the model
-/// or adds to parameter gradients.  Returns the cross-entropy objective
-/// function summed over all samples (normalize this by dividing by
-/// TotalNnetTrainingWeight(examples)).  It is mostly a wrapper for
-/// a class NnetUpdater that's defined in nnet-update.cc, but we
-/// don't want to expose that complexity at this level.
-/// All these examples will be treated as one minibatch.
-/// If tot_accuracy != NULL, it outputs to that pointer the total (weighted)
-/// accuracy.
-double DoBackprop(const Nnet &nnet,
-                  const std::vector<NnetExample> &examples,
-                  Nnet *nnet_to_update,
-                  double *tot_accuracy = NULL);
-
-/// This version of DoBackprop allows you to separately call
-/// FormatNnetInput and provide the result to DoBackprop; this
-/// can be useful when using GPUs because the call to FormatNnetInput
-/// can be in a separate thread from the one that uses the GPU.
-/// "examples_formatted" is really an input, but it's a pointer
-/// because internally we call Swap() on it, so we destroy
-/// its contents.
-double DoBackprop(const Nnet &nnet,
-                  const std::vector<NnetExample> &examples,
-                  Matrix<BaseFloat> *examples_formatted,
-                  Nnet *nnet_to_update,
-                  double *tot_accuracy = NULL);
-
-
-
-/// Returns the total weight summed over all the examples... just a simple
-/// utility function.
-BaseFloat TotalNnetTrainingWeight(const std::vector<NnetExample> &egs);
-
-/// Computes objective function over a minibatch.  Returns the *total* weighted
-/// objective function over the minibatch.
-/// If tot_accuracy != NULL, it outputs to that pointer the total (weighted)
-/// accuracy.
-double ComputeNnetObjf(const Nnet &nnet,
-                       const std::vector<NnetExample> &examples,
-                       double *tot_accuracy= NULL);
-
-/// This version of ComputeNnetObjf breaks up the examples into
-/// multiple minibatches to do the computation.
-/// Returns the *total* (weighted) objective function.
-/// If tot_accuracy != NULL, it outputs to that pointer the total (weighted)
-/// accuracy.
-double ComputeNnetObjf(const Nnet &nnet,                          
-                       const std::vector<NnetExample> &examples,
-                       int32 minibatch_size,
-                       double *tot_accuracy= NULL);
-
-
-/// ComputeNnetGradient is mostly used to compute gradients on validation sets;
-/// it divides the example into batches and calls DoBackprop() on each.
-/// It returns the *average* objective function per frame.
-double ComputeNnetGradient(
-    const Nnet &nnet,
-    const std::vector<NnetExample> &examples,
-    int32 batch_size,
-    Nnet *gradient);
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif // KALDI_NNET2_NNET_UPDATE_H_
diff --git a/src/nnet2/online-nnet2-decodable-test.cc b/src/nnet2/online-nnet2-decodable-test.cc
deleted file mode 100644
index 10ca206c5ee..00000000000
--- a/src/nnet2/online-nnet2-decodable-test.cc
+++ /dev/null
@@ -1,114 +0,0 @@
-// nnet2/online-nnet2-decodable-test.cc
-
-// Copyright 2014  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hmm/transition-model.h"
-#include "nnet2/nnet-component.h"
-#include "nnet2/decodable-am-nnet.h"
-#include "nnet2/online-nnet2-decodable.h"
-#include "feat/online-feature.h"
-#include "hmm/hmm-test-utils.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-void UnitTestNnetDecodable() {
-  std::vector<int32> phones;
-  phones.push_back(1);
-  for (int32 i = 2; i < 20; i++)
-    if (rand() % 2 == 0)
-      phones.push_back(i);
-  int32 N = 2 + rand() % 2, // context-size N is 2 or 3.
-      P = rand() % N;  // Central-phone is random on [0, N)
-
-  std::vector<int32> num_pdf_classes;
-
-  ContextDependency *ctx_dep =
-      GenRandContextDependencyLarge(phones, N, P,
-                                    true, &num_pdf_classes);
-
-  HmmTopology topo = GetDefaultTopology(phones);
-
-  TransitionModel trans_model(*ctx_dep, topo);
-
-  delete ctx_dep; // We won't need this further.
-  ctx_dep = NULL;
-
-  int32 input_dim = 40, output_dim = trans_model.NumPdfs();
-  Nnet *nnet = GenRandomNnet(input_dim, output_dim);
-
-  AmNnet am_nnet(*nnet);
-  delete nnet;
-  nnet = NULL;
-  Vector<BaseFloat> priors(output_dim);
-  priors.SetRandn();
-  priors.ApplyExp();
-  priors.Scale(1.0 / priors.Sum());
-
-  am_nnet.SetPriors(priors);
-
-  DecodableNnet2OnlineOptions opts;
-  opts.max_nnet_batch_size = 20;
-  opts.acoustic_scale = 0.1;
-
-  opts.pad_input = (rand() % 2 == 0);
-
-  int32 num_input_frames = 400;
-  Matrix<BaseFloat> input_feats(num_input_frames, input_dim);
-  input_feats.SetRandn();
-
-  OnlineMatrixFeature matrix_feature(input_feats);
-
-  DecodableNnet2Online online_decodable(am_nnet, trans_model,
-                                        opts, &matrix_feature);
-
-  DecodableAmNnet offline_decodable(trans_model, am_nnet,
-                                    CuMatrix<BaseFloat>(input_feats),
-                                    opts.pad_input,
-                                    opts.acoustic_scale);
-
-  KALDI_ASSERT(online_decodable.NumFramesReady() ==
-               offline_decodable.NumFramesReady());
-  int32 num_frames = online_decodable.NumFramesReady(),
-      num_tids = trans_model.NumTransitionIds();
-
-  for (int32 i = 0; i < 50; i++) {
-
-    int32 t = rand() % num_frames, tid = 1 + rand() % num_tids;
-    BaseFloat l1 = online_decodable.LogLikelihood(t, tid),
-        l2 = offline_decodable.LogLikelihood(t, tid);
-    KALDI_ASSERT(ApproxEqual(l1, l2));
-  }
-}
-
-} // namespace nnet2
-} // namespace kaldi
-
-
-int main() {
-  using namespace kaldi;
-  using namespace kaldi::nnet2;
-  using kaldi::int32;
-
-  for (int32 i = 0; i < 3; i++)
-    UnitTestNnetDecodable();
-  return 0;
-}
-
-
diff --git a/src/nnet2/online-nnet2-decodable.cc b/src/nnet2/online-nnet2-decodable.cc
deleted file mode 100644
index 715e1cc280d..00000000000
--- a/src/nnet2/online-nnet2-decodable.cc
+++ /dev/null
@@ -1,145 +0,0 @@
-// nnet2/online-nnet2-decodable.cc
-
-// Copyright  2014  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/online-nnet2-decodable.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-DecodableNnet2Online::DecodableNnet2Online(
-    const AmNnet &nnet,
-    const TransitionModel &trans_model,
-    const DecodableNnet2OnlineOptions &opts,
-    OnlineFeatureInterface *input_feats):
-    features_(input_feats),
-    nnet_(nnet),
-    trans_model_(trans_model),
-    opts_(opts),
-    feat_dim_(input_feats->Dim()),
-    left_context_(nnet.GetNnet().LeftContext()),
-    right_context_(nnet.GetNnet().RightContext()),
-    num_pdfs_(nnet.GetNnet().OutputDim()),
-    begin_frame_(-1) {
-  KALDI_ASSERT(opts_.max_nnet_batch_size > 0);
-  log_priors_ = nnet_.Priors();
-  KALDI_ASSERT(log_priors_.Dim() == trans_model_.NumPdfs() &&
-               "Priors in neural network not set up (or mismatch "
-               "with transition model).");
-  log_priors_.ApplyLog();
-}
-
-
-
-BaseFloat DecodableNnet2Online::LogLikelihood(int32 frame, int32 index) {
-  ComputeForFrame(frame);
-  int32 pdf_id = trans_model_.TransitionIdToPdf(index);
-  KALDI_ASSERT(frame >= begin_frame_ &&
-               frame < begin_frame_ + scaled_loglikes_.NumRows());
-  return scaled_loglikes_(frame - begin_frame_, pdf_id);
-}
-
-
-bool DecodableNnet2Online::IsLastFrame(int32 frame) const {
-  if (opts_.pad_input) { // normal case
-    return features_->IsLastFrame(frame);
-  } else {
-    return features_->IsLastFrame(frame + left_context_ + right_context_);
-  }
-}
-
-int32 DecodableNnet2Online::NumFramesReady() const {
-  int32 features_ready = features_->NumFramesReady();
-  if (features_ready == 0)
-    return 0;
-  bool input_finished = features_->IsLastFrame(features_ready - 1);
-  if (opts_.pad_input) {
-    // normal case... we'll pad with duplicates of first + last frame to get the
-    // required left and right context.
-    if (input_finished) return features_ready;
-    else return std::max<int32>(0, features_ready - right_context_);
-  } else {
-    return std::max<int32>(0, features_ready - right_context_ - left_context_);
-  }
-}
-
-void DecodableNnet2Online::ComputeForFrame(int32 frame) {
-  int32 features_ready = features_->NumFramesReady();
-  bool input_finished = features_->IsLastFrame(features_ready - 1);
-  KALDI_ASSERT(frame >= 0);
-  if (frame >= begin_frame_ &&
-      frame < begin_frame_ + scaled_loglikes_.NumRows())
-    return;
-  KALDI_ASSERT(frame < NumFramesReady());
-
-  int32 input_frame_begin;
-  if (opts_.pad_input)
-    input_frame_begin = frame - left_context_;
-  else
-    input_frame_begin = frame;
-  int32 max_possible_input_frame_end = features_ready;
-  if (input_finished && opts_.pad_input)
-    max_possible_input_frame_end += right_context_;
-  int32 input_frame_end = std::min<int32>(max_possible_input_frame_end,
-                                          input_frame_begin +
-                                          left_context_ + right_context_ +
-                                          opts_.max_nnet_batch_size);
-  KALDI_ASSERT(input_frame_end > input_frame_begin);
-  Matrix<BaseFloat> features(input_frame_end - input_frame_begin,
-                             feat_dim_);
-  for (int32 t = input_frame_begin; t < input_frame_end; t++) {
-    SubVector<BaseFloat> row(features, t - input_frame_begin);
-    int32 t_modified = t;
-    // The next two if-statements take care of "pad_input"
-    if (t_modified < 0)
-      t_modified = 0;
-    if (t_modified >= features_ready)
-      t_modified = features_ready - 1;
-    features_->GetFrame(t_modified, &row);
-  }
-  CuMatrix<BaseFloat> cu_features;
-  cu_features.Swap(&features);  // Copy to GPU, if we're using one.
-
-
-  int32 num_frames_out = input_frame_end - input_frame_begin -
-      left_context_ - right_context_;
-
-  CuMatrix<BaseFloat> cu_posteriors(num_frames_out, num_pdfs_);
-
-  // The "false" below tells it not to pad the input: we've already done
-  // any padding that we needed to do.
-  NnetComputation(nnet_.GetNnet(), cu_features,
-                  false, &cu_posteriors);
-
-  cu_posteriors.ApplyFloor(1.0e-20); // Avoid log of zero which leads to NaN.
-  cu_posteriors.ApplyLog();
-  // subtract log-prior (divide by prior)
-  cu_posteriors.AddVecToRows(-1.0, log_priors_);
-  // apply probability scale.
-  cu_posteriors.Scale(opts_.acoustic_scale);
-
-  // Transfer the scores the CPU for faster access by the
-  // decoding process.
-  scaled_loglikes_.Resize(0, 0);
-  cu_posteriors.Swap(&scaled_loglikes_);
-
-  begin_frame_ = frame;
-}
-
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/online-nnet2-decodable.h b/src/nnet2/online-nnet2-decodable.h
deleted file mode 100644
index 96e0a4b8926..00000000000
--- a/src/nnet2/online-nnet2-decodable.h
+++ /dev/null
@@ -1,122 +0,0 @@
-// nnet2/online-nnet2-decodable.h
-
-// Copyright  2014  Johns Hopkins Universithy (author: Daniel Povey)
-
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_ONLINE_NNET2_DECODABLE_H_
-#define KALDI_NNET2_ONLINE_NNET2_DECODABLE_H_
-
-#include "itf/online-feature-itf.h"
-#include "itf/decodable-itf.h"
-#include "nnet2/am-nnet.h"
-#include "nnet2/nnet-compute.h"
-#include "hmm/transition-model.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-// Note: see also nnet-compute-online.h, which provides a different
-// (lower-level) interface and more efficient for progressive evaluation of an
-// nnet throughout an utterance, with re-use of already-computed activations.
-
-struct DecodableNnet2OnlineOptions {
-  BaseFloat acoustic_scale;
-  bool pad_input;
-  int32 max_nnet_batch_size;
-  
-  DecodableNnet2OnlineOptions():
-      acoustic_scale(0.1),
-      pad_input(true),
-      max_nnet_batch_size(256) { }
-
-  void Register(OptionsItf *opts) {
-    opts->Register("acoustic-scale", &acoustic_scale,
-                   "Scaling factor for acoustic likelihoods");
-    opts->Register("pad-input", &pad_input,
-                   "If true, pad acoustic features with required acoustic context "
-                   "past edges of file.");
-    opts->Register("max-nnet-batch-size", &max_nnet_batch_size,
-                   "Maximum batch size we use in neural-network decodable object, "
-                   "in cases where we are not constrained by currently available "
-                   "frames (this will rarely make a difference)");
-                 
-  }
-};
-
-
-/**
-   This Decodable object for class nnet2::AmNnet takes feature input from class
-   OnlineFeatureInterface, unlike, say, class DecodableAmNnet which takes
-   feature input from a matrix.
-*/
-
-class DecodableNnet2Online: public DecodableInterface {
- public:
-  DecodableNnet2Online(const AmNnet &nnet,
-                       const TransitionModel &trans_model,
-                       const DecodableNnet2OnlineOptions &opts,
-                       OnlineFeatureInterface *input_feats);
-  
-  
-  /// Returns the scaled log likelihood
-  virtual BaseFloat LogLikelihood(int32 frame, int32 index);
-  
-  virtual bool IsLastFrame(int32 frame) const;
-
-  virtual int32 NumFramesReady() const;  
-  
-  /// Indices are one-based!  This is for compatibility with OpenFst.
-  virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); }
-  
- private:
-
-  /// If the neural-network outputs for this frame are not cached, it computes
-  /// them (and possibly for some succeeding frames)
-  void ComputeForFrame(int32 frame);
-  
-  OnlineFeatureInterface *features_;
-  const AmNnet &nnet_;
-  const TransitionModel &trans_model_;
-  DecodableNnet2OnlineOptions opts_;
-  CuVector<BaseFloat> log_priors_;  // log-priors taken from the model.
-  int32 feat_dim_;  // dimensionality of the input features.
-  int32 left_context_;  // Left context of the network (cached here)
-  int32 right_context_;  // Right context of the network (cached here)
-  int32 num_pdfs_;  // Number of pdfs, equals output-dim of the network (cached
-                    // here)
-  
-  int32 begin_frame_;  // First frame for which scaled_loglikes_ is valid
-                       // (i.e. the first frame of the batch of frames for
-                       // which we've computed the output).
-  
-  // scaled_loglikes_ contains the neural network pseudo-likelihoods: the log of
-  // (prob divided by the prior), scaled by opts.acoustic_scale).  We may
-  // compute this using the GPU, but we transfer it back to the system memory
-  // when we store it here.  These scores are only kept for a subset of frames,
-  // starting at begin_frame_, whose length depends how many frames were ready
-  // at the time we called LogLikelihood(), and will never exceed
-  // opts_.max_nnet_batch_size.
-  Matrix<BaseFloat> scaled_loglikes_;
-
-  KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableNnet2Online);
-};
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif // KALDI_NNET2_ONLINE_NNET2_DECODABLE_H_
diff --git a/src/nnet2/rescale-nnet.cc b/src/nnet2/rescale-nnet.cc
deleted file mode 100644
index 204720df839..00000000000
--- a/src/nnet2/rescale-nnet.cc
+++ /dev/null
@@ -1,227 +0,0 @@
-// nnet2/rescale-nnet.cc
-
-// Copyright 2012   Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/rescale-nnet.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-class NnetRescaler {
- public:
-  NnetRescaler(const NnetRescaleConfig &config,
-               const std::vector<NnetExample> &examples,
-               Nnet *nnet):
-      config_(config), examples_(examples), nnet_(nnet) {}
-                            
-  void Rescale();
-
- private:
-  /// takes the input and formats as a single matrix, in forward_data_[0].
-  void FormatInput(const std::vector<NnetExample> &data,
-                   CuMatrix<BaseFloat> *input);
-  void RescaleComponent(int32 c, int32 num_chunks,
-                        CuMatrixBase<BaseFloat> *cur_data_in,
-                        CuMatrix<BaseFloat> *next_data);
-
-  void ComputeRelevantIndexes();
-  
-  BaseFloat GetTargetAvgDeriv(int32 c);
-  
-  const NnetRescaleConfig &config_;
-  const std::vector<NnetExample> &examples_;
-  Nnet *nnet_;
-  std::vector <ChunkInfo> chunk_info_out_;
-  std::set<int32> relevant_indexes_; // values of c with AffineComponent followed
-  // by (at c+1) NonlinearComponent that is not SoftmaxComponent.
-};
-
-
-void NnetRescaler::FormatInput(const std::vector<NnetExample> &data,
-                               CuMatrix<BaseFloat> *input) {
-  KALDI_ASSERT(data.size() > 0);
-  int32 num_splice = nnet_->LeftContext() + 1 + nnet_->RightContext();
-  KALDI_ASSERT(data[0].input_frames.NumRows() == num_splice);
-
-  int32 feat_dim = data[0].input_frames.NumCols(),
-         spk_dim = data[0].spk_info.Dim(),
-         tot_dim = feat_dim + spk_dim; // we append these at the neural net
-                                       // input... note, spk_dim might be 0.
-  KALDI_ASSERT(tot_dim == nnet_->InputDim());
-  int32 num_chunks = data.size();
-
-  input->Resize(num_splice * num_chunks,
-                tot_dim);
-  for (int32 chunk = 0; chunk < num_chunks; chunk++) {
-    CuSubMatrix<BaseFloat> dest(*input,
-                                chunk * num_splice, num_splice,
-                                0, feat_dim);
-    Matrix<BaseFloat> src(data[chunk].input_frames);
-    dest.CopyFromMat(src);
-    if (spk_dim != 0) {
-      CuSubMatrix<BaseFloat> spk_dest(*input,
-                                      chunk * num_splice, num_splice,
-                                      feat_dim, spk_dim);
-      spk_dest.CopyRowsFromVec(data[chunk].spk_info);
-    }
-  }
-  // TODO : filter out the unnecessary rows from the input
-  nnet_->ComputeChunkInfo(num_splice, num_chunks, &chunk_info_out_);
-
-}
-
-void NnetRescaler::ComputeRelevantIndexes() {
-  for (int32 c = 0; c + 1 < nnet_->NumComponents(); c++)
-    if (dynamic_cast<AffineComponent*>(&nnet_->GetComponent(c)) != NULL &&
-        (dynamic_cast<NonlinearComponent*>(&nnet_->GetComponent(c+1)) != NULL &&
-         dynamic_cast<SoftmaxComponent*>(&nnet_->GetComponent(c+1)) == NULL))
-      relevant_indexes_.insert(c);
-}
-
-
-BaseFloat NnetRescaler::GetTargetAvgDeriv(int32 c) {
-  KALDI_ASSERT(relevant_indexes_.count(c) == 1);
-  BaseFloat factor;
-  if (dynamic_cast<SigmoidComponent*>(&(nnet_->GetComponent(c + 1))) != NULL)
-    factor = 0.25;
-  else if (dynamic_cast<TanhComponent*>(&(nnet_->GetComponent(c + 1))) != NULL)
-    factor = 1.0;
-  else
-    KALDI_ERR << "This type of nonlinear component is not handled: index  " << c;
-  
-  int32 last_c = *std::max_element(relevant_indexes_.begin(), relevant_indexes_.end()),
-      first_c = *std::min_element(relevant_indexes_.begin(), relevant_indexes_.end());
-  if (c == first_c)
-    return factor * config_.target_first_layer_avg_deriv;
-  else if (c == last_c)
-    return factor * config_.target_last_layer_avg_deriv;
-  else
-    return factor * config_.target_avg_deriv;
-}
-
-// Here, c is the index of the affine component, and
-// c + 1 is the index of the nonlinear component; *cur_data is the
-// output of the affine component.
-void NnetRescaler::RescaleComponent(
-    int32 c,
-    int32 num_chunks,
-    CuMatrixBase<BaseFloat> *cur_data_in,
-    CuMatrix<BaseFloat> *next_data) {
-  int32 rows = cur_data_in->NumRows(), cols = cur_data_in->NumCols();
-  // Only handle sigmoid or tanh here.
-  if (dynamic_cast<SigmoidComponent*>(&(nnet_->GetComponent(c + 1))) == NULL &&
-      dynamic_cast<TanhComponent*>(&(nnet_->GetComponent(c + 1))) == NULL)
-    KALDI_ERR << "This type of nonlinear component is not handled: index  " << c;
-  KALDI_ASSERT(chunk_info_out_[0].NumChunks() == num_chunks); //TODO verify how this component can be used
-                                                             // rewrite the
-                                                             // chunk_info_out_
-                                                             // computation
-  // the nonlinear component:
-  NonlinearComponent &nc =
-      *(dynamic_cast<NonlinearComponent*>(&(nnet_->GetComponent(c + 1))));
-  ChunkInfo in_info, out_info;
-  in_info = chunk_info_out_[c+1];
-  out_info = chunk_info_out_[c+2];
-
-  BaseFloat orig_avg_deriv, target_avg_deriv = GetTargetAvgDeriv(c);
-  BaseFloat cur_scaling = 1.0; // current rescaling factor (on input).
-  int32 num_iters = 10;
-  
-  CuMatrix<BaseFloat> cur_data(*cur_data_in),
-      ones(rows, cols), in_deriv(rows, cols);
-      
-  ones.Set(1.0);
-  nc.Propagate(in_info, out_info, cur_data, next_data);
-  nc.Backprop(in_info, out_info, cur_data, *next_data, ones, NULL, &in_deriv);
-  BaseFloat cur_avg_deriv;
-  cur_avg_deriv = in_deriv.Sum() / (rows * cols);
-  orig_avg_deriv = cur_avg_deriv;
-  for (int32 iter = 0; iter < num_iters; iter++) {
-    // We already have "cur_avg_deriv"; perturb the scale and compute
-    // the next avg_deriv, so we can see how it changes with the scale.
-    cur_data.CopyFromMat(*cur_data_in);
-    cur_data.Scale(cur_scaling + config_.delta);
-    nc.Propagate(in_info, out_info, cur_data, next_data);
-    nc.Backprop(in_info, out_info, cur_data, *next_data, ones, NULL, &in_deriv);
-    BaseFloat next_avg_deriv = in_deriv.Sum() / (rows * cols);
-    KALDI_ASSERT(next_avg_deriv < cur_avg_deriv);
-    // "gradient" is how avg_deriv changes as we change the scale.
-    // should be negative.
-    BaseFloat gradient = (next_avg_deriv - cur_avg_deriv) / config_.delta;
-    KALDI_ASSERT(gradient < 0.0);
-    BaseFloat proposed_change = (target_avg_deriv - cur_avg_deriv) / gradient;
-    KALDI_VLOG(2) << "cur_avg_deriv = " << cur_avg_deriv << ", target_avg_deriv = "
-                  << target_avg_deriv << ", gradient = " << gradient
-                  << ", proposed_change " << proposed_change; 
-    // Limit size of proposed change in "cur_scaling", to ensure stability.
-    if (fabs(proposed_change / cur_scaling) > config_.max_change)
-      proposed_change = cur_scaling * config_.max_change *
-          (proposed_change > 0.0 ? 1.0 : -1.0);
-    cur_scaling += proposed_change;
-    
-    cur_data.CopyFromMat(*cur_data_in);
-    cur_data.Scale(cur_scaling);
-    nc.Propagate(in_info, out_info, cur_data, next_data);
-    nc.Backprop(in_info, out_info, cur_data, *next_data, ones, NULL, &in_deriv);
-    cur_avg_deriv = in_deriv.Sum() / (rows * cols);
-    if (fabs(proposed_change) < config_.min_change) break; // Terminate the
-    // optimization
-  }
-  UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(
-      &nnet_->GetComponent(c));
-  KALDI_ASSERT(uc != NULL);
-  uc->Scale(cur_scaling); // scale the parameters of the previous
-  // AffineComponent.
-  
-  KALDI_LOG << "For component " << c << ", scaling parameters by "
-            << cur_scaling << "; average "
-            << "derivative changed from " << orig_avg_deriv << " to "
-            << cur_avg_deriv << "; target was " << target_avg_deriv;
-}
-    
-
-
-void NnetRescaler::Rescale() {
-  ComputeRelevantIndexes(); // set up relevant_indexes_.
-  CuMatrix<BaseFloat> cur_data, next_data;
-  FormatInput(examples_, &cur_data);
-  int32 num_chunks = examples_.size();
-  for (int32 c = 0; c < nnet_->NumComponents(); c++) {
-    Component &component = nnet_->GetComponent(c);
-    if (relevant_indexes_.count(c - 1) == 1) {
-      // the following function call also appropriately sets "next_data"
-      // after doing the rescaling
-      RescaleComponent(c - 1, num_chunks, &cur_data, &next_data);
-    } else {
-      component.Propagate(chunk_info_out_[c], chunk_info_out_[c+1], cur_data, &next_data);
-    }
-    cur_data.Swap(&next_data);
-  }
-}
-
-void RescaleNnet(const NnetRescaleConfig &rescale_config,
-                 const std::vector<NnetExample> &examples,
-                 Nnet *nnet) {
-  NnetRescaler rescaler(rescale_config, examples, nnet);
-  rescaler.Rescale();
-}
-
-
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/rescale-nnet.h b/src/nnet2/rescale-nnet.h
deleted file mode 100644
index 3d367fc1c1c..00000000000
--- a/src/nnet2/rescale-nnet.h
+++ /dev/null
@@ -1,80 +0,0 @@
-// nnet2/rescale-nnet.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_RESCALE_NNET_H_
-#define KALDI_NNET2_RESCALE_NNET_H_
-
-#include "nnet2/nnet-update.h"
-#include "nnet2/nnet-compute.h"
-#include "itf/options-itf.h"
-
-// Neural net rescaling is a rescaling of the parameters of the various layers
-// of a neural net, done so as to match certain specified statistics on the
-// average derivative of the sigmoid, measured on sample data.  This relates to
-// how "saturated" the sigmoid is.
-
-namespace kaldi {
-namespace nnet2 {
-
-
-struct NnetRescaleConfig {
-  BaseFloat target_avg_deriv;
-  BaseFloat target_first_layer_avg_deriv;
-  BaseFloat target_last_layer_avg_deriv;
-
-  // These are relatively unimportant; for now they have no
-  // command line options.
-  BaseFloat num_iters;
-  BaseFloat delta;
-  BaseFloat max_change; // maximum change on any one iteration (to
-  // ensure stability).
-  BaseFloat min_change; // minimum change on any one iteration (controls
-  // termination
-  
-  NnetRescaleConfig(): target_avg_deriv(0.2),
-                       target_first_layer_avg_deriv(0.3),
-                       target_last_layer_avg_deriv(0.1),
-                       num_iters(10),
-                       delta(0.01),
-                       max_change(0.2), min_change(1.0e-05) { }
-  
-  void Register(OptionsItf *opts) {
-    opts->Register("target-avg-deriv", &target_avg_deriv, "Target average derivative "
-                   "for hidden layers that are the not the first or last hidden layer "
-                   "(as fraction of maximum derivative of the nonlinearity)");
-    opts->Register("target-first-layer-avg-deriv", &target_first_layer_avg_deriv,
-                   "Target average derivative for the first hidden layer"
-                   "(as fraction of maximum derivative of the nonlinearity)");
-    opts->Register("target-last-layer-avg-deriv", &target_last_layer_avg_deriv,
-                   "Target average derivative for the last hidden layer, if "
-                   "#hid-layers > 1"
-                   "(as fraction of maximum derivative of the nonlinearity)");
-  }  
-};
-
-void RescaleNnet(const NnetRescaleConfig &rescale_config,
-                 const std::vector<NnetExample> &examples,
-                 Nnet *nnet);
-  
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif
diff --git a/src/nnet2/shrink-nnet.cc b/src/nnet2/shrink-nnet.cc
deleted file mode 100644
index cc24869c02d..00000000000
--- a/src/nnet2/shrink-nnet.cc
+++ /dev/null
@@ -1,112 +0,0 @@
-// nnet2/shrink-nnet.cc
-
-// Copyright 2012   Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/shrink-nnet.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-static BaseFloat ComputeObjfAndGradient(
-    const std::vector<NnetExample> &validation_set,
-    const Vector<double> &log_scale_params,
-    const Nnet &nnet,
-    Vector<double> *gradient) {
-  Vector<BaseFloat> scale_params(log_scale_params);
-  scale_params.ApplyExp();
-  Nnet nnet_scaled(nnet);
-  nnet_scaled.ScaleComponents(scale_params);
-  
-  Nnet nnet_gradient(nnet);
-  bool is_gradient = true;
-  nnet_gradient.SetZero(is_gradient);
-
-  // note: "ans" is normalized by the total weight of validation frames.
-  int32 batch_size = 1024;
-  BaseFloat ans = ComputeNnetGradient(nnet_scaled,
-                                      validation_set,
-                                      batch_size,
-                                      &nnet_gradient);
-
-  BaseFloat tot_count = validation_set.size();
-  int32 i = 0; // index into log_scale_params.
-  for (int32 j = 0; j < nnet_scaled.NumComponents(); j++) {
-    const UpdatableComponent *uc =
-        dynamic_cast<const UpdatableComponent*>(&(nnet.GetComponent(j))),
-        *uc_gradient =
-        dynamic_cast<const UpdatableComponent*>(&(nnet_gradient.GetComponent(j)));
-    if (uc != NULL) {
-      BaseFloat dotprod = uc->DotProduct(*uc_gradient) / tot_count;
-      (*gradient)(i) = dotprod * scale_params(i); // gradient w.r.t log of scaling factor.
-      // We multiply by scale_params(i) to take into account d/dx exp(x); "gradient"
-      // is the gradient w.r.t. the log of the scale_params.
-      i++;
-    }
-  }
-  KALDI_ASSERT(i == log_scale_params.Dim());
-  return ans;
-}
-                                   
-
-void ShrinkNnet(const NnetShrinkConfig &shrink_config,
-                const std::vector<NnetExample> &validation_set,
-                Nnet *nnet) {
-
-  int32 dim = nnet->NumUpdatableComponents();
-  KALDI_ASSERT(dim > 0);
-  Vector<double> log_scale(dim), gradient(dim); // will be zero.
-  
-  // Get initial gradient.
-  double objf, initial_objf;
-
-
-  LbfgsOptions lbfgs_options;
-  lbfgs_options.minimize = false; // We're maximizing.
-  lbfgs_options.m = dim; // Store the same number of vectors as the dimension
-  // itself, so this is BFGS.
-  lbfgs_options.first_step_length = shrink_config.initial_step;
-  
-  OptimizeLbfgs<double> lbfgs(log_scale,
-                              lbfgs_options);
-  
-  for (int32 i = 0; i < shrink_config.num_bfgs_iters; i++) {
-    log_scale.CopyFromVec(lbfgs.GetProposedValue());
-    objf = ComputeObjfAndGradient(validation_set, log_scale,
-                                  *nnet,
-                                  &gradient);
-
-    KALDI_VLOG(2) << "log-scale = " << log_scale << ", objf = " << objf
-                  << ", gradient = " << gradient;
-    if (i == 0) initial_objf = objf;
-
-    lbfgs.DoStep(objf, gradient);
-  }
-
-  log_scale.CopyFromVec(lbfgs.GetValue(&objf));
-
-  Vector<BaseFloat> scale(log_scale);
-  scale.ApplyExp();
-  KALDI_LOG << "Shrinking nnet, validation objf per frame changed from "
-            << initial_objf << " to " << objf << ", scale factors per layer are "
-            << scale;
-  nnet->ScaleComponents(scale);
-}
- 
-  
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/shrink-nnet.h b/src/nnet2/shrink-nnet.h
deleted file mode 100644
index 5a80920df77..00000000000
--- a/src/nnet2/shrink-nnet.h
+++ /dev/null
@@ -1,59 +0,0 @@
-// nnet2/shrink-nnet.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_SHRINK_NNET_H_
-#define KALDI_NNET2_SHRINK_NNET_H_
-
-#include "nnet2/nnet-update.h"
-#include "nnet2/nnet-compute.h"
-#include "itf/options-itf.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-/** Configuration class that controls neural net "shrinkage" which is actually a
-    scaling on the parameters of each of the updatable layers.
- */
-struct NnetShrinkConfig {
-  int32 num_bfgs_iters; // The dimension is small (e.g. 3 to 5) so we do
-  // BFGS.  We actually implement this as L-BFGS but setting the number of
-  // vectors to be the same as the dimension of the space.  Note: this
-  // num-iters is in reality the number of function evaluations.
-
-  BaseFloat initial_step;
-  
-  NnetShrinkConfig(): num_bfgs_iters(10), initial_step(0.1) { }
-  void Register(OptionsItf *opts) {
-    opts->Register("num-bfgs-iters", &num_bfgs_iters, "Number of iterations of "
-                   "BFGS to use when optimizing shrinkage parameters");
-    opts->Register("initial-step", &initial_step, "Parameter in the optimization, "
-                   "used to set the initial step length");
-  }  
-};
-
-void ShrinkNnet(const NnetShrinkConfig &shrink_config,
-                const std::vector<NnetExample> &validation_set,
-                Nnet *nnet);
-  
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif
diff --git a/src/nnet2/train-nnet-ensemble.cc b/src/nnet2/train-nnet-ensemble.cc
deleted file mode 100644
index e04f86c267f..00000000000
--- a/src/nnet2/train-nnet-ensemble.cc
+++ /dev/null
@@ -1,141 +0,0 @@
-// nnet2/train-nnet-ensemble.cc
-
-// Copyright 2012   Johns Hopkins University (author: Daniel Povey)
-//           2014   Xiaohui Zhang
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/train-nnet-ensemble.h"
-#include <numeric> // for std::accumulate
-
-namespace kaldi {
-namespace nnet2 {
-
-static inline Int32Pair MakePair(int32 first, int32 second) {
-  Int32Pair ans;
-  ans.first = first;
-  ans.second = second;
-  return ans;
-}
-
-NnetEnsembleTrainer::NnetEnsembleTrainer(
-    const NnetEnsembleTrainerConfig &config,
-    std::vector<Nnet*> nnet_ensemble):
-    config_(config), nnet_ensemble_(nnet_ensemble) {
-  beta_ = config_.beta;
-  num_phases_ = 0;
-  bool first_time = true;
-  BeginNewPhase(first_time);
-}
-
-void NnetEnsembleTrainer::TrainOnExample(const NnetExample &value) {
-  buffer_.push_back(value);
-  if (static_cast<int32>(buffer_.size()) == config_.minibatch_size)
-    TrainOneMinibatch();
-}
-
-void NnetEnsembleTrainer::TrainOneMinibatch() {
-  KALDI_ASSERT(!buffer_.empty());
-  
-  int32 num_states = nnet_ensemble_[0]->GetComponent(nnet_ensemble_[0]->NumComponents() - 1).OutputDim();
-  // average of posteriors matrix, storing averaged outputs of net ensemble.
-  CuMatrix<BaseFloat> post_avg(buffer_.size(), num_states);
-  updater_ensemble_.reserve(nnet_ensemble_.size());
-  std::vector<CuMatrix<BaseFloat> > post_mat;
-  post_mat.resize(nnet_ensemble_.size());
-  for (int32 i = 0; i < nnet_ensemble_.size(); i++) {
-    updater_ensemble_.push_back(new NnetUpdater(*(nnet_ensemble_[i]), nnet_ensemble_[i]));
-    updater_ensemble_[i]->FormatInput(buffer_);
-    updater_ensemble_[i]->Propagate();
-    // posterior matrix, storing output of one net.
-    updater_ensemble_[i]->GetOutput(&post_mat[i]);
-    CuVector<BaseFloat> row_sum(post_mat[i].NumRows());
-    post_avg.AddMat(1.0, post_mat[i]);
-  }
-
-  // calculate the interpolated posterios as new supervision labels, and also 
-  // collect the indices of the original supervision labels for later use (calc. objf.).
-  std::vector<MatrixElement<BaseFloat> > sv_labels;
-  std::vector<Int32Pair > sv_labels_ind;
-  sv_labels.reserve(buffer_.size()); // We must have at least this many labels.
-  sv_labels_ind.reserve(buffer_.size()); // We must have at least this many labels.
-  for (int32 m = 0; m < buffer_.size(); m++) {
-    KALDI_ASSERT(buffer_[m].labels.size() == 1 &&
-                 "Currently this code only supports single-frame egs.");
-    const std::vector<std::pair<int32,BaseFloat> > &labels = buffer_[m].labels[0];
-    for (size_t i = 0; i < labels.size(); i++) {
-      MatrixElement<BaseFloat> 
-          tmp = {m, labels[i].first, labels[i].second};
-      sv_labels.push_back(tmp);
-      sv_labels_ind.push_back(MakePair(m, labels[i].first));
-    }
-  }
-  post_avg.Scale(1.0 / nnet_ensemble_.size());
-  post_avg.Scale(beta_);
-  post_avg.AddElements(1.0, sv_labels);
-
-  // calculate the deriv, do backprop, and calculate the objf.
-  for (int32 i = 0; i < nnet_ensemble_.size(); i++) {  
-    CuMatrix<BaseFloat> tmp_deriv(post_mat[i]);
-    post_mat[i].ApplyLog();
-    std::vector<BaseFloat> log_post_correct;
-    log_post_correct.resize(sv_labels_ind.size());
-    post_mat[i].Lookup(sv_labels_ind, &(log_post_correct[0]));
-    BaseFloat log_prob_this_net = std::accumulate(log_post_correct.begin(),
-                                                  log_post_correct.end(),
-                                                  static_cast<BaseFloat>(0));
-    avg_logprob_this_phase_ += log_prob_this_net;
-    tmp_deriv.InvertElements();
-    tmp_deriv.MulElements(post_avg);
-    updater_ensemble_[i]->Backprop(&tmp_deriv);
-  }
-  count_this_phase_ += buffer_.size();
-  buffer_.clear();
-  minibatches_seen_this_phase_++;
-  if (minibatches_seen_this_phase_ == config_.minibatches_per_phase) {
-    avg_logprob_this_phase_ /= static_cast<BaseFloat>(nnet_ensemble_.size());
-    bool first_time = false;
-    BeginNewPhase(first_time);
-  }
-}
-
-void NnetEnsembleTrainer::BeginNewPhase(bool first_time) {
-  if (!first_time)
-    KALDI_LOG << "Averaged cross-entropy between the supervision labels and the output is "
-              << (avg_logprob_this_phase_/count_this_phase_) << " over "
-              << count_this_phase_ << " frames, during this phase";
-  avg_logprob_this_phase_ = 0.0;
-  count_this_phase_ = 0.0;
-  minibatches_seen_this_phase_ = 0;
-  num_phases_++;
-}
-
-
-NnetEnsembleTrainer::~NnetEnsembleTrainer() {
-  if (!buffer_.empty()) {
-    KALDI_LOG << "Doing partial minibatch of size "
-              << buffer_.size();
-    TrainOneMinibatch();
-    if (minibatches_seen_this_phase_ != 0) {
-      bool first_time = false;
-      BeginNewPhase(first_time);
-    }
-  }
-}
-
-
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/train-nnet-ensemble.h b/src/nnet2/train-nnet-ensemble.h
deleted file mode 100644
index 3ea450d695b..00000000000
--- a/src/nnet2/train-nnet-ensemble.h
+++ /dev/null
@@ -1,105 +0,0 @@
-// nnet2/train-nnet-ensemble.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-//           2014  Xiaohui Zhang
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_TRAIN_NNET_ENSEMBLE_H_
-#define KALDI_NNET2_TRAIN_NNET_ENSEMBLE_H_
-
-#include "nnet2/nnet-update.h"
-#include "nnet2/nnet-compute.h"
-#include "itf/options-itf.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-struct NnetEnsembleTrainerConfig {
-  int32 minibatch_size;
-  int32 minibatches_per_phase;
-  double beta;
-
-  NnetEnsembleTrainerConfig(): minibatch_size(500),
-                             minibatches_per_phase(50),
-                             beta(0.5) { }
-  
-  void Register (OptionsItf *opts) {
-    opts->Register("minibatch-size", &minibatch_size,
-                   "Number of samples per minibatch of training data.");
-    opts->Register("minibatches-per-phase", &minibatches_per_phase,
-                   "Number of minibatches to wait before printing training-set "
-                   "objective.");
-    opts->Register("beta", &beta, 
-                   "weight of the second term in the objf, which is the cross-entropy "
-                   "between the output posteriors and the averaged posteriors from other nets.");
-  }  
-};
-
-
-// Similar as NnetTrainer, Class NnetEnsembleTrainer first batches
-// up the input into minibatches and feed the data into every nnet in 
-// the ensemble, call Propogate to do forward propogation, and 
-// collect the output posteriors. The posteriors from different 
-// nets are averaged and then used to compute the additional term 
-// in the objf: (a constant times) the cross-entropy between each 
-// net's output posteriors and the averaged posteriors of 
-// the whole nnet ensemble. We also calculate the derivs and 
-// then call Backprop() to update each net separately.
-
-class NnetEnsembleTrainer {
- public:
-  NnetEnsembleTrainer(const NnetEnsembleTrainerConfig &config,
-                      std::vector<Nnet*> nnet_ensemble);
-  
-  /// TrainOnExample will take the example and add it to a buffer;
-  /// if we've reached the minibatch size it will do the training.
-  void TrainOnExample(const NnetExample &value);
-
-  ~NnetEnsembleTrainer();
- private:
-  KALDI_DISALLOW_COPY_AND_ASSIGN(NnetEnsembleTrainer);
-  
-  void TrainOneMinibatch();
-  
-  // The following function is called by TrainOneMinibatch()
-  // when we enter a new phase.
-  void BeginNewPhase(bool first_time);
-  
-  // Things we were given in the initializer:
-  NnetEnsembleTrainerConfig config_;
-
-  std::vector<Nnet*> nnet_ensemble_; // the nnet ensemble we're training.
-  std::vector<NnetUpdater*> updater_ensemble_;
-
-  // State information:
-  int32 num_phases_;
-  int32 minibatches_seen_this_phase_;
-  std::vector<NnetExample> buffer_; 
-
-  // ratio of the supervision, when interpolating the supervision with the averaged posteriors. 
-  double beta_;
-  double avg_logprob_this_phase_; // Needed for accumulating train log-prob on each phase.
-  double count_this_phase_; // count corresponding to the above.
-};
-
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif
diff --git a/src/nnet2/train-nnet.cc b/src/nnet2/train-nnet.cc
deleted file mode 100644
index fe6957190f1..00000000000
--- a/src/nnet2/train-nnet.cc
+++ /dev/null
@@ -1,206 +0,0 @@
-// nnet2/train-nnet.cc
-
-// Copyright 2012   Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/train-nnet.h"
-#include "util/kaldi-thread.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-class NnetExampleBackgroundReader {
- public:
-  NnetExampleBackgroundReader(int32 minibatch_size,
-                              Nnet *nnet,
-                              SequentialNnetExampleReader *reader):
-      minibatch_size_(minibatch_size), nnet_(nnet), reader_(reader),
-      finished_(false) {
-    // When this class is created, it spawns a thread which calls ReadExamples()
-    // in the background. Below, Run is the static class-member function.
-    thread_ = std::thread(Run, this);
-    // the following call is a signal that no-one is currently using the examples_ and
-    // formatted_examples_ class members.
-    consumer_semaphore_.Signal();
-  }
-  ~NnetExampleBackgroundReader() {
-    if (!thread_.joinable())
-      KALDI_ERR << "No thread to join.";
-    thread_.join();
-  }
-
-  // This will be called in a background thread.  It's responsible for
-  // reading and formatting the examples.
-  void ReadExamples() {
-    KALDI_ASSERT(minibatch_size_ > 0);
-    int32 minibatch_size = minibatch_size_;
-
-
-    // Loop over minibatches...
-    while (true) {
-      // When the following call succeeds we interpret it as a signal that
-      // we are free to write to the class-member variables examples_ and formatted_examples_.
-      consumer_semaphore_.Wait();
-
-      examples_.clear();
-      examples_.reserve(minibatch_size);
-      // Read the examples.
-      for (; examples_.size() < minibatch_size && !reader_->Done(); reader_->Next())
-        examples_.push_back(reader_->Value());
-
-      // Format the examples as a single matrix.  The reason we do this here is
-      // that it's a somewhat CPU-intensive operation (involves decompressing
-      // the matrix), so we do it in a separate thread from the one that's
-      // controlling the GPU (assuming we're using a GPU), so we can get better
-      // GPU utilization.  If we have no GPU this doesn't hurt us.
-      if (examples_.empty()) {
-        formatted_examples_.Resize(0, 0);
-        total_weight_ = 0.0;
-      } else {
-        FormatNnetInput(*nnet_, examples_, &formatted_examples_);
-        total_weight_ = TotalNnetTrainingWeight(examples_);
-      }
-
-      bool finished = examples_.empty();
-
-      // The following call alerts the main program thread (that calls
-      // GetNextMinibatch() that it can how use the contents of
-      // examples_ and formatted_examples_.
-      producer_semaphore_.Signal();
-
-      // If we just read an empty minibatch (because no more examples),
-      // then return.
-      if (finished)
-        return;
-    }
-  }
-
-  // this wrapper can be passed to pthread_create.
-  static void* Run(void *ptr_in) {
-    NnetExampleBackgroundReader *ptr =
-        reinterpret_cast<NnetExampleBackgroundReader*>(ptr_in);
-    ptr->ReadExamples();
-    return NULL;
-  }
-
-  // This call makes available the next minibatch of input.  It returns
-  // true if it got some, and false if there was no more available.
-  // It is an error if you call this function after it has returned false.
-  bool GetNextMinibatch(std::vector<NnetExample> *examples,
-                        Matrix<BaseFloat> *formatted_examples,
-                        double *total_weight) {
-    KALDI_ASSERT(!finished_);
-    // wait until examples_ and formatted_examples_ have been created by
-    // the background thread.
-    producer_semaphore_.Wait();
-    // the calls to swap and Swap are lightweight.
-    examples_.swap(*examples);
-    formatted_examples_.Swap(formatted_examples);
-    *total_weight = total_weight_;
-
-    // signal the background thread that it is now free to write
-    // again to examples_ and formatted_examples_.
-    consumer_semaphore_.Signal();
-
-    if (examples->empty()) {
-      finished_ = true;
-      return false;
-    } else {
-      return true;
-    }
-  }
-
- private:
-  int32 minibatch_size_;
-  Nnet *nnet_;
-  SequentialNnetExampleReader *reader_;
-  std::thread thread_;
-
-  std::vector<NnetExample> examples_;
-  Matrix<BaseFloat> formatted_examples_;
-  double total_weight_;  // total weight, from TotalNnetTrainingWeight(examples_).
-                         // better to compute this in the background thread.
-
-  Semaphore producer_semaphore_;
-  Semaphore consumer_semaphore_;
-
-  bool finished_;
-};
-
-
-
-int64 TrainNnetSimple(const NnetSimpleTrainerConfig &config,
-                      Nnet *nnet,
-                      SequentialNnetExampleReader *reader,
-                      double *tot_weight_ptr,
-                      double *tot_logprob_ptr) {
-  int64 num_egs_processed = 0;
-  double tot_weight = 0.0, tot_logprob = 0.0;
-  NnetExampleBackgroundReader background_reader(config.minibatch_size,
-                                                nnet, reader);
-  KALDI_ASSERT(config.minibatches_per_phase > 0);
-  while (true) {
-    // Iterate over phases.  A phase of training is just a certain number of
-    // minibatches, and its only significance is that it's the periodicity with
-    // which we print diagnostics.
-    double tot_weight_this_phase = 0.0, tot_logprob_this_phase = 0.0;
-
-    int32 i;
-    for (i = 0; i < config.minibatches_per_phase; i++) {
-      std::vector<NnetExample> examples;
-      Matrix<BaseFloat> examples_formatted;
-      double minibatch_total_weight;  // this will normally equal minibatch size.
-      if (!background_reader.GetNextMinibatch(&examples, &examples_formatted,
-                                              &minibatch_total_weight))
-        break;
-      tot_logprob_this_phase += DoBackprop(*nnet, examples, &examples_formatted,
-                                           nnet, NULL);
-      tot_weight_this_phase += minibatch_total_weight;
-      num_egs_processed += examples.size();
-    }
-    if (i != 0) {
-      KALDI_LOG << "Training objective function (this phase) is "
-                << (tot_logprob_this_phase / tot_weight_this_phase) << " over "
-                << tot_weight_this_phase << " frames.";
-    }
-    tot_weight += tot_weight_this_phase;
-    tot_logprob += tot_logprob_this_phase;
-    if (i != config.minibatches_per_phase) {
-      // did not get all the minibatches we wanted because no more input.
-      // this is true if and only if we did "break" in the loop over i above.
-      break;
-    }
-  }
-  if (tot_weight == 0.0) {
-    KALDI_WARN << "No data seen.";
-  } else {
-    KALDI_LOG << "Did backprop on " << tot_weight
-              << " examples, average log-prob per frame is "
-              << (tot_logprob / tot_weight);
-    KALDI_LOG << "[this line is to be parsed by a script:] log-prob-per-frame="
-              << (tot_logprob / tot_weight);
-  }
-  if (tot_weight_ptr) *tot_weight_ptr = tot_weight;
-  if (tot_logprob_ptr) *tot_logprob_ptr = tot_logprob;
-  return num_egs_processed;
-}
-
-
-
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/train-nnet.h b/src/nnet2/train-nnet.h
deleted file mode 100644
index 835ea8b47e6..00000000000
--- a/src/nnet2/train-nnet.h
+++ /dev/null
@@ -1,64 +0,0 @@
-// nnet2/train-nnet.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_TRAIN_NNET_H_
-#define KALDI_NNET2_TRAIN_NNET_H_
-
-#include "nnet2/nnet-update.h"
-#include "nnet2/nnet-compute.h"
-#include "itf/options-itf.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-struct NnetSimpleTrainerConfig {
-  int32 minibatch_size;
-  int32 minibatches_per_phase;
-  
-  NnetSimpleTrainerConfig(): minibatch_size(500),
-                             minibatches_per_phase(50) { }
-  
-  void Register (OptionsItf *opts) {
-    opts->Register("minibatch-size", &minibatch_size,
-                   "Number of samples per minibatch of training data.");
-    opts->Register("minibatches-per-phase", &minibatches_per_phase,
-                   "Number of minibatches to wait before printing training-set "
-                   "objective.");
-  }  
-};
-
-
-/// Train on all the examples it can read from the reader.  This does training
-/// in a single thread, but it uses a separate thread to read in the examples
-/// and format the input data on the CPU; this saves us time when using GPUs.
-/// Returns the number of examples processed.
-/// Outputs to tot_weight and tot_logprob_per_frame, if non-NULL, the total
-/// weight of the examples (typically equal to the number of examples) and the
-/// total logprob objective function.
-int64 TrainNnetSimple(const NnetSimpleTrainerConfig &config,
-                      Nnet *nnet,
-                      SequentialNnetExampleReader *reader,
-                      double *tot_weight = NULL,
-                      double *tot_logprob = NULL);
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif
diff --git a/src/nnet2/widen-nnet.cc b/src/nnet2/widen-nnet.cc
deleted file mode 100644
index 2b49be56728..00000000000
--- a/src/nnet2/widen-nnet.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-// nnet2/widen-nnet.cc
-
-// Copyright 2012   Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/widen-nnet.h"
-#include "gmm/model-common.h" // for GetSplitTargets()
-#include <numeric> // for std::accumulate
-
-namespace kaldi {
-namespace nnet2 {
-
-
-void AffineComponent::Widen(int32 new_dim,
-                            BaseFloat param_stddev,
-                            BaseFloat bias_stddev,
-                            std::vector<NonlinearComponent*> c2, // will usually
-                                                                 // have just
-                                                                 // one element.
-                            AffineComponent *c3) {
-  int32 old_dim = this->OutputDim(), extra_dim = new_dim - old_dim;
-  KALDI_ASSERT(!c2.empty());
-  if (new_dim <= old_dim) {
-    KALDI_WARN << "Not widening component because new dim "
-               << new_dim << " <= old dim " << old_dim;
-    return;
-  }
-  
-  this->bias_params_.Resize(new_dim,
-                            kCopyData);
-  this->bias_params_.Range(old_dim, extra_dim).SetRandn();
-  this->bias_params_.Range(old_dim, extra_dim).Scale(bias_stddev);
-
-  this->linear_params_.Resize(new_dim, InputDim(), kCopyData);
-  this->linear_params_.Range(old_dim, extra_dim,
-                             0, InputDim()).SetRandn();
-  this->linear_params_.Range(old_dim, extra_dim,
-                             0, InputDim()).Scale(param_stddev);
-
-  for (size_t i = 0; i < c2.size(); i++) // Change dimension of nonlinear
-    c2[i]->SetDim(new_dim);              // components
-    
-  // Change dimension of next affine component [extend with zeros,
-  // so the existing outputs do not change in value]
-  c3->linear_params_.Resize(c3->OutputDim(), new_dim, kCopyData);
-}
-
-void WidenNnet(const NnetWidenConfig &widen_config,
-               Nnet *nnet) {
-
-  int32 C = nnet->NumComponents();
-  int32 num_widened = 0;
-
-  for (int32 c = 0; c < C - 3; c++) {
-    AffineComponent *c1 = dynamic_cast<AffineComponent*>(&(nnet->GetComponent(c)));
-    if (c1 == NULL) continue;
-    std::vector<NonlinearComponent*> c2; // normally just one element, but allow two right now.
-    c2.push_back(dynamic_cast<NonlinearComponent*>(&(nnet->GetComponent(c+1))));
-    if (c2.back() == NULL) continue;
-    c2.push_back(dynamic_cast<NonlinearComponent*>(&(nnet->GetComponent(c+2))));
-    AffineComponent *c3;
-    if (c2.back() == NULL) {
-      c2.pop_back();
-      c3 = dynamic_cast<AffineComponent*>(&(nnet->GetComponent(c+2)));
-    } else {
-      if (c + 3 >= C) continue;
-      c3 = dynamic_cast<AffineComponent*>(&(nnet->GetComponent(c+3)));
-    }
-    if (c3 == NULL) continue;
-    BaseFloat param_stddev = widen_config.param_stddev_factor /
-        sqrt(1.0 * c1->InputDim());
-    KALDI_LOG << "Widening component " << c << " from "
-              << c1->OutputDim() << " to " << widen_config.hidden_layer_dim;
-    
-    c1->Widen(widen_config.hidden_layer_dim,
-              param_stddev, widen_config.bias_stddev,
-              c2, c3);
-    num_widened++;
-  }
-  nnet->Check();
-  KALDI_LOG << "Widened " << num_widened << " components.";
-}  
-
-  
-} // namespace nnet2
-} // namespace kaldi
diff --git a/src/nnet2/widen-nnet.h b/src/nnet2/widen-nnet.h
deleted file mode 100644
index 1684b4b69fb..00000000000
--- a/src/nnet2/widen-nnet.h
+++ /dev/null
@@ -1,65 +0,0 @@
-// nnet2/widen-nnet.h
-
-// Copyright 2013  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_NNET2_WIDEN_NNET_H_
-#define KALDI_NNET2_WIDEN_NNET_H_
-
-#include "nnet2/nnet-update.h"
-#include "nnet2/nnet-compute.h"
-#include "itf/options-itf.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-/** Configuration class that controls neural net "widening", which means increasing
-    the dimension of the hidden layers of an already-trained neural net.
- */
-struct NnetWidenConfig {
-  int32 hidden_layer_dim;
-  BaseFloat param_stddev_factor;
-  BaseFloat bias_stddev;
-  
-  NnetWidenConfig(): hidden_layer_dim(-1),
-                     param_stddev_factor(1.0),
-                     bias_stddev(0.5) { }
-
-  void Register(OptionsItf *opts) {
-    opts->Register("hidden-layer-dim", &hidden_layer_dim, "[required option]: "
-                   "target dimension of hidden layers");
-    opts->Register("param-stddev-factor", &param_stddev_factor, "Factor in "
-                   "standard deviation of linear parameters of new part of "
-                   "transform (multiply by 1/sqrt of input-dim)");
-    opts->Register("bias-stddev", &bias_stddev, "Standard deviation of added "
-                   "bias parameters");
-  }  
-};
-
-/**
-   This function widens a neural network by increasing the hidden-layer
-   dimensions to the target. */
-
-void WidenNnet(const NnetWidenConfig &widen_config,
-               Nnet *nnet);
-  
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#endif
diff --git a/src/nnet2bin/Makefile b/src/nnet2bin/Makefile
deleted file mode 100644
index b7e2c385006..00000000000
--- a/src/nnet2bin/Makefile
+++ /dev/null
@@ -1,44 +0,0 @@
-
-all:
-EXTRA_CXXFLAGS = -Wno-sign-compare
-include ../kaldi.mk
-
-LDFLAGS += $(CUDA_LDFLAGS)
-LDLIBS += $(CUDA_LDLIBS)
-
-BINFILES = nnet-am-info nnet-init \
-   nnet-train-simple nnet-train-ensemble nnet-train-transitions nnet-latgen-faster nnet-am-copy \
-   nnet-am-init nnet-insert nnet-align-compiled \
-   nnet-compute-prob nnet-copy-egs nnet-combine \
-   nnet-am-average nnet-am-compute nnet-am-mixup \
-   nnet-get-egs nnet-train-parallel nnet-combine-fast \
-   nnet-subset-egs nnet-shuffle-egs nnet-am-fix \
-   nnet-latgen-faster-parallel nnet-to-raw-nnet nnet-compute \
-   raw-nnet-concat raw-nnet-info \
-   nnet-get-feature-transform nnet-compute-from-egs \
-   nnet-am-widen nnet-show-progress \
-   nnet-get-feature-transform-multi nnet-copy-egs-discriminative \
-   nnet-get-egs-discriminative nnet-shuffle-egs-discriminative \
-   nnet-compare-hash-discriminative nnet-combine-egs-discriminative \
-   nnet-train-discriminative-simple nnet-train-discriminative-parallel \
-   nnet-modify-learning-rates nnet-normalize-stddev  \
-   nnet-get-weighted-egs nnet-adjust-priors \
-   cuda-compiled nnet-replace-last-layers nnet-am-switch-preconditioning \
-   nnet1-to-raw-nnet raw-nnet-copy nnet-relabel-egs nnet-am-reinitialize
-
-OBJFILES =
-
-# Add this dependency to force cuda-compiled.o to be rebuilt when we reconfigure.
-cuda-compiled.o: ../kaldi.mk
-
-
-TESTFILES =
-
-ADDLIBS = ../nnet2/kaldi-nnet2.a ../nnet/kaldi-nnet.a \
-          ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \
-          ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
-          ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
-          ../base/kaldi-base.a 
-
-include ../makefiles/default_rules.mk
diff --git a/src/nnet2bin/cuda-compiled.cc b/src/nnet2bin/cuda-compiled.cc
deleted file mode 100644
index b6de9257657..00000000000
--- a/src/nnet2bin/cuda-compiled.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-// nnet2bin/cuda-compiled.cc
-
-// Copyright 2014 Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "cudamatrix/cu-device.h"
-
-int main(int argc, char *argv[]) {
-  const char *usage = "This program returns exit status 0 (success) if the code\n"
-      "was compiled with CUDA support, and 1 otherwise.  To support CUDA, you\n"
-      "must run 'configure' on a machine that has the CUDA compiler 'nvcc'\n"
-      "available.\n";
-  if (argc > 1) {
-    std::cerr << usage << "\n";
-  }
-#if HAVE_CUDA==1
-  return 0;
-#else
-  return 1;
-#endif
-}
diff --git a/src/nnet2bin/nnet-adjust-priors.cc b/src/nnet2bin/nnet-adjust-priors.cc
deleted file mode 100644
index 4d0ec110698..00000000000
--- a/src/nnet2bin/nnet-adjust-priors.cc
+++ /dev/null
@@ -1,144 +0,0 @@
-// nnet2bin/nnet-adjust-priors.cc
-
-// Copyright 2014  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet2/am-nnet.h"
-#include "hmm/transition-model.h"
-#include "tree/context-dep.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-// Computes one-sided K-L divergence from p to q.
-BaseFloat KlDivergence(const Vector<BaseFloat> &p,
-                       const Vector<BaseFloat> &q) {
-  BaseFloat sum_p = p.Sum(), sum_q = q.Sum();
-  if (fabs(sum_p - 1.0) > 0.01 || fabs(sum_q - 1.0) > 0.01) {
-    KALDI_WARN << "KlDivergence: vectors are not close to being normalized "
-               << sum_p << ", " << sum_q;
-  }
-  KALDI_ASSERT(p.Dim() == q.Dim());
-  double ans = 0.0;
-
-  for (int32 i = 0; i < p.Dim(); i++) {
-    BaseFloat p_prob = p(i) / sum_p, q_prob = q(i) / sum_q;
-    ans += p_prob * Log(p_prob / q_prob);
-  }
-  return ans;
-}
-
-void PrintPriorDiagnostics(const Vector<BaseFloat> &old_priors,
-                           const Vector<BaseFloat> &new_priors) {
-  if (old_priors.Dim() == 0) {
-    KALDI_LOG << "Model did not previously have priors attached.";
-  } else {
-    Vector<BaseFloat> diff_prior(new_priors);
-    diff_prior.AddVec(-1.0, old_priors);
-    diff_prior.ApplyAbs();
-    int32 max_index;
-    diff_prior.Max(&max_index);
-    KALDI_LOG << "Adjusting priors: largest absolute difference was for "
-              << "pdf " << max_index << ", " << old_priors(max_index)
-              << " -> " << new_priors(max_index);
-    KALDI_LOG << "Adjusting priors: K-L divergence from old to new is "
-              << KlDivergence(old_priors, new_priors);
-  }
-}
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Set the priors of the neural net to the computed posterios from the net,\n"
-        "on typical data (e.g. training data). This is correct under more general\n"
-        "circumstances than using the priors of the class labels in the training data\n"
-        "\n"
-        "Typical usage of this program will involve computation of an average pdf-level\n"
-        "posterior with nnet-compute or nnet-compute-from-egs, piped into matrix-sum-rows\n"
-        "and then vector-sum, to compute the average posterior\n"
-        "\n"
-        "Usage: nnet-adjust-priors [options] <nnet-in> <summed-posterior-vector-in> <nnet-out>\n"
-        "e.g.:\n"
-        " nnet-adjust-priors final.mdl prior.vec final.mdl\n";
-    
-    bool binary_write = true;
-    BaseFloat prior_floor = 1.0e-15; // Have a very low prior floor, since this method
-                                     // isn't likely to have a problem with very improbable
-                                     // classes.
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("prior-floor", &prior_floor, "When setting priors, floor for "
-                "priors (only used to avoid generating NaNs upon inversion)");
-
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet_rxfilename = po.GetArg(1),
-        posterior_vec_rxfilename = po.GetArg(2),
-        nnet_wxfilename = po.GetArg(3);
-    
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary_read;
-      Input ki(nnet_rxfilename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_nnet.Read(ki.Stream(), binary_read);
-    }
-    
-
-    Vector<BaseFloat> posterior_vec;
-    ReadKaldiObject(posterior_vec_rxfilename, &posterior_vec);
-
-    KALDI_ASSERT(posterior_vec.Sum() > 0.0);
-    posterior_vec.Scale(1.0 / posterior_vec.Sum()); // Renormalize
-    
-    Vector<BaseFloat> old_priors(am_nnet.Priors());
-
-    PrintPriorDiagnostics(old_priors, posterior_vec);
-    
-    am_nnet.SetPriors(posterior_vec);
-        
-    {
-      Output ko(nnet_wxfilename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-      am_nnet.Write(ko.Stream(), binary_write);
-    }
-    KALDI_LOG << "Modified priors of neural network model and wrote it to "
-              << nnet_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-align-compiled.cc b/src/nnet2bin/nnet-align-compiled.cc
deleted file mode 100644
index 6d199671cc4..00000000000
--- a/src/nnet2bin/nnet-align-compiled.cc
+++ /dev/null
@@ -1,159 +0,0 @@
-// nnet2bin/nnet-align-compiled.cc
-
-// Copyright 2009-2012  Microsoft Corporation
-//                      Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
-#include "hmm/hmm-utils.h"
-#include "fstext/fstext-lib.h"
-#include "decoder/decoder-wrappers.h"
-#include "decoder/training-graph-compiler.h"
-#include "nnet2/decodable-am-nnet.h"
-#include "lat/kaldi-lattice.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    using fst::SymbolTable;
-    using fst::VectorFst;
-    using fst::StdArc;
-
-    const char *usage =
-        "Align features given neural-net-based model\n"
-        "Usage:   nnet-align-compiled [options] <model-in> <graphs-rspecifier> "
-        "<feature-rspecifier> <alignments-wspecifier>\n"
-        "e.g.: \n"
-        " nnet-align-compiled 1.mdl ark:graphs.fsts scp:train.scp ark:1.ali\n"
-        "or:\n"
-        " compile-train-graphs tree 1.mdl lex.fst 'ark:sym2int.pl -f 2- words.txt text|' \\\n"
-        "   ark:- | nnet-align-compiled 1.mdl ark:- scp:train.scp t, ark:1.ali\n";
-
-    ParseOptions po(usage);
-    AlignConfig align_config;
-    std::string use_gpu = "yes";
-    BaseFloat acoustic_scale = 1.0;
-    BaseFloat transition_scale = 1.0;
-    BaseFloat self_loop_scale = 1.0;
-    std::string per_frame_acwt_wspecifier;
-
-    align_config.Register(&po);
-    po.Register("transition-scale", &transition_scale,
-                "Transition-probability scale [relative to acoustics]");
-    po.Register("acoustic-scale", &acoustic_scale,
-                "Scaling factor for acoustic likelihoods");
-    po.Register("self-loop-scale", &self_loop_scale,
-                "Scale of self-loop versus non-self-loop "
-                "log probs [relative to acoustics]");
-    po.Register("write-per-frame-acoustic-loglikes", &per_frame_acwt_wspecifier,
-                "Wspecifier for table of vectors containing the acoustic log-likelihoods "
-                "per frame for each utterance. E.g. ark:foo/per_frame_logprobs.1.ark");
-    po.Register("use-gpu", &use_gpu,
-                "yes|no|optional|wait, only has effect if compiled with CUDA");
-    po.Read(argc, argv);
-
-    if (po.NumArgs() < 4 || po.NumArgs() > 5) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-#if HAVE_CUDA==1
-    CuDevice::Instantiate().SelectGpuId(use_gpu);
-#endif
-
-    std::string model_in_filename = po.GetArg(1),
-        fst_rspecifier = po.GetArg(2),
-        feature_rspecifier = po.GetArg(3),
-        alignment_wspecifier = po.GetArg(4),
-        scores_wspecifier = po.GetOptArg(5);
-
-    int num_done = 0, num_err = 0, num_retry = 0;
-    double tot_like = 0.0;
-    kaldi::int64 frame_count = 0;
-
-    {
-      TransitionModel trans_model;
-      AmNnet am_nnet;
-      {
-        bool binary;
-        Input ki(model_in_filename, &binary);
-        trans_model.Read(ki.Stream(), binary);
-        am_nnet.Read(ki.Stream(), binary);
-      }
-
-      SequentialTableReader<fst::VectorFstHolder> fst_reader(fst_rspecifier);
-      RandomAccessBaseFloatCuMatrixReader feature_reader(feature_rspecifier);
-      Int32VectorWriter alignment_writer(alignment_wspecifier);
-      BaseFloatWriter scores_writer(scores_wspecifier);
-      BaseFloatVectorWriter per_frame_acwt_writer(per_frame_acwt_wspecifier);
-
-      for (; !fst_reader.Done(); fst_reader.Next()) {
-        std::string utt = fst_reader.Key();
-        if (!feature_reader.HasKey(utt)) {
-          KALDI_WARN << "No features for utterance " << utt;
-          num_err++;
-          continue;
-        }
-        const CuMatrix<BaseFloat> &features = feature_reader.Value(utt);
-        VectorFst<StdArc> decode_fst(fst_reader.Value());
-        fst_reader.FreeCurrent();  // this stops copy-on-write of the fst
-        // by deleting the fst inside the reader, since we're about to mutate
-        // the fst by adding transition probs.
-
-        if (features.NumRows() == 0) {
-          KALDI_WARN << "Zero-length utterance: " << utt;
-          num_err++;
-          continue;
-        }
-
-        {  // Add transition-probs to the FST.
-          std::vector<int32> disambig_syms;  // empty.
-          AddTransitionProbs(trans_model, disambig_syms,
-                             transition_scale, self_loop_scale,
-                             &decode_fst);
-        }
-
-        bool pad_input = true;
-        DecodableAmNnet nnet_decodable(trans_model, am_nnet, features,
-                                       pad_input, acoustic_scale);
-
-        AlignUtteranceWrapper(align_config, utt,
-                              acoustic_scale, &decode_fst, &nnet_decodable,
-                              &alignment_writer, &scores_writer,
-                              &num_done, &num_err, &num_retry,
-                              &tot_like, &frame_count, &per_frame_acwt_writer);
-      }
-      KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count)
-                << " over " << frame_count<< " frames.";
-      KALDI_LOG << "Retried " << num_retry << " out of "
-                << (num_done + num_err) << " utterances.";
-      KALDI_LOG << "Done " << num_done << ", errors on " << num_err;
-    }
-#if HAVE_CUDA==1
-    CuDevice::Instantiate().PrintProfile();
-#endif
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-am-average.cc b/src/nnet2bin/nnet-am-average.cc
deleted file mode 100644
index d35375f44f2..00000000000
--- a/src/nnet2bin/nnet-am-average.cc
+++ /dev/null
@@ -1,259 +0,0 @@
-// nnet2bin/nnet-am-average.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/combine-nnet-a.h"
-#include "nnet2/am-nnet.h"
-
-namespace kaldi {
-
-void GetWeights(const std::string &weights_str,
-                int32 num_inputs,
-                std::vector<BaseFloat> *weights) {
-  KALDI_ASSERT(num_inputs >= 1);
-  if (!weights_str.empty()) {
-    SplitStringToFloats(weights_str, ":", true, weights);
-    if (weights->size() != num_inputs) {
-      KALDI_ERR << "--weights option must be a colon-separated list "
-                << "with " << num_inputs << " elements, got: "
-                << weights_str;
-    }
-  } else {
-    for (int32 i = 0; i < num_inputs; i++)
-      weights->push_back(1.0 / num_inputs);
-  }
-  // normalize the weights to sum to one.
-  float weight_sum = 0.0;
-  for (int32 i = 0; i < num_inputs; i++)
-    weight_sum += (*weights)[i];
-  for (int32 i = 0; i < num_inputs; i++)
-    (*weights)[i] = (*weights)[i] / weight_sum;
-  if (fabs(weight_sum - 1.0) > 0.01) {
-    KALDI_WARN << "Normalizing weights to sum to one, sum was " << weight_sum;
-  }
-}
-
-
-
-std::vector<bool> GetSkipLayers(const std::string &skip_layers_str,
-                                const int32 first_layer_idx,
-                                const int32 last_layer_idx) {
-
-  std::vector<bool> skip_layers(last_layer_idx, false);
-
-  if (skip_layers_str.empty()) {
-    return skip_layers;
-  }
-
-  std::vector<int> layer_indices;
-  bool ret = SplitStringToIntegers(skip_layers_str, ":", true, &layer_indices);
-  if (!ret) {
-    KALDI_ERR << "Cannot parse the skip layers specifier. It should be"
-              << "colon-separated list of integers";
-  }
-
-  int min_elem = std::numeric_limits<int>().max(),
-      max_elem = std::numeric_limits<int>().min();
-
-  std::vector<int>::iterator it;
-  for ( it = layer_indices.begin(); it != layer_indices.end(); ++it ) {
-    if ( *it < 0 )
-      *it = last_layer_idx + *it;  // convert the negative indices to
-                                       // correct indices -- -1 would be the
-                                       // last one, -2 the one before the last
-                                       // and so on.
-    if (*it > max_elem)
-      max_elem = *it;
-
-    if (*it < min_elem)
-      min_elem = *it;
-  }
-
-  if (max_elem >= last_layer_idx) {
-    KALDI_ERR << "--skip-layers option has to be a colon-separated list"
-              << "of indices which are supposed to be skipped.\n"
-              << "Maximum expected index: " << last_layer_idx
-              << " got: " << max_elem ;
-  }
-  if (min_elem < first_layer_idx) {
-    KALDI_ERR << "--skip-layers option has to be a colon-separated list"
-              << "of indices which are supposed to be skipped.\n"
-              << "Minimum expected index: " << first_layer_idx
-              << " got: " << min_elem ;
-  }
-
-  for ( it = layer_indices.begin(); it != layer_indices.end(); ++it ) {
-    skip_layers[*it] = true;
-  }
-  return skip_layers;
-}
-
-}
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "This program averages (or sums, if --sum=true) the parameters over a\n"
-        "number of neural nets.  If you supply the option --skip-last-layer=true,\n"
-        "the parameters of the last updatable layer are copied from <model1> instead\n"
-        "of being averaged (useful in multi-language scenarios).\n"
-        "The --weights option can be used to weight each model differently.\n"
-        "\n"
-        "Usage:  nnet-am-average [options] <model1> <model2> ... <modelN> <model-out>\n"
-        "\n"
-        "e.g.:\n"
-        " nnet-am-average 1.1.nnet 1.2.nnet 1.3.nnet 2.nnet\n";
-
-    bool binary_write = true;
-    bool sum = false;
-
-    ParseOptions po(usage);
-    po.Register("sum", &sum, "If true, sums instead of averages.");
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    string weights_str;
-    bool skip_last_layer = false;
-    string skip_layers_str;
-    po.Register("weights", &weights_str, "Colon-separated list of weights, one "
-                "for each input model.  These will be normalized to sum to one.");
-    po.Register("skip-last-layer", &skip_last_layer, "If true, averaging of "
-                "the last updatable layer is skipped (result comes from model1)");
-    po.Register("skip-layers", &skip_layers_str, "Colon-separated list of "
-                "indices of the layers that should be skipped during averaging."
-                "Be careful: this parameter uses an absolute indexing of "
-                "layers, i.e. iterates over all components, not over updatable "
-                "ones only.");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() < 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string
-        nnet1_rxfilename = po.GetArg(1),
-        nnet_wxfilename = po.GetArg(po.NumArgs());
-
-    TransitionModel trans_model1;
-    AmNnet am_nnet1;
-    {
-      bool binary_read;
-      Input ki(nnet1_rxfilename, &binary_read);
-      trans_model1.Read(ki.Stream(), binary_read);
-      am_nnet1.Read(ki.Stream(), binary_read);
-    }
-
-    int32 num_inputs = po.NumArgs() - 1;
-
-    std::vector<BaseFloat> model_weights;
-    GetWeights(weights_str, num_inputs, &model_weights);
-
-    int32 c_begin = 0,
-        c_end = (skip_last_layer ?
-                 am_nnet1.GetNnet().LastUpdatableComponent() :
-                 am_nnet1.GetNnet().NumComponents());
-    KALDI_ASSERT(c_end != -1 && "Network has no updatable components.");
-
-    int32 last_layer_idx = am_nnet1.GetNnet().NumComponents();
-    std::vector<bool> skip_layers = GetSkipLayers(skip_layers_str,
-                                             0,
-                                             last_layer_idx);
-
-    // scale the components - except the last layer, if skip_last_layer == true.
-    for (int32 c = c_begin; c < c_end; c++) {
-      if (skip_layers[c]) {
-        KALDI_VLOG(2) << "Not averaging layer " << c << " (as requested)";
-        continue;
-      }
-      bool updated = false;
-      UpdatableComponent *uc =
-        dynamic_cast<UpdatableComponent*>(&(am_nnet1.GetNnet().GetComponent(c)));
-      if (uc != NULL)  {
-        KALDI_VLOG(2) << "Averaging layer " << c << " (UpdatableComponent)";
-        uc->Scale(model_weights[0]);
-        updated = true;
-      }
-      NonlinearComponent *nc =
-        dynamic_cast<NonlinearComponent*>(&(am_nnet1.GetNnet().GetComponent(c)));
-      if (nc != NULL) {
-        KALDI_VLOG(2) << "Averaging layer " << c << " (NonlinearComponent)";
-        nc->Scale(model_weights[0]);
-        updated = true;
-      }
-      if (! updated) {
-        KALDI_VLOG(2) << "Not averaging layer " << c
-          << " (unscalable component)";
-      }
-    }
-
-    for (int32 i = 2; i <= num_inputs; i++) {
-      bool binary_read;
-      Input ki(po.GetArg(i), &binary_read);
-      TransitionModel trans_model;
-      trans_model.Read(ki.Stream(), binary_read);
-      AmNnet am_nnet;
-      am_nnet.Read(ki.Stream(), binary_read);
-
-      for (int32 c = c_begin; c < c_end; c++) {
-        if (skip_layers[c]) continue;
-
-        UpdatableComponent *uc_average =
-          dynamic_cast<UpdatableComponent*>(&(am_nnet1.GetNnet().GetComponent(c)));
-        const UpdatableComponent *uc_this =
-          dynamic_cast<const UpdatableComponent*>(&(am_nnet.GetNnet().GetComponent(c)));
-        if (uc_average != NULL) {
-          KALDI_ASSERT(uc_this != NULL &&
-                       "Networks must have the same structure.");
-          uc_average->Add(model_weights[i-1], *uc_this);
-        }
-
-        NonlinearComponent *nc_average =
-          dynamic_cast<NonlinearComponent*>(&(am_nnet1.GetNnet().GetComponent(c)));
-        const NonlinearComponent *nc_this =
-          dynamic_cast<const NonlinearComponent*>(&(am_nnet.GetNnet().GetComponent(c)));
-        if (nc_average != NULL) {
-          KALDI_ASSERT(nc_this != NULL &&
-                       "Networks must have the same structure.");
-          nc_average->Add(model_weights[i-1], *nc_this);
-        }
-      }
-    }
-
-    {
-      Output ko(nnet_wxfilename, binary_write);
-      trans_model1.Write(ko.Stream(), binary_write);
-      am_nnet1.Write(ko.Stream(), binary_write);
-    }
-
-    KALDI_LOG << "Averaged parameters of " << num_inputs
-              << " neural nets, and wrote to " << nnet_wxfilename;
-    return 0; // it will throw an exception if there are any problems.
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-am-compute.cc b/src/nnet2bin/nnet-am-compute.cc
deleted file mode 100644
index 32da30b73a5..00000000000
--- a/src/nnet2bin/nnet-am-compute.cc
+++ /dev/null
@@ -1,163 +0,0 @@
-// nnet2bin/nnet-am-compute.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-//           2015  Johns Hopkins University (author:  Daniel Garcia-Romero)
-//           2015  David Snyder
-//           2017  Karel Vesely
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/train-nnet.h"
-#include "nnet2/am-nnet.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Does the neural net computation for each file of input features, and\n"
-        "outputs as a matrix the result.  Used mostly for debugging.\n"
-        "Note: if you want it to apply a log (e.g. for log-likelihoods), use\n"
-        "--apply-log=true\n"
-        "\n"
-        "Usage:  nnet-am-compute [options] <model-in> <feature-rspecifier> "
-        "<feature-or-loglikes-wspecifier>\n"
-        "See also: nnet-compute, nnet-logprob\n";
-
-    bool divide_by_priors = false;
-    bool apply_log = false;
-    bool pad_input = true;
-    std::string use_gpu = "no";
-    int32 chunk_size = 0;
-    ParseOptions po(usage);
-    po.Register("divide-by-priors", &divide_by_priors, "If true, "
-                "divide by the priors stored in the model and re-normalize, apply-log may follow");
-    po.Register("apply-log", &apply_log, "Apply a log to the result of the computation "
-                "before outputting.");
-    po.Register("pad-input", &pad_input, "If true, duplicate the first and last frames "
-                "of input features as required for temporal context, to prevent #frames "
-                "of output being less than those of input.");
-    po.Register("use-gpu", &use_gpu,
-                "yes|no|optional|wait, only has effect if compiled with CUDA");
-    po.Register("chunk-size", &chunk_size, "Process the feature matrix in chunks.  "
-                "This is useful when processing large feature files in the GPU.  "
-                "If chunk-size > 0, pad-input must be true.");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-    // If chunk_size is greater than 0, pad_input needs to be true.
-    KALDI_ASSERT(chunk_size < 0 || pad_input);
-
-#if HAVE_CUDA==1
-    CuDevice::Instantiate().SelectGpuId(use_gpu);
-#endif
-
-    std::string nnet_rxfilename = po.GetArg(1),
-        features_rspecifier = po.GetArg(2),
-        features_or_loglikes_wspecifier = po.GetArg(3);
-
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary_read;
-      Input ki(nnet_rxfilename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_nnet.Read(ki.Stream(), binary_read);
-    }
-
-    Nnet &nnet = am_nnet.GetNnet();
-
-    int64 num_done = 0, num_frames = 0;
-
-    Vector<BaseFloat> inv_priors(am_nnet.Priors());
-    KALDI_ASSERT(inv_priors.Dim() == am_nnet.NumPdfs() &&
-                 "Priors in neural network not set up.");
-    inv_priors.ApplyPow(-1.0);
-
-    SequentialBaseFloatMatrixReader feature_reader(features_rspecifier);
-    BaseFloatMatrixWriter writer(features_or_loglikes_wspecifier);
-
-    for (; !feature_reader.Done();  feature_reader.Next()) {
-      std::string utt = feature_reader.Key();
-      const Matrix<BaseFloat> &feats  = feature_reader.Value();
-
-      int32 output_frames = feats.NumRows(), output_dim = nnet.OutputDim();
-      if (!pad_input)
-        output_frames -= nnet.LeftContext() + nnet.RightContext();
-      if (output_frames <= 0) {
-        KALDI_WARN << "Skipping utterance " << utt << " because output "
-                   << "would be empty.";
-        continue;
-      }
-
-      Matrix<BaseFloat> output(output_frames, output_dim);
-      if (chunk_size > 0 && chunk_size < feats.NumRows()) {
-        NnetComputationChunked(nnet, feats, chunk_size, &output);
-      } else {
-        CuMatrix<BaseFloat> cu_feats(feats);
-        CuMatrix<BaseFloat> cu_output(output);
-        NnetComputation(nnet, cu_feats, pad_input, &cu_output);
-        output.CopyFromMat(cu_output);
-      }
-
-      if (divide_by_priors) {
-        output.MulColsVec(inv_priors); // scales each column by the corresponding element
-        // of inv_priors.
-        for (int32 i = 0; i < output.NumRows(); i++) {
-          SubVector<BaseFloat> frame(output, i);
-          BaseFloat p = frame.Sum();
-          if (!(p > 0.0)) {
-            KALDI_WARN << "Bad sum of probabilities " << p;
-          } else {
-            frame.Scale(1.0 / p); // re-normalize to sum to one.
-          }
-        }
-      }
-
-      if (apply_log) {
-        output.ApplyFloor(1.0e-20);
-        output.ApplyLog();
-      }
-      writer.Write(utt, output);
-      num_frames += feats.NumRows();
-      num_done++;
-    }
-#if HAVE_CUDA==1
-    CuDevice::Instantiate().PrintProfile();
-#endif
-
-    KALDI_LOG << "Processed " << num_done << " feature files, "
-              << num_frames << " frames of input were processed.";
-
-    return (num_done == 0 ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-am-copy.cc b/src/nnet2bin/nnet-am-copy.cc
deleted file mode 100644
index 2ea797350ee..00000000000
--- a/src/nnet2bin/nnet-am-copy.cc
+++ /dev/null
@@ -1,214 +0,0 @@
-// nnet2bin/nnet-am-copy.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <typeinfo>
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/am-nnet.h"
-#include "tree/context-dep.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Copy a (nnet2) neural net and its associated transition model,\n"
-        "possibly changing the binary mode\n"
-        "Also supports multiplying all the learning rates by a factor\n"
-        "(the --learning-rate-factor option) and setting them all to a given\n"
-        "value (the --learning-rate options)\n"
-        "\n"
-        "Usage:  nnet-am-copy [options] <nnet-in> <nnet-out>\n"
-        "e.g.:\n"
-        " nnet-am-copy --binary=false 1.mdl text.mdl\n";
-
-    int32 truncate = -1;
-    bool binary_write = true;
-    bool remove_dropout = false;
-    BaseFloat dropout_scale = -1.0;
-    bool remove_preconditioning = false;
-    bool collapse = false;
-    bool match_updatableness = true;
-    BaseFloat learning_rate_factor = 1.0, learning_rate = -1;
-    std::string learning_rate_scales_str = " ";
-    std::string learning_rates = "";
-    std::string scales = "";
-    std::string stats_from;
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("learning-rate-factor", &learning_rate_factor,
-                "Before copying, multiply all the learning rates in the "
-                "model by this factor.");
-    po.Register("learning-rate", &learning_rate,
-                "If supplied, all the learning rates of \"updatable\" layers"
-                "are set to this value.");
-    po.Register("learning-rates", &learning_rates,
-                "If supplied (a colon-separated list of learning rates), sets "
-                "the learning rates of \"updatable\" layers to these values.");
-    po.Register("scales", &scales,
-                "A colon-separated list of scaling factors, one for each updatable "
-                "layer: a mechanism to scale the parameters.");
-    po.Register("learning-rate-scales", &learning_rate_scales_str,
-                "Colon-separated list of scaling factors for learning rates, "
-                "applied after the --learning-rate and --learning-rates options."
-                "Used to scale learning rates for particular layer types.  E.g."
-                "--learning-rate-scales=AffineComponent=0.5");
-    po.Register("truncate", &truncate, "If set, will truncate the neural net "
-                "to this many components by removing the last components.");
-    po.Register("remove-dropout", &remove_dropout, "Set this to true to remove "
-                "any dropout components.");
-    po.Register("dropout-scale", &dropout_scale, "If set, set the dropout scale in any "
-                "dropout components to this value.  Note: in traditional dropout, this "
-                "is always zero; you can set it to any value between zero and one.");
-    po.Register("remove-preconditioning", &remove_preconditioning, "Set this to true to replace "
-                "components of type AffineComponentPreconditioned with AffineComponent.");
-    po.Register("stats-from", &stats_from, "Before copying neural net, copy the "
-                "statistics in any layer of type NonlinearComponent, from this "
-                "neural network: provide the extended filename.");
-    po.Register("collapse", &collapse, "If true, collapse sequences of AffineComponents "
-                "and FixedAffineComponents to compactify model");
-    po.Register("match-updatableness", &match_updatableness, "Only relevant if "
-                "collapse=true; set this to false to collapse mixed types.");
-
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet_rxfilename = po.GetArg(1),
-        nnet_wxfilename = po.GetArg(2);
-    
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary;
-      Input ki(nnet_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_nnet.Read(ki.Stream(), binary);
-    }
-
-    if (learning_rate_factor != 1.0)
-      am_nnet.GetNnet().ScaleLearningRates(learning_rate_factor);
-
-    if (learning_rate >= 0)
-      am_nnet.GetNnet().SetLearningRates(learning_rate);
-
-    if (learning_rates != "") {
-      std::vector<BaseFloat> learning_rates_vec;
-      if (!SplitStringToFloats(learning_rates, ":", false, &learning_rates_vec)
-          || static_cast<int32>(learning_rates_vec.size()) !=
-             am_nnet.GetNnet().NumUpdatableComponents()) {
-        KALDI_ERR << "Expected --learning-rates option to be a "
-                  << "colon-separated string with "
-                  << am_nnet.GetNnet().NumUpdatableComponents()
-                  << " elements, instead got \"" << learning_rates << '"';
-      }
-      SubVector<BaseFloat> learning_rates_vector(&(learning_rates_vec[0]),
-                                                 learning_rates_vec.size());
-      am_nnet.GetNnet().SetLearningRates(learning_rates_vector);
-    }
-
-    if (learning_rate_scales_str != " ")  {
-      // parse the learning_rate_scales provided as an option
-      std::map<std::string, BaseFloat> learning_rate_scales;
-      std::vector<std::string> learning_rate_scale_vec;
-      SplitStringToVector(learning_rate_scales_str, ":", true,
-                          &learning_rate_scale_vec);
-      for (int32 index = 0; index < learning_rate_scale_vec.size();
-          index++) {
-        std::vector<std::string> parts;
-        BaseFloat scale_factor;
-        SplitStringToVector(learning_rate_scale_vec[index],
-                            "=", false,  &parts);
-        if (!ConvertStringToReal(parts[1], &scale_factor)) {
-          KALDI_ERR << "Unknown format for --learning-rate-scales option. "
-              << "Expected format is "
-              << "--learning-rate-scales=AffineComponent=0.1:AffineComponentPreconditioned=0.5 "
-              << "instead got "
-              << learning_rate_scales_str;
-        }
-        learning_rate_scales.insert(std::pair<std::string, BaseFloat>(
-                parts[0], scale_factor));
-      }
-      // use the learning_rate_scales to scale the component learning rates
-      am_nnet.GetNnet().ScaleLearningRates(learning_rate_scales);
-    }
-
-    if (scales != "") {
-      std::vector<BaseFloat> scales_vec;
-      if (!SplitStringToFloats(scales, ":", false, &scales_vec)
-          || static_cast<int32>(scales_vec.size()) !=
-             am_nnet.GetNnet().NumUpdatableComponents()) {
-        KALDI_ERR << "Expected --scales option to be a "
-                  << "colon-separated string with "
-                  << am_nnet.GetNnet().NumUpdatableComponents()
-                  << " elements, instead got \"" << scales << '"';
-      }
-      SubVector<BaseFloat> scales_vector(&(scales_vec[0]),
-                                         scales_vec.size());
-      am_nnet.GetNnet().ScaleComponents(scales_vector);
-    }
-
-    if (truncate >= 0) {
-      am_nnet.GetNnet().Resize(truncate);
-      if (am_nnet.GetNnet().OutputDim() != am_nnet.Priors().Dim()) {
-        Vector<BaseFloat> empty_priors;
-        am_nnet.SetPriors(empty_priors);  // so dims don't disagree.
-      }
-    }
-
-    if (remove_dropout) am_nnet.GetNnet().RemoveDropout();
-
-    if (dropout_scale != -1.0) am_nnet.GetNnet().SetDropoutScale(dropout_scale);
-
-    if (remove_preconditioning) am_nnet.GetNnet().RemovePreconditioning();
-
-    if (collapse) am_nnet.GetNnet().Collapse(match_updatableness);
-    
-    if (stats_from != "") {
-      // Copy the stats associated with the layers descending from
-      // NonlinearComponent.
-      bool binary;
-      Input ki(stats_from, &binary);
-      TransitionModel trans_model;
-      trans_model.Read(ki.Stream(), binary);
-      AmNnet am_nnet_stats;
-      am_nnet_stats.Read(ki.Stream(), binary);
-      am_nnet.GetNnet().CopyStatsFrom(am_nnet_stats.GetNnet());
-    }
-    
-    {
-      Output ko(nnet_wxfilename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-      am_nnet.Write(ko.Stream(), binary_write);
-    }
-    KALDI_LOG << "Copied neural net from " << nnet_rxfilename
-              << " to " << nnet_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-am-fix.cc b/src/nnet2bin/nnet-am-fix.cc
deleted file mode 100644
index 849274671b2..00000000000
--- a/src/nnet2bin/nnet-am-fix.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-// nnet2bin/nnet-am-fix.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet2/nnet-fix.h"
-#include "nnet2/am-nnet.h"
-#include "hmm/transition-model.h"
-#include "tree/context-dep.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Copy a (cpu-based) neural net and its associated transition model,\n"
-        "but modify it to remove certain pathologies.  We use the average\n"
-        "derivative statistics stored with the layers derived from\n"
-        "NonlinearComponent.  Note: some processes, such as nnet-combine-fast,\n"
-        "may not process these statistics correctly, and you may have to recover\n"
-        "them using the --stats-from option of nnet-am-copy before you use.\n"
-        "this program.\n"
-        "\n"
-        "Usage:  nnet-am-fix [options] <nnet-in> <nnet-out>\n"
-        "e.g.:\n"
-        " nnet-am-fix 1.mdl 1_fixed.mdl\n"
-        "or:\n"
-        " nnet-am-fix --get-counts-from=1.gradient 1.mdl 1_shrunk.mdl\n";
-
-    bool binary_write = true;
-    NnetFixConfig config;
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    config.Register(&po);
-    
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet_rxfilename = po.GetArg(1),
-        nnet_wxfilename = po.GetArg(2);
-    
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary;
-      Input ki(nnet_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_nnet.Read(ki.Stream(), binary);
-    }
-
-    FixNnet(config, &am_nnet.GetNnet());
-    
-    {
-      Output ko(nnet_wxfilename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-      am_nnet.Write(ko.Stream(), binary_write);
-    }
-    KALDI_LOG << "Copied neural net from " << nnet_rxfilename
-              << " to " << nnet_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-am-info.cc b/src/nnet2bin/nnet-am-info.cc
deleted file mode 100644
index 0206f542d5c..00000000000
--- a/src/nnet2bin/nnet-am-info.cc
+++ /dev/null
@@ -1,87 +0,0 @@
-// nnet2bin/nnet-am-info.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/am-nnet.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Print human-readable information about the neural network\n"
-        "acoustic model to the standard output\n"
-        "Usage:  nnet-am-info [options] <nnet-in>\n"
-        "e.g.:\n"
-        " nnet-am-info 1.nnet\n";
-        
-    ParseOptions po(usage);
-
-    bool print_learning_rates = false;
-
-    po.Register("print-learning-rates", &print_learning_rates,
-                "If true, instead of printing the normal info, print a "
-                "colon-separated list of the learning rates for each updatable "
-                "layer, suitable to give to nnet-am-copy as the argument to"
-                "--learning-rates.");
-    
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 1) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet_rxfilename = po.GetArg(1);
-    
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary_read;
-      Input ki(nnet_rxfilename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_nnet.Read(ki.Stream(), binary_read);
-    }
-
-    if (print_learning_rates) {
-      Vector<BaseFloat> learning_rates(am_nnet.GetNnet().NumUpdatableComponents());
-      am_nnet.GetNnet().GetLearningRates(&learning_rates);
-      int32 nc = learning_rates.Dim();
-      for (int32 i = 0; i < nc; i++)
-        std::cout << learning_rates(i) << (i < nc - 1 ? ":" : "");
-      std::cout << std::endl;
-      KALDI_LOG << "Printed learning-rate info for " << nnet_rxfilename;
-    } else {
-      std::cout << am_nnet.Info();
-      KALDI_LOG << "Printed info about " << nnet_rxfilename;
-    }
-    
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
-
-
diff --git a/src/nnet2bin/nnet-am-init.cc b/src/nnet2bin/nnet-am-init.cc
deleted file mode 100644
index 39473a6bcc6..00000000000
--- a/src/nnet2bin/nnet-am-init.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-// nnet2bin/nnet-am-init.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet2/am-nnet.h"
-#include "hmm/transition-model.h"
-#include "tree/context-dep.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-
-    // TODO: specify in the usage message where the example scripts are.
-    const char *usage =
-        "Initialize the neural network acoustic model and its associated\n"
-        "transition-model, from a tree, a topology file, and a neural-net\n"
-        "without an associated acoustic model.\n"
-        "See example scripts to see how this works in practice.\n"
-        "\n"
-        "Usage:  nnet-am-init [options] <tree-in> <topology-in> <raw-nnet-in> <nnet-am-out>\n"
-        "or:  nnet-am-init [options] <transition-model-in> <raw-nnet-in> <nnet-am-out>\n"
-        "e.g.:\n"
-        " nnet-am-init tree topo \"nnet-init nnet.config - |\" 1.mdl\n";
-        
-    bool binary_write = true;
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 3 && po.NumArgs() != 4) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string raw_nnet_rxfilename, nnet_wxfilename;
-    
-    TransitionModel *trans_model = NULL;
-
-    if (po.NumArgs() == 4) {
-      std::string tree_rxfilename = po.GetArg(1),
-          topo_rxfilename = po.GetArg(2);
-      raw_nnet_rxfilename = po.GetArg(3);
-      nnet_wxfilename = po.GetArg(4);
-    
-      ContextDependency ctx_dep;
-      ReadKaldiObject(tree_rxfilename, &ctx_dep);
-    
-      HmmTopology topo;
-      ReadKaldiObject(topo_rxfilename, &topo);
-
-      // Construct the transition model from the tree and the topology file.
-      trans_model = new TransitionModel(ctx_dep, topo);
-    } else {
-      std::string trans_model_rxfilename = po.GetArg(1);
-      raw_nnet_rxfilename = po.GetArg(2);
-      nnet_wxfilename = po.GetArg(3);
-      trans_model = new TransitionModel();
-      ReadKaldiObject(trans_model_rxfilename, trans_model);
-    }
-    
-    AmNnet am_nnet;    
-    {
-      Nnet nnet;
-      bool binary;
-      Input ki(raw_nnet_rxfilename, &binary);
-      nnet.Read(ki.Stream(), binary);
-      am_nnet.Init(nnet);
-    }
-    
-    if (am_nnet.NumPdfs() != trans_model->NumPdfs())
-      KALDI_ERR << "Mismatch in number of pdfs, neural net has "
-                << am_nnet.NumPdfs() << ", transition model has "
-                << trans_model->NumPdfs();
-
-    {
-      Output ko(nnet_wxfilename, binary_write);
-      trans_model->Write(ko.Stream(), binary_write);
-      am_nnet.Write(ko.Stream(), binary_write);
-    }
-    delete trans_model;
-    KALDI_LOG << "Initialized neural net and wrote it to " << nnet_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-am-mixup.cc b/src/nnet2bin/nnet-am-mixup.cc
deleted file mode 100644
index 32a961b2da1..00000000000
--- a/src/nnet2bin/nnet-am-mixup.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-// nnet2bin/nnet-am-mixup.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet2/am-nnet.h"
-#include "nnet2/mixup-nnet.h"
-#include "hmm/transition-model.h"
-#include "tree/context-dep.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Add mixture-components to a neural net (comparable to mixtures in a Gaussian\n"
-        "mixture model).  Number of mixture components must be greater than the number\n"
-        "of pdfs\n"
-        "\n"
-        "Usage:  nnet-am-mixup [options] <nnet-in> <nnet-out>\n"
-        "e.g.:\n"
-        " nnet-am-mixup --power=0.3 --num-mixtures=5000 1.mdl 2.mdl\n";
-
-    NnetMixupConfig config;
-    bool binary_write = true;
-    
-    ParseOptions po(usage);
-    config.Register(&po);
-
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet_rxfilename = po.GetArg(1),
-        nnet_wxfilename = po.GetArg(2);
-    
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary;
-      Input ki(nnet_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_nnet.Read(ki.Stream(), binary);
-    }
-
-    MixupNnet(config, &(am_nnet.GetNnet()));
-
-    {
-      Output ko(nnet_wxfilename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-      am_nnet.Write(ko.Stream(), binary_write);
-    }
-    KALDI_LOG << "Mixed up neural net from " << nnet_rxfilename
-              << " and wrote it to " << nnet_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-am-reinitialize.cc b/src/nnet2bin/nnet-am-reinitialize.cc
deleted file mode 100644
index 43d87ec9e46..00000000000
--- a/src/nnet2bin/nnet-am-reinitialize.cc
+++ /dev/null
@@ -1,87 +0,0 @@
-// nnet2bin/nnet-am-reinitialize.cc
-
-// Copyright 2014  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet2/am-nnet.h"
-#include "hmm/transition-model.h"
-#include "tree/context-dep.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "This program can used when transferring a neural net from one language\n"
-        "to another (or one tree to another).  It takes a neural net and a\n"
-        "transition model from a different neural net, resizes the last layer\n"
-        "to match the new transition model, zeroes it, and writes out the new,\n"
-        "resized .mdl file.  If the original model had been 'mixed-up', the associated\n"
-        "SumGroupComponent will be removed.\n"
-        "\n"
-        "Usage:  nnet-am-reinitialize [options] <nnet-in> <new-transition-model> <nnet-out>\n"
-        "e.g.:\n"
-        " nnet-am-reinitialize 1.mdl exp/tri6/final.mdl 2.mdl\n";
-
-    bool binary_write = true;
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet_rxfilename = po.GetArg(1),
-        transition_model_rxfilename = po.GetArg(2),
-        nnet_wxfilename = po.GetArg(3);
-    
-    TransitionModel orig_trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary;
-      Input ki(nnet_rxfilename, &binary);
-      orig_trans_model.Read(ki.Stream(), binary);
-      am_nnet.Read(ki.Stream(), binary);
-    }
-
-    TransitionModel new_trans_model;
-    ReadKaldiObject(transition_model_rxfilename, &new_trans_model);
-
-    am_nnet.ResizeOutputLayer(new_trans_model.NumPdfs());
-    
-    {
-      Output ko(nnet_wxfilename, binary_write);
-      new_trans_model.Write(ko.Stream(), binary_write);
-      am_nnet.Write(ko.Stream(), binary_write);
-    }
-    KALDI_LOG << "Resized neural net from " << nnet_rxfilename
-              << " to " << am_nnet.NumPdfs()
-              << " pdfs, and wrote to " << nnet_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-am-switch-preconditioning.cc b/src/nnet2bin/nnet-am-switch-preconditioning.cc
deleted file mode 100644
index 7967eaaa866..00000000000
--- a/src/nnet2bin/nnet-am-switch-preconditioning.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-// nnet2bin/nnet-am-switch-preconditioning.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet2/am-nnet.h"
-#include "hmm/transition-model.h"
-#include "tree/context-dep.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Copy a (cpu-based) neural net and its associated transition model,\n"
-        "and switch it to online preconditioning, i.e. change any components\n"
-        "derived from AffineComponent to components of type\n"
-        "AffineComponentPreconditionedOnline.\n"
-        "\n"
-        "Usage:  nnet-am-switch-preconditioning [options] <nnet-in> <nnet-out>\n"
-        "e.g.:\n"
-        " nnet-am-switch-preconditioning --binary=false 1.mdl text.mdl\n";
-
-    int32 rank_in = 20, rank_out = 80, update_period = 4;
-    BaseFloat num_samples_history = 2000.0;
-    BaseFloat alpha = 4.0;
-    bool binary_write = true;
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("rank-in", &rank_in,
-                "Rank used in online-preconditioning on input side of each layer");
-    po.Register("rank-out", &rank_out,
-                "Rank used in online-preconditioning on output side of each layer");
-    po.Register("update-period", &update_period,
-                "Affects how frequently we update the Fisher-matrix estimate (every "
-                "this-many minibatches).");
-    po.Register("num-samples-history", &num_samples_history,
-                "Number of samples of history to use in online preconditioning "
-                "(affects speed vs accuracy of update of Fisher matrix)");
-    po.Register("alpha", &alpha,
-                "Parameter that affects amount of smoothing with unit matrix "
-                "in online preconditioning (larger -> more smoothing)");
-    
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet_rxfilename = po.GetArg(1),
-        nnet_wxfilename = po.GetArg(2);
-    
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary;
-      Input ki(nnet_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_nnet.Read(ki.Stream(), binary);
-    }
-
-    am_nnet.GetNnet().SwitchToOnlinePreconditioning(rank_in, rank_out, update_period,
-                                                    num_samples_history, alpha);
-    
-    {
-      Output ko(nnet_wxfilename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-      am_nnet.Write(ko.Stream(), binary_write);
-    }
-    KALDI_LOG << "Copied neural net from " << nnet_rxfilename
-              << " to " << nnet_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-am-widen.cc b/src/nnet2bin/nnet-am-widen.cc
deleted file mode 100644
index 0ed23c11cf6..00000000000
--- a/src/nnet2bin/nnet-am-widen.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-// nnet2bin/nnet-am-widen.cc
-
-// Copyright 2012-2013  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet2/am-nnet.h"
-#include "nnet2/widen-nnet.h"
-#include "hmm/transition-model.h"
-#include "tree/context-dep.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Copy a (cpu-based) neural net and its associated transition model,\n"
-        "possibly changing the binary mode\n"
-        "Also supports multiplying all the learning rates by a factor\n"
-        "(the --learning-rate-factor option) and setting them all to a given\n"
-        "value (the --learning-rate options)\n"
-        "\n"
-        "Usage:  nnet-am-widen [options] <nnet-in> <nnet-out>\n"
-        "e.g.:\n"
-        " nnet-am-widen --hidden-layer-dim=1024 1.mdl 2.mdl\n";
-
-    NnetWidenConfig config;
-    bool binary_write = true;
-    
-    ParseOptions po(usage);
-    config.Register(&po);
-
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet_rxfilename = po.GetArg(1),
-        nnet_wxfilename = po.GetArg(2);
-    
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary;
-      Input ki(nnet_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_nnet.Read(ki.Stream(), binary);
-    }
-
-    WidenNnet(config, &(am_nnet.GetNnet()));
-    
-    {
-      Output ko(nnet_wxfilename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-      am_nnet.Write(ko.Stream(), binary_write);
-    }
-    KALDI_LOG << "Mixed up neural net from " << nnet_rxfilename
-              << " and wrote it to " << nnet_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-combine-egs-discriminative.cc b/src/nnet2bin/nnet-combine-egs-discriminative.cc
deleted file mode 100644
index da6e544950c..00000000000
--- a/src/nnet2bin/nnet-combine-egs-discriminative.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-// nnet2bin/nnet-combine-egs-discriminative.cc
-
-// Copyright 2012-2013  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/nnet-example-functions.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Copy examples for discriminative neural network training,\n"
-        "and combine successive examples if their combined length will\n"
-        "be less than --max-length.  This can help to improve efficiency\n"
-        "(--max-length corresponds to minibatch size)\n"
-        "\n"
-        "Usage:  nnet-combine-egs-discriminative [options] <egs-rspecifier> <egs-wspecifier>\n"
-        "\n"
-        "e.g.\n"
-        "nnet-combine-egs-discriminative --max-length=512 ark:temp.1.degs ark:1.degs\n";
-        
-    int32 max_length = 512;
-    int32 hard_max_length = 2048;
-    int32 batch_size = 250;
-    ParseOptions po(usage);
-    po.Register("max-length", &max_length, "Maximum length of example that we "
-                "will create when combining");
-    po.Register("batch-size", &batch_size, "Size of batch used when combinging "
-                "examples");
-    po.Register("hard-max-length", &hard_max_length, "Length of example beyond "
-                "which we will discard (very long examples may cause out of "
-                "memory errors)");
-    
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    KALDI_ASSERT(hard_max_length >= max_length);
-    KALDI_ASSERT(batch_size >= 1);
-    
-    std::string examples_rspecifier = po.GetArg(1),
-        examples_wspecifier = po.GetArg(2);
-
-    SequentialDiscriminativeNnetExampleReader example_reader(
-        examples_rspecifier);
-    DiscriminativeNnetExampleWriter example_writer(
-        examples_wspecifier);
-
-    int64 num_read = 0, num_written = 0, num_discarded = 0;
-
-    while (!example_reader.Done()) {
-      std::vector<DiscriminativeNnetExample> buffer;
-      size_t size = batch_size;
-      buffer.reserve(size);
-
-      for (; !example_reader.Done() && buffer.size() < size;
-           example_reader.Next()) {
-        buffer.push_back(example_reader.Value());
-        num_read++;
-      }
-
-      std::vector<DiscriminativeNnetExample> combined;
-      CombineDiscriminativeExamples(max_length, buffer, &combined);
-      buffer.clear();
-      for (size_t i = 0; i < combined.size(); i++) {
-        const DiscriminativeNnetExample &eg = combined[i];
-        int32 num_frames = eg.input_frames.NumRows();
-        if (num_frames > hard_max_length) {
-          KALDI_WARN << "Discarding segment of length " << num_frames
-                     << " because it exceeds --hard-max-length="
-                     << hard_max_length;
-          num_discarded++;
-        } else {
-          std::ostringstream ostr;
-          ostr << (num_written++);
-          example_writer.Write(ostr.str(), eg);
-        }
-      }
-    }
-    
-    KALDI_LOG << "Read " << num_read << " discriminative neural-network training"
-              << " examples, wrote " << num_written << ", discarded "
-              << num_discarded;
-    return (num_written == 0 ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-combine-fast.cc b/src/nnet2bin/nnet-combine-fast.cc
deleted file mode 100644
index 63a92f90311..00000000000
--- a/src/nnet2bin/nnet-combine-fast.cc
+++ /dev/null
@@ -1,133 +0,0 @@
-// nnet2bin/nnet-combine-fast.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/combine-nnet-fast.h"
-#include "nnet2/am-nnet.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Using a validation set, compute an optimal combination of a number of\n"
-        "neural nets (the combination weights are separate for each layer and\n"
-        "do not have to sum to one).  The optimization is BFGS, which is initialized\n"
-        "from the best of the individual input neural nets (or as specified by\n"
-        "--initial-model)\n"
-        "\n"
-        "Usage:  nnet-combine-fast [options] <model-in1> <model-in2> ... <model-inN> <valid-examples-in> <model-out>\n"
-        "\n"
-        "e.g.:\n"
-        " nnet-combine-fast 1.1.nnet 1.2.nnet 1.3.nnet ark:valid.egs 2.nnet\n"
-        "Caution: the first input neural net must not be a gradient.\n";
-    
-    bool binary_write = true;
-    NnetCombineFastConfig combine_config;
-    std::string use_gpu = "yes";
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("use-gpu", &use_gpu,
-                "yes|no|optional|wait, only has effect if compiled with CUDA");
-    
-    combine_config.Register(&po);
-    
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() < 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-    
-    std::string
-        nnet1_rxfilename = po.GetArg(1),
-        valid_examples_rspecifier = po.GetArg(po.NumArgs() - 1),
-        nnet_wxfilename = po.GetArg(po.NumArgs());
-
-#if HAVE_CUDA==1
-    if (combine_config.num_threads == 1)
-      CuDevice::Instantiate().SelectGpuId(use_gpu);
-#endif
-
-    
-    TransitionModel trans_model;
-    AmNnet am_nnet1;
-    {
-      bool binary_read;
-      Input ki(nnet1_rxfilename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_nnet1.Read(ki.Stream(), binary_read);
-    }
-
-    int32 num_nnets = po.NumArgs() - 2;
-    std::vector<Nnet> nnets(num_nnets);
-    nnets[0] = am_nnet1.GetNnet();
-    am_nnet1.GetNnet() = Nnet(); // Clear it to save memory.
-
-    for (int32 n = 1; n < num_nnets; n++) {
-      TransitionModel trans_model;
-      AmNnet am_nnet;
-      bool binary_read;
-      Input ki(po.GetArg(1 + n), &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_nnet.Read(ki.Stream(), binary_read);
-      nnets[n] = am_nnet.GetNnet();
-    }      
-    
-    std::vector<NnetExample> validation_set; // stores validation
-    // frames.
-
-    { // This block adds samples to "validation_set".
-      SequentialNnetExampleReader example_reader(
-          valid_examples_rspecifier);
-      for (; !example_reader.Done(); example_reader.Next())
-        validation_set.push_back(example_reader.Value());
-      KALDI_LOG << "Read " << validation_set.size() << " examples from the "
-                << "validation set.";
-      KALDI_ASSERT(validation_set.size() > 0);
-    }
-
-    CombineNnetsFast(combine_config,
-                     validation_set,
-                     nnets,
-                     &(am_nnet1.GetNnet()));
-    
-    {
-      Output ko(nnet_wxfilename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-      am_nnet1.Write(ko.Stream(), binary_write);
-    }
-    
-    KALDI_LOG << "Finished combining neural nets, wrote model to "
-              << nnet_wxfilename;
-    return (validation_set.size() == 0 ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-combine.cc b/src/nnet2bin/nnet-combine.cc
deleted file mode 100644
index 621def0c13c..00000000000
--- a/src/nnet2bin/nnet-combine.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-// nnet2bin/nnet-combine.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/combine-nnet.h"
-#include "nnet2/am-nnet.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Using a validation set, compute an optimal combination of a number of\n"
-        "neural nets (the combination weights are separate for each layer and\n"
-        "do not have to sum to one).  The optimization is BFGS, which is initialized\n"
-        "from the best of the individual input neural nets (or as specified by\n"
-        "--initial-model)\n"
-        "\n"
-        "Usage:  nnet-combine [options] <model-in1> <model-in2> ... <model-inN> <valid-examples-in> <model-out>\n"
-        "\n"
-        "e.g.:\n"
-        " nnet-combine 1.1.nnet 1.2.nnet 1.3.nnet ark:valid.egs 2.nnet\n"
-        "Caution: the first input neural net must not be a gradient.\n";
-    
-    bool binary_write = true;
-    NnetCombineConfig combine_config;
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    
-    combine_config.Register(&po);
-    
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() < 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-    
-    std::string
-        nnet1_rxfilename = po.GetArg(1),
-        valid_examples_rspecifier = po.GetArg(po.NumArgs() - 1),
-        nnet_wxfilename = po.GetArg(po.NumArgs());
-    
-    TransitionModel trans_model;
-    AmNnet am_nnet1;
-    {
-      bool binary_read;
-      Input ki(nnet1_rxfilename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_nnet1.Read(ki.Stream(), binary_read);
-    }
-
-    int32 num_nnets = po.NumArgs() - 2;
-    std::vector<Nnet> nnets(num_nnets);
-    nnets[0] = am_nnet1.GetNnet();
-    am_nnet1.GetNnet() = Nnet(); // Clear it to save memory.
-
-    for (int32 n = 1; n < num_nnets; n++) {
-      TransitionModel trans_model;
-      AmNnet am_nnet;
-      bool binary_read;
-      Input ki(po.GetArg(1 + n), &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_nnet.Read(ki.Stream(), binary_read);
-      nnets[n] = am_nnet.GetNnet();
-    }      
-    
-    std::vector<NnetExample> validation_set; // stores validation
-    // frames.
-
-    { // This block adds samples to "validation_set".
-      SequentialNnetExampleReader example_reader(
-          valid_examples_rspecifier);
-      for (; !example_reader.Done(); example_reader.Next())
-        validation_set.push_back(example_reader.Value());
-      KALDI_LOG << "Read " << validation_set.size() << " examples from the "
-                << "validation set.";
-      KALDI_ASSERT(validation_set.size() > 0);
-    }
-
-    CombineNnets(combine_config,
-                 validation_set,
-                 nnets,
-                 &(am_nnet1.GetNnet()));
-    
-    {
-      Output ko(nnet_wxfilename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-      am_nnet1.Write(ko.Stream(), binary_write);
-    }
-    
-    KALDI_LOG << "Finished combining neural nets, wrote model to "
-              << nnet_wxfilename;
-    return (validation_set.size() == 0 ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-compare-hash-discriminative.cc b/src/nnet2bin/nnet-compare-hash-discriminative.cc
deleted file mode 100644
index e602165f527..00000000000
--- a/src/nnet2bin/nnet-compare-hash-discriminative.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-// nnet2bin/nnet-compare-hash-discriminative.cc
-
-// Copyright 2012-2013  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/nnet-example-functions.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Compares two archives of discriminative training examples and checks\n"
-        "that they behave the same way for purposes of discriminative training.\n"
-        "This program was created as a way of testing nnet-get-egs-discriminative\n"
-        "The model is only needed for its transition-model.\n"
-        "\n"
-        "Usage:  nnet-compare-hash-discriminative [options] <model-rxfilename> "
-        "<egs-rspecifier1> <egs-rspecifier2>\n"
-        "\n"
-        "Note: options --drop-frames and --criterion should be matched with the\n"
-        "command line of nnet-get-egs-discriminative used to get the examples\n"
-        "nnet-compare-hash-discriminative --drop-frames=true --criterion=mmi ark:1.degs ark:2.degs\n";
-    
-    std::string criterion = "smbr";
-    bool drop_frames = false;
-    bool one_silence_class = false;
-    BaseFloat threshold = 0.002;
-    BaseFloat acoustic_scale = 1.0, lm_scale = 1.0;
-    ParseOptions po(usage);
-
-    po.Register("acoustic-scale", &acoustic_scale,
-                "Scaling factor for acoustic likelihoods");
-    po.Register("lm-scale", &lm_scale,
-                "Scaling factor for \"graph costs\" (including LM costs)");
-    po.Register("criterion", &criterion, "Training criterion, 'mmi'|'mpfe'|'smbr'");
-    po.Register("drop-frames", &drop_frames, "If true, for MMI training, drop "
-                "frames where num and den do not intersect.");
-    po.Register("one-silence-class", &one_silence_class, "If true, newer "
-                 "behavior which will tend to reduce insertions.");
-    po.Register("threshold", &threshold, "Threshold for equality testing "
-                "(relative)");
-    
-    po.Read(argc, argv);
-
-    
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_rxfilename = po.GetArg(1),
-        examples_rspecifier1 = po.GetArg(2),
-        examples_rspecifier2 = po.GetArg(3);
-
-    int64 num_done1 = 0, num_done2 = 0;
-
-
-    TransitionModel tmodel;
-    ReadKaldiObject(model_rxfilename, &tmodel);
-    
-    Matrix<double> hash1, hash2;
-
-    // some additional diagnostics:
-    double num_weight1 = 0.0, den_weight1 = 0.0, tot_t1 = 0.0;
-    double num_weight2 = 0.0, den_weight2 = 0.0, tot_t2 = 0.0;
-    
-    SequentialDiscriminativeNnetExampleReader
-        example_reader1(examples_rspecifier1),
-        example_reader2(examples_rspecifier2);
-
-    KALDI_LOG << "Computing first hash function";
-    for (; !example_reader1.Done(); example_reader1.Next(), num_done1++) {
-      DiscriminativeNnetExample eg = example_reader1.Value();
-      fst::ScaleLattice(fst::LatticeScale(lm_scale, acoustic_scale),
-                        &(eg.den_lat));
-      UpdateHash(tmodel, eg, criterion, drop_frames,
-                 one_silence_class, &hash1,
-                 &num_weight1, &den_weight1, &tot_t1);
-    }
-    KALDI_LOG << "Processed " << num_done1 << " examples.";
-
-    KALDI_LOG << "Computing second hash function";
-    for (; !example_reader2.Done(); example_reader2.Next(), num_done2++) {
-      DiscriminativeNnetExample eg = example_reader2.Value();
-      fst::ScaleLattice(fst::LatticeScale(lm_scale, acoustic_scale),
-                        &(eg.den_lat));
-      UpdateHash(tmodel, eg, criterion, drop_frames,
-                 one_silence_class, &hash2,
-                 &num_weight2, &den_weight2, &tot_t2);
-    }
-    KALDI_LOG << "Processed " << num_done2 << " examples.";
-    
-    double prod1 = TraceMatMat(hash1, hash1, kTrans),
-        prod2 = TraceMatMat(hash2, hash2, kTrans),
-        cross_prod = TraceMatMat(hash1, hash2, kTrans);
-
-    KALDI_LOG << "Products are as follows (should be the same): prod1 = "
-              << prod1 << ", prod2 = " << prod2 << ", cross_prod = "
-              << cross_prod;
-
-    KALDI_LOG << "Num-weight1 = " << num_weight1 << ", den-weight1 = "
-              << den_weight1 << ", tot_t1 = " << tot_t1;
-    KALDI_LOG << "Num-weight2 = " << num_weight2 << ", den-weight2 = "
-              << den_weight2 << ", tot_t2 = " << tot_t2;
-        
-    KALDI_ASSERT(ApproxEqual(prod1, prod2, threshold) &&
-                 ApproxEqual(prod2, cross_prod, threshold));
-    KALDI_ASSERT(prod1 > 0.0);
-
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-compute-from-egs.cc b/src/nnet2bin/nnet-compute-from-egs.cc
deleted file mode 100644
index a4c4c0c4ab7..00000000000
--- a/src/nnet2bin/nnet-compute-from-egs.cc
+++ /dev/null
@@ -1,99 +0,0 @@
-// nnet2bin/nnet-compute-from-egs.cc
-
-// Copyright 2012-2013  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/train-nnet.h"
-#include "nnet2/am-nnet.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Does the neural net computation, taking as input the nnet-training examples\n"
-        "(typically an archive with the extension .egs), ignoring the labels; it\n"
-        "outputs as a matrix the result.  Used mostly for debugging.\n"
-        "\n"
-        "Usage:  nnet-compute-from-egs [options] <raw-nnet-in> <egs-rspecifier> "
-        "<feature-wspecifier>\n"
-        "e.g.:  nnet-compute-from-egs 'nnet-to-raw-nnet final.mdl -|' egs.10.1.ark ark:-\n";
-    
-    ParseOptions po(usage);
-
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-    
-    std::string raw_nnet_rxfilename = po.GetArg(1),
-        examples_rspecifier = po.GetArg(2),
-        features_or_loglikes_wspecifier = po.GetArg(3);
-
-    Nnet nnet;
-    ReadKaldiObject(raw_nnet_rxfilename, &nnet);
-    
-    int64 num_egs = 0;
-
-    SequentialNnetExampleReader example_reader(examples_rspecifier);
-    BaseFloatMatrixWriter writer(features_or_loglikes_wspecifier);
-    
-    int32 left_context = nnet.LeftContext(),
-        context = nnet.LeftContext() + 1 + nnet.RightContext();
-
-    for (; !example_reader.Done(); example_reader.Next()) {
-      const NnetExample &eg = example_reader.Value();
-      int32 start_offset = eg.left_context - left_context;
-      int32 basic_dim = eg.input_frames.NumCols(),
-          spk_dim = eg.spk_info.Dim(), dim = basic_dim + spk_dim;
-      Matrix<BaseFloat> input_frames(eg.input_frames),
-          input_block(context, dim);
-      input_block.Range(0, context, 0, basic_dim).CopyFromMat(
-          input_frames.Range(start_offset, context, 0, basic_dim));
-      if (spk_dim != 0) {
-        input_block.Range(0, context, basic_dim, spk_dim).CopyRowsFromVec(
-            eg.spk_info);
-      }
-      CuMatrix<BaseFloat> gpu_input_block;
-      gpu_input_block.Swap(&input_block);
-      CuMatrix<BaseFloat> gpu_output_block(1, nnet.OutputDim());
-      
-      bool pad_input = false;
-      NnetComputation(nnet, gpu_input_block, pad_input, &gpu_output_block);
-      writer.Write("global", Matrix<BaseFloat>(gpu_output_block));
-      num_egs++;
-    }
-    
-    KALDI_LOG << "Processed " << num_egs << " examples.";
-    
-    return (num_egs == 0 ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-compute-prob.cc b/src/nnet2bin/nnet-compute-prob.cc
deleted file mode 100644
index 7a5fa4b32f7..00000000000
--- a/src/nnet2bin/nnet-compute-prob.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-// nnet2bin/nnet-compute-prob.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/train-nnet.h"
-#include "nnet2/am-nnet.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Computes and prints the average log-prob per frame of the given data with a\n"
-        "neural net.  The input of this is the output of e.g. nnet-get-egs\n"
-        "Aside from the logging output, which goes to the standard error, this program\n"
-        "prints the average log-prob per frame to the standard output.\n"
-        "Also see nnet-logprob, which produces a matrix of log-probs for each utterance.\n"
-        "\n"
-        "Usage:  nnet-compute-prob [options] <model-in> <training-examples-in>\n"
-        "e.g.: nnet-compute-prob 1.nnet ark:valid.egs\n";
-    
-    ParseOptions po(usage);
-
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-    
-    std::string nnet_rxfilename = po.GetArg(1),
-        examples_rspecifier = po.GetArg(2);
-
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary_read;
-      Input ki(nnet_rxfilename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_nnet.Read(ki.Stream(), binary_read);
-    }
-
-
-    std::vector<NnetExample> examples;
-    double tot_weight = 0.0, tot_like = 0.0, tot_accuracy = 0.0;
-    int64 num_examples = 0;
-    SequentialNnetExampleReader example_reader(examples_rspecifier);
-    for (; !example_reader.Done(); example_reader.Next(), num_examples++) {
-      if (examples.size() == 1000) {
-        double accuracy;
-        tot_like += ComputeNnetObjf(am_nnet.GetNnet(), examples, &accuracy);
-        tot_accuracy += accuracy;
-        tot_weight += TotalNnetTrainingWeight(examples);
-        examples.clear();
-      }
-      examples.push_back(example_reader.Value());
-      if (num_examples % 5000 == 0 && num_examples > 0)
-        KALDI_LOG << "Saw " << num_examples << " examples, average "
-                  << "probability is " << (tot_like / num_examples) << " with "
-                  << "total weight " << num_examples;
-    }
-    if (!examples.empty()) {
-      double accuracy;
-      tot_like += ComputeNnetObjf(am_nnet.GetNnet(), examples, &accuracy);
-      tot_accuracy += accuracy;      
-      tot_weight += TotalNnetTrainingWeight(examples);
-    }
-
-    KALDI_LOG << "Saw " << num_examples << " examples, average "
-              << "probability is " << (tot_like / tot_weight)
-              << " and accuracy is " << (tot_accuracy / tot_weight) << " with "
-              << "total weight " << tot_weight;
-    
-    std::cout << (tot_like / tot_weight) << "\n";
-    return (num_examples == 0 ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-compute.cc b/src/nnet2bin/nnet-compute.cc
deleted file mode 100644
index cc9b04f0ac7..00000000000
--- a/src/nnet2bin/nnet-compute.cc
+++ /dev/null
@@ -1,105 +0,0 @@
-// nnet2bin/nnet-compute.cc
-
-// Copyright 2012-2013  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/train-nnet.h"
-#include "nnet2/am-nnet.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Does the neural net computation for each file of input features, and\n"
-        "outputs as a matrix the result.  Used mostly for debugging.\n"
-        "Note: if you want it to apply a log (e.g. for log-likelihoods), use\n"
-        "--apply-log=true.  Unlike nnet-am-compute, this version reads a 'raw'\n"
-        "neural net\n"
-        "\n"
-        "Usage:  nnet-compute [options] <raw-nnet-in> <feature-rspecifier> "
-        "<feature-or-loglikes-wspecifier>\n";
-    
-    bool apply_log = false;
-    bool pad_input = true;
-    ParseOptions po(usage);
-    po.Register("apply-log", &apply_log, "Apply a log to the result of the computation "
-                "before outputting.");
-    po.Register("pad-input", &pad_input, "If true, duplicate the first and last frames "
-                "of input features as required for temporal context, to prevent #frames "
-                "of output being less than those of input.");
-    
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-    
-    std::string raw_nnet_rxfilename = po.GetArg(1),
-        features_rspecifier = po.GetArg(2),
-        features_or_loglikes_wspecifier = po.GetArg(3);
-
-    Nnet nnet;
-    ReadKaldiObject(raw_nnet_rxfilename, &nnet);
-    
-    int64 num_done = 0, num_frames = 0;
-    SequentialBaseFloatCuMatrixReader feature_reader(features_rspecifier);
-    BaseFloatCuMatrixWriter writer(features_or_loglikes_wspecifier);
-    
-    for (; !feature_reader.Done();  feature_reader.Next()) {
-      std::string utt = feature_reader.Key();
-      const CuMatrix<BaseFloat> &feats = feature_reader.Value();
-
-      int32 output_frames = feats.NumRows(), output_dim = nnet.OutputDim();
-      if (!pad_input)
-        output_frames -= nnet.LeftContext() + nnet.RightContext();
-      if (output_frames <= 0) {
-        KALDI_WARN << "Skipping utterance " << utt << " because output "
-                   << "would be empty.";
-        continue;
-      }
-      CuMatrix<BaseFloat> output(output_frames, output_dim);
-      NnetComputation(nnet, feats, pad_input, &output);
-
-      if (apply_log) {
-        output.ApplyFloor(1.0e-20);
-        output.ApplyLog();
-      }
-      writer.Write(utt, output);
-      num_frames += feats.NumRows();
-      num_done++;
-    }
-    
-    KALDI_LOG << "Processed " << num_done << " feature files, "
-              << num_frames << " frames of input were processed.";
-    
-    return (num_done == 0 ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-copy-egs-discriminative.cc b/src/nnet2bin/nnet-copy-egs-discriminative.cc
deleted file mode 100644
index a14fd3e404f..00000000000
--- a/src/nnet2bin/nnet-copy-egs-discriminative.cc
+++ /dev/null
@@ -1,158 +0,0 @@
-// nnet2bin/nnet-copy-egs-discriminative.cc
-
-// Copyright 2012-2013  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/nnet-example-functions.h"
-
-namespace kaldi {
-namespace nnet2 {
-// returns an integer randomly drawn with expected value "expected_count"
-// (will be either floor(expected_count) or ceil(expected_count)).
-// this will go into an infinite loop if expected_count is very huge, but
-// it should never be that huge.
-int32 GetCount(double expected_count) {
-  KALDI_ASSERT(expected_count >= 0.0);
-  int32 ans = 0;
-  while (expected_count > 1.0) {
-    ans++;
-    expected_count--;
-  }
-  if (WithProb(expected_count))
-    ans++;
-  return ans;
-}
-void AverageConstPart(int32 const_feat_dim,
-                      DiscriminativeNnetExample *eg) {
-  if (eg->spk_info.Dim() != 0) {  // already has const part.
-    KALDI_ASSERT(eg->spk_info.Dim() == const_feat_dim);
-    // and nothing to do.
-  } else {
-    int32 dim = eg->input_frames.NumCols(),
-        basic_dim = dim - const_feat_dim;
-    KALDI_ASSERT(const_feat_dim < eg->input_frames.NumCols());
-    Matrix<BaseFloat> mat(eg->input_frames);  // copy to non-compressed matrix.
-    eg->input_frames = mat.Range(0, mat.NumRows(), 0, basic_dim);
-    eg->spk_info.Resize(const_feat_dim);
-    eg->spk_info.AddRowSumMat(1.0 / mat.NumRows(),
-                              mat.Range(0, mat.NumRows(),
-                                        basic_dim, const_feat_dim),
-                              0.0);
-  }
-}
-                      
-
-} // namespace nnet2
-} // namespace kaldi
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Copy examples for discriminative neural\n"
-        "network training.  Supports multiple wspecifiers, in\n"
-        "which case it will write the examples round-robin to the outputs.\n"
-        "\n"
-        "Usage:  nnet-copy-egs-discriminative [options] <egs-rspecifier> <egs-wspecifier1> [<egs-wspecifier2> ...]\n"
-        "\n"
-        "e.g.\n"
-        "nnet-copy-egs-discriminative ark:train.degs ark,t:text.degs\n"
-        "or:\n"
-        "nnet-copy-egs-discriminative ark:train.degs ark:1.degs ark:2.degs\n";
-        
-    bool random = false;
-    int32 srand_seed = 0;
-    BaseFloat keep_proportion = 1.0;
-    int32 const_feat_dim = 0;
-
-    ParseOptions po(usage);
-    po.Register("random", &random, "If true, will write frames to output "
-                "archives randomly, not round-robin.");
-    po.Register("keep-proportion", &keep_proportion, "If <1.0, this program will "
-                "randomly keep this proportion of the input samples.  If >1.0, it will "
-                "in expectation copy a sample this many times.  It will copy it a number "
-                "of times equal to floor(keep-proportion) or ceil(keep-proportion).");
-    po.Register("srand", &srand_seed, "Seed for random number generator "
-                "(only relevant if --random=true or --keep-proportion != 1.0)");
-    po.Register("const-feat-dim", &const_feat_dim,
-                "Dimension of part of features (last dims) which varies little "
-                "or not at all with time, and which should be stored as a single "
-                "vector for each example rather than in the feature matrix."
-                "Useful in systems that use iVectors.  Helpful to save space.");
-    
-    po.Read(argc, argv);
-
-    srand(srand_seed);
-    
-    if (po.NumArgs() < 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string examples_rspecifier = po.GetArg(1);
-
-    SequentialDiscriminativeNnetExampleReader example_reader(
-        examples_rspecifier);
-
-    int32 num_outputs = po.NumArgs() - 1;
-    std::vector<DiscriminativeNnetExampleWriter*> example_writers(num_outputs);
-    for (int32 i = 0; i < num_outputs; i++)
-      example_writers[i] = new DiscriminativeNnetExampleWriter(
-          po.GetArg(i+2));
-
-    
-    int64 num_read = 0, num_written = 0, num_frames_written = 0;
-    for (; !example_reader.Done(); example_reader.Next(), num_read++) {
-      int32 count = GetCount(keep_proportion);
-      for (int32 c = 0; c < count; c++) {
-        int32 index = (random ? Rand() : num_written) % num_outputs;
-        std::ostringstream ostr;
-        ostr << num_written;
-        if (const_feat_dim == 0) {
-          example_writers[index]->Write(ostr.str(),
-                                        example_reader.Value());
-        } else {
-          DiscriminativeNnetExample eg = example_reader.Value();
-          AverageConstPart(const_feat_dim, &eg);
-          example_writers[index]->Write(ostr.str(), eg);
-        }
-        num_written++;
-        num_frames_written +=
-            static_cast<int64>(example_reader.Value().num_ali.size());
-      }
-    }
-    
-    for (int32 i = 0; i < num_outputs; i++)
-      delete example_writers[i];
-    KALDI_LOG << "Read " << num_read << " discriminative neural-network training"
-              << " examples, wrote " << num_written << ", consisting of "
-              << num_frames_written << " frames.";
-    return (num_written == 0 ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-copy-egs.cc b/src/nnet2bin/nnet-copy-egs.cc
deleted file mode 100644
index 7ef07cdf935..00000000000
--- a/src/nnet2bin/nnet-copy-egs.cc
+++ /dev/null
@@ -1,179 +0,0 @@
-// nnet2bin/nnet-copy-egs.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-// Copyright 2014  Vimal Manohar
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/nnet-example-functions.h"
-
-namespace kaldi {
-namespace nnet2 {
-// returns an integer randomly drawn with expected value "expected_count"
-// (will be either floor(expected_count) or ceil(expected_count)).
-// this will go into an infinite loop if expected_count is very huge, but
-// it should never be that huge.
-int32 GetCount(double expected_count) {
-  KALDI_ASSERT(expected_count >= 0.0);
-  int32 ans = 0;
-  while (expected_count > 1.0) {
-    ans++;
-    expected_count--;
-  }
-  if (WithProb(expected_count))
-    ans++;
-  return ans;
-}
-
-} // namespace nnet2
-} // namespace kaldi
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Copy examples (typically single frames) for neural network training,\n"
-        "possibly changing the binary mode.  Supports multiple wspecifiers, in\n"
-        "which case it will write the examples round-robin to the outputs.\n"
-        "\n"
-        "Usage:  nnet-copy-egs [options] <egs-rspecifier> <egs-wspecifier1> [<egs-wspecifier2> ...]\n"
-        "\n"
-        "e.g.\n"
-        "nnet-copy-egs ark:train.egs ark,t:text.egs\n"
-        "or:\n"
-        "nnet-copy-egs ark:train.egs ark:1.egs ark:2.egs\n";
-        
-    bool random = false;
-    int32 srand_seed = 0;
-    BaseFloat keep_proportion = 1.0;
-
-    // The following config variables, if set, can be used to extract a single
-    // frame of labels from a multi-frame example, and/or to reduce the amount
-    // of context.
-    int32 left_context = -1, right_context = -1;
-    // you can set frame to a number to select a single frame with a particular
-    // offset, or to 'random' to select a random single frame.
-    std::string frame_str;
-    
-    ParseOptions po(usage);
-    po.Register("random", &random, "If true, will write frames to output "
-                "archives randomly, not round-robin.");
-    po.Register("keep-proportion", &keep_proportion, "If <1.0, this program will "
-                "randomly keep this proportion of the input samples.  If >1.0, it will "
-                "in expectation copy a sample this many times.  It will copy it a number "
-                "of times equal to floor(keep-proportion) or ceil(keep-proportion).");
-    po.Register("srand", &srand_seed, "Seed for random number generator "
-                "(only relevant if --random=true or --keep-proportion != 1.0)");
-    po.Register("frame", &frame_str, "This option can be used to select a single "
-                "frame from each multi-frame example.  Set to a number 0, 1, etc. "
-                "to select a frame with a given index, or 'random' to select a "
-                "random frame.");
-    po.Register("left-context", &left_context, "Can be used to truncate the "
-                "feature left-context that we output.");
-    po.Register("right-context", &right_context, "Can be used to truncate the "
-                "feature right-context that we output.");
-
-    
-    po.Read(argc, argv);
-
-    srand(srand_seed);
-
-    int32 frame = -1;  // -1 means don't do any selection (--frame option unse),
-                       // --2 means random selection.
-    if (frame_str != "") {
-      if (!ConvertStringToInteger(frame_str, &frame)) {
-        if (frame_str == "random") {
-          frame = -2;
-        } else {
-          KALDI_ERR << "Invalid --frame option: '" << frame_str << "'";
-        }
-      } else {
-        KALDI_ASSERT(frame >= 0);
-      }
-    }
-    // the following derived variables will be used if the frame, left_context,
-    // or right_context options were set (the frame option will be more common).
-    bool copy_eg = (frame != -1 || left_context != -1 || right_context != -1);
-    int32 start_frame = -1, num_frames = -1;
-    if (frame != -1) {  // frame >= 0 or frame == -2 meaning random frame
-      num_frames = 1;
-      start_frame = frame;  // value will be ignored if frame == -2.
-    }
-    
-    if (po.NumArgs() < 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string examples_rspecifier = po.GetArg(1);
-
-    SequentialNnetExampleReader example_reader(examples_rspecifier);
-
-    int32 num_outputs = po.NumArgs() - 1;
-    std::vector<NnetExampleWriter*> example_writers(num_outputs);
-    for (int32 i = 0; i < num_outputs; i++)
-      example_writers[i] = new NnetExampleWriter(po.GetArg(i+2));
-
-    
-    int64 num_read = 0, num_written = 0;
-    for (; !example_reader.Done(); example_reader.Next(), num_read++) {
-      // count is normally 1; could be 0, or possibly >1.
-      int32 count = GetCount(keep_proportion);  
-      std::string key = example_reader.Key();
-      const NnetExample &eg = example_reader.Value();
-      for (int32 c = 0; c < count; c++) {
-        int32 index = (random ? Rand() : num_written) % num_outputs;
-        if (!copy_eg) {
-          example_writers[index]->Write(key, eg);
-          num_written++;
-        } else { // the --frame option or related options were set.
-          if (frame == -2)  // --frame=random was set -> choose random frame
-            start_frame = RandInt(0, eg.labels.size() - 1);
-          if (start_frame == -1 || start_frame < eg.labels.size()) {
-            // note: we'd only reach here with start_frame == -1 if the
-            // --left-context or --right-context options were set (reducing
-            // context).  -1 means use whatever we had in the original eg.
-            NnetExample eg_mod(eg, start_frame, num_frames,
-                               left_context, right_context);
-            example_writers[index]->Write(key, eg_mod);
-            num_written++;            
-          }
-          // else this frame was out of range for this eg; we don't make this an
-          // error, because it can happen for truncated multi-frame egs that
-          // were created at the end of an utterance.
-        }
-      }
-    }
-    
-    for (int32 i = 0; i < num_outputs; i++)
-      delete example_writers[i];
-    KALDI_LOG << "Read " << num_read << " neural-network training examples, wrote "
-              << num_written;
-    return (num_written == 0 ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-get-egs-discriminative.cc b/src/nnet2bin/nnet-get-egs-discriminative.cc
deleted file mode 100644
index 58db6972567..00000000000
--- a/src/nnet2bin/nnet-get-egs-discriminative.cc
+++ /dev/null
@@ -1,151 +0,0 @@
-// nnet2bin/nnet-get-egs-discriminative.cc
-
-// Copyright 2012-2013  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/nnet-example-functions.h"
-#include "nnet2/am-nnet.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Get examples of data for discriminative neural network training;\n"
-        "each one corresponds to part of a file, of variable (and configurable)\n"
-        "length.\n"
-        "\n"
-        "Usage:  nnet-get-egs-discriminative [options] <model> "
-        "<features-rspecifier> <ali-rspecifier> <den-lat-rspecifier> "
-        "<training-examples-out>\n"
-        "\n"
-        "An example [where $feats expands to the actual features]:\n"
-        "nnet-get-egs-discriminative --acoustic-scale=0.1 \\\n"
-        "  1.mdl '$feats' 'ark,s,cs:gunzip -c ali.1.gz|' 'ark,s,cs:gunzip -c lat.1.gz|' ark:1.degs\n";
-    
-    SplitDiscriminativeExampleConfig split_config;
-    
-    ParseOptions po(usage);
-    split_config.Register(&po);
-    
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 5) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet_rxfilename = po.GetArg(1),
-        feature_rspecifier = po.GetArg(2),
-        ali_rspecifier = po.GetArg(3),
-        clat_rspecifier = po.GetArg(4),
-        examples_wspecifier = po.GetArg(5);
-
-
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary;
-      Input ki(nnet_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_nnet.Read(ki.Stream(), binary);
-    }
-
-    int32 left_context = am_nnet.GetNnet().LeftContext(),
-        right_context = am_nnet.GetNnet().RightContext();
-
-    
-    // Read in all the training files.
-    SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier);
-    RandomAccessInt32VectorReader ali_reader(ali_rspecifier);
-    RandomAccessCompactLatticeReader clat_reader(clat_rspecifier);
-    DiscriminativeNnetExampleWriter example_writer(examples_wspecifier);
-    
-    int32 num_done = 0, num_err = 0;
-    int64 examples_count = 0; // used in generating id's.
-    
-    SplitExampleStats stats; // diagnostic.
-    
-    for (; !feat_reader.Done(); feat_reader.Next()) {
-      std::string key = feat_reader.Key();
-      const Matrix<BaseFloat> &feats = feat_reader.Value();
-      if (!ali_reader.HasKey(key)) {
-        KALDI_WARN << "No pdf-level posterior for key " << key;
-        num_err++;
-        continue;
-      }
-      const std::vector<int32> &alignment = ali_reader.Value(key);
-      if (!clat_reader.HasKey(key)) {
-        KALDI_WARN << "No denominator lattice for key " << key;
-        num_err++;
-        continue;
-      }
-      CompactLattice clat = clat_reader.Value(key);
-      CreateSuperFinal(&clat); // make sure only one state has a final-prob (of One()).
-      if (clat.Properties(fst::kTopSorted, true) == 0) {
-        TopSort(&clat);
-      }      
-
-      BaseFloat weight = 1.0;
-      DiscriminativeNnetExample eg;
-
-      if (!LatticeToDiscriminativeExample(alignment, feats, clat, weight,
-                                          left_context, right_context, &eg)) {
-        KALDI_WARN << "Error converting lattice to example.";
-        num_err++;
-        continue;
-      }
-      
-      std::vector<DiscriminativeNnetExample> egs;
-      SplitDiscriminativeExample(split_config, trans_model, eg,
-                                 &egs, &stats);
-      
-      KALDI_VLOG(2) << "Split lattice " << key << " into "
-                    << egs.size() << " pieces.";
-      for (size_t i = 0; i < egs.size(); i++) {
-        // Note: excised_egs will be of size 0 or 1.
-        std::vector<DiscriminativeNnetExample> excised_egs;
-        ExciseDiscriminativeExample(split_config, trans_model, egs[i],
-                                    &excised_egs, &stats);
-        for (size_t j = 0; j < excised_egs.size(); j++) {
-          std::ostringstream os;
-          os << (examples_count++);
-          std::string example_key = os.str();
-          example_writer.Write(example_key, excised_egs[j]);
-        }
-      }
-      num_done++;
-    }
-
-    if (num_done > 0) stats.Print();
-    
-    KALDI_LOG << "Finished generating examples, "
-              << "successfully processed " << num_done
-              << " feature files, " << num_err << " had errors.";
-    return (num_done == 0 ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-get-egs.cc b/src/nnet2bin/nnet-get-egs.cc
deleted file mode 100644
index 49f270c4f18..00000000000
--- a/src/nnet2bin/nnet-get-egs.cc
+++ /dev/null
@@ -1,184 +0,0 @@
-// nnet2bin/nnet-get-egs.cc
-
-// Copyright 2012-2014  Johns Hopkins University (author:  Daniel Povey)
-//                2014  Vimal Manohar
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <sstream>
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/nnet-example-functions.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-static void ProcessFile(const MatrixBase<BaseFloat> &feats,
-                        const Posterior &pdf_post,
-                        const std::string &utt_id,
-                        int32 left_context,
-                        int32 right_context,
-                        int32 num_frames,
-                        int32 const_feat_dim,
-                        int64 *num_frames_written,
-                        int64 *num_egs_written,
-                        NnetExampleWriter *example_writer) {
-  KALDI_ASSERT(feats.NumRows() == static_cast<int32>(pdf_post.size()));
-  int32 feat_dim = feats.NumCols();
-  KALDI_ASSERT(const_feat_dim < feat_dim);
-  KALDI_ASSERT(num_frames > 0);
-  int32 basic_feat_dim = feat_dim - const_feat_dim;
-
-  for (int32 t = 0; t < feats.NumRows(); t += num_frames) {
-    int32 this_num_frames = std::min(num_frames,
-                                     feats.NumRows() - t);
-
-    int32 tot_frames = left_context + this_num_frames + right_context;
-    NnetExample eg;
-    Matrix<BaseFloat> input_frames(tot_frames, basic_feat_dim);
-    eg.left_context = left_context;
-    eg.spk_info.Resize(const_feat_dim);
-
-    // Set up "input_frames".
-    for (int32 j = -left_context; j < this_num_frames + right_context; j++) {
-      int32 t2 = j + t;
-      if (t2 < 0) t2 = 0;
-      if (t2 >= feats.NumRows()) t2 = feats.NumRows() - 1;
-      SubVector<BaseFloat> src(feats.Row(t2), 0, basic_feat_dim),
-          dest(input_frames, j + left_context);
-      dest.CopyFromVec(src);
-      if (const_feat_dim > 0) {
-        SubVector<BaseFloat> src(feats.Row(t2), basic_feat_dim, const_feat_dim);
-        // set eg.spk_info to the average of the corresponding dimensions of
-        // the input, taken over the frames whose features we store in the eg.
-        eg.spk_info.AddVec(1.0 / tot_frames, src);
-      }
-    }
-    eg.labels.resize(this_num_frames);
-    for (int32 j = 0; j < this_num_frames; j++)
-      eg.labels[j] = pdf_post[t + j];
-    eg.input_frames = input_frames;  // Copy to CompressedMatrix.
-    
-    std::ostringstream os;
-    os << utt_id << "-" << t;
-
-    std::string key = os.str(); // key is <utt_id>-<frame_id>
-
-    *num_frames_written += this_num_frames;
-    *num_egs_written += 1;
-
-    example_writer->Write(key, eg);
-  }
-}
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Get frame-by-frame examples of data for neural network training.\n"
-        "Essentially this is a format change from features and posteriors\n"
-        "into a special frame-by-frame format.  To split randomly into\n"
-        "different subsets, do nnet-copy-egs with --random=true, but\n"
-        "note that this does not randomize the order of frames.\n"
-        "\n"
-        "Usage:  nnet-get-egs [options] <features-rspecifier> "
-        "<pdf-post-rspecifier> <training-examples-out>\n"
-        "\n"
-        "An example [where $feats expands to the actual features]:\n"
-        "nnet-get-egs --left-context=8 --right-context=8 \"$feats\" \\\n"
-        "  \"ark:gunzip -c exp/nnet/ali.1.gz | ali-to-pdf exp/nnet/1.nnet ark:- ark:- | ali-to-post ark:- ark:- |\" \\\n"
-        "   ark:- \n"
-        "Note: the --left-context and --right-context would be derived from\n"
-        "the output of nnet-info.";
-        
-    
-    int32 left_context = 0, right_context = 0,
-        num_frames = 1, const_feat_dim = 0;
-    
-    ParseOptions po(usage);
-    po.Register("left-context", &left_context, "Number of frames of left "
-                "context the neural net requires.");
-    po.Register("right-context", &right_context, "Number of frames of right "
-                "context the neural net requires.");
-    po.Register("num-frames", &num_frames, "Number of frames with labels "
-                "that each example contains.");
-    po.Register("const-feat-dim", &const_feat_dim, "If specified, the last "
-                "const-feat-dim dimensions of the feature input are treated as "
-                "constant over the context window (so are not spliced)");
-    
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string feature_rspecifier = po.GetArg(1),
-        pdf_post_rspecifier = po.GetArg(2),
-        examples_wspecifier = po.GetArg(3);
-
-    // Read in all the training files.
-    SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier);
-    RandomAccessPosteriorReader pdf_post_reader(pdf_post_rspecifier);
-    NnetExampleWriter example_writer(examples_wspecifier);
-    
-    int32 num_done = 0, num_err = 0;
-    int64 num_frames_written = 0, num_egs_written = 0;
-    
-    for (; !feat_reader.Done(); feat_reader.Next()) {
-      std::string key = feat_reader.Key();
-      const Matrix<BaseFloat> &feats = feat_reader.Value();
-      if (!pdf_post_reader.HasKey(key)) {
-        KALDI_WARN << "No pdf-level posterior for key " << key;
-        num_err++;
-      } else {
-        const Posterior &pdf_post = pdf_post_reader.Value(key);
-        if (pdf_post.size() != feats.NumRows()) {
-          KALDI_WARN << "Posterior has wrong size " << pdf_post.size()
-                     << " versus " << feats.NumRows();
-          num_err++;
-          continue;
-        }
-        ProcessFile(feats, pdf_post, key,
-                    left_context, right_context, num_frames,
-                    const_feat_dim, &num_frames_written, &num_egs_written,
-                    &example_writer);
-        num_done++;
-      }
-    }
-
-    KALDI_LOG << "Finished generating examples, "
-              << "successfully processed " << num_done
-              << " feature files, wrote " << num_egs_written << " examples, "
-              << " with " << num_frames_written << " egs in total; "
-              << num_err << " files had errors.";
-    return (num_egs_written == 0 || num_err > num_done ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-get-feature-transform-multi.cc b/src/nnet2bin/nnet-get-feature-transform-multi.cc
deleted file mode 100644
index d7763e821e0..00000000000
--- a/src/nnet2bin/nnet-get-feature-transform-multi.cc
+++ /dev/null
@@ -1,94 +0,0 @@
-// nnet2bin/nnet-get-feature-transform-multi.cc
-
-// Copyright 2013  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet2/get-feature-transform.h"
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  typedef kaldi::int32 int32;
-  try {
-    const char *usage =
-        "Get feature-projection transform using stats obtained with acc-lda.\n"
-        "The file <index-list> contains a series of line, each containing a list\n"
-        "of integer indexes.  For each line we create a transform of the same type\n"
-        "as nnet-get-feature-transform would produce, taking as input just the\n"
-        "listed feature dimensions.  The output transform will be the concatenation\n"
-        "of all these transforms.  The output-dim will be the number of integers in\n"
-        "the file <index-list> (the individual transforms are not dimension-reducing).\n"
-        "Do not set the --dim option."
-        "Usage:  nnet-get-feature-transform-multi [options] <index-list> <lda-acc-1> <lda-acc-2> ... <lda-acc-n> <matrix-out>\n";
-
-    bool binary = true;
-
-    FeatureTransformEstimateOptions opts;
-    ParseOptions po(usage);
-    po.Register("binary", &binary, "Write accumulators in binary mode.");
-    opts.Register(&po);
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() < 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    FeatureTransformEstimateMulti fte;
-    std::string index_list_rxfilename = po.GetArg(1);
-    std::string projection_wxfilename = po.GetArg(po.NumArgs());
-
-    std::vector<std::vector<int32> > indexes;
-    {
-      Input ki(index_list_rxfilename);
-      std::string line;
-      while (getline(ki.Stream(), line)) {
-        std::vector<int32> this_indexes;
-        if (!SplitStringToIntegers(line, " \t\n\r",
-                                   true, &this_indexes) ||
-            line.empty()) {
-          KALDI_ERR << "Bad line in index-list file: line is " << line;
-        }
-        indexes.push_back(this_indexes);
-      }
-      if (indexes.empty())
-        KALDI_ERR << "Empty index-list file "
-                  << PrintableRxfilename(index_list_rxfilename);
-    }
-    
-    for (int32 i = 2; i < po.NumArgs(); i++) {
-      bool binary_in, add = true;
-      Input ki(po.GetArg(i), &binary_in);
-      fte.Read(ki.Stream(), binary_in, add);
-    }
-
-    Matrix<BaseFloat> mat;
-    fte.Estimate(opts, indexes, &mat);
-    WriteKaldiObject(mat, projection_wxfilename, binary);
-
-    KALDI_LOG << "Wrote transform to "
-              << PrintableWxfilename(projection_wxfilename);
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-get-feature-transform.cc b/src/nnet2bin/nnet-get-feature-transform.cc
deleted file mode 100644
index b2e3823a30b..00000000000
--- a/src/nnet2bin/nnet-get-feature-transform.cc
+++ /dev/null
@@ -1,87 +0,0 @@
-// nnet2bin/nnet-get-feature-transform.cc
-
-// Copyright 2013  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet2/get-feature-transform.h"
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  typedef kaldi::int32 int32;
-  try {
-    const char *usage =
-        "Get feature-projection transform using stats obtained with acc-lda.\n"
-        "See comments in the code of nnet2/get-feature-transform.h for more\n"
-        "information.\n"
-        "\n"
-        "Usage:  nnet-get-feature-transform [options] <matrix-out> <lda-acc-1> <lda-acc-2> ...\n";
-
-    bool binary = true;
-    FeatureTransformEstimateOptions opts;
-    std::string write_cholesky;
-    std::string write_within_covar;
-    ParseOptions po(usage);
-    po.Register("binary", &binary, "Write outputs in binary mode.");
-    po.Register("write-cholesky", &write_cholesky, "If supplied, write to this "
-                "wxfilename the Cholesky factor of the within-class covariance. "
-                "Can be used for perturbing features.  E.g. "
-                "--write-cholesky=exp/nnet5/cholesky.tpmat");
-    po.Register("write-within-covar", &write_within_covar, "If supplied, write "
-                "to this wxfilename the within-class covariance (as a symmetric "
-                "matrix). E.g. --write-within-covar=exp/nnet5/within_covar.mat");
-    opts.Register(&po);
-    po.Read(argc, argv);
-
-    if (po.NumArgs() < 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    FeatureTransformEstimate fte;
-    std::string projection_wxfilename = po.GetArg(1);
-
-    for (int32 i = 2; i <= po.NumArgs(); i++) {
-      bool binary_in, add = true;
-      Input ki(po.GetArg(i), &binary_in);
-      fte.Read(ki.Stream(), binary_in, add);
-    }
-
-    Matrix<BaseFloat> mat;
-    TpMatrix<BaseFloat> cholesky;
-    fte.Estimate(opts, &mat,
-                 (write_cholesky != "" || write_within_covar != "" ?
-                  &cholesky : NULL));
-    WriteKaldiObject(mat, projection_wxfilename, binary);
-    if (write_cholesky != "") {
-      WriteKaldiObject(cholesky, write_cholesky, binary);
-    }
-    if (write_within_covar != "") {
-      SpMatrix<BaseFloat> within_var(cholesky.NumRows());
-      within_var.AddTp2(1.0, cholesky, kNoTrans, 0.0);
-      WriteKaldiObject(within_var, write_within_covar, binary);
-    }
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-get-weighted-egs.cc b/src/nnet2bin/nnet-get-weighted-egs.cc
deleted file mode 100644
index a3099ad8017..00000000000
--- a/src/nnet2bin/nnet-get-weighted-egs.cc
+++ /dev/null
@@ -1,232 +0,0 @@
-// nnet2bin/nnet-get-weighted-egs.cc
-
-// Copyright 2013-2014  (Author: Vimal Manohar)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/nnet-example-functions.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-// returns an integer randomly drawn with expected value "expected_count"
-// (will be either floor(expected_count) or ceil(expected_count)).
-// this will go into an infinite loop if expected_count is very huge, but
-// it should never be that huge.
-// In the normal case, "expected_count" will be between zero and one.
-int32 GetCount(double expected_count) {
-  KALDI_ASSERT(expected_count >= 0.0);
-  int32 ans = 0;
-  while (expected_count > 1.0) {
-    ans++;
-    expected_count--;
-  }
-  if (WithProb(expected_count))
-    ans++;
-  return ans;
-}
-
-static void ProcessFile(const MatrixBase<BaseFloat> &feats,
-                        const Posterior &pdf_post,
-                        const std::string &utt_id,
-                        const Vector<BaseFloat> &weights,
-                        int32 left_context,
-                        int32 right_context,
-                        int32 const_feat_dim,
-                        BaseFloat keep_proportion,
-                        BaseFloat weight_threshold,
-                        bool use_frame_selection,
-                        bool use_frame_weights,
-                        int64 *num_frames_written,
-                        int64 *num_frames_skipped,
-                        NnetExampleWriter *example_writer) {
-  KALDI_ASSERT(feats.NumRows() == static_cast<int32>(pdf_post.size()));
-  int32 feat_dim = feats.NumCols();
-  KALDI_ASSERT(const_feat_dim < feat_dim);
-  int32 basic_feat_dim = feat_dim - const_feat_dim;
-  NnetExample eg;
-  Matrix<BaseFloat> input_frames(left_context + 1 + right_context,
-                                 basic_feat_dim);
-  eg.left_context = left_context;
-  // TODO: modify this code, and this binary itself, to support the --num-frames
-  // option to allow multiple frames per eg.
-  for (int32 i = 0; i < feats.NumRows(); i++) {
-    int32 count = GetCount(keep_proportion); // number of times
-    // we'll write this out (1 by default).
-    if (count > 0) {
-      // Set up "input_frames".
-      for (int32 j = -left_context; j <= right_context; j++) {
-        int32 j2 = j + i;
-        if (j2 < 0) j2 = 0;
-        if (j2 >= feats.NumRows()) j2 = feats.NumRows() - 1;
-        SubVector<BaseFloat> src(feats, j2), dest(input_frames,
-                                                  j + left_context);
-        dest.CopyFromVec(src);
-      }
-      eg.labels.push_back(pdf_post[i]);
-      eg.input_frames = input_frames;
-      if (const_feat_dim > 0) {
-        // we'll normally reach here if we're using online-estimated iVectors.
-        SubVector<BaseFloat> const_part(feats.Row(i),
-                                        basic_feat_dim, const_feat_dim);
-        eg.spk_info.CopyFromVec(const_part);
-      }
-      if (use_frame_selection) {
-        if (weights(i) < weight_threshold) {
-          (*num_frames_skipped)++;
-          continue;
-        }
-      }
-      std::ostringstream os;
-      os << utt_id << "-" << i;
-      std::string key = os.str(); // key in the archive is the number of the example
-
-      for (int32 c = 0; c < count; c++)
-        example_writer->Write(key, eg);
-    }
-  }
-}
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Get frame-by-frame examples of data for neural network training.\n"
-        "Essentially this is a format change from features and posteriors\n"
-        "into a special frame-by-frame format.  To split randomly into\n"
-        "different subsets, do nnet-copy-egs with --random=true, but\n"
-        "note that this does not randomize the order of frames.\n"
-        "\n"
-        "Usage:  nnet-get-weighted-egs [options] <features-rspecifier> "
-        "<pdf-post-rspecifier> <weights-rspecifier> <training-examples-out>\n"
-        "\n"
-        "An example [where $feats expands to the actual features]:\n"
-        "nnet-get-weighted-egs --left-context=8 --right-context=8 \"$feats\" \\\n"
-        "  \"ark:gunzip -c exp/nnet/ali.1.gz | ali-to-pdf exp/nnet/1.nnet ark:- ark:- | ali-to-post ark:- ark:- |\" \\\n"
-        "   ark:- \n"
-        "Note: the --left-context and --right-context would be derived from\n"
-        "the output of nnet-info.";
-        
-    
-    int32 left_context = 0, right_context = 0, const_feat_dim = 0;
-    int32 srand_seed = 0;
-    BaseFloat keep_proportion = 1.0;
-    BaseFloat weight_threshold = 0.0;
-    bool use_frame_selection = true, use_frame_weights=false;
-    
-    ParseOptions po(usage);
-    po.Register("left-context", &left_context, "Number of frames of left context "
-                "the neural net requires.");
-    po.Register("right-context", &right_context, "Number of frames of right context "
-                "the neural net requires.");
-    po.Register("const-feat-dim", &const_feat_dim, "If specified, the last "
-                "const-feat-dim dimensions of the feature input are treated as "
-                "constant over the context window (so are not spliced)");
-    po.Register("keep-proportion", &keep_proportion, "If <1.0, this program will "
-                "randomly keep this proportion of the input samples.  If >1.0, it will "
-                "in expectation copy a sample this many times.  It will copy it a number "
-                "of times equal to floor(keep-proportion) or ceil(keep-proportion).");
-    po.Register("srand", &srand_seed, "Seed for random number generator "
-                "(only relevant if --keep-proportion != 1.0)");
-    po.Register("weight-threshold", &weight_threshold, "Keep only frames with weights "
-                "above this threshold.");
-    po.Register("use-frame-selection", &use_frame_selection, "Remove the frames below threshold.");
-    po.Register("use-frame-weights", &use_frame_weights, "Scale the error derivatives by the weight");
-    
-    po.Read(argc, argv);
-
-    srand(srand_seed);
-    
-    if (po.NumArgs() != 4) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string feature_rspecifier = po.GetArg(1),
-        pdf_post_rspecifier = po.GetArg(2),
-        weights_rspecifier = po.GetArg(3),
-        examples_wspecifier = po.GetArg(4);
-
-    // Read in all the training files.
-    SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier);
-    RandomAccessPosteriorReader pdf_post_reader(pdf_post_rspecifier);
-    RandomAccessBaseFloatVectorReader weights_reader(weights_rspecifier);
-    NnetExampleWriter example_writer(examples_wspecifier);
-    
-    int32 num_done = 0, num_err = 0;
-    int64 num_frames_written = 0;
-    int64 num_frames_skipped = 0;
-    
-    for (; !feat_reader.Done(); feat_reader.Next()) {
-      std::string key = feat_reader.Key();
-      const Matrix<BaseFloat> &feats = feat_reader.Value();
-      if (!pdf_post_reader.HasKey(key)) {
-        KALDI_WARN << "No pdf-level posterior for key " << key;
-        num_err++;
-      } else {
-        const Posterior &pdf_post = pdf_post_reader.Value(key);
-        if (pdf_post.size() != feats.NumRows()) {
-          KALDI_WARN << "Posterior has wrong size " << pdf_post.size()
-                     << " versus " << feats.NumRows();
-          num_err++;
-          continue;
-        }
-        if (!weights_reader.HasKey(key)) {
-          KALDI_ERR << "No weights for utterance " << key;
-          //ProcessFile(feats, pdf_post, NULL,
-          //    left_context, right_context, const_feat_dim, keep_proportion,
-          //    weight_threshold, false, false, &num_frames_written, 
-          //    &num_frames_skipped, &example_writer);
-        } else {
-          Vector<BaseFloat> weights = weights_reader.Value(key);
-          if (weights.Dim() != static_cast<int32>(pdf_post.size())) {
-            KALDI_WARN << "Weights for utterance " << key
-              << " have wrong size, " << weights.Dim()
-              << " vs. " << pdf_post.size();
-            num_err++;
-            continue;
-          }
-          ProcessFile(feats, pdf_post, key, weights, left_context, right_context,
-                      const_feat_dim, keep_proportion, weight_threshold,
-                      use_frame_selection, use_frame_weights,
-                      &num_frames_written, &num_frames_skipped, &example_writer);
-        }
-        num_done++;
-      }
-    }
-
-    KALDI_LOG << "Finished generating examples, "
-              << "successfully processed " << num_done
-              << " feature files, wrote " << num_frames_written << " examples, "
-              << "skipped " << num_frames_skipped << " examples, "
-              << num_err << " files had errors.";
-    return (num_done == 0 ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-init.cc b/src/nnet2bin/nnet-init.cc
deleted file mode 100644
index 7f29f5e8306..00000000000
--- a/src/nnet2bin/nnet-init.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-// nnet2bin/nnet-init.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet2/am-nnet.h"
-#include "hmm/transition-model.h"
-#include "tree/context-dep.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Initialize the nnet2 neural network from a config file with a line for each\n"
-        "component.  Note, this only outputs the neural net itself, not the associated\n"
-        "information such as the transition-model; you'll probably want to pipe\n"
-        "the output into something like nnet-am-init.\n"
-        "\n"
-        "Usage:  nnet-init [options] <config-in> <raw-nnet-out>\n"
-        "e.g.:\n"
-        " nnet-init nnet.config 1.raw\n";
-    
-    bool binary_write = true;
-    int32 srand_seed = 0;
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("srand", &srand_seed, "Seed for random number generator");
-    
-    po.Read(argc, argv);
-    srand(srand_seed);
-    
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string config_rxfilename = po.GetArg(1),
-        raw_nnet_wxfilename = po.GetArg(2);
-    
-    Nnet nnet;
-    {
-      bool binary;
-      Input ki(config_rxfilename, &binary);
-      KALDI_ASSERT(!binary && "Expect config file to contain text.");
-      nnet.Init(ki.Stream());
-    }
-
-    WriteKaldiObject(nnet, raw_nnet_wxfilename, binary_write);
-    KALDI_LOG << "Initialized raw neural net and wrote it to "
-              << raw_nnet_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-insert.cc b/src/nnet2bin/nnet-insert.cc
deleted file mode 100644
index caf4f58a5e8..00000000000
--- a/src/nnet2bin/nnet-insert.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-// nnet2bin/nnet-insert.cc
-
-// Copyright 2012-2014  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet2/am-nnet.h"
-#include "nnet2/nnet-functions.h"
-#include "hmm/transition-model.h"
-#include "tree/context-dep.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Insert components into a neural network-based acoustic model.\n"
-        "This is mostly intended for adding new hidden layers to neural networks.\n"
-        "You can either specify the option --insert-at=n (specifying the index of\n"
-        "the component after which you want your neural network inserted), or by\n"
-        "default this program will insert it just before the component before the\n"
-        "softmax component.  CAUTION: It will also randomize the parameters of the\n"
-        "component before the softmax (typically AffineComponent), with stddev equal\n"
-        "to the --stddev-factor option (default 0.1), times the inverse square root\n"
-        "of the number of inputs to that component.\n"
-        "Set --randomize-next-component=false to turn this off.\n"
-        "\n"
-        "Usage:  nnet-insert [options] <nnet-in> <raw-nnet-to-insert-in> <nnet-out>\n"
-        "e.g.:\n"
-        " nnet-insert 1.nnet \"nnet-init hidden_layer.config -|\" 2.nnet\n";
-
-    bool binary_write = true;
-    bool randomize_next_component = true;
-    int32 insert_at = -1;
-    BaseFloat stddev_factor = 0.1;
-    int32 srand_seed = 0;
-    
-    ParseOptions po(usage);
-    
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("randomize-next-component", &randomize_next_component,
-                "If true, randomize the parameters of the next component after "
-                "what we insert (which must be updatable).");
-    po.Register("insert-at", &insert_at, "Inserts new components before the "
-                "specified component (note: indexes are zero-based).  If <0, "
-                "inserts before the component before the softmax.");
-    po.Register("stddev-factor", &stddev_factor, "Factor on the standard "
-                "deviation when randomizing next component (only relevant if "
-                "--randomize-next-component=true");
-    po.Register("srand", &srand_seed, "Seed for random number generator");
-    
-    po.Read(argc, argv);
-    srand(srand_seed);
-    
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet_rxfilename = po.GetArg(1),
-        raw_nnet_rxfilename = po.GetArg(2),
-        nnet_wxfilename = po.GetArg(3);
-    
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary;
-      Input ki(nnet_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_nnet.Read(ki.Stream(), binary);
-    }
-
-    Nnet src_nnet; // the one we'll insert.
-    ReadKaldiObject(raw_nnet_rxfilename, &src_nnet);
-
-    if (insert_at == -1) {
-      if ((insert_at = IndexOfSoftmaxLayer(am_nnet.GetNnet())) == -1)
-        KALDI_ERR << "We don't know where to insert the new components: "
-            "the neural net doesn't have exactly one softmax component, "
-            "and you didn't use the --insert-at option.";
-      insert_at--; // we want to insert before the linearity before
-      // the softmax layer.
-    }
-    
-    // This function is declared in nnet-functions.h
-    InsertComponents(src_nnet,
-                     insert_at,
-                     &(am_nnet.GetNnet()));
-    KALDI_LOG << "Inserted " << src_nnet.NumComponents() << " components at "
-              << "position " << insert_at;
-
-    if (randomize_next_component) {
-      int32 c = insert_at + src_nnet.NumComponents();
-      kaldi::nnet2::Component *component = &(am_nnet.GetNnet().GetComponent(c));
-      UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(component);
-      if (!uc)
-        KALDI_ERR << "You have --randomize-next-component=true, but the "
-                  << "component to randomize is not updatable: "
-                  << component->Info();
-      bool treat_as_gradient = false;
-      uc->SetZero(treat_as_gradient);
-      BaseFloat stddev = stddev_factor /
-          std::sqrt(static_cast<BaseFloat>(uc->InputDim()));
-      uc->PerturbParams(stddev);
-      KALDI_LOG << "Randomized component index " << c << " with stddev "
-                << stddev;
-    }
-
-   
-    {
-      Output ko(nnet_wxfilename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-      am_nnet.Write(ko.Stream(), binary_write);
-    }
-    KALDI_LOG << "Write neural-net acoustic model to " <<  nnet_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-latgen-faster-parallel.cc b/src/nnet2bin/nnet-latgen-faster-parallel.cc
deleted file mode 100644
index 658d1fd8db6..00000000000
--- a/src/nnet2bin/nnet-latgen-faster-parallel.cc
+++ /dev/null
@@ -1,207 +0,0 @@
-// nnet2bin/nnet-latgen-faster-parallel.cc
-
-// Copyright 2009-2013   Microsoft Corporation
-//                       Johns Hopkins University (author: Daniel Povey)
-//                2014   Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "tree/context-dep.h"
-#include "hmm/transition-model.h"
-#include "fstext/kaldi-fst-io.h"
-#include "decoder/decoder-wrappers.h"
-#include "nnet2/decodable-am-nnet.h"
-#include "base/timer.h"
-#include "util/kaldi-thread.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    using fst::SymbolTable;
-    using fst::Fst;
-    using fst::StdArc;
-
-    const char *usage =
-        "Generate lattices using neural net model.\n"
-        "Usage: nnet-latgen-faster-parallel [options] <nnet-in> <fst-in|fsts-rspecifier> <features-rspecifier>"
-        " <lattice-wspecifier> [ <words-wspecifier> [<alignments-wspecifier>] ]\n";
-    ParseOptions po(usage);
-    Timer timer;
-    bool allow_partial = false;
-    BaseFloat acoustic_scale = 0.1;
-    LatticeFasterDecoderConfig config;
-    TaskSequencerConfig sequencer_config; // has --num-threads option
-
-    std::string word_syms_filename;
-    sequencer_config.Register(&po);
-    config.Register(&po);
-    po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods");
-    po.Register("word-symbol-table", &word_syms_filename, "Symbol table for words [for debug output]");
-    po.Register("allow-partial", &allow_partial, "If true, produce output even if end state was not reached.");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() < 4 || po.NumArgs() > 6) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_in_filename = po.GetArg(1),
-        fst_in_str = po.GetArg(2),
-        feature_rspecifier = po.GetArg(3),
-        lattice_wspecifier = po.GetArg(4),
-        words_wspecifier = po.GetOptArg(5),
-        alignment_wspecifier = po.GetOptArg(6);
-
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary;
-      Input ki(model_in_filename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_nnet.Read(ki.Stream(), binary);
-    }
-
-    bool determinize = config.determinize_lattice;
-    CompactLatticeWriter compact_lattice_writer;
-    LatticeWriter lattice_writer;
-    if (! (determinize ? compact_lattice_writer.Open(lattice_wspecifier)
-           : lattice_writer.Open(lattice_wspecifier)))
-      KALDI_ERR << "Could not open table for writing lattices: "
-                 << lattice_wspecifier;
-
-    TaskSequencer<DecodeUtteranceLatticeFasterClass> sequencer(sequencer_config);
-
-    Int32VectorWriter words_writer(words_wspecifier);
-
-    Int32VectorWriter alignment_writer(alignment_wspecifier);
-
-    fst::SymbolTable *word_syms = NULL;
-    if (word_syms_filename != "")
-      if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename)))
-        KALDI_ERR << "Could not read symbol table from file "
-                   << word_syms_filename;
-
-    // We support reading in a vector to describe each speaker, if the neural
-    // net requires this (i.e. it was trained with this).
-
-    double tot_like = 0.0;
-    kaldi::int64 frame_count = 0;
-    int num_done = 0, num_err = 0;
-    Fst<StdArc> *decode_fst = NULL;
-    if (ClassifyRspecifier(fst_in_str, NULL, NULL) == kNoRspecifier) {
-      SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-
-      decode_fst = fst::ReadFstKaldiGeneric(fst_in_str);
-      timer.Reset();
-
-      {
-
-        for (; !feature_reader.Done(); feature_reader.Next()) {
-          std::string utt = feature_reader.Key();
-          const Matrix<BaseFloat> &features (feature_reader.Value());
-          if (features.NumRows() == 0) {
-            KALDI_WARN << "Zero-length utterance: " << utt;
-            num_err++;
-            continue;
-          }
-          bool pad_input = true;
-          DecodableAmNnetParallel *nnet_decodable = new DecodableAmNnetParallel(
-              trans_model, am_nnet,
-              new CuMatrix<BaseFloat>(features),
-              pad_input, acoustic_scale);
-
-          LatticeFasterDecoder *decoder = new LatticeFasterDecoder(*decode_fst,
-                                                                   config);
-
-          DecodeUtteranceLatticeFasterClass *task =
-              new DecodeUtteranceLatticeFasterClass(
-                  decoder, nnet_decodable, // takes ownership of these two.
-                  trans_model, word_syms, utt, acoustic_scale, determinize,
-                  allow_partial, &alignment_writer, &words_writer,
-                  &compact_lattice_writer, &lattice_writer,
-                  &tot_like, &frame_count, &num_done, &num_err, NULL);
-
-          sequencer.Run(task); // takes ownership of "task",
-                               // and will delete it when done.
-        }
-      }
-    } else { // We have different FSTs for different utterances.
-      SequentialTableReader<fst::VectorFstHolder> fst_reader(fst_in_str);
-      RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
-      for (; !fst_reader.Done(); fst_reader.Next()) {
-        std::string utt = fst_reader.Key();
-        if (!feature_reader.HasKey(utt)) {
-          KALDI_WARN << "Not decoding utterance " << utt
-                     << " because no features available.";
-          num_err++;
-          continue;
-        }
-        const Matrix<BaseFloat> &features = feature_reader.Value(utt);
-        if (features.NumRows() == 0) {
-          KALDI_WARN << "Zero-length utterance: " << utt;
-          num_err++;
-          continue;
-        }
-
-        // This constructor of LatticeFasterDecoder takes ownership of the FST.
-        LatticeFasterDecoder *decoder =
-            new LatticeFasterDecoder(config, fst_reader.Value().Copy());
-
-        bool pad_input = true;
-        DecodableAmNnetParallel *nnet_decodable = new DecodableAmNnetParallel(
-            trans_model, am_nnet,
-            new CuMatrix<BaseFloat>(features),
-            pad_input, acoustic_scale);
-
-        DecodeUtteranceLatticeFasterClass *task =
-            new DecodeUtteranceLatticeFasterClass(
-                decoder, nnet_decodable, // takes ownership of these two.
-                trans_model, word_syms, utt, acoustic_scale, determinize,
-                allow_partial, &alignment_writer, &words_writer,
-                &compact_lattice_writer, &lattice_writer,
-                &tot_like, &frame_count, &num_done, &num_err, NULL);
-
-        sequencer.Run(task); // takes ownership of "task",
-                             // and will delete it when done.
-      }
-    }
-    sequencer.Wait(); // Waits for all tasks to be done.
-    delete decode_fst;
-
-    double elapsed = timer.Elapsed();
-    KALDI_LOG << "Time taken "<< elapsed
-              << "s: real-time factor per thread assuming 100 frames/sec is "
-              << (sequencer_config.num_threads * elapsed * 100.0 / frame_count);
-    KALDI_LOG << "Done " << num_done << " utterances, failed for "
-              << num_err;
-    KALDI_LOG << "Overall log-likelihood per frame is "
-              << (tot_like / frame_count) << " over " << frame_count
-              << " frames.";
-
-    delete word_syms;
-    if (num_done != 0) return 0;
-    else return 1;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-latgen-faster.cc b/src/nnet2bin/nnet-latgen-faster.cc
deleted file mode 100644
index e83c65f5fc9..00000000000
--- a/src/nnet2bin/nnet-latgen-faster.cc
+++ /dev/null
@@ -1,196 +0,0 @@
-// nnet2bin/nnet-latgen-faster.cc
-
-// Copyright 2009-2012   Microsoft Corporation
-//                       Johns Hopkins University (author: Daniel Povey)
-//                2014   Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "tree/context-dep.h"
-#include "hmm/transition-model.h"
-#include "fstext/kaldi-fst-io.h"
-#include "decoder/decoder-wrappers.h"
-#include "nnet2/decodable-am-nnet.h"
-#include "base/timer.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    using fst::SymbolTable;
-    using fst::Fst;
-    using fst::StdArc;
-
-    const char *usage =
-        "Generate lattices using neural net model.\n"
-        "Usage: nnet-latgen-faster [options] <nnet-in> <fst-in|fsts-rspecifier> <features-rspecifier>"
-        " <lattice-wspecifier> [ <words-wspecifier> [<alignments-wspecifier>] ]\n";
-    ParseOptions po(usage);
-    Timer timer;
-    bool allow_partial = false;
-    BaseFloat acoustic_scale = 0.1;
-    LatticeFasterDecoderConfig config;
-
-    std::string word_syms_filename;
-    config.Register(&po);
-    po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods");
-    po.Register("word-symbol-table", &word_syms_filename, "Symbol table for words [for debug output]");
-    po.Register("allow-partial", &allow_partial, "If true, produce output even if end state was not reached.");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() < 4 || po.NumArgs() > 6) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_in_filename = po.GetArg(1),
-        fst_in_str = po.GetArg(2),
-        feature_rspecifier = po.GetArg(3),
-        lattice_wspecifier = po.GetArg(4),
-        words_wspecifier = po.GetOptArg(5),
-        alignment_wspecifier = po.GetOptArg(6);
-
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary;
-      Input ki(model_in_filename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_nnet.Read(ki.Stream(), binary);
-    }
-
-    bool determinize = config.determinize_lattice;
-    CompactLatticeWriter compact_lattice_writer;
-    LatticeWriter lattice_writer;
-    if (! (determinize ? compact_lattice_writer.Open(lattice_wspecifier)
-           : lattice_writer.Open(lattice_wspecifier)))
-      KALDI_ERR << "Could not open table for writing lattices: "
-                 << lattice_wspecifier;
-
-    Int32VectorWriter words_writer(words_wspecifier);
-
-    Int32VectorWriter alignment_writer(alignment_wspecifier);
-
-    fst::SymbolTable *word_syms = NULL;
-    if (word_syms_filename != "")
-      if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename)))
-        KALDI_ERR << "Could not read symbol table from file "
-                   << word_syms_filename;
-
-
-    double tot_like = 0.0;
-    kaldi::int64 frame_count = 0;
-    int num_success = 0, num_fail = 0;
-
-    if (ClassifyRspecifier(fst_in_str, NULL, NULL) == kNoRspecifier) {
-      SequentialBaseFloatCuMatrixReader feature_reader(feature_rspecifier);
-
-      // Input FST is just one FST, not a table of FSTs.
-      Fst<StdArc> *decode_fst = fst::ReadFstKaldiGeneric(fst_in_str);
-      timer.Reset();
-
-      {
-        LatticeFasterDecoder decoder(*decode_fst, config);
-
-        for (; !feature_reader.Done(); feature_reader.Next()) {
-          std::string utt = feature_reader.Key();
-          const CuMatrix<BaseFloat> &features (feature_reader.Value());
-          if (features.NumRows() == 0) {
-            KALDI_WARN << "Zero-length utterance: " << utt;
-            num_fail++;
-            continue;
-          }
-          bool pad_input = true;
-          DecodableAmNnet nnet_decodable(trans_model,
-                                         am_nnet,
-                                         features,
-                                         pad_input,
-                                         acoustic_scale);
-          double like;
-          if (DecodeUtteranceLatticeFaster(
-                  decoder, nnet_decodable, trans_model, word_syms, utt,
-                  acoustic_scale, determinize, allow_partial, &alignment_writer,
-                  &words_writer, &compact_lattice_writer, &lattice_writer,
-                  &like)) {
-            tot_like += like;
-            frame_count += features.NumRows();
-            num_success++;
-          } else num_fail++;
-        }
-      }
-      delete decode_fst; // delete this only after decoder goes out of scope.
-    } else { // We have different FSTs for different utterances.
-      SequentialTableReader<fst::VectorFstHolder> fst_reader(fst_in_str);
-      RandomAccessBaseFloatCuMatrixReader feature_reader(feature_rspecifier);
-      for (; !fst_reader.Done(); fst_reader.Next()) {
-        std::string utt = fst_reader.Key();
-        if (!feature_reader.HasKey(utt)) {
-          KALDI_WARN << "Not decoding utterance " << utt
-                     << " because no features available.";
-          num_fail++;
-          continue;
-        }
-        const CuMatrix<BaseFloat> &features = feature_reader.Value(utt);
-        if (features.NumRows() == 0) {
-          KALDI_WARN << "Zero-length utterance: " << utt;
-          num_fail++;
-          continue;
-        }
-
-        LatticeFasterDecoder decoder(fst_reader.Value(), config);
-
-        bool pad_input = true;
-        DecodableAmNnet nnet_decodable(trans_model,
-                                       am_nnet,
-                                       features,
-                                       pad_input,
-                                       acoustic_scale);
-        double like;
-        if (DecodeUtteranceLatticeFaster(
-                decoder, nnet_decodable, trans_model, word_syms, utt,
-                acoustic_scale, determinize, allow_partial, &alignment_writer,
-                &words_writer, &compact_lattice_writer, &lattice_writer,
-                &like)) {
-          tot_like += like;
-          frame_count += features.NumRows();
-          num_success++;
-        } else num_fail++;
-      }
-    }
-
-    double elapsed = timer.Elapsed();
-    KALDI_LOG << "Time taken "<< elapsed
-              << "s: real-time factor assuming 100 frames/sec is "
-              << (elapsed*100.0/frame_count);
-    KALDI_LOG << "Done " << num_success << " utterances, failed for "
-              << num_fail;
-    KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over "
-              << frame_count<<" frames.";
-
-    delete word_syms;
-    if (num_success != 0) return 0;
-    else return 1;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-modify-learning-rates.cc b/src/nnet2bin/nnet-modify-learning-rates.cc
deleted file mode 100644
index 55cab630080..00000000000
--- a/src/nnet2bin/nnet-modify-learning-rates.cc
+++ /dev/null
@@ -1,211 +0,0 @@
-// nnet2bin/nnet-modify-learning-rates.cc
-
-// Copyright 2013  Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/train-nnet.h"
-#include "nnet2/am-nnet.h"
-
-
-namespace kaldi {
-namespace nnet2 {
-void SetMaxChange(BaseFloat max_change, Nnet *nnet) {
-  for (int32 c = 0; c < nnet->NumComponents(); c++) {
-    Component *component = &(nnet->GetComponent(c));
-    AffineComponentPreconditioned *ac =
-        dynamic_cast<AffineComponentPreconditioned*>(component);
-    if (ac != NULL)
-      ac->SetMaxChange(max_change);
-  }
-}
-}
-}
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "This program modifies the learning rates so as to equalize the\n"
-        "relative changes in parameters for each layer, while keeping their\n"
-        "geometric mean the same (or changing it to a value specified using\n"
-        "the --average-learning-rate option).\n"
-        "\n"
-        "Usage: nnet-modify-learning-rates [options] <prev-model> \\\n"
-        "                                  <cur-model> <modified-cur-model>\n"
-        "e.g.: nnet-modify-learning-rates --average-learning-rate=0.0002 \\\n"
-        "                                 5.mdl 6.mdl 6.mdl\n";
-
-    bool binary_write = true;
-    bool retroactive = false;
-    BaseFloat average_learning_rate = 0.0;
-    BaseFloat first_layer_factor = 1.0;
-    BaseFloat last_layer_factor = 1.0;
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("average-learning-rate", &average_learning_rate,
-                "If supplied, change learning rate geometric mean to the given "
-                "value.");
-    po.Register("first-layer-factor", &first_layer_factor, "Factor that "
-                "reduces the target relative learning rate for first layer.");
-    po.Register("last-layer-factor", &last_layer_factor, "Factor that "
-                "reduces the target relative learning rate for last layer.");
-    po.Register("retroactive", &retroactive, "If true, scale the parameter "
-                "differences as well.");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    KALDI_ASSERT(average_learning_rate >= 0);
-
-    std::string prev_nnet_rxfilename = po.GetArg(1),
-        cur_nnet_rxfilename = po.GetArg(2),
-        modified_cur_nnet_rxfilename = po.GetOptArg(3);
-
-    TransitionModel trans_model;
-    AmNnet am_prev_nnet, am_cur_nnet;
-    {
-      bool binary_read;
-      Input ki(prev_nnet_rxfilename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_prev_nnet.Read(ki.Stream(), binary_read);
-    }
-    {
-      bool binary_read;
-      Input ki(cur_nnet_rxfilename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_cur_nnet.Read(ki.Stream(), binary_read);
-    }
-
-    if (am_prev_nnet.GetNnet().GetParameterDim() !=
-        am_cur_nnet.GetNnet().GetParameterDim()) {
-      KALDI_WARN << "Parameter-dim mismatch, cannot equalize the relative "
-                 << "changes in parameters for each layer.";
-      exit(0);
-    }
-
-    int32 ret = 0;
-
-    // Gets relative parameter differences.
-    int32 num_updatable = am_prev_nnet.GetNnet().NumUpdatableComponents();
-    Vector<BaseFloat> relative_diff(num_updatable);
-    {
-      Nnet diff_nnet(am_prev_nnet.GetNnet());
-      diff_nnet.AddNnet(-1.0, am_cur_nnet.GetNnet());
-      diff_nnet.ComponentDotProducts(diff_nnet, &relative_diff);
-      relative_diff.ApplyPow(0.5);
-      Vector<BaseFloat> baseline_prod(num_updatable);
-      am_prev_nnet.GetNnet().ComponentDotProducts(am_prev_nnet.GetNnet(),
-                                                  &baseline_prod);
-      baseline_prod.ApplyPow(0.5);
-      relative_diff.DivElements(baseline_prod);
-      KALDI_LOG << "Relative parameter differences per layer are "
-                << relative_diff;
-
-      // If relative parameter difference for a certain is zero, set it to the
-      // mean of the rest values.
-      int32 num_zero = 0;
-      for (int32 i = 0; i < num_updatable; i++) {
-        if (relative_diff(i) == 0.0) {
-          num_zero++;
-        }
-      }
-      if (num_zero > 0) {
-        BaseFloat average_diff = relative_diff.Sum()
-            / static_cast<BaseFloat>(num_updatable - num_zero);
-        for (int32 i = 0; i < num_updatable; i++) {
-          if (relative_diff(i) == 0.0) {
-            relative_diff(i) = average_diff;
-          }
-        }
-        KALDI_LOG << "Zeros detected in the relative parameter difference "
-                  << "vector, updating the vector to " << relative_diff;
-      }
-    }
-
-    // Gets learning rates for previous neural net.
-    Vector<BaseFloat> prev_nnet_learning_rates(num_updatable),
-        cur_nnet_learning_rates(num_updatable);
-    am_prev_nnet.GetNnet().GetLearningRates(&prev_nnet_learning_rates);
-    am_cur_nnet.GetNnet().GetLearningRates(&cur_nnet_learning_rates);
-    KALDI_LOG << "Learning rates for previous model per layer are "
-              << prev_nnet_learning_rates;
-    KALDI_LOG << "Learning rates for current model per layer are "
-              << cur_nnet_learning_rates;
-    
-    // Gets target geometric mean.
-    BaseFloat target_geometric_mean = 0.0; 
-    if (average_learning_rate == 0.0) {
-      target_geometric_mean = Exp(cur_nnet_learning_rates.SumLog()
-                                  / static_cast<BaseFloat>(num_updatable));
-    } else {
-      target_geometric_mean = average_learning_rate;
-    }
-    KALDI_ASSERT(target_geometric_mean > 0.0);
-
-    // Works out the new learning rates.  We start from the previous model;
-    // this ensures that if this program is run twice, we get consistent
-    // results even if it's overwritten the current model.
-    Vector<BaseFloat> nnet_learning_rates(prev_nnet_learning_rates);
-    nnet_learning_rates.DivElements(relative_diff);
-    KALDI_ASSERT(last_layer_factor > 0.0);
-    nnet_learning_rates(num_updatable - 1) *= last_layer_factor;
-    KALDI_ASSERT(first_layer_factor > 0.0);
-    nnet_learning_rates(0) *= first_layer_factor;
-    BaseFloat cur_geometric_mean = Exp(nnet_learning_rates.SumLog()
-                                 / static_cast<BaseFloat>(num_updatable));
-    nnet_learning_rates.Scale(target_geometric_mean / cur_geometric_mean);
-    KALDI_LOG << "New learning rates for current model per layer are "
-              << nnet_learning_rates;
-
-    // Changes the parameter differences if --retroactivate is set to true.
-    if (retroactive) {
-      Vector<BaseFloat> scale_factors(nnet_learning_rates);
-      scale_factors.DivElements(prev_nnet_learning_rates);
-      am_cur_nnet.GetNnet().AddNnet(-1.0, am_prev_nnet.GetNnet());
-      am_cur_nnet.GetNnet().ScaleComponents(scale_factors);
-      am_cur_nnet.GetNnet().AddNnet(1.0, am_prev_nnet.GetNnet());
-      KALDI_LOG << "Scale parameter difference retroactively. Scaling factors "
-                << "are " << scale_factors;
-    }
-
-    // Sets learning rates and writes updated model.
-    am_cur_nnet.GetNnet().SetLearningRates(nnet_learning_rates);
-
-    SetMaxChange(0.0, &(am_cur_nnet.GetNnet()));
-    
-    Output ko(modified_cur_nnet_rxfilename, binary_write);
-    trans_model.Write(ko.Stream(), binary_write);
-    am_cur_nnet.Write(ko.Stream(), binary_write);
-
-    return ret;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-normalize-stddev.cc b/src/nnet2bin/nnet-normalize-stddev.cc
deleted file mode 100644
index b23faef5fc1..00000000000
--- a/src/nnet2bin/nnet-normalize-stddev.cc
+++ /dev/null
@@ -1,174 +0,0 @@
-// nnet2bin/nnet-normalize-stddev.cc
-
-// Copyright 2013  Guoguo Chen
-//           2014  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/train-nnet.h"
-#include "nnet2/am-nnet.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "This program first identifies any affine or block affine layers that\n"
-        "are followed by pnorm and then renormalize layers. Then it rescales\n"
-        "those layers such that the parameter stddev is 1.0 after scaling\n"
-        "(the target stddev is configurable by the --stddev option).\n"
-        "If you supply the option --stddev-from=<model-filename>, it rescales\n"
-        "those layers to match the standard deviations of corresponding layers\n"
-        "in the specified model.\n"
-        "\n"
-        "Usage: nnet-normalize-stddev [options] <model-in> <model-out>\n"
-        " e.g.: nnet-normalize-stddev final.mdl final.mdl\n";
-
-    bool binary_write = true;
-    BaseFloat stddev = 1.0;
-    std::string reference_model_filename;
-
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("stddev-from", &reference_model_filename, "Reference model");
-    po.Register("stddev", &stddev, "Target standard deviation that we normalize "
-                "to (note: is overridden by --stddev-from option, if supplied)");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet_rxfilename = po.GetArg(1),
-        normalized_nnet_rxfilename = po.GetArg(2);
-
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary_read;
-      Input ki(nnet_rxfilename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_nnet.Read(ki.Stream(), binary_read);
-    }
-
-    int32 ret = 0;
-
-    // Works out the layers that we would like to normalize: any affine or block
-    // affine layers that are followed by pnorm and then renormalize layers.
-    std::vector<int32> identified_components;
-    for (int32 c = 0; c < am_nnet.GetNnet().NumComponents() - 2; c++) {
-      // Checks if the current layer is an affine layer or block affine layer.
-      // Also includes PreconditionedAffineComponent and
-      // PreconditionedAffineComponentOnline, since they are child classes of
-      // AffineComponent.
-      kaldi::nnet2::Component *component = &(am_nnet.GetNnet().GetComponent(c));
-      AffineComponent *ac = dynamic_cast<AffineComponent*>(component);
-      BlockAffineComponent *bac =
-        dynamic_cast<BlockAffineComponent*>(component);
-      if (ac == NULL && bac == NULL)
-        continue;
-
-      // Checks if the next layer is a pnorm layer.
-      component = &(am_nnet.GetNnet().GetComponent(c + 1));
-      PnormComponent *pc = dynamic_cast<PnormComponent*>(component);
-      if (pc == NULL)
-        continue;
-
-      // Checks if the layer after the pnorm layer is a NormalizeComponent
-      // or a PowerComponent followed by a NormalizeComponent
-      component = &(am_nnet.GetNnet().GetComponent(c + 2));
-      NormalizeComponent *nc = dynamic_cast<NormalizeComponent*>(component);
-      PowerComponent *pwc = dynamic_cast<PowerComponent*>(component);
-      if (nc == NULL && pwc == NULL)
-        continue;
-      if (pwc != NULL) {  // verify it's PowerComponent followed by
-                         // NormalizeComponent.
-        if (c + 3 >= am_nnet.GetNnet().NumComponents())
-          continue;
-        component = &(am_nnet.GetNnet().GetComponent(c + 3));
-        nc = dynamic_cast<NormalizeComponent*>(component);
-        if (nc == NULL)
-          continue;
-      }
-      // This is the layer that we would like to normalize.
-      identified_components.push_back(c);
-    }
-
-    AmNnet am_nnet_ref;
-    if (!reference_model_filename.empty()) {
-      bool binary_read;
-      Input ki(reference_model_filename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_nnet_ref.Read(ki.Stream(), binary_read);
-      KALDI_ASSERT(am_nnet_ref.GetNnet().NumComponents() == am_nnet.GetNnet().NumComponents());
-    }
-
-    BaseFloat ref_stddev = 0.0;
-
-    // Normalizes the identified layers.
-    for (int32 c = 0; c < identified_components.size(); c++) {
-      ref_stddev = stddev;
-      if (!reference_model_filename.empty()) {
-        kaldi::nnet2::Component *component =
-            &(am_nnet_ref.GetNnet().GetComponent(identified_components[c]));
-        UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(component);
-        KALDI_ASSERT(uc != NULL);
-        Vector<BaseFloat> params(uc->GetParameterDim());
-        uc->Vectorize(&params);
-        BaseFloat params_average = params.Sum()
-            / static_cast<BaseFloat>(params.Dim());
-        params.Add(-1.0 * params_average);
-        ref_stddev = sqrt(VecVec(params, params)
-            / static_cast<BaseFloat>(params.Dim()));
-      }
-
-      kaldi::nnet2::Component *component =
-          &(am_nnet.GetNnet().GetComponent(identified_components[c]));
-      UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(component);
-      KALDI_ASSERT(uc != NULL);
-      Vector<BaseFloat> params(uc->GetParameterDim());
-      uc->Vectorize(&params);
-      BaseFloat params_average = params.Sum()
-          / static_cast<BaseFloat>(params.Dim());
-      params.Add(-1.0 * params_average);
-      BaseFloat params_stddev = sqrt(VecVec(params, params)
-          / static_cast<BaseFloat>(params.Dim()));
-      if (params_stddev > 0.0) {
-        uc->Scale(ref_stddev / params_stddev);
-        KALDI_LOG << "Normalized component " << identified_components[c];
-      }
-    }
-
-    // Writes the normalized model.
-    Output ko(normalized_nnet_rxfilename, binary_write);
-    trans_model.Write(ko.Stream(), binary_write);
-    am_nnet.Write(ko.Stream(), binary_write);
-
-    return ret;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-relabel-egs.cc b/src/nnet2bin/nnet-relabel-egs.cc
deleted file mode 100644
index 69c6c9923b8..00000000000
--- a/src/nnet2bin/nnet-relabel-egs.cc
+++ /dev/null
@@ -1,168 +0,0 @@
-// nnet2bin/nnet-relabel-egs.cc
-
-// Copyright 2014   Vimal Manohar
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-/** @brief Relabels neural network egs with the read pdf-id alignments
-*/
-
-#include <sstream>
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet2/nnet-example.h"
-
-namespace kaldi {
-  
-  // this functions splits an egs key like <utt_id>-<frame_id> into 
-  // separate utterance id and frame id on the last delimiter.
-  // Returns false if the delimiter is not found in the key.
-  bool SplitEgsKey(const std::string &key, 
-                    std::string *utt_id, int32 *frame_id) {
-    size_t start = 0, found = 0, end = key.size();
-    utt_id->clear();
- 
-    found = key.find_last_of("-", end);
-    // start != end condition is for when the delimiter is at the end
-    
-    if (found != start && start != end && found < end) {
-      *utt_id = key.substr(start, found - start);
-      std::istringstream tmp(key.substr(found + 1, end));
-      tmp >> *frame_id;
-      return true;
-    }
-
-    return false;
-  }
-}
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  using namespace kaldi::nnet2;
-
-  typedef kaldi::int32 int32;
-  typedef kaldi::int64 int64;
-  try {
-    const char *usage =
-        "Relabel neural network egs with the read pdf-id alignments, "
-        "zero-based..\n"
-        "Usage: nnet-relabel-egs [options] <pdf-aligment-rspecifier> "
-        "<egs_rspecifier1> ... <egs_rspecifierN> "
-        "<egs_wspecifier1> ... <egs_wspecifierN>\n"
-        "e.g.: \n"
-        " nnet-relabel-egs ark:1.ali egs_in/egs.1.ark egs_in/egs.2.ark "
-        "egs_out/egs.1.ark egs_out/egs.2.ark\n"
-        "See also: nnet-get-egs, nnet-copy-egs, steps/nnet2/relabel_egs.sh\n";
-
-    ParseOptions po(usage);
-
-    po.Read(argc, argv);
-
-    // Here we expect equal number of input egs archive and output egs archives. 
-    // So the total number of arguments including the alignment specifier must be odd.
-    if (po.NumArgs() < 3 || po.NumArgs() % 2 == 0) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string alignments_rspecifier = po.GetArg(1);
-    int32 num_archives = (po.NumArgs() - 1) / 2;
-    
-    SequentialInt32VectorReader ali_reader(alignments_rspecifier);
-
-    unordered_map<std::string, std::vector<int32>* > utt_to_pdf_ali;
-
-    // Keep statistics
-    int32 num_ali = 0;
-    int64 num_frames_ali = 0, num_frames_egs = 0, 
-          num_frames_missing = 0, num_frames_relabelled = 0;
-
-    // Read alignments and put the pointer in an unordered map
-    // indexed by the key. This is so that we can efficiently find the 
-    // alignment corresponding to the utterance to 
-    // which a particular frame belongs
-    for (; !ali_reader.Done(); ali_reader.Next(), num_ali++) {
-      std::string key = ali_reader.Key();
-      std::vector<int32> *alignment = new std::vector<int32>(ali_reader.Value());
-      std::pair<std::string, std::vector<int32>* > map(key, alignment);
-      utt_to_pdf_ali.insert(map);
-      num_frames_ali += alignment->size();
-    }
-
-    // Read archives of egs sequentially
-    for (int32 i = 0; i < num_archives; i++) {
-      std::string egs_rspecifier(po.GetArg(i+2));
-      std::string egs_wspecifier(po.GetArg(i+2+num_archives));
-
-      SequentialNnetExampleReader egs_reader(egs_rspecifier);
-      NnetExampleWriter egs_writer(egs_wspecifier);
-
-      for (; !egs_reader.Done(); egs_reader.Next(), num_frames_egs++) {
-      
-        std::string key(egs_reader.Key());
-
-        std::string utt_id;
-        int32 frame_id;
-
-        if (!SplitEgsKey(key, &utt_id, &frame_id)) {
-          KALDI_ERR << "Unable to split key " << key << " on delimiter - " 
-                    << " into utterance id and frame id";
-        }
-        NnetExample eg(egs_reader.Value());
-
-        if (utt_to_pdf_ali.find(utt_id) == utt_to_pdf_ali.end()) {
-          KALDI_WARN << "Unable to find utterance id " << utt_id;
-          egs_writer.Write(key, eg);
-          num_frames_missing++;
-          continue;
-        }
-        const std::vector<int32> *alignment = utt_to_pdf_ali[utt_id];
-
-        int32 num_frames_in_eg = eg.labels.size();
-        for (int32 t_offset = 0; t_offset < num_frames_in_eg; t_offset++) {
-          int32 t = frame_id + t_offset;
-          if (t >= static_cast<int32>(alignment->size())) {
-            KALDI_ERR << "Time index " << t << " out of range for alignment, "
-                      << "should be < " << alignment->size();
-          }
-          if (eg.GetLabelSingle(t_offset) != (*alignment)[t])
-            num_frames_relabelled++; 
-          eg.SetLabelSingle(t_offset, (*alignment)[t]);
-        }
-        egs_writer.Write(key, eg);
-      }
-    }
-
-    unordered_map<std::string, std::vector<int32>*>::iterator iter;
-    
-    for (iter = utt_to_pdf_ali.begin(); iter != utt_to_pdf_ali.end(); ++iter)
-      delete iter->second;
-    
-    KALDI_LOG << "Read " << num_ali << " alignments containing a total of " 
-              << num_frames_ali << " frames; labelled " 
-              << num_frames_egs - num_frames_missing << " frames out of " 
-              << num_frames_egs << " examples; labels changed for " 
-              << num_frames_relabelled << " of those frames.\n.";
-
-    return (num_frames_missing > 0.5  * num_frames_egs);
-
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
diff --git a/src/nnet2bin/nnet-replace-last-layers.cc b/src/nnet2bin/nnet-replace-last-layers.cc
deleted file mode 100644
index 70ebf91095d..00000000000
--- a/src/nnet2bin/nnet-replace-last-layers.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-// nnet2bin/nnet-replace-last-layers.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet2/am-nnet.h"
-#include "nnet2/nnet-functions.h"
-#include "hmm/transition-model.h"
-#include "tree/context-dep.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "This program is for adding new layers to a neural-network acoustic model.\n"
-        "It removes the last --remove-layers layers, and adds the layers from the\n"
-        "supplied raw-nnet.  The typical use is to remove the last two layers\n"
-        "(the softmax, and the affine component before it), and add in replacements\n"
-        "for them newly initialized by nnet-init.  This program is a more flexible\n"
-        "way of adding layers than nnet-insert, but the inserted network needs to\n"
-        "contain replacements for the removed layers.\n"
-        "\n"
-        "Usage:  nnet-replace-last-layers [options] <nnet-in> <raw-nnet-to-insert-in> <nnet-out>\n"
-        "e.g.:\n"
-        " nnet-replace-last-layers 1.nnet \"nnet-init hidden_layer.config -|\" 2.nnet\n";
-
-    bool binary_write = true;
-    int32 remove_layers = 2;
-
-    ParseOptions po(usage);
-    
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("remove-layers", &remove_layers, "Number of final layers "
-                "to remove before adding input raw network.");
-    
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet_rxfilename = po.GetArg(1),
-        raw_nnet_rxfilename = po.GetArg(2),
-        nnet_wxfilename = po.GetArg(3);
-    
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary;
-      Input ki(nnet_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_nnet.Read(ki.Stream(), binary);
-    }
-
-    Nnet src_nnet; // the one we'll insert.
-    ReadKaldiObject(raw_nnet_rxfilename, &src_nnet);
-
-    
-    // This function is declared in nnet-functions.h
-    ReplaceLastComponents(src_nnet,
-                          remove_layers,
-                          &(am_nnet.GetNnet()));
-    KALDI_LOG << "Removed " << remove_layers << " components and added "
-              << src_nnet.NumComponents();
-    
-    {
-      Output ko(nnet_wxfilename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-      am_nnet.Write(ko.Stream(), binary_write);
-    }
-    KALDI_LOG << "Write neural-net acoustic model to " <<  nnet_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-show-progress.cc b/src/nnet2bin/nnet-show-progress.cc
deleted file mode 100644
index 97e13089034..00000000000
--- a/src/nnet2bin/nnet-show-progress.cc
+++ /dev/null
@@ -1,164 +0,0 @@
-// nnet2bin/nnet-show-progress.cc
-
-// Copyright 2012-2013  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/train-nnet.h"
-#include "nnet2/am-nnet.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Given an old and a new model and some training examples (possibly held-out),\n"
-        "show the average objective function given the mean of the two models,\n"
-        "and the breakdown by component of why this happened (computed from\n"
-        "derivative information).  Also shows parameter differences per layer.\n"
-        "If training examples not provided, only shows parameter differences per\n"
-        "layer.\n"
-        "\n"
-        "Usage:  nnet-show-progress [options] <old-model-in> <new-model-in> [<training-examples-in>]\n"
-        "e.g.: nnet-show-progress 1.nnet 2.nnet ark:valid.egs\n";
-    
-    ParseOptions po(usage);
-
-    int32 num_segments = 1;
-    int32 batch_size = 1024;
-    std::string use_gpu = "optional";
-    
-    po.Register("num-segments", &num_segments,
-                "Number of line segments used for computing derivatives");
-    po.Register("use-gpu", &use_gpu,
-                "yes|no|optional|wait, only has effect if compiled with CUDA");
-    
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() < 2 || po.NumArgs() > 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-    
-#if HAVE_CUDA==1
-    CuDevice::Instantiate().SelectGpuId(use_gpu);
-#endif
-
-    std::string nnet1_rxfilename = po.GetArg(1),
-        nnet2_rxfilename = po.GetArg(2),
-        examples_rspecifier = po.GetOptArg(3);
-
-    TransitionModel trans_model;
-    AmNnet am_nnet1, am_nnet2;
-    {
-      bool binary_read;
-      Input ki(nnet1_rxfilename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_nnet1.Read(ki.Stream(), binary_read);
-    }
-    {
-      bool binary_read;
-      Input ki(nnet2_rxfilename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_nnet2.Read(ki.Stream(), binary_read);
-    }    
-    
-    if (am_nnet1.GetNnet().GetParameterDim() !=
-        am_nnet2.GetNnet().GetParameterDim()) {
-      KALDI_WARN << "Parameter-dim mismatch, cannot show progress.";
-      exit(0);
-    }
-
-    int32 ret = 0;
-    
-    if (!examples_rspecifier.empty()) { 
-      Nnet nnet_gradient(am_nnet2.GetNnet());
-      const bool treat_as_gradient = true;
-      nnet_gradient.SetZero(treat_as_gradient);
-
-      std::vector<NnetExample> examples;
-      SequentialNnetExampleReader example_reader(examples_rspecifier);
-      for (; !example_reader.Done(); example_reader.Next())
-        examples.push_back(example_reader.Value());
-
-      int32 num_examples = examples.size();
-    
-      int32 num_updatable = am_nnet1.GetNnet().NumUpdatableComponents();
-      Vector<BaseFloat> diff(num_updatable);
-    
-      for (int32 s = 0; s < num_segments; s++) {
-        // start and end segments of the line between 0 and 1
-        BaseFloat start = (s + 0.0) / num_segments,
-            end = (s + 1.0) / num_segments, middle = 0.5 * (start + end);
-        Nnet interp_nnet(am_nnet2.GetNnet());
-        interp_nnet.Scale(middle);
-        interp_nnet.AddNnet(1.0 - middle, am_nnet1.GetNnet());
-      
-        Nnet nnet_gradient(am_nnet2.GetNnet());
-        const bool treat_as_gradient = true;
-        nnet_gradient.SetZero(treat_as_gradient);
-
-        double objf_per_frame = ComputeNnetGradient(interp_nnet, examples,
-                                                    batch_size, &nnet_gradient);
-        KALDI_LOG << "At position " << middle << ", objf per frame is " << objf_per_frame;
-
-        Vector<BaseFloat> old_dotprod(num_updatable), new_dotprod(num_updatable);
-        nnet_gradient.ComponentDotProducts(am_nnet1.GetNnet(), &old_dotprod);
-        nnet_gradient.ComponentDotProducts(am_nnet2.GetNnet(), &new_dotprod);
-        old_dotprod.Scale(1.0 / num_examples);
-        new_dotprod.Scale(1.0 / num_examples);
-        diff.AddVec(1.0/ num_segments, new_dotprod);
-        diff.AddVec(-1.0 / num_segments, old_dotprod);
-        KALDI_VLOG(1) << "By segment " << s << ", objf change is " << diff;
-      }
-      KALDI_LOG << "Total objf change per component is " << diff;
-      if (num_examples == 0) ret = 1;
-    }
-   
-    { // Get info about magnitude of parameter change.
-      Nnet diff_nnet(am_nnet1.GetNnet());
-      diff_nnet.AddNnet(-1.0, am_nnet2.GetNnet());
-      int32 num_updatable = diff_nnet.NumUpdatableComponents();
-      Vector<BaseFloat> dot_prod(num_updatable);
-      diff_nnet.ComponentDotProducts(diff_nnet, &dot_prod);
-      dot_prod.ApplyPow(0.5); // take sqrt to get l2 norm of diff
-      KALDI_LOG << "Parameter differences per layer are "
-                << dot_prod;
-
-      Vector<BaseFloat> baseline_prod(num_updatable);
-      am_nnet1.GetNnet().ComponentDotProducts(am_nnet1.GetNnet(),
-                                              &baseline_prod);
-      baseline_prod.ApplyPow(0.5);
-      dot_prod.DivElements(baseline_prod);
-      KALDI_LOG << "Relative parameter differences per layer are "
-                << dot_prod;
-    }
-
-    return ret;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-shuffle-egs-discriminative.cc b/src/nnet2bin/nnet-shuffle-egs-discriminative.cc
deleted file mode 100644
index ebef378d16b..00000000000
--- a/src/nnet2bin/nnet-shuffle-egs-discriminative.cc
+++ /dev/null
@@ -1,114 +0,0 @@
-// nnet2bin/nnet-shuffle-egs-discriminative.cc
-
-// Copyright 2012-2013  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/nnet-example-functions.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Copy examples (typically single frames) for neural network training,\n"
-        "from the input to output, but randomly shuffle the order.  This program will keep\n"
-        "all of the examples in memory at once, so don't give it too many.\n"
-        "\n"
-        "Usage:  nnet-shuffle-egs-discriminative [options] <egs-rspecifier> <egs-wspecifier>\n"
-        "\n"
-        "nnet-shuffle-egs-discriminative --srand=1 ark:train.degs ark:shuffled.degs\n";
-    
-    int32 srand_seed = 0;
-    int32 buffer_size = 0;
-    ParseOptions po(usage);
-    po.Register("srand", &srand_seed, "Seed for random number generator ");
-    po.Register("buffer-size", &buffer_size, "If >0, size of a buffer we use "
-                "to do limited-memory partial randomization.  Otherwise, do "
-                "full randomization.");
-    
-    po.Read(argc, argv);
-
-    srand(srand_seed);
-    
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string examples_rspecifier = po.GetArg(1),
-        examples_wspecifier = po.GetArg(2);
-
-    int64 num_done = 0;
-
-    std::vector<DiscriminativeNnetExample*> egs;
-    SequentialDiscriminativeNnetExampleReader example_reader(
-        examples_rspecifier);
-    DiscriminativeNnetExampleWriter example_writer(
-        examples_wspecifier);
-    if (buffer_size == 0) { // Do full randomization
-      // Putting in an extra level of indirection here to avoid excessive
-      // computation and memory demands when we have to resize the vector.
-    
-      for (; !example_reader.Done(); example_reader.Next())
-        egs.push_back(new DiscriminativeNnetExample(
-            example_reader.Value()));
-      
-      std::random_shuffle(egs.begin(), egs.end());
-    } else {
-      KALDI_ASSERT(buffer_size > 0);
-      egs.resize(buffer_size, NULL);
-      for (; !example_reader.Done(); example_reader.Next()) {
-        int32 index = RandInt(0, buffer_size - 1);
-        if (egs[index] == NULL) {
-          egs[index] = new DiscriminativeNnetExample(example_reader.Value());
-        } else {
-          std::ostringstream ostr;
-          ostr << num_done;
-          example_writer.Write(ostr.str(), *(egs[index]));
-          *(egs[index]) = example_reader.Value();
-          num_done++;
-        }
-      }      
-    }
-    for (size_t i = 0; i < egs.size(); i++) {
-      std::ostringstream ostr;
-      ostr << num_done;
-      if (egs[i] != NULL) {
-        example_writer.Write(ostr.str(), *(egs[i]));
-        delete egs[i];
-      }
-      num_done++;
-    }
-
-    KALDI_LOG << "Shuffled order of " << num_done
-              << " neural-network training examples "
-              << (buffer_size ? "using a buffer (partial randomization)" : "");
-                  
-    return (num_done == 0 ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-shuffle-egs.cc b/src/nnet2bin/nnet-shuffle-egs.cc
deleted file mode 100644
index 7c4872b48b6..00000000000
--- a/src/nnet2bin/nnet-shuffle-egs.cc
+++ /dev/null
@@ -1,113 +0,0 @@
-// nnet2bin/nnet-shuffle-egs.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-// Copyright 2014  Vimal Manohar
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/nnet-example-functions.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Copy examples (typically single frames) for neural network training,\n"
-        "from the input to output, but randomly shuffle the order.  This program will keep\n"
-        "all of the examples in memory at once, unless you use the --buffer-size option\n"
-        "\n"
-        "Usage:  nnet-shuffle-egs [options] <egs-rspecifier> <egs-wspecifier>\n"
-        "\n"
-        "nnet-shuffle-egs --srand=1 ark:train.egs ark:shuffled.egs\n";
-
-    int32 srand_seed = 0;
-    int32 buffer_size = 0;
-    ParseOptions po(usage);
-    po.Register("srand", &srand_seed, "Seed for random number generator ");
-    po.Register("buffer-size", &buffer_size, "If >0, size of a buffer we use "
-                "to do limited-memory partial randomization.  Otherwise, do "
-                "full randomization.");
-
-    po.Read(argc, argv);
-
-    srand(srand_seed);
-
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string examples_rspecifier = po.GetArg(1),
-        examples_wspecifier = po.GetArg(2);
-
-    int64 num_done = 0;
-
-    std::vector<std::pair<std::string, NnetExample*> > egs;
-
-    SequentialNnetExampleReader example_reader(examples_rspecifier);
-    NnetExampleWriter example_writer(examples_wspecifier);
-    if (buffer_size == 0) {  // Do full randomization
-      // Putting in an extra level of indirection here to avoid excessive
-      // computation and memory demands when we have to resize the vector.
-
-      for (; !example_reader.Done(); example_reader.Next())
-        egs.push_back(std::make_pair(example_reader.Key(),
-                                     new NnetExample(example_reader.Value())));
-
-      std::random_shuffle(egs.begin(), egs.end());
-    } else {
-      KALDI_ASSERT(buffer_size > 0);
-      egs.resize(buffer_size,
-          std::pair<std::string, NnetExample*>("", static_cast<NnetExample *>(NULL)));
-      for (; !example_reader.Done(); example_reader.Next()) {
-        int32 index = RandInt(0, buffer_size - 1);
-        if (egs[index].second == NULL) {
-          egs[index] = std::make_pair(example_reader.Key(),
-                                    new NnetExample(example_reader.Value()));
-        } else {
-          example_writer.Write(egs[index].first, *(egs[index].second));
-          egs[index].first = example_reader.Key();
-          *(egs[index].second) = example_reader.Value();
-          num_done++;
-        }
-      }
-    }
-    for (size_t i = 0; i < egs.size(); i++) {
-      if (egs[i].second != NULL) {
-        example_writer.Write(egs[i].first, *(egs[i].second));
-        delete egs[i].second;
-        num_done++;
-      }
-    }
-
-    KALDI_LOG << "Shuffled order of " << num_done
-              << " neural-network training examples "
-              << (buffer_size ? "using a buffer (partial randomization)" : "");
-
-    return (num_done == 0 ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-subset-egs.cc b/src/nnet2bin/nnet-subset-egs.cc
deleted file mode 100644
index 4511870eab1..00000000000
--- a/src/nnet2bin/nnet-subset-egs.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-// nnet2bin/nnet-subset-egs.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-// Copyright 2014  Vimal Manohar
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/nnet-example-functions.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Creates a random subset of the input examples, of a specified size.\n"
-        "Uses no more memory than the size of the subset.\n"
-        "\n"
-        "Usage:  nnet-subset-egs [options] <egs-rspecifier> [<egs-wspecifier2> ...]\n"
-        "\n"
-        "e.g.\n"
-        "nnet-subset-egs [args] ark:- | nnet-subset-egs --n=1000 ark:- ark:subset.egs\n";
-    
-    int32 srand_seed = 0;
-    int32 n = 1000;
-    bool randomize_order = true;
-    ParseOptions po(usage);
-    po.Register("srand", &srand_seed, "Seed for random number generator ");
-    po.Register("n", &n, "Number of examples to output");
-    po.Register("randomize-order", &randomize_order, "If true, randomize the order "
-                "of the output");
-    
-    po.Read(argc, argv);
-    
-    srand(srand_seed);
-    
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string examples_rspecifier = po.GetArg(1),
-        examples_wspecifier = po.GetArg(2);
-
-    std::vector<std::pair<std::string, NnetExample> > egs;
-    egs.reserve(n);    
-    
-    SequentialNnetExampleReader example_reader(examples_rspecifier);
-
-    int64 num_read = 0;
-    for (; !example_reader.Done(); example_reader.Next()) {
-      num_read++;
-      if (num_read <= n) {
-        egs.resize(egs.size() + 1);
-        egs.back().first = example_reader.Key();
-        egs.back().second = example_reader.Value();
-      } else {
-        BaseFloat keep_prob = n / static_cast<BaseFloat>(num_read);
-        if (WithProb(keep_prob)) { // With probability "keep_prob"
-          int32 index = RandInt(0, n-1);
-          egs[index].first = example_reader.Key();
-          egs[index].second = example_reader.Value();
-        }
-      }
-    }
-    if (randomize_order)
-      std::random_shuffle(egs.begin(), egs.end());
-
-    NnetExampleWriter writer(examples_wspecifier);
-    for (size_t i = 0; i < egs.size(); i++) {
-      writer.Write(egs[i].first, egs[i].second);
-    }
-    
-    KALDI_LOG << "Selected a subset of " << egs.size() << " out of " << num_read
-              << " neural-network training examples ";
-    
-    return (num_read != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-to-raw-nnet.cc b/src/nnet2bin/nnet-to-raw-nnet.cc
deleted file mode 100644
index 30085c96572..00000000000
--- a/src/nnet2bin/nnet-to-raw-nnet.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-// nnet2bin/nnet-to-raw-nnet.cc
-
-// Copyright 2013  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet2/am-nnet.h"
-#include "hmm/transition-model.h"
-#include "tree/context-dep.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Copy a (cpu-based) neural net: reads the AmNnet with its transition model, but\n"
-        "writes just the Nnet with no transition model (i.e. the raw neural net.)\n"
-        "\n"
-        "Usage:  nnet-to-raw-nnet [options] <nnet-in> <raw-nnet-out>\n"
-        "e.g.:\n"
-        " nnet-to-raw-nnet --binary=false 1.mdl 1.raw\n";
-
-    int32 truncate = -1;
-    bool binary_write = true;
-
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("truncate", &truncate, "If set, will truncate the neural net "
-                "to this many components by removing the last components.");
-    
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet_rxfilename = po.GetArg(1),
-        raw_nnet_wxfilename = po.GetArg(2);
-    
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary;
-      Input ki(nnet_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_nnet.Read(ki.Stream(), binary);
-    }
-
-    if (truncate >= 0) {
-      KALDI_LOG << "Truncating neural net to " << truncate << " layers.";
-      am_nnet.GetNnet().Resize(truncate);
-    }
-
-    const Nnet &nnet = am_nnet.GetNnet();
-    WriteKaldiObject(nnet, raw_nnet_wxfilename, binary_write);
-    
-    KALDI_LOG << "Read neural net from " << nnet_rxfilename
-              << " and wrote raw neural net to " << raw_nnet_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet-train-discriminative-parallel.cc b/src/nnet2bin/nnet-train-discriminative-parallel.cc
deleted file mode 100644
index fbc338803b2..00000000000
--- a/src/nnet2bin/nnet-train-discriminative-parallel.cc
+++ /dev/null
@@ -1,95 +0,0 @@
-// nnet2bin/nnet-train-discriminative-parallel.cc
-
-// Copyright 2013  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/am-nnet.h"
-#include "nnet2/nnet-compute-discriminative-parallel.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Train the neural network parameters with a discriminative objective\n"
-        "function (MMI, SMBR or MPFE).  This uses training examples prepared with\n"
-        "nnet-get-egs-discriminative\n"
-        "This version uses multiple threads (but no GPU)"
-        "\n"
-        "Usage:  nnet-train-discriminative-parallel [options] <model-in> <training-examples-in> <model-out>\n"
-        "e.g.:\n"
-        "nnet-train-discriminative-parallel --num-threads=8 1.nnet ark:1.degs 2.nnet\n";
-    
-    bool binary_write = true;
-    std::string use_gpu = "yes";
-    int32 num_threads = 1;
-    NnetDiscriminativeUpdateOptions update_opts;
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("num-threads", &num_threads, "Number of threads to use");
-    update_opts.Register(&po);
-    
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-    
-    std::string nnet_rxfilename = po.GetArg(1),
-        examples_rspecifier = po.GetArg(2),
-        nnet_wxfilename = po.GetArg(3);
-
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary_read;
-      Input ki(nnet_rxfilename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_nnet.Read(ki.Stream(), binary_read);
-    }
-
-    
-    NnetDiscriminativeStats stats;
-    SequentialDiscriminativeNnetExampleReader example_reader(
-        examples_rspecifier);
-
-    NnetDiscriminativeUpdateParallel(am_nnet, trans_model,
-                                     update_opts, num_threads, &example_reader,
-                                     &(am_nnet.GetNnet()), &stats);
-    {
-      Output ko(nnet_wxfilename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-      am_nnet.Write(ko.Stream(), binary_write);
-    }
-
-    return (stats.tot_t == 0 ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-train-discriminative-simple.cc b/src/nnet2bin/nnet-train-discriminative-simple.cc
deleted file mode 100644
index 5b2caf4c6f3..00000000000
--- a/src/nnet2bin/nnet-train-discriminative-simple.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-// nnet2bin/nnet-train-discriminative-simple.cc
-
-// Copyright 2013  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/am-nnet.h"
-#include "nnet2/nnet-compute-discriminative.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Train the neural network parameters with a discriminative objective\n"
-        "function (MMI, SMBR or MPFE).  This uses training examples prepared with\n"
-        "nnet-get-egs-discriminative\n"
-        "\n"
-        "Usage:  nnet-train-discriminative-simple [options] <model-in> <training-examples-in> <model-out>\n"
-        "e.g.:\n"
-        "nnet-train-discriminative-simple 1.nnet ark:1.degs 2.nnet\n";
-    
-    bool binary_write = true;
-    std::string use_gpu = "yes";
-    NnetDiscriminativeUpdateOptions update_opts;
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("use-gpu", &use_gpu,
-                "yes|no|optional|wait, only has effect if compiled with CUDA");
-    update_opts.Register(&po);
-    
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-    
-#if HAVE_CUDA==1
-    CuDevice::Instantiate().SelectGpuId(use_gpu);
-#endif
-
-    std::string nnet_rxfilename = po.GetArg(1),
-        examples_rspecifier = po.GetArg(2),
-        nnet_wxfilename = po.GetArg(3);
-
-    int64 num_examples = 0;
-
-    {
-      TransitionModel trans_model;
-      AmNnet am_nnet;
-      {
-        bool binary_read;
-        Input ki(nnet_rxfilename, &binary_read);
-        trans_model.Read(ki.Stream(), binary_read);
-        am_nnet.Read(ki.Stream(), binary_read);
-      }
-
-    
-      NnetDiscriminativeStats stats;
-      SequentialDiscriminativeNnetExampleReader example_reader(examples_rspecifier);
-
-      for (; !example_reader.Done(); example_reader.Next(), num_examples++) {
-        NnetDiscriminativeUpdate(am_nnet, trans_model, update_opts,
-                                 example_reader.Value(),
-                                 &(am_nnet.GetNnet()), &stats);
-        if (num_examples % 10 == 0 && num_examples != 0) { // each example might be 500 frames.
-          if (GetVerboseLevel() >= 2) {
-            stats.Print(update_opts.criterion);
-          }
-        }          
-      }
-
-      stats.Print(update_opts.criterion);
-        
-      {
-        Output ko(nnet_wxfilename, binary_write);
-        trans_model.Write(ko.Stream(), binary_write);
-        am_nnet.Write(ko.Stream(), binary_write);
-      }
-    }
-#if HAVE_CUDA==1
-    CuDevice::Instantiate().PrintProfile();
-#endif
-    KALDI_LOG << "Finished training, processed " << num_examples
-              << " training examples.  Wrote model to "
-              << nnet_wxfilename;
-    return (num_examples == 0 ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-train-ensemble.cc b/src/nnet2bin/nnet-train-ensemble.cc
deleted file mode 100644
index 86e78936279..00000000000
--- a/src/nnet2bin/nnet-train-ensemble.cc
+++ /dev/null
@@ -1,145 +0,0 @@
-// nnet2bin/nnet-train-ensemble.cc
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-//           2014  Xiaohui Zhang
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/train-nnet-ensemble.h"
-#include "nnet2/am-nnet.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Train an ensemble of neural networks with backprop and stochastic\n"
-        "gradient descent using minibatches.  Modified version of nnet-train-simple.\n"
-        "Implements parallel gradient descent with a term that encourages the nnets to\n"
-        "produce similar outputs.\n"
-        "\n"
-        "Usage:  nnet-train-ensemble [options] <model-in-1> <model-in-2> ... <model-in-n>"
-        "  <training-examples-in> <model-out-1> <model-out-2> ... <model-out-n> \n"
-        "\n"
-        "e.g.:\n"
-        " nnet-train-ensemble 1.1.nnet 2.1.nnet ark:egs.ark 2.1.nnet 2.2.nnet \n";
-    
-    bool binary_write = true;
-    bool zero_stats = true;
-    int32 srand_seed = 0;
-    std::string use_gpu = "yes";
-    NnetEnsembleTrainerConfig train_config;
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("zero-stats", &zero_stats, "If true, zero occupation "
-                "counts stored with the neural net (only affects mixing up).");
-    po.Register("srand", &srand_seed, "Seed for random number generator "
-                "(relevant if you have layers of type AffineComponentPreconditioned "
-                "with l2-penalty != 0.0");
-    po.Register("use-gpu", &use_gpu,
-                "yes|no|optional|wait, only has effect if compiled with CUDA");
- 
-    train_config.Register(&po);
-    
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() <= 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-    srand(srand_seed);
-    
-#if HAVE_CUDA==1
-    CuDevice::Instantiate().SelectGpuId(use_gpu);
-#endif
-    
-    int32 num_nnets = (po.NumArgs() - 1) / 2;
-    std::string nnet_rxfilename = po.GetArg(1);
-    std::string examples_rspecifier = po.GetArg(num_nnets + 1);
-
-    std::string nnet1_rxfilename = po.GetArg(1);
-    
-    TransitionModel trans_model;
-    std::vector<AmNnet> am_nnets(num_nnets);
-    {
-      bool binary_read;
-      Input ki(nnet1_rxfilename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      KALDI_LOG << nnet1_rxfilename;
-      am_nnets[0].Read(ki.Stream(), binary_read);
-    }
-
-    std::vector<Nnet*> nnets(num_nnets);
-    nnets[0] = &(am_nnets[0].GetNnet());
-
-    for (int32 n = 1; n < num_nnets; n++) {
-      TransitionModel trans_model;
-      bool binary_read;
-      Input ki(po.GetArg(1 + n), &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_nnets[n].Read(ki.Stream(), binary_read);
-      nnets[n] = &am_nnets[n].GetNnet();
-    }      
-    
-
-    int64 num_examples = 0;
-
-    {
-      if (zero_stats) {
-        for (int32 n = 1; n < num_nnets; n++) 
-          nnets[n]->ZeroStats();
-      }
-      { // want to make sure this object deinitializes before
-        // we write the model, as it does something in the destructor.
-        NnetEnsembleTrainer trainer(train_config,
-                                    nnets);
-      
-        SequentialNnetExampleReader example_reader(examples_rspecifier);
-
-        for (; !example_reader.Done(); example_reader.Next(), num_examples++)
-          trainer.TrainOnExample(example_reader.Value());  // It all happens here!
-      }
-    
-      {
-        for (int32 n = 0; n < num_nnets; n++) {
-          Output ko(po.GetArg(po.NumArgs() - num_nnets + n + 1), binary_write);
-          trans_model.Write(ko.Stream(), binary_write);
-          am_nnets[n].Write(ko.Stream(), binary_write);
-        }
-      }
-    }
-#if HAVE_CUDA==1
-    CuDevice::Instantiate().PrintProfile();
-#endif
-    
-    KALDI_LOG << "Finished training, processed " << num_examples
-              << " training examples.";
-    return (num_examples == 0 ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-train-parallel.cc b/src/nnet2bin/nnet-train-parallel.cc
deleted file mode 100644
index a094a069db0..00000000000
--- a/src/nnet2bin/nnet-train-parallel.cc
+++ /dev/null
@@ -1,112 +0,0 @@
-// nnet2bin/nnet-train-parallel.cc
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/nnet-update-parallel.h"
-#include "nnet2/am-nnet.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Train the neural network parameters with backprop and stochastic\n"
-        "gradient descent using minibatches.  As nnet-train-simple, but\n"
-        "uses multiple threads in a Hogwild type of update (for CPU, not GPU).\n"
-        "\n"
-        "Usage:  nnet-train-parallel [options] <model-in> <training-examples-in> <model-out>\n"
-        "\n"
-        "e.g.:\n"
-        "nnet-train-parallel --num-threads=8 1.nnet ark:1.1.egs 2.nnet\n";
-    
-    bool binary_write = true;
-    bool zero_stats = true;
-    int32 minibatch_size = 1024;
-    int32 srand_seed = 0;
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("zero-stats", &zero_stats, "If true, zero stats "
-                "stored with the neural net (only affects mixing up).");
-    po.Register("srand", &srand_seed,
-                "Seed for random number generator (e.g., for dropout)");
-    po.Register("num-threads", &g_num_threads, "Number of training threads to use "
-                "in the parallel update. [Note: if you use a parallel "
-                "implementation of BLAS, the actual number of threads may be larger.]");
-    po.Register("minibatch-size", &minibatch_size, "Number of examples to use for "
-                "each minibatch during training.");
-    
-    po.Read(argc, argv);
-    srand(srand_seed);
-
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-    
-    std::string nnet_rxfilename = po.GetArg(1),
-        examples_rspecifier = po.GetArg(2),
-        nnet_wxfilename = po.GetArg(3);
-
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary_read;
-      Input ki(nnet_rxfilename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_nnet.Read(ki.Stream(), binary_read);
-    }
-
-    KALDI_ASSERT(minibatch_size > 0);
-
-    if (zero_stats) am_nnet.GetNnet().ZeroStats();
-
-    double num_examples = 0;
-    SequentialNnetExampleReader example_reader(examples_rspecifier);
-    
-
-    DoBackpropParallel(am_nnet.GetNnet(),
-                       minibatch_size,
-                       &example_reader,
-                       &num_examples,
-                       &(am_nnet.GetNnet()));
-    
-    {
-      Output ko(nnet_wxfilename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-      am_nnet.Write(ko.Stream(), binary_write);
-    }
-    
-    KALDI_LOG << "Finished training, processed " << num_examples
-              << " training examples (weighted).  Wrote model to "
-              << nnet_wxfilename;
-    return (num_examples == 0 ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-train-simple.cc b/src/nnet2bin/nnet-train-simple.cc
deleted file mode 100644
index 322868458fc..00000000000
--- a/src/nnet2bin/nnet-train-simple.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-// nnet2bin/nnet-train-simple.cc
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/train-nnet.h"
-#include "nnet2/am-nnet.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Train the neural network parameters with backprop and stochastic\n"
-        "gradient descent using minibatches.  Training examples would be\n"
-        "produced by nnet-get-egs.\n"
-        "\n"
-        "Usage:  nnet-train-simple [options] <model-in> <training-examples-in> <model-out>\n"
-        "\n"
-        "e.g.:\n"
-        "nnet-train-simple 1.nnet ark:1.egs 2.nnet\n";
-    
-    bool binary_write = true;
-    bool zero_stats = true;
-    int32 srand_seed = 0;
-    std::string use_gpu = "yes";
-    NnetSimpleTrainerConfig train_config;
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("zero-stats", &zero_stats, "If true, zero occupation "
-                "counts stored with the neural net (only affects mixing up).");
-    po.Register("srand", &srand_seed, "Seed for random number generator "
-                "(relevant if you have layers of type AffineComponentPreconditioned "
-                "with l2-penalty != 0.0");
-    po.Register("use-gpu", &use_gpu,
-                "yes|no|optional|wait, only has effect if compiled with CUDA");
-    
-    train_config.Register(&po);
-    
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-    srand(srand_seed);
-    
-#if HAVE_CUDA==1
-    CuDevice::Instantiate().SelectGpuId(use_gpu);
-#endif
-
-    std::string nnet_rxfilename = po.GetArg(1),
-        examples_rspecifier = po.GetArg(2),
-        nnet_wxfilename = po.GetArg(3);
-
-    int64 num_examples;
-    
-    {
-      TransitionModel trans_model;
-      AmNnet am_nnet;
-      {
-        bool binary_read;
-        Input ki(nnet_rxfilename, &binary_read);
-        trans_model.Read(ki.Stream(), binary_read);
-        am_nnet.Read(ki.Stream(), binary_read);
-      }
-
-      if (zero_stats) am_nnet.GetNnet().ZeroStats();
-
-      SequentialNnetExampleReader example_reader(examples_rspecifier);
-      
-      num_examples = TrainNnetSimple(train_config, &(am_nnet.GetNnet()),
-                                     &example_reader);
-    
-      {
-        Output ko(nnet_wxfilename, binary_write);
-        trans_model.Write(ko.Stream(), binary_write);
-        am_nnet.Write(ko.Stream(), binary_write);
-      }
-    }
-#if HAVE_CUDA==1
-    CuDevice::Instantiate().PrintProfile();
-#endif
-    
-    KALDI_LOG << "Finished training, processed " << num_examples
-              << " training examples.  Wrote model to "
-              << nnet_wxfilename;
-    return (num_examples == 0 ? 1 : 0);
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/nnet2bin/nnet-train-transitions.cc b/src/nnet2bin/nnet-train-transitions.cc
deleted file mode 100644
index 111b6909991..00000000000
--- a/src/nnet2bin/nnet-train-transitions.cc
+++ /dev/null
@@ -1,147 +0,0 @@
-// nnet2bin/nnet-train-transitions.cc
-
-// Copyright 2012  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet2/am-nnet.h"
-#include "hmm/transition-model.h"
-#include "tree/context-dep.h"
-
-namespace kaldi {
-namespace nnet2 {
-void SetPriors(const TransitionModel &tmodel,
-               const Vector<double> &transition_accs,
-               double prior_floor,
-               AmNnet *am_nnet) {
-  KALDI_ASSERT(tmodel.NumPdfs() == am_nnet->NumPdfs());
-  Vector<BaseFloat> pdf_counts(tmodel.NumPdfs());
-  KALDI_ASSERT(transition_accs(0) == 0.0); // There is
-  // no zero transition-id.
-  for (int32 tid = 1; tid < transition_accs.Dim(); tid++) {
-    int32 pdf = tmodel.TransitionIdToPdf(tid);
-    pdf_counts(pdf) += transition_accs(tid);
-  }
-  BaseFloat sum = pdf_counts.Sum();
-  KALDI_ASSERT(sum != 0.0);
-  KALDI_ASSERT(prior_floor > 0.0 && prior_floor < 1.0);
-  pdf_counts.Scale(1.0 / sum);
-  pdf_counts.ApplyFloor(prior_floor);
-  pdf_counts.Scale(1.0 / pdf_counts.Sum()); // normalize again.
-  am_nnet->SetPriors(pdf_counts);
-}               
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Train the transition probabilities of a neural network acoustic model\n"
-        "\n"
-        "Usage:  nnet-train-transitions [options] <nnet-in> <alignments-rspecifier> <nnet-out>\n"
-        "e.g.:\n"
-        " nnet-train-transitions 1.nnet \"ark:gunzip -c ali.*.gz|\" 2.nnet\n";
-    
-    bool binary_write = true;
-    bool set_priors = true; // Also set the per-pdf priors in the model.
-    BaseFloat prior_floor = 5.0e-06; // The default was previously 1e-8, but
-                                     // once we had problems with a pdf-id that
-                                     // was not being seen in training, being
-                                     // recognized all the time.  This value
-                                     // seemed to be the smallest prior of the
-                                     // "seen" pdf-ids in one run.
-    MleTransitionUpdateConfig transition_update_config;
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("set-priors", &set_priors, "If true, also set priors in neural "
-                "net (we divide by these in test time)");
-    po.Register("prior-floor", &prior_floor, "When setting priors, floor for "
-                "priors");
-    transition_update_config.Register(&po);
-    
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet_rxfilename = po.GetArg(1),
-        ali_rspecifier = po.GetArg(2),
-        nnet_wxfilename = po.GetArg(3);
-    
-    TransitionModel trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary_read;
-      Input ki(nnet_rxfilename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_nnet.Read(ki.Stream(), binary_read);
-    }
-    
-    Vector<double> transition_accs;
-    trans_model.InitStats(&transition_accs);
-
-    int32 num_done = 0;
-    SequentialInt32VectorReader ali_reader(ali_rspecifier);
-    for (; ! ali_reader.Done(); ali_reader.Next()) {
-      const std::vector<int32> alignment(ali_reader.Value());
-      for (size_t i = 0; i < alignment.size(); i++) {
-        int32 tid = alignment[i];
-        BaseFloat weight = 1.0;
-        trans_model.Accumulate(weight, tid, &transition_accs);
-      }
-      num_done++;
-    }
-    KALDI_LOG << "Accumulated transition stats from " << num_done
-              << " utterances.";
-
-    {
-      BaseFloat objf_impr, count;
-      trans_model.MleUpdate(transition_accs, transition_update_config,
-                            &objf_impr, &count);
-      KALDI_LOG << "Transition model update: average " << (objf_impr/count)
-                << " log-like improvement per frame over " << count
-                << " frames.";
-    }
-
-    if (set_priors) {
-      KALDI_LOG << "Setting priors of pdfs in the model.";
-      SetPriors(trans_model, transition_accs, prior_floor, &am_nnet);
-    }
-    
-    {
-      Output ko(nnet_wxfilename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-      am_nnet.Write(ko.Stream(), binary_write);
-    }
-    KALDI_LOG << "Trained transitions of neural network model and wrote it to "
-              << nnet_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/nnet1-to-raw-nnet.cc b/src/nnet2bin/nnet1-to-raw-nnet.cc
deleted file mode 100644
index 96e058075d9..00000000000
--- a/src/nnet2bin/nnet1-to-raw-nnet.cc
+++ /dev/null
@@ -1,222 +0,0 @@
-// nnet2bin/nnet1-to-raw-nnet.cc
-
-// Copyright 2013  Johns Hopkins University (author:  Daniel Povey, Hainan Xu)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet/nnet-nnet.h"
-#include "nnet/nnet-affine-transform.h"
-#include "nnet/nnet-activation.h"
-#include "nnet/nnet-various.h"
-#include "nnet2/nnet-nnet.h"
-#include "nnet2/nnet-component.h"
-
-namespace kaldi {
-
-nnet2::Component *ConvertAffineTransformComponent(
-    const nnet1::Component &nnet1_component,
-    const bool use_preconditioned_affine_component) {
-  const nnet1::AffineTransform *affine =
-      dynamic_cast<const nnet1::AffineTransform*>(&nnet1_component);
-  KALDI_ASSERT(affine != NULL);
-  // default learning rate is 1.0e-05, you can use the --learning-rate or
-  // --learning-rates option to nnet-am-copy to change it if you need.
-  BaseFloat learning_rate = 1.0e-05;
-  if (use_preconditioned_affine_component) {
-    int32 rank_in = 20,
-          rank_out = 80,
-          update_period = 4;
-    BaseFloat num_samples_history = 2000.,
-              alpha = 4.;
-    return new nnet2::AffineComponentPreconditionedOnline(
-      nnet2::AffineComponent(affine->GetLinearity(),
-        affine->GetBias(),
-        learning_rate),
-      rank_in,
-      rank_out,
-      update_period,
-      num_samples_history,
-      alpha);
-  } else {
-    return new nnet2::AffineComponent(affine->GetLinearity(),
-      affine->GetBias(),
-      learning_rate);
-  }
-}
-
-nnet2::Component *ConvertSoftmaxComponent(
-    const nnet1::Component &nnet1_component) {
-  const nnet1::Softmax *softmax =
-      dynamic_cast<const nnet1::Softmax*>(&nnet1_component);
-  KALDI_ASSERT(softmax != NULL);
-  return new nnet2::SoftmaxComponent(softmax->InputDim());
-}
-
-nnet2::Component *ConvertSigmoidComponent(
-    const nnet1::Component &nnet1_component) {
-  const nnet1::Sigmoid *sigmoid =
-      dynamic_cast<const nnet1::Sigmoid*>(&nnet1_component);
-  KALDI_ASSERT(sigmoid != NULL);
-  return new nnet2::SigmoidComponent(sigmoid->InputDim());
-}
-
-nnet2::Component *ConvertSpliceComponent(
-    const nnet1::Component &nnet1_component) {
-  const nnet1::Splice *splice =
-      dynamic_cast<const nnet1::Splice*>(&nnet1_component);
-  KALDI_ASSERT(splice != NULL);
-//  int32 low, high;
-  std::vector<int32> frame_offsets;
-
-  std::ostringstream ostr;
-  splice->WriteData(ostr, false);
-
-  std::istringstream istr(ostr.str());
-  ReadIntegerVector(istr, false, &frame_offsets);
-
-  nnet2::SpliceComponent *res = new nnet2::SpliceComponent();
-  res->Init(splice->InputDim(), frame_offsets);
-  return res;
-}
-
-
-nnet2::Component *ConvertAddShiftComponent(
-    const nnet1::Component &nnet1_component) {
-  const nnet1::AddShift *add_shift =
-      dynamic_cast<const nnet1::AddShift*>(&nnet1_component);
-  KALDI_ASSERT(add_shift != NULL);
-  Vector<BaseFloat> bias(add_shift->NumParams());
-
-  add_shift->GetParams(&bias);
-  CuVector<BaseFloat> cu_bias(bias);
-
-  nnet2::FixedBiasComponent *res = new nnet2::FixedBiasComponent();
-  res->Init(cu_bias);
-  return res;
-}
-
-nnet2::Component *ConvertRescaleComponent(
-    const nnet1::Component &nnet1_component) {
-  const nnet1::Rescale *rescale =
-      dynamic_cast<const nnet1::Rescale*>(&nnet1_component);
-  KALDI_ASSERT(rescale != NULL);
-
-  Vector<BaseFloat> scale(rescale->NumParams());
-  rescale->GetParams(&scale);
-
-  CuVector<BaseFloat> cu_scale(scale);
-
-  nnet2::FixedScaleComponent *res = new nnet2::FixedScaleComponent();
-  res->Init(cu_scale);
-  return res;
-}
-
-nnet2::Component *ConvertComponent(const nnet1::Component &nnet1_component,
-    const bool use_preconditioned_affine_component) {
-  nnet1::Component::ComponentType type_in = nnet1_component.GetType();
-  switch (type_in) {
-    case nnet1::Component::kAffineTransform:
-      return ConvertAffineTransformComponent(nnet1_component,
-          use_preconditioned_affine_component);
-    case nnet1::Component::kSoftmax:
-      return ConvertSoftmaxComponent(nnet1_component);
-    case nnet1::Component::kSigmoid:
-      return ConvertSigmoidComponent(nnet1_component);
-    case nnet1::Component::kSplice:
-      return ConvertSpliceComponent(nnet1_component); // note, this will for now only handle the
-      // special case nnet1::Component::where all splice indexes in nnet1_component are contiguous, e.g.
-      // -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5 .
-    case nnet1::Component::kAddShift:
-      return ConvertAddShiftComponent(nnet1_component); // convert to FixedBiasComponent
-    case nnet1::Component::kRescale:
-      return ConvertRescaleComponent(nnet1_component); // convert to FixedScaleComponent
-    default: KALDI_ERR << "Un-handled nnet1 component type "
-                       << nnet1::Component::TypeToMarker(type_in);
-    return NULL;
-  }
-}
-
-
-nnet2::Nnet *ConvertNnet1ToNnet2(const nnet1::Nnet &nnet1,
-    const bool use_preconditioned_affine_component) {
-  // get a vector of nnet2::Component pointers and initialize the nnet2::Nnet with it.
-  size_t size = nnet1.NumComponents();
-  std::vector<nnet2::Component*> *components = new std::vector<nnet2::Component*>();
-  components->resize(size);
-  for (size_t i = 0; i < size; i++) {
-      (*components)[i] = ConvertComponent(nnet1.GetComponent(i),
-          use_preconditioned_affine_component);
-  }
-
-  nnet2::Nnet *res = new nnet2::Nnet();
-  res->Init(components);
-  delete components;
-  return res;
-}
-
-}  // namespace kaldi
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Convert nnet1 neural net to nnet2 'raw' neural net\n"
-        "\n"
-        "Usage:  nnet1-to-raw-nnet [options] <nnet1-in> <nnet2-out>\n"
-        "e.g.:\n"
-        " nnet1-to-raw-nnet srcdir/final.nnet - | nnet-am-init dest/tree dest/topo - dest/0.mdl\n";
-
-    bool binary_write = true, use_preconditioned_affine_component = false;
-    int32 srand_seed = 0;
-
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-
-    po.Register("use_preconditioned_affine_component",
-        &use_preconditioned_affine_component,
-        "Using AffineComponentPreconditionOnline instead AffineComponent");
-
-    po.Read(argc, argv);
-    srand(srand_seed);
-
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet1_rxfilename = po.GetArg(1),
-        raw_nnet2_wxfilename = po.GetArg(2);
-
-    nnet1::Nnet nnet1;
-    ReadKaldiObject(nnet1_rxfilename, &nnet1);
-    nnet2::Nnet *nnet2 = ConvertNnet1ToNnet2(nnet1,
-        use_preconditioned_affine_component);
-    WriteKaldiObject(*nnet2, raw_nnet2_wxfilename, binary_write);
-    KALDI_LOG << "Converted nnet1 neural net to raw nnet2 and wrote it to "
-              << PrintableWxfilename(raw_nnet2_wxfilename);
-    delete nnet2;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/raw-nnet-concat.cc b/src/nnet2bin/raw-nnet-concat.cc
deleted file mode 100644
index a4664007779..00000000000
--- a/src/nnet2bin/raw-nnet-concat.cc
+++ /dev/null
@@ -1,75 +0,0 @@
-// nnet2bin/raw-nnet-concat.cc
-
-// Copyright 2013  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet2/am-nnet.h"
-#include "hmm/transition-model.h"
-#include "tree/context-dep.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Concatenate two 'raw' neural nets, e.g. as output by nnet-init or\n"
-        "nnet-to-raw-nnet\n"
-        "\n"
-        "Usage:  raw-nnet-concat [options] <raw-nnet-in1> <raw-nnet-in2> <raw-nnet-out>\n"
-        "e.g.:\n"
-        " raw-nnet-concat nnet1 nnet2 nnet_concat\n";
-    
-    bool binary_write = true;
-    int32 srand_seed = 0;
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    
-    po.Read(argc, argv);
-    srand(srand_seed);
-    
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string raw_nnet1_rxfilename = po.GetArg(1),
-        raw_nnet2_rxfilename = po.GetArg(2),
-        raw_nnet_wxfilename = po.GetArg(3);
-    
-    Nnet nnet1;
-    ReadKaldiObject(raw_nnet1_rxfilename, &nnet1);
-    Nnet nnet2;
-    ReadKaldiObject(raw_nnet2_rxfilename, &nnet2);
-
-    Nnet nnet_concat(nnet1, nnet2); // Constructor concatenates them.
-
-    WriteKaldiObject(nnet_concat, raw_nnet_wxfilename, binary_write);
-    
-    KALDI_LOG << "Concatenated neural nets from "
-              << raw_nnet1_rxfilename << " and " << raw_nnet2_rxfilename
-              << " and wrote to " << raw_nnet_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/raw-nnet-copy.cc b/src/nnet2bin/raw-nnet-copy.cc
deleted file mode 100644
index 57b5ee0e6f1..00000000000
--- a/src/nnet2bin/raw-nnet-copy.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-// nnet2bin/raw-nnet-copy.cc
-
-// Copyright 2014 Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <typeinfo>
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/am-nnet.h"
-#include "tree/context-dep.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Copy a raw neural net (this version works on raw nnet2 neural nets,\n"
-        "without the transition model.  Supports the 'truncate' option.\n"
-        "\n"
-        "Usage:  raw-nnet-copy [options] <raw-nnet-in> <raw-nnet-out>\n"
-        "e.g.:\n"
-        " raw-nnet-copy --binary=false 1.mdl text.mdl\n"
-        "See also: nnet-to-raw-nnet, nnet-am-copy\n";
-    
-    int32 truncate = -1;
-    bool binary_write = true;
-    std::string learning_rate_scales_str = " ";
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("truncate", &truncate, "If set, will truncate the neural net "
-                "to this many components by removing the last components.");
-    po.Register("learning-rate-scales", &learning_rate_scales_str,
-                "Colon-separated list of scaling factors for learning rates, "
-                "applied after the --learning-rate and --learning-rates options."
-                "Used to scale learning rates for particular layer types.  E.g."
-                "--learning-rate-scales=AffineComponent=0.5");
-
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string raw_nnet_rxfilename = po.GetArg(1),
-        raw_nnet_wxfilename = po.GetArg(2);
-    
-    Nnet nnet;
-    ReadKaldiObject(raw_nnet_rxfilename, &nnet);
-    
-    if (truncate >= 0)
-      nnet.Resize(truncate);
-
-    if (learning_rate_scales_str != " ")  {
-      // parse the learning_rate_scales provided as an option
-      std::map<std::string, BaseFloat> learning_rate_scales;
-      std::vector<std::string> learning_rate_scale_vec;
-      SplitStringToVector(learning_rate_scales_str, ":", true,
-                          &learning_rate_scale_vec);
-      for (int32 index = 0; index < learning_rate_scale_vec.size();
-          index++) {
-        std::vector<std::string> parts;
-        BaseFloat scale_factor;
-        SplitStringToVector(learning_rate_scale_vec[index],
-                            "=", false,  &parts);
-        if (!ConvertStringToReal(parts[1], &scale_factor)) {
-          KALDI_ERR << "Unknown format for --learning-rate-scales option. "
-              << "Expected format is "
-              << "--learning-rate-scales=AffineComponent=0.1:AffineComponentPreconditioned=0.5 "
-              << "instead got "
-              << learning_rate_scales_str;
-        }
-        learning_rate_scales.insert(std::pair<std::string, BaseFloat>(
-                parts[0], scale_factor));
-      }
-      // use the learning_rate_scales to scale the component learning rates
-      nnet.ScaleLearningRates(learning_rate_scales);
-    }
-
-    WriteKaldiObject(nnet, raw_nnet_wxfilename, binary_write);
-
-    KALDI_LOG << "Copied raw neural net from " << raw_nnet_rxfilename
-              << " to " << raw_nnet_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet2bin/raw-nnet-info.cc b/src/nnet2bin/raw-nnet-info.cc
deleted file mode 100644
index f72e5219821..00000000000
--- a/src/nnet2bin/raw-nnet-info.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-// nnet2bin/raw-nnet-info.cc
-
-// Copyright 2013  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet2/nnet-nnet.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Print human-readable information about the raw neural network\n"
-        "to the standard output\n"
-        "Usage:  raw-nnet-info [options] <nnet-in>\n"
-        "e.g.:\n"
-        " raw-nnet-info 1.nnet\n";
-        
-    ParseOptions po(usage);
-    
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 1) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string raw_nnet_rxfilename = po.GetArg(1);
-    
-    Nnet nnet;
-    ReadKaldiObject(raw_nnet_rxfilename, &nnet);
-    
-    std::cout << nnet.Info();
-    
-    KALDI_LOG << "Printed info about " << raw_nnet_rxfilename;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
-
-
diff --git a/src/nnet2bin/raw-nnet-init b/src/nnet2bin/raw-nnet-init
deleted file mode 120000
index 181aa3a3cf0..00000000000
--- a/src/nnet2bin/raw-nnet-init
+++ /dev/null
@@ -1 +0,0 @@
-nnet-init
\ No newline at end of file
diff --git a/src/nnet3/decodable-online-looped.h b/src/nnet3/decodable-online-looped.h
index 3041d3c4637..4867c5decb8 100644
--- a/src/nnet3/decodable-online-looped.h
+++ b/src/nnet3/decodable-online-looped.h
@@ -28,7 +28,7 @@
 #include "nnet3/nnet-compute.h"
 #include "nnet3/nnet-optimize.h"
 #include "nnet3/decodable-simple-looped.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 namespace kaldi {
 namespace nnet3 {
@@ -158,7 +158,7 @@ class DecodableNnetLoopedOnline: public DecodableNnetLoopedOnlineBase {
 
 
 // This is for traditional decoding where the graph has transition-ids
-// on the arcs, and you need the TransitionModel to map those to
+// on the arcs, and you need the Transitions object to map those to
 // pdf-ids.
 // Note: whether or not division by the prior takes place depends on
 // whether you supplied class AmNnetSimple (or just Nnet), to the constructor
@@ -167,7 +167,7 @@ class DecodableNnetLoopedOnline: public DecodableNnetLoopedOnlineBase {
 class DecodableAmNnetLoopedOnline: public DecodableNnetLoopedOnlineBase {
  public:
   DecodableAmNnetLoopedOnline(
-      const TransitionModel &trans_model,
+      const Transitions &trans_model,
       const DecodableNnetSimpleLoopedInfo &info,
       OnlineFeatureInterface *input_features,
       OnlineFeatureInterface *ivector_features):
@@ -184,7 +184,7 @@ class DecodableAmNnetLoopedOnline: public DecodableNnetLoopedOnlineBase {
                                   int32 transition_id);
 
  private:
-  const TransitionModel &trans_model_;
+  const Transitions &trans_model_;
 
   KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmNnetLoopedOnline);
 
diff --git a/src/nnet3/decodable-simple-looped.cc b/src/nnet3/decodable-simple-looped.cc
index 71aa7daaa17..d7b680519d0 100644
--- a/src/nnet3/decodable-simple-looped.cc
+++ b/src/nnet3/decodable-simple-looped.cc
@@ -244,7 +244,7 @@ void DecodableNnetSimpleLooped::GetCurrentIvector(int32 input_frame,
 
 DecodableAmNnetSimpleLooped::DecodableAmNnetSimpleLooped(
     const DecodableNnetSimpleLoopedInfo &info,
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     const MatrixBase<BaseFloat> &feats,
     const VectorBase<BaseFloat> *ivector,
     const MatrixBase<BaseFloat> *online_ivectors,
diff --git a/src/nnet3/decodable-simple-looped.h b/src/nnet3/decodable-simple-looped.h
index ca3f732641e..71724508af4 100644
--- a/src/nnet3/decodable-simple-looped.h
+++ b/src/nnet3/decodable-simple-looped.h
@@ -23,7 +23,7 @@
 #include <vector>
 #include "base/kaldi-common.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "itf/decodable-itf.h"
 #include "nnet3/nnet-optimize.h"
 #include "nnet3/nnet-compute.h"
@@ -295,7 +295,7 @@ class DecodableAmNnetSimpleLooped: public DecodableInterface {
                         (in frames) with which the iVectors are estimated.
   */
   DecodableAmNnetSimpleLooped(const DecodableNnetSimpleLoopedInfo &info,
-                              const TransitionModel &trans_model,
+                              const Transitions &trans_model,
                               const MatrixBase<BaseFloat> &feats,
                               const VectorBase<BaseFloat> *ivector = NULL,
                               const MatrixBase<BaseFloat> *online_ivectors = NULL,
@@ -318,7 +318,7 @@ class DecodableAmNnetSimpleLooped: public DecodableInterface {
  private:
   KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmNnetSimpleLooped);
   DecodableNnetSimpleLooped decodable_nnet_;
-  const TransitionModel &trans_model_;
+  const Transitions &trans_model_;
 };
 
 
diff --git a/src/nnet3/discriminative-supervision.cc b/src/nnet3/discriminative-supervision.cc
index 0f8f8a4aef7..1097a4c472a 100644
--- a/src/nnet3/discriminative-supervision.cc
+++ b/src/nnet3/discriminative-supervision.cc
@@ -135,7 +135,7 @@ void DiscriminativeSupervision::Check() const {
 
 DiscriminativeSupervisionSplitter::DiscriminativeSupervisionSplitter(
     const SplitDiscriminativeSupervisionOptions &config,
-    const TransitionModel &tmodel,
+    const Transitions &tmodel,
     const DiscriminativeSupervision &supervision):
     config_(config), tmodel_(tmodel), supervision_(supervision) {
   if (supervision_.num_sequences != 1) {
diff --git a/src/nnet3/discriminative-supervision.h b/src/nnet3/discriminative-supervision.h
index 17c0b1cdb1e..ac563814d56 100644
--- a/src/nnet3/discriminative-supervision.h
+++ b/src/nnet3/discriminative-supervision.h
@@ -23,7 +23,7 @@
 
 #include "util/table-types.h"
 #include "hmm/posterior.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "lat/kaldi-lattice.h"
 
 namespace kaldi {
@@ -144,7 +144,7 @@ class DiscriminativeSupervisionSplitter {
 
   DiscriminativeSupervisionSplitter(
       const SplitDiscriminativeSupervisionOptions &config,
-      const TransitionModel &tmodel,
+      const Transitions &tmodel,
       const DiscriminativeSupervision &supervision);
 
   // A structure used to store the forward and backward scores
@@ -185,7 +185,7 @@ class DiscriminativeSupervisionSplitter {
 
   // Transition model is used by the function
   // CollapseTransitionIds()
-  const TransitionModel &tmodel_;
+  const Transitions &tmodel_;
 
   // A reference to the supervision object that we will be splitting
   const DiscriminativeSupervision &supervision_;
diff --git a/src/nnet3/discriminative-training.cc b/src/nnet3/discriminative-training.cc
index 4a32236c9ff..a4c967c6622 100644
--- a/src/nnet3/discriminative-training.cc
+++ b/src/nnet3/discriminative-training.cc
@@ -94,7 +94,7 @@ class DiscriminativeComputation {
   // even though this does not offer any computational advantages here
   // as in the 'chain' case.
   DiscriminativeComputation(const DiscriminativeOptions &opts,
-      const TransitionModel &tmodel,
+      const Transitions &tmodel,
       const CuVectorBase<BaseFloat> &log_priors,
       const DiscriminativeSupervision &supervision,
       const CuMatrixBase<BaseFloat> &nnet_output,
@@ -109,7 +109,7 @@ class DiscriminativeComputation {
 
  private:
   const DiscriminativeOptions &opts_;
-  const TransitionModel &tmodel_;
+  const Transitions &tmodel_;
 
   // Vector of log-priors of pdfs.
   // This can be a size zero vector e.g. for 'chain' model
@@ -180,7 +180,7 @@ class DiscriminativeComputation {
 
 DiscriminativeComputation::DiscriminativeComputation(
                             const DiscriminativeOptions &opts,
-                            const TransitionModel &tmodel,
+                            const Transitions &tmodel,
                             const CuVectorBase<BaseFloat> &log_priors,
                             const DiscriminativeSupervision &supervision,
                             const CuMatrixBase<BaseFloat> &nnet_output,
@@ -544,7 +544,7 @@ double DiscriminativeComputation::ComputeObjfAndDeriv(Posterior *post,
 
 
 void ComputeDiscriminativeObjfAndDeriv(const DiscriminativeOptions &opts,
-                                       const TransitionModel &tmodel,
+                                       const Transitions &tmodel,
                                        const CuVectorBase<BaseFloat> &log_priors,
                                        const DiscriminativeSupervision &supervision,
                                        const CuMatrixBase<BaseFloat> &nnet_output,
diff --git a/src/nnet3/discriminative-training.h b/src/nnet3/discriminative-training.h
index 4ec7109d64f..96d95a54ca5 100644
--- a/src/nnet3/discriminative-training.h
+++ b/src/nnet3/discriminative-training.h
@@ -29,7 +29,7 @@
 #include "tree/context-dep.h"
 #include "lat/kaldi-lattice.h"
 #include "matrix/kaldi-matrix.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/discriminative-supervision.h"
 #include "lat/lattice-functions.h"
 #include "cudamatrix/cu-matrix-lib.h"
@@ -235,7 +235,7 @@ struct DiscriminativeObjectiveInfo {
 */
 void ComputeDiscriminativeObjfAndDeriv(
     const DiscriminativeOptions &opts,
-    const TransitionModel &tmodel,
+    const Transitions &tmodel,
     const CuVectorBase<BaseFloat> &log_priors,
     const DiscriminativeSupervision &supervision,
     const CuMatrixBase<BaseFloat> &nnet_output,
diff --git a/src/nnet3/nnet-am-decodable-simple.cc b/src/nnet3/nnet-am-decodable-simple.cc
index 9682bd96bc7..1b0ddc066a9 100644
--- a/src/nnet3/nnet-am-decodable-simple.cc
+++ b/src/nnet3/nnet-am-decodable-simple.cc
@@ -57,7 +57,7 @@ DecodableNnetSimple::DecodableNnetSimple(
 
 DecodableAmNnetSimple::DecodableAmNnetSimple(
     const NnetSimpleComputationOptions &opts,
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     const AmNnetSimple &am_nnet,
     const MatrixBase<BaseFloat> &feats,
     const VectorBase<BaseFloat> *ivector,
@@ -312,7 +312,7 @@ void DecodableNnetSimple::CheckAndFixConfigs() {
 
 DecodableAmNnetSimpleParallel::DecodableAmNnetSimpleParallel(
     const NnetSimpleComputationOptions &opts,
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     const AmNnetSimple &am_nnet,
     const MatrixBase<BaseFloat> &feats,
     const VectorBase<BaseFloat> *ivector,
diff --git a/src/nnet3/nnet-am-decodable-simple.h b/src/nnet3/nnet-am-decodable-simple.h
index e83b9e4bab2..34560bf247c 100644
--- a/src/nnet3/nnet-am-decodable-simple.h
+++ b/src/nnet3/nnet-am-decodable-simple.h
@@ -23,7 +23,7 @@
 #include <vector>
 #include "base/kaldi-common.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "itf/decodable-itf.h"
 #include "nnet3/nnet-optimize.h"
 #include "nnet3/nnet-compute.h"
@@ -283,7 +283,7 @@ class DecodableAmNnetSimple: public DecodableInterface {
                         compiler(am_nnet.GetNnet(), opts.optimize_config).
   */
   DecodableAmNnetSimple(const NnetSimpleComputationOptions &opts,
-                        const TransitionModel &trans_model,
+                        const Transitions &trans_model,
                         const AmNnetSimple &am_nnet,
                         const MatrixBase<BaseFloat> &feats,
                         const VectorBase<BaseFloat> *ivector = NULL,
@@ -311,7 +311,7 @@ class DecodableAmNnetSimple: public DecodableInterface {
   // argument to the constructor is NULL.
   CachingOptimizingCompiler compiler_;
   DecodableNnetSimple decodable_nnet_;
-  const TransitionModel &trans_model_;
+  const Transitions &trans_model_;
 };
 
 
@@ -355,7 +355,7 @@ class DecodableAmNnetSimpleParallel: public DecodableInterface {
   */
   DecodableAmNnetSimpleParallel(
       const NnetSimpleComputationOptions &opts,
-      const TransitionModel &trans_model,
+      const Transitions &trans_model,
       const AmNnetSimple &am_nnet,
       const MatrixBase<BaseFloat> &feats,
       const VectorBase<BaseFloat> *ivector = NULL,
@@ -382,7 +382,7 @@ class DecodableAmNnetSimpleParallel: public DecodableInterface {
   void DeletePointers();
 
   CachingOptimizingCompiler compiler_;
-  const TransitionModel &trans_model_;
+  const Transitions &trans_model_;
 
   Matrix<BaseFloat> *feats_copy_;
   Vector<BaseFloat> *ivector_copy_;
diff --git a/src/nnet3/nnet-batch-compute.cc b/src/nnet3/nnet-batch-compute.cc
index 7124afb22b1..8713d17c049 100644
--- a/src/nnet3/nnet-batch-compute.cc
+++ b/src/nnet3/nnet-batch-compute.cc
@@ -1007,7 +1007,7 @@ void NnetBatchInference::Compute() {
 NnetBatchDecoder::NnetBatchDecoder(
     const fst::Fst<fst::StdArc> &fst,
     const LatticeFasterDecoderConfig &decoder_opts,
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     const fst::SymbolTable *word_syms,
     bool allow_partial,
     int32 num_threads,
diff --git a/src/nnet3/nnet-batch-compute.h b/src/nnet3/nnet-batch-compute.h
index bdc58e8cb4b..e30d27e5e9a 100644
--- a/src/nnet3/nnet-batch-compute.h
+++ b/src/nnet3/nnet-batch-compute.h
@@ -28,7 +28,7 @@
 #include <condition_variable>
 #include "base/kaldi-common.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "itf/decodable-itf.h"
 #include "nnet3/nnet-optimize.h"
 #include "nnet3/nnet-compute.h"
@@ -628,7 +628,7 @@ class NnetBatchDecoder {
    */
   NnetBatchDecoder(const fst::Fst<fst::StdArc> &fst,
                    const LatticeFasterDecoderConfig &decoder_config,
-                   const TransitionModel &trans_model,
+                   const Transitions &trans_model,
                    const fst::SymbolTable *word_syms,
                    bool allow_partial,
                    int32 num_threads,
@@ -768,7 +768,7 @@ class NnetBatchDecoder {
 
   const fst::Fst<fst::StdArc> &fst_;
   const LatticeFasterDecoderConfig &decoder_opts_;
-  const TransitionModel &trans_model_;
+  const Transitions &trans_model_;
   const fst::SymbolTable *word_syms_;  // May be NULL.  Owned here.
   bool allow_partial_;
   NnetBatchComputer *computer_;
diff --git a/src/nnet3/nnet-discriminative-diagnostics.cc b/src/nnet3/nnet-discriminative-diagnostics.cc
index 488372be8e1..36891804a18 100644
--- a/src/nnet3/nnet-discriminative-diagnostics.cc
+++ b/src/nnet3/nnet-discriminative-diagnostics.cc
@@ -28,7 +28,7 @@ namespace nnet3 {
 NnetDiscriminativeComputeObjf::NnetDiscriminativeComputeObjf(
     const NnetComputeProbOptions &nnet_config,
     const discriminative::DiscriminativeOptions &discriminative_config,
-    const TransitionModel &tmodel,
+    const Transitions &tmodel,
     const VectorBase<BaseFloat> &priors,
     const Nnet &nnet):
     nnet_config_(nnet_config),
diff --git a/src/nnet3/nnet-discriminative-diagnostics.h b/src/nnet3/nnet-discriminative-diagnostics.h
index 3bcae8fac30..ca1b1f82915 100644
--- a/src/nnet3/nnet-discriminative-diagnostics.h
+++ b/src/nnet3/nnet-discriminative-diagnostics.h
@@ -41,7 +41,7 @@ class NnetDiscriminativeComputeObjf {
   // does not store a reference to 'config' but does store one to 'nnet'.
   NnetDiscriminativeComputeObjf(const NnetComputeProbOptions &nnet_config,
       const discriminative::DiscriminativeOptions &discriminative_config,
-      const TransitionModel &tmodel,
+      const Transitions &tmodel,
       const VectorBase<BaseFloat> &priors,
       const Nnet &nnet);
 
@@ -71,7 +71,7 @@ class NnetDiscriminativeComputeObjf {
   NnetComputeProbOptions nnet_config_;
 
   discriminative::DiscriminativeOptions discriminative_config_;
-  const TransitionModel &tmodel_;
+  const Transitions &tmodel_;
   CuVector<BaseFloat> log_priors_;
   const Nnet &nnet_;
   CachingOptimizingCompiler compiler_;
diff --git a/src/nnet3/nnet-discriminative-example.h b/src/nnet3/nnet-discriminative-example.h
index c0ea446552e..e516f503794 100644
--- a/src/nnet3/nnet-discriminative-example.h
+++ b/src/nnet3/nnet-discriminative-example.h
@@ -28,7 +28,7 @@
 #include "nnet3/nnet-example.h"
 #include "nnet3/nnet-example-utils.h"
 #include "hmm/posterior.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 namespace kaldi {
 namespace nnet3 {
diff --git a/src/nnet3/nnet-discriminative-training.cc b/src/nnet3/nnet-discriminative-training.cc
index 91a72c73cca..905fb6dc432 100644
--- a/src/nnet3/nnet-discriminative-training.cc
+++ b/src/nnet3/nnet-discriminative-training.cc
@@ -26,7 +26,7 @@ namespace nnet3 {
 
 NnetDiscriminativeTrainer::NnetDiscriminativeTrainer(
                                    const NnetDiscriminativeOptions &opts,
-                                   const TransitionModel &tmodel,
+                                   const Transitions &tmodel,
                                    const VectorBase<BaseFloat> &priors,
                                    Nnet *nnet):
     opts_(opts), tmodel_(tmodel), log_priors_(priors),
diff --git a/src/nnet3/nnet-discriminative-training.h b/src/nnet3/nnet-discriminative-training.h
index 4846aeca9d3..3eff44efcd9 100644
--- a/src/nnet3/nnet-discriminative-training.h
+++ b/src/nnet3/nnet-discriminative-training.h
@@ -87,7 +87,7 @@ struct DiscriminativeObjectiveFunctionInfo {
 class NnetDiscriminativeTrainer {
  public:
   NnetDiscriminativeTrainer(const NnetDiscriminativeOptions &config,
-                            const TransitionModel &tmodel,
+                            const Transitions &tmodel,
                             const VectorBase<BaseFloat> &priors,
                             Nnet *nnet);
 
@@ -104,7 +104,7 @@ class NnetDiscriminativeTrainer {
 
   const NnetDiscriminativeOptions opts_;
 
-  const TransitionModel &tmodel_;
+  const Transitions &tmodel_;
   CuVector<BaseFloat> log_priors_;
   
   Nnet *nnet_;
diff --git a/src/nnet3/nnet-nnet.cc b/src/nnet3/nnet-nnet.cc
index 334234f53db..12dde9d8f9e 100644
--- a/src/nnet3/nnet-nnet.cc
+++ b/src/nnet3/nnet-nnet.cc
@@ -24,7 +24,7 @@
 #include "nnet3/nnet-utils.h"
 #include "nnet3/nnet-simple-component.h"
 #include "nnet3/am-nnet-simple.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 namespace kaldi {
 namespace nnet3 {
@@ -587,10 +587,10 @@ void Nnet::Read(std::istream &is, bool binary) {
   Destroy();
   int first_char = PeekToken(is, binary);
   if (first_char == 'T') {
-    // This branch is to allow '.mdl' files (containing a TransitionModel
+    // This branch is to allow '.mdl' files (containing a Transitions object
     // and then an AmNnetSimple) to be read where .raw files (containing
     // just an Nnet) would be expected.  This is often convenient.
-    TransitionModel temp_trans_model;
+    Transitions temp_trans_model;
     temp_trans_model.Read(is, binary);
     AmNnetSimple temp_am_nnet;
     temp_am_nnet.Read(is, binary);
diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile
index 67d15d3c38a..ac4ee9b1818 100644
--- a/src/nnet3bin/Makefile
+++ b/src/nnet3bin/Makefile
@@ -8,7 +8,7 @@ LDLIBS += $(CUDA_LDLIBS)
 
 BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \
    nnet3-shuffle-egs nnet3-acc-lda-stats nnet3-merge-egs \
-   nnet3-compute-from-egs nnet3-train nnet3-am-init nnet3-am-train-transitions \
+   nnet3-compute-from-egs nnet3-train nnet3-am-init \
    nnet3-am-adjust-priors nnet3-am-copy nnet3-compute-prob \
    nnet3-average nnet3-am-info nnet3-combine nnet3-latgen-faster \
    nnet3-latgen-faster-parallel nnet3-show-progress nnet3-align-compiled \
@@ -33,6 +33,6 @@ ADDLIBS = ../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a \
           ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
           ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
-          ../base/kaldi-base.a 
+          ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
diff --git a/src/nnet3bin/nnet3-acc-lda-stats.cc b/src/nnet3bin/nnet3-acc-lda-stats.cc
index c8911a4a39f..c7f0a20aa6b 100644
--- a/src/nnet3bin/nnet3-acc-lda-stats.cc
+++ b/src/nnet3bin/nnet3-acc-lda-stats.cc
@@ -19,7 +19,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/nnet-nnet.h"
 #include "nnet3/nnet-example-utils.h"
 #include "nnet3/nnet-optimize.h"
diff --git a/src/nnet3bin/nnet3-align-compiled.cc b/src/nnet3bin/nnet3-align-compiled.cc
index a09ab1cf8ae..09d1fcc4407 100644
--- a/src/nnet3bin/nnet3-align-compiled.cc
+++ b/src/nnet3bin/nnet3-align-compiled.cc
@@ -23,7 +23,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/hmm-utils.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/decoder-wrappers.h"
@@ -108,7 +108,7 @@ int main(int argc, char *argv[]) {
 
 
     {
-      TransitionModel trans_model;
+      Transitions trans_model;
       AmNnetSimple am_nnet;
       {
         bool binary;
diff --git a/src/nnet3bin/nnet3-am-adjust-priors.cc b/src/nnet3bin/nnet3-am-adjust-priors.cc
index 957e1dae04a..fa9729682c9 100644
--- a/src/nnet3bin/nnet3-am-adjust-priors.cc
+++ b/src/nnet3bin/nnet3-am-adjust-priors.cc
@@ -20,7 +20,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "nnet3/am-nnet-simple.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "tree/context-dep.h"
 
 namespace kaldi {
@@ -107,7 +107,7 @@ int main(int argc, char *argv[]) {
         posterior_vec_rxfilename = po.GetArg(2),
         nnet_wxfilename = po.GetArg(3);
     
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmNnetSimple am_nnet;
     {
       bool binary_read;
diff --git a/src/nnet3bin/nnet3-am-copy.cc b/src/nnet3bin/nnet3-am-copy.cc
index 2230ae77c00..2c944b657ae 100644
--- a/src/nnet3bin/nnet3-am-copy.cc
+++ b/src/nnet3bin/nnet3-am-copy.cc
@@ -21,7 +21,7 @@
 #include <typeinfo>
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/am-nnet-simple.h"
 #include "nnet3/nnet-utils.h"
 
@@ -98,7 +98,7 @@ int main(int argc, char *argv[]) {
     std::string nnet_rxfilename = po.GetArg(1),
         nnet_wxfilename = po.GetArg(2);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmNnetSimple am_nnet;
     {
       bool binary;
diff --git a/src/nnet3bin/nnet3-am-info.cc b/src/nnet3bin/nnet3-am-info.cc
index be3df5e1e8a..e14c2f1876a 100644
--- a/src/nnet3bin/nnet3-am-info.cc
+++ b/src/nnet3bin/nnet3-am-info.cc
@@ -20,7 +20,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "nnet3/am-nnet-simple.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 int main(int argc, char *argv[]) {
   try {
@@ -48,7 +48,7 @@ int main(int argc, char *argv[]) {
 
     std::string  nnet_rxfilename = po.GetArg(1);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmNnetSimple am_nnet;
     {
       bool binary;
diff --git a/src/nnet3bin/nnet3-am-init.cc b/src/nnet3bin/nnet3-am-init.cc
index 3cb0bfe012b..6d8e69804f8 100644
--- a/src/nnet3bin/nnet3-am-init.cc
+++ b/src/nnet3bin/nnet3-am-init.cc
@@ -19,7 +19,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "tree/context-dep.h"
 #include "nnet3/am-nnet-simple.h"
 
@@ -56,7 +56,7 @@ int main(int argc, char *argv[]) {
 
     std::string raw_nnet_rxfilename,
         am_nnet_wxfilename;
-    TransitionModel *trans_model = NULL;
+    Transitions *trans_model = NULL;
     
     if (po.NumArgs() == 4) {
       std::string tree_rxfilename = po.GetArg(1),
@@ -67,17 +67,17 @@ int main(int argc, char *argv[]) {
       ContextDependency ctx_dep;
       ReadKaldiObject(tree_rxfilename, &ctx_dep);
     
-      HmmTopology topo;
+      Topology topo;
       ReadKaldiObject(topo_rxfilename, &topo);
       
       // Construct the transition model from the tree and the topology file.
-      trans_model = new TransitionModel(ctx_dep, topo);
+      trans_model = new Transitions(ctx_dep, topo);
     } else {
       std::string trans_model_rxfilename =  po.GetArg(1);
       raw_nnet_rxfilename = po.GetArg(2);
       am_nnet_wxfilename = po.GetArg(3);
       
-      trans_model = new TransitionModel();
+      trans_model = new Transitions();
       ReadKaldiObject(trans_model_rxfilename, trans_model);
     }
 
diff --git a/src/nnet3bin/nnet3-am-train-transitions.cc b/src/nnet3bin/nnet3-am-train-transitions.cc
deleted file mode 100644
index a1956d1e6b9..00000000000
--- a/src/nnet3bin/nnet3-am-train-transitions.cc
+++ /dev/null
@@ -1,147 +0,0 @@
-// nnet3bin/nnet3-am-train-transitions.cc
-
-// Copyright 2012-2015  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "nnet3/am-nnet-simple.h"
-#include "tree/context-dep.h"
-
-namespace kaldi {
-namespace nnet3 {
-void SetPriors(const TransitionModel &tmodel,
-               const Vector<double> &transition_accs,
-               double prior_floor,
-               AmNnetSimple *am_nnet) {
-  KALDI_ASSERT(tmodel.NumPdfs() == am_nnet->NumPdfs());
-  Vector<BaseFloat> pdf_counts(tmodel.NumPdfs());
-  KALDI_ASSERT(transition_accs(0) == 0.0); // There is
-  // no zero transition-id.
-  for (int32 tid = 1; tid < transition_accs.Dim(); tid++) {
-    int32 pdf = tmodel.TransitionIdToPdf(tid);
-    pdf_counts(pdf) += transition_accs(tid);
-  }
-  BaseFloat sum = pdf_counts.Sum();
-  KALDI_ASSERT(sum != 0.0);
-  KALDI_ASSERT(prior_floor > 0.0 && prior_floor < 1.0);
-  pdf_counts.Scale(1.0 / sum);
-  pdf_counts.ApplyFloor(prior_floor);
-  pdf_counts.Scale(1.0 / pdf_counts.Sum()); // normalize again.
-  am_nnet->SetPriors(pdf_counts);
-}               
-
-
-} // namespace nnet3
-} // namespace kaldi
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet3;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Train the transition probabilities of an nnet3 neural network acoustic model\n"
-        "\n"
-        "Usage:  nnet3-am-train-transitions [options] <nnet-in> <alignments-rspecifier> <nnet-out>\n"
-        "e.g.:\n"
-        " nnet3-am-train-transitions 1.nnet \"ark:gunzip -c ali.*.gz|\" 2.nnet\n";
-    
-    bool binary_write = true;
-    bool set_priors = true; // Also set the per-pdf priors in the model.
-    BaseFloat prior_floor = 5.0e-06; // The default was previously 1e-8, but
-                                     // once we had problems with a pdf-id that
-                                     // was not being seen in training, being
-                                     // recognized all the time.  This value
-                                     // seemed to be the smallest prior of the
-                                     // "seen" pdf-ids in one run.
-    MleTransitionUpdateConfig transition_update_config;
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("set-priors", &set_priors, "If true, also set priors in neural "
-                "net (we divide by these in test time)");
-    po.Register("prior-floor", &prior_floor, "When setting priors, floor for "
-                "priors");
-    transition_update_config.Register(&po);
-    
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet_rxfilename = po.GetArg(1),
-        ali_rspecifier = po.GetArg(2),
-        nnet_wxfilename = po.GetArg(3);
-    
-    TransitionModel trans_model;
-    AmNnetSimple am_nnet;
-    {
-      bool binary_read;
-      Input ki(nnet_rxfilename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_nnet.Read(ki.Stream(), binary_read);
-    }
-    
-    Vector<double> transition_accs;
-    trans_model.InitStats(&transition_accs);
-
-    int32 num_done = 0;
-    SequentialInt32VectorReader ali_reader(ali_rspecifier);
-    for (; ! ali_reader.Done(); ali_reader.Next()) {
-      const std::vector<int32> alignment(ali_reader.Value());
-      for (size_t i = 0; i < alignment.size(); i++) {
-        int32 tid = alignment[i];
-        BaseFloat weight = 1.0;
-        trans_model.Accumulate(weight, tid, &transition_accs);
-      }
-      num_done++;
-    }
-    KALDI_LOG << "Accumulated transition stats from " << num_done
-              << " utterances.";
-
-    {
-      BaseFloat objf_impr, count;
-      trans_model.MleUpdate(transition_accs, transition_update_config,
-                            &objf_impr, &count);
-      KALDI_LOG << "Transition model update: average " << (objf_impr/count)
-                << " log-like improvement per frame over " << count
-                << " frames.";
-    }
-
-    if (set_priors) {
-      KALDI_LOG << "Setting priors of pdfs in the model.";
-      SetPriors(trans_model, transition_accs, prior_floor, &am_nnet);
-    }
-    
-    {
-      Output ko(nnet_wxfilename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-      am_nnet.Write(ko.Stream(), binary_write);
-    }
-    KALDI_LOG << "Trained transitions of neural network model and wrote it to "
-              << nnet_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
diff --git a/src/nnet3bin/nnet3-average.cc b/src/nnet3bin/nnet3-average.cc
index d794e37e50d..50085010d07 100644
--- a/src/nnet3bin/nnet3-average.cc
+++ b/src/nnet3bin/nnet3-average.cc
@@ -19,7 +19,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/nnet-utils.h"
 
 
diff --git a/src/nnet3bin/nnet3-compute-batch.cc b/src/nnet3bin/nnet3-compute-batch.cc
index 5d4b9b1db48..2c3606bb896 100644
--- a/src/nnet3bin/nnet3-compute-batch.cc
+++ b/src/nnet3bin/nnet3-compute-batch.cc
@@ -104,7 +104,7 @@ int main(int argc, char *argv[]) {
     AmNnetSimple am_nnet;
     if (use_priors) {
       bool binary;
-      TransitionModel trans_model;
+      Transitions trans_model;
       Input ki(nnet_rxfilename, &binary);
       trans_model.Read(ki.Stream(), binary);
       am_nnet.Read(ki.Stream(), binary);
diff --git a/src/nnet3bin/nnet3-compute-from-egs.cc b/src/nnet3bin/nnet3-compute-from-egs.cc
index 8f29675211f..f04044e6c88 100644
--- a/src/nnet3bin/nnet3-compute-from-egs.cc
+++ b/src/nnet3bin/nnet3-compute-from-egs.cc
@@ -19,7 +19,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/nnet-nnet.h"
 #include "nnet3/nnet-example-utils.h"
 #include "nnet3/nnet-optimize.h"
diff --git a/src/nnet3bin/nnet3-compute.cc b/src/nnet3bin/nnet3-compute.cc
index cf133025aae..681207ef813 100644
--- a/src/nnet3bin/nnet3-compute.cc
+++ b/src/nnet3bin/nnet3-compute.cc
@@ -101,7 +101,7 @@ int main(int argc, char *argv[]) {
     AmNnetSimple am_nnet;
     if (use_priors) {
       bool binary;
-      TransitionModel trans_model;
+      Transitions trans_model;
       Input ki(nnet_rxfilename, &binary);
       trans_model.Read(ki.Stream(), binary);
       am_nnet.Read(ki.Stream(), binary);
diff --git a/src/nnet3bin/nnet3-copy-egs.cc b/src/nnet3bin/nnet3-copy-egs.cc
index 19c205461ae..9ff9395f47c 100644
--- a/src/nnet3bin/nnet3-copy-egs.cc
+++ b/src/nnet3bin/nnet3-copy-egs.cc
@@ -20,7 +20,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/nnet-example.h"
 #include "nnet3/nnet-example-utils.h"
 
diff --git a/src/nnet3bin/nnet3-copy.cc b/src/nnet3bin/nnet3-copy.cc
index 6d53d44e087..3d001ad051c 100644
--- a/src/nnet3bin/nnet3-copy.cc
+++ b/src/nnet3bin/nnet3-copy.cc
@@ -21,7 +21,7 @@
 #include <typeinfo>
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/am-nnet-simple.h"
 #include "nnet3/nnet-utils.h"
 
diff --git a/src/nnet3bin/nnet3-discriminative-compute-from-egs.cc b/src/nnet3bin/nnet3-discriminative-compute-from-egs.cc
index d8b0f469beb..480523e202d 100644
--- a/src/nnet3bin/nnet3-discriminative-compute-from-egs.cc
+++ b/src/nnet3bin/nnet3-discriminative-compute-from-egs.cc
@@ -19,7 +19,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/nnet-nnet.h"
 #include "nnet3/nnet-example-utils.h"
 #include "nnet3/nnet-discriminative-example.h"
diff --git a/src/nnet3bin/nnet3-discriminative-compute-objf.cc b/src/nnet3bin/nnet3-discriminative-compute-objf.cc
index cc3d0f1e3ac..a2cc2c3e794 100644
--- a/src/nnet3bin/nnet3-discriminative-compute-objf.cc
+++ b/src/nnet3bin/nnet3-discriminative-compute-objf.cc
@@ -70,7 +70,7 @@ int main(int argc, char *argv[]) {
     std::string model_rxfilename = po.GetArg(1),
         examples_rspecifier = po.GetArg(2);
 
-    TransitionModel tmodel;
+    Transitions tmodel;
     AmNnetSimple am_nnet;
 
     {
diff --git a/src/nnet3bin/nnet3-discriminative-copy-egs.cc b/src/nnet3bin/nnet3-discriminative-copy-egs.cc
index 17dc2ee4e13..bde1dc29c5d 100644
--- a/src/nnet3bin/nnet3-discriminative-copy-egs.cc
+++ b/src/nnet3bin/nnet3-discriminative-copy-egs.cc
@@ -20,7 +20,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/nnet-discriminative-example.h"
 
 namespace kaldi {
diff --git a/src/nnet3bin/nnet3-discriminative-get-egs.cc b/src/nnet3bin/nnet3-discriminative-get-egs.cc
index 4a31876532f..37ff3d8863a 100644
--- a/src/nnet3bin/nnet3-discriminative-get-egs.cc
+++ b/src/nnet3bin/nnet3-discriminative-get-egs.cc
@@ -22,7 +22,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/posterior.h"
 #include "nnet3/nnet-discriminative-example.h"
 #include "nnet3/discriminative-supervision.h"
@@ -37,7 +37,7 @@ namespace nnet3 {
 // returns true if we got as far as calling GetChunksForUtterance()
 // [in which case stats will be accumulated by class UtteranceSplitter]
 static bool ProcessFile(const discriminative::SplitDiscriminativeSupervisionOptions &config,
-                        const TransitionModel &tmodel,
+                        const Transitions &tmodel,
                         const MatrixBase<BaseFloat> &feats,
                         const MatrixBase<BaseFloat> *ivector_feats,
                         int32 ivector_period,
@@ -214,7 +214,7 @@ int main(int argc, char *argv[]) {
         examples_wspecifier = po.GetArg(5);
 
 
-    TransitionModel tmodel;
+    Transitions tmodel;
     {
       bool binary;
       Input ki(model_wxfilename, &binary);
diff --git a/src/nnet3bin/nnet3-discriminative-merge-egs.cc b/src/nnet3bin/nnet3-discriminative-merge-egs.cc
index bc4cdfb2941..e3e27f8c2f6 100644
--- a/src/nnet3bin/nnet3-discriminative-merge-egs.cc
+++ b/src/nnet3bin/nnet3-discriminative-merge-egs.cc
@@ -20,7 +20,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/nnet-discriminative-example.h"
 
 
diff --git a/src/nnet3bin/nnet3-discriminative-shuffle-egs.cc b/src/nnet3bin/nnet3-discriminative-shuffle-egs.cc
index 2a029123852..251ea693197 100644
--- a/src/nnet3bin/nnet3-discriminative-shuffle-egs.cc
+++ b/src/nnet3bin/nnet3-discriminative-shuffle-egs.cc
@@ -20,7 +20,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/nnet-discriminative-example.h"
 
 int main(int argc, char *argv[]) {
diff --git a/src/nnet3bin/nnet3-discriminative-train.cc b/src/nnet3bin/nnet3-discriminative-train.cc
index 0d201b1ae8d..624b6417acf 100644
--- a/src/nnet3bin/nnet3-discriminative-train.cc
+++ b/src/nnet3bin/nnet3-discriminative-train.cc
@@ -71,7 +71,7 @@ int main(int argc, char *argv[]) {
         examples_rspecifier = po.GetArg(2),
         model_wxfilename = po.GetArg(3);
 
-    TransitionModel tmodel;
+    Transitions tmodel;
     AmNnetSimple am_nnet;
 
     bool binary;
diff --git a/src/nnet3bin/nnet3-egs-augment-image.cc b/src/nnet3bin/nnet3-egs-augment-image.cc
index ef724d0c6a6..331169e3edf 100644
--- a/src/nnet3bin/nnet3-egs-augment-image.cc
+++ b/src/nnet3bin/nnet3-egs-augment-image.cc
@@ -21,7 +21,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/nnet-example.h"
 #include "nnet3/nnet-example-utils.h"
 
diff --git a/src/nnet3bin/nnet3-get-egs-dense-targets.cc b/src/nnet3bin/nnet3-get-egs-dense-targets.cc
index a1902071b60..2773fc57cd0 100644
--- a/src/nnet3bin/nnet3-get-egs-dense-targets.cc
+++ b/src/nnet3bin/nnet3-get-egs-dense-targets.cc
@@ -22,7 +22,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/posterior.h"
 #include "nnet3/nnet-example.h"
 #include "nnet3/nnet-example-utils.h"
diff --git a/src/nnet3bin/nnet3-get-egs-simple.cc b/src/nnet3bin/nnet3-get-egs-simple.cc
index 031d941f0d5..3821f6ab8e1 100644
--- a/src/nnet3bin/nnet3-get-egs-simple.cc
+++ b/src/nnet3bin/nnet3-get-egs-simple.cc
@@ -20,7 +20,7 @@
 #include <sstream>
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/posterior.h"
 #include "nnet3/nnet-example.h"
 #include "nnet3/nnet-example-utils.h"
diff --git a/src/nnet3bin/nnet3-get-egs.cc b/src/nnet3bin/nnet3-get-egs.cc
index fed6d529a82..96410f7d678 100644
--- a/src/nnet3bin/nnet3-get-egs.cc
+++ b/src/nnet3bin/nnet3-get-egs.cc
@@ -21,7 +21,7 @@
 #include <sstream>
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/posterior.h"
 #include "nnet3/nnet-example.h"
 #include "nnet3/nnet-example-utils.h"
diff --git a/src/nnet3bin/nnet3-init.cc b/src/nnet3bin/nnet3-init.cc
index d913ee4e016..21f43c07783 100644
--- a/src/nnet3bin/nnet3-init.cc
+++ b/src/nnet3bin/nnet3-init.cc
@@ -20,7 +20,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "nnet3/nnet-nnet.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "tree/context-dep.h"
 
 int main(int argc, char *argv[]) {
diff --git a/src/nnet3bin/nnet3-latgen-faster-batch.cc b/src/nnet3bin/nnet3-latgen-faster-batch.cc
index ec52cff9776..021afbc41f4 100644
--- a/src/nnet3bin/nnet3-latgen-faster-batch.cc
+++ b/src/nnet3bin/nnet3-latgen-faster-batch.cc
@@ -23,7 +23,7 @@
 #include "base/kaldi-common.h"
 #include "decoder/decoder-wrappers.h"
 #include "fstext/fstext-lib.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/nnet-batch-compute.h"
 #include "nnet3/nnet-utils.h"
 #include "util/kaldi-thread.h"
@@ -129,7 +129,7 @@ int main(int argc, char *argv[]) {
         feature_rspecifier = po.GetArg(3),
         lattice_wspecifier = po.GetArg(4);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmNnetSimple am_nnet;
     {
       bool binary;
diff --git a/src/nnet3bin/nnet3-latgen-faster-looped.cc b/src/nnet3bin/nnet3-latgen-faster-looped.cc
index 1d46d001d22..1472f047f16 100644
--- a/src/nnet3bin/nnet3-latgen-faster-looped.cc
+++ b/src/nnet3bin/nnet3-latgen-faster-looped.cc
@@ -22,7 +22,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/decoder-wrappers.h"
 #include "nnet3/decodable-simple-looped.h"
@@ -90,7 +90,7 @@ int main(int argc, char *argv[]) {
         words_wspecifier = po.GetOptArg(5),
         alignment_wspecifier = po.GetOptArg(6);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmNnetSimple am_nnet;
     {
       bool binary;
diff --git a/src/nnet3bin/nnet3-latgen-faster-parallel.cc b/src/nnet3bin/nnet3-latgen-faster-parallel.cc
index e3d02410368..6e89b9e090e 100644
--- a/src/nnet3bin/nnet3-latgen-faster-parallel.cc
+++ b/src/nnet3bin/nnet3-latgen-faster-parallel.cc
@@ -23,7 +23,7 @@
 #include "base/kaldi-common.h"
 #include "decoder/decoder-wrappers.h"
 #include "fstext/fstext-lib.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/nnet-am-decodable-simple.h"
 #include "nnet3/nnet-utils.h"
 #include "util/kaldi-thread.h"
@@ -95,7 +95,7 @@ int main(int argc, char *argv[]) {
         alignment_wspecifier = po.GetOptArg(6);
 
     TaskSequencer<DecodeUtteranceLatticeFasterClass> sequencer(sequencer_config);
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmNnetSimple am_nnet;
     {
       bool binary;
diff --git a/src/nnet3bin/nnet3-latgen-faster.cc b/src/nnet3bin/nnet3-latgen-faster.cc
index 42cd843cf15..de81f94ebe4 100644
--- a/src/nnet3bin/nnet3-latgen-faster.cc
+++ b/src/nnet3bin/nnet3-latgen-faster.cc
@@ -22,7 +22,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/decoder-wrappers.h"
 #include "nnet3/nnet-am-decodable-simple.h"
@@ -90,7 +90,7 @@ int main(int argc, char *argv[]) {
         words_wspecifier = po.GetOptArg(5),
         alignment_wspecifier = po.GetOptArg(6);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmNnetSimple am_nnet;
     {
       bool binary;
diff --git a/src/nnet3bin/nnet3-latgen-grammar.cc b/src/nnet3bin/nnet3-latgen-grammar.cc
index 9d2304fb1d8..68f0bdb0a1a 100644
--- a/src/nnet3bin/nnet3-latgen-grammar.cc
+++ b/src/nnet3bin/nnet3-latgen-grammar.cc
@@ -21,7 +21,7 @@
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "tree/context-dep.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "fstext/fstext-lib.h"
 #include "decoder/decoder-wrappers.h"
 #include "nnet3/nnet-am-decodable-simple.h"
@@ -92,7 +92,7 @@ int main(int argc, char *argv[]) {
         words_wspecifier = po.GetOptArg(5),
         alignment_wspecifier = po.GetOptArg(6);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmNnetSimple am_nnet;
     {
       bool binary;
diff --git a/src/nnet3bin/nnet3-merge-egs.cc b/src/nnet3bin/nnet3-merge-egs.cc
index 8a345a4b703..f98ff3a5f43 100644
--- a/src/nnet3bin/nnet3-merge-egs.cc
+++ b/src/nnet3bin/nnet3-merge-egs.cc
@@ -20,7 +20,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/nnet-example.h"
 #include "nnet3/nnet-example-utils.h"
 
diff --git a/src/nnet3bin/nnet3-show-progress.cc b/src/nnet3bin/nnet3-show-progress.cc
index 25a65dbed5c..01b0265d2c1 100644
--- a/src/nnet3bin/nnet3-show-progress.cc
+++ b/src/nnet3bin/nnet3-show-progress.cc
@@ -20,7 +20,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/nnet-utils.h"
 #include "nnet3/nnet-diagnostics.h"
 
diff --git a/src/nnet3bin/nnet3-shuffle-egs.cc b/src/nnet3bin/nnet3-shuffle-egs.cc
index 1cf08085975..f017cb945cf 100644
--- a/src/nnet3bin/nnet3-shuffle-egs.cc
+++ b/src/nnet3bin/nnet3-shuffle-egs.cc
@@ -20,7 +20,7 @@
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "nnet3/nnet-example.h"
 
 int main(int argc, char *argv[]) {
diff --git a/src/nnetbin/Makefile b/src/nnetbin/Makefile
deleted file mode 100644
index 86d59ae503e..00000000000
--- a/src/nnetbin/Makefile
+++ /dev/null
@@ -1,30 +0,0 @@
-
-all:
-EXTRA_CXXFLAGS = -Wno-sign-compare
-include ../kaldi.mk
-
-LDFLAGS += $(CUDA_LDFLAGS)
-LDLIBS += $(CUDA_LDLIBS)
-
-BINFILES = nnet-train-frmshuff \
-        nnet-train-perutt \
-        nnet-train-mmi-sequential \
-        nnet-train-mpe-sequential \
-        nnet-train-multistream nnet-train-multistream-perutt \
-        rbm-train-cd1-frmshuff rbm-convert-to-nnet \
-        nnet-forward nnet-copy nnet-info nnet-concat \
-        transf-to-nnet cmvn-to-nnet nnet-initialize \
-	feat-to-post paste-post train-transitions \
-	cuda-gpu-available nnet-set-learnrate
-
-OBJFILES =
-
-
-
-TESTFILES =
-
-ADDLIBS = ../nnet/kaldi-nnet.a ../cudamatrix/kaldi-cudamatrix.a \
-          ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \
-          ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a 
-
-include ../makefiles/default_rules.mk
diff --git a/src/nnetbin/cmvn-to-nnet.cc b/src/nnetbin/cmvn-to-nnet.cc
deleted file mode 100644
index c06851e632a..00000000000
--- a/src/nnetbin/cmvn-to-nnet.cc
+++ /dev/null
@@ -1,121 +0,0 @@
-// nnetbin/cmvn-to-nnet.cc
-
-// Copyright 2012-2016  Brno University of Technology
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet/nnet-nnet.h"
-#include "nnet/nnet-various.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet1;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-      "Convert cmvn-stats into <AddShift> and <Rescale> components.\n"
-      "Usage:  cmvn-to-nnet [options] <transf-in> <nnet-out>\n"
-      "e.g.:\n"
-      " cmvn-to-nnet --binary=false transf.mat nnet.mdl\n";
-
-
-    bool binary_write = false;
-    float std_dev = 1.0;
-    float var_floor = 1e-10;
-    float learn_rate_coef = 0.0;
-
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("std-dev", &std_dev, "Standard deviation of the output.");
-    po.Register("var-floor", &var_floor,
-        "Floor the variance, so the factors in <Rescale> are bounded.");
-    po.Register("learn-rate-coef", &learn_rate_coef,
-        "Initialize learning-rate coefficient to a value.");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string cmvn_stats_rxfilename = po.GetArg(1),
-        model_out_filename = po.GetArg(2);
-
-    // read the matrix,
-    Matrix<double> cmvn_stats;
-    {
-      bool binary_read;
-      Input ki(cmvn_stats_rxfilename, &binary_read);
-      cmvn_stats.Read(ki.Stream(), binary_read);
-    }
-    KALDI_ASSERT(cmvn_stats.NumRows() == 2);
-    KALDI_ASSERT(cmvn_stats.NumCols() > 1);
-
-    int32 num_dims = cmvn_stats.NumCols() - 1;
-    double frame_count = cmvn_stats(0, cmvn_stats.NumCols() - 1);
-
-    // buffers for shift and scale
-    Vector<BaseFloat> shift(num_dims);
-    Vector<BaseFloat> scale(num_dims);
-
-    // compute the shift and scale per each dimension
-    for (int32 d = 0; d < num_dims; d++) {
-      BaseFloat mean = cmvn_stats(0, d) / frame_count;
-      BaseFloat var = cmvn_stats(1, d) / frame_count - mean * mean;
-      if (var <= var_floor) {
-        KALDI_WARN << "Very small variance " << var
-                   << " flooring to " << var_floor;
-        var = var_floor;
-      }
-      shift(d) = -mean;
-      scale(d) = std_dev / sqrt(var);
-    }
-
-    // create empty nnet,
-    Nnet nnet;
-
-    // append shift component to nnet,
-    {
-      AddShift shift_component(shift.Dim(), shift.Dim());
-      shift_component.SetParams(shift);
-      shift_component.SetLearnRateCoef(learn_rate_coef);
-      nnet.AppendComponent(shift_component);
-    }
-
-    // append scale component to nnet,
-    {
-      Rescale scale_component(scale.Dim(), scale.Dim());
-      scale_component.SetParams(scale);
-      scale_component.SetLearnRateCoef(learn_rate_coef);
-      nnet.AppendComponent(scale_component);
-    }
-
-    // write the nnet,
-    {
-      Output ko(model_out_filename, binary_write);
-      nnet.Write(ko.Stream(), binary_write);
-      KALDI_LOG << "Written cmvn in 'nnet1' model to: " << model_out_filename;
-    }
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/nnetbin/feat-to-post.cc b/src/nnetbin/feat-to-post.cc
deleted file mode 100644
index 6e4f4306938..00000000000
--- a/src/nnetbin/feat-to-post.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-// nnetbin/feat-to-post.cc
-
-// Copyright 2014       Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/posterior.h"
-
-/** @brief Converts features into posterior format, which is the generic
- *  format of NN training targets in 'nnet1'. */
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  typedef kaldi::int32 int32;
-  try {
-    const char *usage =
-      "Convert features into posterior format, which is the generic format \n"
-      "of NN training targets in Karel's nnet1 tools.\n"
-      "(speed is not an issue for reasonably low NN-output dimensions)\n"
-      "Usage:  feat-to-post [options] feat-rspecifier posteriors-wspecifier\n"
-      "e.g.:\n"
-      " feat-to-post scp:feats.scp ark:feats.post\n";
-
-    ParseOptions po(usage);
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string feats_rspecifier = po.GetArg(1);
-    std::string posteriors_wspecifier = po.GetArg(2);
-
-    int32 num_done = 0;
-    SequentialBaseFloatMatrixReader feats_reader(feats_rspecifier);
-    PosteriorWriter posterior_writer(posteriors_wspecifier);
-
-    for (; !feats_reader.Done(); feats_reader.Next()) {
-      num_done++;
-      const Matrix<BaseFloat> &mat = feats_reader.Value();
-      int32 num_frames = mat.NumRows(),
-        num_dims = mat.NumCols();
-      // Posterior is vector<vector<pair<int32, BaseFloat> > >
-      Posterior post(num_frames);
-      // Fill posterior with matrix values,
-      for (int32 f = 0; f < num_frames; f++) {
-        for (int32 d = 0; d < num_dims; d++) {
-          post[f].push_back(std::make_pair(d, mat(f, d)));
-        }
-        KALDI_ASSERT(post[f].size() == num_dims);
-      }
-      // Store
-      posterior_writer.Write(feats_reader.Key(), post);
-    }
-    KALDI_LOG << "Converted " << num_done << " alignments.";
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/nnetbin/nnet-concat.cc b/src/nnetbin/nnet-concat.cc
deleted file mode 100644
index 71c72d05b0a..00000000000
--- a/src/nnetbin/nnet-concat.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-// nnetbin/nnet-concat.cc
-
-// Copyright 2012-2013  Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet/nnet-nnet.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet1;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-      "Concatenate Neural Networks (and possibly change binary/text format)\n"
-      "Usage: nnet-concat [options] <nnet-in1> <...> <nnet-inN> <nnet-out>\n"
-      "e.g.:\n"
-      " nnet-concat --binary=false nnet.1 nnet.2 nnet.1.2\n";
-
-    ParseOptions po(usage);
-
-    bool binary_write = true;
-    po.Register("binary", &binary_write, "Write output in binary mode");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() < 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_in_filename = po.GetArg(1);
-    std::string model_in_filename_next;
-    std::string model_out_filename = po.GetArg(po.NumArgs());
-
-    // read the first nnet,
-    KALDI_LOG << "Reading " << model_in_filename;
-    Nnet nnet;
-    {
-      bool binary_read;
-      Input ki(model_in_filename, &binary_read);
-      nnet.Read(ki.Stream(), binary_read);
-    }
-
-    // read all the other nnets,
-    for (int32 i = 2; i < po.NumArgs(); i++) {
-      // read the nnet,
-      model_in_filename_next = po.GetArg(i);
-      KALDI_LOG << "Concatenating " << model_in_filename_next;
-      Nnet nnet_next;
-      {
-        bool binary_read;
-        Input ki(model_in_filename_next, &binary_read);
-        nnet_next.Read(ki.Stream(), binary_read);
-      }
-      // append nnet_next to the network nnet,
-      nnet.AppendNnet(nnet_next);
-    }
-
-    // finally write the nnet to disk,
-    {
-      Output ko(model_out_filename, binary_write);
-      nnet.Write(ko.Stream(), binary_write);
-    }
-
-    KALDI_LOG << "Written model to " << model_out_filename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/nnetbin/nnet-copy.cc b/src/nnetbin/nnet-copy.cc
deleted file mode 100644
index c4a27f2dd69..00000000000
--- a/src/nnetbin/nnet-copy.cc
+++ /dev/null
@@ -1,151 +0,0 @@
-// nnetbin/nnet-copy.cc
-
-// Copyright 2012-2015  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet/nnet-nnet.h"
-#include "nnet/nnet-parallel-component.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet1;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-      "Copy Neural Network model (and possibly change binary/text format)\n"
-      "Usage:  nnet-copy [options] <model-in> <model-out>\n"
-      "e.g.:\n"
-      " nnet-copy --binary=false nnet.mdl nnet_txt.mdl\n";
-
-    bool binary_write = true;
-    int32 remove_first_components = 0;
-    int32 remove_last_components = 0;
-    BaseFloat dropout_rate = -1.0;
-
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-
-    po.Register("remove-first-layers", &remove_first_components,
-        "Deprecated, please use --remove-first-components");
-    po.Register("remove-last-layers", &remove_last_components,
-        "Deprecated, please use --remove-last-components");
-
-    po.Register("remove-first-components", &remove_first_components,
-        "Remove N first Components from the Nnet");
-    po.Register("remove-last-components", &remove_last_components,
-        "Remove N last layers Components from the Nnet");
-
-    po.Register("dropout-rate", &dropout_rate,
-        "Probability that neuron is dropped"
-        "(-1.0 keeps original value).");
-
-    std::string from_parallel_component;
-    po.Register("from-parallel-component", &from_parallel_component,
-        "Extract nested network from parallel component (two possibilities: "
-        "'3' = search for ParallelComponent and get its 3rd network; "
-        "'1:3' = get 3nd network from 1st component; ID = 1..N).");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_in_filename = po.GetArg(1),
-        model_out_filename = po.GetArg(2);
-
-    // load the network
-    Nnet nnet;
-    {
-      bool binary_read;
-      Input ki(model_in_filename, &binary_read);
-      nnet.Read(ki.Stream(), binary_read);
-    }
-
-    // eventually replace 'nnet' by nested network from <ParallelComponent>,
-    if (from_parallel_component != "") {
-      std::vector<int32> component_id_nested_id;
-      kaldi::SplitStringToIntegers(from_parallel_component, ":", false,
-                                   &component_id_nested_id);
-      // parse the argument,
-      int32 component_id = -1, nested_id = 0;
-      switch (component_id_nested_id.size()) {
-        case 1:
-          nested_id = component_id_nested_id[0];
-          break;
-        case 2:
-          component_id = component_id_nested_id[0];
-          nested_id = component_id_nested_id[1];
-          break;
-        default:
-          KALDI_ERR << "Check the csl '--from-parallel-component='"
-                    << from_parallel_component
-                    << " There must be 1 or 2 elements.";
-      }
-      // search for first <ParallelComponent> (we don't know component_id yet),
-      if (component_id == -1) {
-        for (int32 i = 0; i < nnet.NumComponents(); i++) {
-          if (nnet.GetComponent(i).GetType() == Component::kParallelComponent) {
-            component_id = i+1;
-            break;
-          }
-        }
-      }
-      // replace the nnet,
-      KALDI_ASSERT(nnet.GetComponent(component_id-1).GetType() ==
-                   Component::kParallelComponent);
-      ParallelComponent& parallel_comp =
-        dynamic_cast<ParallelComponent&>(nnet.GetComponent(component_id-1));
-      nnet = parallel_comp.GetNestedNnet(nested_id-1);  // replace!
-    }
-
-    // optionally remove N first components,
-    if (remove_first_components > 0) {
-      for (int32 i = 0; i < remove_first_components; i++) {
-        nnet.RemoveComponent(0);
-      }
-    }
-
-    // optionally remove N last components,
-    if (remove_last_components > 0) {
-      for (int32 i = 0; i < remove_last_components; i++) {
-        nnet.RemoveLastComponent();
-      }
-    }
-
-    // dropout,
-    if (dropout_rate != -1.0) {
-      nnet.SetDropoutRate(dropout_rate);
-    }
-
-    // store the network,
-    {
-      Output ko(model_out_filename, binary_write);
-      nnet.Write(ko.Stream(), binary_write);
-    }
-
-    KALDI_LOG << "Written 'nnet1' to " << model_out_filename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/nnetbin/nnet-forward.cc b/src/nnetbin/nnet-forward.cc
deleted file mode 100644
index 062bca7da9d..00000000000
--- a/src/nnetbin/nnet-forward.cc
+++ /dev/null
@@ -1,208 +0,0 @@
-// nnetbin/nnet-forward.cc
-
-// Copyright 2011-2013  Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <limits>
-
-#include "nnet/nnet-nnet.h"
-#include "nnet/nnet-loss.h"
-#include "nnet/nnet-pdf-prior.h"
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "base/timer.h"
-
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  using namespace kaldi::nnet1;
-  try {
-    const char *usage =
-      "Perform forward pass through Neural Network.\n"
-      "Usage: nnet-forward [options] <nnet1-in> <feature-rspecifier> <feature-wspecifier>\n"
-      "e.g.: nnet-forward final.nnet ark:input.ark ark:output.ark\n";
-
-    ParseOptions po(usage);
-
-    PdfPriorOptions prior_opts;
-    prior_opts.Register(&po);
-
-    std::string feature_transform;
-    po.Register("feature-transform", &feature_transform,
-        "Feature transform in front of main network (in nnet format)");
-
-    bool no_softmax = false;
-    po.Register("no-softmax", &no_softmax,
-        "Removes the last component with Softmax, if found. The pre-softmax "
-        "activations are the output of the network. Decoding them leads to "
-        "the same lattices as if we had used 'log-posteriors'.");
-
-    bool apply_log = false;
-    po.Register("apply-log", &apply_log, "Transform NN output by log()");
-
-    std::string use_gpu="no";
-    po.Register("use-gpu", &use_gpu,
-        "yes|no|optional, only has effect if compiled with CUDA");
-
-    using namespace kaldi;
-    using namespace kaldi::nnet1;
-    typedef kaldi::int32 int32;
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_filename = po.GetArg(1),
-        feature_rspecifier = po.GetArg(2),
-        feature_wspecifier = po.GetArg(3);
-
-    // Select the GPU
-#if HAVE_CUDA == 1
-    CuDevice::Instantiate().SelectGpuId(use_gpu);
-#endif
-
-    Nnet nnet_transf;
-    if (feature_transform != "") {
-      nnet_transf.Read(feature_transform);
-    }
-
-    Nnet nnet;
-    nnet.Read(model_filename);
-    // optionally remove softmax,
-    Component::ComponentType last_comp_type = nnet.GetLastComponent().GetType();
-    if (no_softmax) {
-      if (last_comp_type == Component::kSoftmax ||
-          last_comp_type == Component::kBlockSoftmax) {
-        KALDI_LOG << "Removing " << Component::TypeToMarker(last_comp_type)
-                  << " from the nnet " << model_filename;
-        nnet.RemoveLastComponent();
-      } else {
-        KALDI_WARN << "Last component 'NOT-REMOVED' by --no-softmax=true, "
-          << "the component was " << Component::TypeToMarker(last_comp_type);
-      }
-    }
-
-    // avoid some bad option combinations,
-    if (apply_log && no_softmax) {
-      KALDI_ERR << "Cannot use both --apply-log=true --no-softmax=true, "
-                << "use only one of the two!";
-    }
-
-    // we will subtract log-priors later,
-    PdfPrior pdf_prior(prior_opts);
-
-    // disable dropout,
-    nnet_transf.SetDropoutRate(0.0);
-    nnet.SetDropoutRate(0.0);
-
-    kaldi::int64 tot_t = 0;
-
-    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    BaseFloatMatrixWriter feature_writer(feature_wspecifier);
-
-    CuMatrix<BaseFloat> feats, feats_transf, nnet_out;
-    Matrix<BaseFloat> nnet_out_host;
-
-    Timer time;
-    double time_now = 0;
-    int32 num_done = 0;
-
-    // main loop,
-    for (; !feature_reader.Done(); feature_reader.Next()) {
-      // read
-      Matrix<BaseFloat> mat = feature_reader.Value();
-      std::string utt = feature_reader.Key();
-      KALDI_VLOG(2) << "Processing utterance " << num_done+1
-                    << ", " << utt
-                    << ", " << mat.NumRows() << "frm";
-
-
-      if (!KALDI_ISFINITE(mat.Sum())) {  // check there's no nan/inf,
-        KALDI_ERR << "NaN or inf found in features for " << utt;
-      }
-
-      // push it to gpu,
-      feats = mat;
-
-      // fwd-pass, feature transform,
-      nnet_transf.Feedforward(feats, &feats_transf);
-      if (!KALDI_ISFINITE(feats_transf.Sum())) {  // check there's no nan/inf,
-        KALDI_ERR << "NaN or inf found in transformed-features for " << utt;
-      }
-
-      // fwd-pass, nnet,
-      nnet.Feedforward(feats_transf, &nnet_out);
-      if (!KALDI_ISFINITE(nnet_out.Sum())) {  // check there's no nan/inf,
-        KALDI_ERR << "NaN or inf found in nn-output for " << utt;
-      }
-
-      // convert posteriors to log-posteriors,
-      if (apply_log) {
-        if (!(nnet_out.Min() >= 0.0 && nnet_out.Max() <= 1.0)) {
-          KALDI_WARN << "Applying 'log()' to data which don't seem to be "
-                     << "probabilities," << utt;
-        }
-        nnet_out.Add(1e-20);  // avoid log(0),
-        nnet_out.ApplyLog();
-      }
-
-      // subtract log-priors from log-posteriors or pre-softmax,
-      if (prior_opts.class_frame_counts != "") {
-        pdf_prior.SubtractOnLogpost(&nnet_out);
-      }
-
-      // download from GPU,
-      nnet_out_host = Matrix<BaseFloat>(nnet_out);
-
-      // write,
-      if (!KALDI_ISFINITE(nnet_out_host.Sum())) {  // check there's no nan/inf,
-        KALDI_ERR << "NaN or inf found in final output nn-output for " << utt;
-      }
-      feature_writer.Write(feature_reader.Key(), nnet_out_host);
-
-      // progress log,
-      if (num_done % 100 == 0) {
-        time_now = time.Elapsed();
-        KALDI_VLOG(1) << "After " << num_done << " utterances: time elapsed = "
-                      << time_now/60 << " min; processed " << tot_t/time_now
-                      << " frames per second.";
-      }
-      num_done++;
-      tot_t += mat.NumRows();
-    }
-
-    // final message,
-    KALDI_LOG << "Done " << num_done << " files"
-              << " in " << time.Elapsed()/60 << "min,"
-              << " (fps " << tot_t/time.Elapsed() << ")";
-
-#if HAVE_CUDA == 1
-    if (GetVerboseLevel() >= 1) {
-      CuDevice::Instantiate().PrintProfile();
-    }
-#endif
-
-    if (num_done == 0) return -1;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/nnetbin/nnet-info.cc b/src/nnetbin/nnet-info.cc
deleted file mode 100644
index b35ef7da605..00000000000
--- a/src/nnetbin/nnet-info.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-// nnetbin/nnet-info.cc
-
-// Copyright 2013  Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet/nnet-nnet.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet1;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Print human-readable information about the neural network.\n"
-        "(topology, various weight statistics, etc.) It prints to stdout.\n"
-        "Usage:  nnet-info [options] <nnet-in>\n"
-        "e.g.:\n"
-        " nnet-info 1.nnet\n";
-
-    ParseOptions po(usage);
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 1) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet_rxfilename = po.GetArg(1);
-
-    // load the network
-    Nnet nnet;
-    {
-      bool binary_read;
-      Input ki(nnet_rxfilename, &binary_read);
-      nnet.Read(ki.Stream(), binary_read);
-    }
-
-    std::cout << nnet.Info();
-
-    KALDI_LOG << "Printed info about " << nnet_rxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/nnetbin/nnet-initialize.cc b/src/nnetbin/nnet-initialize.cc
deleted file mode 100644
index fed255575d0..00000000000
--- a/src/nnetbin/nnet-initialize.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-// nnetbin/nnet-initialize.cc
-
-// Copyright 2014  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet/nnet-nnet.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet1;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-      "Initialize Neural Network parameters according to a prototype (nnet1).\n"
-      "Usage:  nnet-initialize [options] <nnet-prototype-in> <nnet-out>\n"
-      "e.g.: nnet-initialize --binary=false nnet.proto nnet.init\n";
-
-    SetVerboseLevel(1);  // be verbose by default,
-
-    ParseOptions po(usage);
-    bool binary_write = true;
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    int32 seed = 777;
-    po.Register("seed", &seed, "Seed for random number generator");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet_config_in_filename = po.GetArg(1),
-        nnet_out_filename = po.GetArg(2);
-
-    std::srand(seed);
-
-    // initialize the network
-    Nnet nnet;
-    nnet.Init(nnet_config_in_filename);
-
-    // store the network
-    Output ko(nnet_out_filename, binary_write);
-    nnet.Write(ko.Stream(), binary_write);
-
-    KALDI_LOG << "Written initialized model to " << nnet_out_filename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/nnetbin/nnet-set-learnrate.cc b/src/nnetbin/nnet-set-learnrate.cc
deleted file mode 100644
index c520e6bdbbc..00000000000
--- a/src/nnetbin/nnet-set-learnrate.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-// nnetbin/nnet-set-learnrate.cc
-
-// Copyright 2016,  Brno University of Technology
-//                  (author: Katerina Zmolikova, Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "util/common-utils.h"
-#include "nnet/nnet-nnet.h"
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-affine-transform.h"
-#include "nnet/nnet-activation.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet1;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-      "Sets learning rate coefficient inside of 'nnet1' model\n"
-      "Usage: nnet-set-learnrate --components=<csl> --coef=<float> <nnet-in> <nnet-out>\n"
-      "e.g.: nnet-set-learnrate --components=1:3:5 --coef=0.5 --bias-coef=0.1 nnet-in nnet-out\n";
-
-    ParseOptions po(usage);
-    bool binary = true;
-    po.Register("binary", &binary, "Write output in binary mode");
-
-    std::string components_str = "";
-    po.Register("components", &components_str,
-        "Select components by 'csl' of 1..N values. Layout is the same as in "
-        "'nnet-info' output, (example 1:3:5)");
-
-    float coef = 1.0,
-          weight_coef = 1.0,
-          bias_coef = 1.0;
-
-    po.Register("coef", &coef,
-        "Learn-rate coefficient for both weight matrices and biases.");
-    po.Register("weight-coef", &weight_coef,
-        "Learn-rate coefficient for weight matrices "
-        "(used as: coef * weight_coef).");
-    po.Register("bias-coef", &bias_coef,
-        "Learn-rate coefficient for bias (used as: coef * bias_coef).");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string nnet_in_filename = po.GetArg(1),
-      nnet_out_filename = po.GetArg(2);
-
-    Nnet nnet;
-    nnet.Read(nnet_in_filename);
-
-    // A vector which contains indices of components,
-    // where we will set the 'learn-rate coefficients',
-    std::vector<int32> components;
-    if (components_str != "") {
-      // components were selected by the option,
-      kaldi::SplitStringToIntegers(components_str, ":", false, &components);
-    } else {
-      // otherwise select all the components (1..Ncomp),
-      for (int32 i = 1; i <= nnet.NumComponents(); i++) {
-        components.push_back(i);
-      }
-    }
-
-    // Setting the learning rate coefficients,
-    for (int32 i = 0; i < components.size(); i++) {
-      if (nnet.GetComponent(components[i]-1).IsUpdatable()) {
-        UpdatableComponent& comp =
-          dynamic_cast<UpdatableComponent&>(nnet.GetComponent(components[i]-1));
-        comp.SetLearnRateCoef(coef * weight_coef);  // weight matrices, etc.,
-        comp.SetBiasLearnRateCoef(coef * bias_coef);  // biases,
-      }
-    }
-
-    // Write the 'nnet1' network,
-    nnet.Write(nnet_out_filename, binary);
-
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
diff --git a/src/nnetbin/nnet-train-frmshuff.cc b/src/nnetbin/nnet-train-frmshuff.cc
deleted file mode 100644
index cc50e33ea42..00000000000
--- a/src/nnetbin/nnet-train-frmshuff.cc
+++ /dev/null
@@ -1,424 +0,0 @@
-// nnetbin/nnet-train-frmshuff.cc
-
-// Copyright 2013-2016  Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet/nnet-trnopts.h"
-#include "nnet/nnet-nnet.h"
-#include "nnet/nnet-loss.h"
-#include "nnet/nnet-randomizer.h"
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "base/timer.h"
-#include "cudamatrix/cu-device.h"
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  using namespace kaldi::nnet1;
-  typedef kaldi::int32 int32;
-
-  try {
-    const char *usage =
-      "Perform one iteration (epoch) of Neural Network training with\n"
-      "mini-batch Stochastic Gradient Descent. The training targets\n"
-      "are usually pdf-posteriors, prepared by ali-to-post.\n"
-      "Usage:  nnet-train-frmshuff [options] <feature-rspecifier> <targets-rspecifier> <model-in> [<model-out>]\n"
-      "e.g.: nnet-train-frmshuff scp:feats.scp ark:posterior.ark nnet.init nnet.iter1\n";
-
-    ParseOptions po(usage);
-
-    NnetTrainOptions trn_opts;
-    trn_opts.Register(&po);
-    NnetDataRandomizerOptions rnd_opts;
-    rnd_opts.Register(&po);
-    LossOptions loss_opts;
-    loss_opts.Register(&po);
-
-    bool binary = true;
-    po.Register("binary", &binary, "Write output in binary mode");
-
-    bool crossvalidate = false;
-    po.Register("cross-validate", &crossvalidate,
-        "Perform cross-validation (don't back-propagate)");
-
-    bool randomize = true;
-    po.Register("randomize", &randomize,
-        "Perform the frame-level shuffling within the Cache::");
-
-    std::string feature_transform;
-    po.Register("feature-transform", &feature_transform,
-        "Feature transform in Nnet format");
-
-    std::string objective_function = "xent";
-    po.Register("objective-function", &objective_function,
-        "Objective function : xent|mse|multitask");
-
-    int32 max_frames = 360000;
-    po.Register("max-frames", &max_frames,
-        "Maximum number of frames an utterance can have (skipped if longer)");
-
-    int32 length_tolerance = 5;
-    po.Register("length-tolerance", &length_tolerance,
-        "Allowed length mismatch of features/targets/weights "
-        "(in frames, we truncate to the shortest)");
-
-    std::string frame_weights;
-    po.Register("frame-weights", &frame_weights,
-        "Per-frame weights, used to re-scale gradients.");
-
-    std::string utt_weights;
-    po.Register("utt-weights", &utt_weights,
-        "Per-utterance weights, used to re-scale frame-weights.");
-
-    std::string use_gpu="yes";
-    po.Register("use-gpu", &use_gpu,
-        "yes|no|optional, only has effect if compiled with CUDA");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 3 + (crossvalidate ? 0 : 1)) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string feature_rspecifier = po.GetArg(1),
-      targets_rspecifier = po.GetArg(2),
-      model_filename = po.GetArg(3);
-
-    std::string target_model_filename;
-    if (!crossvalidate) {
-      target_model_filename = po.GetArg(4);
-    }
-
-    using namespace kaldi;
-    using namespace kaldi::nnet1;
-    typedef kaldi::int32 int32;
-
-#if HAVE_CUDA == 1
-    CuDevice::Instantiate().SelectGpuId(use_gpu);
-#endif
-
-    Nnet nnet_transf;
-    if (feature_transform != "") {
-      nnet_transf.Read(feature_transform);
-    }
-
-    Nnet nnet;
-    nnet.Read(model_filename);
-    nnet.SetTrainOptions(trn_opts);
-
-    if (crossvalidate) {
-      nnet_transf.SetDropoutRate(0.0);
-      nnet.SetDropoutRate(0.0);
-    }
-
-    kaldi::int64 total_frames = 0;
-
-    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    RandomAccessPosteriorReader targets_reader(targets_rspecifier);
-    RandomAccessBaseFloatVectorReader weights_reader;
-    if (frame_weights != "") {
-      weights_reader.Open(frame_weights);
-    }
-    RandomAccessBaseFloatReader utt_weights_reader;
-    if (utt_weights != "") {
-      utt_weights_reader.Open(utt_weights);
-    }
-
-    RandomizerMask randomizer_mask(rnd_opts);
-    MatrixRandomizer feature_randomizer(rnd_opts);
-    PosteriorRandomizer targets_randomizer(rnd_opts);
-    VectorRandomizer weights_randomizer(rnd_opts);
-
-    Xent xent(loss_opts);
-    Mse mse(loss_opts);
-
-    MultiTaskLoss multitask(loss_opts);
-    if (0 == objective_function.compare(0, 9, "multitask")) {
-      // objective_function contains something like :
-      // 'multitask,xent,2456,1.0,mse,440,0.001'
-      //
-      // the meaning is following:
-      // 'multitask,<type1>,<dim1>,<weight1>,...,<typeN>,<dimN>,<weightN>'
-      multitask.InitFromString(objective_function);
-    }
-
-    CuMatrix<BaseFloat> feats_transf, nnet_out, obj_diff;
-
-    Timer time, time_io;
-    KALDI_LOG << (crossvalidate ? "CROSS-VALIDATION" : "TRAINING")
-              << " STARTED";
-
-    int32 num_done = 0,
-          num_no_tgt_mat = 0,
-          num_other_error = 0;
-
-    double time_io_accu = 0.0;
-
-    // main loop,
-    while (!feature_reader.Done()) {
-#if HAVE_CUDA == 1
-      // check that GPU computes accurately,
-      CuDevice::Instantiate().CheckGpuHealth();
-#endif
-      // fill the randomizer,
-      time_io.Reset();
-      for ( ; !feature_reader.Done(); feature_reader.Next()) {
-        if (feature_randomizer.IsFull()) {
-          // break the loop without calling Next(),
-          // we keep the 'utt' for next round,
-          break;
-        }
-        std::string utt = feature_reader.Key();
-        KALDI_VLOG(3) << "Reading " << utt;
-        // check that we have targets,
-        if (!targets_reader.HasKey(utt)) {
-          KALDI_WARN << utt << ", missing targets";
-          num_no_tgt_mat++;
-          continue;
-        }
-        // check we have per-frame weights,
-        if (frame_weights != "" && !weights_reader.HasKey(utt)) {
-          KALDI_WARN << utt << ", missing per-frame weights";
-          num_other_error++;
-          continue;
-        }
-        // check we have per-utterance weights,
-        if (utt_weights != "" && !utt_weights_reader.HasKey(utt)) {
-          KALDI_WARN << utt << ", missing per-utterance weight";
-          num_other_error++;
-          continue;
-        }
-        // get feature / target pair,
-        Matrix<BaseFloat> mat = feature_reader.Value();
-        Posterior targets = targets_reader.Value(utt);
-        // get per-frame weights,
-        Vector<BaseFloat> weights;
-        if (frame_weights != "") {
-          weights = weights_reader.Value(utt);
-        } else {  // all per-frame weights are 1.0,
-          weights.Resize(mat.NumRows());
-          weights.Set(1.0);
-        }
-        // multiply with per-utterance weight,
-        if (utt_weights != "") {
-          BaseFloat w = utt_weights_reader.Value(utt);
-          KALDI_ASSERT(w >= 0.0);
-          if (w == 0.0) continue;  // remove sentence from training,
-          weights.Scale(w);
-        }
-
-        // accumulate the I/O time,
-        time_io_accu += time_io.Elapsed();
-        time_io.Reset(); // to be sure we don't count 2x,
-
-        // skip too long utterances (or we run out of memory),
-        if (mat.NumRows() > max_frames) {
-          KALDI_WARN << "Utterance too long, skipping! " << utt
-            << " (length " << mat.NumRows() << ", max_frames "
-            << max_frames << ")";
-          num_other_error++;
-          continue;
-        }
-
-        // correct small length mismatch or drop sentence,
-        {
-          // add lengths to vector,
-          std::vector<int32> length;
-          length.push_back(mat.NumRows());
-          length.push_back(targets.size());
-          length.push_back(weights.Dim());
-          // find min, max,
-          int32 min = *std::min_element(length.begin(), length.end());
-          int32 max = *std::max_element(length.begin(), length.end());
-          // fix or drop ?
-          if (max - min < length_tolerance) {
-            // we truncate to shortest,
-            if (mat.NumRows() != min) mat.Resize(min, mat.NumCols(), kCopyData);
-            if (targets.size() != min) targets.resize(min);
-            if (weights.Dim() != min) weights.Resize(min, kCopyData);
-          } else {
-            KALDI_WARN << "Length mismatch! Targets " << targets.size()
-                       << ", features " << mat.NumRows() << ", " << utt;
-            num_other_error++;
-            continue;
-          }
-        }
-        // apply feature transform (if empty, input is copied),
-        nnet_transf.Feedforward(CuMatrix<BaseFloat>(mat), &feats_transf);
-
-        // remove frames with '0' weight from training,
-        {
-          // are there any frames to be removed? (frames with zero weight),
-          BaseFloat weight_min = weights.Min();
-          KALDI_ASSERT(weight_min >= 0.0);
-          if (weight_min == 0.0) {
-            // create vector with frame-indices to keep,
-            std::vector<MatrixIndexT> keep_frames;
-            for (int32 i = 0; i < weights.Dim(); i++) {
-              if (weights(i) > 0.0) {
-                keep_frames.push_back(i);
-              }
-            }
-
-            // when all frames are removed, we skip the sentence,
-            if (keep_frames.size() == 0) continue;
-
-            // filter feature-frames,
-            CuMatrix<BaseFloat> tmp_feats(keep_frames.size(), feats_transf.NumCols());
-            tmp_feats.CopyRows(feats_transf, CuArray<MatrixIndexT>(keep_frames));
-            tmp_feats.Swap(&feats_transf);
-
-            // filter targets,
-            Posterior tmp_targets;
-            for (int32 i = 0; i < keep_frames.size(); i++) {
-              tmp_targets.push_back(targets[keep_frames[i]]);
-            }
-            tmp_targets.swap(targets);
-
-            // filter weights,
-            Vector<BaseFloat> tmp_weights(keep_frames.size());
-            for (int32 i = 0; i < keep_frames.size(); i++) {
-              tmp_weights(i) = weights(keep_frames[i]);
-            }
-            tmp_weights.Swap(&weights);
-          }
-        }
-
-        // pass data to randomizers,
-        KALDI_ASSERT(feats_transf.NumRows() == targets.size());
-        feature_randomizer.AddData(feats_transf);
-        targets_randomizer.AddData(targets);
-        weights_randomizer.AddData(weights);
-        num_done++;
-
-        time_io.Reset(); // reset before reading next feature matrix,
-      }
-
-      // randomize,
-      if (!crossvalidate && randomize) {
-        const std::vector<int32>& mask =
-          randomizer_mask.Generate(feature_randomizer.NumFrames());
-        feature_randomizer.Randomize(mask);
-        targets_randomizer.Randomize(mask);
-        weights_randomizer.Randomize(mask);
-      }
-
-      // train with data from randomizers (using mini-batches),
-      for ( ; !feature_randomizer.Done(); feature_randomizer.Next(),
-                                          targets_randomizer.Next(),
-                                          weights_randomizer.Next()) {
-        // get block of feature/target pairs,
-        const CuMatrixBase<BaseFloat>& nnet_in = feature_randomizer.Value();
-        const Posterior& nnet_tgt = targets_randomizer.Value();
-        const Vector<BaseFloat>& frm_weights = weights_randomizer.Value();
-
-        // forward pass,
-        nnet.Propagate(nnet_in, &nnet_out);
-
-        // evaluate objective function we've chosen,
-        if (objective_function == "xent") {
-          // gradients re-scaled by weights in Eval,
-          xent.Eval(frm_weights, nnet_out, nnet_tgt, &obj_diff);
-        } else if (objective_function == "mse") {
-          // gradients re-scaled by weights in Eval,
-          mse.Eval(frm_weights, nnet_out, nnet_tgt, &obj_diff);
-        } else if (0 == objective_function.compare(0, 9, "multitask")) {
-          // gradients re-scaled by weights in Eval,
-          multitask.Eval(frm_weights, nnet_out, nnet_tgt, &obj_diff);
-        } else {
-          KALDI_ERR << "Unknown objective function code : " << objective_function;
-        }
-
-        if (!crossvalidate) {
-          // back-propagate, and do the update,
-          nnet.Backpropagate(obj_diff, NULL);
-        }
-
-        // 1st mini-batch : show what happens in network,
-        if (total_frames == 0) {
-          KALDI_LOG << "### After " << total_frames << " frames,";
-          KALDI_LOG << nnet.InfoPropagate();
-          if (!crossvalidate) {
-            KALDI_LOG << nnet.InfoBackPropagate();
-            KALDI_LOG << nnet.InfoGradient();
-          }
-        }
-
-        // VERBOSE LOG
-        // monitor the NN training (--verbose=2),
-        if (GetVerboseLevel() >= 2) {
-          static int32 counter = 0;
-          counter += nnet_in.NumRows();
-          // print every 25k frames,
-          if (counter >= 25000) {
-            KALDI_VLOG(2) << "### After " << total_frames << " frames,";
-            KALDI_VLOG(2) << nnet.InfoPropagate();
-            if (!crossvalidate) {
-              KALDI_VLOG(2) << nnet.InfoBackPropagate();
-              KALDI_VLOG(2) << nnet.InfoGradient();
-            }
-            counter = 0;
-          }
-        }
-
-        total_frames += nnet_in.NumRows();
-      }
-    }  // main loop,
-
-    // after last mini-batch : show what happens in network,
-    KALDI_LOG << "### After " << total_frames << " frames,";
-    KALDI_LOG << nnet.InfoPropagate();
-    if (!crossvalidate) {
-      KALDI_LOG << nnet.InfoBackPropagate();
-      KALDI_LOG << nnet.InfoGradient();
-    }
-
-    if (!crossvalidate) {
-      nnet.Write(target_model_filename, binary);
-    }
-
-    KALDI_LOG << "Done " << num_done << " files, "
-      << num_no_tgt_mat << " with no tgt_mats, "
-      << num_other_error << " with other errors. "
-      << "[" << (crossvalidate ? "CROSS-VALIDATION" : "TRAINING")
-      << ", " << (randomize ? "RANDOMIZED" : "NOT-RANDOMIZED")
-      << ", " << time.Elapsed() / 60 << " min, processing "
-      << total_frames / time.Elapsed() << " frames per sec;"
-      << " i/o time " << 100.*time_io_accu/time.Elapsed() << "%]";
-
-    if (objective_function == "xent") {
-      KALDI_LOG << xent.ReportPerClass();
-      KALDI_LOG << xent.Report();
-    } else if (objective_function == "mse") {
-      KALDI_LOG << mse.Report();
-    } else if (0 == objective_function.compare(0, 9, "multitask")) {
-      KALDI_LOG << multitask.Report();
-    } else {
-      KALDI_ERR << "Unknown objective function code : " << objective_function;
-    }
-
-#if HAVE_CUDA == 1
-    CuDevice::Instantiate().PrintProfile();
-#endif
-
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/nnetbin/nnet-train-mmi-sequential.cc b/src/nnetbin/nnet-train-mmi-sequential.cc
deleted file mode 100644
index 2554d64287a..00000000000
--- a/src/nnetbin/nnet-train-mmi-sequential.cc
+++ /dev/null
@@ -1,481 +0,0 @@
-// nnetbin/nnet-train-mmi-sequential.cc
-
-// Copyright 2012-2016  Brno University of Technology (author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <iomanip>
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "tree/context-dep.h"
-#include "hmm/transition-model.h"
-#include "fstext/fstext-lib.h"
-#include "decoder/faster-decoder.h"
-#include "decoder/decodable-matrix.h"
-#include "lat/kaldi-lattice.h"
-#include "lat/lattice-functions.h"
-
-#include "nnet/nnet-trnopts.h"
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-activation.h"
-#include "nnet/nnet-nnet.h"
-#include "nnet/nnet-pdf-prior.h"
-#include "nnet/nnet-utils.h"
-#include "base/timer.h"
-#include "cudamatrix/cu-device.h"
-
-
-namespace kaldi {
-namespace nnet1 {
-
-void LatticeAcousticRescore(const Matrix<BaseFloat> &log_like,
-                            const TransitionModel &trans_model,
-                            const std::vector<int32> &state_times,
-                            Lattice *lat) {
-  kaldi::uint64 props = lat->Properties(fst::kFstProperties, false);
-  if (!(props & fst::kTopSorted))
-    KALDI_ERR << "Input lattice must be topologically sorted.";
-
-  KALDI_ASSERT(!state_times.empty());
-  std::vector<std::vector<int32> > time_to_state(log_like.NumRows());
-  for (size_t i = 0; i < state_times.size(); i++) {
-    KALDI_ASSERT(state_times[i] >= 0);
-    if (state_times[i] < log_like.NumRows())  // end state may be past this..
-      time_to_state[state_times[i]].push_back(i);
-    else
-      KALDI_ASSERT(state_times[i] == log_like.NumRows()
-                   && "There appears to be lattice/feature mismatch.");
-  }
-
-  for (int32 t = 0; t < log_like.NumRows(); t++) {
-    for (size_t i = 0; i < time_to_state[t].size(); i++) {
-      int32 state = time_to_state[t][i];
-      for (fst::MutableArcIterator<Lattice> aiter(lat, state); !aiter.Done();
-           aiter.Next()) {
-        LatticeArc arc = aiter.Value();
-        int32 trans_id = arc.ilabel;
-        if (trans_id != 0) {  // Non-epsilon input label on arc
-          int32 pdf_id = trans_model.TransitionIdToPdf(trans_id);
-          arc.weight.SetValue2(-log_like(t, pdf_id) + arc.weight.Value2());
-          aiter.SetValue(arc);
-        }
-      }
-    }
-  }
-}
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  using namespace kaldi::nnet1;
-  typedef kaldi::int32 int32;
-  try {
-    const char *usage =
-      "Perform one iteration of MMI training using SGD with per-utterance"
-      "updates\n"
-
-      "Usage:  nnet-train-mmi-sequential [options] "
-      "<model-in> <transition-model-in> <feature-rspecifier> "
-      "<den-lat-rspecifier> <ali-rspecifier> [<model-out>]\n"
-
-      "e.g.: nnet-train-mmi-sequential nnet.init trans.mdl scp:feats.scp "
-      "scp:denlats.scp ark:ali.ark nnet.iter1\n";
-
-    ParseOptions po(usage);
-
-    NnetTrainOptions trn_opts;
-    trn_opts.learn_rate = 0.00001;  // changing default,
-    trn_opts.Register(&po);
-
-    bool binary = true;
-    po.Register("binary", &binary, "Write output in binary mode");
-
-    std::string feature_transform;
-    po.Register("feature-transform", &feature_transform,
-        "Feature transform in 'nnet1' format");
-
-    PdfPriorOptions prior_opts;
-    prior_opts.Register(&po);
-
-    BaseFloat acoustic_scale = 1.0,
-        lm_scale = 1.0,
-        old_acoustic_scale = 0.0;
-
-    po.Register("acoustic-scale", &acoustic_scale,
-        "Scaling factor for acoustic likelihoods");
-
-    po.Register("lm-scale", &lm_scale,
-        "Scaling factor for \"graph costs\" (including LM costs)");
-
-    po.Register("old-acoustic-scale", &old_acoustic_scale,
-        "Add in the scores in the input lattices with this scale, "
-        "rather than discarding them.");
-
-    kaldi::int32 max_frames = 6000;
-    po.Register("max-frames", &max_frames,
-        "Maximum number of frames an utterance can have (skipped if longer)");
-
-    bool drop_frames = true;
-    po.Register("drop-frames", &drop_frames,
-        "Drop frames, where is zero den-posterior under numerator path "
-        "(ie. path not in lattice)");
-
-    std::string use_gpu="yes";
-    po.Register("use-gpu", &use_gpu,
-        "yes|no|optional, only has effect if compiled with CUDA");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 6) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_filename = po.GetArg(1),
-        transition_model_filename = po.GetArg(2),
-        feature_rspecifier = po.GetArg(3),
-        den_lat_rspecifier = po.GetArg(4),
-        num_ali_rspecifier = po.GetArg(5),
-        target_model_filename = po.GetArg(6);
-
-    using namespace kaldi;
-    using namespace kaldi::nnet1;
-    typedef kaldi::int32 int32;
-
-#if HAVE_CUDA == 1
-    CuDevice::Instantiate().SelectGpuId(use_gpu);
-#endif
-
-    Nnet nnet_transf;
-    if (feature_transform != "") {
-      nnet_transf.Read(feature_transform);
-    }
-
-    Nnet nnet;
-    nnet.Read(model_filename);
-    // we will use pre-softmax activations, removing softmax,
-    // - pre-softmax activations are equivalent to 'log-posterior + C_frame',
-    // - all paths crossing a frame share same 'C_frame',
-    // - with GMM, we also have the unnormalized acoustic likelihoods,
-    if (nnet.GetLastComponent().GetType() ==
-        kaldi::nnet1::Component::kSoftmax) {
-      KALDI_LOG << "Removing softmax from the nnet " << model_filename;
-      nnet.RemoveLastComponent();
-    } else {
-      KALDI_LOG << "The nnet was without softmax. "
-                << "The last component in " << model_filename << " was "
-                << Component::TypeToMarker(nnet.GetLastComponent().GetType());
-    }
-    nnet.SetTrainOptions(trn_opts);
-
-    // Read the class-frame-counts, compute priors,
-    PdfPrior log_prior(prior_opts);
-
-    // Read transition model,
-    TransitionModel trans_model;
-    ReadKaldiObject(transition_model_filename, &trans_model);
-
-    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    RandomAccessLatticeReader den_lat_reader(den_lat_rspecifier);
-    RandomAccessInt32VectorReader num_ali_reader(num_ali_rspecifier);
-
-    CuMatrix<BaseFloat> feats_transf, nnet_out, nnet_diff;
-    Matrix<BaseFloat> nnet_out_h, nnet_diff_h;
-
-    if (drop_frames) {
-      KALDI_LOG << "--drop-frames=true :"
-                   " we will zero gradient for frames with total den/num mismatch."
-                   " The mismatch is likely to be caused by missing correct path "
-                   " from den-lattice due wrong annotation or search error."
-                   " Leaving such frames out stabilizes the training.";
-    }
-
-    Timer time;
-    double time_now = 0;
-    KALDI_LOG << "TRAINING STARTED";
-
-    int32 num_done = 0, num_no_num_ali = 0, num_no_den_lat = 0,
-          num_other_error = 0, num_frm_drop = 0;
-
-    kaldi::int64 total_frames = 0;
-    double lat_like;  // total likelihood of the lattice
-    double lat_ac_like;  // acoustic likelihood weighted by posterior.
-    double total_mmi_obj = 0.0, mmi_obj = 0.0;
-    double total_post_on_ali = 0.0, post_on_ali = 0.0;
-
-    // main loop over utterances,
-    for ( ; !feature_reader.Done(); feature_reader.Next()) {
-      std::string utt = feature_reader.Key();
-      if (!den_lat_reader.HasKey(utt)) {
-        KALDI_WARN << "Missing lattice of " << utt;
-        num_no_den_lat++;
-        continue;
-      }
-      if (!num_ali_reader.HasKey(utt)) {
-        KALDI_WARN << "Missing alignment of " << utt;
-        num_no_num_ali++;
-        continue;
-      }
-
-      // 1) get the features, numerator alignment,
-      const Matrix<BaseFloat> &mat = feature_reader.Value();
-      const std::vector<int32> &num_ali = num_ali_reader.Value(utt);
-      // check duration of numerator alignments
-      if (static_cast<int32>(num_ali.size()) != mat.NumRows()) {
-        KALDI_WARN << "Duration mismatch!"
-                   << " alignment " << num_ali.size()
-                   << " features " << mat.NumRows();
-        num_other_error++;
-        continue;
-      }
-      if (mat.NumRows() > max_frames) {
-        KALDI_WARN << "Skipping " << utt
-          << " that has " << mat.NumRows() << " frames,"
-          << " it is longer than '--max-frames'" << max_frames;
-        num_other_error++;
-        continue;
-      }
-
-      // 2) get the denominator-lattice, preprocess
-      Lattice den_lat = den_lat_reader.Value(utt);
-      if (den_lat.Start() == -1) {
-        KALDI_WARN << "Empty lattice of " << utt << ", skipping.";
-        num_other_error++;
-        continue;
-      }
-      if (old_acoustic_scale != 1.0) {
-        fst::ScaleLattice(fst::AcousticLatticeScale(old_acoustic_scale),
-                          &den_lat);
-      }
-      // optional sort it topologically
-      kaldi::uint64 props = den_lat.Properties(fst::kFstProperties, false);
-      if (!(props & fst::kTopSorted)) {
-        if (fst::TopSort(&den_lat) == false) {
-          KALDI_ERR << "Cycles detected in lattice.";
-        }
-      }
-      // get the lattice length and times of states,
-      std::vector<int32> state_times;
-      int32 max_time = kaldi::LatticeStateTimes(den_lat, &state_times);
-      // check duration of den. lattice,
-      if (max_time != mat.NumRows()) {
-        KALDI_WARN << "Duration mismatch!"
-          << " denominator lattice " << max_time
-          << " features " << mat.NumRows() << ","
-          << " skipping " << utt;
-        num_other_error++;
-        continue;
-      }
-
-      // get dims,
-      int32 num_frames = mat.NumRows(),
-            num_pdfs = nnet.OutputDim();
-
-      // 3) get the pre-softmax outputs from NN,
-      // apply transform,
-      nnet_transf.Feedforward(CuMatrix<BaseFloat>(mat), &feats_transf);
-      // propagate through the nnet (we know it's w/o softmax),
-      nnet.Propagate(feats_transf, &nnet_out);
-      // subtract the log_prior,
-      if (prior_opts.class_frame_counts != "") {
-        log_prior.SubtractOnLogpost(&nnet_out);
-      }
-      // transfer it back to the host,
-      nnet_out_h = Matrix<BaseFloat>(nnet_out);
-      // release the buffers we don't need anymore,
-      feats_transf.Resize(0, 0);
-      nnet_out.Resize(0, 0);
-
-      // 4) rescore the latice,
-      LatticeAcousticRescore(nnet_out_h, trans_model, state_times, &den_lat);
-      if (acoustic_scale != 1.0 || lm_scale != 1.0)
-        fst::ScaleLattice(fst::LatticeScale(lm_scale, acoustic_scale), &den_lat);
-
-      // 5) get the posteriors,
-      kaldi::Posterior post;
-      lat_like = kaldi::LatticeForwardBackward(den_lat, &post, &lat_ac_like);
-
-      // 6) convert the Posterior to a matrix,
-      PosteriorToPdfMatrix(post, trans_model, &nnet_diff_h);
-
-      // 7) Calculate the MMI-objective function,
-      // Calculate the likelihood of correct path from acoustic score,
-      // the denominator likelihood is the total likelihood of the lattice.
-      double path_ac_like = 0.0;
-      for (int32 t = 0; t < num_frames; t++) {
-        int32 pdf = trans_model.TransitionIdToPdf(num_ali[t]);
-        path_ac_like += nnet_out_h(t, pdf);
-      }
-      path_ac_like *= acoustic_scale;
-      mmi_obj = path_ac_like - lat_like;
-      //
-      // Note: numerator likelihood does not include graph score,
-      // while denominator likelihood contains graph scores.
-      // The result is offset at the MMI-objective.
-      // However the offset is constant for given alignment,
-      // so it does not change accross epochs.
-
-      // Sum the den-posteriors under the correct path,
-      post_on_ali = 0.0;
-      for (int32 t = 0; t < num_frames; t++) {
-        int32 pdf = trans_model.TransitionIdToPdf(num_ali[t]);
-        double posterior = nnet_diff_h(t, pdf);
-        post_on_ali += posterior;
-      }
-
-      // Report,
-      KALDI_VLOG(1) << "Lattice #" << num_done + 1 << " processed"
-        << " (" << utt << "): found " << den_lat.NumStates()
-        << " states and " << fst::NumArcs(den_lat) << " arcs.";
-
-      KALDI_VLOG(1) << "Utterance " << utt << ": Average MMI obj. value = "
-        << (mmi_obj/num_frames) << " over " << num_frames << " frames."
-        << " (Avg. den-posterior on ali " << post_on_ali / num_frames << ")";
-
-
-      // 7a) Search for the frames with num/den mismatch,
-      int32 frm_drop = 0;
-      std::vector<int32> frm_drop_vec;
-      for (int32 t = 0; t < num_frames; t++) {
-        int32 pdf = trans_model.TransitionIdToPdf(num_ali[t]);
-        double posterior = nnet_diff_h(t, pdf);
-        if (posterior < 1e-20) {
-          frm_drop++;
-          frm_drop_vec.push_back(t);
-        }
-      }
-
-      // 8) subtract the pdf-Viterbi-path,
-      for (int32 t = 0; t < nnet_diff_h.NumRows(); t++) {
-        int32 pdf = trans_model.TransitionIdToPdf(num_ali[t]);
-        nnet_diff_h(t, pdf) -= 1.0;
-      }
-
-      // 9) Drop mismatched frames from the training by zeroing the derivative,
-      if (drop_frames) {
-        for (int32 i = 0; i < frm_drop_vec.size(); i++) {
-          nnet_diff_h.Row(frm_drop_vec[i]).Set(0.0);
-        }
-        num_frm_drop += frm_drop;
-      }
-      // Report the frame dropping
-      if (frm_drop > 0) {
-        std::stringstream ss;
-        ss << (drop_frames?"Dropped":"[dropping disabled] Would drop")
-           << " frames in " << utt << " " << frm_drop << "/" << num_frames
-           << ",";
-        // get frame intervals from vec frm_drop_vec,
-        ss << " intervals :";
-        // search for streaks of consecutive numbers,
-        int32 beg_streak = frm_drop_vec[0];
-        int32 len_streak = 0;
-        int32 i;
-        for (i = 0; i < frm_drop_vec.size(); i++, len_streak++) {
-          if (beg_streak + len_streak != frm_drop_vec[i]) {
-            ss << " " << beg_streak << ".." << frm_drop_vec[i-1] << "frm";
-            beg_streak = frm_drop_vec[i];
-            len_streak = 0;
-          }
-        }
-        ss << " " << beg_streak << ".." << frm_drop_vec[i-1] << "frm";
-        // print,
-        KALDI_WARN << ss.str();
-      }
-
-      // 10) backpropagate through the nnet, update,
-      nnet_diff.Resize(num_frames, num_pdfs, kUndefined);
-      nnet_diff.CopyFromMat(nnet_diff_h);
-      nnet.Backpropagate(nnet_diff, NULL);
-      // relase the buffer, we don't need anymore,
-      nnet_diff.Resize(0, 0);
-
-      // increase time counter
-      total_mmi_obj += mmi_obj;
-      total_post_on_ali += post_on_ali;
-      total_frames += num_frames;
-      num_done++;
-
-      if (num_done % 100 == 0) {
-        time_now = time.Elapsed();
-        KALDI_VLOG(1) << "After " << num_done << " utterances: "
-          << "time elapsed = " << time_now / 60 << " min; "
-          << "processed " << total_frames / time_now << " frames per sec.";
-#if HAVE_CUDA == 1
-        // check that GPU computes accurately,
-        CuDevice::Instantiate().CheckGpuHealth();
-#endif
-      }
-
-      // GRADIENT LOGGING
-      // First utterance,
-      if (num_done == 1) {
-        KALDI_VLOG(1) << nnet.InfoPropagate();
-        KALDI_VLOG(1) << nnet.InfoBackPropagate();
-        KALDI_VLOG(1) << nnet.InfoGradient();
-      }
-      // Every 1000 utterances (--verbose=2),
-      if (GetVerboseLevel() >= 2) {
-        if (num_done % 1000 == 0) {
-          KALDI_VLOG(2) << nnet.InfoPropagate();
-          KALDI_VLOG(2) << nnet.InfoBackPropagate();
-          KALDI_VLOG(2) << nnet.InfoGradient();
-        }
-      }
-    }  // main loop over utterances,
-
-    // After last utterance,
-    KALDI_VLOG(1) << nnet.InfoPropagate();
-    KALDI_VLOG(1) << nnet.InfoBackPropagate();
-    KALDI_VLOG(1) << nnet.InfoGradient();
-
-    // Add the softmax layer back before writing,
-    KALDI_LOG << "Appending the softmax " << target_model_filename;
-    nnet.AppendComponentPointer(new Softmax(nnet.OutputDim(), nnet.OutputDim()));
-    // Store the nnet,
-    nnet.Write(target_model_filename, binary);
-
-    time_now = time.Elapsed();
-    KALDI_LOG << "TRAINING FINISHED; "
-              << "Time taken = " << time_now/60 << " min; processed "
-              << (total_frames/time_now) << " frames per second.";
-
-    KALDI_LOG << "Done " << num_done << " files, "
-              << num_no_num_ali << " with no numerator alignments, "
-              << num_no_den_lat << " with no denominator lattices, "
-              << num_other_error << " with other errors.";
-
-    KALDI_LOG << "Overall MMI-objective/frame is "
-              << std::setprecision(8) << total_mmi_obj / total_frames
-              << " over " << total_frames << " frames,"
-              << " (average den-posterior on ali "
-              << total_post_on_ali / total_frames << ","
-              << " dropped " << num_frm_drop
-              << " frames with num/den mismatch)";
-
-#if HAVE_CUDA == 1
-    CuDevice::Instantiate().PrintProfile();
-#endif
-
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/nnetbin/nnet-train-mpe-sequential.cc b/src/nnetbin/nnet-train-mpe-sequential.cc
deleted file mode 100644
index 2ba14527142..00000000000
--- a/src/nnetbin/nnet-train-mpe-sequential.cc
+++ /dev/null
@@ -1,412 +0,0 @@
-// nnetbin/nnet-train-mpe-sequential.cc
-
-// Copyright 2011-2016  Brno University of Technology (author: Karel Vesely);
-//                      Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "tree/context-dep.h"
-#include "hmm/transition-model.h"
-#include "fstext/fstext-lib.h"
-#include "decoder/faster-decoder.h"
-#include "decoder/decodable-matrix.h"
-#include "lat/kaldi-lattice.h"
-#include "lat/lattice-functions.h"
-
-#include "nnet/nnet-trnopts.h"
-#include "nnet/nnet-component.h"
-#include "nnet/nnet-activation.h"
-#include "nnet/nnet-nnet.h"
-#include "nnet/nnet-pdf-prior.h"
-#include "nnet/nnet-utils.h"
-#include "base/timer.h"
-#include "cudamatrix/cu-device.h"
-
-
-namespace kaldi {
-namespace nnet1 {
-
-void LatticeAcousticRescore(const Matrix<BaseFloat> &log_like,
-                            const TransitionModel &trans_model,
-                            const std::vector<int32> &state_times,
-                            Lattice *lat) {
-  kaldi::uint64 props = lat->Properties(fst::kFstProperties, false);
-  if (!(props & fst::kTopSorted))
-    KALDI_ERR << "Input lattice must be topologically sorted.";
-
-  KALDI_ASSERT(!state_times.empty());
-  std::vector<std::vector<int32> > time_to_state(log_like.NumRows());
-  for (size_t i = 0; i < state_times.size(); i++) {
-    KALDI_ASSERT(state_times[i] >= 0);
-    if (state_times[i] < log_like.NumRows())  // end state may be past this..
-      time_to_state[state_times[i]].push_back(i);
-    else
-      KALDI_ASSERT(state_times[i] == log_like.NumRows()
-                   && "There appears to be lattice/feature mismatch.");
-  }
-
-  for (int32 t = 0; t < log_like.NumRows(); t++) {
-    for (size_t i = 0; i < time_to_state[t].size(); i++) {
-      int32 state = time_to_state[t][i];
-      for (fst::MutableArcIterator<Lattice> aiter(lat, state); !aiter.Done();
-           aiter.Next()) {
-        LatticeArc arc = aiter.Value();
-        int32 trans_id = arc.ilabel;
-        if (trans_id != 0) {  // Non-epsilon input label on arc
-          int32 pdf_id = trans_model.TransitionIdToPdf(trans_id);
-          arc.weight.SetValue2(-log_like(t, pdf_id) + arc.weight.Value2());
-          aiter.SetValue(arc);
-        }
-      }
-    }
-  }
-}
-
-}  // namespace nnet1
-}  // namespace kaldi
-
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  using namespace kaldi::nnet1;
-  typedef kaldi::int32 int32;
-  try {
-    const char *usage =
-      "Perform one iteration of MPE/sMBR training using SGD with per-utterance"
-      "updates.\n"
-
-      "Usage:  nnet-train-mpe-sequential [options] "
-      "<model-in> <transition-model-in> <feature-rspecifier> "
-      "<den-lat-rspecifier> <ali-rspecifier> [<model-out>]\n"
-
-      "e.g.: nnet-train-mpe-sequential nnet.init trans.mdl scp:feats.scp "
-      "scp:denlats.scp ark:ali.ark nnet.iter1\n";
-
-    ParseOptions po(usage);
-
-    NnetTrainOptions trn_opts;
-    trn_opts.learn_rate = 0.00001;  // changing default,
-    trn_opts.Register(&po);
-
-    bool binary = true;
-    po.Register("binary", &binary, "Write output in binary mode");
-
-    std::string feature_transform;
-    po.Register("feature-transform", &feature_transform,
-                "Feature transform in 'nnet1' format");
-
-    std::string silence_phones_str;
-    po.Register("silence-phones", &silence_phones_str,
-        "Colon-separated list of integer id's of silence phones, e.g. 46:47");
-
-    PdfPriorOptions prior_opts;
-    prior_opts.Register(&po);
-
-    BaseFloat acoustic_scale = 1.0,
-        lm_scale = 1.0,
-        old_acoustic_scale = 0.0;
-
-    po.Register("acoustic-scale", &acoustic_scale,
-        "Scaling factor for acoustic likelihoods");
-
-    po.Register("lm-scale", &lm_scale,
-        "Scaling factor for \"graph costs\" (including LM costs)");
-
-    po.Register("old-acoustic-scale", &old_acoustic_scale,
-        "Add in the scores in the input lattices with this scale, rather "
-        "than discarding them.");
-
-    bool one_silence_class = false;
-    po.Register("one-silence-class", &one_silence_class,
-        "If true, the newer behavior reduces insertions.");
-
-    kaldi::int32 max_frames = 6000;
-    po.Register("max-frames", &max_frames,
-        "Maximum number of frames an utterance can have (skipped if longer)");
-
-    bool do_smbr = false;
-    po.Register("do-smbr", &do_smbr,
-        "Use state-level accuracies instead of phone accuracies.");
-
-    std::string use_gpu="yes";
-    po.Register("use-gpu", &use_gpu,
-        "yes|no|optional, only has effect if compiled with CUDA");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 6) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_filename = po.GetArg(1),
-        transition_model_filename = po.GetArg(2),
-        feature_rspecifier = po.GetArg(3),
-        den_lat_rspecifier = po.GetArg(4),
-        ref_ali_rspecifier = po.GetArg(5),
-        target_model_filename = po.GetArg(6);
-
-    std::vector<int32> silence_phones;
-    if (!kaldi::SplitStringToIntegers(silence_phones_str, ":", false,
-                                      &silence_phones)) {
-      KALDI_ERR << "Invalid silence-phones string " << silence_phones_str;
-    }
-    kaldi::SortAndUniq(&silence_phones);
-    if (silence_phones.empty()) {
-      KALDI_LOG << "No silence phones specified.";
-    }
-
-#if HAVE_CUDA == 1
-    CuDevice::Instantiate().SelectGpuId(use_gpu);
-#endif
-
-    Nnet nnet_transf;
-    if (feature_transform != "") {
-      nnet_transf.Read(feature_transform);
-    }
-
-    Nnet nnet;
-    nnet.Read(model_filename);
-    // we will use pre-softmax activations, removing softmax,
-    // - pre-softmax activations are equivalent to 'log-posterior + C_frame',
-    // - all paths crossing a frame share same 'C_frame',
-    // - with GMM, we also have the unnormalized acoustic likelihoods,
-    if (nnet.GetLastComponent().GetType() ==
-        kaldi::nnet1::Component::kSoftmax) {
-      KALDI_LOG << "Removing softmax from the nnet " << model_filename;
-      nnet.RemoveLastComponent();
-    } else {
-      KALDI_LOG << "The nnet was without softmax. "
-                << "The last component in " << model_filename << " was "
-                << Component::TypeToMarker(nnet.GetLastComponent().GetType());
-    }
-    nnet.SetTrainOptions(trn_opts);
-
-    // Read the class-frame-counts, compute priors,
-    PdfPrior log_prior(prior_opts);
-
-    // Read transition model,
-    TransitionModel trans_model;
-    ReadKaldiObject(transition_model_filename, &trans_model);
-
-    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    RandomAccessLatticeReader den_lat_reader(den_lat_rspecifier);
-    RandomAccessInt32VectorReader ref_ali_reader(ref_ali_rspecifier);
-
-    CuMatrix<BaseFloat> feats_transf, nnet_out, nnet_diff;
-    Matrix<BaseFloat> nnet_out_h;
-
-    Timer time;
-    double time_now = 0;
-    KALDI_LOG << "TRAINING STARTED";
-
-    int32 num_done = 0,
-          num_no_ref_ali = 0,
-          num_no_den_lat = 0,
-          num_other_error = 0;
-
-    kaldi::int64 total_frames = 0;
-    double total_frame_acc = 0.0, utt_frame_acc;
-
-    // main loop over utterances,
-    for (; !feature_reader.Done(); feature_reader.Next()) {
-      std::string utt = feature_reader.Key();
-      if (!den_lat_reader.HasKey(utt)) {
-        KALDI_WARN << "Missing lattice for " << utt;
-        num_no_den_lat++;
-        continue;
-      }
-      if (!ref_ali_reader.HasKey(utt)) {
-        KALDI_WARN << "Missing alignment for " << utt;
-        num_no_ref_ali++;
-        continue;
-      }
-
-      // 1) get the features, numerator alignment,
-      const Matrix<BaseFloat> &mat = feature_reader.Value();
-      const std::vector<int32> &ref_ali = ref_ali_reader.Value(utt);
-      // check duration of numerator alignments,
-      if (static_cast<MatrixIndexT>(ref_ali.size()) != mat.NumRows()) {
-        KALDI_WARN << "Duration mismatch!"
-                   << " alignment " << ref_ali.size()
-                   << " features " << mat.NumRows();
-        num_other_error++;
-        continue;
-      }
-      if (mat.NumRows() > max_frames) {
-        KALDI_WARN << "Skipping " << utt
-          << " that has " << mat.NumRows() << " frames,"
-          << " it is longer than '--max-frames'" << max_frames;
-        num_other_error++;
-        continue;
-      }
-
-      // 2) get the denominator lattice, preprocess
-      Lattice den_lat = den_lat_reader.Value(utt);
-      if (den_lat.Start() == -1) {
-        KALDI_WARN << "Empty lattice of " << utt << ", skipping.";
-        num_other_error++;
-        continue;
-      }
-      if (old_acoustic_scale != 1.0) {
-        fst::ScaleLattice(fst::AcousticLatticeScale(old_acoustic_scale),
-                          &den_lat);
-      }
-      // optional sort it topologically
-      kaldi::uint64 props = den_lat.Properties(fst::kFstProperties, false);
-      if (!(props & fst::kTopSorted)) {
-        if (fst::TopSort(&den_lat) == false) {
-          KALDI_ERR << "Cycles detected in lattice.";
-        }
-      }
-      // get the lattice length and times of states
-      std::vector<int32> state_times;
-      int32 max_time = kaldi::LatticeStateTimes(den_lat, &state_times);
-      // check for temporal length of denominator lattices
-      if (max_time != mat.NumRows()) {
-        KALDI_WARN << "Duration mismatch!"
-          << " denominator lattice " << max_time
-          << " features " << mat.NumRows() << ","
-          << " skipping " << utt;
-        num_other_error++;
-        continue;
-      }
-
-      // get dims,
-      int32 num_frames = mat.NumRows();
-
-      // 3) get the pre-softmax outputs from NN,
-      // apply transform,
-      nnet_transf.Feedforward(CuMatrix<BaseFloat>(mat), &feats_transf);
-      // propagate through the nnet (we know it's w/o softmax),
-      nnet.Propagate(feats_transf, &nnet_out);
-      // subtract the log_prior,
-      if (prior_opts.class_frame_counts != "") {
-        log_prior.SubtractOnLogpost(&nnet_out);
-      }
-      // transfer it back to the host,
-      nnet_out_h = Matrix<BaseFloat>(nnet_out);
-      // release the buffers we don't need anymore
-      feats_transf.Resize(0, 0);
-      nnet_out.Resize(0, 0);
-
-      // 4) rescore the latice
-      LatticeAcousticRescore(nnet_out_h, trans_model, state_times, &den_lat);
-      if (acoustic_scale != 1.0 || lm_scale != 1.0)
-        fst::ScaleLattice(fst::LatticeScale(lm_scale, acoustic_scale), &den_lat);
-
-      kaldi::Posterior post;
-      if (do_smbr) {
-        // use state-level accuracies, i.e. sMBR estimation,
-        utt_frame_acc = LatticeForwardBackwardMpeVariants(
-            trans_model, silence_phones, den_lat, ref_ali, "smbr",
-            one_silence_class, &post);
-      } else {
-        // use phone-level accuracies, i.e. MPFE (minimum phone frame error),
-        utt_frame_acc = LatticeForwardBackwardMpeVariants(
-            trans_model, silence_phones, den_lat, ref_ali, "mpfe",
-            one_silence_class, &post);
-      }
-
-      // 6) convert the Posterior to a matrix,
-      PosteriorToPdfMatrix(post, trans_model, &nnet_diff);
-      nnet_diff.Scale(-1.0);  // need to flip the sign of derivative,
-
-      KALDI_VLOG(1) << "Lattice #" << num_done + 1 << " processed"
-                    << " (" << utt << "): found " << den_lat.NumStates()
-                    << " states and " << fst::NumArcs(den_lat) << " arcs.";
-
-      KALDI_VLOG(1) << "Utterance " << utt << ": Average frame accuracy = "
-                    << (utt_frame_acc/num_frames) << " over " << num_frames
-                    << " frames,"
-                    << " diff-range(" << nnet_diff.Min() << ","
-                                      << nnet_diff.Max() << ")";
-
-      // 7) backpropagate through the nnet, update,
-      nnet.Backpropagate(nnet_diff, NULL);
-      nnet_diff.Resize(0, 0);  // release GPU memory,
-
-      // increase time counter
-      total_frame_acc += utt_frame_acc;
-      total_frames += num_frames;
-      num_done++;
-
-      if (num_done % 100 == 0) {
-        time_now = time.Elapsed();
-        KALDI_VLOG(1) << "After " << num_done << " utterances: "
-          << "time elapsed = " << time_now / 60 << " min; "
-          << "processed " << total_frames / time_now << " frames per sec.";
-#if HAVE_CUDA == 1
-        // check that GPU computes accurately,
-        CuDevice::Instantiate().CheckGpuHealth();
-#endif
-      }
-
-      // GRADIENT LOGGING
-      // First utterance,
-      if (num_done == 1) {
-        KALDI_VLOG(1) << nnet.InfoPropagate();
-        KALDI_VLOG(1) << nnet.InfoBackPropagate();
-        KALDI_VLOG(1) << nnet.InfoGradient();
-      }
-      // Every 1000 utterances (--verbose=2),
-      if (GetVerboseLevel() >= 2) {
-        if (num_done % 1000 == 0) {
-          KALDI_VLOG(2) << nnet.InfoPropagate();
-          KALDI_VLOG(2) << nnet.InfoBackPropagate();
-          KALDI_VLOG(2) << nnet.InfoGradient();
-        }
-      }
-    }  // main loop over utterances,
-
-    // After last utterance,
-    KALDI_VLOG(1) << nnet.InfoPropagate();
-    KALDI_VLOG(1) << nnet.InfoBackPropagate();
-    KALDI_VLOG(1) << nnet.InfoGradient();
-
-    // Add the softmax layer back before writing,
-    KALDI_LOG << "Appending the softmax " << target_model_filename;
-    nnet.AppendComponentPointer(new Softmax(nnet.OutputDim(), nnet.OutputDim()));
-    // Store the nnet,
-    nnet.Write(target_model_filename, binary);
-
-    time_now = time.Elapsed();
-    KALDI_LOG << "TRAINING FINISHED; "
-              << "Time taken = " << time_now / 60 << " min; processed "
-              << total_frames / time_now << " frames per second.";
-
-    KALDI_LOG << "Done " << num_done << " files, "
-              << num_no_ref_ali << " with no reference alignments, "
-              << num_no_den_lat << " with no lattices, "
-              << num_other_error << " with other errors.";
-
-    KALDI_LOG << "Overall average frame-accuracy is "
-              << total_frame_acc / total_frames << " over "
-              << total_frames << " frames.";
-
-#if HAVE_CUDA == 1
-    CuDevice::Instantiate().PrintProfile();
-#endif
-
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/nnetbin/nnet-train-multistream-perutt.cc b/src/nnetbin/nnet-train-multistream-perutt.cc
deleted file mode 100644
index 3694cf29e01..00000000000
--- a/src/nnetbin/nnet-train-multistream-perutt.cc
+++ /dev/null
@@ -1,363 +0,0 @@
-// nnetbin/nnet-train-multistream-perutt.cc
-
-// Copyright 2016 Brno University of Technology (author: Karel Vesely)
-// Copyright 2015 Chongjia Ni
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet/nnet-trnopts.h"
-#include "nnet/nnet-nnet.h"
-#include "nnet/nnet-loss.h"
-#include "nnet/nnet-matrix-buffer.h"
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "base/timer.h"
-#include "cudamatrix/cu-device.h"
-
-#include <numeric>
-#include <algorithm>
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  using namespace kaldi::nnet1;
-  typedef kaldi::int32 int32;
-
-  try {
-    const char *usage =
-      "Perform one iteration of Multi-stream training, per-utterance BPTT for (B)LSTMs.\n"
-      "The updates are done per-utterance, while several utterances are \n"
-      "processed at the same time.\n"
-      "\n"
-      "Usage: nnet-train-multistream-perutt [options] <feature-rspecifier> <labels-rspecifier> <model-in> [<model-out>]\n"
-      "e.g.: nnet-train-blstm-streams scp:feats.scp ark:targets.ark nnet.init nnet.iter1\n";
-
-    ParseOptions po(usage);
-
-    // training options,
-    NnetTrainOptions trn_opts;
-    trn_opts.Register(&po);
-    LossOptions loss_opts;
-    loss_opts.Register(&po);
-
-    bool binary = true;
-    po.Register("binary", &binary, "Write model in binary mode");
-
-    bool crossvalidate = false;
-    po.Register("cross-validate", &crossvalidate,
-        "Perform cross-validation (no backpropagation)");
-
-    std::string feature_transform;
-    po.Register("feature-transform", &feature_transform,
-        "Feature transform in Nnet format");
-
-    int32 length_tolerance = 5;
-    po.Register("length-tolerance", &length_tolerance,
-        "Allowed length difference of features/targets (frames)");
-
-    std::string frame_weights;
-    po.Register("frame-weights", &frame_weights,
-        "Per-frame weights to scale gradients (frame selection/weighting).");
-
-    int32 num_streams = 20;
-    po.Register("num-streams", &num_streams,
-        "Number of sentences processed in parallel (can be lower if sentences are long)");
-
-    double max_frames = 8000;
-    po.Register("max-frames", &max_frames,
-        "Max number of frames to be processed");
-
-    bool dummy = false;
-    po.Register("randomize", &dummy, "Dummy option.");
-
-    std::string use_gpu = "yes";
-    po.Register("use-gpu", &use_gpu,
-        "yes|no|optional, only has effect if compiled with CUDA");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 3 + (crossvalidate ? 0 : 1)) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string feature_rspecifier = po.GetArg(1),
-      targets_rspecifier = po.GetArg(2),
-      model_filename = po.GetArg(3);
-
-    std::string target_model_filename;
-    if (!crossvalidate) {
-      target_model_filename = po.GetArg(4);
-    }
-
-    using namespace kaldi;
-    using namespace kaldi::nnet1;
-    typedef kaldi::int32 int32;
-
-#if HAVE_CUDA == 1
-    CuDevice::Instantiate().SelectGpuId(use_gpu);
-#endif
-
-    Nnet nnet_transf;
-    if ( feature_transform != "" ) {
-      nnet_transf.Read(feature_transform);
-    }
-
-    Nnet nnet;
-    nnet.Read(model_filename);
-    nnet.SetTrainOptions(trn_opts);
-
-    if (crossvalidate) {
-      nnet_transf.SetDropoutRate(0.0);
-      nnet.SetDropoutRate(0.0);
-    }
-
-    kaldi::int64 total_frames = 0;
-
-    // Initialize feature and target readers,
-    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    RandomAccessPosteriorReader targets_reader(targets_rspecifier);
-    RandomAccessBaseFloatVectorReader weights_reader;
-    if (frame_weights != "") {
-      weights_reader.Open(frame_weights);
-    }
-
-
-    Xent xent(loss_opts);
-
-    CuMatrix<BaseFloat> feats_transf, nnet_out, obj_diff;
-
-    Timer time;
-    KALDI_LOG << (crossvalidate ? "CROSS-VALIDATION" : "TRAINING")
-              << " STARTED";
-
-    // Buffer for input features, used for choosing utt's with similar length,
-    MatrixBuffer matrix_buffer;
-    matrix_buffer.Init(&feature_reader);
-
-    int32 num_done = 0,
-          num_no_tgt_mat = 0,
-          num_other_error = 0;
-
-    while (!matrix_buffer.Done()) {
-
-      // Fill the parallel data into 'std::vector',
-      std::vector<Matrix<BaseFloat> > feats_utt;
-      std::vector<Posterior> labels_utt;
-      std::vector<Vector<BaseFloat> > weights_utt;
-      std::vector<int32> frame_num_utt;
-      {
-        matrix_buffer.ResetLength();  ///< reset the 'preferred' length,
-        for (matrix_buffer.Next(); !matrix_buffer.Done(); matrix_buffer.Next()) {
-          std::string utt = matrix_buffer.Key();
-          // Check that we have targets,
-          if (!targets_reader.HasKey(utt)) {
-            KALDI_WARN << utt << ", missing targets";
-            num_no_tgt_mat++;
-            continue;
-          }
-          // Do we have frame-weights?
-          if (frame_weights != "" && !weights_reader.HasKey(utt)) {
-            KALDI_WARN << utt << ", missing frame-weights";
-            num_other_error++;
-            continue;
-          }
-
-          // Get feature / target pair,
-          Matrix<BaseFloat> mat = matrix_buffer.Value();
-          Posterior targets  = targets_reader.Value(utt);
-
-          // Skip too long sentences,
-          if (mat.NumRows() > max_frames) continue;
-
-          Vector<BaseFloat> weights;
-          if (frame_weights != "") {
-            weights = weights_reader.Value(utt);
-          } else {  // all per-frame weights are 1.0
-            weights.Resize(mat.NumRows());
-            weights.Set(1.0);
-          }
-
-          // correct small length mismatch ... or drop sentence
-          {
-            // add lengths to vector
-            std::vector<int32> length;
-            length.push_back(mat.NumRows());
-            length.push_back(targets.size());
-            length.push_back(weights.Dim());
-            // find min, max
-            int32 min = *std::min_element(length.begin(), length.end());
-            int32 max = *std::max_element(length.begin(), length.end());
-            // fix or drop ?
-            if (max - min < length_tolerance) {
-              if (mat.NumRows() != min) mat.Resize(min, mat.NumCols(), kCopyData);
-              if (targets.size() != min) targets.resize(min);
-              if (weights.Dim() != min) weights.Resize(min, kCopyData);
-            } else {
-              KALDI_WARN << "Length mismatch! Targets " << targets.size()
-                         << ", features " << mat.NumRows() << ", " << utt;
-              num_other_error++;
-              continue;
-            }
-          }
-
-          // input transform may contain splicing,
-          nnet_transf.Feedforward(CuMatrix<BaseFloat>(mat), &feats_transf);
-
-          // store,
-          feats_utt.push_back(Matrix<BaseFloat>(feats_transf));
-          labels_utt.push_back(targets);
-          weights_utt.push_back(weights);
-          frame_num_utt.push_back(feats_transf.NumRows());
-
-          if (frame_num_utt.size() == num_streams) break;
-
-          // See how many frames we'd have (after padding), if we add one more utterance,
-          int32 max = (*std::max_element(frame_num_utt.begin(), frame_num_utt.end()));
-          if (max * (frame_num_utt.size() + 1) > max_frames) break;
-        }
-      }
-      // Having no data? Skip the cycle...
-      if (frame_num_utt.size() == 0) continue;
-
-      // Pack the parallel data,
-      Matrix<BaseFloat> feat_mat_host;
-      Posterior target_host;
-      Vector<BaseFloat> weight_host;
-      {
-        // Number of sequences,
-        int32 n_streams = frame_num_utt.size();
-        int32 frame_num_padded = (*std::max_element(frame_num_utt.begin(), frame_num_utt.end()));
-        int32 feat_dim = feats_utt.front().NumCols();
-
-        // Create the final feature matrix. Every utterance is padded to the max
-        // length within this group of utterances,
-        feat_mat_host.Resize(n_streams * frame_num_padded, feat_dim, kSetZero);
-        target_host.resize(n_streams * frame_num_padded);
-        weight_host.Resize(n_streams * frame_num_padded, kSetZero);
-
-        for (int32 s = 0; s < n_streams; s++) {
-          const Matrix<BaseFloat>& mat_tmp = feats_utt[s];
-          for (int32 r = 0; r < frame_num_utt[s]; r++) {
-            feat_mat_host.Row(r*n_streams + s).CopyFromVec(mat_tmp.Row(r));
-          }
-        }
-
-        for (int32 s = 0; s < n_streams; s++) {
-          const Posterior& target_tmp = labels_utt[s];
-          for (int32 r = 0; r < frame_num_utt[s]; r++) {
-            target_host[r*n_streams + s] = target_tmp[r];
-          }
-        }
-
-        // padded frames will keep initial zero-weight,
-        for (int32 s = 0; s < n_streams; s++) {
-          const Vector<BaseFloat>& weight_tmp = weights_utt[s];
-          for (int32 r = 0; r < frame_num_utt[s]; r++) {
-            weight_host(r*n_streams + s) = weight_tmp(r);
-          }
-        }
-      }
-
-      // Set the original lengths of utterances before padding,
-      nnet.SetSeqLengths(frame_num_utt);
-      // Show the 'utt' lengths in the VLOG[2],
-      if (GetVerboseLevel() >= 2) {
-        std::ostringstream os;
-        os << "[ ";
-        for (size_t i = 0; i < frame_num_utt.size(); i++) {
-          os << frame_num_utt[i] << " ";
-        }
-        os << "]";
-        KALDI_LOG << "frame_num_utt[" << frame_num_utt.size() << "]" << os.str();
-      }
-      // Reset all the streams (we have new sentences),
-      nnet.ResetStreams(std::vector<int32>(frame_num_utt.size(), 1));
-
-      // Propagation,
-      nnet.Propagate(CuMatrix<BaseFloat>(feat_mat_host), &nnet_out);
-
-      // Per-frame cross-entropy, gradients get re-scaled by weights,
-      xent.Eval(weight_host, nnet_out, target_host, &obj_diff);
-
-      // Backward pass
-      if (!crossvalidate) {
-        nnet.Backpropagate(obj_diff, NULL);
-      }
-
-      // 1st model update : show what happens in network,
-      if (total_frames == 0) {
-        KALDI_LOG << "### After " << total_frames << " frames,";
-        KALDI_LOG << nnet.Info();
-        KALDI_LOG << nnet.InfoPropagate();
-        if (!crossvalidate) {
-          KALDI_LOG << nnet.InfoBackPropagate();
-          KALDI_LOG << nnet.InfoGradient();
-        }
-      }
-
-      kaldi::int64 tmp_frames = total_frames;
-
-      num_done += frame_num_utt.size();
-      total_frames += std::accumulate(frame_num_utt.begin(), frame_num_utt.end(), 0);
-
-      // monitor the NN training (--verbose=2),
-      int32 F = 25000;
-      if (GetVerboseLevel() >= 3) {
-        // print every 25k frames,
-        if (tmp_frames / F != total_frames / F) {
-          KALDI_VLOG(2) << "### After " << total_frames << " frames,";
-          KALDI_VLOG(2) << nnet.Info();
-          KALDI_VLOG(2) << nnet.InfoPropagate();
-          if (!crossvalidate) {
-            KALDI_VLOG(2) << nnet.InfoBackPropagate();
-            KALDI_VLOG(2) << nnet.InfoGradient();
-          }
-        }
-      }
-    }
-
-    // after last model update : show what happens in network,
-    KALDI_LOG << "### After " << total_frames << " frames,";
-    KALDI_LOG << nnet.Info();
-    KALDI_LOG << nnet.InfoPropagate();
-    if (!crossvalidate) {
-      KALDI_LOG << nnet.InfoBackPropagate();
-      KALDI_LOG << nnet.InfoGradient();
-    }
-
-    if (!crossvalidate) {
-      nnet.Write(target_model_filename, binary);
-    }
-
-    KALDI_LOG << xent.ReportPerClass();
-    KALDI_LOG << "Done " << num_done << " files, " << num_no_tgt_mat
-              << " with no tgt_mats, " << num_other_error
-              << " with other errors. "
-              << "[" << (crossvalidate ? "CROSS-VALIDATION" : "TRAINING")
-              << ", " << time.Elapsed() / 60 << " min, "
-              << "fps" << total_frames / time.Elapsed() << "]";
-    KALDI_LOG << xent.Report();
-
-#if HAVE_CUDA == 1
-    CuDevice::Instantiate().PrintProfile();
-#endif
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/nnetbin/nnet-train-multistream.cc b/src/nnetbin/nnet-train-multistream.cc
deleted file mode 100644
index 133c49e02a5..00000000000
--- a/src/nnetbin/nnet-train-multistream.cc
+++ /dev/null
@@ -1,460 +0,0 @@
-// nnetbin/nnet-train-multistream.cc
-
-// Copyright 2015-2016  Brno University of Technology (Author: Karel Vesely)
-//           2014  Jiayu DU (Jerry), Wei Li
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <numeric>
-
-#include "nnet/nnet-trnopts.h"
-#include "nnet/nnet-nnet.h"
-#include "nnet/nnet-loss.h"
-#include "nnet/nnet-randomizer.h"
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "base/timer.h"
-#include "cudamatrix/cu-device.h"
-
-
-namespace kaldi {
-
-bool ReadData(SequentialBaseFloatMatrixReader& feature_reader,
-              RandomAccessPosteriorReader& target_reader,
-              RandomAccessBaseFloatVectorReader& weights_reader,
-              int32 length_tolerance,
-              Matrix<BaseFloat>* feats,
-              Posterior* targets,
-              Vector<BaseFloat>* weights,
-              int32* num_no_tgt_mat,
-              int32* num_other_error) {
-
-  // We're looking for the 1st valid utterance...
-  for ( ; !feature_reader.Done(); feature_reader.Next()) {
-    // Do we have targets?
-    const std::string& utt = feature_reader.Key();
-    if (!target_reader.HasKey(utt)) {
-      KALDI_WARN << utt << ", missing targets";
-      (*num_no_tgt_mat)++;
-      continue;
-    }
-    // Do we have frame-weights?
-    if (weights_reader.IsOpen() && !weights_reader.HasKey(utt)) {
-      KALDI_WARN << utt << ", missing frame-weights";
-      (*num_other_error)++;
-      continue;
-    }
-
-    // get the (feature,target) pair,
-    (*feats) = feature_reader.Value();
-    (*targets) = target_reader.Value(utt);
-
-    // getting per-frame weights,
-    if (weights_reader.IsOpen()) {
-      (*weights) = weights_reader.Value(utt);
-    } else {  // all per-frame weights are 1.0
-      weights->Resize(feats->NumRows());
-      weights->Set(1.0);
-    }
-
-    // correct small length mismatch ... or drop sentence
-    {
-      // add lengths to vector
-      std::vector<int32> length;
-      length.push_back(feats->NumRows());
-      length.push_back(targets->size());
-      length.push_back(weights->Dim());
-      // find min, max
-      int32 min = *std::min_element(length.begin(), length.end());
-      int32 max = *std::max_element(length.begin(), length.end());
-      // fix or drop ?
-      if (max - min < length_tolerance) {
-        if (feats->NumRows() != min) feats->Resize(min, feats->NumCols(), kCopyData);
-        if (targets->size() != min) targets->resize(min);
-        if (weights->Dim() != min) weights->Resize(min, kCopyData);
-      } else {
-        KALDI_WARN << "Length mismatch! Targets " << targets->size()
-                   << ", features " << feats->NumRows() << ", " << utt;
-        num_other_error++;
-        continue;
-      }
-    }
-
-    // By getting here we got a valid utterance,
-    feature_reader.Next();
-    return true;
-  }
-
-  // No more data,
-  return false;
-}
-
-}  // namespace kaldi
-
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  using namespace kaldi::nnet1;
-  typedef kaldi::int32 int32;
-
-  try {
-    const char *usage =
-        "Perform one iteration of Multi-stream training, truncated BPTT for LSTMs.\n"
-        "The training targets are pdf-posteriors, usually prepared by ali-to-post.\n"
-        "The updates are per-utterance.\n"
-        "\n"
-        "Usage: nnet-train-multistream [options] "
-          "<feature-rspecifier> <targets-rspecifier> <model-in> [<model-out>]\n"
-        "e.g.: nnet-train-lstm-streams scp:feature.scp ark:posterior.ark nnet.init nnet.iter1\n";
-
-    ParseOptions po(usage);
-
-    NnetTrainOptions trn_opts;
-    trn_opts.Register(&po);
-    LossOptions loss_opts;
-    loss_opts.Register(&po);
-
-    bool binary = true;
-    po.Register("binary", &binary, "Write output in binary mode");
-
-    bool crossvalidate = false;
-    po.Register("cross-validate", &crossvalidate,
-        "Perform cross-validation (don't back-propagate)");
-
-    std::string feature_transform;
-    po.Register("feature-transform", &feature_transform,
-        "Feature transform in Nnet format");
-
-    std::string objective_function = "xent";
-    po.Register("objective-function", &objective_function,
-        "Objective function : xent|mse");
-
-    int32 length_tolerance = 5;
-    po.Register("length-tolerance", &length_tolerance,
-      "Allowed length difference of features/targets (frames)");
-
-    std::string frame_weights;
-    po.Register("frame-weights", &frame_weights,
-      "Per-frame weights to scale gradients (frame selection/weighting).");
-
-    int32 batch_size = 20;
-    po.Register("batch-size", &batch_size,
-      "Length of 'one stream' in the Multi-stream training");
-
-    int32 num_streams = 4;
-    po.Register("num-streams", &num_streams,
-      "Number of streams in the Multi-stream training");
-
-    bool dummy = false;
-    po.Register("randomize", &dummy, "Dummy option.");
-
-    std::string use_gpu="yes";
-    po.Register("use-gpu", &use_gpu,
-        "yes|no|optional, only has effect if compiled with CUDA");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 3 + (crossvalidate ? 0 : 1)) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string feature_rspecifier = po.GetArg(1),
-      targets_rspecifier = po.GetArg(2),
-      model_filename = po.GetArg(3);
-
-    std::string target_model_filename;
-    if (!crossvalidate) {
-      target_model_filename = po.GetArg(4);
-    }
-
-    using namespace kaldi;
-    using namespace kaldi::nnet1;
-    typedef kaldi::int32 int32;
-
-#if HAVE_CUDA == 1
-    CuDevice::Instantiate().SelectGpuId(use_gpu);
-#endif
-
-    Nnet nnet_transf;
-    if (feature_transform != "") {
-      nnet_transf.Read(feature_transform);
-    }
-
-    Nnet nnet;
-    nnet.Read(model_filename);
-    nnet.SetTrainOptions(trn_opts);
-
-    if (crossvalidate) {
-      nnet_transf.SetDropoutRate(0.0);
-      nnet.SetDropoutRate(0.0);
-    }
-
-    kaldi::int64 total_frames = 0;
-
-    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    RandomAccessPosteriorReader target_reader(targets_rspecifier);
-    RandomAccessBaseFloatVectorReader weights_reader;
-    if (frame_weights != "") {
-      weights_reader.Open(frame_weights);
-    }
-
-    Xent xent(loss_opts);
-    Mse mse(loss_opts);
-
-    Timer time;
-    KALDI_LOG << (crossvalidate ? "CROSS-VALIDATION" : "TRAINING")
-              << " STARTED";
-
-    int32 num_done = 0,
-          num_no_tgt_mat = 0,
-          num_other_error = 0;
-
-    // book-keeping for multi-stream training,
-    std::vector<Matrix<BaseFloat> > feats_utt(num_streams);
-    std::vector<Posterior> labels_utt(num_streams);
-    std::vector<Vector<BaseFloat> > weights_utt(num_streams);
-    std::vector<int32> new_utt_flags(num_streams);
-
-    CuMatrix<BaseFloat> feats_transf, nnet_out, obj_diff;
-
-    // MAIN LOOP,
-    while (1) {
-
-      // Re-fill the streams, if needed,
-      new_utt_flags.assign(num_streams, 0);  // set new-utterance flags to zero,
-      for (int s = 0; s < num_streams; s++) {
-        // Need a new utterance for stream 's'?
-        if (feats_utt[s].NumRows() == 0) {
-          Matrix<BaseFloat> feats;
-          Posterior targets;
-          Vector<BaseFloat> weights;
-          // get the data from readers,
-          if (ReadData(feature_reader, target_reader, weights_reader,
-                       length_tolerance,
-                       &feats, &targets, &weights,
-                       &num_no_tgt_mat, &num_other_error)) {
-
-            // input transform may contain splicing,
-            nnet_transf.Feedforward(CuMatrix<BaseFloat>(feats), &feats_transf);
-
-            /* Here we could do the 'targets_delay', BUT...
-             * It is better to do it by a <Splice> component!
-             *
-             * The prototype would look like this (6th frame becomes 1st frame, etc.):
-             * '<Splice> <InputDim> dim1 <OutputDim> dim1 <BuildVector> 5 </BuildVector>'
-             */
-
-            // store,
-            feats_utt[s] = Matrix<BaseFloat>(feats_transf);
-            labels_utt[s] = targets;
-            weights_utt[s] = weights;
-            new_utt_flags[s] = 1;
-          }
-        }
-      }
-
-      // end the training after processing all the frames,
-      size_t frames_to_go = 0;
-      for (int32 s = 0; s < num_streams; s++) {
-        frames_to_go += feats_utt[s].NumRows();
-      }
-      if (frames_to_go == 0) break;
-
-      // number of frames we'll pack as the streams,
-      std::vector<int32> frame_num_utt;
-
-      // pack the parallel data,
-      Matrix<BaseFloat> feat_mat_host;
-      Posterior target_host;
-      Vector<BaseFloat> weight_host;
-      {
-        // Number of sequences (can have zero length),
-        int32 n_streams = num_streams;
-
-        // Create the final feature matrix with 'interleaved feature-lines',
-        feat_mat_host.Resize(n_streams * batch_size, nnet.InputDim(), kSetZero);
-        target_host.resize(n_streams * batch_size);
-        weight_host.Resize(n_streams * batch_size, kSetZero);
-        frame_num_utt.resize(n_streams, 0);
-
-        // we'll slice at most 'batch_size' frames,
-        for (int32 s = 0; s < n_streams; s++) {
-          int32 num_rows = feats_utt[s].NumRows();
-          frame_num_utt[s] = std::min(batch_size, num_rows);
-        }
-
-        // pack the data,
-        {
-          for (int32 s = 0; s < n_streams; s++) {
-            const Matrix<BaseFloat>& mat_tmp = feats_utt[s];
-            for (int32 r = 0; r < frame_num_utt[s]; r++) {
-              feat_mat_host.Row(r*n_streams + s).CopyFromVec(mat_tmp.Row(r));
-            }
-          }
-
-          for (int32 s = 0; s < n_streams; s++) {
-            const Posterior& target_tmp = labels_utt[s];
-            for (int32 r = 0; r < frame_num_utt[s]; r++) {
-              target_host[r*n_streams + s] = target_tmp[r];
-            }
-          }
-
-          // padded frames will keep initial zero-weight,
-          for (int32 s = 0; s < n_streams; s++) {
-            const Vector<BaseFloat>& weight_tmp = weights_utt[s];
-            for (int32 r = 0; r < frame_num_utt[s]; r++) {
-              weight_host(r*n_streams + s) = weight_tmp(r);
-            }
-          }
-        }
-
-        // remove the data we just packed,
-        {
-          for (int32 s = 0; s < n_streams; s++) {
-            // feats,
-            Matrix<BaseFloat>& m = feats_utt[s];
-            if (m.NumRows() == frame_num_utt[s]) {
-              feats_utt[s].Resize(0,0);  // we packed last chunk,
-            } else {
-              feats_utt[s] = Matrix<BaseFloat>(
-                m.RowRange(frame_num_utt[s], m.NumRows() - frame_num_utt[s])
-              );
-            }
-            // labels,
-            Posterior& post = labels_utt[s];
-            post.erase(post.begin(), post.begin() + frame_num_utt[s]);
-            // weights,
-            Vector<BaseFloat>& w = weights_utt[s];
-            if (w.Dim() == frame_num_utt[s]) {
-              weights_utt[s].Resize(0);  // we packed last chunk,
-            } else {
-              weights_utt[s] = Vector<BaseFloat>(
-                w.Range(frame_num_utt[s], w.Dim() - frame_num_utt[s])
-              );
-            }
-          }
-        }
-      }
-
-      // pass the info about padding,
-      nnet.SetSeqLengths(frame_num_utt);
-      // Show the 'utt' lengths in the VLOG[2],
-      if (GetVerboseLevel() >= 2) {
-        std::ostringstream os;
-        os << "[ ";
-        for (size_t i = 0; i < frame_num_utt.size(); i++) {
-          os << frame_num_utt[i] << " ";
-        }
-        os << "]";
-        KALDI_LOG << "frame_num_utt[" << frame_num_utt.size() << "]" << os.str();
-      }
-
-      // with new utterance we reset the history,
-      nnet.ResetStreams(new_utt_flags);
-
-      // forward pass,
-      nnet.Propagate(CuMatrix<BaseFloat>(feat_mat_host), &nnet_out);
-
-      // evaluate objective function we've chosen,
-      if (objective_function == "xent") {
-        xent.Eval(weight_host, nnet_out, target_host, &obj_diff);
-      } else if (objective_function == "mse") {
-        mse.Eval(weight_host, nnet_out, target_host, &obj_diff);
-      } else {
-        KALDI_ERR << "Unknown objective function code : "
-                  << objective_function;
-      }
-
-      if (!crossvalidate) {
-        // back-propagate, and do the update,
-        nnet.Backpropagate(obj_diff, NULL);
-      }
-
-      // 1st minibatch : show what happens in network,
-      if (total_frames == 0) {
-        KALDI_LOG << "### After " << total_frames << " frames,";
-        KALDI_LOG << nnet.Info();
-        KALDI_LOG << nnet.InfoPropagate();
-        if (!crossvalidate) {
-          KALDI_LOG << nnet.InfoBackPropagate();
-          KALDI_LOG << nnet.InfoGradient();
-        }
-      }
-
-      kaldi::int64 tmp_frames = total_frames;
-
-      num_done += std::accumulate(new_utt_flags.begin(), new_utt_flags.end(), 0);
-      total_frames += std::accumulate(frame_num_utt.begin(), frame_num_utt.end(), 0);
-
-      // monitor the NN training (--verbose=2),
-      int32 F = 25000;
-      if (GetVerboseLevel() >= 2) {
-        // print every 25k frames,
-        if (tmp_frames / F != total_frames / F) {
-          KALDI_VLOG(2) << "### After " << total_frames << " frames,";
-          KALDI_VLOG(2) << nnet.Info();
-          KALDI_VLOG(2) << nnet.InfoPropagate();
-          if (!crossvalidate) {
-            KALDI_VLOG(2) << nnet.InfoBackPropagate();
-            KALDI_VLOG(2) << nnet.InfoGradient();
-          }
-        }
-      }
-    }
-
-    // after last minibatch : show what happens in network,
-    KALDI_LOG << "### After " << total_frames << " frames,";
-    KALDI_LOG << nnet.Info();
-    KALDI_LOG << nnet.InfoPropagate();
-    if (!crossvalidate) {
-      KALDI_LOG << nnet.InfoBackPropagate();
-      KALDI_LOG << nnet.InfoGradient();
-    }
-
-    if (!crossvalidate) {
-      nnet.Write(target_model_filename, binary);
-    }
-
-    if (objective_function == "xent") {
-      KALDI_LOG << xent.ReportPerClass();
-    }
-
-    KALDI_LOG << "Done " << num_done << " files, "
-      << num_no_tgt_mat << " with no tgt_mats, "
-      << num_other_error << " with other errors. "
-      << "[" << (crossvalidate ? "CROSS-VALIDATION" : "TRAINING")
-      << ", " << time.Elapsed() / 60 << " min, processing "
-      << total_frames / time.Elapsed() << " frames per sec.]";
-
-    if (objective_function == "xent") {
-      KALDI_LOG << xent.Report();
-    } else if (objective_function == "mse") {
-      KALDI_LOG << mse.Report();
-    } else {
-      KALDI_ERR << "Unknown objective function code : " << objective_function;
-    }
-
-#if HAVE_CUDA == 1
-    CuDevice::Instantiate().PrintProfile();
-#endif
-
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/nnetbin/nnet-train-perutt.cc b/src/nnetbin/nnet-train-perutt.cc
deleted file mode 100644
index 8f417e1b608..00000000000
--- a/src/nnetbin/nnet-train-perutt.cc
+++ /dev/null
@@ -1,310 +0,0 @@
-// nnetbin/nnet-train-perutt.cc
-
-// Copyright 2011-2014  Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet/nnet-trnopts.h"
-#include "nnet/nnet-nnet.h"
-#include "nnet/nnet-loss.h"
-#include "nnet/nnet-randomizer.h"
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "base/timer.h"
-#include "cudamatrix/cu-device.h"
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  using namespace kaldi::nnet1;
-  typedef kaldi::int32 int32;
-
-  try {
-    const char *usage =
-      "Perform one iteration of NN training by SGD with per-utterance updates.\n"
-      "The training targets are represented as pdf-posteriors, usually prepared "
-      "by ali-to-post.\n"
-      "Usage: nnet-train-perutt [options] "
-      "<feature-rspecifier> <targets-rspecifier> <model-in> [<model-out>]\n"
-      "e.g.: nnet-train-perutt scp:feature.scp ark:posterior.ark nnet.init nnet.iter1\n";
-
-    ParseOptions po(usage);
-
-    NnetTrainOptions trn_opts;
-    trn_opts.Register(&po);
-    LossOptions loss_opts;
-    loss_opts.Register(&po);
-
-    bool binary = true;
-    po.Register("binary", &binary, "Write output in binary mode");
-
-    bool crossvalidate = false;
-    po.Register("cross-validate", &crossvalidate,
-        "Perform cross-validation (don't backpropagate)");
-
-    std::string feature_transform;
-    po.Register("feature-transform", &feature_transform,
-        "Feature transform in Nnet format");
-
-    std::string objective_function = "xent";
-    po.Register("objective-function", &objective_function,
-        "Objective function : xent|mse");
-
-    int32 length_tolerance = 5;
-    po.Register("length-tolerance", &length_tolerance,
-        "Allowed length difference of features/targets (frames)");
-
-    std::string frame_weights;
-    po.Register("frame-weights", &frame_weights,
-        "Per-frame weights to scale gradients (frame selection/weighting).");
-
-    kaldi::int32 max_frames = 6000;  // Allow segments maximum of one minute by default
-    po.Register("max-frames",&max_frames, "Maximum number of frames a segment can have to be processed");
-
-    std::string use_gpu="yes";
-    po.Register("use-gpu", &use_gpu,
-        "yes|no|optional, only has effect if compiled with CUDA");
-
-    //// Add dummy option for compatibility with default scheduler,
-    bool randomize = false;
-    po.Register("randomize", &randomize,
-        "Dummy, for compatibility with 'steps/nnet/train_scheduler.sh'");
-    ////
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 3 + (crossvalidate ? 0 : 1)) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string feature_rspecifier = po.GetArg(1),
-      targets_rspecifier = po.GetArg(2),
-      model_filename = po.GetArg(3);
-
-    std::string target_model_filename;
-    if (!crossvalidate) {
-      target_model_filename = po.GetArg(4);
-    }
-
-    using namespace kaldi;
-    using namespace kaldi::nnet1;
-    typedef kaldi::int32 int32;
-
-#if HAVE_CUDA == 1
-    CuDevice::Instantiate().SelectGpuId(use_gpu);
-#endif
-
-    Nnet nnet_transf;
-    if (feature_transform != "") {
-      nnet_transf.Read(feature_transform);
-    }
-
-    Nnet nnet;
-    nnet.Read(model_filename);
-    nnet.SetTrainOptions(trn_opts);
-
-    if (crossvalidate) {
-      nnet_transf.SetDropoutRate(0.0);
-      nnet.SetDropoutRate(0.0);
-    }
-
-    kaldi::int64 total_frames = 0;
-
-    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    RandomAccessPosteriorReader targets_reader(targets_rspecifier);
-    RandomAccessBaseFloatVectorReader weights_reader;
-    if (frame_weights != "") {
-      weights_reader.Open(frame_weights);
-    }
-
-    Xent xent(loss_opts);
-    Mse mse(loss_opts);
-
-    MultiTaskLoss multitask(loss_opts);
-    if (0 == objective_function.compare(0, 9, "multitask")) {
-      // objective_function contains something like :
-      // 'multitask,xent,2456,1.0,mse,440,0.001'
-      //
-      // the meaning is following:
-      // 'multitask,<type1>,<dim1>,<weight1>,...,<typeN>,<dimN>,<weightN>'
-      multitask.InitFromString(objective_function);
-    }
-
-    CuMatrix<BaseFloat> feats, feats_transf, nnet_out, obj_diff;
-
-    Timer time;
-    KALDI_LOG << (crossvalidate?"CROSS-VALIDATION":"TRAINING") << " STARTED";
-
-    int32 num_done = 0,
-          num_no_tgt_mat = 0,
-          num_other_error = 0;
-
-    // main loop,
-    for ( ; !feature_reader.Done(); feature_reader.Next()) {
-      std::string utt = feature_reader.Key();
-      KALDI_VLOG(3) << "Reading " << utt;
-      // check that we have targets
-      if (!targets_reader.HasKey(utt)) {
-        KALDI_WARN << utt << ", missing targets";
-        num_no_tgt_mat++;
-        continue;
-      }
-      // check we have per-frame weights
-      if (frame_weights != "" && !weights_reader.HasKey(utt)) {
-        KALDI_WARN << utt << ", missing per-frame weights";
-        num_other_error++;
-        feature_reader.Next();
-        continue;
-      }
-      // get feature / target pair
-      Matrix<BaseFloat> mat = feature_reader.Value();
-      Posterior nnet_tgt = targets_reader.Value(utt);
-      // skip the sentence if it is too long,
-      if (mat.NumRows() > max_frames) {
-        KALDI_WARN << "Skipping " << utt
-          << " that has " << mat.NumRows() << " frames,"
-          << " it is longer than '--max-frames'" << max_frames;
-        num_other_error++;
-        continue;
-      }
-      // get per-frame weights
-      Vector<BaseFloat> frm_weights;
-      if (frame_weights != "") {
-        frm_weights = weights_reader.Value(utt);
-      } else {  // all per-frame weights are 1.0
-        frm_weights.Resize(mat.NumRows());
-        frm_weights.Set(1.0);
-      }
-      // correct small length mismatch ... or drop sentence
-      {
-        // add lengths to vector
-        std::vector<int32> length;
-        length.push_back(mat.NumRows());
-        length.push_back(nnet_tgt.size());
-        length.push_back(frm_weights.Dim());
-        // find min, max
-        int32 min = *std::min_element(length.begin(), length.end());
-        int32 max = *std::max_element(length.begin(), length.end());
-        // fix or drop ?
-        if (max - min < length_tolerance) {
-          if (mat.NumRows() != min) mat.Resize(min, mat.NumCols(), kCopyData);
-          if (nnet_tgt.size() != min) nnet_tgt.resize(min);
-          if (frm_weights.Dim() != min) frm_weights.Resize(min, kCopyData);
-        } else {
-          KALDI_WARN << utt << ", length mismatch of targets " << nnet_tgt.size()
-                     << " and features " << mat.NumRows();
-          num_other_error++;
-          continue;
-        }
-      }
-      // apply optional feature transform
-      nnet_transf.Feedforward(CuMatrix<BaseFloat>(mat), &feats_transf);
-
-      // forward pass
-      nnet.Propagate(feats_transf, &nnet_out);
-
-      // evaluate objective function we've chosen,
-      if (objective_function == "xent") {
-        // gradients are re-scaled by weights inside Eval,
-        xent.Eval(frm_weights, nnet_out, nnet_tgt, &obj_diff);
-      } else if (objective_function == "mse") {
-        // gradients are re-scaled by weights inside Eval,
-        mse.Eval(frm_weights, nnet_out, nnet_tgt, &obj_diff);
-      } else if (0 == objective_function.compare(0, 9, "multitask")) {
-        // gradients re-scaled by weights in Eval,
-        multitask.Eval(frm_weights, nnet_out, nnet_tgt, &obj_diff);
-      } else {
-        KALDI_ERR << "Unknown objective function code : "
-                  << objective_function;
-      }
-
-      if (!crossvalidate) {
-        // backpropagate and update,
-        nnet.Backpropagate(obj_diff, NULL);
-      }
-
-      // 1st minibatch : show what happens in network,
-      if (total_frames == 0) {
-        KALDI_LOG << "### After " << total_frames << " frames,";
-        KALDI_LOG << nnet.InfoPropagate();
-        if (!crossvalidate) {
-          KALDI_LOG << nnet.InfoBackPropagate();
-          KALDI_LOG << nnet.InfoGradient();
-        }
-      }
-
-      // VERBOSE LOG
-      // monitor the NN training (--verbose=2),
-      if (GetVerboseLevel() >= 2) {
-        static int32 counter = 0;
-        counter += mat.NumRows();
-        // print every 25k frames,
-        if (counter >= 25000) {
-          KALDI_VLOG(2) << "### After " << total_frames << " frames,";
-          KALDI_VLOG(2) << nnet.InfoPropagate();
-          if (!crossvalidate) {
-            KALDI_VLOG(2) << nnet.InfoBackPropagate();
-            KALDI_VLOG(2) << nnet.InfoGradient();
-          }
-          counter = 0;
-        }
-      }
-
-      num_done++;
-      total_frames += frm_weights.Sum();
-    }  // main loop,
-
-    // after last minibatch : show what happens in network,
-    KALDI_LOG << "### After " << total_frames << " frames,";
-    KALDI_LOG << nnet.InfoPropagate();
-    if (!crossvalidate) {
-      KALDI_LOG << nnet.InfoBackPropagate();
-      KALDI_LOG << nnet.InfoGradient();
-    }
-
-    if (!crossvalidate) {
-      nnet.Write(target_model_filename, binary);
-    }
-
-    KALDI_LOG << "Done " << num_done << " files, "
-      << num_no_tgt_mat << " with no tgt_mats, "
-      << num_other_error << " with other errors. "
-      << "[" << (crossvalidate ? "CROSS-VALIDATION" : "TRAINING")
-      << ", " << (randomize ? "RANDOMIZED" : "NOT-RANDOMIZED")
-      << ", " << time.Elapsed() / 60 << " min, processing "
-      << total_frames / time.Elapsed() << " frames per sec.]";
-
-    if (objective_function == "xent") {
-      KALDI_LOG << xent.ReportPerClass();
-      KALDI_LOG << xent.Report();
-    } else if (objective_function == "mse") {
-      KALDI_LOG << mse.Report();
-    } else if (0 == objective_function.compare(0, 9, "multitask")) {
-      KALDI_LOG << multitask.Report();
-    } else {
-      KALDI_ERR << "Unknown objective function code : " << objective_function;
-    }
-
-#if HAVE_CUDA == 1
-    CuDevice::Instantiate().PrintProfile();
-#endif
-
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/nnetbin/paste-post.cc b/src/nnetbin/paste-post.cc
deleted file mode 100644
index 10d72f49ed6..00000000000
--- a/src/nnetbin/paste-post.cc
+++ /dev/null
@@ -1,168 +0,0 @@
-// nnetbin/paste-post.cc
-
-// Copyright 2015       Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "base/io-funcs.h"
-#include "util/common-utils.h"
-#include "hmm/posterior.h"
-#include "nnet/nnet-utils.h"
-
-/** @brief Combines 2 or more streams with NN-training targets into single one.
- *  This is handy when training NN with more than one output layer (softmax).
- *  The format of NN-targets is 'posterior' and the dimensionality of the output
- *  stream is the sum of input-stream dimensions.
- */
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  using namespace kaldi::nnet1;
-  typedef kaldi::int32 int32;
-  try {
-    const char *usage =
-      "Combine 2 or more streams with NN-training targets into single stream.\n"
-      "As the posterior streams are pasted, the output dimension is the sum\n"
-      "of the input dimensions. This is used when training NN with\n"
-      "multiple softmaxes on its output. This is used in multi-task, \n"
-      "multi-lingual or multi-database training. Depending on the context,\n"
-      "an utterance is not required to be in all the input streams.\n"
-      "For a multi-database training only 1 output layer will be active.\n"
-      "\n"
-      "The lengths of utterances are provided as 1st argument.\n"
-      "The dimensions of input stream are set as 2nd in argument.\n"
-      "Follow the input and output streams which are in 'posterior' format.\n"
-      "\n"
-      "Usage: paste-post <featlen-rspecifier> <dims-csl> <post1-rspecifier> "
-      "... <postN-rspecifier> <post-wspecifier>\n"
-      "e.g.: paste-post 'ark:feat-to-len $feats ark,t:-|' 1029:1124 "
-      "ark:post1.ark ark:post2.ark ark:pasted.ark\n";
-
-    ParseOptions po(usage);
-
-    bool allow_partial = false;
-    po.Register("allow-partial", &allow_partial,
-                "Produce output also when the utterance is not in all input streams.");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() < 5) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string featlen_rspecifier = po.GetArg(1),  // segment lengths,
-                stream_dims_str = po.GetArg(2),
-                post_wspecifier = po.GetArg(po.NumArgs());
-    int32 stream_count = po.NumArgs() - 3;  // number of input posterior streams
-
-    // read the dims of input posterior streams,
-    std::vector<int32> stream_dims;
-    if (!kaldi::SplitStringToIntegers(stream_dims_str, ":,", false, &stream_dims)) {
-      KALDI_ERR << "Invalid stream-dims string " << stream_dims_str;
-    }
-    if (stream_count != stream_dims.size()) {
-      KALDI_ERR << "Mismatch in input posterior-stream count " << stream_count
-                << " and --stream-dims count" << stream_dims.size()
-                << ", " << stream_dims_str;
-    }
-
-    // prepare dim offsets of input streams,
-    std::vector<int32> stream_offset(stream_dims.size()+1, 0);
-    for (int32 s = 0; s < stream_dims.size(); s++) {
-      stream_offset[s+1] = stream_offset[s] + stream_dims[s];
-    }
-
-    // open the input posterior readers,
-    std::vector<RandomAccessPosteriorReader> posterior_reader(po.NumArgs()-3);
-    for (int32 s = 0; s < stream_count; s++) {
-      posterior_reader[s].Open(po.GetArg(s+3));
-    }
-
-    int32 num_done = 0, num_err = 0, num_empty = 0;
-    SequentialInt32Reader featlen_reader(featlen_rspecifier);
-    PosteriorWriter posterior_writer(post_wspecifier);
-
-    // main loop, posterior pasting happens here,
-    for (; !featlen_reader.Done(); featlen_reader.Next()) {
-      bool ok = true, empty = true;
-      std::string utt = featlen_reader.Key();
-      int32 num_frames = featlen_reader.Value();
-
-      // show which streams are non-empty,
-      if (allow_partial && GetVerboseLevel() >= 2) {
-        std::string nonempty_streams;
-        for (int32 s = 0; s < stream_count; s++) {
-          if (posterior_reader[s].HasKey(utt)) {
-            nonempty_streams += " " + ToString(s);
-          }
-        }
-        KALDI_VLOG(2) << "Processing " << utt
-                      << ", frames " << num_frames
-                      << ", pasted-from streams " << nonempty_streams;
-      }
-
-      // Create output posteriors,
-      Posterior post(num_frames);
-
-      // Fill posterior from input streams,
-      for (int32 s = 0; s < stream_count; s++) {
-        if (!posterior_reader[s].HasKey(utt)) {
-          if (!allow_partial) {
-            KALDI_WARN << "No such utterance " << utt
-                       << " in set " << (s+1) << " of posteriors.";
-            ok = false;
-            break;
-          }
-        } else {
-          const Posterior& post_s = posterior_reader[s].Value(utt);
-          KALDI_ASSERT(num_frames <= post_s.size());
-          for (int32 f = 0; f < num_frames; f++) {
-            for (int32 i = 0; i < post_s[f].size(); i++) {
-              int32 id = post_s[f][i].first;
-              BaseFloat val = post_s[f][i].second;
-              KALDI_ASSERT(id < stream_dims[s]);
-              post[f].push_back(std::make_pair(stream_offset[s] + id, val));
-            }
-          }
-          empty = false;
-        }
-      }
-      if (empty) {
-        KALDI_WARN << "Uttenrace with no posteriors " << utt << ", discarding";
-        num_empty++;
-        continue;
-      }
-      if (ok) {
-        posterior_writer.Write(featlen_reader.Key(), post);
-        num_done++;
-      } else {
-        num_err++;
-      }
-    }
-    KALDI_LOG << "Pasted posteriors for " << num_done << " sentences, "
-              << "missing sentences " << num_empty << ", "
-              << "failed for " << num_err;
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
-
diff --git a/src/nnetbin/rbm-convert-to-nnet.cc b/src/nnetbin/rbm-convert-to-nnet.cc
deleted file mode 100644
index 3ac89626376..00000000000
--- a/src/nnetbin/rbm-convert-to-nnet.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-// nnetbin/rbm-convert-to-nnet.cc
-
-// Copyright 2009-2011  Microsoft Corporation
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet/nnet-nnet.h"
-#include "nnet/nnet-rbm.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet1;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Convert RBM to <affinetransform> and <sigmoid>\n"
-        "Usage:  rbm-convert-to-nnet [options] <rbm-in> <nnet-out>\n"
-        "e.g.:\n"
-        " rbm-convert-to-nnet --binary=false rbm.mdl nnet.mdl\n";
-
-
-    bool binary_write = true;
-
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_in_filename = po.GetArg(1),
-        model_out_filename = po.GetArg(2);
-
-    Nnet nnet;
-    {
-      bool binary_read;
-      Input ki(model_in_filename, &binary_read);
-      nnet.Read(ki.Stream(), binary_read);
-    }
-
-    KALDI_ASSERT(nnet.NumComponents() == 1);
-    KALDI_ASSERT(nnet.GetComponent(0).GetType() == kaldi::nnet1::Component::kRbm);
-    RbmBase& rbm = dynamic_cast<RbmBase&>(nnet.GetComponent(0));
-
-    {
-      Output ko(model_out_filename, binary_write);
-      rbm.WriteAsNnet(ko.Stream(), binary_write);
-    }
-
-    KALDI_LOG << "Written model to " << model_out_filename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/nnetbin/rbm-train-cd1-frmshuff.cc b/src/nnetbin/rbm-train-cd1-frmshuff.cc
deleted file mode 100644
index 09e6f247cec..00000000000
--- a/src/nnetbin/rbm-train-cd1-frmshuff.cc
+++ /dev/null
@@ -1,287 +0,0 @@
-// nnetbin/rbm-train-cd1-frmshuff.cc
-
-// Copyright 2012-2013  Brno University of Technology (Author: Karel Vesely)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet/nnet-trnopts.h"
-#include "nnet/nnet-rbm.h"
-#include "nnet/nnet-nnet.h"
-#include "nnet/nnet-loss.h"
-#include "nnet/nnet-randomizer.h"
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "base/timer.h"
-#include "cudamatrix/cu-device.h"
-#include "cudamatrix/cu-rand.h"
-
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  using namespace kaldi::nnet1;
-  typedef kaldi::int32 int32;
-  try {
-    const char *usage =
-      "Train RBM by Contrastive Divergence alg. with 1 step of "
-      "Markov Chain Monte-Carlo.\n"
-      "The tool can perform several iterations (--num-iters) "
-      "or it can subsample the training dataset (--drop-data)\n"
-
-      "Usage: rbm-train-cd1-frmshuff [options] <model-in> "
-      "<feature-rspecifier> <model-out>\n"
-      "e.g.: rbm-train-cd1-frmshuff 1.rbm.init scp:train.scp 1.rbm\n";
-
-    ParseOptions po(usage);
-
-    RbmTrainOptions trn_opts, trn_opts_rbm;
-    trn_opts.Register(&po);
-    LossOptions loss_opts;
-    loss_opts.Register(&po);
-
-    bool binary = false;
-    po.Register("binary", &binary, "Write output in binary mode");
-
-    bool with_bug = true;
-    po.Register("with-bug", &with_bug,
-        "Apply bug which led to better results (set-initial-momentum-to-max)");
-
-    int32 num_iters = 1;
-    po.Register("num-iters", &num_iters,
-                "Number of iterations (smaller datasets should have more iterations, "
-                "iterating within tool because of linear momentum scheduling)");
-
-    std::string feature_transform;
-    po.Register("feature-transform", &feature_transform,
-        "Feature transform in 'nnet1' format");
-
-    NnetDataRandomizerOptions rnd_opts;
-    rnd_opts.minibatch_size = 100;
-    rnd_opts.Register(&po);
-
-    kaldi::int32 max_frames = 6000;
-    po.Register("max-frames", &max_frames,
-        "Maximum number of frames an utterance can have (skipped if longer)");
-
-    std::string use_gpu="yes";
-    po.Register("use-gpu", &use_gpu,
-        "yes|no|optional, only has effect if compiled with CUDA");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_filename = po.GetArg(1),
-        feature_rspecifier = po.GetArg(2);
-
-    std::string target_model_filename;
-    target_model_filename = po.GetArg(3);
-
-
-    using namespace kaldi;
-    using namespace kaldi::nnet1;
-    typedef kaldi::int32 int32;
-
-#if HAVE_CUDA == 1
-    CuDevice::Instantiate().SelectGpuId(use_gpu);
-#endif
-
-    Nnet rbm_transf;
-    if (feature_transform != "") {
-      rbm_transf.Read(feature_transform);
-    }
-
-    // Read nnet, extract the RBM,
-    Nnet nnet;
-    nnet.Read(model_filename);
-    KALDI_ASSERT(nnet.NumComponents() == 1);
-    KALDI_ASSERT(nnet.GetComponent(0).GetType() == kaldi::nnet1::Component::kRbm);
-    RbmBase &rbm = dynamic_cast<RbmBase&>(nnet.GetComponent(0));
-
-    // Configure the RBM,
-    // make some constants accessible, will use them later,
-    const BaseFloat& learn_rate = trn_opts.learn_rate;
-    const BaseFloat& momentum = trn_opts.momentum;
-    const BaseFloat& momentum_max = trn_opts.momentum_max;
-    const int32& momentum_steps = trn_opts.momentum_steps;
-    const int32& momentum_step_period = trn_opts.momentum_step_period;
-
-    // 'trn_opts_rbm' is a local copy of 'trn_opts' which is passed to RBM,
-    trn_opts_rbm = trn_opts;
-    // keep `effective' learning rate constant
-    trn_opts_rbm.learn_rate = learn_rate * (1 - momentum);
-    // pass options to RBM,
-    rbm.SetRbmTrainOptions(trn_opts_rbm);
-
-    kaldi::int64 total_frames = 0;
-
-    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    RandomizerMask randomizer_mask(rnd_opts);
-    MatrixRandomizer feature_randomizer(rnd_opts);
-
-    CuRand<BaseFloat> cu_rand;  // parallel random number generator,
-    Mse mse(loss_opts);
-
-    CuMatrix<BaseFloat> feats_transf,
-                        pos_hid, pos_hid_aux,
-                        neg_vis, neg_hid;
-    CuMatrix<BaseFloat> dummy_mse_mat;
-
-    Timer time;
-    KALDI_LOG << "RBM TRAINING STARTED";
-
-    int32 iter = 1;
-    KALDI_LOG << "Iteration " << iter << "/" << num_iters;
-
-    int32 num_done = 0, num_other_error = 0;
-    while (!feature_reader.Done()) {
-#if HAVE_CUDA == 1
-      // check that GPU is computing accurately,
-      CuDevice::Instantiate().CheckGpuHealth();
-#endif
-      // fill the randomizer,
-      for ( ; !feature_reader.Done(); feature_reader.Next()) {
-        if (feature_randomizer.IsFull()) {
-          // break the loop without calling Next(),
-          // we keep the 'utt' for next round,
-          break;
-        }
-        std::string utt = feature_reader.Key();
-        KALDI_VLOG(3) << "Reading " << utt;
-        // get feature matrix,
-        const Matrix<BaseFloat> &mat = feature_reader.Value();
-        // skip too long segments (avoid runinning out of memory)
-        if (mat.NumRows() > max_frames) {
-          KALDI_WARN << "Skipping " << utt
-            << " that has " << mat.NumRows() << " frames,"
-            << " it is longer than '--max-frames'" << max_frames;
-          num_other_error++;
-          continue;
-        }
-        // apply feature transform,
-        rbm_transf.Feedforward(CuMatrix<BaseFloat>(mat), &feats_transf);
-        // add to randomizer,
-        feature_randomizer.AddData(feats_transf);
-        num_done++;
-
-        // report the speed
-        if (num_done % 5000 == 0) {
-          double time_now = time.Elapsed();
-          KALDI_VLOG(1) << "After " << num_done << " utterances: "
-            << "time elapsed = " << time_now / 60 << " min; "
-            << "processed " << total_frames / time_now << " frames per sec.";
-        }
-      }
-
-      // randomize,
-      feature_randomizer.Randomize(
-        randomizer_mask.Generate(feature_randomizer.NumFrames())
-      );
-
-      // train with data from randomizer (using mini-batches)
-      for ( ; !feature_randomizer.Done(); feature_randomizer.Next()) {
-        // get the mini-batch,
-        const CuMatrixBase<BaseFloat>& pos_vis = feature_randomizer.Value();
-        // get the dims,
-        int32 num_frames = pos_vis.NumRows(),
-              dim_hid = rbm.OutputDim();
-        // Create dummy frame-weights for Mse::Eval,
-        Vector<BaseFloat> dummy_weights(num_frames);
-        dummy_weights.Set(1.0);
-
-        // TRAIN with CD1,
-        // forward pass,
-        rbm.Propagate(pos_vis, &pos_hid);
-
-        // alter the hidden values, so we can generate negative example,
-        if (rbm.HidType() == Rbm::Bernoulli) {
-          pos_hid_aux.Resize(num_frames, dim_hid);
-          cu_rand.BinarizeProbs(pos_hid, &pos_hid_aux);  // => 0 / 1,
-        } else {
-          KALDI_ASSERT(rbm.HidType() == Rbm::Gaussian);
-          pos_hid_aux = pos_hid;
-          cu_rand.AddGaussNoise(&pos_hid_aux);
-        }
-
-        // reconstruct pass,
-        rbm.Reconstruct(pos_hid_aux, &neg_vis);
-        // propagate negative examples
-        rbm.Propagate(neg_vis, &neg_hid);
-        // update step
-        rbm.RbmUpdate(pos_vis, pos_hid, neg_vis, neg_hid);
-        // evaluate mean square error
-        mse.Eval(dummy_weights, neg_vis, pos_vis, &dummy_mse_mat);
-
-        total_frames += num_frames;
-
-        // change the momentum progressively per 0.5million samples of the data
-        {
-          static int32 n_prev = -1;
-          BaseFloat step = (momentum_max - momentum) / momentum_steps;
-          // change every momentum_step_period data,
-          int32 n = total_frames / momentum_step_period;
-          BaseFloat momentum_actual;
-          if (n > momentum_steps) {
-            momentum_actual = momentum_max;
-          } else {
-            momentum_actual = momentum + n*step;
-          }
-          if (n - n_prev > 0) {
-            n_prev = n;
-            BaseFloat learning_rate_actual = learn_rate*(1-momentum_actual);
-            KALDI_VLOG(1) << "Setting momentum "
-              << (with_bug ? momentum_max : momentum_actual)
-              << " and learning rate " << learning_rate_actual
-              << " after processing "
-              << static_cast<double>(total_frames) / 360000 << " h";
-            // pass values to rbm,
-            trn_opts_rbm.momentum = (with_bug ? momentum_max : momentum_actual);
-            trn_opts_rbm.learn_rate = learning_rate_actual;
-            rbm.SetRbmTrainOptions(trn_opts_rbm);
-          }
-        }
-      }
-
-      // reopen the feature stream if we will run another iteration
-      if (feature_reader.Done() && (iter < num_iters)) {
-        iter++;
-        KALDI_LOG << "Iteration " << iter << "/" << num_iters;
-        feature_reader.Close();
-        feature_reader.Open(feature_rspecifier);
-      }
-    }
-
-    nnet.Write(target_model_filename, binary);
-
-    KALDI_LOG << "Done " << iter << " iterations, " << num_done << " files, "
-              << "skipped " << num_other_error << " files. "
-              << "[" << time.Elapsed() / 60 << " min, "
-              << "processing" << total_frames / time.Elapsed() << " "
-              << "frames per sec.]";
-
-    KALDI_LOG << mse.Report();
-
-#if HAVE_CUDA == 1
-    CuDevice::Instantiate().PrintProfile();
-#endif
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/nnetbin/train-transitions.cc b/src/nnetbin/train-transitions.cc
deleted file mode 100644
index 0226e0973d2..00000000000
--- a/src/nnetbin/train-transitions.cc
+++ /dev/null
@@ -1,101 +0,0 @@
-// nnetbin/train-transitions.cc
-
-// Copyright 2015  Brno University of Technology (author: Karel Vesely)
-//           2012  Johns Hopkins University (author:  Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "tree/context-dep.h"
-#include "hmm/transition-model.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Train the transition probabilities in transition-model "
-        "(used in nnet1 recipe).\n"
-        "\n"
-        "Usage: train-transitions [options] "
-        "<trans-model-in> <alignments-rspecifier> <trans-model-out>\n"
-        "e.g.: train-transitions 1.mdl \"ark:gunzip -c ali.*.gz|\" 2.mdl\n";
-
-    bool binary_write = true;
-    MleTransitionUpdateConfig transition_update_config;
-
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    transition_update_config.Register(&po);
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string trans_model_rxfilename = po.GetArg(1),
-        ali_rspecifier = po.GetArg(2),
-        trans_model_wxfilename = po.GetArg(3);
-
-    TransitionModel trans_model;
-    {
-      bool binary_read;
-      Input ki(trans_model_rxfilename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-    }
-
-    Vector<double> transition_accs;
-    trans_model.InitStats(&transition_accs);
-
-    int32 num_done = 0;
-    SequentialInt32VectorReader ali_reader(ali_rspecifier);
-    for (; !ali_reader.Done(); ali_reader.Next()) {
-      const std::vector<int32> alignment(ali_reader.Value());
-      for (size_t i = 0; i < alignment.size(); i++) {
-        int32 tid = alignment[i];
-        BaseFloat weight = 1.0;
-        trans_model.Accumulate(weight, tid, &transition_accs);
-      }
-      num_done++;
-    }
-    KALDI_LOG << "Accumulated transition stats from " << num_done
-              << " utterances.";
-
-    {
-      BaseFloat objf_impr, count;
-      trans_model.MleUpdate(transition_accs, transition_update_config,
-                            &objf_impr, &count);
-      KALDI_LOG << "Transition model update: average " << (objf_impr/count)
-                << " log-like improvement per frame over " << count
-                << " frames.";
-    }
-
-    {
-      Output ko(trans_model_wxfilename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-    }
-    KALDI_LOG << "Trained transition model and wrote it to "
-              << trans_model_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/nnetbin/transf-to-nnet.cc b/src/nnetbin/transf-to-nnet.cc
deleted file mode 100644
index f83c71cc47d..00000000000
--- a/src/nnetbin/transf-to-nnet.cc
+++ /dev/null
@@ -1,79 +0,0 @@
-// nnetbin/transf-to-nnet.cc
-
-// Copyright 2012  Brno University of Technology
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "nnet/nnet-nnet.h"
-#include "nnet/nnet-linear-transform.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet1;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Convert transformation matrix to <affine-transform>\n"
-        "Usage:  transf-to-nnet [options] <transf-in> <nnet-out>\n"
-        "e.g.:\n"
-        " transf-to-nnet --binary=false transf.mat nnet.mdl\n";
-
-    bool binary_write = false;
-
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string transform_rxfilename = po.GetArg(1),
-        model_out_filename = po.GetArg(2);
-
-    // read the matrix,
-    Matrix<BaseFloat> transform;
-    {
-      bool binary_read;
-      Input ki(transform_rxfilename, &binary_read);
-      transform.Read(ki.Stream(), binary_read);
-    }
-
-    // wrapping as Nnet with <LinearTransform>,
-    Nnet nnet;
-    LinearTransform lin_tran(transform.NumCols(), transform.NumRows());
-    lin_tran.SetLinearity(transform);
-    nnet.AppendComponent(lin_tran);
-
-    // write the nnet,
-    {
-      Output ko(model_out_filename, binary_write);
-      nnet.Write(ko.Stream(), binary_write);
-      KALDI_LOG << "Written transform in 'nnet1' model: " << model_out_filename;
-    }
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/online/online-decodable.cc b/src/online/online-decodable.cc
index 58f6452879d..58180775fd8 100644
--- a/src/online/online-decodable.cc
+++ b/src/online/online-decodable.cc
@@ -25,7 +25,7 @@
 namespace kaldi {
 
 OnlineDecodableDiagGmmScaled::OnlineDecodableDiagGmmScaled(
-    const AmDiagGmm &am, const TransitionModel &trans_model,
+    const AmDiagGmm &am, const Transitions &trans_model,
     const BaseFloat scale, OnlineFeatureMatrix *input_feats):  
       features_(input_feats), ac_model_(am),
       ac_scale_(scale), trans_model_(trans_model),
diff --git a/src/online/online-decodable.h b/src/online/online-decodable.h
index b6d811d1031..ed7ef087ced 100644
--- a/src/online/online-decodable.h
+++ b/src/online/online-decodable.h
@@ -35,7 +35,7 @@ namespace kaldi {
 class OnlineDecodableDiagGmmScaled : public DecodableInterface {
  public:
   OnlineDecodableDiagGmmScaled(const AmDiagGmm &am,
-                               const TransitionModel &trans_model,
+                               const Transitions &trans_model,
                                const BaseFloat scale,
                                OnlineFeatureMatrix *input_feats);
 
@@ -54,7 +54,7 @@ class OnlineDecodableDiagGmmScaled : public DecodableInterface {
   OnlineFeatureMatrix *features_;
   const AmDiagGmm &ac_model_;
   BaseFloat ac_scale_;
-  const TransitionModel &trans_model_;
+  const Transitions &trans_model_;
   const int32 feat_dim_; // dimensionality of the input features
   Vector<BaseFloat> cur_feats_;
   int32 cur_frame_;
diff --git a/src/online/online-faster-decoder.h b/src/online/online-faster-decoder.h
index cd05b091b53..a81e9451f7e 100644
--- a/src/online/online-faster-decoder.h
+++ b/src/online/online-faster-decoder.h
@@ -25,7 +25,7 @@
 
 #include "util/stl-utils.h"
 #include "decoder/faster-decoder.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 namespace kaldi {
 
@@ -79,7 +79,7 @@ class OnlineFasterDecoder : public FasterDecoder {
   OnlineFasterDecoder(const fst::Fst<fst::StdArc> &fst,
                       const OnlineFasterDecoderOpts &opts,
                       const std::vector<int32> &sil_phones,
-                      const TransitionModel &trans_model)
+                      const Transitions &trans_model)
       : FasterDecoder(fst, opts), opts_(opts),
         silence_set_(sil_phones), trans_model_(trans_model),
         max_beam_(opts.beam), effective_beam_(FasterDecoder::config_.beam),
@@ -118,7 +118,7 @@ class OnlineFasterDecoder : public FasterDecoder {
 
   const OnlineFasterDecoderOpts opts_;
   const ConstIntegerSet<int32> silence_set_; // silence phones IDs
-  const TransitionModel &trans_model_; // needed for trans-id -> phone conversion
+  const Transitions &trans_model_; // needed for trans-id -> phone conversion
   const BaseFloat max_beam_; // the maximum allowed beam
   BaseFloat &effective_beam_; // the currently used beam
   DecodeState state_; // the current state of the decoder
diff --git a/src/online2/Makefile b/src/online2/Makefile
index 242c7be6da6..8b975e2ce43 100644
--- a/src/online2/Makefile
+++ b/src/online2/Makefile
@@ -6,9 +6,8 @@ include ../kaldi.mk
 TESTFILES =
 
 OBJFILES = online-gmm-decodable.o online-feature-pipeline.o online-ivector-feature.o \
-           online-nnet2-feature-pipeline.o online-gmm-decoding.o online-timing.o \
+           online2-feature-pipeline.o online-gmm-decoding.o online-timing.o \
            online-endpoint.o onlinebin-util.o online-speex-wrapper.o \
-           online-nnet2-decoding.o online-nnet2-decoding-threaded.o \
            online-nnet3-decoding.o
 
 LIBNAME = kaldi-online2
@@ -19,10 +18,9 @@ ADDLIBS = ../ivector/kaldi-ivector.a ../nnet3/kaldi-nnet3.a \
           ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
           ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
-          ../base/kaldi-base.a 
+          ../base/kaldi-base.a
 
 
 
 
 include ../makefiles/default_rules.mk
-
diff --git a/src/online2/online-endpoint.cc b/src/online2/online-endpoint.cc
index aa7752c4484..da057f8e62d 100644
--- a/src/online2/online-endpoint.cc
+++ b/src/online2/online-endpoint.cc
@@ -72,7 +72,7 @@ bool EndpointDetected(const OnlineEndpointConfig &config,
 }
 
 template <typename FST>
-int32 TrailingSilenceLength(const TransitionModel &tmodel,
+int32 TrailingSilenceLength(const Transitions &tmodel,
                             const std::string &silence_phones_str,
                             const LatticeFasterOnlineDecoderTpl<FST> &decoder) {
   std::vector<int32> silence_phones;
@@ -109,7 +109,7 @@ int32 TrailingSilenceLength(const TransitionModel &tmodel,
 template <typename FST>
 bool EndpointDetected(
     const OnlineEndpointConfig &config,
-    const TransitionModel &tmodel,
+    const Transitions &tmodel,
     BaseFloat frame_shift_in_seconds,
     const LatticeFasterOnlineDecoderTpl<FST> &decoder) {
   if (decoder.NumFramesDecoded() == 0) return false;
@@ -131,7 +131,7 @@ bool EndpointDetected(
 template
 bool EndpointDetected<fst::Fst<fst::StdArc> >(
     const OnlineEndpointConfig &config,
-    const TransitionModel &tmodel,
+    const Transitions &tmodel,
     BaseFloat frame_shift_in_seconds,
     const LatticeFasterOnlineDecoderTpl<fst::Fst<fst::StdArc> > &decoder);
 
@@ -139,7 +139,7 @@ bool EndpointDetected<fst::Fst<fst::StdArc> >(
 template
 bool EndpointDetected<fst::GrammarFst>(
     const OnlineEndpointConfig &config,
-    const TransitionModel &tmodel,
+    const Transitions &tmodel,
     BaseFloat frame_shift_in_seconds,
     const LatticeFasterOnlineDecoderTpl<fst::GrammarFst> &decoder);
 
diff --git a/src/online2/online-endpoint.h b/src/online2/online-endpoint.h
index aaf9232db13..5f80403bbdc 100644
--- a/src/online2/online-endpoint.h
+++ b/src/online2/online-endpoint.h
@@ -33,7 +33,7 @@
 #include "feat/feature-plp.h"
 #include "itf/online-feature-itf.h"
 #include "lat/kaldi-lattice.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "decoder/lattice-faster-online-decoder.h"
 
 namespace kaldi {
@@ -188,7 +188,7 @@ bool EndpointDetected(const OnlineEndpointConfig &config,
 /// BestPathEnd() and TraceBackOneLink() functions of LatticeFasterOnlineDecoder
 /// to do this efficiently.
 template <typename FST>
-int32 TrailingSilenceLength(const TransitionModel &tmodel,
+int32 TrailingSilenceLength(const Transitions &tmodel,
                             const std::string &silence_phones,
                             const LatticeFasterOnlineDecoderTpl<FST> &decoder);
 
@@ -198,7 +198,7 @@ int32 TrailingSilenceLength(const TransitionModel &tmodel,
 template <typename FST>
 bool EndpointDetected(
     const OnlineEndpointConfig &config,
-    const TransitionModel &tmodel,
+    const Transitions &tmodel,
     BaseFloat frame_shift_in_seconds,
     const LatticeFasterOnlineDecoderTpl<FST> &decoder);
 
diff --git a/src/online2/online-gmm-decodable.cc b/src/online2/online-gmm-decodable.cc
index 20a23858adb..33a273e8a56 100644
--- a/src/online2/online-gmm-decodable.cc
+++ b/src/online2/online-gmm-decodable.cc
@@ -24,7 +24,7 @@
 namespace kaldi {
 
 DecodableDiagGmmScaledOnline::DecodableDiagGmmScaledOnline(
-    const AmDiagGmm &am, const TransitionModel &trans_model,
+    const AmDiagGmm &am, const Transitions &trans_model,
     const BaseFloat scale, OnlineFeatureInterface *input_feats):  
       features_(input_feats), ac_model_(am),
       ac_scale_(scale), trans_model_(trans_model),
diff --git a/src/online2/online-gmm-decodable.h b/src/online2/online-gmm-decodable.h
index 1a1d37ba2a2..c9436d83aa4 100644
--- a/src/online2/online-gmm-decodable.h
+++ b/src/online2/online-gmm-decodable.h
@@ -27,7 +27,7 @@
 #include "matrix/matrix-lib.h"
 #include "itf/decodable-itf.h"
 #include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 
 namespace kaldi {
 
@@ -35,7 +35,7 @@ namespace kaldi {
 class DecodableDiagGmmScaledOnline : public DecodableInterface {
  public:
   DecodableDiagGmmScaledOnline(const AmDiagGmm &am,
-                               const TransitionModel &trans_model,
+                               const Transitions &trans_model,
                                const BaseFloat scale,
                                OnlineFeatureInterface *input_feats);
 
@@ -56,7 +56,7 @@ class DecodableDiagGmmScaledOnline : public DecodableInterface {
   OnlineFeatureInterface *features_;
   const AmDiagGmm &ac_model_;
   BaseFloat ac_scale_;
-  const TransitionModel &trans_model_;
+  const Transitions &trans_model_;
   const int32 feat_dim_;  // dimensionality of the input features
   Vector<BaseFloat> cur_feats_;
   int32 cur_frame_;
diff --git a/src/online2/online-gmm-decoding.cc b/src/online2/online-gmm-decoding.cc
index 56b5603b206..656b2730bdf 100644
--- a/src/online2/online-gmm-decoding.cc
+++ b/src/online2/online-gmm-decoding.cc
@@ -75,7 +75,7 @@ void SingleUtteranceGmmDecoder::AdvanceDecoding() {
   // from constructing it each time we want to decode more of the
   // input.
   DecodableDiagGmmScaledOnline decodable(am_gmm,
-                                         models_.GetTransitionModel(),
+                                         models_.GetTransitionModel(),
                                          config_.acoustic_scale,
                                          feature_pipeline_);
 
@@ -169,7 +169,7 @@ bool SingleUtteranceGmmDecoder::GetGaussianPosteriors(bool end_of_utterance,
                 << " frames.";
 
   ConstIntegerSet<int32> silence_set(silence_phones_);  // faster lookup
-  const TransitionModel &trans_model = models_.GetTransitionModel();
+  const Transitions &trans_model = models_.GetTransitionModel();
   WeightSilencePost(trans_model, silence_set,
                     config_.silence_weight, &post);  
   
@@ -309,7 +309,7 @@ SingleUtteranceGmmDecoder::~SingleUtteranceGmmDecoder() {
 
 bool SingleUtteranceGmmDecoder::EndpointDetected(
     const OnlineEndpointConfig &config) {
-  const TransitionModel &tmodel = models_.GetTransitionModel();
+  const Transitions &tmodel = models_.GetTransitionModel();
   return kaldi::EndpointDetected(config, tmodel,
                                  feature_pipeline_->FrameShiftInSeconds(),
                                  decoder_);
@@ -323,7 +323,7 @@ void SingleUtteranceGmmDecoder::GetLattice(bool rescore_if_needed,
   decoder_.GetRawLattice(&lat, end_of_utterance);
   if (rescore_if_needed && RescoringIsNeeded()) {
     DecodableDiagGmmScaledOnline decodable(models_.GetFinalModel(),
-                                           models_.GetTransitionModel(),
+                                           models_.GetTransitionModel(),
                                            config_.acoustic_scale,
                                            feature_pipeline_);
 
@@ -332,7 +332,7 @@ void SingleUtteranceGmmDecoder::GetLattice(bool rescore_if_needed,
   }
   PruneLattice(lat_beam, &lat);
 
-  DeterminizeLatticePhonePrunedWrapper(models_.GetTransitionModel(),
+  DeterminizeLatticePhonePrunedWrapper(models_.GetTransitionModel(),
                                        &lat, lat_beam, clat,
                                        config_.faster_decoder_opts.det_opts);
   
@@ -358,7 +358,7 @@ OnlineGmmDecodingModels::OnlineGmmDecodingModels(
   if (!config.online_alimdl_rxfilename.empty()) {
     bool binary;
     Input ki(config.online_alimdl_rxfilename, &binary);
-    TransitionModel tmodel;
+    Transitions tmodel;
     tmodel.Read(ki.Stream(), binary);
     if (!tmodel.Compatible(tmodel_))
       KALDI_ERR << "Incompatible models given to the --model and "
@@ -369,7 +369,7 @@ OnlineGmmDecodingModels::OnlineGmmDecodingModels(
   if (!config.rescore_model_rxfilename.empty()) {
     bool binary;
     Input ki(config.rescore_model_rxfilename, &binary);
-    TransitionModel tmodel;
+    Transitions tmodel;
     tmodel.Read(ki.Stream(), binary);
     if (!tmodel.Compatible(tmodel_))
       KALDI_ERR << "Incompatible models given to the --model and "
@@ -386,7 +386,7 @@ OnlineGmmDecodingModels::OnlineGmmDecodingModels(
 }
 
 
-const TransitionModel &OnlineGmmDecodingModels::GetTransitionModel() const {
+const Transitions &OnlineGmmDecodingModels::GetTransitionModel() const {
   return tmodel_;
 }
 
diff --git a/src/online2/online-gmm-decoding.h b/src/online2/online-gmm-decoding.h
index 8bec6cd9ab9..f5cb725cfc1 100644
--- a/src/online2/online-gmm-decoding.h
+++ b/src/online2/online-gmm-decoding.h
@@ -34,7 +34,7 @@
 #include "online2/online-gmm-decodable.h"
 #include "online2/online-endpoint.h"
 #include "decoder/lattice-faster-online-decoder.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "gmm/am-diag-gmm.h"
 #include "hmm/posterior.h"
 
@@ -167,7 +167,7 @@ class OnlineGmmDecodingModels {
  public:
   OnlineGmmDecodingModels(const OnlineGmmDecodingConfig &config);
 
-  const TransitionModel &GetTransitionModel() const;
+  const Transitions &GetTransitionModel() const;
 
   const AmDiagGmm &GetOnlineAlignmentModel() const;
 
@@ -181,7 +181,7 @@ class OnlineGmmDecodingModels {
   // The transition-model is only needed for its integer ids, and these need to
   // be identical for all 3 models, so we only store one (it doesn't matter
   // which one).
-  TransitionModel tmodel_; 
+  Transitions tmodel_; 
   // The model trained with online-CMVN features
   // (if supplied, otherwise use model_)
   AmDiagGmm online_alignment_model_;
diff --git a/src/online2/online-ivector-feature.cc b/src/online2/online-ivector-feature.cc
index 2042fbb8b80..6898c38ad12 100644
--- a/src/online2/online-ivector-feature.cc
+++ b/src/online2/online-ivector-feature.cc
@@ -445,7 +445,7 @@ BaseFloat OnlineIvectorFeature::ObjfImprPerFrame() const {
 
 
 OnlineSilenceWeighting::OnlineSilenceWeighting(
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     const OnlineSilenceWeightingConfig &config,
     int32 frame_subsampling_factor):
     trans_model_(trans_model), config_(config),
diff --git a/src/online2/online-ivector-feature.h b/src/online2/online-ivector-feature.h
index 25e078f1a98..0b3b8d62382 100644
--- a/src/online2/online-ivector-feature.h
+++ b/src/online2/online-ivector-feature.h
@@ -459,7 +459,7 @@ class OnlineSilenceWeighting {
   // frame-rate of the input features.  E.g. you might set it to 3 for such
   // models.
 
-  OnlineSilenceWeighting(const TransitionModel &trans_model,
+  OnlineSilenceWeighting(const Transitions &trans_model,
                          const OnlineSilenceWeightingConfig &config,
 			 int32 frame_subsampling_factor = 1);
 
@@ -490,7 +490,7 @@ class OnlineSilenceWeighting {
       std::vector<std::pair<int32, BaseFloat> > *delta_weights);
 
  private:
-  const TransitionModel &trans_model_;
+  const Transitions &trans_model_;
   const OnlineSilenceWeightingConfig &config_;
 
   int32 frame_subsampling_factor_;
diff --git a/src/online2/online-nnet2-decoding-threaded.cc b/src/online2/online-nnet2-decoding-threaded.cc
deleted file mode 100644
index c70eb571a46..00000000000
--- a/src/online2/online-nnet2-decoding-threaded.cc
+++ /dev/null
@@ -1,652 +0,0 @@
-// online2/online-nnet2-decoding-threaded.cc
-
-// Copyright    2013-2014  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "online2/online-nnet2-decoding-threaded.h"
-#include "nnet2/nnet-compute-online.h"
-#include "lat/lattice-functions.h"
-#include "lat/determinize-lattice-pruned.h"
-#include "util/kaldi-thread.h"
-
-namespace kaldi {
-
-ThreadSynchronizer::ThreadSynchronizer():
-    abort_(false),
-    producer_waiting_(false),
-    consumer_waiting_(false),
-    num_errors_(0) {
-  producer_semaphore_.Signal();
-  consumer_semaphore_.Signal();
-}
-
-bool ThreadSynchronizer::Lock(ThreadType t) {
-  if (abort_)
-    return false;
-  if (t == ThreadSynchronizer::kProducer) {
-    producer_semaphore_.Wait();
-  } else {
-    consumer_semaphore_.Wait();
-  }
-  if (abort_)
-    return false;
-  mutex_.lock();
-  held_by_ = t;
-  if (abort_) {
-    mutex_.unlock();
-    return false;
-  } else {
-    return true;
-  }
-}
-
-bool ThreadSynchronizer::UnlockSuccess(ThreadType t) {
-  if (t == ThreadSynchronizer::kProducer) {
-    producer_semaphore_.Signal();  // next Lock won't wait.
-    if (consumer_waiting_) {
-      consumer_semaphore_.Signal();
-      consumer_waiting_ = false;
-    }
-  } else {
-    consumer_semaphore_.Signal(); // next Lock won't wait.
-    if (producer_waiting_) {
-      producer_semaphore_.Signal();
-      producer_waiting_ = false;
-    }
-
-  }
-  mutex_.unlock();
-  return !abort_;
-}
-
-bool ThreadSynchronizer::UnlockFailure(ThreadType t) {
-
-  KALDI_ASSERT(held_by_ == t && "Code error: unlocking a mutex you don't hold.");
-
-  if (t == ThreadSynchronizer::kProducer) {
-    KALDI_ASSERT(!producer_waiting_ && "code error.");
-    producer_waiting_ = true;
-  } else {
-    KALDI_ASSERT(!consumer_waiting_ && "code error.");
-    consumer_waiting_ = true;
-  }
-  mutex_.unlock();
-  return !abort_;
-}
-
-void ThreadSynchronizer::SetAbort() {
-  abort_ = true;
-  // we signal the semaphores just in case someone was waiting on either of
-  // them.
-  producer_semaphore_.Signal();
-  consumer_semaphore_.Signal();
-}
-
-ThreadSynchronizer::~ThreadSynchronizer() {
-}
-
-// static
-void OnlineNnet2DecodingThreadedConfig::Check() {
-  KALDI_ASSERT(max_buffered_features > 1);
-  KALDI_ASSERT(feature_batch_size > 0);
-  KALDI_ASSERT(max_loglikes_copy >= 0);
-  KALDI_ASSERT(nnet_batch_size > 0);
-  KALDI_ASSERT(decode_batch_size >= 1);
-}
-
-
-SingleUtteranceNnet2DecoderThreaded::SingleUtteranceNnet2DecoderThreaded(
-    const OnlineNnet2DecodingThreadedConfig &config,
-    const TransitionModel &tmodel,
-    const nnet2::AmNnet &am_nnet,
-    const fst::Fst<fst::StdArc> &fst,
-    const OnlineNnet2FeaturePipelineInfo &feature_info,
-    const OnlineIvectorExtractorAdaptationState &adaptation_state):
-  config_(config), am_nnet_(am_nnet), tmodel_(tmodel), sampling_rate_(0.0),
-  num_samples_received_(0), input_finished_(false),
-  feature_pipeline_(feature_info),
-  num_samples_discarded_(0),
-  silence_weighting_(tmodel, feature_info.silence_weighting_config),
-  decodable_(tmodel),
-  num_frames_decoded_(0), decoder_(fst, config_.decoder_opts),
-  abort_(false), error_(false) {
-  // if the user supplies an adaptation state that was not freshly initialized,
-  // it means that we take the adaptation state from the previous
-  // utterance(s)... this only makes sense if theose previous utterance(s) are
-  // believed to be from the same speaker.
-  feature_pipeline_.SetAdaptationState(adaptation_state);
-  // spawn threads.
-  threads_[0] = std::thread(RunNnetEvaluation, this);
-  decoder_.InitDecoding();
-  threads_[1] = std::thread(RunDecoderSearch, this);
-}
-
-
-SingleUtteranceNnet2DecoderThreaded::~SingleUtteranceNnet2DecoderThreaded() {
-  if (!abort_) {
-    // If we have not already started the process of aborting the threads, do so now.
-    bool error = false;
-    AbortAllThreads(error);
-  }
-  // join all the threads (this avoids leaving zombie threads around, or threads
-  // that might be accessing deconstructed object).
-  WaitForAllThreads();
-  while (!input_waveform_.empty()) {
-    delete input_waveform_.front();
-    input_waveform_.pop_front();
-  }
-  while (!processed_waveform_.empty()) {
-    delete processed_waveform_.front();
-    processed_waveform_.pop_front();
-  }
-}
-
-void SingleUtteranceNnet2DecoderThreaded::AcceptWaveform(
-    BaseFloat sampling_rate,
-    const VectorBase<BaseFloat> &wave_part) {
-  if (sampling_rate_ <= 0.0)
-    sampling_rate_ = sampling_rate;
-  else {
-    KALDI_ASSERT(sampling_rate == sampling_rate_);
-  }
-  num_samples_received_ += wave_part.Dim();
-
-  if (wave_part.Dim() == 0) return;
-  if (!waveform_synchronizer_.Lock(ThreadSynchronizer::kProducer)) {
-    KALDI_ERR << "Failure locking mutex: decoding aborted.";
-  }
-
-  Vector<BaseFloat> *new_part = new Vector<BaseFloat>(wave_part);
-  input_waveform_.push_back(new_part);
-  // we always unlock with success because there is no buffer size limitation
-  // for the waveform so no reason why we might wait.
-  waveform_synchronizer_.UnlockSuccess(ThreadSynchronizer::kProducer);
-}
-
-int32 SingleUtteranceNnet2DecoderThreaded::NumWaveformPiecesPending() {
-  // Note RE locking: what we really want here is just to lock the mutex.  As a
-  // side effect, because of the way the synchronizer code works, it will also
-  // increment the semaphore and might wake up the consumer thread.  This will
-  // possibly make it do a little useless work (go around a loop once), but
-  // won't really do any harm.  Perhaps we should have implemented a version of
-  // the Lock function that takes no arguments.
-  if (!waveform_synchronizer_.Lock(ThreadSynchronizer::kProducer)) {
-    KALDI_ERR << "Failure locking mutex: decoding aborted.";
-  }
-  int32 ans = input_waveform_.size();
-  waveform_synchronizer_.UnlockSuccess(ThreadSynchronizer::kProducer);
-  return ans;
-}
-
-
-int32 SingleUtteranceNnet2DecoderThreaded::NumFramesReceivedApprox() const {
-  return num_samples_received_ /
-      (sampling_rate_ * feature_pipeline_.FrameShiftInSeconds());
-}
-
-void SingleUtteranceNnet2DecoderThreaded::InputFinished() {
-  // setting input_finished_ = true informs the feature-processing pipeline
-  // to expect no more input, and to flush out the last few frames if there
-  // is any latency in the pipeline (e.g. due to pitch).
-  if (!waveform_synchronizer_.Lock(ThreadSynchronizer::kProducer)) {
-    KALDI_ERR << "Failure locking mutex: decoding aborted.";
-  }
-  KALDI_ASSERT(!input_finished_ && "InputFinished called twice");
-  input_finished_ = true;
-  waveform_synchronizer_.UnlockSuccess(ThreadSynchronizer::kProducer);
-}
-
-void SingleUtteranceNnet2DecoderThreaded::TerminateDecoding() {
-  bool error = false;
-  AbortAllThreads(error);
-}
-
-void SingleUtteranceNnet2DecoderThreaded::Wait() {
-  if (!input_finished_ && !abort_) {
-    KALDI_ERR << "You cannot call Wait() before calling either InputFinished() "
-              << "or TerminateDecoding().";
-  }
-  WaitForAllThreads();
-}
-
-void SingleUtteranceNnet2DecoderThreaded::FinalizeDecoding() {
-  if (threads_[0].joinable()) {
-    KALDI_ERR << "It is an error to call FinalizeDecoding before Wait().";
-  }
-  decoder_.FinalizeDecoding();
-}
-
-BaseFloat SingleUtteranceNnet2DecoderThreaded::GetRemainingWaveform(
-    Vector<BaseFloat> *waveform) const {
-  if (threads_[0].joinable()) {
-    KALDI_ERR << "It is an error to call GetRemainingWaveform before Wait().";
-  }
-  int64 num_samples_stored = 0;  // number of samples we still have.
-  std::vector< Vector<BaseFloat>* > all_pieces;
-  std::deque< Vector<BaseFloat>* >::const_iterator iter;
-  for (iter = processed_waveform_.begin(); iter != processed_waveform_.end();
-       ++iter) {
-    num_samples_stored += (*iter)->Dim();
-    all_pieces.push_back(*iter);
-  }
-  for (iter = input_waveform_.begin(); iter != input_waveform_.end(); ++iter) {
-    num_samples_stored += (*iter)->Dim();
-    all_pieces.push_back(*iter);
-  }
-  int64 samples_shift_per_frame =
-      sampling_rate_ * feature_pipeline_.FrameShiftInSeconds();
-  int64 num_samples_to_discard = samples_shift_per_frame * num_frames_decoded_;
-  KALDI_ASSERT(num_samples_to_discard >= num_samples_discarded_);
-
-  // num_samp_discard is how many samples we must discard from our stored
-  // samples.
-  int64 num_samp_discard = num_samples_to_discard - num_samples_discarded_,
-      num_samp_keep = num_samples_stored - num_samp_discard;
-  KALDI_ASSERT(num_samp_discard <= num_samples_stored && num_samp_keep >= 0);
-  waveform->Resize(num_samp_keep, kUndefined);
-  int32 offset = 0; // offset in output waveform.  assume output waveform is no
-                    // larger than int32.
-  for (size_t i = 0; i < all_pieces.size(); i++) {
-    Vector<BaseFloat> *this_piece = all_pieces[i];
-    int32 this_dim = this_piece->Dim();
-    if (num_samp_discard >= this_dim) {
-      num_samp_discard -= this_dim;
-    } else {
-      // normal case is num_samp_discard = 0.
-      int32 this_dim_keep = this_dim - num_samp_discard;
-      waveform->Range(offset, this_dim_keep).CopyFromVec(
-          this_piece->Range(num_samp_discard, this_dim_keep));
-      offset += this_dim_keep;
-      num_samp_discard = 0;
-    }
-  }
-  KALDI_ASSERT(offset == num_samp_keep && num_samp_discard == 0);
-  return sampling_rate_;
-}
-
-void SingleUtteranceNnet2DecoderThreaded::GetAdaptationState(
-    OnlineIvectorExtractorAdaptationState *adaptation_state) {
-  std::lock_guard<std::mutex> lock(feature_pipeline_mutex_);
-  // If this blocks, it shouldn't be for very long.
-  feature_pipeline_.GetAdaptationState(adaptation_state);
-}
-
-void SingleUtteranceNnet2DecoderThreaded::GetLattice(
-    bool end_of_utterance,
-    CompactLattice *clat,
-    BaseFloat *final_relative_cost) const {
-  clat->DeleteStates();
-  decoder_mutex_.lock();
-  if (final_relative_cost != NULL)
-    *final_relative_cost = decoder_.FinalRelativeCost();
-  if (decoder_.NumFramesDecoded() == 0) {
-    decoder_mutex_.unlock();
-    clat->SetFinal(clat->AddState(),
-                   CompactLatticeWeight::One());
-    return;
-  }
-  Lattice raw_lat;
-  decoder_.GetRawLattice(&raw_lat, end_of_utterance);
-  decoder_mutex_.unlock();
-
-  if (!config_.decoder_opts.determinize_lattice)
-    KALDI_ERR << "--determinize-lattice=false option is not supported at the moment";
-
-  BaseFloat lat_beam = config_.decoder_opts.lattice_beam;
-  DeterminizeLatticePhonePrunedWrapper(
-      tmodel_, &raw_lat, lat_beam, clat, config_.decoder_opts.det_opts);
-}
-
-void SingleUtteranceNnet2DecoderThreaded::GetBestPath(
-    bool end_of_utterance,
-    Lattice *best_path,
-    BaseFloat *final_relative_cost) const {
-  std::lock_guard<std::mutex> lock(decoder_mutex_);
-  if (decoder_.NumFramesDecoded() == 0) {
-    // It's possible that this if-statement is not necessary because we'd get this
-    // anyway if we just called GetBestPath on the decoder.
-    best_path->DeleteStates();
-    best_path->SetFinal(best_path->AddState(),
-                        LatticeWeight::One());
-    if (final_relative_cost != NULL)
-      *final_relative_cost = std::numeric_limits<BaseFloat>::infinity();
-  } else {
-    decoder_.GetBestPath(best_path,
-                         end_of_utterance);
-    if (final_relative_cost != NULL)
-      *final_relative_cost = decoder_.FinalRelativeCost();
-  }
-}
-
-void SingleUtteranceNnet2DecoderThreaded::AbortAllThreads(bool error) {
-  abort_ = true;
-  if (error)
-    error_ = true;
-  waveform_synchronizer_.SetAbort();
-  decodable_synchronizer_.SetAbort();
-}
-
-int32 SingleUtteranceNnet2DecoderThreaded::NumFramesDecoded() const {
-  std::lock_guard<std::mutex> lock(decoder_mutex_);
-  return decoder_.NumFramesDecoded();
-}
-
-void SingleUtteranceNnet2DecoderThreaded::RunNnetEvaluation(
-    SingleUtteranceNnet2DecoderThreaded *me) {
-  try {
-    if (!me->RunNnetEvaluationInternal() && !me->abort_)
-      KALDI_ERR << "Returned abnormally and abort was not called";
-  } catch(const std::exception &e) {
-    KALDI_WARN << "Caught exception: " << e.what();
-    // if an error happened in one thread, we need to make sure the other
-    // threads can exit too.
-    bool error = true;
-    me->AbortAllThreads(error);
-  }
-}
-
-void SingleUtteranceNnet2DecoderThreaded::RunDecoderSearch(
-    SingleUtteranceNnet2DecoderThreaded *me) {
-  try {
-    if (!me->RunDecoderSearchInternal() && !me->abort_)
-      KALDI_ERR << "Returned abnormally and abort was not called";
-  } catch(const std::exception &e) {
-    KALDI_WARN << "Caught exception: " << e.what();
-    // if an error happened in one thread, we need to make sure the other threads can exit too.
-    bool error = true;
-    me->AbortAllThreads(error);
-  }
-}
-
-
-void SingleUtteranceNnet2DecoderThreaded::WaitForAllThreads() {
-  for (int32 i = 0; i < 2; i++) {  // there are 2 spawned threads.
-    if (threads_[i].joinable())
-      threads_[i].join();
-  }
-  if (error_)
-    KALDI_ERR << "Error encountered during decoding.  See above.";
-}
-
-
-void SingleUtteranceNnet2DecoderThreaded::ProcessLoglikes(
-    const CuVector<BaseFloat> &log_inv_prior,
-    CuMatrixBase<BaseFloat> *cu_loglikes) {
-  if (cu_loglikes->NumRows() != 0) {
-    cu_loglikes->ApplyFloor(1.0e-20);
-    cu_loglikes->ApplyLog();
-    // take the log-posteriors and turn them into pseudo-log-likelihoods by
-    // dividing by the pdf priors; then scale by the acoustic scale.
-    cu_loglikes->AddVecToRows(1.0, log_inv_prior);
-    cu_loglikes->Scale(config_.acoustic_scale);
-  }
-}
-
-// called from RunNnetEvaluationInternal().  Returns true in the normal case,
-// false on error; if it returns false, then we expect that the calling thread
-// will terminate.  This assumes the calling thread has already
-// locked feature_pipeline_mutex_.
-bool SingleUtteranceNnet2DecoderThreaded::FeatureComputation(
-    int32 num_frames_consumed) {
-
-  int32 num_frames_ready = feature_pipeline_.NumFramesReady(),
-      num_frames_usable = num_frames_ready - num_frames_consumed;
-  bool features_done = feature_pipeline_.IsLastFrame(num_frames_ready - 1);
-  KALDI_ASSERT(num_frames_usable >= 0);
-  if (features_done) {
-    return true;  // nothing to do. (but not an error).
-  } else {
-    if (num_frames_usable >= config_.nnet_batch_size)
-      return true;  // We don't need more data yet.
-
-    // Now try to get more data, if we can.
-    if (!waveform_synchronizer_.Lock(ThreadSynchronizer::kConsumer)) {
-      return false;
-    }
-    // we've got the lock.
-    if (input_waveform_.empty()) {  // we got no data
-      if (input_finished_ &&
-          !feature_pipeline_.IsLastFrame(feature_pipeline_.NumFramesReady()-1)) {
-        // the main thread called InputFinished() and set input_finished_, and
-        // we haven't yet registered that fact.  This is progress so
-        // unlock with UnlockSuccess().
-        feature_pipeline_.InputFinished();
-        return waveform_synchronizer_.UnlockSuccess(ThreadSynchronizer::kConsumer);
-      } else {
-        // there is no progress.  Unlock with UnlockFailure() so the next call to
-        // waveform_synchronizer_.Lock() will lock.
-        return waveform_synchronizer_.UnlockFailure(ThreadSynchronizer::kConsumer);
-      }
-    } else {  // we got some data.  Only take enough of the waveform to
-              // give us a maximum nnet batch size of frames to decode.
-      while (num_frames_usable < config_.nnet_batch_size &&
-             !input_waveform_.empty()) {
-        feature_pipeline_.AcceptWaveform(sampling_rate_, *input_waveform_.front());
-        processed_waveform_.push_back(input_waveform_.front());
-        input_waveform_.pop_front();
-        num_frames_ready = feature_pipeline_.NumFramesReady();
-        num_frames_usable = num_frames_ready - num_frames_consumed;
-      }
-      // Delete already-processed pieces of waveform if we have already decoded
-      // those frames.  (If not already decoded, we keep them around for the
-      // sake of GetRemainingWaveform()).
-      int32 samples_shift_per_frame =
-          sampling_rate_ * feature_pipeline_.FrameShiftInSeconds();
-      while (!processed_waveform_.empty() &&
-             num_samples_discarded_ + processed_waveform_.front()->Dim() <
-             samples_shift_per_frame * num_frames_decoded_) {
-        num_samples_discarded_ += processed_waveform_.front()->Dim();
-        delete processed_waveform_.front();
-        processed_waveform_.pop_front();
-      }
-      return waveform_synchronizer_.UnlockSuccess(ThreadSynchronizer::kConsumer);
-    }
-  }
-}
-
-bool SingleUtteranceNnet2DecoderThreaded::RunNnetEvaluationInternal() {
-  // if any of the Lock/Unlock functions return false, it's because AbortAllThreads()
-  // was called.
-
-  // This object is responsible for keeping track of the context, and avoiding
-  // re-computing things we've already computed.
-  bool pad_input = true;
-  nnet2::NnetOnlineComputer computer(am_nnet_.GetNnet(), pad_input);
-
-  // we declare the following as CuVector just to enable GPU support, but
-  // we expect this code to be run on CPU in the normal case.
-  CuVector<BaseFloat> log_inv_prior(am_nnet_.Priors());
-  log_inv_prior.ApplyFloor(1.0e-20);  // should have no effect.
-  log_inv_prior.ApplyLog();
-  log_inv_prior.Scale(-1.0);
-
-  // we'll have num_frames_consumed >= num_frames_output; num_frames_consumed is
-  // the number of feature frames consumed by the nnet computation,
-  // num_frames_output is the number of frames of loglikes the nnet computation
-  // has produced, which may be less than num_frames_consumed due to the
-  // right-context of the network.
-  int32 num_frames_consumed = 0, num_frames_output = 0;
-
-  while (true) {
-    bool last_time = false;
-
-    /****** Begin locking of feature pipeline mutex. ******/
-    feature_pipeline_mutex_.lock();
-    if (!FeatureComputation(num_frames_consumed)) {  // error
-      feature_pipeline_mutex_.unlock();
-      return false;
-    }
-    // take care of silence weighting.
-    if (silence_weighting_.Active() &&
-        feature_pipeline_.IvectorFeature() != NULL) {
-      silence_weighting_mutex_.lock();
-      std::vector<std::pair<int32, BaseFloat> > delta_weights;
-      silence_weighting_.GetDeltaWeights(
-          feature_pipeline_.IvectorFeature()->NumFramesReady(),
-          &delta_weights);
-      silence_weighting_mutex_.unlock();
-      feature_pipeline_.IvectorFeature()->UpdateFrameWeights(delta_weights);
-    }
-
-    int32 num_frames_ready = feature_pipeline_.NumFramesReady(),
-        num_frames_usable = num_frames_ready - num_frames_consumed;
-    bool features_done = feature_pipeline_.IsLastFrame(num_frames_ready - 1);
-
-    int32 num_frames_evaluate = std::min<int32>(num_frames_usable,
-                                                config_.nnet_batch_size);
-
-    Matrix<BaseFloat> feats;
-    if (num_frames_evaluate > 0) {
-      // we have something to do...
-      feats.Resize(num_frames_evaluate, feature_pipeline_.Dim());
-      for (int32 i = 0; i < num_frames_evaluate; i++) {
-        int32 t = num_frames_consumed + i;
-        SubVector<BaseFloat> feat(feats, i);
-        feature_pipeline_.GetFrame(t, &feat);
-      }
-    }
-    /****** End locking of feature pipeline mutex. ******/
-    feature_pipeline_mutex_.unlock();
-
-    CuMatrix<BaseFloat> cu_loglikes;
-
-    if (feats.NumRows() == 0) {
-      if (features_done) {
-        // flush out the last few frames.  Note: this is the only place from
-        // which we check feature_buffer_finished_, and we'll exit the loop, so
-        // if we reach here it must be the first time it was true.
-        last_time = true;
-        computer.Flush(&cu_loglikes);
-        ProcessLoglikes(log_inv_prior, &cu_loglikes);
-      }
-    } else {
-      CuMatrix<BaseFloat> cu_feats;
-      cu_feats.Swap(&feats);  // If we don't have a GPU (and not having a GPU is
-                              // the normal expected use-case for this code),
-                              // this would be a lightweight operation, swapping
-                              // pointers.
-
-      computer.Compute(cu_feats, &cu_loglikes);
-      num_frames_consumed += cu_feats.NumRows();
-      ProcessLoglikes(log_inv_prior, &cu_loglikes);
-    }
-
-    Matrix<BaseFloat> loglikes;
-    loglikes.Swap(&cu_loglikes);  // If we don't have a GPU (and not having a
-                                  // GPU is the normal expected use-case for
-                                  // this code), this would be a lightweight
-                                  // operation, swapping pointers.
-
-
-    // OK, at this point we may have some newly created log-likes and we want to
-    // give them to the decoding thread.
-
-    int32 num_loglike_frames = loglikes.NumRows();
-
-    if (num_loglike_frames != 0) {  // if we need to output some loglikes...
-      while (true) {
-        // we may have to grab and release the decodable mutex
-        // a few times before it's ready to accept the loglikes.
-        if (!decodable_synchronizer_.Lock(ThreadSynchronizer::kProducer))
-          return false;
-        int32 num_frames_decoded = num_frames_decoded_;
-        // we can't have output fewer frames than were decoded.
-        KALDI_ASSERT(num_frames_output >= num_frames_decoded);
-        if (num_frames_output - num_frames_decoded <= config_.max_loglikes_copy) {
-          // If we would have to copy fewer than config_.max_loglikes_copy
-          // previously output log-likelihoods inside the decodable object, then
-          // we go ahead and copy them to that object.
-          int32 frames_to_discard = num_frames_decoded_ -
-              decodable_.FirstAvailableFrame();
-          KALDI_ASSERT(frames_to_discard >= 0);
-          num_frames_output += num_loglike_frames;
-          decodable_.AcceptLoglikes(&loglikes, frames_to_discard);
-          if (!decodable_synchronizer_.UnlockSuccess(ThreadSynchronizer::kProducer))
-            return false;
-          break;  // break from the innermost while loop.
-        } else {
-          // There are too many frames already available to the decoder, that it
-          // hasn't processed yet, and we don't want them to have to be copied
-          // inside AcceptLoglikes(), so we wait for a bit.
-          // we want the next call to Lock to block until the decoder has
-          //  processed more frames.
-          if (!decodable_synchronizer_.UnlockFailure(ThreadSynchronizer::kProducer))
-            return false;
-        }
-      }
-    }
-    if (last_time) {
-      // Inform the decodable object that there will be no more input.
-      if (!decodable_synchronizer_.Lock(ThreadSynchronizer::kProducer))
-        return false;
-      decodable_.InputIsFinished();
-      if (!decodable_synchronizer_.UnlockSuccess(ThreadSynchronizer::kProducer))
-        return false;
-      KALDI_ASSERT(num_frames_consumed == num_frames_output);
-      return true;
-    }
-  }
-}
-
-
-bool SingleUtteranceNnet2DecoderThreaded::RunDecoderSearchInternal() {
-  int32 num_frames_decoded = 0;  // this is just a copy of decoder_->NumFramesDecoded();
-  while (true) {  // decode at most one frame each loop.
-    if (!decodable_synchronizer_.Lock(ThreadSynchronizer::kConsumer))
-      return false; // AbortAllThreads() called.
-    if (decodable_.NumFramesReady() <= num_frames_decoded) {
-      // no frames available to decode.
-      KALDI_ASSERT(decodable_.NumFramesReady() == num_frames_decoded);
-      if (decodable_.IsLastFrame(num_frames_decoded - 1)) {
-        decodable_synchronizer_.UnlockSuccess(ThreadSynchronizer::kConsumer);
-        return true;  // exit from this thread; we're done.
-      } else {
-        // we were not able to advance the decoding due to no available
-        // input.  The next call will ensure that the next call to
-        // decodable_synchronizer_.Lock() will wait.
-        if (!decodable_synchronizer_.UnlockFailure(ThreadSynchronizer::kConsumer))
-          return false;
-      }
-    } else {
-      // Decode at most config_.decode_batch_size frames (e.g. 1 or 2).
-      decoder_mutex_.lock();
-      decoder_.AdvanceDecoding(&decodable_, config_.decode_batch_size);
-      num_frames_decoded = decoder_.NumFramesDecoded();
-      if (silence_weighting_.Active()) {
-        std::lock_guard<std::mutex> lock(silence_weighting_mutex_);
-        // the next function does not trace back all the way; it's very fast.
-        silence_weighting_.ComputeCurrentTraceback(decoder_);
-      }
-      decoder_mutex_.unlock();
-      num_frames_decoded_ = num_frames_decoded;
-      if (!decodable_synchronizer_.UnlockSuccess(ThreadSynchronizer::kConsumer))
-        return false;
-    }
-  }
-}
-
-bool SingleUtteranceNnet2DecoderThreaded::EndpointDetected(
-    const OnlineEndpointConfig &config) {
-  std::lock_guard<std::mutex> lock(decoder_mutex_);
-  return kaldi::EndpointDetected(config, tmodel_,
-                                 feature_pipeline_.FrameShiftInSeconds(),
-                                 decoder_);
-}
-
-
-
-}  // namespace kaldi
diff --git a/src/online2/online-nnet2-decoding-threaded.h b/src/online2/online-nnet2-decoding-threaded.h
index e77166ac801..3ca62e5ea3e 100644
--- a/src/online2/online-nnet2-decoding-threaded.h
+++ b/src/online2/online-nnet2-decoding-threaded.h
@@ -35,7 +35,7 @@
 #include "online2/online-nnet2-feature-pipeline.h"
 #include "online2/online-endpoint.h"
 #include "decoder/lattice-faster-online-decoder.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "util/kaldi-semaphore.h"
 
 namespace kaldi {
@@ -196,7 +196,7 @@ class SingleUtteranceNnet2DecoderThreaded {
   // (locally owned) feature pipeline.
   SingleUtteranceNnet2DecoderThreaded(
       const OnlineNnet2DecodingThreadedConfig &config,
-      const TransitionModel &tmodel,
+      const Transitions &tmodel,
       const nnet2::AmNnet &am_nnet,
       const fst::Fst<fst::StdArc> &fst,
       const OnlineNnet2FeaturePipelineInfo &feature_info,
@@ -343,7 +343,7 @@ class SingleUtteranceNnet2DecoderThreaded {
 
   const nnet2::AmNnet &am_nnet_;
 
-  const TransitionModel &tmodel_;
+  const Transitions &tmodel_;
 
 
   // sampling_rate_ is set the first time AcceptWaveform is called.
diff --git a/src/online2/online-nnet2-decoding.cc b/src/online2/online-nnet2-decoding.cc
deleted file mode 100644
index fdd1b78a880..00000000000
--- a/src/online2/online-nnet2-decoding.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-// online2/online-nnet2-decoding.cc
-
-// Copyright    2013-2014  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "online2/online-nnet2-decoding.h"
-#include "lat/lattice-functions.h"
-#include "lat/determinize-lattice-pruned.h"
-
-namespace kaldi {
-
-SingleUtteranceNnet2Decoder::SingleUtteranceNnet2Decoder(
-    const OnlineNnet2DecodingConfig &config,
-    const TransitionModel &tmodel,
-    const nnet2::AmNnet &model,
-    const fst::Fst<fst::StdArc> &fst,
-    OnlineFeatureInterface *feature_pipeline):
-    config_(config),
-    feature_pipeline_(feature_pipeline),
-    tmodel_(tmodel),
-    decodable_(model, tmodel, config.decodable_opts, feature_pipeline),
-    decoder_(fst, config.decoder_opts) {
-  decoder_.InitDecoding();
-}
-
-void SingleUtteranceNnet2Decoder::AdvanceDecoding() {
-  decoder_.AdvanceDecoding(&decodable_);
-}
-
-void SingleUtteranceNnet2Decoder::FinalizeDecoding() {
-  decoder_.FinalizeDecoding();
-}
-
-int32 SingleUtteranceNnet2Decoder::NumFramesDecoded() const {
-  return decoder_.NumFramesDecoded();
-}
-
-void SingleUtteranceNnet2Decoder::GetLattice(bool end_of_utterance,
-                                             CompactLattice *clat) const {
-  if (NumFramesDecoded() == 0)
-    KALDI_ERR << "You cannot get a lattice if you decoded no frames.";
-  Lattice raw_lat;
-  decoder_.GetRawLattice(&raw_lat, end_of_utterance);
-
-  if (!config_.decoder_opts.determinize_lattice)
-    KALDI_ERR << "--determinize-lattice=false option is not supported at the moment";
-
-  BaseFloat lat_beam = config_.decoder_opts.lattice_beam;
-  DeterminizeLatticePhonePrunedWrapper(
-      tmodel_, &raw_lat, lat_beam, clat, config_.decoder_opts.det_opts);
-}
-
-void SingleUtteranceNnet2Decoder::GetBestPath(bool end_of_utterance,
-                                              Lattice *best_path) const {
-  decoder_.GetBestPath(best_path, end_of_utterance);
-}
-
-bool SingleUtteranceNnet2Decoder::EndpointDetected(
-    const OnlineEndpointConfig &config) {
-  return kaldi::EndpointDetected(config, tmodel_,
-                                 feature_pipeline_->FrameShiftInSeconds(),
-                                 decoder_);  
-}
-
-
-}  // namespace kaldi
-
diff --git a/src/online2/online-nnet2-decoding.h b/src/online2/online-nnet2-decoding.h
index 2d48971694b..b185b8b69f8 100644
--- a/src/online2/online-nnet2-decoding.h
+++ b/src/online2/online-nnet2-decoding.h
@@ -32,7 +32,7 @@
 #include "itf/online-feature-itf.h"
 #include "online2/online-endpoint.h"
 #include "decoder/lattice-faster-online-decoder.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/posterior.h"
 
 namespace kaldi {
@@ -69,7 +69,7 @@ class SingleUtteranceNnet2Decoder {
   // Constructor.  The feature_pipeline_ pointer is not owned in this
   // class, it's owned externally.
   SingleUtteranceNnet2Decoder(const OnlineNnet2DecodingConfig &config,
-                              const TransitionModel &tmodel,
+                              const Transitions &tmodel,
                               const nnet2::AmNnet &model,
                               const fst::Fst<fst::StdArc> &fst,
                               OnlineFeatureInterface *feature_pipeline);
@@ -113,7 +113,7 @@ class SingleUtteranceNnet2Decoder {
 
   OnlineFeatureInterface *feature_pipeline_;
 
-  const TransitionModel &tmodel_;
+  const Transitions &tmodel_;
   
   nnet2::DecodableNnet2Online decodable_;
   
diff --git a/src/online2/online-nnet3-decoding.cc b/src/online2/online-nnet3-decoding.cc
index fbe0c2bed7b..d40dcb411d1 100644
--- a/src/online2/online-nnet3-decoding.cc
+++ b/src/online2/online-nnet3-decoding.cc
@@ -28,7 +28,7 @@ namespace kaldi {
 template <typename FST>
 SingleUtteranceNnet3DecoderTpl<FST>::SingleUtteranceNnet3DecoderTpl(
     const LatticeFasterDecoderConfig &decoder_opts,
-    const TransitionModel &trans_model,
+    const Transitions &trans_model,
     const nnet3::DecodableNnetSimpleLoopedInfo &info,
     const FST &fst,
     OnlineNnet2FeaturePipeline *features):
diff --git a/src/online2/online-nnet3-decoding.h b/src/online2/online-nnet3-decoding.h
index 568c0b6a0b3..b30f035b4d2 100644
--- a/src/online2/online-nnet3-decoding.h
+++ b/src/online2/online-nnet3-decoding.h
@@ -34,7 +34,7 @@
 #include "online2/online-endpoint.h"
 #include "online2/online-nnet2-feature-pipeline.h"
 #include "decoder/lattice-faster-online-decoder.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "hmm/posterior.h"
 
 namespace kaldi {
@@ -55,7 +55,7 @@ class SingleUtteranceNnet3DecoderTpl {
   // Constructor. The pointer 'features' is not being given to this class to own
   // and deallocate, it is owned externally.
   SingleUtteranceNnet3DecoderTpl(const LatticeFasterDecoderConfig &decoder_opts,
-                                 const TransitionModel &trans_model,
+                                 const Transitions &trans_model,
                                  const nnet3::DecodableNnetSimpleLoopedInfo &info,
                                  const FST &fst,
                                  OnlineNnet2FeaturePipeline *features);
@@ -103,7 +103,7 @@ class SingleUtteranceNnet3DecoderTpl {
 
   // we need to keep a reference to the transition model around only because
   // it's needed by the endpointing code.
-  const TransitionModel &trans_model_;
+  const Transitions &trans_model_;
 
   nnet3::DecodableAmNnetLoopedOnline decodable_;
 
diff --git a/src/online2/online-nnet2-feature-pipeline.cc b/src/online2/online2-feature-pipeline.cc
similarity index 100%
rename from src/online2/online-nnet2-feature-pipeline.cc
rename to src/online2/online2-feature-pipeline.cc
diff --git a/src/online2bin/online2-wav-nnet2-am-compute.cc b/src/online2bin/online2-wav-nnet2-am-compute.cc
index 4fa707f8b13..b41351b4d35 100644
--- a/src/online2bin/online2-wav-nnet2-am-compute.cc
+++ b/src/online2bin/online2-wav-nnet2-am-compute.cc
@@ -89,7 +89,7 @@ int main(int argc, char *argv[]) {
       chunk_length_secs = -1.0;
     }
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmNnet am_nnet;
     {
       bool binary;
diff --git a/src/online2bin/online2-wav-nnet2-latgen-faster.cc b/src/online2bin/online2-wav-nnet2-latgen-faster.cc
index 02b8dcf2ef5..18f0064adea 100644
--- a/src/online2bin/online2-wav-nnet2-latgen-faster.cc
+++ b/src/online2bin/online2-wav-nnet2-latgen-faster.cc
@@ -152,7 +152,7 @@ int main(int argc, char *argv[]) {
       chunk_length_secs = -1.0;
     }
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     nnet2::AmNnet nnet;
     {
       bool binary;
diff --git a/src/online2bin/online2-wav-nnet2-latgen-threaded.cc b/src/online2bin/online2-wav-nnet2-latgen-threaded.cc
index e9f43867801..a61d2a8ba6f 100644
--- a/src/online2bin/online2-wav-nnet2-latgen-threaded.cc
+++ b/src/online2bin/online2-wav-nnet2-latgen-threaded.cc
@@ -155,7 +155,7 @@ int main(int argc, char *argv[]) {
       feature_info.ivector_extractor_info.greedy_ivector_extractor = true;
     }
     
-    TransitionModel trans_model;
+    Transitions trans_model;
     nnet2::AmNnet am_nnet;
     {
       bool binary;
diff --git a/src/online2bin/online2-wav-nnet3-latgen-faster.cc b/src/online2bin/online2-wav-nnet3-latgen-faster.cc
index af330a59375..571bd988e0b 100644
--- a/src/online2bin/online2-wav-nnet3-latgen-faster.cc
+++ b/src/online2bin/online2-wav-nnet3-latgen-faster.cc
@@ -154,7 +154,7 @@ int main(int argc, char *argv[]) {
       chunk_length_secs = -1.0;
     }
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     nnet3::AmNnetSimple am_nnet;
     {
       bool binary;
diff --git a/src/online2bin/online2-wav-nnet3-latgen-grammar.cc b/src/online2bin/online2-wav-nnet3-latgen-grammar.cc
index 3c4ef858021..d68e36ceef0 100644
--- a/src/online2bin/online2-wav-nnet3-latgen-grammar.cc
+++ b/src/online2bin/online2-wav-nnet3-latgen-grammar.cc
@@ -157,7 +157,7 @@ int main(int argc, char *argv[]) {
       chunk_length_secs = -1.0;
     }
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     nnet3::AmNnetSimple am_nnet;
     {
       bool binary;
diff --git a/src/onlinebin/online-audio-server-decode-faster.cc b/src/onlinebin/online-audio-server-decode-faster.cc
index ca4bfeb8858..3a7d416441e 100644
--- a/src/onlinebin/online-audio-server-decode-faster.cc
+++ b/src/onlinebin/online-audio-server-decode-faster.cc
@@ -151,7 +151,7 @@ int32 main(int argc, char *argv[]) {
 
     std::cout << "Reading acoustic model: " << model_rspecifier << "..."
         << std::endl;
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
       bool binary;
diff --git a/src/onlinebin/online-gmm-decode-faster.cc b/src/onlinebin/online-gmm-decode-faster.cc
index 8ad86a489d4..9aa8751cf50 100644
--- a/src/onlinebin/online-gmm-decode-faster.cc
+++ b/src/onlinebin/online-gmm-decode-faster.cc
@@ -106,7 +106,7 @@ int main(int argc, char *argv[]) {
     if (silence_phones.empty())
         KALDI_ERR << "No silence phones given!";
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
         bool binary;
diff --git a/src/onlinebin/online-server-gmm-decode-faster.cc b/src/onlinebin/online-server-gmm-decode-faster.cc
index 80973bf0705..5567a192ce4 100644
--- a/src/onlinebin/online-server-gmm-decode-faster.cc
+++ b/src/onlinebin/online-server-gmm-decode-faster.cc
@@ -123,7 +123,7 @@ int main(int argc, char *argv[]) {
     if (silence_phones.empty())
         KALDI_ERR << "No silence phones given!";
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
         bool binary;
diff --git a/src/onlinebin/online-wav-gmm-decode-faster.cc b/src/onlinebin/online-wav-gmm-decode-faster.cc
index fe7c6d6b974..400835e6e20 100644
--- a/src/onlinebin/online-wav-gmm-decode-faster.cc
+++ b/src/onlinebin/online-wav-gmm-decode-faster.cc
@@ -109,7 +109,7 @@ int main(int argc, char *argv[]) {
       lda_transform.Read(ki.Stream(), binary_in);
     }
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     AmDiagGmm am_gmm;
     {
         bool binary;
diff --git a/src/sgmm2/Makefile b/src/sgmm2/Makefile
deleted file mode 100644
index 35a8d3a1f40..00000000000
--- a/src/sgmm2/Makefile
+++ /dev/null
@@ -1,19 +0,0 @@
-all:
-
-OPENFST_CXXFLAGS =
-OPENFST_LDLIBS =
-include ../kaldi.mk
-
-TESTFILES = am-sgmm2-test estimate-am-sgmm2-test  \
-   fmllr-sgmm2-test
-
-OBJFILES = am-sgmm2.o estimate-am-sgmm2.o estimate-am-sgmm2-ebw.o fmllr-sgmm2.o \
-          am-sgmm2-project.o decodable-am-sgmm2.o
-
-LIBNAME = kaldi-sgmm2
-
-ADDLIBS = ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
-          ../base/kaldi-base.a 
-
-include ../makefiles/default_rules.mk
diff --git a/src/sgmm2/am-sgmm2-project.cc b/src/sgmm2/am-sgmm2-project.cc
deleted file mode 100644
index 13a69a75842..00000000000
--- a/src/sgmm2/am-sgmm2-project.cc
+++ /dev/null
@@ -1,265 +0,0 @@
-// sgmm2/am-sgmm2-project.cc
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <functional>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-using std::vector;
-
-#include "sgmm2/am-sgmm2-project.h"
-#include "util/kaldi-thread.h"
-#include "gmm/full-gmm-normal.h"
-#include "gmm/diag-gmm-normal.h"
-
-namespace kaldi {
-
-// The output pointer argument "projection" projects from the pre-LDA+MLLT space
-// to the space we're going to model.  We retain "model_dim" dimensions, which
-// means we're keeping all dimensions that have any variation at all.
-
-void Sgmm2Project::ComputeProjection(const AmSgmm2 &sgmm,
-                                     const Matrix<BaseFloat> &inv_lda_mllt,
-                                     int32 start_dim,
-                                     int32 end_dim, // last dim plus one
-                                     Matrix<BaseFloat> *projection) {
-  Matrix<double> inv_lda_mllt_dbl(inv_lda_mllt);
-  KALDI_ASSERT(inv_lda_mllt.NumRows() == inv_lda_mllt.NumCols());
-  
-  // First, to compute the projection that we're going to use:
-  
-  SpMatrix<double> B; // between-class covar.
-  SpMatrix<double> W; // within-class covar.
-
-  int32 model_dim = sgmm.FeatureDim(),
-      full_dim = inv_lda_mllt.NumRows();
-  KALDI_ASSERT(full_dim > model_dim);
-  KALDI_ASSERT(start_dim >= 0 && start_dim < end_dim && end_dim <= full_dim);
-
-  ComputeLdaStats(sgmm.full_ubm(), &B, &W);
-  // B and W are now of dim "model_dim".
-
-  double diag_term = 0.001 / model_dim * B.Trace(); // This will ensure
-  // that the between-class covariance is full rank within the original
-  // feature space.
-  for (int32 i = 0; i < B.NumRows(); i++)
-    B(i, i) += diag_term;
-
-  B.Resize(full_dim, kCopyData); // This extends the extra dims with
-  // zeros, which is what we want, because we assume the means are zero in the
-  // extra dimensions [this is valid because we have cmd'ed data].
-
-  W.Resize(full_dim, kCopyData); // We want the within-class
-  // covar to be unit in the extra dimensions, so we need to do something
-  // about this... note, this is valid if we have an LDA-based feature
-  // space, as we constructed the LDA matrix so that the covar in
-  // the rejected dimensions is unit.  [note: we can gloss over differences
-  // between within vs. total covar here, as it's almost exactly the same
-  // for the rejected dimensions].
-  for (int32 i = model_dim; i < full_dim; i++)
-    W(i, i) = 1.0;
-  
-  // Next, we'll project these "extended" stats with the "inv_lda_mllt"
-  // matrix, which takes us into the space where we were before LDA+MLLT.
-  SpMatrix<double> B_orig(full_dim), W_orig(full_dim);
-  B_orig.AddMat2Sp(1.0, inv_lda_mllt_dbl, kNoTrans, B, 0.0); // B_orig <-- inv_lda_mllt B inv_lda_mllt^T
-  W_orig.AddMat2Sp(1.0, inv_lda_mllt_dbl, kNoTrans, W, 0.0); // W_orig <-- inv_lda_mllt W inv_lda_mllt^T
-
-  // Now get versions of B_orig and W_orig that are limited to the
-  // dimension range that we wanted.
-  Matrix<double> B_orig_mat(B_orig), W_orig_mat(W_orig); // Get them as full matrices...
-  SpMatrix<double> B_orig_limit(B_orig_mat.Range(start_dim, end_dim-start_dim,
-                                                 start_dim, end_dim-start_dim)),
-      W_orig_limit(W_orig_mat.Range(start_dim, end_dim-start_dim,
-                                    start_dim, end_dim-start_dim));
-  
-  Matrix<double> proj;
-  int32 retained_dim = model_dim;
-  if (end_dim - start_dim < retained_dim) retained_dim = end_dim - start_dim;
-  ComputeLdaTransform(B_orig_limit, W_orig_limit, retained_dim, &proj);
-  
-  // Now proj has the projection from the "limited-dimension" space.
-  // We want a projection from the entire space.
-  
-  projection->Resize(retained_dim, full_dim); // This projection (which we output) will project from
-  // full_dim to retained_dim; it goes from the pre-LDA+MLLT space to "retained_dim" which
-  // is <= model_dim.
-  
-  // Copy the relevant dimensions of "projection" from the "proj" matrix that
-  // we just computed.  The rest remain zero (corresponding to discarded dimensions).
-  projection->Range(0, retained_dim, start_dim, end_dim-start_dim).CopyFromMat(proj);
-}
-
-void Sgmm2Project::ComputeLdaTransform(const SpMatrix<double> &B,
-                                       const SpMatrix<double> &W,
-                                       int32 dim_to_retain, 
-                                       Matrix<double> *Projection) {
-  int32 dim = B.NumRows();
-  KALDI_ASSERT(dim_to_retain <= dim);
-
-  // OK, now do LDA in this space...
-  TpMatrix<double> T(dim);
-  T.Cholesky(W); // do Cholesky factorization W_orig = T T^T.  Now,
-  // T^{-1} is the projection that makes W unit.
-  TpMatrix<double> Tinv(T); // get inverse of T.
-  Tinv.Invert();
-  
-  // Now project B_orig with Tinv, to get between-class scatter in space where
-  // W_orig is unit.
-  SpMatrix<double> B_proj(dim);
-  B_proj.AddTp2Sp(1.0, Tinv, kNoTrans, B, 0.0);
-  
-  // Now, in this space, do SVD.
-
-  Matrix<double> P(dim, dim);
-  Vector<double> s(dim);
-  B_proj.SymPosSemiDefEig(&s, &P);
-  // Now B_proj = P diag(s) P^T, with P orthogonal.  It's both SVD and eigenvalue
-  // decomposition.
-  // So P^{-1}, which equals P^T, is the transformation that
-  // will make B_proj diagonal (with eigenvalues equal to s).
-
-  P.Resize(dim, dim_to_retain, kCopyData); // keep only rows of P^T that we want.
-  Projection->Resize(dim_to_retain, dim);
-  // The next line sets "Projection" to the LDA matrix, which is (part of P^T) * T^{-1}
-  Projection->AddMatTp(1.0, P, kTrans, Tinv, kNoTrans, 0.0);
-
-  KALDI_LOG << "Eigenvalues of retained LDA dimensions: "
-            << s.Range(0, dim_to_retain) << " (sum is:) "
-            << s.Range(0, dim_to_retain).Sum();
-  KALDI_LOG << "Eigenvalues of rejected LDA dimensions: "
-            << s.Range(dim_to_retain, dim - dim_to_retain) << " (sum is:) "
-            << s.Range(dim_to_retain, dim - dim_to_retain).Sum();
-
-  { // Check that it's been done correctly by projecting the
-    // matrices we got as input checking they become (diagonal, unit).
-    SpMatrix<double> B_ldaproj(dim_to_retain), W_ldaproj(dim_to_retain);
-    B_ldaproj.AddMat2Sp(1.0, *Projection, kNoTrans, B, 0.0);
-    KALDI_ASSERT(B_ldaproj.IsDiagonal());
-    W_ldaproj.AddMat2Sp(1.0, *Projection, kNoTrans, W, 0.0);
-    KALDI_ASSERT(W_ldaproj.IsUnit());
-  }
-}
-
-
-void Sgmm2Project::ComputeLdaStats(const FullGmm &full_ubm,
-                                   SpMatrix<double> *between_covar,
-                                   SpMatrix<double> *within_covar) {
-  int32 dim = full_ubm.Dim(); // Feature dimension.
-  between_covar->Resize(dim); // zeroes it.
-  within_covar->Resize(dim); // zeroes it.
-  FullGmmNormal full_gmm_normal(full_ubm);
-  BaseFloat weight = 1.0 / full_ubm.NumGauss();
-  Vector<double> avg_mean(dim);
-  for (int32 i = 0; i < full_ubm.NumGauss(); i++) {
-    between_covar->AddSp(weight, full_gmm_normal.vars_[i]);
-    within_covar->AddVec2(weight, full_gmm_normal.means_.Row(i));
-    avg_mean.AddVec(weight, full_gmm_normal.means_.Row(i));
-  }
-  between_covar->AddVec2(-1.0, avg_mean);
-}
-
-void Sgmm2Project::ApplyProjection(const Matrix<BaseFloat> &total_projection,
-                                   AmSgmm2 *sgmm) {
-  int32 dim = sgmm->FeatureDim();
-  int32 retained_dim = total_projection.NumRows();
-  KALDI_ASSERT(retained_dim <= dim);
-  
-  // Note: small_projection is as total_projection but ignoring the
-  // higher dimensions of the input... this is valid as far as the means
-  // are concerned, because we extend with zeros.
-  SubMatrix<BaseFloat> small_projection(total_projection, 0, retained_dim, 0, dim);
-  Matrix<double> small_projection_dbl(small_projection);
-  Matrix<double> total_projection_dbl(total_projection);
-  
-  int32 I = sgmm->NumGauss();
-  for (int32 i = 0; i < I; i++) {
-    {
-      // do M_i  <-- small_projection * M_i
-      Matrix<BaseFloat> M(sgmm->M_[i]);
-      sgmm->M_[i].Resize(retained_dim, M.NumCols());
-      sgmm->M_[i].AddMatMat(1.0, small_projection, kNoTrans, M, kNoTrans, 0.0);
-    }
-    if (!sgmm->N_.empty()) {
-      // do N_i  <-- small_projection * N_i
-      Matrix<BaseFloat> N(sgmm->N_[i]);
-      sgmm->N_[i].Resize(retained_dim, N.NumCols());
-      sgmm->N_[i].AddMatMat(1.0, small_projection, kNoTrans, N, kNoTrans, 0.0);
-    }
-    ProjectVariance(total_projection_dbl, true, // inverted,
-                    &(sgmm->SigmaInv_[i]));
-  }    
-
-  { // Project full_ubm.
-    FullGmmNormal full_ubm_normal(sgmm->full_ubm_);
-    for (int32 i = 0; i < I; i++) {
-      ProjectVariance(total_projection_dbl, false, &(full_ubm_normal.vars_[i]));
-    }
-    Matrix<double> old_means(full_ubm_normal.means_);
-    full_ubm_normal.means_.Resize(I, retained_dim);
-    full_ubm_normal.means_.AddMatMat(1.0, old_means, kNoTrans,
-                                     small_projection_dbl, kTrans, 0.0);
-    sgmm->full_ubm_.Resize(I, retained_dim);
-    full_ubm_normal.CopyToFullGmm(&sgmm->full_ubm_);
-    sgmm->full_ubm_.ComputeGconsts();
-  }
-  sgmm->diag_ubm_.Resize(I, retained_dim);
-  sgmm->diag_ubm_.CopyFromFullGmm(sgmm->full_ubm_);
-  sgmm->diag_ubm_.ComputeGconsts();
-  sgmm->n_.clear(); // The normalizers are invalid now, so clear them.
-}
-
-void Sgmm2Project::ProjectVariance(const Matrix<double> &total_projection,
-                                   bool inverse,
-                                   SpMatrix<double> *variance) {
-  if (inverse) {
-    SpMatrix<double> inv_var(*variance);
-    inv_var.Invert();
-    ProjectVariance(total_projection, false, &inv_var);
-    inv_var.Invert();
-    if (variance->NumRows() != inv_var.NumRows())
-      variance->Resize(inv_var.NumRows());
-    variance->CopyFromSp(inv_var);
-  } else {
-    SpMatrix<double> extended_var(*variance);
-    KALDI_ASSERT(total_projection.NumCols() >= extended_var.NumRows());
-    extended_var.Resize(total_projection.NumCols(), kCopyData);
-    for (int32 i = variance->NumRows(); i < extended_var.NumRows(); i++)
-      extended_var(i, i) = 1.0; // make new part of diagonal ones.
-    int32 tgt_dim = total_projection.NumRows();
-    KALDI_ASSERT(tgt_dim <= variance->NumRows());
-    if (tgt_dim < variance->NumRows()) variance->Resize(tgt_dim);
-    variance->AddMat2Sp(1.0, total_projection, kNoTrans, extended_var, 0.0);
-  }
-}
-
-void Sgmm2Project::ProjectVariance (const Matrix<double> &total_projection,
-                                    bool inverse,
-                                    SpMatrix<float> *variance) {
-  SpMatrix<double> variance_dbl(*variance);
-  ProjectVariance(total_projection, inverse, &variance_dbl);
-  if (variance->NumRows() != variance_dbl.NumRows())
-    variance->Resize(variance_dbl.NumRows());
-  variance->CopyFromSp(variance_dbl);
-}
-
-
-}  // namespace kaldi
diff --git a/src/sgmm2/am-sgmm2-project.h b/src/sgmm2/am-sgmm2-project.h
deleted file mode 100644
index d85fd7106dc..00000000000
--- a/src/sgmm2/am-sgmm2-project.h
+++ /dev/null
@@ -1,86 +0,0 @@
-// sgmm2/am-sgmm2-project.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_SGMM2_AM_SGMM2_PROJECT_H_
-#define KALDI_SGMM2_AM_SGMM2_PROJECT_H_
-
-#include <vector>
-#include <queue>
-
-#include "sgmm2/am-sgmm2.h"
-
-namespace kaldi {
-
-class Sgmm2Project {
-  // This class essentially functions as a namespace for some functions;
-  // it's a friend of AmSgmm.h.  It relates to "predictive" SGMMs.  This
-  // hasn't been written up yet.  We don't make any functions const or
-  // static, because there are no member variables.
- public:
-
-  // If inv_lda_mllt is the matrix that projects from the space the SGMM is
-  // in, typically back to the spliced-MFCC space, and begin_dim and end_dim
-  // represent the range of dims we want to model, then "projection" will be
-  // a matrix, applied *after* the "inv_lda_mllt" matrix, that projects from
-  // the raw splice-MFCC features to the space we want to model.  This matrix
-  // is of dimension e.g. 40 x 117, and omits the space that the model's states
-  // all treat the same.
-  void ComputeProjection(const AmSgmm2 &sgmm,
-                         const Matrix<BaseFloat> &inv_lda_mllt,
-                         int32 begin_dim,
-                         int32 end_dim, // last dim plus one that we keep.
-                         Matrix<BaseFloat> *projection);
-
-  // This function applies the feature-space projection to the SGMM.
-  // The matrix "total_projection" is the product of the "projection" matrix
-  // of ComputeProjection times the "inv_lda_mllt" matrix.  It actually
-  // projects from a larger dimension than the current SGMM.  We treat
-  // the means as if extended with zeros, and the covariances as if
-  // extended with a unit matrix.
-  void ApplyProjection(const Matrix<BaseFloat> &total_projection,
-                       AmSgmm2 *sgmm);
-                         
- private:
-  // Computes statistics for LDA, in the SGMM's feature space.
-  // This only needs to be approximate, so we use stats based
-  // on the means in the UBM.
-  void ComputeLdaStats(const FullGmm &full_ubm,
-                       SpMatrix<double> *between_covar,
-                       SpMatrix<double> *within_covar);
-
-  void ProjectVariance (const Matrix<double> &total_projection,
-                        bool inverse,
-                        SpMatrix<double> *variance);
-  
-  void ProjectVariance (const Matrix<double> &total_projection,
-                        bool inverse,
-                        SpMatrix<float> *variance);
-  
-  void ComputeLdaTransform(const SpMatrix<double> &B,
-                           const SpMatrix<double> &W,
-                           int32 dim_to_retain, 
-                           Matrix<double> *Projection);
-  
-};
-
-
-
-} // end namespace kaldi
-
-#endif  // KALDI_SGMM2_AM_SGMM2_PROJECT_H_
diff --git a/src/sgmm2/am-sgmm2-test.cc b/src/sgmm2/am-sgmm2-test.cc
deleted file mode 100644
index 5dee50f3f6d..00000000000
--- a/src/sgmm2/am-sgmm2-test.cc
+++ /dev/null
@@ -1,285 +0,0 @@
-// sgmm2/am-sgmm2-test.cc
-
-// Copyright 2012   Arnab Ghoshal
-//           2009-2011  Saarland University
-//           2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "gmm/model-test-common.h"
-#include "sgmm2/am-sgmm2.h"
-#include "util/kaldi-io.h"
-
-using kaldi::AmSgmm2;
-using kaldi::int32;
-using kaldi::BaseFloat;
-namespace ut = kaldi::unittest;
-
-// Tests the initialization routines: InitializeFromFullGmm(), CopyFromSgmm2()
-// and CopyGlobalsInitVecs().
-void TestSgmm2Init(const AmSgmm2 &sgmm) {
-  using namespace kaldi;
-  int32 dim = sgmm.FeatureDim();
-  kaldi::Sgmm2GselectConfig config;
-  config.full_gmm_nbest = std::min(config.full_gmm_nbest, sgmm.NumGauss());
-
-  kaldi::Vector<BaseFloat> feat(dim);
-  for (int32 d = 0; d < dim; d++) {
-    feat(d) = kaldi::RandGauss();
-  }
-  kaldi::Sgmm2PerFrameDerivedVars frame_vars;
-  frame_vars.Resize(sgmm.NumGauss(), sgmm.FeatureDim(),
-                    sgmm.PhoneSpaceDim());
-
-  std::vector<int32> gselect;
-  sgmm.GaussianSelection(config, feat, &gselect);
-  Sgmm2PerSpkDerivedVars empty;
-  Sgmm2PerFrameDerivedVars per_frame;
-  sgmm.ComputePerFrameVars(feat, gselect, empty, &per_frame);
-  Sgmm2LikelihoodCache sgmm_cache(sgmm.NumGroups(), sgmm.NumPdfs());
-  BaseFloat loglike = sgmm.LogLikelihood(per_frame, 0, &sgmm_cache, &empty);
-  sgmm_cache.NextFrame();
-
-  // First, test the CopyFromSgmm2() method:
-  AmSgmm2 *sgmm1 = new AmSgmm2();
-  sgmm1->CopyFromSgmm2(sgmm, true, true);
-  sgmm1->GaussianSelection(config, feat, &gselect);
-  sgmm1->ComputePerFrameVars(feat, gselect, empty, &per_frame);
-  sgmm_cache.NextFrame();
-  BaseFloat loglike1 = sgmm1->LogLikelihood(per_frame, 0, &sgmm_cache, &empty);
-  kaldi::AssertEqual(loglike, loglike1, 1e-4);
-  delete sgmm1;
-
-  AmSgmm2 *sgmm2 = new AmSgmm2();
-  sgmm2->CopyFromSgmm2(sgmm, false, false);
-  sgmm2->ComputeNormalizers();
-  sgmm2->ComputeWeights();
-  sgmm2->GaussianSelection(config, feat, &gselect);
-  sgmm2->ComputePerFrameVars(feat, gselect, empty, &per_frame);
-  sgmm_cache.NextFrame();
-  BaseFloat loglike2 = sgmm2->LogLikelihood(per_frame, 0, &sgmm_cache, &empty);
-  kaldi::AssertEqual(loglike, loglike2, 1e-4);
-  delete sgmm2;
-
-  // Next, initialize using the UBM from the current model
-  AmSgmm2 *sgmm3 = new AmSgmm2();
-  {
-    std::vector<int32> pdf2group(sgmm.NumPdfs());
-    for (int32 i = 0; i < sgmm.NumPdfs(); i++) pdf2group[i] = sgmm.Pdf2Group(i);
-    sgmm3->InitializeFromFullGmm(sgmm.full_ubm(), pdf2group,
-                                 sgmm.PhoneSpaceDim(), sgmm.SpkSpaceDim(), true, 0.9);
-  }
-  sgmm3->ComputeNormalizers();
-  sgmm3->GaussianSelection(config, feat, &gselect);
-  sgmm3->ComputePerFrameVars(feat, gselect, empty, &per_frame);
-  sgmm_cache.NextFrame();
-  BaseFloat loglike3 = sgmm3->LogLikelihood(per_frame, 0, &sgmm_cache, &empty);
-  kaldi::AssertEqual(loglike, loglike3, 1e-4);
-  delete sgmm3;
-}
-
-// Tests the Read() and Write() methods, in both binary and ASCII mode, as well
-// as Check(), and methods in likelihood computations.
-void TestSgmm2IO(const AmSgmm2 &sgmm) {
-  using namespace kaldi;
-  int32 dim = sgmm.FeatureDim();
-  kaldi::Sgmm2GselectConfig config;
-  config.full_gmm_nbest = std::min(config.full_gmm_nbest, sgmm.NumGauss());
-
-  kaldi::Vector<BaseFloat> feat(dim);
-  for (int32 d = 0; d < dim; d++) {
-    feat(d) = kaldi::RandGauss();
-  }
-  kaldi::Sgmm2PerFrameDerivedVars frame_vars;
-  frame_vars.Resize(sgmm.NumGauss(), sgmm.FeatureDim(),
-                    sgmm.PhoneSpaceDim());
-
-  std::vector<int32> gselect;
-  sgmm.GaussianSelection(config, feat, &gselect);
-  Sgmm2PerSpkDerivedVars empty;
-  Sgmm2PerFrameDerivedVars per_frame;
-  sgmm.ComputePerFrameVars(feat, gselect, empty, &per_frame);
-  Sgmm2LikelihoodCache sgmm_cache(sgmm.NumGroups(), sgmm.NumPdfs());
-  BaseFloat loglike = sgmm.LogLikelihood(per_frame, 0, &sgmm_cache, &empty);
-
-  // First, non-binary write
-  sgmm.Write(kaldi::Output("tmpf", false).Stream(), false,
-      kaldi::kSgmmWriteAll);
-
-  bool binary_in;
-  AmSgmm2 *sgmm1 = new AmSgmm2();
-  // Non-binary read
-  kaldi::Input ki1("tmpf", &binary_in);
-  sgmm1->Read(ki1.Stream(), binary_in);
-  sgmm1->Check(true);
-  sgmm1->GaussianSelection(config, feat, &gselect);
-  sgmm1->ComputePerFrameVars(feat, gselect, empty, &per_frame);
-  BaseFloat loglike1 = sgmm1->LogLikelihood(per_frame, 0, &sgmm_cache, &empty);
-  kaldi::AssertEqual(loglike, loglike1, 1e-4);
-
-  // Next, binary write
-  sgmm1->Write(kaldi::Output("tmpfb", true).Stream(), true,
-      kaldi::kSgmmWriteAll);
-  delete sgmm1;
-
-  AmSgmm2 *sgmm2 = new AmSgmm2();
-  // Binary read
-  kaldi::Input ki2("tmpfb", &binary_in);
-  sgmm2->Read(ki2.Stream(), binary_in);
-  sgmm2->Check(true);
-  sgmm2->GaussianSelection(config, feat, &gselect);
-  sgmm2->ComputePerFrameVars(feat, gselect, empty, &per_frame);
-  BaseFloat loglike2 = sgmm2->LogLikelihood(per_frame, 0, &sgmm_cache, &empty);
-  kaldi::AssertEqual(loglike, loglike2, 1e-4);
-  delete sgmm2;
-  unlink("tmpf");
-  unlink("tmpfb");
-}
-
-void TestSgmm2Substates(const AmSgmm2 &sgmm) {
-  using namespace kaldi;
-  int32 target_substates = 2 * sgmm.NumPdfs();
-  kaldi::Vector<BaseFloat> occs(sgmm.NumPdfs());
-  for (int32 i = 0; i < occs.Dim(); i++)
-    occs(i) = std::fabs(kaldi::RandGauss()) * (kaldi::RandUniform()+1);
-  AmSgmm2 *sgmm1 = new AmSgmm2();
-  sgmm1->CopyFromSgmm2(sgmm, false, false);
-  Sgmm2SplitSubstatesConfig cfg;
-  cfg.split_substates = target_substates;
-  sgmm1->SplitSubstates(occs, cfg);
-  sgmm1->ComputeNormalizers();
-  sgmm1->ComputeWeights();
-  sgmm1->Check(true);
-  int32 dim = sgmm.FeatureDim();
-  kaldi::Sgmm2GselectConfig config;
-  config.full_gmm_nbest = std::min(config.full_gmm_nbest, sgmm.NumGauss());
-  kaldi::Vector<BaseFloat> feat(dim);
-  for (int32 d = 0; d < dim; d++) {
-    feat(d) = kaldi::RandGauss();
-  }
-
-  std::vector<int32> gselect;
-  sgmm.GaussianSelection(config, feat, &gselect);
-
-  Sgmm2PerSpkDerivedVars empty;
-  Sgmm2PerFrameDerivedVars per_frame;
-  sgmm.ComputePerFrameVars(feat, gselect, empty, &per_frame);
-  Sgmm2LikelihoodCache sgmm_cache(sgmm.NumGroups(), sgmm.NumPdfs());  
-  BaseFloat loglike = sgmm.LogLikelihood(per_frame, 0, &sgmm_cache, &empty);
-
-  sgmm1->GaussianSelection(config, feat, &gselect);
-  sgmm1->ComputePerFrameVars(feat, gselect, empty, &per_frame);
-  sgmm_cache.NextFrame();
-  BaseFloat loglike1 = sgmm1->LogLikelihood(per_frame, 0, &sgmm_cache, &empty);
-  kaldi::AssertEqual(loglike, loglike1, 1e-2);
-
-  delete sgmm1;
-}
-
-void TestSgmm2IncreaseDim(const AmSgmm2 &sgmm) {
-  using namespace kaldi;
-  int32 target_phn_dim = static_cast<int32>(1.5 * sgmm.PhoneSpaceDim());
-  int32 target_spk_dim = sgmm.PhoneSpaceDim() - 1;
-
-  int32 dim = sgmm.FeatureDim();
-  kaldi::Sgmm2GselectConfig config;
-  config.full_gmm_nbest = std::min(config.full_gmm_nbest, sgmm.NumGauss());
-  kaldi::Vector<BaseFloat> feat(dim);
-  for (int32 d = 0; d < dim; d++) {
-    feat(d) = kaldi::RandGauss();
-  }
-  kaldi::Sgmm2PerFrameDerivedVars frame_vars;
-
-  std::vector<int32> gselect;
-  sgmm.GaussianSelection(config, feat, &gselect);
-  Sgmm2PerSpkDerivedVars empty;
-  Sgmm2PerFrameDerivedVars per_frame;  
-  sgmm.ComputePerFrameVars(feat, gselect, empty, &per_frame);
-  Sgmm2LikelihoodCache sgmm_cache(sgmm.NumGroups(), sgmm.NumPdfs());  
-  BaseFloat loglike = sgmm.LogLikelihood(per_frame, 0, &sgmm_cache, &empty);
-
-  kaldi::Matrix<BaseFloat> norm_xform;
-  kaldi::ComputeFeatureNormalizingTransform(sgmm.full_ubm(), &norm_xform);
-  AmSgmm2 *sgmm1 = new AmSgmm2();
-  sgmm1->CopyFromSgmm2(sgmm, false, false);
-  sgmm1->Check(true);
-  sgmm1->IncreasePhoneSpaceDim(target_phn_dim, norm_xform);
-  sgmm1->ComputeNormalizers();
-  sgmm1->Check(true);
-
-
-  sgmm1->GaussianSelection(config, feat, &gselect);
-  sgmm1->ComputePerFrameVars(feat, gselect, empty, &per_frame);
-  sgmm_cache.NextFrame();
-  BaseFloat loglike1 = sgmm1->LogLikelihood(per_frame, 0, &sgmm_cache, &empty);
-  kaldi::AssertEqual(loglike, loglike1, 1e-4);
-
-  sgmm1->IncreaseSpkSpaceDim(target_spk_dim, norm_xform, true);
-  sgmm1->Check(true);
-  sgmm1->GaussianSelection(config, feat, &gselect);
-  sgmm1->ComputePerFrameVars(feat, gselect, empty, &per_frame);
-  sgmm_cache.NextFrame();
-  BaseFloat loglike2 = sgmm1->LogLikelihood(per_frame, 0, &sgmm_cache, &empty);
-  kaldi::AssertEqual(loglike, loglike2, 1e-4);
-  delete sgmm1;
-}
-
-void TestSgmm2PreXform(const AmSgmm2 &sgmm) {
-  kaldi::Matrix<BaseFloat> xform, inv_xform;
-  kaldi::Vector<BaseFloat> diag_scatter;
-  kaldi::Vector<BaseFloat> occs(sgmm.NumPdfs());
-  occs.Set(100);
-  sgmm.ComputeFmllrPreXform(occs, &xform, &inv_xform, &diag_scatter);
-  int32 dim = xform.NumRows();
-  kaldi::SubMatrix<BaseFloat> a_pre(xform, 0, dim, 0, dim),
-      a_inv(inv_xform, 0, dim, 0, dim);
-  kaldi::Vector<BaseFloat> b_pre(dim), b_inv(dim);
-  b_pre.CopyColFromMat(xform, dim);
-  b_inv.CopyColFromMat(inv_xform, dim);
-  kaldi::Matrix<BaseFloat> res_mat(dim, dim, kaldi::kSetZero);
-  res_mat.AddMatMat(1.0, a_pre, kaldi::kNoTrans, a_inv, kaldi::kNoTrans, 0.0);
-  KALDI_ASSERT(res_mat.IsUnit(1.0e-5));
-  kaldi::Vector<BaseFloat> res_vec(dim, kaldi::kSetZero);
-  res_vec.AddMatVec(1.0, a_inv, kaldi::kNoTrans, b_pre, 0.0);
-  res_vec.AddVec(1.0, b_inv);
-  KALDI_ASSERT(res_vec.IsZero(1.0e-5));
-}
-
-void UnitTestSgmm2() {
-  size_t dim = 1 + kaldi::RandInt(0, 9);  // random dimension of the gmm
-  size_t num_comp = 1 + kaldi::RandInt(0, 9);  // random number of mixtures
-  kaldi::FullGmm full_gmm;
-  ut::InitRandFullGmm(dim, num_comp, &full_gmm);
-
-  std::vector<int32> pdf2group;
-  pdf2group.push_back(0);
-  AmSgmm2 sgmm;
-  kaldi::Sgmm2GselectConfig config;
-  sgmm.InitializeFromFullGmm(full_gmm, pdf2group, dim+1, 0, true, 0.9);
-  sgmm.ComputeNormalizers();
-  TestSgmm2Init(sgmm);
-  TestSgmm2IO(sgmm);
-  TestSgmm2Substates(sgmm);
-  TestSgmm2IncreaseDim(sgmm);
-  TestSgmm2PreXform(sgmm);
-}
-
-int main() {
-  for (int i = 0; i < 10; i++)
-    UnitTestSgmm2();
-  std::cout << "Test OK.\n";
-  return 0;
-}
diff --git a/src/sgmm2/am-sgmm2.cc b/src/sgmm2/am-sgmm2.cc
deleted file mode 100644
index d249a5ab8b2..00000000000
--- a/src/sgmm2/am-sgmm2.cc
+++ /dev/null
@@ -1,1493 +0,0 @@
-// sgmm2/am-sgmm2.cc
-
-// Copyright 2009-2011  Microsoft Corporation;  Lukas Burget;
-//                      Saarland University (Author: Arnab Ghoshal);
-//                      Ondrej Glembek;  Yanmin Qian;
-// Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey)
-//                      Liang Lu;  Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <functional>
-
-#include "sgmm2/am-sgmm2.h"
-#include "util/kaldi-thread.h"
-
-namespace kaldi {
-using std::vector;
-
-// This function needs to be added because std::generate is complaining
-// about RandGauss(), which takes an optional arguments.
-static inline float _RandGauss()
-{
-  return RandGauss();
-}
-
-void Sgmm2LikelihoodCache::NextFrame() {
-  t++;
-  if (t == 0) {
-    t++; // skip over zero; zero is used to invalidate frames.
-    for (size_t i = 0; i < substate_cache.size(); i++)
-      substate_cache[i].t = 0;
-    for (size_t i = 0; i < pdf_cache.size(); i++)
-      pdf_cache[i].t = 0;
-  }
-}
-
-void AmSgmm2::ComputeGammaI(const Vector<BaseFloat> &state_occupancies,
-                            Vector<BaseFloat> *gamma_i) const {
-  KALDI_ASSERT(state_occupancies.Dim() == NumPdfs());
-  Vector<BaseFloat> w_jm(NumGauss());
-  gamma_i->Resize(NumGauss());
-  for (int32 j1 = 0; j1 < NumGroups(); j1++) {
-    int32 M = NumSubstatesForGroup(j1);
-    const std::vector<int32> &pdfs = group2pdf_[j1];
-    Vector<BaseFloat> substate_weight(M); // total weight for each substate.
-    for (size_t i = 0; i < pdfs.size(); i++) {
-      int32 j2 = pdfs[i];
-      substate_weight.AddVec(state_occupancies(j2), c_[j2]);
-    }
-    for (int32 m = 0; m < M; m++) {
-      w_jm.AddMatVec(1.0, w_, kNoTrans, v_[j1].Row(m), 0.0);
-      w_jm.ApplySoftMax();
-      gamma_i->AddVec(substate_weight(m), w_jm);
-    }
-  }
-}
-
-
-void AmSgmm2::ComputePdfMappings() {
-  if (pdf2group_.empty()) {
-    KALDI_WARN << "ComputePdfMappings(): no pdf2group_ map, assuming you "
-        "are reading in old model.";
-    KALDI_ASSERT(v_.size() != 0);
-    pdf2group_.resize(v_.size());
-    for (int32 j2 = 0; j2 < static_cast<int32>(pdf2group_.size()); j2++)
-      pdf2group_[j2] = j2;
-  }
-  group2pdf_.clear();
-  for (int32 j2 = 0; j2 < static_cast<int32>(pdf2group_.size()); j2++) {
-    int32 j1 = pdf2group_[j2];
-    if (group2pdf_.size() <= j1) group2pdf_.resize(j1+1);
-    group2pdf_[j1].push_back(j2);
-  }
-}
-
-void AmSgmm2::Read(std::istream &in_stream, bool binary) {
-  { // We want this to work even if the object was previously
-    // populated, so we clear the items that are more likely
-    // to cause problems.
-    pdf2group_.clear();
-    group2pdf_.clear();
-    u_.Resize(0,0);
-    w_jmi_.clear();
-    v_.clear();
-  }
-  // removing anything that was in the object before.
-  int32 num_pdfs = -1, feat_dim, num_gauss;
-  std::string token;
-
-  ExpectToken(in_stream, binary, "<SGMM>");
-  ExpectToken(in_stream, binary, "<NUMSTATES>");
-  ReadBasicType(in_stream, binary, &num_pdfs);
-  ExpectToken(in_stream, binary, "<DIMENSION>");
-  ReadBasicType(in_stream, binary, &feat_dim);
-  ExpectToken(in_stream, binary, "<NUMGAUSS>");
-  ReadBasicType(in_stream, binary, &num_gauss);
-
-  KALDI_ASSERT(num_pdfs > 0 && feat_dim > 0);
-
-  ReadToken(in_stream, binary, &token);
-
-  while (token != "</SGMM>") {
-    if (token == "<PDF2GROUP>") {
-      ReadIntegerVector(in_stream, binary, &pdf2group_);
-      ComputePdfMappings();
-    } else if (token == "<WEIGHTIDX2GAUSS>") {  // TEMP!   Will remove.
-      std::vector<int32> garbage;
-      ReadIntegerVector(in_stream, binary, &garbage);
-    } else if (token == "<DIAG_UBM>") {
-      diag_ubm_.Read(in_stream, binary);
-    } else if (token == "<FULL_UBM>") {
-      full_ubm_.Read(in_stream, binary);
-    } else if (token == "<SigmaInv>") {
-      SigmaInv_.resize(num_gauss);
-      for (int32 i = 0; i < num_gauss; i++) {
-        SigmaInv_[i].Read(in_stream, binary);
-      }
-    } else if (token == "<M>") {
-      M_.resize(num_gauss);
-      for (int32 i = 0; i < num_gauss; i++) {
-        M_[i].Read(in_stream, binary);
-      }
-    } else if (token == "<N>") {
-      N_.resize(num_gauss);
-      for (int32 i = 0; i < num_gauss; i++) {
-        N_[i].Read(in_stream, binary);
-      }
-    } else if (token == "<w>") {
-      w_.Read(in_stream, binary);
-    } else if (token == "<u>") {
-      u_.Read(in_stream, binary);
-    } else if (token == "<v>") {
-      int32 num_groups = group2pdf_.size();
-      if (num_groups == 0) {
-        KALDI_WARN << "Reading old model with new code (should still work)";
-        num_groups = num_pdfs;
-      }
-      v_.resize(num_groups);
-      for (int32 j1 = 0; j1 < num_groups; j1++) {
-        v_[j1].Read(in_stream, binary);
-      }
-    } else if (token == "<c>") {
-      c_.resize(num_pdfs);
-      for (int32 j2 = 0; j2 < num_pdfs; j2++) {
-        c_[j2].Read(in_stream, binary);
-      }
-    } else if (token == "<n>") {
-      int32 num_groups = group2pdf_.size();
-      if (num_groups == 0) num_groups = num_pdfs;
-      n_.resize(num_groups);
-      for (int32 j1 = 0; j1 < num_groups; j1++) {
-        n_[j1].Read(in_stream, binary);
-      }
-      // The following are the Gaussian prior parameters for MAP adaptation of M
-      // They may be moved to somewhere else eventually.
-    } else if (token == "<M_Prior>") {
-      ExpectToken(in_stream, binary, "<NUMGaussians>");
-      ReadBasicType(in_stream, binary, &num_gauss);
-      M_prior_.resize(num_gauss);
-      for (int32 i = 0; i < num_gauss; i++) {
-        M_prior_[i].Read(in_stream, binary);
-      }
-    } else if (token == "<Row_Cov_Inv>") {
-      row_cov_inv_.Read(in_stream, binary);
-    } else if (token == "<Col_Cov_Inv>") {
-      col_cov_inv_.Read(in_stream, binary);
-    } else {
-      KALDI_ERR << "Unexpected token '" << token << "' in model file ";
-    }
-    ReadToken(in_stream, binary, &token);
-  }
-
-  if (pdf2group_.empty())
-    ComputePdfMappings(); // sets up group2pdf_, and pdf2group_ if reading
-  // old model.
-
-  if (n_.empty())
-    ComputeNormalizers();
-  if (HasSpeakerDependentWeights())
-    ComputeWeights();
-}
-
-int32 AmSgmm2::Pdf2Group(int32 j2) const {
-  KALDI_ASSERT(static_cast<size_t>(j2) < pdf2group_.size());
-  int32 j1 = pdf2group_[j2];
-  return j1;
-}
-
-
-void AmSgmm2::Write(std::ostream &out_stream,
-                   bool binary,
-                   SgmmWriteFlagsType write_params) const {
-  int32 num_pdfs = NumPdfs(),
-      feat_dim = FeatureDim(),
-      num_gauss = NumGauss();
-
-  WriteToken(out_stream, binary, "<SGMM>");
-  if (!binary) out_stream << "\n";
-  WriteToken(out_stream, binary, "<NUMSTATES>");
-  WriteBasicType(out_stream, binary, num_pdfs);
-  WriteToken(out_stream, binary, "<DIMENSION>");
-  WriteBasicType(out_stream, binary, feat_dim);
-  WriteToken(out_stream, binary, "<NUMGAUSS>");
-  WriteBasicType(out_stream, binary, num_gauss);
-  if (!binary) out_stream << "\n";
-
-  if (write_params & kSgmmBackgroundGmms) {
-    WriteToken(out_stream, binary, "<DIAG_UBM>");
-    diag_ubm_.Write(out_stream, binary);
-    WriteToken(out_stream, binary, "<FULL_UBM>");
-    full_ubm_.Write(out_stream, binary);
-  }
-
-  if (write_params & kSgmmGlobalParams) {
-    WriteToken(out_stream, binary, "<SigmaInv>");
-    if (!binary) out_stream << "\n";
-    for (int32 i = 0; i < num_gauss; i++) {
-      SigmaInv_[i].Write(out_stream, binary);
-    }
-    WriteToken(out_stream, binary, "<M>");
-    if (!binary) out_stream << "\n";
-    for (int32 i = 0; i < num_gauss; i++) {
-      M_[i].Write(out_stream, binary);
-    }
-    if (N_.size() != 0) {
-      WriteToken(out_stream, binary, "<N>");
-      if (!binary) out_stream << "\n";
-      for (int32 i = 0; i < num_gauss; i++) {
-        N_[i].Write(out_stream, binary);
-      }
-    }
-    WriteToken(out_stream, binary, "<w>");
-    w_.Write(out_stream, binary);
-    WriteToken(out_stream, binary, "<u>");
-    u_.Write(out_stream, binary);
-  }
-
-  if (write_params & kSgmmStateParams) {
-    WriteToken(out_stream, binary, "<PDF2GROUP>");
-    WriteIntegerVector(out_stream, binary, pdf2group_);
-    WriteToken(out_stream, binary, "<v>");
-    for (int32 j1 = 0; j1 < NumGroups(); j1++) {
-      v_[j1].Write(out_stream, binary);
-    }
-    WriteToken(out_stream, binary, "<c>");
-    for (int32 j2 = 0; j2 < num_pdfs; j2++) {
-      c_[j2].Write(out_stream, binary);
-    }
-  }
-
-  if (write_params & kSgmmNormalizers) {
-    WriteToken(out_stream, binary, "<n>");
-    if (n_.empty())
-      KALDI_WARN << "Not writing normalizers since they are not present.";
-    else
-      for (int32 j1 = 0; j1 < NumGroups(); j1++)
-        n_[j1].Write(out_stream, binary);
-  }
-  WriteToken(out_stream, binary, "</SGMM>");
-}
-
-
-void AmSgmm2::Check(bool show_properties) {
-  int32 J1 = NumGroups(),
-      J2 = NumPdfs(),
-      num_gauss = NumGauss(),
-      feat_dim = FeatureDim(),
-      phn_dim = PhoneSpaceDim(),
-      spk_dim = SpkSpaceDim();
-
-  if (show_properties)
-    KALDI_LOG << "AmSgmm2: #pdfs = " << J2 << ", #pdf-groups = "
-              << J1 << ", #Gaussians = "
-              << num_gauss << ", feature dim = " << feat_dim
-              << ", phone-space dim =" << phn_dim
-              << ", speaker-space dim =" << spk_dim;
-  KALDI_ASSERT(J1 > 0 && num_gauss > 0 && feat_dim > 0 && phn_dim > 0
-               && J2 > 0 && J2 >= J1);
-
-  std::ostringstream debug_str;
-
-  // First check the diagonal-covariance UBM.
-  KALDI_ASSERT(diag_ubm_.NumGauss() == num_gauss);
-  KALDI_ASSERT(diag_ubm_.Dim() == feat_dim);
-
-  // Check the full-covariance UBM.
-  KALDI_ASSERT(full_ubm_.NumGauss() == num_gauss);
-  KALDI_ASSERT(full_ubm_.Dim() == feat_dim);
-
-  // Check the globally-shared covariance matrices.
-  KALDI_ASSERT(SigmaInv_.size() == static_cast<size_t>(num_gauss));
-  for (int32 i = 0; i < num_gauss; i++) {
-    KALDI_ASSERT(SigmaInv_[i].NumRows() == feat_dim &&
-                 SigmaInv_[i](0, 0) > 0.0);  // or it wouldn't be +ve definite.
-  }
-
-  if (spk_dim != 0) {
-    KALDI_ASSERT(N_.size() == static_cast<size_t>(num_gauss));
-    for (int32 i = 0; i < num_gauss; i++)
-      KALDI_ASSERT(N_[i].NumRows() == feat_dim && N_[i].NumCols() == spk_dim);
-    if (u_.NumRows() == 0) {
-      debug_str << "Speaker-weight projections: no.";
-    } else {
-      KALDI_ASSERT(u_.NumRows() == num_gauss && u_.NumCols() == spk_dim);
-      debug_str << "Speaker-weight projections: yes.";
-    }
-  } else {
-    KALDI_ASSERT(N_.size() == 0 && u_.NumRows() == 0);
-  }
-
-  KALDI_ASSERT(M_.size() == static_cast<size_t>(num_gauss));
-  for (int32 i = 0; i < num_gauss; i++) {
-    KALDI_ASSERT(M_[i].NumRows() == feat_dim && M_[i].NumCols() == phn_dim);
-  }
-
-  KALDI_ASSERT(w_.NumRows() == num_gauss && w_.NumCols() == phn_dim);
-
-  {  // check v, c.
-    KALDI_ASSERT(v_.size() == static_cast<size_t>(J1) &&
-                 c_.size() == static_cast<size_t>(J2));
-    int32 nSubstatesTot = 0;
-    for (int32 j1 = 0; j1 < J1; j1++) {
-      int32 M_j = NumSubstatesForGroup(j1);
-      nSubstatesTot += M_j;
-      KALDI_ASSERT(M_j > 0 && v_[j1].NumRows() == M_j &&
-                   v_[j1].NumCols() == phn_dim);
-    }
-    debug_str << "Substates: "<< (nSubstatesTot) << ".  ";
-    int32 nSubstateWeights = 0;
-    for (int32 j2 = 0; j2 < J2; j2++) {
-      int32 j1 = Pdf2Group(j2);
-      int32 M = NumSubstatesForPdf(j2);
-      KALDI_ASSERT(M == NumSubstatesForGroup(j1));
-      nSubstateWeights += M;
-    }
-    KALDI_ASSERT(nSubstateWeights >= nSubstatesTot);
-    debug_str << "SubstateWeights: "<< (nSubstateWeights) << ".  ";
-  }
-
-  // check normalizers.
-  if (n_.size() == 0) {
-    debug_str << "Normalizers: no.  ";
-  } else {
-    debug_str << "Normalizers: yes.  ";
-    KALDI_ASSERT(n_.size() == static_cast<size_t>(J1));
-    for (int32 j1 = 0; j1 < J1; j1++) {
-      KALDI_ASSERT(n_[j1].NumRows() == num_gauss &&
-                   n_[j1].NumCols() == NumSubstatesForGroup(j1));
-    }
-  }
-
-  // check w_jmi_.
-  if (w_jmi_.size() == 0) {
-    debug_str << "Computed weights: no.  ";
-  } else {
-    debug_str << "Computed weights: yes.  ";
-    KALDI_ASSERT(w_jmi_.size() == static_cast<size_t>(J1));
-    for (int32 j1 = 0; j1 < J1; j1++) {
-      KALDI_ASSERT(w_jmi_[j1].NumRows() == NumSubstatesForGroup(j1) &&
-                   w_jmi_[j1].NumCols() == num_gauss);
-    }
-  }
-
-  if (show_properties)
-    KALDI_LOG << "Subspace GMM model properties: " << debug_str.str();
-}
-
-void AmSgmm2::InitializeFromFullGmm(const FullGmm &full_gmm,
-                                    const std::vector<int32> &pdf2group,
-                                    int32 phn_subspace_dim,
-                                    int32 spk_subspace_dim,
-                                    bool speaker_dependent_weights,
-                                    BaseFloat self_weight) {
-  pdf2group_ = pdf2group;
-  ComputePdfMappings();
-  full_ubm_.CopyFromFullGmm(full_gmm);
-  diag_ubm_.CopyFromFullGmm(full_gmm);
-  if (phn_subspace_dim < 1 || phn_subspace_dim > full_gmm.Dim() + 1) {
-    KALDI_WARN << "Initial phone-subspace dimension must be >= 1, value is "
-               << phn_subspace_dim << "; setting to " << full_gmm.Dim() + 1;
-    phn_subspace_dim = full_gmm.Dim() + 1;
-  }
-  KALDI_ASSERT(spk_subspace_dim >= 0);
-
-  w_.Resize(0, 0);
-  N_.clear();
-  c_.clear();
-  v_.clear();
-  SigmaInv_.clear();
-
-  KALDI_LOG << "Initializing model";
-  Matrix<BaseFloat> norm_xform;
-  ComputeFeatureNormalizingTransform(full_gmm, &norm_xform);
-  InitializeMw(phn_subspace_dim, norm_xform);
-  if (spk_subspace_dim > 0)
-    InitializeNu(spk_subspace_dim, norm_xform, speaker_dependent_weights);
-  InitializeVecsAndSubstateWeights(self_weight);
-  KALDI_LOG << "Initializing variances";
-  InitializeCovars();
-}
-
-void AmSgmm2::CopyFromSgmm2(const AmSgmm2 &other,
-                          bool copy_normalizers,
-                          bool copy_weights) {
-  KALDI_LOG << "Copying AmSgmm2";
-  pdf2group_ = other.pdf2group_;
-  group2pdf_ = other.group2pdf_;
-
-  // Copy background GMMs
-  diag_ubm_.CopyFromDiagGmm(other.diag_ubm_);
-  full_ubm_.CopyFromFullGmm(other.full_ubm_);
-
-  // Copy global params
-  SigmaInv_ = other.SigmaInv_;
-  M_ = other.M_;
-  w_ = other.w_;
-  N_ = other.N_;
-  u_ = other.u_;
-
-  // Copy state-specific params, but only copy normalizers if requested.
-  v_ = other.v_;
-  c_ = other.c_;
-  if (copy_normalizers) n_ = other.n_;
-  if (copy_weights) w_jmi_ = other.w_jmi_;
-
-  KALDI_LOG << "Done.";
-}
-
-void AmSgmm2::ComputePerFrameVars(const VectorBase<BaseFloat> &data,
-                                 const std::vector<int32> &gselect,
-                                 const Sgmm2PerSpkDerivedVars &spk_vars,
-                                 Sgmm2PerFrameDerivedVars *per_frame_vars) const {
-  KALDI_ASSERT(!n_.empty() && "ComputeNormalizers() must be called.");
-
-  per_frame_vars->Resize(gselect.size(), FeatureDim(), PhoneSpaceDim());
-
-  per_frame_vars->gselect = gselect;
-  per_frame_vars->xt.CopyFromVec(data);
-
-  for (int32 ki = 0, last = gselect.size(); ki < last; ki++) {
-    int32 i = gselect[ki];
-    per_frame_vars->xti.Row(ki).CopyFromVec(per_frame_vars->xt);
-    if (spk_vars.v_s.Dim() != 0)
-      per_frame_vars->xti.Row(ki).AddVec(-1.0, spk_vars.o_s.Row(i));
-  }
-  Vector<BaseFloat> SigmaInv_xt(FeatureDim());
-
-  bool speaker_dep_weights =
-      (spk_vars.v_s.Dim() != 0 && HasSpeakerDependentWeights());
-  for (int32 ki = 0, last = gselect.size(); ki < last; ki++) {
-    int32 i = gselect[ki];
-    BaseFloat ssgmm_term = (speaker_dep_weights ? spk_vars.log_b_is(i) : 0.0);
-    SigmaInv_xt.AddSpVec(1.0, SigmaInv_[i], per_frame_vars->xti.Row(ki), 0.0);
-    // Eq (35): z_{i}(t) = M_{i}^{T} \Sigma_{i}^{-1} x_{i}(t)
-    per_frame_vars->zti.Row(ki).AddMatVec(1.0, M_[i], kTrans, SigmaInv_xt, 0.0);
-    // Eq.(36): n_{i}(t) = -0.5 x_{i}^{T} \Sigma_{i}^{-1} x_{i}(t)
-    per_frame_vars->nti(ki) = -0.5 * VecVec(per_frame_vars->xti.Row(ki),
-                                            SigmaInv_xt) + ssgmm_term;
-  }
-}
-
-// inline
-void AmSgmm2::ComponentLogLikes(const Sgmm2PerFrameDerivedVars &per_frame_vars,
-                               int32 j1,
-                               Sgmm2PerSpkDerivedVars *spk_vars,
-                               Matrix<BaseFloat> *loglikes) const {
-  const vector<int32> &gselect = per_frame_vars.gselect;
-  int32 num_gselect = gselect.size(), num_substates = v_[j1].NumRows();
-
-  // Eq.(37): log p(x(t), m, i|j)  [indexed by j, ki]
-  // Although the extra memory allocation of storing this as a
-  // matrix might seem unnecessary, we save time in the LogSumExp()
-  // via more effective pruning.
-  loglikes->Resize(num_gselect, num_substates);
-  bool speaker_dep_weights =
-      (spk_vars->v_s.Dim() != 0 && HasSpeakerDependentWeights());
-  if (speaker_dep_weights) {
-    KALDI_ASSERT(static_cast<int32>(spk_vars->log_d_jms.size()) == NumGroups());
-    KALDI_ASSERT(static_cast<int32>(w_jmi_.size()) == NumGroups() ||
-                 "You need to call ComputeWeights().");
-  }
-  for (int32 ki = 0;  ki < num_gselect; ki++) {
-    SubVector<BaseFloat> logp_xi(*loglikes, ki);
-    int32 i = gselect[ki];
-    // for all substates, compute z_{i}^T v_{jm}
-    logp_xi.AddMatVec(1.0, v_[j1], kNoTrans, per_frame_vars.zti.Row(ki), 0.0);
-    logp_xi.AddVec(1.0, n_[j1].Row(i));  // for all substates, add n_{jim}
-    logp_xi.Add(per_frame_vars.nti(ki));  // for all substates, add n_{i}(t)
-  }
-  if (speaker_dep_weights) { // [SSGMM]
-    Vector<BaseFloat> &log_d = spk_vars->log_d_jms[j1];
-    if (log_d.Dim() == 0) { // have not yet cached this quantity.
-      log_d.Resize(num_substates);
-      log_d.AddMatVec(1.0, w_jmi_[j1], kNoTrans, spk_vars->b_is, 0.0);
-      log_d.ApplyLog();
-    }
-    loglikes->AddVecToRows(-1.0, log_d); // [SSGMM] this is the term
-    // - log d_{jm}^{(s)} in the likelihood function [eq. 25 in
-    // the techreport]
-  }
-}
-
-
-BaseFloat AmSgmm2::LogLikelihood(const Sgmm2PerFrameDerivedVars &per_frame_vars,
-                                int32 j2,
-                                Sgmm2LikelihoodCache *cache,
-                                Sgmm2PerSpkDerivedVars *spk_vars,
-                                BaseFloat log_prune) const {
-  int32 t = cache->t; // not a real time; used to uniquely identify frames.
-  // Forgo asserts here, as this is frequently called.
-  // We'll probably get a segfault if an error is made.
-  Sgmm2LikelihoodCache::PdfCacheElement &pdf_cache =
-      cache->pdf_cache[j2];
-#ifdef KALDI_PARANOID
-  bool random_test = (Rand() % 1000 == 1); // to check that the user is
-  // calling Next() on the cache, as they should.
-#else
-  bool random_test = false; // compiler will ignore test branches.
-#endif
-  if (pdf_cache.t == t) {
-    if (!random_test) return pdf_cache.log_like;
-  } else {
-    random_test = false;
-  }
-  // if random_test == true at this point, it was already cached, and we will
-  // verify that we return the same value as the cached one.
-  pdf_cache.t = t;
-
-  int32 j1 = pdf2group_[j2];
-  Sgmm2LikelihoodCache::SubstateCacheElement &substate_cache =
-      cache->substate_cache[j1];
-  if (substate_cache.t != t) { // Need to compute sub-state likelihoods.
-    substate_cache.t = t;
-    Matrix<BaseFloat> loglikes; // indexed [gselect-index][substate-index]
-    ComponentLogLikes(per_frame_vars, j1, spk_vars, &loglikes);
-    BaseFloat max = loglikes.Max(); // use this to keep things in good numerical range.
-    loglikes.Add(-max);
-    loglikes.ApplyExp();
-    substate_cache.remaining_log_like = max;
-    int32 num_substates = loglikes.NumCols();
-    substate_cache.likes.Resize(num_substates); // zeroes it.
-    substate_cache.likes.AddRowSumMat(1.0, loglikes); // add likelihoods [not in log!] for
-    // each column [i.e. summing over the rows], so we get the sum for
-    // each substate index.  You have to multiply by exp(remaining_log_like)
-    // to get a real likelihood.
-  }
-
-  BaseFloat log_like = substate_cache.remaining_log_like
-      + Log(VecVec(substate_cache.likes, c_[j2]));
-
-  if (random_test)
-    KALDI_ASSERT(ApproxEqual(pdf_cache.log_like, log_like));
-
-  pdf_cache.log_like = log_like;
-  KALDI_ASSERT(log_like == log_like && log_like - log_like == 0); // check
-  // that it's not NaN or infinity.
-  return log_like;
-}
-
-BaseFloat
-AmSgmm2::ComponentPosteriors(const Sgmm2PerFrameDerivedVars &per_frame_vars,
-                            int32 j2,
-                            Sgmm2PerSpkDerivedVars *spk_vars,
-                            Matrix<BaseFloat> *post) const {
-  KALDI_ASSERT(j2 < NumPdfs() && post != NULL);
-  int32 j1 = pdf2group_[j2];
-  ComponentLogLikes(per_frame_vars, j1, spk_vars, post); // now
-  // post is a matrix of log-likelihoods indexed by [gaussian-selection index]
-  // [sub-state index].  It doesn't include the sub-state weights,
-  // though.
-  BaseFloat loglike = post->Max();
-  post->Add(-loglike); // get it to nicer numeric range.
-  post->ApplyExp(); // so we're dealing with likelihoods (with an arbitrary offset
-  // "loglike" removed to make it in a nice numeric range)
-  post->MulColsVec(c_[j2]); // include the sub-state weights.
-
-  BaseFloat tot_like = post->Sum();
-  KALDI_ASSERT(tot_like != 0.0); // note: not valid to have zero weights.
-  loglike += Log(tot_like);
-  post->Scale(1.0 / tot_like); // so "post" now sums to one, and "loglike"
-  // contains the correct log-likelihood of the data given the pdf.
-
-  return loglike;
-}
-
-void AmSgmm2::SplitSubstatesInGroup(const Vector<BaseFloat> &pdf_occupancies,
-                                    const Sgmm2SplitSubstatesConfig &opts,
-                                    const SpMatrix<BaseFloat> &sqrt_H_sm,
-                                    int32 j1,
-                                    int32 tgt_M) {
-  const std::vector<int32> &pdfs = group2pdf_[j1];
-  int32 phn_dim = PhoneSpaceDim(), cur_M = NumSubstatesForGroup(j1),
-      num_pdfs_for_group = pdfs.size();
-  Vector<BaseFloat> rand_vec(phn_dim), v_shift(phn_dim);
-
-  KALDI_ASSERT(tgt_M >= cur_M);
-  if (cur_M == tgt_M) return;
-  // Resize v[j1] to fit new substates
-  {
-    Matrix<BaseFloat> tmp_v_j(v_[j1]);
-    v_[j1].Resize(tgt_M, phn_dim);
-    v_[j1].Range(0, cur_M, 0, phn_dim).CopyFromMat(tmp_v_j);
-  }
-
-  // we'll use a temporary matrix for the c quantities.
-  Matrix<BaseFloat> c_j(num_pdfs_for_group, tgt_M);
-  for (int32 i = 0; i < num_pdfs_for_group; i++) {
-    int32 j2 = pdfs[i];
-    c_j.Row(i).Range(0, cur_M).CopyFromVec(c_[j2]);
-  }
-
-  // Keep splitting substates until obtaining the desired number
-  for (; cur_M < tgt_M; cur_M++) {
-    int32 split_m; // substate to split.
-    {
-      Vector<BaseFloat> substate_count(tgt_M);
-      substate_count.AddRowSumMat(1.0, c_j);
-      BaseFloat *data = substate_count.Data();
-      split_m = std::max_element(data, data+cur_M) - data;
-    }
-    for (int32 i = 0; i < num_pdfs_for_group; i++) { // divide count of split
-      // substate. [extended for SCTM]
-      // c_{jkm} := c_{jmk}' := c_{jkm} / 2
-      c_j(i, split_m) = c_j(i, cur_M) = c_j(i, split_m) / 2;
-    }
-    // v_{jkm} := +/- split_perturb * H_k^{(sm)}^{-0.5} * rand_vec
-    std::generate(rand_vec.Data(), rand_vec.Data() + rand_vec.Dim(),
-                  _RandGauss);
-    v_shift.AddSpVec(opts.perturb_factor, sqrt_H_sm, rand_vec, 0.0);
-    v_[j1].Row(cur_M).CopyFromVec(v_[j1].Row(split_m));
-    v_[j1].Row(cur_M).AddVec(1.0, v_shift);
-    v_[j1].Row(split_m).AddVec(-1.0, v_shift);
-  }
-  // copy the temporary matrix for the c_ (sub-state weight)
-  // quantities back to the place it belongs.
-  for (int32 i = 0; i < num_pdfs_for_group; i++) {
-    int32 j2 = pdfs[i];
-    c_[j2].Resize(tgt_M);
-    c_[j2].CopyFromVec(c_j.Row(i));
-  }
-}
-
-
-void AmSgmm2::SplitSubstates(const Vector<BaseFloat> &pdf_occupancies,
-                             const Sgmm2SplitSubstatesConfig &opts) {
-  KALDI_ASSERT(pdf_occupancies.Dim() == NumPdfs());
-  int32 J1 = NumGroups(), J2 = NumPdfs();
-  Vector<BaseFloat> group_occupancies(J1);
-  for (int32 j2 = 0; j2 < J2; j2++)
-    group_occupancies(Pdf2Group(j2)) += pdf_occupancies(j2);
-
-  vector<int32> tgt_num_substates;
-
-  GetSplitTargets(group_occupancies, opts.split_substates,
-                  opts.power, opts.min_count, &tgt_num_substates);
-
-  int32 tot_num_substates_old = 0, tot_num_substates_new = 0;
-  vector< SpMatrix<BaseFloat> > H_i;
-  SpMatrix<BaseFloat> sqrt_H_sm;
-
-  ComputeH(&H_i);  // set up that array.
-  ComputeHsmFromModel(H_i, pdf_occupancies, &sqrt_H_sm, opts.max_cond);
-  H_i.clear();
-  sqrt_H_sm.ApplyPow(-0.5);
-
-  for (int32 j1 = 0; j1 < J1; j1++) {
-    int32 cur_M = NumSubstatesForGroup(j1),
-        tgt_M = tgt_num_substates[j1];
-    tot_num_substates_old += cur_M;
-    tot_num_substates_new += std::max(cur_M, tgt_M);
-    if (cur_M < tgt_M)
-      SplitSubstatesInGroup(pdf_occupancies, opts, sqrt_H_sm, j1, tgt_M);
-  }
-  if (tot_num_substates_old == tot_num_substates_new) {
-    KALDI_LOG << "Not splitting substates; current #substates is "
-              << tot_num_substates_old << " and target is "
-              << opts.split_substates;
-  } else {
-    KALDI_LOG << "Getting rid of normalizers as they will no longer be valid";
-    n_.clear();
-    KALDI_LOG << "Split " << tot_num_substates_old << " substates to "
-              << tot_num_substates_new;
-  }
-}
-
-void AmSgmm2::IncreasePhoneSpaceDim(int32 target_dim,
-                                   const Matrix<BaseFloat> &norm_xform) {
-  KALDI_ASSERT(!M_.empty());
-  int32 initial_dim = PhoneSpaceDim(),
-      feat_dim = FeatureDim();
-  KALDI_ASSERT(norm_xform.NumRows() == feat_dim);
-
-  if (target_dim < initial_dim)
-    KALDI_ERR << "You asked to increase phn dim to a value lower than the "
-              << " current dimension, " << target_dim << " < " << initial_dim;
-
-  if (target_dim > initial_dim + feat_dim) {
-    KALDI_WARN << "Cannot increase phone subspace dimensionality from "
-               << initial_dim << " to " << target_dim << ", increasing to "
-               << initial_dim + feat_dim;
-    target_dim = initial_dim + feat_dim;
-  }
-
-  if (initial_dim < target_dim) {
-    Matrix<BaseFloat> tmp_M(feat_dim, initial_dim);
-    for (int32 i = 0; i < NumGauss(); i++) {
-      tmp_M.CopyFromMat(M_[i]);
-      M_[i].Resize(feat_dim, target_dim);
-      M_[i].Range(0, feat_dim, 0, tmp_M.NumCols()).CopyFromMat(tmp_M);
-      M_[i].Range(0, feat_dim, tmp_M.NumCols(),
-          target_dim - tmp_M.NumCols()).CopyFromMat(norm_xform.Range(0,
-              feat_dim, 0, target_dim-tmp_M.NumCols()));
-    }
-    Matrix<BaseFloat> tmp_w = w_;
-    w_.Resize(tmp_w.NumRows(), target_dim);
-    w_.Range(0, tmp_w.NumRows(), 0, tmp_w.NumCols()).CopyFromMat(tmp_w);
-
-    for (int32 j1 = 0; j1 < NumGroups(); j1++) {
-      // Resize phonetic-subspce vectors.
-      Matrix<BaseFloat> tmp_v_j = v_[j1];
-      v_[j1].Resize(tmp_v_j.NumRows(), target_dim);
-      v_[j1].Range(0, tmp_v_j.NumRows(), 0, tmp_v_j.NumCols()).CopyFromMat(
-          tmp_v_j);
-    }
-    KALDI_LOG << "Phone subspace dimensionality increased from " <<
-        initial_dim << " to " << target_dim;
-  } else {
-    KALDI_LOG << "Phone subspace dimensionality unchanged, since target " <<
-        "dimension (" << target_dim << ") <= initial dimansion (" <<
-        initial_dim << ")";
-  }
-}
-
-void AmSgmm2::IncreaseSpkSpaceDim(int32 target_dim,
-                                 const Matrix<BaseFloat> &norm_xform,
-                                 bool speaker_dependent_weights) {
-  int32 initial_dim = SpkSpaceDim(),
-      feat_dim = FeatureDim();
-  KALDI_ASSERT(norm_xform.NumRows() == feat_dim);
-
-  if (N_.size() == 0)
-    N_.resize(NumGauss());
-
-  if (target_dim < initial_dim)
-    KALDI_ERR << "You asked to increase spk dim to a value lower than the "
-              << " current dimension, " << target_dim << " < " << initial_dim;
-
-  if (target_dim > initial_dim + feat_dim) {
-    KALDI_WARN << "Cannot increase speaker subspace dimensionality from "
-               << initial_dim << " to " << target_dim << ", increasing to "
-               << initial_dim + feat_dim;
-    target_dim = initial_dim + feat_dim;
-  }
-
-  if (initial_dim < target_dim) {
-    int32 dim_change = target_dim - initial_dim;
-    Matrix<BaseFloat> tmp_N((initial_dim != 0) ? feat_dim : 0,
-                            initial_dim);
-    for (int32 i = 0; i < NumGauss(); i++) {
-      if (initial_dim != 0) tmp_N.CopyFromMat(N_[i]);
-      N_[i].Resize(feat_dim, target_dim);
-      if (initial_dim != 0) {
-        N_[i].Range(0, feat_dim, 0, tmp_N.NumCols()).CopyFromMat(tmp_N);
-      }
-      N_[i].Range(0, feat_dim, tmp_N.NumCols(), dim_change).CopyFromMat(
-          norm_xform.Range(0, feat_dim, 0, dim_change));
-    }
-    // if we already have speaker-dependent weights or we are increasing
-    // spk-dim from zero and are asked to add them...
-    if (u_.NumRows() != 0 || (initial_dim == 0 && speaker_dependent_weights))
-      u_.Resize(NumGauss(), target_dim, kCopyData); // extend dim of u_i's
-    KALDI_LOG << "Speaker subspace dimensionality increased from " <<
-        initial_dim << " to " << target_dim;
-    if (initial_dim == 0 && speaker_dependent_weights)
-      KALDI_LOG << "Added parameters u for speaker-dependent weights.";
-  } else {
-    KALDI_LOG << "Speaker subspace dimensionality unchanged, since target " <<
-        "dimension (" << target_dim << ") <= initial dimansion (" <<
-        initial_dim << ")";
-  }
-}
-
-void AmSgmm2::ComputeWeights() {
-  int32 J1 = NumGroups();
-  w_jmi_.resize(J1);
-  int32 i = NumGauss();
-  for (int32 j1 = 0; j1 < J1; j1++) {
-    int32 M = NumSubstatesForGroup(j1);
-    w_jmi_[j1].Resize(M, i);
-    w_jmi_[j1].AddMatMat(1.0, v_[j1], kNoTrans, w_, kTrans, 0.0);
-    // now w_jmi_ contains un-normalized log weights.
-    for (int32 m = 0; m < M; m++)
-      w_jmi_[j1].Row(m).ApplySoftMax(); // get the actual weights.
-  }
-}
-
-void AmSgmm2::ComputeDerivedVars() {
-  if (n_.empty()) ComputeNormalizers();
-  if (diag_ubm_.NumGauss() != full_ubm_.NumGauss()
-      || diag_ubm_.Dim() != full_ubm_.Dim()) {
-    diag_ubm_.CopyFromFullGmm(full_ubm_);
-  }
-  if (w_jmi_.empty() && HasSpeakerDependentWeights())
-    ComputeWeights();
-}
-
-class ComputeNormalizersClass: public MultiThreadable { // For multi-threaded.
- public:
-  ComputeNormalizersClass(AmSgmm2 *am_sgmm,
-                          int32 *entropy_count_ptr,
-                          double *entropy_sum_ptr):
-      am_sgmm_(am_sgmm), entropy_count_ptr_(entropy_count_ptr),
-      entropy_sum_ptr_(entropy_sum_ptr), entropy_count_(0),
-      entropy_sum_(0.0) { }
-
-  ComputeNormalizersClass(const ComputeNormalizersClass &other):
-      MultiThreadable(other),
-      am_sgmm_(other.am_sgmm_), entropy_count_ptr_(other.entropy_count_ptr_),
-      entropy_sum_ptr_(other.entropy_sum_ptr_), entropy_count_(0),
-      entropy_sum_(0.0) { }
-
-  ~ComputeNormalizersClass() {
-    *entropy_count_ptr_ += entropy_count_;
-    *entropy_sum_ptr_ += entropy_sum_;
-  }
-
-  inline void operator() () {
-    // Note: give them local copy of the sums we're computing,
-    // which will be propagated to original pointer in the destructor.
-    am_sgmm_->ComputeNormalizersInternal(num_threads_, thread_id_,
-                                         &entropy_count_,
-                                         &entropy_sum_);
-  }
- private:
-  ComputeNormalizersClass() { } // Disallow empty constructor.
-  AmSgmm2 *am_sgmm_;
-  int32 *entropy_count_ptr_;
-  double *entropy_sum_ptr_;
-  int32 entropy_count_;
-  double entropy_sum_;
-
-};
-
-void AmSgmm2::ComputeNormalizers() {
-  KALDI_LOG << "Computing normalizers";
-  n_.resize(NumPdfs());
-  int32 entropy_count = 0;
-  double entropy_sum = 0.0;
-  ComputeNormalizersClass c(this, &entropy_count, &entropy_sum);
-  RunMultiThreaded(c);
-
-  KALDI_LOG << "Entropy of weights in substates is "
-            << (entropy_sum / entropy_count) << " over " << entropy_count
-            << " substates, equivalent to perplexity of "
-            << (Exp(entropy_sum /entropy_count));
-  KALDI_LOG << "Done computing normalizers";
-}
-
-
-void AmSgmm2::ComputeNormalizersInternal(int32 num_threads, int32 thread,
-                                         int32 *entropy_count,
-                                         double *entropy_sum) {
-
-  BaseFloat DLog2pi = FeatureDim() * Log(2 * M_PI);
-  Vector<BaseFloat> log_det_Sigma(NumGauss());
-
-  for (int32 i = 0; i < NumGauss(); i++) {
-    try {
-      log_det_Sigma(i) = - SigmaInv_[i].LogPosDefDet();
-    } catch(...) {
-      if (thread == 0) // just for one thread, print errors [else, duplicates]
-        KALDI_WARN << "Covariance is not positive definite, setting to unit";
-      SigmaInv_[i].SetUnit();
-      log_det_Sigma(i) = 0.0;
-    }
-  }
-
-  int32 J1 = NumGroups();
-
-  int block_size = (NumPdfs() + num_threads-1) / num_threads;
-  int j_start = thread * block_size, j_end = std::min(J1, j_start + block_size);
-
-  int32 I = NumGauss();
-  for (int32 j1 = j_start; j1 < j_end; j1++) {
-    int32 M = NumSubstatesForGroup(j1);
-    Matrix<BaseFloat> log_w_jm(M, I);
-    n_[j1].Resize(I, M);
-    Matrix<BaseFloat> mu_jmi(M, FeatureDim());
-    Matrix<BaseFloat> SigmaInv_mu(M, FeatureDim());
-
-    // (in logs): w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7)
-    log_w_jm.AddMatMat(1.0, v_[j1], kNoTrans, w_, kTrans, 0.0);
-    for (int32 m = 0; m < M; m++) {
-      log_w_jm.Row(m).Add(-1.0 * log_w_jm.Row(m).LogSumExp());
-      {  // DIAGNOSTIC CODE
-        (*entropy_count)++;
-        for (int32 i = 0; i < NumGauss(); i++) {
-          (*entropy_sum) -= log_w_jm(m, i) * Exp(log_w_jm(m, i));
-        }
-      }
-    }
-
-    for (int32 i = 0; i < I; i++) {
-      // mu_jmi = M_{i} * v_{jm}
-      mu_jmi.AddMatMat(1.0, v_[j1], kNoTrans, M_[i], kTrans, 0.0);
-      SigmaInv_mu.AddMatSp(1.0, mu_jmi, kNoTrans, SigmaInv_[i], 0.0);
-
-      for (int32 m = 0; m < M; m++) {
-        // mu_{jmi} * \Sigma_{i}^{-1} * mu_{jmi}
-        BaseFloat mu_SigmaInv_mu = VecVec(mu_jmi.Row(m), SigmaInv_mu.Row(m));
-        // Previously had:
-        // BaseFloat logc = log(c_[j](m));
-        // but because of STCM aspect, we can't include the sub-state mixture weights
-        // at this point [included later on.]
-
-        // eq.(31)
-        n_[j1](i, m) = log_w_jm(m, i) - 0.5 * (log_det_Sigma(i) + DLog2pi
-            + mu_SigmaInv_mu);
-        {  // Mainly diagnostic code.  Not necessary.
-          BaseFloat tmp = n_[j1](i, m);
-          if (!KALDI_ISFINITE(tmp)) {  // NaN or inf
-            KALDI_LOG << "Warning: normalizer for j1 = " << j1 << ", m = " << m
-                      << ", i = " << i << " is infinite or NaN " << tmp << "= "
-                      << log_w_jm(m, i) << "+"
-                      << (-0.5 * log_det_Sigma(i)) << "+" << (-0.5 * DLog2pi)
-                      << "+" << (mu_SigmaInv_mu) << ", setting to finite.";
-            n_[j1](i, m) = -1.0e+40;  // future work(arnab): get rid of magic number
-          }
-        }
-      }
-    }
-  }
-}
-
-BaseFloat AmSgmm2::GetDjms(int32 j1, int32 m,
-                          Sgmm2PerSpkDerivedVars *spk_vars) const {
-  // This relates to SSGMMs (speaker-dependent weights).
-  if (spk_vars->log_d_jms.empty()) return -1; // this would be
-  // because we don't have speaker-dependent weights ("u" not set up).
-
-  KALDI_ASSERT(!w_jmi_.empty() && "You need to call ComputeWeights() on SGMM.");
-  Vector<BaseFloat> &log_d = spk_vars->log_d_jms[j1];
-  if (log_d.Dim() == 0) {
-    log_d.Resize(NumSubstatesForGroup(j1));
-    log_d.AddMatVec(1.0, w_jmi_[j1], kNoTrans, spk_vars->b_is, 0.0);
-    log_d.ApplyLog();
-  }
-  return Exp(log_d(m));
-}
-
-
-void AmSgmm2::ComputeFmllrPreXform(const Vector<BaseFloat> &state_occs,
-                                  Matrix<BaseFloat> *xform,
-                                   Matrix<BaseFloat> *inv_xform,
-                                  Vector<BaseFloat> *diag_mean_scatter) const {
-  int32 num_pdfs = NumPdfs(),
-      num_gauss = NumGauss(),
-      dim = FeatureDim();
-  KALDI_ASSERT(state_occs.Dim() == num_pdfs);
-
-  BaseFloat total_occ = state_occs.Sum();
-
-  // Degenerate case: unlikely to ever happen.
-  if (total_occ == 0) {
-    KALDI_WARN << "Zero probability (computing transform). Using unit "
-               << "pre-transform";
-    xform->Resize(dim, dim + 1, kUndefined);
-    xform->SetUnit();
-    inv_xform->Resize(dim, dim + 1, kUndefined);
-    inv_xform->SetUnit();
-    diag_mean_scatter->Resize(dim, kSetZero);
-    return;
-  }
-
-  // Convert state occupancies to posteriors; Eq. (B.1)
-  Vector<BaseFloat> state_posteriors(state_occs);
-  state_posteriors.Scale(1/total_occ);
-
-  Vector<BaseFloat> mu_jmi(dim), global_mean(dim);
-  SpMatrix<BaseFloat> within_class_covar(dim), between_class_covar(dim);
-  Vector<BaseFloat> gauss_weight(num_gauss);  // weights for within-class vars.
-  Vector<BaseFloat> w_jm(num_gauss);
-  for (int32 j1 = 0; j1 < NumGroups(); j1++) {
-    const std::vector<int32> &pdfs = group2pdf_[j1];
-    int32 M = NumSubstatesForGroup(j1);
-    Vector<BaseFloat> substate_weight(M); // total weight for each substate.
-    for (size_t i = 0; i < pdfs.size(); i++) {
-      int32 j2 = pdfs[i];
-      substate_weight.AddVec(state_posteriors(j2), c_[j2]);
-    }
-    for (int32 m = 0; m < M; m++) {
-      BaseFloat this_substate_weight = substate_weight(m);
-      // Eq. (7): w_jm = softmax([w_{1}^T ... w_{D}^T] * v_{jm})
-      w_jm.AddMatVec(1.0, w_, kNoTrans, v_[j1].Row(m), 0.0);
-      w_jm.ApplySoftMax();
-
-      for (int32 i = 0; i < num_gauss; i++) {
-        BaseFloat weight = this_substate_weight * w_jm(i);
-        mu_jmi.AddMatVec(1.0, M_[i], kNoTrans, v_[j1].Row(m), 0.0);  // Eq. (6)
-        // Eq. (B.3): \mu_avg = \sum_{jmi} p(j) c_{jm} w_{jmi} \mu_{jmi}
-        global_mean.AddVec(weight, mu_jmi);
-        // \Sigma_B = \sum_{jmi} p(j) c_{jm} w_{jmi} \mu_{jmi} \mu_{jmi}^T
-        between_class_covar.AddVec2(weight, mu_jmi);  // Eq. (B.4)
-        gauss_weight(i) += weight;
-      }
-    }
-  }
-  between_class_covar.AddVec2(-1.0, global_mean);  // Eq. (B.4)
-
-  for (int32 i = 0; i < num_gauss; i++) {
-    SpMatrix<BaseFloat> Sigma(SigmaInv_[i]);
-    Sigma.InvertDouble();
-    // Eq. (B.2): \Sigma_W = \sum_{jmi} p(j) c_{jm} w_{jmi} \Sigma_i
-    within_class_covar.AddSp(gauss_weight(i), Sigma);
-  }
-
-  TpMatrix<BaseFloat> tmpL(dim);
-  Matrix<BaseFloat> tmpLInvFull(dim, dim);
-  tmpL.Cholesky(within_class_covar);  // \Sigma_W = L L^T
-  tmpL.InvertDouble();  // L^{-1}
-  tmpLInvFull.CopyFromTp(tmpL);  // get as full matrix.
-
-  // B := L^{-1} * \Sigma_B * L^{-T}
-  SpMatrix<BaseFloat> tmpB(dim);
-  tmpB.AddMat2Sp(1.0, tmpLInvFull, kNoTrans, between_class_covar, 0.0);
-
-  Matrix<BaseFloat> U(dim, dim);
-  diag_mean_scatter->Resize(dim);
-  xform->Resize(dim, dim + 1);
-  inv_xform->Resize(dim, dim + 1);
-
-  tmpB.Eig(diag_mean_scatter, &U);  // Eq. (B.5): B = U D V^T
-
-  int32 n;
-  diag_mean_scatter->ApplyFloor(1.0e-04, &n);
-  if (n != 0)
-    KALDI_WARN << "Floored " << n << " elements of the mean-scatter matrix.";
-
-  // Eq. (B.6): A_{pre} = U^T * L^{-1}
-  SubMatrix<BaseFloat> Apre(*xform, 0, dim, 0, dim);
-  Apre.AddMatMat(1.0, U, kTrans, tmpLInvFull, kNoTrans, 0.0);
-
-#ifdef KALDI_PARANOID
-  {
-    SpMatrix<BaseFloat> tmp(dim);
-    tmp.AddMat2Sp(1.0, Apre, kNoTrans, within_class_covar, 0.0);
-    KALDI_ASSERT(tmp.IsUnit(0.01));
-  }
-  {
-    SpMatrix<BaseFloat> tmp(dim);
-    tmp.AddMat2Sp(1.0, Apre, kNoTrans, between_class_covar, 0.0);
-    KALDI_ASSERT(tmp.IsDiagonal(0.01));
-  }
-#endif
-
-  // Eq. (B.7): b_{pre} = - A_{pre} \mu_{avg}
-  Vector<BaseFloat> b_pre(dim);
-  b_pre.AddMatVec(-1.0, Apre, kNoTrans, global_mean, 0.0);
-  for (int32 r = 0; r < dim; r++) {
-    xform->Row(r)(dim) = b_pre(r);  // W_{pre} = [ A_{pre}, b_{pre} ]
-  }
-
-  // Eq. (B.8) & (B.9): W_{inv} = [ A_{pre}^{-1}, \mu_{avg} ]
-  inv_xform->CopyFromMat(*xform);
-  inv_xform->Range(0, dim, 0, dim).InvertDouble();
-  for (int32 r = 0; r < dim; r++)
-    inv_xform->Row(r)(dim) = global_mean(r);
-}  // End of ComputePreXform()
-
-template<typename Real>
-void AmSgmm2::GetNtransSigmaInv(vector< Matrix<Real> > *out) const {
-  KALDI_ASSERT(SpkSpaceDim() > 0 &&
-      "Cannot compute N^{T} \\Sigma_{i}^{-1} without speaker projections.");
-  out->resize(NumGauss());
-  Matrix<Real> tmpcov(FeatureDim(), FeatureDim());
-  Matrix<Real> tmp_n(FeatureDim(), SpkSpaceDim());
-  for (int32 i = 0; i < NumGauss(); i++) {
-    tmpcov.CopyFromSp(SigmaInv_[i]);
-    tmp_n.CopyFromMat(N_[i]);
-    (*out)[i].Resize(SpkSpaceDim(), FeatureDim());
-    (*out)[i].AddMatMat(1.0, tmp_n, kTrans, tmpcov, kNoTrans, 0.0);
-  }
-}
-
-// Instantiate the above template.
-template
-void AmSgmm2::GetNtransSigmaInv(vector< Matrix<float> > *out) const;
-template
-void AmSgmm2::GetNtransSigmaInv(vector< Matrix<double> > *out) const;
-
-///////////////////////////////////////////////////////////////////////////////
-
-template<class Real>
-void AmSgmm2::ComputeH(std::vector< SpMatrix<Real> > *H_i) const {
-  KALDI_ASSERT(NumGauss() != 0);
-  (*H_i).resize(NumGauss());
-  SpMatrix<BaseFloat> H_i_tmp(PhoneSpaceDim());
-  for (int32 i = 0; i < NumGauss(); i++) {
-    (*H_i)[i].Resize(PhoneSpaceDim());
-    H_i_tmp.AddMat2Sp(1.0, M_[i], kTrans, SigmaInv_[i], 0.0);
-    (*H_i)[i].CopyFromSp(H_i_tmp);
-  }
-}
-
-// Instantiate the template.
-template
-void AmSgmm2::ComputeH(std::vector< SpMatrix<float> > *H_i) const;
-template
-void AmSgmm2::ComputeH(std::vector< SpMatrix<double> > *H_i) const;
-
-
-// Initializes the matrices M_{i} and w_i
-void AmSgmm2::InitializeMw(int32 phn_subspace_dim,
-                           const Matrix<BaseFloat> &norm_xform) {
-  int32 ddim = full_ubm_.Dim();
-  KALDI_ASSERT(phn_subspace_dim <= ddim + 1);
-  KALDI_ASSERT(phn_subspace_dim <= norm_xform.NumCols() + 1);
-  KALDI_ASSERT(ddim <= norm_xform.NumRows());
-
-  Vector<BaseFloat> mean(ddim);
-  int32 num_gauss = full_ubm_.NumGauss();
-  w_.Resize(num_gauss, phn_subspace_dim);
-  M_.resize(num_gauss);
-  for (int32 i = 0; i < num_gauss; i++) {
-    full_ubm_.GetComponentMean(i, &mean);
-    Matrix<BaseFloat> &thisM(M_[i]);
-    thisM.Resize(ddim, phn_subspace_dim);
-    // Eq. (27): M_{i} = [ \bar{\mu}_{i} (J)_{1:D, 1:(S-1)}]
-    thisM.CopyColFromVec(mean, 0);
-    int32 nonrandom_dim = std::min(phn_subspace_dim - 1, ddim),
-        random_dim = phn_subspace_dim - 1 - nonrandom_dim;
-    thisM.Range(0, ddim, 1, nonrandom_dim).CopyFromMat(
-        norm_xform.Range(0, ddim, 0, nonrandom_dim), kNoTrans);
-    // The following extension to the original paper allows us to
-    // initialize the model with a larger dimension of phone-subspace vector.
-    if (random_dim > 0)
-      thisM.Range(0, ddim, nonrandom_dim + 1, random_dim).SetRandn();
-  }
-}
-
-// Initializes the matrices N_i, and [if speaker_dependent_weights==true] u_i.
-void AmSgmm2::InitializeNu(int32 spk_subspace_dim,
-                          const Matrix<BaseFloat> &norm_xform,
-                          bool speaker_dependent_weights) {
-  int32 ddim = full_ubm_.Dim();
-
-  int32 num_gauss = full_ubm_.NumGauss();
-  N_.resize(num_gauss);
-  for (int32 i = 0; i < num_gauss; i++) {
-    N_[i].Resize(ddim, spk_subspace_dim);
-    // Eq. (28): N_{i} = [ (J)_{1:D, 1:T)}]
-
-    int32 nonrandom_dim = std::min(spk_subspace_dim, ddim),
-        random_dim = spk_subspace_dim - nonrandom_dim;
-
-    N_[i].Range(0, ddim, 0, nonrandom_dim).
-        CopyFromMat(norm_xform.Range(0, ddim, 0, nonrandom_dim), kNoTrans);
-    // The following extension to the original paper allows us to
-    // initialize the model with a larger dimension of speaker-subspace vector.
-    if (random_dim > 0)
-      N_[i].Range(0, ddim, nonrandom_dim, random_dim).SetRandn();
-  }
-  if (speaker_dependent_weights) {
-    u_.Resize(num_gauss, spk_subspace_dim); // will set to zero.
-  } else {
-    u_.Resize(0, 0);
-  }
-}
-
-void AmSgmm2::CopyGlobalsInitVecs(const AmSgmm2 &other,
-                                  const std::vector<int32> &pdf2group,
-                                  BaseFloat self_weight) {
-  KALDI_LOG << "Initializing model";
-  pdf2group_ = pdf2group;
-  ComputePdfMappings();
-
-  // Copy background GMMs
-  diag_ubm_.CopyFromDiagGmm(other.diag_ubm_);
-  full_ubm_.CopyFromFullGmm(other.full_ubm_);
-
-  // Copy global params
-  SigmaInv_ = other.SigmaInv_;
-
-  M_ = other.M_;
-  w_ = other.w_;
-  u_ = other.u_;
-  N_ = other.N_;
-
-  InitializeVecsAndSubstateWeights(self_weight);
-}
-
-
-// Initializes the vectors v_{j1,m} and substate weights c_{j2,m}.
-void AmSgmm2::InitializeVecsAndSubstateWeights(BaseFloat self_weight) {
-  int32 J1 = NumGroups(), J2 = NumPdfs();
-  KALDI_ASSERT(J1 > 0 && J2 >= J1);
-  int32 phn_subspace_dim = PhoneSpaceDim();
-  KALDI_ASSERT(phn_subspace_dim > 0 && "Initialize M and w first.");
-
-  v_.resize(J1);
-  if (self_weight == 1.0) {
-    for (int32 j1 = 0; j1 < J1; j1++) {
-      v_[j1].Resize(1, phn_subspace_dim);
-      v_[j1](0, 0) = 1.0;  // Eq. (26): v_{j1} = [1 0 0 ... 0]
-    }
-    c_.resize(J2);
-    for (int32 j2 = 0; j2 < J2; j2++) {
-      c_[j2].Resize(1);
-      c_[j2](0) = 1.0;    // Eq. (25): c_{j1} = 1.0
-    }
-  } else {
-    for (int32 j1 = 0; j1 < J1; j1++) {
-      int32 npdfs = group2pdf_[j1].size();
-      v_[j1].Resize(npdfs, phn_subspace_dim);
-      for (int32 m = 0; m < npdfs; m++)
-        v_[j1](m, 0) = 1.0;  // Eq. (26): v_{j1} = [1 0 0 ... 0]
-    }
-    c_.resize(J2);
-    for (int32 j2 = 0; j2 < J2; j2++) {
-      int32 j1 = pdf2group_[j2], npdfs = group2pdf_[j1].size();
-      c_[j2].Resize(npdfs);
-      if (npdfs == 1) c_[j2].Set(1.0);
-      else {
-        // note: just avoid NaNs if npdfs-1... value won't matter.
-        double other_weight = (1.0 - self_weight) / std::max((1-npdfs), 1);
-        c_[j2].Set(other_weight);
-        for (int32 k = 0; k < npdfs; k++)
-          if(group2pdf_[j1][k] == j2) c_[j2](k) = self_weight;
-      }
-    }
-  }
-}
-
-// Initializes the within-class vars Sigma_{ki}
-void AmSgmm2::InitializeCovars() {
-  std::vector< SpMatrix<BaseFloat> > &inv_covars(full_ubm_.inv_covars());
-  int32 num_gauss = full_ubm_.NumGauss();
-  int32 dim = full_ubm_.Dim();
-  SigmaInv_.resize(num_gauss);
-  for (int32 i = 0; i < num_gauss; i++) {
-    SigmaInv_[i].Resize(dim);
-    SigmaInv_[i].CopyFromSp(inv_covars[i]);
-  }
-}
-
-// Compute the "smoothing" matrix H^{(sm)} from expected counts given the model.
-void AmSgmm2::ComputeHsmFromModel(
-    const std::vector< SpMatrix<BaseFloat> > &H,
-    const Vector<BaseFloat> &state_occupancies,
-    SpMatrix<BaseFloat> *H_sm,
-    BaseFloat max_cond) const {
-  int32 num_gauss = NumGauss();
-  BaseFloat tot_sum = 0.0;
-  KALDI_ASSERT(state_occupancies.Dim() == NumPdfs());
-  Vector<BaseFloat> w_jm(num_gauss);
-  H_sm->Resize(PhoneSpaceDim());
-  H_sm->SetZero();
-  Vector<BaseFloat> gamma_i;
-  ComputeGammaI(state_occupancies, &gamma_i);
-
-  BaseFloat sum = 0.0;
-  for (int32 i = 0; i < num_gauss; i++) {
-    if (gamma_i(i) > 0) {
-      H_sm->AddSp(gamma_i(i), H[i]);
-      sum += gamma_i(i);
-    }
-  }
-  if (sum == 0.0) {
-    KALDI_WARN << "Sum of counts is zero. ";
-    // set to unit matrix--arbitrary non-singular matrix.. won't ever matter.
-    H_sm->SetUnit();
-  } else {
-    H_sm->Scale(1.0 / sum);
-    int32 tmp = H_sm->LimitCondDouble(max_cond);
-    if (tmp > 0) {
-      KALDI_WARN << "Limited " << (tmp) << " eigenvalues of H_sm";
-    }
-  }
-  tot_sum += sum;
-
-  KALDI_LOG << "total count is " << tot_sum;
-}
-
-void ComputeFeatureNormalizingTransform(const FullGmm &gmm, Matrix<BaseFloat> *xform) {
-  int32 dim = gmm.Dim();
-  int32 num_gauss = gmm.NumGauss();
-  SpMatrix<BaseFloat> within_class_covar(dim);
-  SpMatrix<BaseFloat> between_class_covar(dim);
-  Vector<BaseFloat> global_mean(dim);
-
-  // Accumulate LDA statistics from the GMM parameters.
-  {
-    BaseFloat total_weight = 0.0;
-    Vector<BaseFloat> tmp_weight(num_gauss);
-    Matrix<BaseFloat> tmp_means;
-    std::vector< SpMatrix<BaseFloat> > tmp_covars;
-    tmp_weight.CopyFromVec(gmm.weights());
-    gmm.GetCovarsAndMeans(&tmp_covars, &tmp_means);
-    for (int32 i = 0; i < num_gauss; i++) {
-      BaseFloat w_i = tmp_weight(i);
-      total_weight += w_i;
-      within_class_covar.AddSp(w_i, tmp_covars[i]);
-      between_class_covar.AddVec2(w_i, tmp_means.Row(i));
-      global_mean.AddVec(w_i, tmp_means.Row(i));
-    }
-    KALDI_ASSERT(total_weight > 0);
-    if (fabs(total_weight - 1.0) > 0.001) {
-      KALDI_WARN << "Total weight across the GMMs is " << (total_weight)
-          << ", renormalizing.";
-      global_mean.Scale(1.0 / total_weight);
-      within_class_covar.Scale(1.0 / total_weight);
-      between_class_covar.Scale(1.0 / total_weight);
-    }
-    between_class_covar.AddVec2(-1.0, global_mean);
-  }
-
-  TpMatrix<BaseFloat> chol(dim);
-  chol.Cholesky(within_class_covar);  // Sigma_W = L L^T
-  TpMatrix<BaseFloat> chol_inv(chol);
-  chol_inv.InvertDouble();
-  Matrix<BaseFloat> chol_full(dim, dim);
-  chol_full.CopyFromTp(chol_inv);
-  SpMatrix<BaseFloat> LBL(dim);
-  // LBL = L^{-1} \Sigma_B L^{-T}
-  LBL.AddMat2Sp(1.0, chol_full, kNoTrans, between_class_covar, 0.0);
-  Vector<BaseFloat> Dvec(dim);
-  Matrix<BaseFloat> U(dim, dim);
-  LBL.Eig(&Dvec, &U);
-  SortSvd(&Dvec, &U);
-
-  xform->Resize(dim, dim);
-  chol_full.CopyFromTp(chol);
-  // T := L U, eq (23)
-  xform->AddMatMat(1.0, chol_full, kNoTrans, U, kNoTrans, 0.0);
-
-#ifdef KALDI_PARANOID
-  Matrix<BaseFloat> inv_xform(*xform);
-  inv_xform.InvertDouble();
-  {  // Check that T*within_class_covar*T' = I.
-    Matrix<BaseFloat> wc_covar_full(dim, dim), tmp(dim, dim);
-    wc_covar_full.CopyFromSp(within_class_covar);
-    tmp.AddMatMat(1.0, inv_xform, kNoTrans, wc_covar_full, kNoTrans, 0.0);
-    wc_covar_full.AddMatMat(1.0, tmp, kNoTrans, inv_xform, kTrans, 0.0);
-    KALDI_ASSERT(wc_covar_full.IsUnit(0.01));
-  }
-  {  // Check that T*between_class_covar*T' = diagonal.
-    Matrix<BaseFloat> bc_covar_full(dim, dim), tmp(dim, dim);
-    bc_covar_full.CopyFromSp(between_class_covar);
-    tmp.AddMatMat(1.0, inv_xform, kNoTrans, bc_covar_full, kNoTrans, 0.0);
-    bc_covar_full.AddMatMat(1.0, tmp, kNoTrans, inv_xform, kTrans, 0.0);
-    KALDI_ASSERT(bc_covar_full.IsDiagonal(0.01));
-  }
-#endif
-}
-
-void AmSgmm2::ComputePerSpkDerivedVars(Sgmm2PerSpkDerivedVars *vars) const {
-  KALDI_ASSERT(vars != NULL);
-  if (vars->v_s.Dim() != 0) {
-    KALDI_ASSERT(vars->v_s.Dim() == SpkSpaceDim());
-    vars->o_s.Resize(NumGauss(), FeatureDim());
-    int32 num_gauss = NumGauss();
-    // first compute the o_i^{(s)} quantities.
-    for (int32 i = 0; i < num_gauss; i++) {
-       // Eqn. (32): o_i^{(s)} = N_i v^{(s)}
-      vars->o_s.Row(i).AddMatVec(1.0, N_[i], kNoTrans, vars->v_s, 0.0);
-    }
-    // the rest relates to the SSGMM.  We only need to to this
-    // if we're using speaker-dependent weights.
-    if (HasSpeakerDependentWeights()) {
-      vars->log_d_jms.clear();
-      vars->log_d_jms.resize(NumGroups());
-      vars->log_b_is.Resize(NumGauss());
-      vars->log_b_is.AddMatVec(1.0, u_, kNoTrans, vars->v_s, 0.0);
-      vars->b_is.Resize(NumGauss());
-      vars->b_is.CopyFromVec(vars->log_b_is);
-      vars->b_is.ApplyExp();
-      for (int32 i = 0; i < vars->b_is.Dim(); i++) {
-        if (vars->b_is(i) - vars->b_is(i) != 0.0) { // NaN.
-          vars->b_is(i) = 1.0;
-          KALDI_WARN << "Set NaN in b_is to 1.0";
-        }
-      }
-    } else {
-      vars->b_is.Resize(0);
-      vars->log_b_is.Resize(0);
-      vars->log_d_jms.resize(0);
-    }
-  } else {
-    vars->Clear(); // make sure everything is cleared.
-  }
-}
-
-BaseFloat AmSgmm2::GaussianSelection(const Sgmm2GselectConfig &config,
-                                    const VectorBase<BaseFloat> &data,
-                                    std::vector<int32> *gselect) const {
-  KALDI_ASSERT(diag_ubm_.NumGauss() != 0 &&
-               diag_ubm_.NumGauss() == full_ubm_.NumGauss() &&
-               diag_ubm_.Dim() == data.Dim());
-  KALDI_ASSERT(config.diag_gmm_nbest > 0 && config.full_gmm_nbest > 0 &&
-               config.full_gmm_nbest < config.diag_gmm_nbest);
-  int32 num_gauss = diag_ubm_.NumGauss();
-
-  std::vector< std::pair<BaseFloat, int32> > pruned_pairs;
-  if (config.diag_gmm_nbest < num_gauss) {    Vector<BaseFloat> loglikes(num_gauss);
-    diag_ubm_.LogLikelihoods(data, &loglikes);
-    Vector<BaseFloat> loglikes_copy(loglikes);
-    BaseFloat *ptr = loglikes_copy.Data();
-    std::nth_element(ptr, ptr+num_gauss-config.diag_gmm_nbest, ptr+num_gauss);
-    BaseFloat thresh = ptr[num_gauss-config.diag_gmm_nbest];
-    for (int32 g = 0; g < num_gauss; g++)
-      if (loglikes(g) >= thresh)  // met threshold for diagonal phase.
-        pruned_pairs.push_back(
-            std::make_pair(full_ubm_.ComponentLogLikelihood(data, g), g));
-  } else {
-    Vector<BaseFloat> loglikes(num_gauss);
-    full_ubm_.LogLikelihoods(data, &loglikes);
-    for (int32 g = 0; g < num_gauss; g++)
-      pruned_pairs.push_back(std::make_pair(loglikes(g), g));
-  }
-  KALDI_ASSERT(!pruned_pairs.empty());
-  if (pruned_pairs.size() > static_cast<size_t>(config.full_gmm_nbest)) {
-    std::nth_element(pruned_pairs.begin(),
-                     pruned_pairs.end() - config.full_gmm_nbest,
-                     pruned_pairs.end());
-    pruned_pairs.erase(pruned_pairs.begin(),
-                       pruned_pairs.end() - config.full_gmm_nbest);
-  }
-  Vector<BaseFloat> loglikes_tmp(pruned_pairs.size());  // for return value.
-  KALDI_ASSERT(gselect != NULL);
-  gselect->resize(pruned_pairs.size());
-  // Make sure pruned Gaussians appear from best to worst.
-  std::sort(pruned_pairs.begin(), pruned_pairs.end(),
-            std::greater< std::pair<BaseFloat, int32> >());
-  for (size_t i = 0; i < pruned_pairs.size(); i++) {
-    loglikes_tmp(i) = pruned_pairs[i].first;
-    (*gselect)[i] = pruned_pairs[i].second;
-  }
-  return loglikes_tmp.LogSumExp();
-}
-
-void Sgmm2GauPost::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, "<Sgmm2GauPost>");
-  int32 T = this->size();
-  WriteBasicType(os, binary, T);
-  for (int32 t = 0; t < T; t++) {
-    WriteToken(os, binary, "<gselect>");
-    WriteIntegerVector(os, binary, (*this)[t].gselect);
-    WriteToken(os, binary, "<tids>");
-    WriteIntegerVector(os, binary, (*this)[t].tids);
-    KALDI_ASSERT((*this)[t].tids.size() == (*this)[t].posteriors.size());
-    for (size_t i = 0; i < (*this)[t].posteriors.size(); i++) {
-      (*this)[t].posteriors[i].Write(os, binary);
-    }
-  }
-  WriteToken(os, binary, "</Sgmm2GauPost>");
-}
-
-
-void Sgmm2GauPost::Read(std::istream &is, bool binary) {
-  ExpectToken(is, binary, "<Sgmm2GauPost>");
-  int32 T;
-  ReadBasicType(is, binary, &T);
-  KALDI_ASSERT(T >= 0);
-  this->resize(T);
-  for (int32 t = 0; t < T; t++) {
-    ExpectToken(is, binary, "<gselect>");
-    ReadIntegerVector(is, binary, &((*this)[t].gselect));
-    ExpectToken(is, binary, "<tids>");
-    ReadIntegerVector(is, binary, &((*this)[t].tids));
-    size_t sz = (*this)[t].tids.size();
-    (*this)[t].posteriors.resize(sz);
-    for (size_t i = 0; i < sz; i++)
-      (*this)[t].posteriors[i].Read(is, binary);
-  }
-  ExpectToken(is, binary, "</Sgmm2GauPost>");
-}
-
-
-
-}  // namespace kaldi
diff --git a/src/sgmm2/am-sgmm2.h b/src/sgmm2/am-sgmm2.h
deleted file mode 100644
index c60e66d7a01..00000000000
--- a/src/sgmm2/am-sgmm2.h
+++ /dev/null
@@ -1,586 +0,0 @@
-// sgmm2/am-sgmm2.h
-
-// Copyright 2009-2011  Microsoft Corporation;  Lukas Burget;
-//                      Saarland University (Author: Arnab Ghoshal);
-//                      Ondrej Glembek;  Yanmin Qian;
-// Copyright 2012-2013  Johns Hopkins University (author: Daniel Povey)
-//                      Liang Lu;  Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_SGMM2_AM_SGMM2_H_
-#define KALDI_SGMM2_AM_SGMM2_H_
-
-#include <vector>
-
-#include "base/kaldi-common.h"
-#include "matrix/matrix-lib.h"
-#include "gmm/model-common.h"
-#include "gmm/diag-gmm.h"
-#include "gmm/full-gmm.h"
-#include "itf/options-itf.h"
-#include "util/table-types.h"
-#include "util/kaldi-thread.h"
-
-namespace kaldi {
-/*
-  When reading this file, keep in mind two references: the paper
- "The Subspace Gaussian Mixture Model-- a Structured Model for Speech Recognition", by D. Povey,
-  L. Burget et. al (Computer Speech and Language, 2011), and
-  "The Symmetric Subspace Gaussian Mixture Model": Microsoft Research technical report MSR-TR-2010-138.
-  We will refer to these as "the paper" [or "the CSL paper"] and "the techreport".
-
-  (1) SSGMM
-  
-  We'll use the acronym SSGMM to refer to the Symmetric SGMM, and we'll mark in
-  the code with "[SSGMM]" things that relate to it.  The technical report
-  describes an extention to the originally described model where we have
-  speaker-dependent mixture weights.  These are implemented here.  Note: we only
-  implement the "more efficient" version of the update for the speaker
-  projection vectors \u_i.  There is also an ICASSP paper that describes the
-  stuff in the techreport (more briefly), with results, but we don't refer to
-  any equation numbers in that.
-
-  (2) SCTM
-
-  What we implement here has another extension that was not in the CSL paper: an
-  extension to the "state-clustered tied mixture" [SCTM] system-- a bit like BBN's
-  style of system, except for SGMMs not Gaussians, at the sub-state not Gaussian level.
-  We build a first
-  tree, at which level the phonetic sub-state vectors are defined, and then a
-  "more detailed" tree, at which level we share the sub-state mixture weights.
-  In this class, NumPdfs() returns the real number of pdf's (i.e. the #leaves
-  of the more detailed tree), and NumPdfGroups() returns the number of groups of
-  pdf's that share the sub-state vectors.
-  We use the index j2 for indexing 0...NumPdfs()-1 [as it's the "2nd level" of the tree],
-  and j1 for indexing 0...NumPdfGroups()-1 [as it's the "1st level" of the tree].
-  The weights are stored as c[j2][m].  There is a mapping Pdf2Group(j2) which returns
-  the corresponding j1 for a given j2, and Group2PdfList(j1) which returns a vector<int32>
-  consisting of the list of j2 indices for that j1. 
-  
-  The count quantities we store during the accumulation phase could most simply
-  be stored as gamma[j2][m][i] (where m is the sub-state index), but this is
-  inefficient.  Instead we store them separately as gamma1[j1][m][i] and gamma2[j2][m],
-  so each count gets stored in two separate places; this makes the stats more compact.
-
-  In this implementation, the normalizers n_{jmi} are now stored as n[j1][m][i],
-  without including the log-weight term log c[j2][m].  In the computation of
-  state likelihoods, we first compute the log-prob of the data given each of the
-  sub-state vectors; and we compute the log-sum of this and the posteriors over
-  each of the vectors [treating the weights as 1.0].  Call these
-  "pseudo-posteriors".  Then to take into account the contribution of the
-  weights in a state j2, we take the dot product of the weight-vector c[j2][...]
-  with this vector of pseudo-posteriors.  The log of this dot-product gets added to the
-  original log-sum.  
-*/
-
-
-struct Sgmm2SplitSubstatesConfig {
-  int32 split_substates;
-  BaseFloat perturb_factor;
-  BaseFloat power;
-  BaseFloat max_cond;
-  BaseFloat min_count;
-  Sgmm2SplitSubstatesConfig(): split_substates(0),
-                               perturb_factor(0.01),
-                               power(0.2),
-                               max_cond(100.0),
-                               min_count(40.0) { }
-  void Register(OptionsItf *opts) {
-    opts->Register("split-substates", &split_substates, "Increase number of "
-                   "substates to this overall target.");
-    opts->Register("max-cond-split", &max_cond, "Max condition number of smoothing "
-                   "matrix used in substate splitting.");
-    opts->Register("perturb-factor", &perturb_factor, "Perturbation factor for "
-                   "state vectors while splitting substates.");
-    opts->Register("power", &power, "Exponent for substate occupancies used while "
-                   "splitting substates.");
-    opts->Register("min-count", &min_count, "Minimum allowed count, used in allocating "
-                   "sub-states to state in mixture splitting.");
-  }
-};
-
-// Caution: this config is probably not used in most of the setups, we generally do the Gaussian
-// selection using separate programs
-struct Sgmm2GselectConfig {
-  /// Number of highest-scoring full-covariance Gaussians per frame.
-  int32 full_gmm_nbest;
-  /// Number of highest-scoring diagonal-covariance Gaussians per frame.
-  int32 diag_gmm_nbest;
-
-  Sgmm2GselectConfig() {
-    full_gmm_nbest = 15;
-    diag_gmm_nbest = 50;
-  }
-
-  void Register(OptionsItf *opts) {
-    opts->Register("full-gmm-nbest", &full_gmm_nbest, "Number of highest-scoring"
-                   " full-covariance Gaussians selected per frame.");
-    opts->Register("diag-gmm-nbest", &diag_gmm_nbest, "Number of highest-scoring"
-                   " diagonal-covariance Gaussians selected per frame.");
-  }
-};
-
-/** \struct Sgmm2PerFrameDerivedVars
- *  Holds the per-frame precomputed quantities x(t), x_{i}(t), z_{i}(t), and
- *  n_{i}(t) (cf. Eq. (33)-(36)) for the SGMM, as well as the cached Gaussian
- *  selection records.
- */
-struct Sgmm2PerFrameDerivedVars {
-  std::vector<int32> gselect;
-  Vector<BaseFloat> xt;   ///< x'(t), FMLLR-adapted, dim = [D], eq.(33)
-  Matrix<BaseFloat> xti;  ///< x_{i}(t) = x'(t) - o_i(s): dim = [I][D], eq.(34)
-  Matrix<BaseFloat> zti;  ///< z_{i}(t), dim = [I][S], eq.(35)
-  Vector<BaseFloat> nti;  ///< n_{i}(t), dim = [I], eq.(36) in CSL paper, but
-                          ///< [SSGMM] with extra term log b_i^{(s)}, see eq. (24) of
-                          ///< techreport.
-  
-  void Resize(int32 ngauss, int32 feat_dim, int32 phn_dim) { // resizes but does
-    // not necessarily zero things.
-    if (xt.Dim() != feat_dim) xt.Resize(feat_dim);
-    if (xti.NumRows() != ngauss || xti.NumCols() != feat_dim)
-      xti.Resize(ngauss, feat_dim);
-    if (zti.NumRows() != ngauss || zti.NumCols() != phn_dim)
-      zti.Resize(ngauss, phn_dim);
-    if (nti.Dim() != ngauss)
-      nti.Resize(ngauss);
-  }
-};
-
-class AmSgmm2;
-
-class Sgmm2PerSpkDerivedVars {
-  // To set this up, call ComputePerSpkDerivedVars from the sgmm object.
- public:  
-  void Clear() {
-    v_s.Resize(0);
-    o_s.Resize(0, 0);
-    b_is.Resize(0);
-    log_b_is.Resize(0);
-    log_d_jms.resize(0);
-  }
-  bool Empty() { return v_s.Dim() == 0; }
-  // caution: after SetSpeakerVector you typically want to
-  // use the function AmSgmm::ComputePerSpkDerivedVars
-  const Vector<BaseFloat> &GetSpeakerVector() { return v_s; }
-  
-  void SetSpeakerVector(const Vector<BaseFloat> &v_s_in) {
-    v_s.Resize(v_s_in.Dim());
-    v_s.CopyFromVec(v_s_in);
-  }    
- protected:
-  friend class AmSgmm2;
-  friend class MleAmSgmm2Accs;
-  Vector<BaseFloat> v_s;  ///< Speaker adaptation vector v_^{(s)}. Dim is [T]
-  Matrix<BaseFloat> o_s;  ///< Per-speaker offsets o_{i}. Dimension is [I][D]
-  Vector<BaseFloat> b_is; /// < [SSGMM]: Eq. (22) in techreport, b_i^{(s)} = \exp(\u_i^T \v^{(s)})
-  Vector<BaseFloat> log_b_is; /// < [SSGMM] log of the above (more efficient to store both).
-  std::vector<Vector<BaseFloat> > log_d_jms; ///< [SSGMM] normalizers per-speaker and per-substate;
-                                             ///< indexed [j1][m].
-};
-
-/// Sgmm2LikelihoodCache caches SGMM likelihoods at two levels: the final
-/// pdf likelihoods, and the sub-state level likelihoods, which means
-/// that with the SCTM system we can avoid redundant computation.
-/// You need to call NextFrame() on the cache, between frames.
-struct Sgmm2LikelihoodCache {
- public:
-  // you'll typically initialize with (sgmm.NumGroups(), sgmm.NumPdfs()).
-  Sgmm2LikelihoodCache(int32 num_groups, int32 num_pdfs):
-      substate_cache(num_groups), pdf_cache(num_pdfs), t(1) { }
-  
-  struct SubstateCacheElement { // indexed by j1.
-    SubstateCacheElement(): t(0) { }
-    // The "likes" and "remaining_log_like" quantities store the
-    // log-like of the data given each substate vector, in a redundant
-    // way, so the likelihood is likes(i) * exp(remaining_log_like).
-    // This is to get around problems with numerical range.
-    Vector<BaseFloat> likes; 
-    BaseFloat remaining_log_like;
-    int32 t; // used in detecting "freshness."
-  };  
-  struct PdfCacheElement { // indexed by j2.
-    PdfCacheElement(): t(0) { }
-    BaseFloat log_like;
-    int32 t; // used in detecting "freshness."
-  };
-
-  void NextFrame(); // increments t.
-  std::vector<SubstateCacheElement> substate_cache; // indexed by j1.
-  std::vector<PdfCacheElement> pdf_cache; // indexed by j2.
-  int32 t;
-};
-
-
-/** \class AmSgmm2
- *  Class for definition of the subspace Gmm acoustic model
- */
-class AmSgmm2 {
- public:
-  AmSgmm2() {}
-  void Read(std::istream &is, bool binary);
-  void Write(std::ostream &os, bool binary,
-             SgmmWriteFlagsType write_params) const;
-  
-  /// Checks the various components for correct sizes. With wrong sizes,
-  /// assertion failure occurs. When the argument is set to true, dimensions of
-  /// the various components are printed.
-  void Check(bool show_properties = true);
-
-  /// Initializes the SGMM parameters from a full-covariance UBM.
-  /// The state2group vector maps from a state to the corresponding
-  /// cluster of states [i.e. j2 to j1].  For conventionally structured
-  /// systems (no 2-level tree), this can just be [ 0 1 ... n-1 ].
-  void InitializeFromFullGmm(const FullGmm &gmm,
-                             const std::vector<int32> &pdf2group,
-                             int32 phn_subspace_dim,
-                             int32 spk_subspace_dim,
-                             bool speaker_dependent_weights,
-                             BaseFloat self_weight); // self_weight relates to
-  // initialization of the weights.  if self_weight == 1.0 it means we
-  // just have 1 sub-state per group, otherwise we have one per pdf,
-  // and each pdf has "self_weight" as its "own" weight.
-  
-  /// Copies the global parameters from the supplied model, but sets
-  /// the state vectors to zero. 
-  void CopyGlobalsInitVecs(const AmSgmm2 &other,
-                           const std::vector<int32> &pdf2group,
-                           BaseFloat self_weight);
-  
-  /// Used to copy models (useful in update)
-  void CopyFromSgmm2(const AmSgmm2 &other,
-                    bool copy_normalizers,
-                    bool copy_weights);  // copy_weights is to copy w_{jmi} [which are
-   // stored, in the symmetric SSGMM.]
-  
-  /// Computes the top-scoring Gaussian indices (used for pruning of later
-  /// stages of computation). Returns frame log-likelihood given selected
-  /// Gaussians from full UBM.
-  BaseFloat GaussianSelection(const Sgmm2GselectConfig &config,
-                              const VectorBase<BaseFloat> &data,
-                              std::vector<int32> *gselect) const;
-  
-  /// This needs to be called with each new frame of data, prior to accumulation
-  /// or likelihood evaluation: it computes various pre-computed quantities.
-  void ComputePerFrameVars(const VectorBase<BaseFloat> &data,
-                           const std::vector<int32> &gselect,
-                           const Sgmm2PerSpkDerivedVars &spk_vars,
-                           Sgmm2PerFrameDerivedVars *per_frame_vars) const;
-
-
-  /// Computes the per-speaker derived vars; assumes vars->v_s is already
-  /// set up.
-  void ComputePerSpkDerivedVars(Sgmm2PerSpkDerivedVars *vars) const;
-  
-  /// This does a likelihood computation for a given state using the
-  /// pre-selected Gaussian components (in per_frame_vars).  If the
-  /// log_prune parameter is nonzero (e.g. 5.0), the LogSumExp() stage is
-  /// pruned, which is a significant speedup... smaller values are faster.
-  /// Note: you have to call cache->NextFrame() before calling this for
-  /// a new frame of data.
-  BaseFloat LogLikelihood(const Sgmm2PerFrameDerivedVars &per_frame_vars,
-                          int32 j2, // pdf_id
-                          Sgmm2LikelihoodCache *cache, // be careful to call NextFrame() when needed!
-                          Sgmm2PerSpkDerivedVars *spk_vars,
-                          BaseFloat log_prune = 0.0) const;
-  
-  /// Similar to LogLikelihood() function above, but also computes the posterior
-  /// probabilities for the pre-selected Gaussian components and all substates.
-  /// This one doesn't use caching to share computation for the groups of
-  /// pdfs. [it's less necessary, as most of the time we're doing this from alignments,
-  /// or lattices that are quite sparse, so we save little by sharing this.]
-  BaseFloat ComponentPosteriors(const Sgmm2PerFrameDerivedVars &per_frame_vars,
-                                int32 j2,
-                                Sgmm2PerSpkDerivedVars *spk_vars,
-                                Matrix<BaseFloat> *post) const;
-
-  /// Increases the total number of substates based on the state occupancies.
-  void SplitSubstates(const Vector<BaseFloat> &state_occupancies, // [indexed by pdf-id j2]
-                      const Sgmm2SplitSubstatesConfig &config);
-
-  /// Functions for increasing the phonetic and speaker space dimensions.
-  /// The argument norm_xform is a LDA-like feature normalizing transform,
-  /// computed by the ComputeFeatureNormalizingTransform function.
-  void IncreasePhoneSpaceDim(int32 target_dim,
-                             const Matrix<BaseFloat> &norm_xform);
-
-  /// Increase the subspace dimension for speakers.  The
-  /// boolean "speaker_dependent_weights" argument (for SSGMM)
-  /// only makes a difference if increasing the subspace dimension
-  /// from zero.
-  void IncreaseSpkSpaceDim(int32 target_dim,
-                           const Matrix<BaseFloat> &norm_xform,
-                           bool speaker_dependent_weights);
-
-  /// Computes (and initializes if necessary) derived vars...
-  /// for now this is just the normalizers "n" and the diagonal UBM,
-  /// and if we have the "u" matrix set up, also the w_jmi_
-  /// quantities.
-  void ComputeDerivedVars();
-
-  /// Computes the data-independent terms in the log-likelihood computation
-  /// for each Gaussian component and all substates. Eq. (31)
-  void ComputeNormalizers();
-  
-  /// Computes the weights w_jmi_, which is needed for likelihood evaluation
-  /// with SSGMMs.
-  void ComputeWeights();
-
-  /// Computes the LDA-like pre-transform and its inverse as well as the
-  /// eigenvalues of the scatter of the means used in FMLLR estimation.
-  void ComputeFmllrPreXform(const Vector<BaseFloat> &pdf_occs,
-                            Matrix<BaseFloat> *xform,
-                            Matrix<BaseFloat> *inv_xform,
-                            Vector<BaseFloat> *diag_mean_scatter) const;
-  
-  /// Various model dimensions.
-  int32 NumPdfs() const { return pdf2group_.size(); }
-  int32 NumGroups() const { return group2pdf_.size(); } // relates to SCTM.  # pdf groups,
-  // <= NumPdfs().
-  int32 Pdf2Group(int32 j2) const; // relates to SCTM.
-  int32 NumSubstatesForPdf(int32 j2) const {
-    KALDI_ASSERT(j2 < NumPdfs()); return c_[j2].Dim();
-  }
-  int32 NumSubstatesForGroup(int32 j1) const {
-    KALDI_ASSERT(j1 < NumGroups()); return v_[j1].NumRows();
-  }
-  int32 NumGauss() const { return M_.size(); }
-  int32 PhoneSpaceDim() const { return w_.NumCols(); }
-  int32 SpkSpaceDim() const { return (N_.size() > 0) ? N_[0].NumCols() : 0; }
-  int32 FeatureDim() const { return M_[0].NumRows(); }
-
-  /// True if doing SSGMM.
-  bool HasSpeakerDependentWeights() const { return (u_.NumRows() != 0); }
-
-  bool HasSpeakerSpace() const { return (!N_.empty()); }
-  
-  void RemoveSpeakerSpace() { N_.clear(); u_.Resize(0, 0); w_jmi_.clear(); }
-  
-  // [SSGMM] get the quantity d_{jm}^{(s)} and cache it with
-  // spk vars if necessary.  Called in accumulation code.
-  BaseFloat GetDjms(int32 j1, int32 m,
-                    Sgmm2PerSpkDerivedVars *spk_vars) const;
-  
-  /// Accessors
-  const FullGmm & full_ubm() const { return full_ubm_; }
-  const DiagGmm & diag_ubm() const { return diag_ubm_; }
-  
-  
-  /// Templated accessors (used to accumulate in different precision)
-  template<typename Real>
-  void GetInvCovars(int32 gauss_index, SpMatrix<Real> *out) const;
-
-  template<typename Real>
-  void GetSubstateMean(int32 j1, int32 m, int32 i,
-                       VectorBase<Real> *mean_out) const;
-    
-  template<typename Real>
-  void GetNtransSigmaInv(std::vector< Matrix<Real> > *out) const;
-
-  template<typename Real>
-  void GetSubstateSpeakerMean(int32 j1, int32 substate, int32 gauss,
-                              const Sgmm2PerSpkDerivedVars &spk,
-                              VectorBase<Real> *mean_out) const;
-  
-  template<typename Real>
-  void GetVarScaledSubstateSpeakerMean(int32 j1, int32 substate,
-                                       int32 gauss,
-                                       const Sgmm2PerSpkDerivedVars &spk,
-                                       VectorBase<Real> *mean_out) const;
-
-  /// Computes quantities H = M_i Sigma_i^{-1} M_i^T.
-  template<class Real>
-  void ComputeH(std::vector< SpMatrix<Real> > *H_i) const;
-  
- protected:
-  std::vector<int32> pdf2group_;
-  std::vector<std::vector<int32> > group2pdf_; // the reverse map.
-  
-  /// These contain the "background" model associated with the subspace GMM.
-  DiagGmm diag_ubm_;
-  FullGmm full_ubm_;
-
-  /// Globally shared parameters of the subspace GMM.  The various quantities
-  /// are: I = number of Gaussians, D = data dimension, S = phonetic subspace
-  /// dimension, T = speaker subspace dimension, J2 = number of pdfs, J1 =
-  /// number of groups of pdfs (for SCTM), #mix = number of substates [of state
-  /// j2 or state-group j1, depending on context].
-
-  /// Inverse within-class (full) covariances; dim is [I][D][D].
-  std::vector< SpMatrix<BaseFloat> > SigmaInv_;
-  /// Phonetic-subspace projections. Dimension is [I][D][S]
-  std::vector< Matrix<BaseFloat> > M_;
-  /// Speaker-subspace projections. Dimension is [I][D][T]
-  std::vector< Matrix<BaseFloat> > N_;
-  /// Phonetic-subspace weight projection vectors.  Dimension is [I][S]
-  Matrix<BaseFloat> w_;
-  /// [SSGMM] Speaker-subspace weight projection vectors. Dimension is [I][T]
-  Matrix<BaseFloat> u_;
-  
-  /// The parameters in a particular SGMM state.
-
-  /// v_{jm}, per-state phonetic-subspace vectors. Dimension is [J1][#mix][S].
-  std::vector< Matrix<BaseFloat> > v_;
-  /// c_{jm}, mixture weights. Dimension is [J2][#mix]
-  std::vector< Vector<BaseFloat> > c_;
-  /// n_{jim}, per-Gaussian normalizer. Dimension is [J1][I][#mix]
-  std::vector< Matrix<BaseFloat> > n_;
-  /// [SSGMM] w_{jmi}, dimension is [J1][#mix][I].  Computed from w_ and v_.
-  std::vector< Matrix<BaseFloat> > w_jmi_;
-
-  // Priors for MAP adaptation of M -- keeping them here for now but they may
-  // be moved somewhere else eventually
-  // These are parameters of a matrix-variate normal distribution. The means are
-  // the unadapted M_i, and we have 2 separate covaraince matrices for the rows
-  // and columns of M.
-  std::vector< Matrix<BaseFloat> > M_prior_;  // Matrix-variate Gaussian mean
-  SpMatrix<BaseFloat> row_cov_inv_;
-  SpMatrix<BaseFloat> col_cov_inv_;
-
- private:
-  /// Computes quasi-occupancies gamma_i from the state-level occupancies,
-  /// assuming model correctness.
-  void ComputeGammaI(const Vector<BaseFloat> &state_occupancies,
-                     Vector<BaseFloat> *gamma_i) const;
-  
-  /// Called inside SplitSubstates(); splits substates of one group.
-  void SplitSubstatesInGroup(const Vector<BaseFloat> &pdf_occupancies,
-                             const Sgmm2SplitSubstatesConfig &opts,
-                             const SpMatrix<BaseFloat> &sqrt_H_sm,
-                             int32 j1, int32 M);
-      
-  /// Compute a subset of normalizers; used in multi-threaded implementation.
-  void ComputeNormalizersInternal(int32 num_threads, int32 thread,
-                                  int32 *entropy_count, double *entropy_sum);
-  
-  /// The code below is called internally from LogLikelihood() and
-  /// ComponentPosteriors().  It computes the per-Gaussian log-likelihods
-  /// given each sub-state of the state.  Note: the mixture weights
-  /// are not included at this point.
-  inline void ComponentLogLikes(const Sgmm2PerFrameDerivedVars &per_frame_vars,
-                                int32 j1,
-                                Sgmm2PerSpkDerivedVars *spk_vars,
-                                Matrix<BaseFloat> *loglikes) const;
-
-  
-  /// Initializes the matrices M_ and w_.
-  void InitializeMw(int32 phn_subspace_dim,
-                     const Matrix<BaseFloat> &norm_xform);
-  /// Initializes the matrices N_ and [if speaker_dependent_weights==true] u_ 
-  void InitializeNu(int32 spk_subspace_dim,                    
-                    const Matrix<BaseFloat> &norm_xform,
-                    bool speaker_dependent_weights);
-  void InitializeVecsAndSubstateWeights(BaseFloat self_weight);
-  void InitializeCovars();  ///< initializes the within-class covariances.
-
-  void ComputeHsmFromModel(
-      const std::vector< SpMatrix<BaseFloat> > &H,
-      const Vector<BaseFloat> &state_occupancies,
-      SpMatrix<BaseFloat> *H_sm,
-      BaseFloat max_cond) const;
-
-  void ComputePdfMappings(); // sets up group2pdf_ from pdf2group_.
-  /// maps from each pdf (index j2) to the corresponding group of
-  /// pdfs (index j1) for SCTM.
-  
-  KALDI_DISALLOW_COPY_AND_ASSIGN(AmSgmm2);
-  friend class ComputeNormalizersClass;
-  friend class Sgmm2Project;
-  friend class EbwAmSgmm2Updater;
-  friend class MleAmSgmm2Accs;
-  friend class MleAmSgmm2Updater;
-  friend class MleSgmm2SpeakerAccs;
-  friend class AmSgmm2Functions;  // misc functions that need access.
-  friend class Sgmm2Feature;
-};
-
-template<typename Real>
-inline void AmSgmm2::GetInvCovars(int32 gauss_index,
-                                  SpMatrix<Real> *out) const {
-  out->Resize(SigmaInv_[gauss_index].NumRows(), kUndefined);
-  out->CopyFromSp(SigmaInv_[gauss_index]);
-}
-
-
-template<typename Real>
-inline void AmSgmm2::GetSubstateMean(int32 j1, int32 m, int32 i,
-                                    VectorBase<Real> *mean_out) const {
-  KALDI_ASSERT(mean_out != NULL);
-  KALDI_ASSERT(j1 < NumGroups() && m < NumSubstatesForGroup(j1)
-               && i < NumGauss());
-  KALDI_ASSERT(mean_out->Dim() == FeatureDim());
-  Vector<BaseFloat> mean_tmp(FeatureDim());
-  mean_tmp.AddMatVec(1.0, M_[i], kNoTrans, v_[j1].Row(m), 0.0);
-  mean_out->CopyFromVec(mean_tmp);
-}
-
-
-template<typename Real>
-inline void AmSgmm2::GetSubstateSpeakerMean(int32 j1, int32 m, int32 i,
-                                            const Sgmm2PerSpkDerivedVars &spk,
-                                           VectorBase<Real> *mean_out) const {
-  GetSubstateMean(j1, m, i, mean_out);
-  if (spk.v_s.Dim() != 0)  // have speaker adaptation...
-    mean_out->AddVec(1.0, spk.o_s.Row(i));
-}
-
-template<typename Real>
-void AmSgmm2::GetVarScaledSubstateSpeakerMean(int32 j1, int32 m, int32 i,
-                                             const Sgmm2PerSpkDerivedVars &spk,
-                                             VectorBase<Real> *mean_out) const {
-  Vector<BaseFloat> tmp_mean(mean_out->Dim()), tmp_mean2(mean_out->Dim());
-  GetSubstateSpeakerMean(j1, m, i, spk, &tmp_mean);
-  tmp_mean2.AddSpVec(1.0, SigmaInv_[i], tmp_mean, 0.0);
-  mean_out->CopyFromVec(tmp_mean2);
-}
-
-
-/// Computes the inverse of an LDA transform (without dimensionality reduction)
-/// The computed transform is used in initializing the phonetic and speaker
-/// subspaces, as well as while increasing the dimensions of those spaces.
-void ComputeFeatureNormalizingTransform(const FullGmm &gmm, Matrix<BaseFloat> *xform);
-
-
-/// This is the entry for a single time.
-struct Sgmm2GauPostElement {
-  // Need gselect info here, since "posteriors" is  relative to this set of
-  // selected Gaussians.
-  std::vector<int32> gselect;
-  std::vector<int32> tids;  // transition-ids for each entry in "posteriors"
-  std::vector<Matrix<BaseFloat> > posteriors;
-};
-
-
-/// indexed by time.
-class Sgmm2GauPost: public std::vector<Sgmm2GauPostElement> {
- public:
-  // Add the standard Kaldi Read and Write routines so
-  // we can use KaldiObjectHolder with this type.
-  explicit Sgmm2GauPost(size_t i) : std::vector<Sgmm2GauPostElement>(i) {}
-  Sgmm2GauPost() {}
-  void Write(std::ostream &os, bool binary) const;
-  void Read(std::istream &is, bool binary);
-};
-
-typedef KaldiObjectHolder<Sgmm2GauPost> Sgmm2GauPostHolder;
-typedef RandomAccessTableReader<Sgmm2GauPostHolder> RandomAccessSgmm2GauPostReader;
-typedef SequentialTableReader<Sgmm2GauPostHolder> SequentialSgmm2GauPostReader;
-typedef TableWriter<Sgmm2GauPostHolder> Sgmm2GauPostWriter;
-
-}  // namespace kaldi
-
-
-#endif  // KALDI_SGMM2_AM_SGMM2_H_
diff --git a/src/sgmm2/decodable-am-sgmm2.cc b/src/sgmm2/decodable-am-sgmm2.cc
deleted file mode 100644
index 420c0dc6e74..00000000000
--- a/src/sgmm2/decodable-am-sgmm2.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-// sgmm2/decodable-am-sgmm2.cc
-
-// Copyright 2009-2012  Saarland University;  Lukas Burget;
-//                      Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <vector>
-using std::vector;
-
-#include "sgmm2/decodable-am-sgmm2.h"
-
-namespace kaldi {
-
-
-DecodableAmSgmm2::~DecodableAmSgmm2() {
-  if (delete_vars_) {
-    delete gselect_;
-    delete feature_matrix_;
-    delete spk_;
-  }
-}
-
-BaseFloat DecodableAmSgmm2::LogLikelihoodForPdf(int32 frame, int32 pdf_id) {
-  if (frame != cur_frame_) {
-    cur_frame_ = frame;
-    sgmm_cache_.NextFrame(); // it has a frame-index internally but it doesn't
-    // have to match up with our index here, it just needs to be unique.
-
-
-    SubVector<BaseFloat> data(*feature_matrix_, frame);
-    
-    sgmm_.ComputePerFrameVars(data, (*gselect_)[frame], *spk_,
-                              &per_frame_vars_);
-  }
-  return sgmm_.LogLikelihood(per_frame_vars_, pdf_id, &sgmm_cache_, spk_,
-                             log_prune_);  
-}
-
-
-}  // namespace kaldi
diff --git a/src/sgmm2/decodable-am-sgmm2.h b/src/sgmm2/decodable-am-sgmm2.h
deleted file mode 100644
index 18498bf5b24..00000000000
--- a/src/sgmm2/decodable-am-sgmm2.h
+++ /dev/null
@@ -1,138 +0,0 @@
-// sgmm2/decodable-am-sgmm2.h
-
-// Copyright 2009-2012  Saarland University  Microsoft Corporation
-//                      Lukas Burget  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_SGMM2_DECODABLE_AM_SGMM2_H_
-#define KALDI_SGMM2_DECODABLE_AM_SGMM2_H_
-
-#include <vector>
-
-#include "base/kaldi-common.h"
-#include "sgmm2/am-sgmm2.h"
-#include "hmm/transition-model.h"
-#include "itf/decodable-itf.h"
-
-namespace kaldi {
-
-class DecodableAmSgmm2 : public DecodableInterface {
- public:
-  DecodableAmSgmm2(const AmSgmm2 &sgmm,
-                   const TransitionModel &tm,
-                   const Matrix<BaseFloat> &feats,
-                   const std::vector<std::vector<int32> > &gselect,
-                   BaseFloat log_prune,
-                   Sgmm2PerSpkDerivedVars *spk):
-      sgmm_(sgmm), spk_(spk),
-      trans_model_(tm), feature_matrix_(&feats),
-      gselect_(&gselect), log_prune_(log_prune), cur_frame_(-1),
-      sgmm_cache_(sgmm.NumGroups(), sgmm.NumPdfs()), delete_vars_(false) {
-    KALDI_ASSERT(gselect.size() == static_cast<size_t>(feats.NumRows()));
-  }
-
-  /// This version of the constructor takes ownership of the pointers
-  /// "feats", "gselect" and "spk", and will delete them when it is destroyed.
-  DecodableAmSgmm2(const AmSgmm2 &sgmm,
-                   const TransitionModel &tm,
-                   const Matrix<BaseFloat> *feats,
-                   const std::vector<std::vector<int32> > *gselect,
-                   Sgmm2PerSpkDerivedVars *spk,
-                   BaseFloat log_prune):
-      sgmm_(sgmm), spk_(spk),
-      trans_model_(tm), feature_matrix_(feats),
-      gselect_(gselect), log_prune_(log_prune), cur_frame_(-1),
-      sgmm_cache_(sgmm.NumGroups(), sgmm.NumPdfs()), delete_vars_(true) {
-    KALDI_ASSERT(gselect->size() == static_cast<size_t>(feats->NumRows()));
-  }
-
-  // Note, frames are numbered from zero, but transition indices are 1-based!
-  // This is for compatibility with OpenFST.
-  virtual BaseFloat LogLikelihood(int32 frame, int32 tid) {
-    return LogLikelihoodForPdf(frame, trans_model_.TransitionIdToPdfFast(tid));
-  }
-  int32 NumFramesReady() const { return feature_matrix_->NumRows(); }
-  virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); }
-
-  virtual bool IsLastFrame(int32 frame) const {
-    KALDI_ASSERT(frame < NumFramesReady());
-    return (frame == NumFramesReady() - 1);
-  }
-
-  virtual ~DecodableAmSgmm2();
- protected:
-  virtual BaseFloat LogLikelihoodForPdf(int32 frame, int32 pdf_id);
-
-  const AmSgmm2 &sgmm_;
-  Sgmm2PerSpkDerivedVars *spk_;
-  const TransitionModel &trans_model_;  ///< for tid to pdf mapping
-  const Matrix<BaseFloat> *feature_matrix_;
-  const std::vector<std::vector<int32> > *gselect_;
-
-  BaseFloat log_prune_;
-
-  int32 cur_frame_;
-  Sgmm2PerFrameDerivedVars per_frame_vars_;
-  Sgmm2LikelihoodCache sgmm_cache_;
-
-  bool delete_vars_; // If true, we will delete feature_matrix_, gselect_, and
-  // spk_ in the destructor.
-
- private:
-  KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmSgmm2);
-};
-
-class DecodableAmSgmm2Scaled : public DecodableAmSgmm2 {
- public:
-  DecodableAmSgmm2Scaled(const AmSgmm2 &sgmm,
-                         const TransitionModel &tm,
-                         const Matrix<BaseFloat> &feats,
-                         const std::vector<std::vector<int32> > &gselect,
-                         BaseFloat log_prune,
-                         BaseFloat scale,
-                         Sgmm2PerSpkDerivedVars *spk)
-      : DecodableAmSgmm2(sgmm, tm, feats, gselect, log_prune, spk),
-        scale_(scale) {}
-
-  /// This version of the constructor takes ownership of the pointers
-  /// "feats", "gselect" and "spk", and will delete them in its
-  /// destructor.
-  DecodableAmSgmm2Scaled(const AmSgmm2 &sgmm,
-                         const TransitionModel &tm,
-                         const Matrix<BaseFloat> *feats,
-                         const std::vector<std::vector<int32> > *gselect,
-                         Sgmm2PerSpkDerivedVars *spk,
-                         BaseFloat log_prune,
-                         BaseFloat scale)
-      : DecodableAmSgmm2(sgmm, tm, feats, gselect, spk, log_prune),
-        scale_(scale) {}
-
-
-  // Note, frames are numbered from zero but transition-ids from one.
-  virtual BaseFloat LogLikelihood(int32 frame, int32 tid) {
-    return LogLikelihoodForPdf(frame, trans_model_.TransitionIdToPdfFast(tid))
-            * scale_;
-  }
- private:
-  BaseFloat scale_;
-  KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmSgmm2Scaled);
-};
-
-
-}  // namespace kaldi
-
-#endif  // KALDI_SGMM2_DECODABLE_AM_SGMM2_H_
diff --git a/src/sgmm2/estimate-am-sgmm2-ebw.cc b/src/sgmm2/estimate-am-sgmm2-ebw.cc
deleted file mode 100644
index 42a05dcca7c..00000000000
--- a/src/sgmm2/estimate-am-sgmm2-ebw.cc
+++ /dev/null
@@ -1,736 +0,0 @@
-// sgmm2/estimate-am-sgmm2-ebw.cc
-
-// Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "sgmm2/estimate-am-sgmm2-ebw.h"
-#include "util/kaldi-thread.h"
-using std::vector;
-
-namespace kaldi {
-
-void EbwAmSgmm2Updater::Update(const MleAmSgmm2Accs &num_accs,
-                               const MleAmSgmm2Accs &den_accs,
-                               AmSgmm2 *model,
-                               SgmmUpdateFlagsType flags,
-                               BaseFloat *auxf_change_out,
-                               BaseFloat *count_out) {
-
-  // Various quantities need to be computed at the start, before we
-  // change any of the model parameters.
-  std::vector< SpMatrix<double> > Q_num, Q_den, H, S_means;
-
-  if (flags & kSgmmPhoneProjections) {
-    MleAmSgmm2Updater::ComputeQ(num_accs, *model, &Q_num);
-    MleAmSgmm2Updater::ComputeQ(den_accs, *model, &Q_den);
-  }
-  if (flags & kSgmmCovarianceMatrix) { // compute the difference between
-    // the num and den S_means matrices... this is what we will need.
-    MleAmSgmm2Updater::ComputeSMeans(num_accs, *model, &S_means);
-    std::vector< SpMatrix<double> > S_means_tmp;
-    MleAmSgmm2Updater::ComputeSMeans(den_accs, *model, &S_means_tmp);
-    for (size_t i = 0; i < S_means.size(); i++)
-      S_means[i].AddSp(-1.0, S_means_tmp[i]);
-  }
-  if (flags & (kSgmmPhoneVectors | kSgmmPhoneWeightProjections))
-    model->ComputeH(&H);
-
-  Vector<double> gamma_num(num_accs.num_gaussians_);
-  for (int32 j1 = 0; j1 < num_accs.num_groups_; j1++)
-    gamma_num.AddRowSumMat(1.0, num_accs.gamma_[j1]);
-  Vector<double> gamma_den(den_accs.num_gaussians_);
-  for (int32 j1 = 0; j1 < den_accs.num_groups_; j1++)
-    gamma_den.AddRowSumMat(1.0, den_accs.gamma_[j1]);
-
-  BaseFloat tot_impr = 0.0;
-
-  if (flags & kSgmmPhoneVectors)
-    tot_impr += UpdatePhoneVectors(num_accs, den_accs, H, model);
-
-  if (flags & kSgmmPhoneProjections)
-    tot_impr += UpdateM(num_accs, den_accs, Q_num, Q_den,
-                        gamma_num, gamma_den, model);
-
-  if (flags & kSgmmPhoneWeightProjections)
-    tot_impr += UpdateW(num_accs, den_accs, gamma_num, gamma_den, model);
-
-  if (flags & kSgmmSpeakerWeightProjections)
-    tot_impr += UpdateU(num_accs, den_accs, gamma_num, gamma_den, model);
-
-  if (flags & kSgmmCovarianceMatrix)
-    tot_impr += UpdateVars(num_accs, den_accs,
-                           gamma_num, gamma_den, S_means, model);
-
-  if (flags & kSgmmSubstateWeights)
-    tot_impr += UpdateSubstateWeights(num_accs, den_accs, model);
-
-  if (flags & kSgmmSpeakerProjections)
-    tot_impr += UpdateN(num_accs, den_accs, gamma_num, gamma_den, model);
-
-
-  if (auxf_change_out) *auxf_change_out = tot_impr * num_accs.total_frames_;
-  if (count_out) *count_out = num_accs.total_frames_;
-
-  if (fabs(num_accs.total_frames_ - den_accs.total_frames_) >
-      0.01*(num_accs.total_frames_ + den_accs.total_frames_))
-    KALDI_WARN << "Num and den frame counts differ, "
-               << num_accs.total_frames_ << " vs. " << den_accs.total_frames_;
-
-  BaseFloat like_diff = num_accs.total_like_ - den_accs.total_like_;
-
-  KALDI_LOG << "***Averaged differenced likelihood per frame is "
-            << (like_diff/num_accs.total_frames_)
-            << " over " << (num_accs.total_frames_) << " frames.";
-  KALDI_LOG << "***Note: for this to be at all meaningful, if you use "
-            << "\"canceled\" stats you will have to renormalize this over "
-            << "the \"real\" frame count.";
-  KALDI_ASSERT(num_accs.total_frames_ > 0 && den_accs.total_frames_ > 0);
-
-  model->ComputeNormalizers();
-}
-
-
-class EbwUpdatePhoneVectorsClass: public MultiThreadable { // For multi-threaded.
- public:
-  EbwUpdatePhoneVectorsClass(const EbwAmSgmm2Updater *updater,
-                             const MleAmSgmm2Accs &num_accs,
-                             const MleAmSgmm2Accs &den_accs,
-                             const std::vector<SpMatrix<double> > &H,
-                             AmSgmm2 *model,
-                             double *auxf_impr):
-      updater_(updater), num_accs_(num_accs), den_accs_(den_accs),
-      model_(model), H_(H), auxf_impr_ptr_(auxf_impr), auxf_impr_(0.0) { }
-
-  EbwUpdatePhoneVectorsClass(const EbwUpdatePhoneVectorsClass &other) :
-      MultiThreadable(other),
-      updater_(other.updater_), num_accs_(other.num_accs_),
-      den_accs_(other.den_accs_), model_(other.model_),
-      H_(other.H_), auxf_impr_ptr_(other.auxf_impr_ptr_), auxf_impr_(0.0) { }
-
-  ~EbwUpdatePhoneVectorsClass() {
-    *auxf_impr_ptr_ += auxf_impr_;
-  }
-
-  inline void operator() () {
-    // Note: give them local copy of the sums we're computing,
-    // which will be propagated to the total sums in the destructor.
-    updater_->UpdatePhoneVectorsInternal(num_accs_, den_accs_, H_, model_,
-                                         &auxf_impr_, num_threads_, thread_id_);
-  }
- private:
-  const EbwAmSgmm2Updater *updater_;
-  const MleAmSgmm2Accs &num_accs_;
-  const MleAmSgmm2Accs &den_accs_;
-  AmSgmm2 *model_;
-  const std::vector<SpMatrix<double> > &H_;
-  double *auxf_impr_ptr_;
-  double auxf_impr_;
-};
-
-
-void EbwAmSgmm2Updater::ComputePhoneVecStats(
-    const MleAmSgmm2Accs &accs,
-    const AmSgmm2 &model,
-    const std::vector<SpMatrix<double> > &H,
-    int32 j1,
-    int32 m,
-    const Vector<double> &w_jm_in,
-    double gamma_jm,
-    Vector<double> *g_jm,
-    SpMatrix<double> *H_jm) {
-  Vector<double> w_jm(w_jm_in);
-  if (!accs.a_.empty() && accs.a_[j1](m, 0) != 0) { // [SSGMM]
-    w_jm.MulElements(accs.a_[j1].Row(m)); // multiply by "a" quantities..
-    w_jm.Scale(1.0 / w_jm.Sum()); // renormalize.
-  }
-  g_jm->CopyFromVec(accs.y_[j1].Row(m));
-  for (int32 i = 0; i < accs.num_gaussians_; i++) {
-    double gamma_jmi = accs.gamma_[j1](m, i);
-    double quadratic_term = std::max(gamma_jmi, gamma_jm * w_jm(i));
-    double scalar = gamma_jmi - gamma_jm * w_jm(i) + quadratic_term
-        * VecVec(model.w_.Row(i), model.v_[j1].Row(m));
-    g_jm->AddVec(scalar, model.w_.Row(i));
-    if (gamma_jmi != 0.0)
-      H_jm->AddSp(gamma_jmi, H[i]);  // The most important term..
-    if (quadratic_term > 1.0e-10)
-      H_jm->AddVec2(static_cast<BaseFloat>(quadratic_term), model.w_.Row(i));
-  }
-}
-
-
-// Runs the phone vectors update for a subset of states (called
-// multi-threaded).
-void EbwAmSgmm2Updater::UpdatePhoneVectorsInternal(
-    const MleAmSgmm2Accs &num_accs,
-    const MleAmSgmm2Accs &den_accs,
-    const std::vector<SpMatrix<double> > &H,
-    AmSgmm2 *model,
-    double *auxf_impr,
-    int32 num_threads,
-    int32 thread_id) const {
-
-  int32 block_size = (num_accs.num_groups_ + (num_threads-1)) / num_threads,
-      j1_start = block_size * thread_id,
-      j1_end = std::min(num_accs.num_groups_, j1_start + block_size);
-
-  int32 S = num_accs.phn_space_dim_, I = num_accs.num_gaussians_;
-
-  for (int32 j1 = j1_start; j1 < j1_end; j1++) {
-    double num_state_count = 0.0,
-        state_auxf_impr = 0.0;
-    Vector<double> w_jm(I);
-    for (int32 m = 0; m < model->NumSubstatesForGroup(j1); m++) {
-      double gamma_jm_num = num_accs.gamma_[j1].Row(m).Sum();
-      double gamma_jm_den = den_accs.gamma_[j1].Row(m).Sum();
-      num_state_count += gamma_jm_num;
-      Vector<double> g_jm_num(S);  // computed using eq. 58 of SGMM paper [for numerator stats]
-      SpMatrix<double> H_jm_num(S);  // computed using eq. 59 of SGMM paper [for numerator stats]
-      Vector<double> g_jm_den(S); // same, but for denominator stats.
-      SpMatrix<double> H_jm_den(S);
-
-      // Compute the weights for this sub-state.
-      // w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm})  eq.(7)
-      w_jm.AddMatVec(1.0, Matrix<double>(model->w_), kNoTrans,
-                     Vector<double>(model->v_[j1].Row(m)), 0.0);
-      w_jm.ApplySoftMax();
-      // Note: in the ML code, in the SSGMM case, at this point the w_jm would
-      // be modified with the "a" quantities to get the "\tilde{w}_{jm}" of the
-      // SSGMM techreport.  But in this code, it gets done inside ComputePhoneVecStats.
-
-      ComputePhoneVecStats(num_accs, *model, H, j1, m, w_jm, gamma_jm_num,
-                           &g_jm_num, &H_jm_num);
-      ComputePhoneVecStats(den_accs, *model, H, j1, m, w_jm, gamma_jm_den,
-                           &g_jm_den, &H_jm_den);
-
-      Vector<double> v_jm(model->v_[j1].Row(m));
-      Vector<double> local_derivative(S); // difference of derivative of numerator
-      // and denominator objetive function.
-      local_derivative.AddVec(1.0, g_jm_num);
-      local_derivative.AddSpVec(-1.0, H_jm_num, v_jm, 1.0);
-      local_derivative.AddVec(-1.0, g_jm_den);
-      local_derivative.AddSpVec(-1.0 * -1.0, H_jm_den, v_jm, 1.0);
-
-      SpMatrix<double> quadratic_term(H_jm_num);
-      quadratic_term.AddSp(1.0, H_jm_den);
-      double substate_count = 1.0e-10 + gamma_jm_num + gamma_jm_den;
-      quadratic_term.Scale( (substate_count + options_.tau_v) / substate_count);
-      quadratic_term.Scale(1.0 / (options_.lrate_v + 1.0e-10) );
-
-      Vector<double> delta_v_jm(S);
-
-      SolverOptions opts;
-      opts.name = "v";
-      opts.K = options_.max_cond;
-      opts.eps = options_.epsilon;
-
-      double auxf_impr =
-          ((gamma_jm_num + gamma_jm_den == 0) ? 0.0 :
-           SolveQuadraticProblem(quadratic_term,
-                                 local_derivative,
-                                 opts, &delta_v_jm));
-
-      v_jm.AddVec(1.0, delta_v_jm);
-      model->v_[j1].Row(m).CopyFromVec(v_jm);
-      state_auxf_impr += auxf_impr;
-    }
-
-    *auxf_impr += state_auxf_impr;
-    if (j1 < 10 && thread_id == 0) {
-      KALDI_LOG << "Objf impr for group j = " << j1 << "  is "
-                << (state_auxf_impr / (num_state_count + 1.0e-10))
-                << " over " << num_state_count << " frames";
-    }
-  }
-}
-
-double EbwAmSgmm2Updater::UpdatePhoneVectors(const MleAmSgmm2Accs &num_accs,
-                                             const MleAmSgmm2Accs &den_accs,
-                                             const vector< SpMatrix<double> > &H,
-                                             AmSgmm2 *model) const {
-  KALDI_LOG << "Updating phone vectors.";
-
-  double count = 0.0, auxf_impr = 0.0;
-
-  int32 J1 = num_accs.num_groups_;
-  for (int32 j1 = 0; j1 < J1; j1++) count += num_accs.gamma_[j1].Sum();
-
-  EbwUpdatePhoneVectorsClass c(this, num_accs, den_accs, H, model, &auxf_impr);
-  RunMultiThreaded(c);
-
-  auxf_impr /= count;
-
-  KALDI_LOG << "**Overall auxf improvement for v is " << auxf_impr
-            << " over " << count << " frames";
-  return auxf_impr;
-}
-
-
-double EbwAmSgmm2Updater::UpdateM(const MleAmSgmm2Accs &num_accs,
-                                  const MleAmSgmm2Accs &den_accs,
-                                  const std::vector< SpMatrix<double> > &Q_num,
-                                  const std::vector< SpMatrix<double> > &Q_den,
-                                  const Vector<double> &gamma_num,
-                                  const Vector<double> &gamma_den,
-                                  AmSgmm2 *model) const {
-  int32 S = model->PhoneSpaceDim(),
-      D = model->FeatureDim(),
-      I = model->NumGauss();
-
-  Vector<double> impr_vec(I);
-
-  for (int32 i = 0; i < I; i++) {
-    double gamma_i_num = gamma_num(i), gamma_i_den = gamma_den(i);
-
-    if (gamma_i_num + gamma_i_den == 0.0) {
-      KALDI_WARN << "Not updating phonetic basis for i = " << i
-                 << " because count is zero. ";
-      continue;
-    }
-
-    Matrix<double> Mi(model->M_[i]);
-    Matrix<double> L(D, S); // this is something like the Y quantity, which
-    // represents the linear term in the objf on M-- except that we make it the local
-    // derivative about the current value, instead of the derivative around zero.
-    // But it's not exactly the derivative w.r.t. M, due to the factor of Sigma_i.
-    // The auxiliary function is Q(x) = tr(M^T P Y) - 0.5 tr(P M Q M^T),
-    // where P is Y^{-1}.  The quantity L we define here will be Y - M Q,
-    // and you can think of this as like the local derivative, except there is
-    // a term P in there.
-    L.AddMat(1.0, num_accs.Y_[i]);
-    L.AddMatSp(-1.0, Mi, kNoTrans, Q_num[i], 1.0);
-    L.AddMat(-1.0, den_accs.Y_[i]);
-    L.AddMatSp(-1.0*-1.0, Mi, kNoTrans, Q_den[i], 1.0);
-
-    SpMatrix<double> Q(S); // This is a combination of the Q's for the numerator and denominator.
-    Q.AddSp(1.0, Q_num[i]);
-    Q.AddSp(1.0, Q_den[i]);
-
-    double state_count = 1.0e-10 + gamma_i_num + gamma_i_den; // the count
-    // represented by the quadratic part of the stats.
-    Q.Scale( (state_count + options_.tau_M) / state_count );
-    Q.Scale( 1.0 / (options_.lrate_M + 1.0e-10) );
-
-
-    SolverOptions opts;
-    opts.name = "M";
-    opts.K = options_.max_cond;
-    opts.eps = options_.epsilon;
-
-    Matrix<double> deltaM(D, S);
-    double impr =
-        SolveQuadraticMatrixProblem(Q, L,
-                                    SpMatrix<double>(model->SigmaInv_[i]),
-                                    opts, &deltaM);
-
-    impr_vec(i) = impr;
-    Mi.AddMat(1.0, deltaM);
-    model->M_[i].CopyFromMat(Mi);
-    if (i < 10 || impr / state_count > 3.0) {
-      KALDI_VLOG(2) << "Objf impr for projection M for i = " << i << ", is "
-                    << (impr/(gamma_i_num + 1.0e-20)) << " over " << gamma_i_num
-                    << " frames";
-    }
-  }
-  BaseFloat tot_count = gamma_num.Sum(), tot_impr = impr_vec.Sum();
-
-  tot_impr /= (tot_count + 1.0e-20);
-  KALDI_LOG << "Overall auxiliary function improvement for model projections "
-            << "M is " << tot_impr << " over " << tot_count << " frames";
-
-  KALDI_VLOG(1) << "Updating M: num-count is " << gamma_num;
-  KALDI_VLOG(1) << "Updating M: den-count is " << gamma_den;
-  KALDI_VLOG(1) << "Updating M: objf-impr is " << impr_vec;
-
-  return tot_impr;
-}
-
-
-// Note: we do just one iteration of the weight-projection update here.  The
-// weak-sense auxiliary functions used don't really make sense if we do it for
-// multiple iterations.  It would be possible to use a similar auxiliary
-// function to the one on my (D. Povey)'s thesis for the Gaussian mixture
-// weights, which would make sense for multiple iterations, but this would be a
-// bit more complex to implement and probably would not give much improvement
-// over this approach.
-double EbwAmSgmm2Updater::UpdateW(const MleAmSgmm2Accs &num_accs,
-                                  const MleAmSgmm2Accs &den_accs,
-                                  const Vector<double> &gamma_num,
-                                  const Vector<double> &gamma_den,
-                                  AmSgmm2 *model) {
-  KALDI_LOG << "Updating weight projections";
-
-  int32 I = num_accs.num_gaussians_, S = num_accs.phn_space_dim_;
-
-  Matrix<double> g_i_num(I, S), g_i_den(I, S);
-
-  // View F_i_{num,den} as vectors of SpMatrix [i.e. symmetric matrices,
-  // linearized into vectors]
-  Matrix<double> F_i_num(I, (S*(S+1))/2), F_i_den(I, (S*(S+1))/2);
-
-  Vector<double> impr_vec(I);
-
-  // Get the F_i and g_i quantities-- this is done in parallel (multi-core),
-  // using the same code we use in the ML update [except we get it for
-  // numerator and denominator separately.]
-  Matrix<double> w(model->w_);
-  {
-    std::vector<Matrix<double> > log_a_num;
-    if (model->HasSpeakerDependentWeights())
-      MleAmSgmm2Updater::ComputeLogA(num_accs, &log_a_num);
-    double garbage;
-    UpdateWClass c_num(num_accs, *model, w, log_a_num, &F_i_num, &g_i_num, &garbage);
-    RunMultiThreaded(c_num);
-  }
-  {
-    std::vector<Matrix<double> > log_a_den;
-    if (model->HasSpeakerDependentWeights())
-      MleAmSgmm2Updater::ComputeLogA(den_accs, &log_a_den);
-    double garbage;
-    UpdateWClass c_den(den_accs, *model, w, log_a_den, &F_i_den, &g_i_den, &garbage);
-    RunMultiThreaded(c_den);
-  }
-
-  for (int32 i = 0; i < I; i++) {
-
-    // auxf was originally formulated in terms of the change in w (i.e. the
-    // g quantities are the local derivatives), so there is less hassle than
-    // with some of the other updates, in changing it to be discriminative.
-    // we essentially just difference the linear terms and add the quadratic
-    // terms.
-
-    Vector<double> derivative(g_i_num.Row(i));
-    derivative.AddVec(-1.0, g_i_den.Row(i));
-    // F_i_num quadratic_term is a bit like the negated 2nd derivative
-    // of the numerator stats-- actually it's not the actual 2nd deriv,
-    // but an upper bound on it.
-    SpMatrix<double> quadratic_term(S), tmp_F(S);
-    quadratic_term.CopyFromVec(F_i_num.Row(i));
-    tmp_F.CopyFromVec(F_i_den.Row(i)); // tmp_F is used for Vector->SpMatrix conversion.
-    quadratic_term.AddSp(1.0, tmp_F);
-
-    double state_count = gamma_num(i) + gamma_den(i);
-
-    quadratic_term.Scale((state_count + options_.tau_w) / (state_count + 1.0e-10));
-    quadratic_term.Scale(1.0 / (options_.lrate_w + 1.0e-10) );
-
-    Vector<double> delta_w(S);
-
-    SolverOptions opts;
-    opts.name = "w";
-    opts.K = options_.max_cond;
-    opts.eps = options_.epsilon;
-
-    double objf_impr =
-        SolveQuadraticProblem(quadratic_term, derivative, opts, &delta_w);
-
-    impr_vec(i) = objf_impr;
-    if (i < 10 || objf_impr / (gamma_num(i) + 1.0e-10) > 2.0) {
-      KALDI_LOG << "Predicted objf impr for w per frame is "
-                << (objf_impr / (gamma_num(i) + 1.0e-10))
-                << " over " << gamma_num(i) << " frames.";
-    }
-    model->w_.Row(i).AddVec(1.0, delta_w);
-  }
-  KALDI_VLOG(1) << "Updating w: numerator count is " << gamma_num;
-  KALDI_VLOG(1) << "Updating w: denominator count is " << gamma_den;
-  KALDI_VLOG(1) << "Updating w: objf-impr is " << impr_vec;
-
-  double tot_num_count = gamma_num.Sum(), tot_impr = impr_vec.Sum();
-  tot_impr /= tot_num_count;
-
-  KALDI_LOG << "**Overall objf impr for w per frame is "
-            << tot_impr << " over " << tot_num_count
-            << " frames.";
-  return tot_impr;
-}
-
-
-double EbwAmSgmm2Updater::UpdateU(const MleAmSgmm2Accs &num_accs,
-                                  const MleAmSgmm2Accs &den_accs,
-                                  const Vector<double> &gamma_num,
-                                  const Vector<double> &gamma_den,
-                                  AmSgmm2 *model) {
-  int32 T = num_accs.spk_space_dim_;
-  double tot_impr = 0.0;
-  for (int32 i = 0; i < num_accs.num_gaussians_; i++) {
-    if (gamma_num(i) < 200.0) {
-      KALDI_LOG << "Numerator count is small " << gamma_num(i) << " for gaussian "
-                << i << ", not updating u_i.";
-      continue;
-    }
-    Vector<double> u_i(model->u_.Row(i));
-    Vector<double> delta_u(T);
-    Vector<double> t(T); // derivative.
-    t.AddVec(1.0, num_accs.t_.Row(i));
-    t.AddVec(-1.0, den_accs.t_.Row(i));
-    SpMatrix<double> U(T); // quadratic term.
-    U.AddSp(1.0, num_accs.U_[i]);
-    U.AddSp(1.0, den_accs.U_[i]);
-
-    double state_count = gamma_num(i) + gamma_den(i);
-    U.Scale((state_count + options_.tau_u) / (state_count + 1.0e-10));
-    U.Scale(1.0 / (options_.lrate_u + 1.0e-10) );
-
-    SolverOptions opts;
-    opts.name = "u";
-    opts.K = options_.max_cond;
-    opts.eps = options_.epsilon;
-
-    double impr = SolveQuadraticProblem(U, t, opts, &delta_u);
-    double impr_per_frame = impr / gamma_num(i);
-    if (impr_per_frame > options_.max_impr_u) {
-      KALDI_WARN << "Updating speaker weight projections u, for Gaussian index "
-                 << i << ", impr/frame is " << impr_per_frame << " over "
-                 << gamma_num(i) << " frames, scaling back to not exceed "
-                 << options_.max_impr_u;
-      double scale = options_.max_impr_u / impr_per_frame;
-      impr *= scale;
-      delta_u.Scale(scale);
-      // Note: a linear scaling of "impr" with "scale" is not quite accurate
-      // in depicting how the quadratic auxiliary function varies as we change
-      // the scale on "delta", but this does not really matter-- the goal is
-      // to limit the auxiliary-function change to not be too large.
-    }
-    if (i < 10) {
-      KALDI_LOG << "Objf impr for spk weight-projection u for i = " << (i)
-                << ", is " << (impr / (gamma_num(i) + 1.0e-20)) << " over "
-                << gamma_num(i) << " frames";
-    }
-    u_i.AddVec(1.0, delta_u);
-    model->u_.Row(i).CopyFromVec(u_i);
-    tot_impr += impr;
-  }
-  KALDI_LOG << "**Overall objf impr for u is " << (tot_impr/gamma_num.Sum())
-            << ", over " << gamma_num.Sum() << " frames";
-  return tot_impr;
-}
-
-
-double EbwAmSgmm2Updater::UpdateN(const MleAmSgmm2Accs &num_accs,
-                                  const MleAmSgmm2Accs &den_accs,
-                                  const Vector<double> &gamma_num,
-                                  const Vector<double> &gamma_den,
-                                  AmSgmm2 *model) const {
-  if (num_accs.spk_space_dim_ == 0 || num_accs.R_.size() == 0 ||
-      num_accs.Z_.size() == 0) {
-    KALDI_ERR << "Speaker subspace dim is zero or no stats accumulated";
-  }
-
-  int32 I = num_accs.num_gaussians_, D = num_accs.feature_dim_,
-      T = num_accs.spk_space_dim_;
-
-  Vector<double> impr_vec(I);
-
-  for (int32 i = 0; i < I; i++) {
-    double gamma_i_num = gamma_num(i), gamma_i_den = gamma_den(i);
-    if (gamma_i_num + gamma_i_den == 0.0) {
-      KALDI_WARN << "Not updating speaker basis for i = " << i
-                 << " because count is zero. ";
-      continue;
-    }
-    Matrix<double> Ni(model->N_[i]);
-    // See comment near declaration of L in UpdateM().  This update is the
-    // same, but change M->N, Y->Z and Q->R.
-
-    Matrix<double> L(D, T);
-    L.AddMat(1.0, num_accs.Z_[i]);
-    L.AddMatSp(-1.0, Ni, kNoTrans, num_accs.R_[i], 1.0);
-    L.AddMat(-1.0, den_accs.Z_[i]);
-    L.AddMatSp(-1.0*-1.0, Ni, kNoTrans, den_accs.R_[i], 1.0);
-
-    SpMatrix<double> R(T); // combination of the numerator and denominator R's.
-    R.AddSp(1.0, num_accs.R_[i]);
-    R.AddSp(1.0, den_accs.R_[i]);
-
-    double state_count = 1.0e-10 + gamma_i_num + gamma_i_den; // the count
-    // represented by the quadratic part of the stats.
-    R.Scale( (state_count + options_.tau_N) / state_count );
-    R.Scale( 1.0 / (options_.lrate_N + 1.0e-10) );
-
-    Matrix<double> deltaN(D, T);
-
-    SolverOptions opts;
-    opts.name = "N";
-    opts.K = options_.max_cond;
-    opts.eps = options_.epsilon;
-
-    double impr =
-        SolveQuadraticMatrixProblem(R, L,
-                                    SpMatrix<double>(model->SigmaInv_[i]),
-                                    opts, &deltaN);
-    impr_vec(i) = impr;
-    Ni.AddMat(1.0, deltaN);
-    model->N_[i].CopyFromMat(Ni);
-    if (i < 10 || impr / (state_count+1.0e-20) > 3.0) {
-      KALDI_LOG << "Objf impr for spk projection N for i = " << (i)
-                << ", is " << (impr / (gamma_i_num + 1.0e-20)) << " over "
-                << gamma_i_num << " frames";
-    }
-  }
-
-  KALDI_VLOG(1) << "Updating N: numerator count is " << gamma_num;
-  KALDI_VLOG(1) << "Updating N: denominator count is " << gamma_den;
-  KALDI_VLOG(1) << "Updating N: objf-impr is " << impr_vec;
-
-  double tot_count = gamma_num.Sum(), tot_impr = impr_vec.Sum();
-  tot_impr /= (tot_count + 1.0e-20);
-  KALDI_LOG << "**Overall auxf impr for N is " << tot_impr
-            << " over " << tot_count << " frames";
-  return tot_impr;
-}
-
-double EbwAmSgmm2Updater::UpdateVars(const MleAmSgmm2Accs &num_accs,
-                                     const MleAmSgmm2Accs &den_accs,
-                                     const Vector<double> &gamma_num,
-                                     const Vector<double> &gamma_den,
-                                     const std::vector< SpMatrix<double> > &S_means,
-                                     AmSgmm2 *model) const {
-  // Note: S_means contains not only the quantity S_means in the paper,
-  // but also has a term - (Y_i M_i^T + M_i Y_i^T).  Plus, it is differenced
-  // between numerator and denominator.  We don't calculate it here,
-  // because it had to be computed with the original model, before we
-  // changed the M quantities.
-  int32 I = num_accs.num_gaussians_;
-  KALDI_ASSERT(S_means.size() == I);
-  Vector<double> impr_vec(I);
-
-  for (int32 i = 0; i < I; i++) {
-    double num_count = gamma_num(i), den_count = gamma_den(i);
-
-    SpMatrix<double> SigmaStats(S_means[i]);
-    SigmaStats.AddSp(1.0, num_accs.S_[i]);
-    SigmaStats.AddSp(-1.0, den_accs.S_[i]);
-    // SigmaStats now contain the stats for estimating Sigma (as in the main SGMM paper),
-    // differenced between num and den.
-    SpMatrix<double> SigmaInvOld(model->SigmaInv_[i]), SigmaOld(model->SigmaInv_[i]);
-    SigmaOld.Invert();
-    double count = num_count - den_count;
-    KALDI_ASSERT(options_.lrate_Sigma <= 1.0);
-    double inv_lrate = 1.0 / options_.lrate_Sigma;
-    // These formulas assure that the objective function behaves in
-    // a roughly symmetric way w.r.t. num and den counts.
-    double E_den = 1.0 + inv_lrate, E_num = inv_lrate - 1.0;
-
-    double smoothing_count =
-        (options_.tau_Sigma * inv_lrate) + // multiply tau_Sigma by inverse-lrate
-        (E_den * den_count) +              // for compatibility with other updates.
-        (E_num * num_count) +
-        1.0e-10;
-    SigmaStats.AddSp(smoothing_count, SigmaOld);
-    count += smoothing_count;
-    SigmaStats.Scale(1.0 / count);
-    SpMatrix<double> SigmaInv(SigmaStats); // before floor and ceiling.  Currently sigma,
-    // not its inverse.
-    bool verbose = false;
-    int n_floor = SigmaInv.ApplyFloor(SigmaOld, options_.cov_min_value, verbose);
-    SigmaInv.Invert(); // make it inverse variance.
-    int n_ceiling = SigmaInv.ApplyFloor(SigmaInvOld, options_.cov_min_value, verbose);
-
-    // this auxf_change.
-    double auxf_change = -0.5 * count *(TraceSpSp(SigmaInv, SigmaStats)
-                                        - TraceSpSp(SigmaInvOld, SigmaStats)
-                                        - SigmaInv.LogDet()
-                                        + SigmaInvOld.LogDet());
-
-    model->SigmaInv_[i].CopyFromSp(SigmaInv);
-    impr_vec(i) = auxf_change;
-    if (i < 10 || auxf_change / (num_count+den_count+1.0e-10) > 2.0
-        || n_floor+n_ceiling > 0) {
-      KALDI_LOG << "Updating variance: Auxf change per frame for Gaussian "
-                << i << " is " << (auxf_change / num_count) << " over "
-                << num_count << " frames " << "(den count was " << den_count
-                << "), #floor,ceil was " << n_floor << ", " << n_ceiling;
-    }
-  }
-  KALDI_VLOG(1) << "Updating Sigma: numerator count is " << gamma_num;
-  KALDI_VLOG(1) << "Updating Sigma: denominator count is " << gamma_den;
-  KALDI_VLOG(1) << "Updating Sigma: objf-impr is " << impr_vec;
-
-  double tot_count = gamma_num.Sum(), tot_impr = impr_vec.Sum();
-  tot_impr /= tot_count+1.0e-20;
-  KALDI_LOG << "**Overall auxf impr for Sigma is " << tot_impr
-            << " over " << tot_count << " frames";
-  return tot_impr;
-}
-
-
-double EbwAmSgmm2Updater::UpdateSubstateWeights(
-    const MleAmSgmm2Accs &num_accs,
-    const MleAmSgmm2Accs &den_accs,
-    AmSgmm2 *model) {
-  KALDI_LOG << "Updating substate mixture weights";
-
-  double tot_count = 0.0, tot_impr = 0.0;
-  for (int32 j2 = 0; j2 < num_accs.num_pdfs_; j2++) {
-    int32 M = model->NumSubstatesForPdf(j2);
-    Vector<double> num_occs(M), den_occs(M),
-        orig_weights(model->c_[j2]), weights(model->c_[j2]);
-
-    for (int32 m = 0; m < M; m++) {
-      num_occs(m) = num_accs.gamma_c_[j2](m)
-          + options_.tau_c * weights(m);
-      den_occs(m) = den_accs.gamma_c_[j2](m);
-    }
-
-    if (weights.Dim() > 1) {
-      double begin_auxf = 0.0, end_auxf = 0.0;
-      for (int32 m = 0; m < M; m++) {  // see eq. 4.32, Dan Povey's PhD thesis.
-        begin_auxf += num_occs(m) * log (weights(m))
-            - den_occs(m) * weights(m) / orig_weights(m);
-      }
-      for (int32 iter = 0; iter < 50; iter++) {
-        Vector<double> k_jm(M);
-        double max_m = 0.0;
-        for (int32 m = 0; m < M; m++)
-          max_m = std::max(max_m, den_occs(m)/orig_weights(m));
-        for (int32 m = 0; m < M; m++)
-          k_jm(m) = max_m - den_occs(m)/orig_weights(m);
-        for (int32 m = 0; m < M; m++)
-          weights(m) = num_occs(m) + k_jm(m)*weights(m);
-        weights.Scale(1.0 / weights.Sum());
-      }
-      for (int32 m = 0; m < M; m++)
-        weights(m) = std::max(weights(m),
-                              static_cast<double>(options_.min_substate_weight));
-      weights.Scale(1.0 / weights.Sum()); // renormalize.
-
-      for (int32 m = 0; m < M; m++) {
-        end_auxf += num_occs(m) * log (weights(m))
-            - den_occs(m) * weights(m) / orig_weights(m);
-      }
-      tot_impr += end_auxf - begin_auxf;
-      double this_impr = ((end_auxf - begin_auxf) / num_occs.Sum());
-      if (j2 < 10 || this_impr > 0.5) {
-        KALDI_LOG << "Updating substate weights: auxf impr for pdf " << j2
-                  << " is " << this_impr << " per frame over " << num_occs.Sum()
-                  << " frames (den count is " << den_occs.Sum() << ")";
-      }
-    }
-    model->c_[j2].CopyFromVec(weights);
-    tot_count += den_occs.Sum(); // Note: num and den occs should be the
-    // same, except num occs are smoothed, so this is what we want.
-  }
-
-  tot_impr /= (tot_count + 1.0e-20);
-
-  KALDI_LOG << "**Overall auxf impr for c is " << tot_impr
-            << " over " << tot_count << " frames";
-  return tot_impr;
-}
-
-}  // namespace kaldi
diff --git a/src/sgmm2/estimate-am-sgmm2-ebw.h b/src/sgmm2/estimate-am-sgmm2-ebw.h
deleted file mode 100644
index c1ec188f367..00000000000
--- a/src/sgmm2/estimate-am-sgmm2-ebw.h
+++ /dev/null
@@ -1,242 +0,0 @@
-// sgmm2/estimate-am-sgmm2-ebw.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_SGMM2_ESTIMATE_AM_SGMM2_EBW_H_
-#define KALDI_SGMM2_ESTIMATE_AM_SGMM2_EBW_H_ 1
-
-#include <string>
-#include <vector>
-
-#include "gmm/model-common.h"
-#include "itf/options-itf.h"
-#include "sgmm2/estimate-am-sgmm2.h"
-
-namespace kaldi {
-
-/**
-   This header implements a form of Extended Baum-Welch training for SGMMs.
-   If you are confused by this comment, see Dan Povey's thesis for an explanation of
-   Extended Baum-Welch.
-   A note on the EBW (Extended Baum-Welch) updates for the SGMMs... In general there is
-   a parameter-specific value D that is similar to the D in EBW for GMMs.  The value of
-   D is generally set to:
-     E * (denominator-count for that parameter)   +   tau-value for that parameter
-   where the tau-values are user-specified parameters that are specific to the type of
-   the parameter (e.g. phonetic vector, subspace projection, etc.).  Things are a bit
-   more complex for this update than for GMMs, because it's not just a question of picking
-   a tau-value for smoothing: there is sometimes a scatter-matrix of some kind (e.g.
-   an outer product of vectors, or something) that defines a quadratic objective function
-   that we'll add as smoothing.  We have to pick where to get this scatter-matrix from.
-   We feel that it's appropriate for the "E" part of the D to get its scatter-matrix from
-   denominator stats, and the tau part of the D to get half its scatter-matrix from the
-   both the numerator and denominator stats, assigned a weight proportional to how much
-   stats there were.  When you see the auxiliary function written out, it's clear why this
-   makes sense.
-
- */
-
-struct EbwAmSgmm2Options {
-  BaseFloat tau_v; ///<  Smoothing constant for updates of sub-state vectors v_{jm}
-  BaseFloat lrate_v; ///< Learning rate used in updating v-- default 0.5
-  BaseFloat tau_M; ///<  Smoothing constant for the M quantities (phone-subspace projections)
-  BaseFloat lrate_M; ///< Learning rate used in updating M-- default 0.5
-  BaseFloat tau_N; ///<  Smoothing constant for the N quantities (speaker-subspace projections)
-  BaseFloat lrate_N; ///< Learning rate used in updating N-- default 0.5
-  BaseFloat tau_c;  ///< Tau value for smoothing substate weights (c)
-  BaseFloat tau_w;  ///< Tau value for smoothing update of phonetic-subspace weight projectsions (w)
-  BaseFloat lrate_w; ///< Learning rate used in updating w-- default 1.0
-  BaseFloat tau_u;  ///< Tau value for smoothing update of speaker-subspace weight projectsions (u)
-  BaseFloat lrate_u; ///< Learning rate used in updating u-- default 1.0
-  BaseFloat max_impr_u; ///< Maximum improvement/frame allowed for u [0.25, carried over from ML update.]
-  BaseFloat tau_Sigma; ///< Tau value for smoothing covariance-matrices Sigma.
-  BaseFloat lrate_Sigma; ///< Learning rate used in updating Sigma-- default 0.5
-  BaseFloat min_substate_weight; ///< Minimum allowed weight in a sub-state.
-  
-  BaseFloat cov_min_value; ///< E.g. 0.5-- the maximum any eigenvalue of a covariance
-  /// is allowed to change.  [this is the minimum; the maximum is the inverse of this,
-  /// i.e. 2.0 in this case.  For example, 0.9 would constrain the covariance quite tightly,
-  /// 0.1 would be a loose setting.
-  
-  BaseFloat max_cond; ///< large value used in SolveQuadraticProblem.
-  BaseFloat epsilon;  ///< very small value used in SolveQuadraticProblem; workaround
-  /// for an issue in some implementations of SVD.
-  
-  EbwAmSgmm2Options() {
-    tau_v = 50.0;
-    lrate_v = 0.5;
-    tau_M = 500.0;
-    lrate_M = 0.5;
-    tau_N = 500.0;
-    lrate_N = 0.5;
-    tau_c = 10.0;
-    tau_w = 50.0;
-    lrate_w = 1.0;
-    tau_u = 50.0;
-    lrate_u = 1.0;
-    max_impr_u = 0.25;
-    tau_Sigma = 500.0;
-    lrate_Sigma = 0.5;
-
-    min_substate_weight = 1.0e-05;
-    cov_min_value = 0.5;
-    
-    max_cond = 1.0e+05;
-    epsilon = 1.0e-40;
-  }
-
-  void Register(OptionsItf *opts) {
-    std::string module = "EbwAmSgmm2Options: ";
-    opts->Register("tau-v", &tau_v, module+
-                   "Smoothing constant for phone vector estimation.");
-    opts->Register("lrate-v", &lrate_v, module+
-                   "Learning rate constant for phone vector estimation.");
-    opts->Register("tau-m", &tau_M, module+
-                   "Smoothing constant for estimation of phonetic-subspace projections (M).");
-    opts->Register("lrate-m", &lrate_M, module+
-                   "Learning rate constant for phonetic-subspace projections.");
-    opts->Register("tau-n", &tau_N, module+
-                   "Smoothing constant for estimation of speaker-subspace projections (N).");
-    opts->Register("lrate-n", &lrate_N, module+
-                   "Learning rate constant for speaker-subspace projections.");
-    opts->Register("tau-c", &tau_c, module+
-                   "Smoothing constant for estimation of substate weights (c)");
-    opts->Register("tau-w", &tau_w, module+
-                   "Smoothing constant for estimation of phonetic-space weight projections (w)");
-    opts->Register("lrate-w", &lrate_w, module+
-                   "Learning rate constant for phonetic-space weight-projections (w)");
-    opts->Register("tau-u", &tau_u, module+
-                   "Smoothing constant for estimation of speaker-space weight projections (u)");
-    opts->Register("lrate-u", &lrate_u, module+
-                   "Learning rate constant for speaker-space weight-projections (u)");
-    opts->Register("tau-sigma", &tau_Sigma, module+
-                   "Smoothing constant for estimation of within-class covariances (Sigma)");
-    opts->Register("lrate-sigma", &lrate_Sigma, module+
-                   "Constant that controls speed of learning for variances (larger->slower)");
-    opts->Register("cov-min-value", &cov_min_value, module+
-                   "Minimum value that an eigenvalue of the updated covariance matrix can take, "
-                   "relative to its old value (maximum is inverse of this.)");
-    opts->Register("min-substate-weight", &min_substate_weight, module+
-                   "Floor for weights of sub-states.");
-    opts->Register("max-cond", &max_cond, module+
-                   "Value used in handling singular matrices during update.");
-    opts->Register("epsilon", &max_cond, module+
-                   "Value used in handling singular matrices during update.");
-  }
-};
-
-
-/** \class EbwAmSgmmUpdater
- *  Contains the functions needed to update the SGMM parameters.
- */
-class EbwAmSgmm2Updater {
- public:
-  explicit EbwAmSgmm2Updater(const EbwAmSgmm2Options &options):
-      options_(options) {}
-  
-  void Update(const MleAmSgmm2Accs &num_accs,
-              const MleAmSgmm2Accs &den_accs,
-              AmSgmm2 *model,
-              SgmmUpdateFlagsType flags,
-              BaseFloat *auxf_change_out,
-              BaseFloat *count_out);
-    
- protected:
-  // The following two classes relate to multi-core parallelization of some
-  // phases of the update.
-  friend class EbwUpdateWClass;
-  friend class EbwUpdatePhoneVectorsClass;
- private:
-  EbwAmSgmm2Options options_;
-
-  Vector<double> gamma_j_;  ///< State occupancies
-
-  double UpdatePhoneVectors(const MleAmSgmm2Accs &num_accs,
-                            const MleAmSgmm2Accs &den_accs,
-                            const std::vector< SpMatrix<double> > &H,
-                            AmSgmm2 *model) const;
-  
-  // Called from UpdatePhoneVectors; updates a subset of states
-  // (relates to multi-threading).
-  void UpdatePhoneVectorsInternal(const MleAmSgmm2Accs &num_accs,
-                                  const MleAmSgmm2Accs &den_accs,
-                                  const std::vector<SpMatrix<double> > &H,
-                                  AmSgmm2 *model,
-                                  double *auxf_impr,
-                                  int32 num_threads,
-                                  int32 thread_id) const;
-  // Called from UpdatePhoneVectorsInternal
-  static void ComputePhoneVecStats(const MleAmSgmm2Accs &accs,
-                                   const AmSgmm2 &model,
-                                   const std::vector<SpMatrix<double> > &H,
-                                   int32 j1,
-                                   int32 m,
-                                   const Vector<double> &w_jm,
-                                   double gamma_jm,
-                                   Vector<double> *g_jm,
-                                   SpMatrix<double> *H_jm);
-                                    
-  double UpdateM(const MleAmSgmm2Accs &num_accs,
-                 const MleAmSgmm2Accs &den_accs,
-                 const std::vector< SpMatrix<double> > &Q_num,
-                 const std::vector< SpMatrix<double> > &Q_den,
-                 const Vector<double> &gamma_num,
-                 const Vector<double> &gamma_den,
-                 AmSgmm2 *model) const;
-  
-  double UpdateN(const MleAmSgmm2Accs &num_accs,
-                 const MleAmSgmm2Accs &den_accs,
-                 const Vector<double> &gamma_num,
-                 const Vector<double> &gamma_den,
-                 AmSgmm2 *model) const;
-  
-  double UpdateVars(const MleAmSgmm2Accs &num_accs,
-                    const MleAmSgmm2Accs &den_accs,
-                    const Vector<double> &gamma_num,
-                    const Vector<double> &gamma_den,
-                    const std::vector< SpMatrix<double> > &S_means,
-                    AmSgmm2 *model) const;
-
-  /// Note: in the discriminative case we do just one iteration of
-  /// updating the w quantities.
-  double UpdateW(const MleAmSgmm2Accs &num_accs,
-                 const MleAmSgmm2Accs &den_accs,
-                 const Vector<double> &gamma_num,
-                 const Vector<double> &gamma_den,
-                 AmSgmm2 *model);
-
-
-  double UpdateU(const MleAmSgmm2Accs &num_accs,
-                 const MleAmSgmm2Accs &den_accs,
-                 const Vector<double> &gamma_num,
-                 const Vector<double> &gamma_den,
-                 AmSgmm2 *model);
-  
-  double UpdateSubstateWeights(const MleAmSgmm2Accs &num_accs,
-                               const MleAmSgmm2Accs &den_accs,
-                               AmSgmm2 *model);
-
-  KALDI_DISALLOW_COPY_AND_ASSIGN(EbwAmSgmm2Updater);
-  EbwAmSgmm2Updater() {}  // Prevent unconfigured updater.
-};
-
-
-}  // namespace kaldi
-
-
-#endif  // KALDI_SGMM2_ESTIMATE_AM_SGMM2_EBW_H_
diff --git a/src/sgmm2/estimate-am-sgmm2-test.cc b/src/sgmm2/estimate-am-sgmm2-test.cc
deleted file mode 100644
index bfdb161d95f..00000000000
--- a/src/sgmm2/estimate-am-sgmm2-test.cc
+++ /dev/null
@@ -1,167 +0,0 @@
-// sgmm2/estimate-am-sgmm2-test.cc
-
-// Copyright 2009-2011  Saarland University (author:  Arnab Ghoshal)
-//           2012-2013  Johns Hopkins University (author: Daniel Povey)
-//                      Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-math.h"
-#include "gmm/model-test-common.h"
-#include "sgmm2/am-sgmm2.h"
-#include "sgmm2/estimate-am-sgmm2.h"
-#include "util/kaldi-io.h"
-
-using kaldi::AmSgmm2;
-using kaldi::MleAmSgmm2Accs;
-using kaldi::int32;
-using kaldi::BaseFloat;
-using kaldi::Exp;
-
-namespace ut = kaldi::unittest;
-
-// Tests the Read() and Write() methods for the accumulators, in both binary
-// and ASCII mode, as well as Check().
-void TestSgmm2AccsIO(const AmSgmm2 &sgmm,
-                     const kaldi::Matrix<BaseFloat> &feats) {
-  using namespace kaldi;
-  kaldi::SgmmUpdateFlagsType flags = kaldi::kSgmmAll & ~kSgmmSpeakerWeightProjections;
-  kaldi::Sgmm2PerFrameDerivedVars frame_vars;
-  kaldi::Sgmm2PerSpkDerivedVars empty;
-  frame_vars.Resize(sgmm.NumGauss(), sgmm.FeatureDim(),
-                    sgmm.PhoneSpaceDim());
-  kaldi::Sgmm2GselectConfig sgmm_config;
-  sgmm_config.full_gmm_nbest = std::min(sgmm_config.full_gmm_nbest,
-                                        sgmm.NumGauss());
-  MleAmSgmm2Accs accs(sgmm, flags, true);
-  BaseFloat loglike = 0.0;
-
-  for (int32 i = 0; i < feats.NumRows(); i++) {
-    std::vector<int32> gselect;
-    sgmm.GaussianSelection(sgmm_config, feats.Row(i), &gselect);
-    sgmm.ComputePerFrameVars(feats.Row(i), gselect, empty, &frame_vars);
-    loglike += accs.Accumulate(sgmm, frame_vars, 0, 1.0, &empty);
-  }
-  accs.CommitStatsForSpk(sgmm, empty);
-
-  kaldi::MleAmSgmm2Options update_opts;
-  AmSgmm2 *sgmm1 = new AmSgmm2();
-  sgmm1->CopyFromSgmm2(sgmm, false, false);
-  kaldi::MleAmSgmm2Updater updater(update_opts);
-  updater.Update(accs, sgmm1, flags);
-  sgmm1->ComputeDerivedVars();
-  std::vector<int32> gselect;
-  Sgmm2LikelihoodCache like_cache(sgmm.NumGroups(), sgmm.NumPdfs());
-
-  sgmm1->GaussianSelection(sgmm_config, feats.Row(0), &gselect);
-  sgmm1->ComputePerFrameVars(feats.Row(0), gselect, empty, &frame_vars);
-  BaseFloat loglike1 = sgmm1->LogLikelihood(frame_vars, 0, &like_cache, &empty);
-  delete sgmm1;
-
-  // First, non-binary write
-  accs.Write(kaldi::Output("tmpf", false).Stream(), false);
-  bool binary_in;
-  MleAmSgmm2Accs *accs1 = new MleAmSgmm2Accs();
-  // Non-binary read
-  kaldi::Input ki1("tmpf", &binary_in);
-  accs1->Read(ki1.Stream(), binary_in, false);
-  accs1->Check(sgmm, true);
-  AmSgmm2 *sgmm2 = new AmSgmm2();
-  sgmm2->CopyFromSgmm2(sgmm, false, false);
-  updater.Update(*accs1, sgmm2, flags);
-  sgmm2->ComputeDerivedVars();
-  sgmm2->GaussianSelection(sgmm_config, feats.Row(0), &gselect);
-  sgmm2->ComputePerFrameVars(feats.Row(0), gselect, empty, &frame_vars);
-  Sgmm2LikelihoodCache like_cache2(sgmm2->NumGroups(), sgmm2->NumPdfs());
-  BaseFloat loglike2 = sgmm2->LogLikelihood(frame_vars, 0, &like_cache2, &empty);
-  kaldi::AssertEqual(loglike1, loglike2, 1e-4);
-  delete accs1;
-
-  // Next, binary write
-  accs.Write(kaldi::Output("tmpfb", true).Stream(), true);
-  MleAmSgmm2Accs *accs2 = new MleAmSgmm2Accs();
-  // Binary read
-  kaldi::Input ki2("tmpfb", &binary_in);
-  accs2->Read(ki2.Stream(), binary_in, false);
-  accs2->Check(sgmm, true);
-  AmSgmm2 *sgmm3 = new AmSgmm2();
-  sgmm3->CopyFromSgmm2(sgmm, false, false);
-  updater.Update(*accs2, sgmm3, flags);
-  sgmm3->ComputeDerivedVars();
-  sgmm3->GaussianSelection(sgmm_config, feats.Row(0), &gselect);
-  sgmm3->ComputePerFrameVars(feats.Row(0), gselect, empty, &frame_vars);
-  Sgmm2LikelihoodCache like_cache3(sgmm3->NumGroups(), sgmm3->NumPdfs());
-  BaseFloat loglike3 = sgmm3->LogLikelihood(frame_vars, 0, &like_cache3, &empty);
-  kaldi::AssertEqual(loglike1, loglike3, 1e-6);
-
-  // Testing the MAP update of M
-  update_opts.tau_map_M = 10;
-  update_opts.full_col_cov = (RandUniform() > 0.5)? true : false;
-  update_opts.full_row_cov = (RandUniform() > 0.5)? true : false;
-  kaldi::MleAmSgmm2Updater updater_map(update_opts);
-  sgmm3->CopyFromSgmm2(sgmm, false, false);
-  updater_map.Update(*accs2, sgmm3, flags);
-
-  delete accs2;
-  delete sgmm2;
-  delete sgmm3;
-
-  unlink("tmpf");
-  unlink("tmpfb");
-}
-
-void UnitTestEstimateSgmm2() {
-  int32 dim = 1 + kaldi::RandInt(0, 9);  // random dimension of the gmm
-  int32 num_comp = 2 + kaldi::RandInt(0, 9);  // random mixture size
-  kaldi::FullGmm full_gmm;
-  ut::InitRandFullGmm(dim, num_comp, &full_gmm);
-
-  AmSgmm2 sgmm;
-  kaldi::Sgmm2GselectConfig config;
-  std::vector<int32> pdf2group;
-  pdf2group.push_back(0);
-  sgmm.InitializeFromFullGmm(full_gmm, pdf2group, dim+1, dim, false, 0.9); // TODO-- make this true!
-  sgmm.ComputeNormalizers();
-
-  kaldi::Matrix<BaseFloat> feats;
-
-  {  // First, generate random means and variances
-    int32 num_feat_comp = num_comp + kaldi::RandInt(-num_comp/2, num_comp/2);
-    kaldi::Matrix<BaseFloat> means(num_feat_comp, dim),
-        vars(num_feat_comp, dim);
-    for (int32 m = 0; m < num_feat_comp; m++) {
-      for (int32 d= 0; d < dim; d++) {
-        means(m, d) = kaldi::RandGauss();
-        vars(m, d) = Exp(kaldi::RandGauss()) + 1e-2;
-      }
-    }
-    // Now generate random features with those means and variances.
-    feats.Resize(num_feat_comp * 200, dim);
-    for (int32 m = 0; m < num_feat_comp; m++) {
-      kaldi::SubMatrix<BaseFloat> tmp(feats, m*200, 200, 0, dim);
-      ut::RandDiagGaussFeatures(200, means.Row(m), vars.Row(m), &tmp);
-    }
-  }
-  sgmm.ComputeDerivedVars();
-  TestSgmm2AccsIO(sgmm, feats);
-}
-
-int main() {
-  for (int i = 0; i < 10; i++)
-    UnitTestEstimateSgmm2();
-  std::cout << "Test OK.\n";
-  return 0;
-}
diff --git a/src/sgmm2/estimate-am-sgmm2.cc b/src/sgmm2/estimate-am-sgmm2.cc
deleted file mode 100644
index 6bb277314df..00000000000
--- a/src/sgmm2/estimate-am-sgmm2.cc
+++ /dev/null
@@ -1,1952 +0,0 @@
-// sgmm2/estimate-am-sgmm2.cc
-
-// Copyright 2009-2011  Microsoft Corporation;  Lukas Burget;
-//                      Saarland University (Author: Arnab Ghoshal);
-//                      Ondrej Glembek;  Yanmin Qian;
-// Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey)
-//                      Liang Lu;  Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "sgmm2/am-sgmm2.h"
-#include "sgmm2/estimate-am-sgmm2.h"
-#include "util/kaldi-thread.h"
-
-namespace kaldi {
-
-using std::string;
-using std::vector;
-
-void MleAmSgmm2Accs::Write(std::ostream &out_stream, bool binary) const {
-
-  WriteToken(out_stream, binary, "<SGMMACCS>");
-  WriteToken(out_stream, binary, "<NUMPDFS>");
-  WriteBasicType(out_stream, binary, num_pdfs_);
-  WriteToken(out_stream, binary, "<NUMGROUPS>");
-  WriteBasicType(out_stream, binary, num_groups_);
-  WriteToken(out_stream, binary, "<NUMGaussians>");
-  WriteBasicType(out_stream, binary, num_gaussians_);
-  WriteToken(out_stream, binary, "<FEATUREDIM>");
-  WriteBasicType(out_stream, binary, feature_dim_);
-  WriteToken(out_stream, binary, "<PHONESPACEDIM>");
-  WriteBasicType(out_stream, binary, phn_space_dim_);
-  WriteToken(out_stream, binary, "<SPKSPACEDIM>");
-  WriteBasicType(out_stream, binary, spk_space_dim_);
-  if (!binary) out_stream << "\n";
-
-  if (Y_.size() != 0) {
-    KALDI_ASSERT(gamma_.size() != 0);
-    WriteToken(out_stream, binary, "<Y>");
-    for (int32 i = 0; i < num_gaussians_; i++) {
-      Matrix<BaseFloat>(Y_[i]).Write(out_stream, binary);
-    }
-  }
-  if (Z_.size() != 0) {
-    KALDI_ASSERT(R_.size() != 0);
-    WriteToken(out_stream, binary, "<Z>");
-    for (int32 i = 0; i < num_gaussians_; i++) {
-      Matrix<BaseFloat>(Z_[i]).Write(out_stream, binary);
-    }
-    WriteToken(out_stream, binary, "<R>");
-    for (int32 i = 0; i < num_gaussians_; i++) {
-      SpMatrix<BaseFloat>(R_[i]).Write(out_stream, binary);
-    }
-  }
-  if (S_.size() != 0) {
-    KALDI_ASSERT(gamma_.size() != 0);
-    WriteToken(out_stream, binary, "<S>");
-    for (int32 i = 0; i < num_gaussians_; i++) {
-      SpMatrix<BaseFloat>(S_[i]).Write(out_stream, binary);
-    }
-  }
-  if (y_.size() != 0) {
-    KALDI_ASSERT(gamma_.size() != 0);
-    WriteToken(out_stream, binary, "<y>");
-    for (int32 j1 = 0; j1 < num_groups_; j1++) {
-      Matrix<BaseFloat>(y_[j1]).Write(out_stream, binary);
-    }
-  }
-  if (gamma_.size() != 0) { // These stats are large
-    // -> write as single precision.
-    WriteToken(out_stream, binary, "<gamma>");
-    for (int32 j1 = 0; j1 < num_groups_; j1++) {
-      Matrix<BaseFloat> gamma_j1(gamma_[j1]);
-      gamma_j1.Write(out_stream, binary);
-    }
-  }
-  if (t_.NumRows() != 0) {
-    WriteToken(out_stream, binary, "<t>");
-    Matrix<BaseFloat>(t_).Write(out_stream, binary);
-  }
-  if (U_.size() != 0) {
-    WriteToken(out_stream, binary, "<U>");
-    for (int32 i = 0; i < num_gaussians_; i++) {
-      SpMatrix<BaseFloat>(U_[i]).Write(out_stream, binary);
-    }
-  }
-  if (gamma_c_.size() != 0) {
-    WriteToken(out_stream, binary, "<gamma_c>");
-    for (int32 j2 = 0; j2 < num_pdfs_; j2++) {
-      Vector<BaseFloat>(gamma_c_[j2]).Write(out_stream, binary);
-    }
-  }
-  if (a_.size() != 0) {
-    WriteToken(out_stream, binary, "<a>");
-    for (int32 j1 = 0; j1 < num_groups_; j1++) {
-      Matrix<BaseFloat>(a_[j1]).Write(out_stream, binary);
-    }
-  }
-  WriteToken(out_stream, binary, "<total_like>");
-  WriteBasicType(out_stream, binary, total_like_);
-
-  WriteToken(out_stream, binary, "<total_frames>");
-  WriteBasicType(out_stream, binary, total_frames_);
-
-  WriteToken(out_stream, binary, "</SGMMACCS>");
-}
-
-void MleAmSgmm2Accs::Read(std::istream &in_stream, bool binary,
-                         bool add) {
-  ExpectToken(in_stream, binary, "<SGMMACCS>");
-  ExpectToken(in_stream, binary, "<NUMPDFS>");
-  ReadBasicType(in_stream, binary, &num_pdfs_);
-  ExpectToken(in_stream, binary, "<NUMGROUPS>");
-  ReadBasicType(in_stream, binary, &num_groups_);
-  ExpectToken(in_stream, binary, "<NUMGaussians>");
-  ReadBasicType(in_stream, binary, &num_gaussians_);
-  ExpectToken(in_stream, binary, "<FEATUREDIM>");
-  ReadBasicType(in_stream, binary, &feature_dim_);
-  ExpectToken(in_stream, binary, "<PHONESPACEDIM>");
-  ReadBasicType(in_stream, binary, &phn_space_dim_);
-  ExpectToken(in_stream, binary, "<SPKSPACEDIM>");
-  ReadBasicType(in_stream, binary, &spk_space_dim_);
-
-  string token;
-  ReadToken(in_stream, binary, &token);
-
-  while (token != "</SGMMACCS>") {
-    if (token == "<Y>") {
-      Y_.resize(num_gaussians_);
-      for (size_t i = 0; i < Y_.size(); i++) {
-        Y_[i].Read(in_stream, binary, add);
-      }
-    } else if (token == "<Z>") {
-      Z_.resize(num_gaussians_);
-      for (size_t i = 0; i < Z_.size(); i++) {
-        Z_[i].Read(in_stream, binary, add);
-      }
-    } else if (token == "<R>") {
-      R_.resize(num_gaussians_);
-      if (gamma_s_.Dim() == 0) gamma_s_.Resize(num_gaussians_);
-      for (size_t i = 0; i < R_.size(); i++) {
-        R_[i].Read(in_stream, binary, add);
-      }
-    } else if (token == "<S>") {
-      S_.resize(num_gaussians_);
-      for (size_t i = 0; i < S_.size(); i++) {
-        S_[i].Read(in_stream, binary, add);
-      }
-    } else if (token == "<y>") {
-      y_.resize(num_groups_);
-      for (int32 j1 = 0; j1 < num_groups_; j1++) {
-        y_[j1].Read(in_stream, binary, add);
-      }
-    } else if (token == "<gamma>") {
-      gamma_.resize(num_groups_);
-      for (int32 j1 = 0; j1 < num_groups_; j1++) {
-        gamma_[j1].Read(in_stream, binary, add);
-      }
-      // Don't read gamma_s, it's just a temporary variable and
-      // not part of the permanent (non-speaker-specific) accs.
-    } else if (token == "<a>") {
-      a_.resize(num_groups_);
-      for (int32 j1 = 0; j1 < num_groups_; j1++) {
-        a_[j1].Read(in_stream, binary, add);
-      }
-    } else if (token == "<gamma_c>") {
-      gamma_c_.resize(num_pdfs_);
-      for (int32 j2 = 0; j2 < num_pdfs_; j2++) {
-        gamma_c_[j2].Read(in_stream, binary, add);
-      }
-    } else if (token == "<t>") {
-      t_.Read(in_stream, binary, add);
-    } else if (token == "<U>") {
-      U_.resize(num_gaussians_);
-      for (int32 i = 0; i < num_gaussians_; i++) {
-        U_[i].Read(in_stream, binary, add);
-      }
-    } else if (token == "<total_like>") {
-      double total_like;
-      ReadBasicType(in_stream, binary, &total_like);
-      if (add)
-        total_like_ += total_like;
-      else
-        total_like_ = total_like;
-    } else if (token == "<total_frames>") {
-      double total_frames;
-      ReadBasicType(in_stream, binary, &total_frames);
-      if (add)
-        total_frames_ += total_frames;
-      else
-        total_frames_ = total_frames;
-    } else {
-      KALDI_ERR << "Unexpected token '" << token << "' in model file ";
-    }
-    ReadToken(in_stream, binary, &token);
-  }
-}
-
-void MleAmSgmm2Accs::Check(const AmSgmm2 &model,
-                          bool show_properties) const {
-  if (show_properties)
-    KALDI_LOG << "Sgmm2PdfModel: J1 = " << num_groups_ << ", J2 = "
-              << num_pdfs_ << ", D = " << feature_dim_ << ", S = "
-              << phn_space_dim_ << ", T = " << spk_space_dim_ << ", I = "
-              << num_gaussians_;
-
-  KALDI_ASSERT(num_pdfs_ == model.NumPdfs() && num_pdfs_ > 0);
-  KALDI_ASSERT(num_groups_ == model.NumGroups() && num_groups_ > 0);
-  KALDI_ASSERT(num_gaussians_ == model.NumGauss() && num_gaussians_ > 0);
-  KALDI_ASSERT(feature_dim_ == model.FeatureDim() && feature_dim_ > 0);
-  KALDI_ASSERT(phn_space_dim_ == model.PhoneSpaceDim() && phn_space_dim_ > 0);
-  KALDI_ASSERT(spk_space_dim_ == model.SpkSpaceDim());
-
-  std::ostringstream debug_str;
-
-  if (Y_.size() == 0) {
-    debug_str << "Y: no.  ";
-  } else {
-    KALDI_ASSERT(gamma_.size() != 0);
-    KALDI_ASSERT(Y_.size() == static_cast<size_t>(num_gaussians_));
-    bool nz = false;
-    for (int32 i = 0; i < num_gaussians_; i++) {
-      KALDI_ASSERT(Y_[i].NumRows() == feature_dim_ &&
-                   Y_[i].NumCols() == phn_space_dim_);
-      if (!nz && Y_[i](0, 0) != 0) { nz = true; }
-    }
-    debug_str << "Y: yes, " << string(nz ? "nonzero. " : "zero. ");
-  }
-
-  if (Z_.size() == 0) {
-    KALDI_ASSERT(R_.size() == 0);
-    debug_str << "Z, R: no.  ";
-  } else {
-    KALDI_ASSERT(gamma_s_.Dim() == num_gaussians_);
-    KALDI_ASSERT(Z_.size() == static_cast<size_t>(num_gaussians_));
-    KALDI_ASSERT(R_.size() == static_cast<size_t>(num_gaussians_));
-    bool Z_nz = false, R_nz = false;
-    for (int32 i = 0; i < num_gaussians_; i++) {
-      KALDI_ASSERT(Z_[i].NumRows() == feature_dim_ &&
-                   Z_[i].NumCols() == spk_space_dim_);
-      KALDI_ASSERT(R_[i].NumRows() == spk_space_dim_);
-      if (!Z_nz && Z_[i](0, 0) != 0) { Z_nz = true; }
-      if (!R_nz && R_[i](0, 0) != 0) { R_nz = true; }
-    }
-    bool gamma_s_nz = !gamma_s_.IsZero();
-    debug_str << "Z: yes, " << string(Z_nz ? "nonzero. " : "zero. ");
-    debug_str << "R: yes, " << string(R_nz ? "nonzero. " : "zero. ");
-    debug_str << "gamma_s: yes, " << string(gamma_s_nz ? "nonzero. " : "zero. ");
-  }
-
-  if (S_.size() == 0) {
-    debug_str << "S: no.  ";
-  } else {
-    KALDI_ASSERT(gamma_.size() != 0);
-    bool S_nz = false;
-    KALDI_ASSERT(S_.size() == static_cast<size_t>(num_gaussians_));
-    for (int32 i = 0; i < num_gaussians_; i++) {
-      KALDI_ASSERT(S_[i].NumRows() == feature_dim_);
-      if (!S_nz && S_[i](0, 0) != 0) { S_nz = true; }
-    }
-    debug_str << "S: yes, " << string(S_nz ? "nonzero. " : "zero. ");
-  }
-
-  if (y_.size() == 0) {
-    debug_str << "y: no.  ";
-  } else {
-    KALDI_ASSERT(gamma_.size() != 0);
-    bool nz = false;
-    KALDI_ASSERT(y_.size() == static_cast<size_t>(num_groups_));
-    for (int32 j1 = 0; j1 < num_groups_; j1++) {
-      KALDI_ASSERT(y_[j1].NumRows() == model.NumSubstatesForGroup(j1));
-      KALDI_ASSERT(y_[j1].NumCols() == phn_space_dim_);
-      if (!nz && y_[j1](0, 0) != 0) { nz = true; }
-    }
-    debug_str << "y: yes, " << string(nz ? "nonzero. " : "zero. ");
-  }
-
-  if (a_.size() == 0) {
-    debug_str << "a: no.  ";
-  } else {
-    debug_str << "a: yes.  ";
-    bool nz = false;
-    KALDI_ASSERT(a_.size() == static_cast<size_t>(num_groups_));
-    for (int32 j1 = 0; j1 < num_groups_; j1++) {
-      KALDI_ASSERT(a_[j1].NumRows() == model.NumSubstatesForGroup(j1) &&
-                   a_[j1].NumCols() == num_gaussians_);
-      if (!nz && a_[j1].Sum() != 0) nz = true;
-    }
-    debug_str << "a: yes, " << string(nz ? "nonzero. " : "zero. "); // TODO: take out "string"
-  }
-
-  double tot_gamma = 0.0;
-  if (gamma_.size() == 0) {
-    debug_str << "gamma: no.  ";
-  } else {
-    debug_str << "gamma: yes.  ";
-    KALDI_ASSERT(gamma_.size() == static_cast<size_t>(num_groups_));
-    for (int32 j1 = 0; j1 < num_groups_; j1++) {
-      KALDI_ASSERT(gamma_[j1].NumRows() == model.NumSubstatesForGroup(j1) &&
-                   gamma_[j1].NumCols() == num_gaussians_);
-      tot_gamma += gamma_[j1].Sum();
-    }
-    bool nz = (tot_gamma != 0.0);
-    KALDI_ASSERT(gamma_c_.size() == num_pdfs_ && "gamma_ set up but not gamma_c_.");
-    debug_str << "gamma: yes, " << string(nz ? "nonzero. " : "zero. ");
-  }
-
-  if (gamma_c_.size() == 0) {
-    KALDI_ERR << "gamma_c_ not set up."; // required for all accs.
-  } else {
-    KALDI_ASSERT(gamma_c_.size() == num_pdfs_);
-    double tot_gamma_c = 0.0;
-    for (int32 j2 = 0; j2 < num_pdfs_; j2++) {
-      KALDI_ASSERT(gamma_c_[j2].Dim() == model.NumSubstatesForPdf(j2));
-      tot_gamma_c += gamma_c_[j2].Sum();
-    }
-    bool nz = (tot_gamma_c != 0.0);
-    debug_str << "gamma_c: yes, " << string(nz ? "nonzero. " : "zero. ");
-    if (!gamma_.empty() && !ApproxEqual(tot_gamma_c, tot_gamma))
-      KALDI_WARN << "Counts from gamma and gamma_c differ "
-                 << tot_gamma << " vs. " << tot_gamma_c;
-  }
-
-  if (t_.NumRows() == 0) {
-    debug_str << "t: no.  ";
-  } else {
-    KALDI_ASSERT(t_.NumRows() == num_gaussians_ &&
-                 t_.NumCols() == spk_space_dim_);
-    KALDI_ASSERT(!U_.empty()); // t and U are used together.
-    bool nz = (t_.FrobeniusNorm() != 0);
-    debug_str << "t: yes, " << string(nz ? "nonzero. " : "zero. ");
-  }
-
-  if (U_.size() == 0) {
-    debug_str << "U: no.  ";
-  } else {
-    bool nz = false;
-    KALDI_ASSERT(U_.size() == num_gaussians_);
-    for (int32 i = 0; i < num_gaussians_; i++) {
-      if (!nz && U_[i].FrobeniusNorm() != 0) nz = true;
-      KALDI_ASSERT(U_[i].NumRows() == spk_space_dim_);
-    }
-    KALDI_ASSERT(t_.NumRows() != 0); // t and U are used together.
-    debug_str << "t: yes, " << string(nz ? "nonzero. " : "zero. ");
-  }
-
-  if (show_properties)
-    KALDI_LOG << "Subspace GMM model properties: " << debug_str.str();
-}
-
-void MleAmSgmm2Accs::ResizeAccumulators(const AmSgmm2 &model,
-                                        SgmmUpdateFlagsType flags,
-                                        bool have_spk_vecs) {
-  num_pdfs_ = model.NumPdfs();
-  num_groups_ = model.NumGroups();
-  num_gaussians_ = model.NumGauss();
-  feature_dim_ = model.FeatureDim();
-  phn_space_dim_ = model.PhoneSpaceDim();
-  spk_space_dim_ = model.SpkSpaceDim();
-  total_frames_ = total_like_ = 0;
-
-  if (flags & (kSgmmPhoneProjections | kSgmmCovarianceMatrix)) {
-    Y_.resize(num_gaussians_);
-    for (int32 i = 0; i < num_gaussians_; i++) {
-      Y_[i].Resize(feature_dim_, phn_space_dim_);
-    }
-  } else {
-    Y_.clear();
-  }
-
-  if (flags & (kSgmmSpeakerProjections | kSgmmSpeakerWeightProjections)) {
-    gamma_s_.Resize(num_gaussians_);
-  } else {
-    gamma_s_.Resize(0);
-  }
-
-  if (flags & kSgmmSpeakerProjections) {
-    if (spk_space_dim_ == 0) {
-      KALDI_ERR << "Cannot set up accumulators for speaker projections "
-                << "because speaker subspace has not been set up";
-    }
-    Z_.resize(num_gaussians_);
-    R_.resize(num_gaussians_);
-    for (int32 i = 0; i < num_gaussians_; i++) {
-      Z_[i].Resize(feature_dim_, spk_space_dim_);
-      R_[i].Resize(spk_space_dim_);
-    }
-  } else {
-    Z_.clear();
-    R_.clear();
-  }
-
-  if (flags & kSgmmCovarianceMatrix) {
-    S_.resize(num_gaussians_);
-    for (int32 i = 0; i < num_gaussians_; i++) {
-      S_[i].Resize(feature_dim_);
-    }
-  } else {
-    S_.clear();
-  }
-
-  if (flags & (kSgmmPhoneVectors | kSgmmPhoneWeightProjections |
-               kSgmmCovarianceMatrix | kSgmmPhoneProjections)) {
-    gamma_.resize(num_groups_);
-    for (int32 j1 = 0; j1 < num_groups_; j1++) {
-      gamma_[j1].Resize(model.NumSubstatesForGroup(j1), num_gaussians_);
-    }
-  } else {
-    gamma_.clear();
-  }
-
-  if (flags & (kSgmmPhoneVectors | kSgmmPhoneWeightProjections)
-      && model.HasSpeakerDependentWeights() && have_spk_vecs) { // SSGMM code.
-    a_.resize(num_groups_);
-    for (int32 j1 = 0; j1 < num_groups_; j1++) {
-      a_[j1].Resize(model.NumSubstatesForGroup(j1),
-                    num_gaussians_);
-    }
-  } else {
-    a_.clear();
-  }
-
-  if (flags & kSgmmSpeakerWeightProjections) {
-    KALDI_ASSERT(model.HasSpeakerDependentWeights() &&
-                 "remove the flag \"u\" if you don't have u set up.");
-    a_s_.Resize(num_gaussians_);
-    t_.Resize(num_gaussians_, spk_space_dim_);
-    U_.resize(num_gaussians_);
-    for (int32 i = 0; i < num_gaussians_; i++)
-      U_[i].Resize(spk_space_dim_);
-  } else {
-    a_s_.Resize(0);
-    t_.Resize(0, 0);
-    U_.resize(0);
-  }
-
-  if (true) { // always set up gamma_c_; it's nominally for
-    // estimation of substate weights, but it's also required when
-    // GetStateOccupancies() is called.
-    gamma_c_.resize(num_pdfs_);
-    for (int32 j2 = 0; j2 < num_pdfs_; j2++) {
-      gamma_c_[j2].Resize(model.NumSubstatesForPdf(j2));
-    }
-  }
-
-
-  if (flags & kSgmmPhoneVectors) {
-    y_.resize(num_groups_);
-    for (int32 j1 = 0; j1 < num_groups_; j1++) {
-      y_[j1].Resize(model.NumSubstatesForGroup(j1), phn_space_dim_);
-    }
-  } else {
-    y_.clear();
-  }
-}
-
-BaseFloat MleAmSgmm2Accs::Accumulate(const AmSgmm2 &model,
-                                    const Sgmm2PerFrameDerivedVars &frame_vars,
-                                    int32 j2,
-                                    BaseFloat weight,
-                                    Sgmm2PerSpkDerivedVars *spk_vars) {
-  // Calculate Gaussian posteriors and collect statistics
-  Matrix<BaseFloat> posteriors;
-  BaseFloat log_like = model.ComponentPosteriors(frame_vars, j2, spk_vars, &posteriors);
-  posteriors.Scale(weight);
-  BaseFloat count = AccumulateFromPosteriors(model, frame_vars, posteriors,
-                                             j2, spk_vars);
-  // Note: total_frames_ is incremented in AccumulateFromPosteriors().
-  total_like_ += count * log_like;
-  return log_like;
-}
-
-BaseFloat MleAmSgmm2Accs::AccumulateFromPosteriors(
-    const AmSgmm2 &model,
-    const Sgmm2PerFrameDerivedVars &frame_vars,
-    const Matrix<BaseFloat> &posteriors,
-    int32 j2,
-    Sgmm2PerSpkDerivedVars *spk_vars) {
-  double tot_count = 0.0;
-  const vector<int32> &gselect = frame_vars.gselect;
-  // Intermediate variables
-  Vector<BaseFloat> gammat(gselect.size()), // sum of gammas over mix-weight.
-      a_is_part(gselect.size()); //
-  Vector<BaseFloat> xt_jmi(feature_dim_), mu_jmi(feature_dim_),
-      zt_jmi(spk_space_dim_);
-
-  int32 j1 = model.Pdf2Group(j2);
-  int32 num_substates = model.NumSubstatesForGroup(j1);
-
-  for (int32 m = 0; m < num_substates; m++) {
-    BaseFloat d_jms = model.GetDjms(j1, m, spk_vars);
-    BaseFloat gammat_jm = 0.0;
-    for (int32 ki = 0; ki < static_cast<int32>(gselect.size()); ki++) {
-      int32 i = gselect[ki];
-
-      // Eq. (39): gamma_{jmi}(t) = p (j, m, i|t)
-      BaseFloat gammat_jmi = RandPrune(posteriors(ki, m), rand_prune_);
-      if (gammat_jmi == 0.0) continue;
-      gammat(ki) += gammat_jmi;
-      if (gamma_s_.Dim() != 0)
-        gamma_s_(i) += gammat_jmi;
-      gammat_jm += gammat_jmi;
-
-      // Accumulate statistics for non-zero gaussian posteriors
-      tot_count += gammat_jmi;
-      if (!gamma_.empty()) {
-        // Eq. (40): gamma_{jmi} = \sum_t gamma_{jmi}(t)
-        gamma_[j1](m, i) += gammat_jmi;
-      }
-      if (!y_.empty()) {
-        // Eq. (41): y_{jm} = \sum_{t, i} \gamma_{jmi}(t) z_{i}(t)
-        // Suggestion:  move this out of the loop over m
-        y_[j1].Row(m).AddVec(gammat_jmi, frame_vars.zti.Row(ki));
-      }
-      if (!Y_.empty()) {
-        // Eq. (42): Y_{i} = \sum_{t, j, m} \gamma_{jmi}(t) x_{i}(t) v_{jm}^T
-        Y_[i].AddVecVec(gammat_jmi, frame_vars.xti.Row(ki),
-                        model.v_[j1].Row(m));
-      }
-      // Accumulate for speaker projections
-      if (!Z_.empty()) {
-        KALDI_ASSERT(spk_space_dim_ > 0);
-        // Eq. (43): x_{jmi}(t) = x_k(t) - M{i} v_{jm}
-        model.GetSubstateMean(j1, m, i, &mu_jmi);
-        xt_jmi.CopyFromVec(frame_vars.xt);
-        xt_jmi.AddVec(-1.0, mu_jmi);
-        // Eq. (44): Z_{i} = \sum_{t, j, m} \gamma_{jmi}(t) x_{jmi}(t) v^{s}'
-        if (spk_vars->v_s.Dim() != 0)  // interpret empty v_s as zero.
-          Z_[i].AddVecVec(gammat_jmi, xt_jmi, spk_vars->v_s);
-        // Eq. (49): \gamma_{i}^{(s)} = \sum_{t\in\Tau(s), j, m} gamma_{jmi}
-        // Will be used when you call CommitStatsForSpk(), to update R_.
-      }
-    } // loop over selected Gaussians
-    if (gammat_jm != 0.0) {
-      if (!a_.empty()) { // SSGMM code.
-        KALDI_ASSERT(d_jms > 0);
-        // below is eq. 40 in the MSR techreport.  Caution: there
-        // was an error in the original techreport.  The index i
-        // in the summation and the quantity \gamma_{jmi}^{(t)}
-        // should be differently named, e.g. i'.
-        a_[j1].Row(m).AddVec(gammat_jm / d_jms, spk_vars->b_is);
-      }
-      if (a_s_.Dim() != 0) { // [SSGMM]
-        KALDI_ASSERT(d_jms > 0);
-        KALDI_ASSERT(!model.w_jmi_.empty());
-        a_s_.AddVec(gammat_jm / d_jms, model.w_jmi_[j1].Row(m));
-      }
-      if (!gamma_c_.empty())
-        gamma_c_[j2](m) += gammat_jm;
-    }
-  } // loop over substates
-
-  if (!S_.empty()) {
-    for (int32 ki = 0; ki < static_cast<int32>(gselect.size()); ki++) {
-      // Eq. (47): S_{i} = \sum_{t, j, m} \gamma_{jmi}(t) x_{i}(t) x_{i}(t)^T
-      if (gammat(ki) != 0.0) {
-        int32 i = gselect[ki];
-        S_[i].AddVec2(gammat(ki), frame_vars.xti.Row(ki));
-      }
-    }
-  }
-  total_frames_ += tot_count;
-  return tot_count;
-}
-
-void MleAmSgmm2Accs::CommitStatsForSpk(const AmSgmm2 &model,
-                                       const Sgmm2PerSpkDerivedVars &spk_vars) {
-  const VectorBase<BaseFloat> &v_s = spk_vars.v_s;
-  if (v_s.Dim() != 0 && !v_s.IsZero() && !R_.empty()) {
-    for (int32 i = 0; i < num_gaussians_; i++)
-      // Accumulate Statistics R_{ki}
-      if (gamma_s_(i) != 0.0)
-        R_[i].AddVec2(gamma_s_(i),
-                      Vector<double>(v_s));
-  }
-  if (a_s_.Dim() != 0) {
-    Vector<BaseFloat> tmp(gamma_s_);
-    // tmp(i) = gamma_s^{(i)} - a_i^{(s)} b_i^{(s)}.
-    tmp.AddVecVec(-1.0, Vector<BaseFloat>(a_s_), spk_vars.b_is, 1.0);
-    t_.AddVecVec(1.0, tmp, v_s); // eq. 53 of techreport.
-    for (int32 i = 0; i < num_gaussians_; i++) {
-      U_[i].AddVec2(a_s_(i) * spk_vars.b_is(i),
-                    Vector<double>(v_s)); // eq. 54 of techreport.
-    }
-  }
-  gamma_s_.SetZero();
-  a_s_.SetZero();
-}
-
-void MleAmSgmm2Accs::GetStateOccupancies(Vector<BaseFloat> *occs) const {
-  int32 J2 = gamma_c_.size();
-  occs->Resize(J2);
-  for (int32 j2 = 0; j2 < J2; j2++) {
-    (*occs)(j2) = gamma_c_[j2].Sum();
-  }
-}
-
-void MleAmSgmm2Updater::Update(const MleAmSgmm2Accs &accs,
-                               AmSgmm2 *model,
-                               SgmmUpdateFlagsType flags) {
-  // Q_{i}, quadratic term for phonetic subspace estimation. Dim is [I][S][S]
-  std::vector< SpMatrix<double> > Q;
-
-  // Eq (74): S_{i}^{(means)}, scatter of substate mean vectors for estimating
-  // the shared covariance matrices. [Actually this variable contains also the
-  // term -(Y_i M_i^T + M_i Y_I^T).]  Dimension is [I][D][D].
-  std::vector< SpMatrix<double> > S_means;
-  std::vector<Matrix<double> > log_a;
-
-  Vector<double> gamma_i(accs.num_gaussians_);
-  for (int32 j1 = 0; j1 < accs.num_groups_; j1++)
-    gamma_i.AddRowSumMat(1.0, accs.gamma_[j1]); // add sum of rows of
-  // accs.gamma_[j1], to gamma_i.
-
-  if (flags & kSgmmPhoneProjections)
-    ComputeQ(accs, *model, &Q);
-  if (flags & kSgmmCovarianceMatrix)
-    ComputeSMeans(accs, *model, &S_means);
-  if (!accs.a_.empty())
-    ComputeLogA(accs, &log_a);
-
-  // quantities used in both vector and weights updates...
-  vector< SpMatrix<double> > H;
-  // "smoothing" matrices, weighted sums of above.
-  SpMatrix<double> H_sm; // weighted sum of H.  Used e.g. in renormalizing phonetic space.
-  if ((flags & (kSgmmPhoneVectors | kSgmmPhoneWeightProjections))
-      || options_.renormalize_V)
-    model->ComputeH(&H);
-
-  BaseFloat tot_impr = 0.0;
-
-  if (flags & kSgmmPhoneVectors)
-    tot_impr += UpdatePhoneVectors(accs, H, log_a, model);
-  if (flags & kSgmmPhoneProjections) {
-    if (options_.tau_map_M > 0.0)
-      tot_impr += MapUpdateM(accs, Q, gamma_i, model);  // MAP adaptation of M
-    else
-      tot_impr += UpdateM(accs, Q, gamma_i, model);
-  }
-  if (flags & kSgmmPhoneWeightProjections)
-    tot_impr += UpdateW(accs, log_a, gamma_i, model);
-  if (flags & kSgmmCovarianceMatrix)
-    tot_impr += UpdateVars(accs, S_means, gamma_i, model);
-  if (flags & kSgmmSubstateWeights)
-    tot_impr += UpdateSubstateWeights(accs, model);
-  if (flags & kSgmmSpeakerProjections)
-    tot_impr += UpdateN(accs, gamma_i, model);
-  if (flags & kSgmmSpeakerWeightProjections)
-    tot_impr += UpdateU(accs, gamma_i, model);
-
-  if ((flags & kSgmmSpeakerProjections) && (options_.renormalize_N))
-    RenormalizeN(accs, gamma_i, model); // if you renormalize N you have to
-  // alter any speaker vectors you're keeping around, as well.
-  // So be careful with this option.
-
-  if (options_.renormalize_V)
-    RenormalizeV(accs, model, gamma_i, H);
-
-  KALDI_LOG << "*Overall auxf improvement, combining all parameters, is "
-            << tot_impr;
-
-  KALDI_LOG << "***Overall data likelihood is "
-            << (accs.total_like_/accs.total_frames_)
-            << " over " << accs.total_frames_ << " frames.";
-
-  model->n_.clear(); // has become invalid.
-  model->w_jmi_.clear(); // has become invalid.
-  // we updated the v or w quantities.
-}
-
-// Compute the Q_{i} (Eq. 64)
-void MleAmSgmm2Updater::ComputeQ(const MleAmSgmm2Accs &accs,
-                                const AmSgmm2 &model,
-                                std::vector< SpMatrix<double> > *Q) {
-  Q->resize(accs.num_gaussians_);
-  for (int32 i = 0; i < accs.num_gaussians_; i++) {
-    (*Q)[i].Resize(accs.phn_space_dim_);
-    for (int32 j1 = 0; j1 < accs.num_groups_; j1++) {
-      for (int32 m = 0; m < model.NumSubstatesForGroup(j1); m++) {
-        if (accs.gamma_[j1](m, i) > 0.0) {
-          (*Q)[i].AddVec2(static_cast<BaseFloat>(accs.gamma_[j1](m, i)),
-                          model.v_[j1].Row(m));
-        }
-      }
-    }
-  }
-}
-
-// Compute the S_i^{(means)} quantities (Eq. 74).
-// Note: we seem to have also included in this variable
-// the term - (Y_i M_I^T + M_i Y_i^T).
-void MleAmSgmm2Updater::ComputeSMeans(const MleAmSgmm2Accs &accs,
-                                     const AmSgmm2 &model,
-                                     std::vector< SpMatrix<double> > *S_means) {
-  S_means->resize(accs.num_gaussians_);
-  Matrix<double> YM_MY(accs.feature_dim_, accs.feature_dim_);
-  Vector<BaseFloat> mu_jmi(accs.feature_dim_);
-  for (int32 i = 0; i < accs.num_gaussians_; i++) {
-    // YM_MY = - (Y_{i} M_{i}^T)
-    YM_MY.AddMatMat(-1.0, accs.Y_[i], kNoTrans,
-                    Matrix<double>(model.M_[i]), kTrans, 0.0);
-    // Add its own transpose: YM_MY = - (Y_{i} M_{i}^T + M_{i} Y_{i}^T)
-    {
-      Matrix<double> M(YM_MY, kTrans);
-      YM_MY.AddMat(1.0, M);
-    }
-    (*S_means)[i].Resize(accs.feature_dim_, kUndefined);
-    (*S_means)[i].CopyFromMat(YM_MY);  // Sigma_{i} = -(YM' + MY')
-
-    for (int32 j1 = 0; j1 < accs.num_groups_; j1++) {
-      for (int32 m = 0; m < model.NumSubstatesForGroup(j1); m++) {
-        if (accs.gamma_[j1](m, i) != 0.0) {
-          // Sigma_{i} += gamma_{jmi} * mu_{jmi}*mu_{jmi}^T
-          mu_jmi.AddMatVec(1.0, model.M_[i], kNoTrans, model.v_[j1].Row(m), 0.0);
-          (*S_means)[i].AddVec2(static_cast<BaseFloat>(accs.gamma_[j1](m, i)), mu_jmi);
-        }
-      }
-    }
-    KALDI_ASSERT(1.0 / (*S_means)[i](0, 0) != 0.0);
-  }
-}
-
-
-class UpdatePhoneVectorsClass: public MultiThreadable { // For multi-threaded.
- public:
-  UpdatePhoneVectorsClass(const MleAmSgmm2Updater &updater,
-                          const MleAmSgmm2Accs &accs,
-                          const std::vector<SpMatrix<double> > &H,
-                          const std::vector<Matrix<double> > &log_a,
-                          AmSgmm2 *model,
-                          double *auxf_impr):
-      updater_(updater), accs_(accs), model_(model),
-      H_(H), log_a_(log_a), auxf_impr_ptr_(auxf_impr),
-      auxf_impr_(0.0) { }
-
-  UpdatePhoneVectorsClass(const UpdatePhoneVectorsClass &other) :
-      MultiThreadable(other),
-      updater_(other.updater_), accs_(other.accs_), model_(other.model_),
-      H_(other.H_), log_a_(other.log_a_), auxf_impr_ptr_(other.auxf_impr_ptr_),
-      auxf_impr_(0.0) { }
-
-  ~UpdatePhoneVectorsClass() {
-    *auxf_impr_ptr_ += auxf_impr_;
-  }
-
-  inline void operator() () {
-    // Note: give them local copy of the sums we're computing,
-    // which will be propagated to the total sums in the destructor.
-    updater_.UpdatePhoneVectorsInternal(accs_, H_, log_a_, model_,
-                                        &auxf_impr_, num_threads_, thread_id_);
-  }
- private:
-  const MleAmSgmm2Updater &updater_;
-  const MleAmSgmm2Accs &accs_;
-  AmSgmm2 *model_;
-  const std::vector<SpMatrix<double> > &H_;
-  const std::vector<Matrix<double> > &log_a_;
-  double *auxf_impr_ptr_;
-  double auxf_impr_;
-};
-
-/**
-   In this update, smoothing terms are not supported.  However, it does compute
-   the auxiliary function after doing the update, and backtracks if it did not
-   increase (due to the weight terms, increase is not mathematically
-   guaranteed). */
-
-double MleAmSgmm2Updater::UpdatePhoneVectors(
-    const MleAmSgmm2Accs &accs,
-    const vector< SpMatrix<double> > &H,
-    const vector< Matrix<double> > &log_a,
-    AmSgmm2 *model) const {
-
-  KALDI_LOG << "Updating phone vectors";
-
-  double count = 0.0, auxf_impr = 0.0;  // sum over all states
-
-  for (int32 j1 = 0; j1 < accs.num_groups_; j1++)
-    count += accs.gamma_[j1].Sum();
-
-  UpdatePhoneVectorsClass c(*this, accs, H, log_a, model, &auxf_impr);
-  RunMultiThreaded(c);
-
-  double auxf_per_frame = auxf_impr / (count + 1.0e-20);
-
-  KALDI_LOG << "**Overall auxf impr for v is " << auxf_per_frame << " over "
-            << count << " frames";
-  return auxf_per_frame;
-}
-
-//static
-void MleAmSgmm2Updater::ComputeLogA(const MleAmSgmm2Accs &accs,
-                                    std::vector<Matrix<double> > *log_a) {
-  // This computes the logarithm of the statistics a_{jmi} defined
-  // in Eq. 40 of the SSGMM techreport.  Although the log of a_{jmi} never
-  // explicitly appears in the techreport, it happens to be more convenient
-  // in the code to use the log of it.
-  // Note: because of the way a is computed, for each (j,m) the
-  // entries over i should always be all zero or all nonzero.
-  int32 num_zeros = 0;
-  KALDI_ASSERT(accs.a_.size() == accs.num_groups_);
-  log_a->resize(accs.num_groups_);
-  for (int32 j1 = 0; j1 < accs.num_groups_; j1++) {
-    int32 num_substates = accs.a_[j1].NumRows();
-    KALDI_ASSERT(num_substates > 0);
-    (*log_a)[j1].Resize(num_substates, accs.num_gaussians_);
-    for (int32 m = 0; m < num_substates; m++) {
-      if (accs.a_[j1](m, 0) == 0.0) { // Zero accs.
-        num_zeros++;
-        if (accs.gamma_[j1].Row(m).Sum() != 0.0)
-          KALDI_WARN << "Inconsistency between a and gamma stats. [BAD!]";
-        // leave the row zero.  This means the sub-state saw no stats.
-      } else {
-        (*log_a)[j1].Row(m).CopyFromVec(accs.a_[j1].Row(m));
-        (*log_a)[j1].Row(m).ApplyLog();
-      }
-    }
-  }
-  if (num_zeros != 0)
-    KALDI_WARN << num_zeros
-               << " sub-states with zero \"a\" (and presumably gamma) stats.";
-}
-
-void MleAmSgmm2Updater::UpdatePhoneVectorsInternal(
-    const MleAmSgmm2Accs &accs,
-    const vector< SpMatrix<double> > &H,
-    const vector< Matrix<double> > &log_a,
-    AmSgmm2 *model,
-    double *auxf_impr_ptr,
-    int32 num_threads,
-    int32 thread_id) const {
-
-  int32 J1 = accs.num_groups_, block_size = (J1 + (num_threads-1)) / num_threads,
-      j1_start = block_size * thread_id,
-      j1_end = std::min(accs.num_groups_, j1_start + block_size);
-
-  double tot_auxf_impr = 0.0;
-
-  for (int32 j1 = j1_start; j1 < j1_end; j1++) {
-    for (int32 m = 0; m < model->NumSubstatesForGroup(j1); m++) {
-      double gamma_jm = accs.gamma_[j1].Row(m).Sum();
-      SpMatrix<double> X_jm(accs.phn_space_dim_);  // = \sum_i \gamma_{jmi} H_i
-
-      for (int32 i = 0; i < accs.num_gaussians_; i++) {
-        double gamma_jmi = accs.gamma_[j1](m, i);
-        if (gamma_jmi != 0.0)
-          X_jm.AddSp(gamma_jmi, H[i]);
-      }
-
-      Vector<double> v_jm_orig(model->v_[j1].Row(m)),
-          v_jm(v_jm_orig);
-
-      double exact_auxf_start = 0.0, exact_auxf = 0.0, approx_auxf_impr = 0.0;
-      int32 backtrack_iter, max_backtrack = 10;
-      for (backtrack_iter = 0; backtrack_iter < max_backtrack; backtrack_iter++) {
-        // Note: the 1st time we go through this loop we have not yet updated
-        // v_jm and it has the old value; the 2nd time, it has the updated value
-        // and we will typically break at this point, after verifying that
-        // the auxf has improved.
-
-        // w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm})  eq.(7)
-        Vector<double> w_jm(accs.num_gaussians_);
-        w_jm.AddMatVec(1.0, Matrix<double>(model->w_), kNoTrans,
-                       v_jm, 0.0);
-        if (!log_a.empty()) w_jm.AddVec(1.0, log_a[j1].Row(m)); // SSGMM techreport eq. 42
-        w_jm.Add(-w_jm.LogSumExp());  // it is now log w_jm
-
-
-        exact_auxf = VecVec(w_jm, accs.gamma_[j1].Row(m))
-            + VecVec(v_jm, accs.y_[j1].Row(m))
-            -0.5 * VecSpVec(v_jm, X_jm, v_jm);
-
-        if (backtrack_iter == 0) {
-          exact_auxf_start = exact_auxf;
-        } else {
-          if (exact_auxf >= exact_auxf_start) {
-            break;  // terminate backtracking.
-          } else {
-            KALDI_LOG << "Backtracking computation of v_jm for j = " << j1
-                      << " and m = " << m << " because auxf changed by "
-                      << (exact_auxf-exact_auxf_start) << " [vs. predicted:] "
-                      << approx_auxf_impr;
-            v_jm.AddVec(1.0, v_jm_orig);
-            v_jm.Scale(0.5);
-          }
-        }
-
-        if (backtrack_iter == 0) {  // computing updated value.
-          w_jm.ApplyExp();  // it is now w_jm
-          SpMatrix<double> H_jm(X_jm);
-          Vector<double> g_jm(accs.y_[j1].Row(m));
-          for (int32 i = 0; i < accs.num_gaussians_; i++) {
-            double gamma_jmi = accs.gamma_[j1](m, i);
-            double quadratic_term = std::max(gamma_jmi, gamma_jm * w_jm(i));
-            double scalar = gamma_jmi - gamma_jm * w_jm(i) + quadratic_term
-                * VecVec(model->w_.Row(i), model->v_[j1].Row(m));
-            g_jm.AddVec(scalar, model->w_.Row(i));
-            if (quadratic_term > 1.0e-10) {
-              H_jm.AddVec2(static_cast<BaseFloat>(quadratic_term), model->w_.Row(i));
-            }
-          }
-
-          SolverOptions opts;
-          opts.name = "v";
-          opts.K = options_.max_cond;
-          opts.eps = options_.epsilon;
-
-          approx_auxf_impr = SolveQuadraticProblem(H_jm, g_jm, opts, &v_jm);
-        }
-      }
-      double exact_auxf_impr = exact_auxf - exact_auxf_start;
-      tot_auxf_impr += exact_auxf_impr;
-      if (backtrack_iter == max_backtrack) {
-        KALDI_WARN << "Backtracked " << max_backtrack << " times [not updating]";
-      } else {
-        model->v_[j1].Row(m).CopyFromVec(v_jm);
-      }
-
-      if (j1 < 3 && m < 3) {
-        KALDI_LOG << "Auxf impr for j = " << j1 << " m = " << m << " is "
-                  << (exact_auxf_impr/gamma_jm+1.0e-20) << " per frame over "
-                  << gamma_jm << " frames.";
-      }
-    }
-  }
-  *auxf_impr_ptr = tot_auxf_impr;
-}
-
-
-void MleAmSgmm2Updater::RenormalizeV(const MleAmSgmm2Accs &accs,
-                                    AmSgmm2 *model,
-                                    const Vector<double> &gamma_i,
-                                    const vector<SpMatrix<double> > &H) {
-  // Compute H^{(sm)}, the "smoothing" matrix-- average of H's.
-  SpMatrix<double> H_sm(accs.phn_space_dim_);
-  for (int32 i = 0; i < accs.num_gaussians_; i++)
-    H_sm.AddSp(gamma_i(i), H[i]);
-  KALDI_ASSERT(gamma_i.Sum() > 0.0);
-  H_sm.Scale(1.0 / gamma_i.Sum());
-
-  SpMatrix<double> Sigma(accs.phn_space_dim_);
-  int32 count = 0;
-  for (int32 j1 = 0; j1 < accs.num_groups_; j1++) {
-    for (int32 m = 0; m < model->NumSubstatesForGroup(j1); m++) {
-      count++;
-      Sigma.AddVec2(static_cast<BaseFloat>(1.0), model->v_[j1].Row(m));
-    }
-  }
-  if (!Sigma.IsPosDef()) {
-    KALDI_LOG << "Not renormalizing v because scatter is not positive definite"
-              << " -- maybe first iter?";
-    return;
-  }
-  Sigma.Scale(1.0 / count);
-  KALDI_LOG << "Scatter of vectors v is : ";
-  Sigma.PrintEigs("Sigma");
-
-  // Want to make variance of v unit and H_sm (like precision matrix) diagonal.
-  TpMatrix<double> L(accs.phn_space_dim_);
-  L.Cholesky(Sigma);
-  TpMatrix<double> LInv(L);
-  LInv.Invert();
-
-  Matrix<double> tmpL(accs.phn_space_dim_, accs.phn_space_dim_);
-  tmpL.CopyFromTp(L);
-
-  SpMatrix<double> H_sm_proj(accs.phn_space_dim_);
-  H_sm_proj.AddMat2Sp(1.0, tmpL, kTrans, H_sm, 0.0);
-  // H_sm_proj := L^{T} * H_sm * L.
-  // This is right because we would transform the vectors themselves
-  // by L^{-1}, and H_sm is like the inverse of the vectors,
-  // so it's {L^{-1}}^{-T} = L^T.
-
-  Matrix<double> U(accs.phn_space_dim_, accs.phn_space_dim_);
-  Vector<double> eigs(accs.phn_space_dim_);
-  H_sm_proj.SymPosSemiDefEig(&eigs, &U, 1.0);  // 1.0 means no checking +ve def -> faster
-  KALDI_LOG << "Note on the next diagnostic: the first number is generally not "
-            << "that meaningful as it relates to the static offset";
-  H_sm_proj.PrintEigs("H_sm_proj (Significance of dims in vector space.. note)");
-
-  // Transform on vectors is U^T L^{-1}.
-  // Why?  Because transform on H_sm is T =U^T L^T
-  // and we want T^{-T} by normal rules of vector/covector and we
-  // have (U^T L^T)^{-T} = (L U)^{-1} = U^T L^{-1}.
-  Matrix<double> Trans(accs.phn_space_dim_, accs.phn_space_dim_);  // T^{-T}
-  Matrix<double> tmpLInv(accs.phn_space_dim_, accs.phn_space_dim_);
-  tmpLInv.CopyFromTp(LInv);
-  Trans.AddMatMat(1.0, U, kTrans, tmpLInv, kNoTrans, 0.0);
-  Matrix<double> TransInv(Trans);
-  TransInv.Invert();  // T in above...
-
-#ifdef KALDI_PARANOID
-  {
-    SpMatrix<double> H_sm_tmp(accs.phn_space_dim_);
-    H_sm_tmp.AddMat2Sp(1.0, TransInv, kTrans, H_sm, 0.0);
-    KALDI_ASSERT(H_sm_tmp.IsDiagonal(0.1));
-  }
-  {
-    SpMatrix<double> Sigma_tmp(accs.phn_space_dim_);
-    Sigma_tmp.AddMat2Sp(1.0, Trans, kNoTrans, Sigma, 0.0);
-    KALDI_ASSERT(Sigma_tmp.IsUnit(0.1));
-  }
-#endif
-
-  for (int32 j1 = 0; j1 < accs.num_groups_; j1++) {
-    for (int32 m = 0; m < model->NumSubstatesForGroup(j1); m++) {
-      Vector<double> tmp(accs.phn_space_dim_);
-      tmp.AddMatVec(1.0, Trans, kNoTrans, Vector<double>(model->v_[j1].Row(m)), 0.0);
-      model->v_[j1].Row(m).CopyFromVec(tmp);
-    }
-  }
-  for (int32 i = 0; i < accs.num_gaussians_; i++) {
-    Vector<double> tmp(accs.phn_space_dim_);
-    tmp.AddMatVec(1.0, TransInv, kTrans, Vector<double>(model->w_.Row(i)), 0.0);
-    model->w_.Row(i).CopyFromVec(tmp);
-
-    Matrix<double> tmpM(accs.feature_dim_, accs.phn_space_dim_);
-    // Multiplying on right not left so must not transpose TransInv.
-    tmpM.AddMatMat(1.0, Matrix<double>(model->M_[i]), kNoTrans,
-                   TransInv, kNoTrans, 0.0);
-    model->M_[i].CopyFromMat(tmpM);
-  }
-  KALDI_LOG << "Renormalized subspace.";
-}
-
-double MleAmSgmm2Updater::UpdateM(const MleAmSgmm2Accs &accs,
-                                 const std::vector< SpMatrix<double> > &Q,
-                                 const Vector<double> &gamma_i,
-                                 AmSgmm2 *model) {
-  double tot_count = 0.0, tot_like_impr = 0.0;
-  for (int32 i = 0; i < accs.num_gaussians_; i++) {
-    if (gamma_i(i) < accs.feature_dim_) {
-      KALDI_WARN << "For component " << i << ": not updating M due to very "
-                 << "small count (=" << gamma_i(i) << ").";
-      continue;
-    }
-
-    SolverOptions opts;
-    opts.name = "M";
-    opts.K = options_.max_cond;
-    opts.eps = options_.epsilon;
-
-    Matrix<double> Mi(model->M_[i]);
-    double impr =
-        SolveQuadraticMatrixProblem(Q[i], accs.Y_[i],
-                                    SpMatrix<double>(model->SigmaInv_[i]),
-                                    opts, &Mi);
-
-    model->M_[i].CopyFromMat(Mi);
-
-    if (i < 10) {
-      KALDI_VLOG(2) << "Objf impr for projection M for i = " << i << ", is "
-                    << (impr/(gamma_i(i) + 1.0e-20)) << " over " << gamma_i(i)
-                    << " frames";
-    }
-    tot_count += gamma_i(i);
-    tot_like_impr += impr;
-  }
-  tot_like_impr /= (tot_count + 1.0e-20);
-  KALDI_LOG << "Overall objective function improvement for model projections "
-            << "M is " << tot_like_impr << " over " << tot_count << " frames";
-  return tot_like_impr;
-}
-
-
-// Estimate the parameters of a Gaussian prior over the M matrices. There are
-// as many mean matrices as UBM size and two covariance matrices for the rows
-// of M and columns of M. The prior means M_i are fixed to the unadapted values.
-// This is what was done in Lu, et al. "Maximum a posteriori adaptation of
-// subspace Gaussian mixture models for cross-lingual speech recognition",
-// ICASSP 2012.
-void MleAmSgmm2Updater::ComputeMPrior(AmSgmm2 *model) {
-  KALDI_ASSERT(options_.map_M_prior_iters > 0);
-  int32 Ddim = model->FeatureDim();
-  int32 Sdim = model->PhoneSpaceDim();
-  int32 nGaussians = model->NumGauss();
-
-  // inverse variance of the columns of M: dim is # of rows
-  model->col_cov_inv_.Resize(Ddim);
-  // inverse covariance of the rows of M: dim is # of columns
-  model->row_cov_inv_.Resize(Sdim);
-
-  model->col_cov_inv_.SetUnit();
-  model->row_cov_inv_.SetUnit();
-
-  if (model->M_prior_.size() == 0) {
-    model->M_prior_.resize(nGaussians);
-    for (int32 i = 0; i < nGaussians; i++) {
-      model->M_prior_[i].Resize(Ddim, Sdim);
-      model->M_prior_[i].CopyFromMat(model->M_[i]); // We initialize Mpri as this
-    }
-  }
-
-  if (options_.full_col_cov || options_.full_row_cov) {
-    Matrix<double> avg_M(Ddim, Sdim);  // average of the Gaussian prior means
-    for (int32 i = 0; i < nGaussians; i++)
-      avg_M.AddMat(1.0, Matrix<double>(model->M_prior_[i]));
-    avg_M.Scale(1.0 / nGaussians);
-
-    Matrix<double> MDiff(Ddim, Sdim);
-    for (int32 iter = 0; iter < options_.map_M_prior_iters; iter++) {
-      { // diagnostic block.
-        double prior_like = -0.5 * nGaussians * (Ddim * Sdim * Log(2 * M_PI)
-                + Sdim * (-model->row_cov_inv_.LogPosDefDet())
-                + Ddim * (-model->col_cov_inv_.LogPosDefDet()));
-        for (int32 i = 0; i < nGaussians; i++) {
-          MDiff.CopyFromMat(Matrix<double>(model->M_prior_[i]));
-          MDiff.AddMat(-1.0, avg_M);  // MDiff = M_{i} - avg(M)
-          SpMatrix<double> tmp(Ddim);
-          // tmp = MDiff.Omega_r^{-1}*MDiff^T.
-          tmp.AddMat2Sp(1.0, MDiff, kNoTrans,
-                        SpMatrix<double>(model->row_cov_inv_), 0.0);
-          prior_like -= 0.5 * TraceSpSp(tmp, SpMatrix<double>(model->col_cov_inv_));
-        }
-        KALDI_LOG << "Before iteration " << iter
-            << " of updating prior over M, log like per dimension modeled is "
-            << prior_like / (nGaussians * Ddim * Sdim);
-      }
-
-      // First estimate the column covariances (\Omega_r in paper)
-      if (options_.full_col_cov) {
-        size_t limited;
-        model->col_cov_inv_.SetZero();
-        for (int32 i = 0; i < nGaussians; i++) {
-          MDiff.CopyFromMat(Matrix<double>(model->M_prior_[i]));
-          MDiff.AddMat(-1.0, avg_M);  // MDiff = M_{i} - avg(M)
-          // Omega_r += 1/(D*I) * Mdiff * Omega_c^{-1} * Mdiff^T
-          model->col_cov_inv_.AddMat2Sp(1.0 / (Ddim * nGaussians),
-                                        Matrix<BaseFloat>(MDiff), kNoTrans,
-                                        model->row_cov_inv_, 1.0);
-        }
-        model->col_cov_inv_.PrintEigs("col_cov");
-        limited = model->col_cov_inv_.LimitCond(options_.max_cond,
-                                                true /*invert the matrix*/);
-        if (limited != 0) {
-          KALDI_LOG << "Computing column covariances for M: limited " << limited
-                    << " singular values, max condition is "
-                    << options_.max_cond;
-        }
-      }
-
-      // Now estimate the row covariances (\Omega_c in paper)
-      if (options_.full_row_cov) {
-        size_t limited;
-        model->row_cov_inv_.SetZero();
-        for (int32 i = 0; i < nGaussians; i++) {
-          MDiff.CopyFromMat(Matrix<double>(model->M_prior_[i]));
-          MDiff.AddMat(-1.0, avg_M);  // MDiff = M_{i} - avg(M)
-          // Omega_c += 1/(S*I) * Mdiff^T * Omega_r^{-1} * Mdiff.
-          model->row_cov_inv_.AddMat2Sp(1.0 / (Sdim * nGaussians),
-                                        Matrix<BaseFloat>(MDiff), kTrans,
-                                        model->col_cov_inv_, 1.0);
-        }
-        model->row_cov_inv_.PrintEigs("row_cov");
-        limited = model->row_cov_inv_.LimitCond(options_.max_cond,
-                                                true /*invert the matrix*/);
-        if (limited != 0) {
-          KALDI_LOG << "Computing row covariances for M: limited " << limited
-                    << " singular values, max condition is "
-                    << options_.max_cond;
-        }
-      }
-    }  // end iterations
-  }
-}
-
-
-// MAP adaptation of M with a matrix-variate Gaussian prior
-double MleAmSgmm2Updater::MapUpdateM(const MleAmSgmm2Accs &accs,
-                                     const std::vector< SpMatrix<double> > &Q,
-                                     const Vector<double> &gamma_i,
-                                     AmSgmm2 *model) {
-  int32 Ddim = model->FeatureDim();
-  int32 Sdim = model->PhoneSpaceDim();
-  int32 nGaussians = model->NumGauss();
-
-  KALDI_LOG << "Prior smoothing parameter: Tau = " << options_.tau_map_M;
-  if (model->M_prior_.size() == 0 || model->col_cov_inv_.NumRows() == 0
-      || model->row_cov_inv_.NumRows() == 0) {
-    KALDI_LOG << "Computing the prior first";
-    ComputeMPrior(model);
-  }
-
-  Matrix<double> G(Ddim, Sdim);
-  // \tau \Omega_c^{-1} avg(M) \Omega_r^{-1}, depends on Gaussian index
-  Matrix<double> prior_term_i(Ddim, Sdim);
-  SpMatrix<double> P2(model->col_cov_inv_);
-  SpMatrix<double> Q2(model->row_cov_inv_);
-  Q2.Scale(options_.tau_map_M);
-
-  double totcount = 0.0, tot_like_impr = 0.0;
-  for (int32 i = 0; i < nGaussians; ++i) {
-    if (gamma_i(i) < accs.feature_dim_) {
-      KALDI_WARN << "For component " << i << ": not updating M due to very "
-                 << "small count (=" << gamma_i(i) << ").";
-      continue;
-    }
-
-    Matrix<double> tmp(Ddim, Sdim, kSetZero);
-    tmp.AddSpMat(1.0, SpMatrix<double>(model->col_cov_inv_),
-                 Matrix<double>(model->M_prior_[i]), kNoTrans, 0.0);
-    prior_term_i.AddMatSp(options_.tau_map_M, tmp, kNoTrans,
-                          SpMatrix<double>(model->row_cov_inv_), 0.0);
-
-    Matrix<double> SigmaY(Ddim, Sdim, kSetZero);
-    SigmaY.AddSpMat(1.0, SpMatrix<double>(model->SigmaInv_[i]), accs.Y_[i],
-                    kNoTrans, 0.0);
-    G.CopyFromMat(SigmaY);  // G = \Sigma_{i}^{-1} Y_{i}
-    G.AddMat(1.0, prior_term_i); // G += \tau \Omega_c^{-1} avg(M) \Omega_r^{-1}
-    SpMatrix<double> P1(model->SigmaInv_[i]);
-    Matrix<double> Mi(model->M_[i]);
-
-    SolverOptions opts;
-    opts.name = "M";
-    opts.K = options_.max_cond;
-    opts.eps = options_.epsilon;
-    double impr =
-        SolveDoubleQuadraticMatrixProblem(G, P1, P2, Q[i], Q2, opts, &Mi);
-    model->M_[i].CopyFromMat(Mi);
-    if (i < 10) {
-      KALDI_LOG << "Objf impr for projection M for i = " << i << ", is "
-                << (impr / (gamma_i(i) + 1.0e-20)) << " over " << gamma_i(i)
-                << " frames";
-    }
-    totcount += gamma_i(i);
-    tot_like_impr += impr;
-  }
-  tot_like_impr /= (totcount + 1.0e-20);
-  KALDI_LOG << "Overall objective function improvement for model projections "
-            << "M is " << tot_like_impr << " over " << totcount << " frames";
-  return tot_like_impr;
-}
-
-
-/// This function gets stats used inside UpdateW, where it accumulates
-/// the F_i and g_i quantities.  Note: F_i is viewed as a vector of SpMatrix
-/// (one for each i); each row of F_i is viewed as an SpMatrix even though
-/// it's stored as a vector....
-/// Note: on the first iteration w is just a double-precision copy of the matrix
-/// model->w_; thereafter it may differ.
-/// log_a relates to the SSGMM.
-
-// static
-void MleAmSgmm2Updater::UpdateWGetStats(const MleAmSgmm2Accs &accs,
-                                        const AmSgmm2 &model,
-                                        const Matrix<double> &w,
-                                        const std::vector<Matrix<double> > &log_a,
-                                        Matrix<double> *F_i,
-                                        Matrix<double> *g_i,
-                                        double *tot_like,
-                                        int32 num_threads,
-                                        int32 thread_id) {
-
-  // Accumulate stats from a block of states (this gets called in parallel).
-  int32 block_size = (accs.num_groups_ + (num_threads-1)) / num_threads,
-      j1_start = block_size * thread_id,
-      j1_end = std::min(accs.num_groups_, j1_start + block_size);
-
-  // Unlike in the report the inner most loop is over Gaussians, where
-  // per-gaussian statistics are accumulated. This is more memory demanding
-  // but more computationally efficient, as outer product v_{jvm} v_{jvm}^T
-  // is computed only once for all gaussians.
-
-  SpMatrix<double> v_vT(accs.phn_space_dim_);
-
-  for (int32 j1 = j1_start; j1 < j1_end; j1++) {
-    int32 num_substates = model.NumSubstatesForGroup(j1);
-    Matrix<double> w_j(num_substates, accs.num_gaussians_);
-    // The linear term and quadratic term for each Gaussian-- two scalars
-    // for each Gaussian, they appear in the accumulation formulas.
-    Matrix<double> linear_term(num_substates, accs.num_gaussians_);
-    Matrix<double> quadratic_term(num_substates, accs.num_gaussians_);
-    Matrix<double> v_vT_m(num_substates,
-                          (accs.phn_space_dim_*(accs.phn_space_dim_+1))/2);
-
-    // w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm})  eq.(7)
-    Matrix<double> v_j_double(model.v_[j1]);
-    w_j.AddMatMat(1.0, v_j_double, kNoTrans, w, kTrans, 0.0);
-    if (!log_a.empty()) w_j.AddMat(1.0, log_a[j1]); // SSGMM techreport eq. 42
-
-    for (int32 m = 0; m < model.NumSubstatesForGroup(j1); m++) {
-      SubVector<double> w_jm(w_j, m);
-      double gamma_jm = accs.gamma_[j1].Row(m).Sum();
-      w_jm.Add(-1.0 * w_jm.LogSumExp());
-      *tot_like += VecVec(w_jm, accs.gamma_[j1].Row(m));
-      w_jm.ApplyExp();
-      v_vT.SetZero();
-      // v_vT := v_{jkm} v_{jkm}^T
-      v_vT.AddVec2(static_cast<BaseFloat>(1.0), v_j_double.Row(m));
-      v_vT_m.Row(m).CopyFromPacked(v_vT); // a bit wasteful, but does not dominate.
-
-      for (int32 i = 0; i < accs.num_gaussians_; i++) {
-        // Suggestion: g_jkm can be computed more efficiently
-        // using the Vector/Matrix routines for all i at once
-        // linear term around cur value.
-        linear_term(m, i) = accs.gamma_[j1](m, i) - gamma_jm * w_jm(i);
-        quadratic_term(m, i) = std::max(accs.gamma_[j1](m, i),
-                                        gamma_jm * w_jm(i));
-      }
-    } // loop over substates
-    g_i->AddMatMat(1.0, linear_term, kTrans, v_j_double, kNoTrans, 1.0);
-    F_i->AddMatMat(1.0, quadratic_term, kTrans, v_vT_m, kNoTrans, 1.0);
-  } // loop over states
-}
-
-// The parallel weight update, in the paper.
-double MleAmSgmm2Updater::UpdateW(const MleAmSgmm2Accs &accs,
-                                  const std::vector<Matrix<double> > &log_a,
-                                  const Vector<double> &gamma_i,
-                                  AmSgmm2 *model) {
-  KALDI_LOG << "Updating weight projections";
-
-  // tot_like_{after, before} are totals over multiple iterations,
-  // not valid likelihoods. but difference is valid (when divided by tot_count).
-  double tot_predicted_like_impr = 0.0, tot_like_before = 0.0,
-      tot_like_after = 0.0;
-
-  Matrix<double> g_i(accs.num_gaussians_, accs.phn_space_dim_);
-  // View F_i as a vector of SpMatrix.
-  Matrix<double> F_i(accs.num_gaussians_,
-                     (accs.phn_space_dim_*(accs.phn_space_dim_+1))/2);
-
-  Matrix<double> w(model->w_);
-  double tot_count = gamma_i.Sum();
-
-  for (int iter = 0; iter < options_.weight_projections_iters; iter++) {
-    F_i.SetZero();
-    g_i.SetZero();
-    double k_like_before = 0.0;
-
-    UpdateWClass c(accs, *model, w, log_a, &F_i, &g_i, &k_like_before);
-    RunMultiThreaded(c);
-
-    Matrix<double> w_orig(w);
-    double k_predicted_like_impr = 0.0, k_like_after = 0.0;
-    double min_step = 0.001, step_size;
-
-    SolverOptions opts;
-    opts.name = "w";
-    opts.K = options_.max_cond;
-    opts.eps = options_.epsilon;
-
-    for (step_size = 1.0; step_size >= min_step; step_size /= 2) {
-      k_predicted_like_impr = 0.0;
-      k_like_after = 0.0;
-
-      for (int32 i = 0; i < accs.num_gaussians_; i++) {
-        // auxf is formulated in terms of change in w.
-        Vector<double> delta_w(accs.phn_space_dim_);
-        // returns objf impr with step_size = 1,
-        // but it may not be 1 so we recalculate it.
-        SpMatrix<double> this_F_i(accs.phn_space_dim_);
-        this_F_i.CopyFromVec(F_i.Row(i));
-        SolveQuadraticProblem(this_F_i, g_i.Row(i), opts, &delta_w);
-
-        delta_w.Scale(step_size);
-        double predicted_impr = VecVec(delta_w, g_i.Row(i)) -
-            0.5 * VecSpVec(delta_w,  this_F_i, delta_w);
-
-        // should never be negative because
-        // we checked inside SolveQuadraticProblem.
-        KALDI_ASSERT(predicted_impr >= -1.0e-05);
-
-        if (i < 10)
-          KALDI_LOG << "Predicted objf impr for w, iter = " << iter
-                    << ", i = " << i << " is "
-                    << (predicted_impr/gamma_i(i)+1.0e-20)
-                    << " per frame over " << gamma_i(i) << " frames.";
-        k_predicted_like_impr += predicted_impr;
-        w.Row(i).AddVec(1.0, delta_w);
-      }
-      for (int32 j1 = 0; j1 < accs.num_groups_; j1++) {
-        int32 M = model->NumSubstatesForGroup(j1);
-        Matrix<double> w_j(M, accs.num_gaussians_);
-        w_j.AddMatMat(1.0, Matrix<double>(model->v_[j1]), kNoTrans,
-                       w, kTrans, 0.0);
-        if (!log_a.empty()) w_j.AddMat(1.0, log_a[j1]); // SSGMM techreport eq. 42
-        for (int32 m = 0; m < M; m++) {
-          SubVector<double> w_jm(w_j, m);
-          w_jm.Add(-1.0 * w_jm.LogSumExp());
-        }
-        k_like_after += TraceMatMat(w_j, accs.gamma_[j1], kTrans);
-      }
-      KALDI_VLOG(2) << "For iteration " << iter << ", updating w gives "
-                    << "predicted per-frame like impr "
-                    << (k_predicted_like_impr / tot_count) << ", actual "
-                    << ((k_like_after - k_like_before) / tot_count) << ", over "
-                    << tot_count << " frames";
-      if (k_like_after < k_like_before) {
-        w.CopyFromMat(w_orig);  // Undo what we computed.
-        if (fabs(k_like_after - k_like_before) / tot_count < 1.0e-05) {
-          k_like_after = k_like_before;
-          KALDI_WARN << "Not updating weights as not increasing auxf and "
-                     << "probably due to numerical issues (since small change).";
-          break;
-        } else {
-          KALDI_WARN << "Halving step size for weights as likelihood did "
-                     << "not increase";
-        }
-      } else {
-        break;
-      }
-    }
-    if (step_size < min_step) {
-      // Undo any step as we have no confidence that this is right.
-      w.CopyFromMat(w_orig);
-    } else {
-      tot_predicted_like_impr += k_predicted_like_impr;
-      tot_like_after += k_like_after;
-      tot_like_before += k_like_before;
-    }
-  }
-
-  model->w_.CopyFromMat(w);
-  model->w_jmi_.clear(); // invalidated.
-
-  tot_predicted_like_impr /= tot_count;
-  tot_like_after = (tot_like_after - tot_like_before) / tot_count;
-  KALDI_LOG << "**Overall objf impr for w is " << tot_predicted_like_impr
-            << ", actual " << tot_like_after << ", over "
-            << tot_count << " frames";
-  return tot_like_after;
-}
-
-double MleAmSgmm2Updater::UpdateU(const MleAmSgmm2Accs &accs,
-                                 const Vector<double> &gamma_i,
-                                 AmSgmm2 *model) {
-  double tot_impr = 0.0;
-  SolverOptions opts;
-  opts.name = "u";
-  opts.K = options_.max_cond;
-  opts.eps = options_.epsilon;
-
-  for (int32 i = 0; i < accs.num_gaussians_; i++) {
-    if (gamma_i(i) < 200.0) {
-      KALDI_LOG << "Count is small " << gamma_i(i) << " for gaussian "
-                << i << ", not updating u_i.";
-      continue;
-    }
-    Vector<double> u_i(model->u_.Row(i));
-    Vector<double> delta_u(accs.spk_space_dim_);
-    double impr =
-        SolveQuadraticProblem(accs.U_[i], accs.t_.Row(i), opts, &delta_u);
-    double impr_per_frame = impr / gamma_i(i);
-    if (impr_per_frame > options_.max_impr_u) {
-      KALDI_WARN << "Updating speaker weight projections u, for Gaussian index "
-                 << i << ", impr/frame is " << impr_per_frame << " over "
-                 << gamma_i(i) << " frames, scaling back to not exceed "
-                 << options_.max_impr_u;
-      double scale = options_.max_impr_u / impr_per_frame;
-      impr *= scale;
-      delta_u.Scale(scale);
-      // Note: a linear scaling of "impr" with "scale" is not quite accurate
-      // in depicting how the quadratic auxiliary function varies as we change
-      // the scale on "delta", but this does not really matter-- the goal is
-      // to limit the auxiliary-function change to not be too large.
-    }
-    if (i < 10) {
-      KALDI_LOG << "Objf impr for spk weight-projection u for i = " << (i)
-                << ", is " << (impr / (gamma_i(i) + 1.0e-20)) << " over "
-                << gamma_i(i) << " frames";
-    }
-    u_i.AddVec(1.0, delta_u);
-    model->u_.Row(i).CopyFromVec(u_i);
-    tot_impr += impr;
-  }
-  KALDI_LOG << "**Overall objf impr for u is " << (tot_impr/gamma_i.Sum())
-            << ", over " << gamma_i.Sum() << " frames";
-  return tot_impr / gamma_i.Sum();
-}
-
-double MleAmSgmm2Updater::UpdateN(const MleAmSgmm2Accs &accs,
-                                 const Vector<double>  &gamma_i,
-                                 AmSgmm2 *model) {
-  double tot_count = 0.0, tot_like_impr = 0.0;
-  if (accs.spk_space_dim_ == 0 || accs.R_.size() == 0 || accs.Z_.size() == 0) {
-    KALDI_ERR << "Speaker subspace dim is zero or no stats accumulated";
-  }
-  SolverOptions opts;
-  opts.name = "N";
-  opts.K = options_.max_cond;
-  opts.eps = options_.epsilon;
-
-
-  for (int32 i = 0; i < accs.num_gaussians_; i++) {
-    if (gamma_i(i) < 2 * accs.spk_space_dim_) {
-      KALDI_WARN << "Not updating speaker basis for i = " << (i)
-                 << " because count is too small " << (gamma_i(i));
-      continue;
-    }
-    Matrix<double> Ni(model->N_[i]);
-    double impr =
-        SolveQuadraticMatrixProblem(accs.R_[i], accs.Z_[i],
-                                    SpMatrix<double>(model->SigmaInv_[i]),
-                                    opts, &Ni);
-    model->N_[i].CopyFromMat(Ni);
-    if (i < 10) {
-      KALDI_LOG << "Objf impr for spk projection N for i = " << (i)
-                << ", is " << (impr / (gamma_i(i) + 1.0e-20)) << " over "
-                << gamma_i(i) << " frames";
-    }
-    tot_count += gamma_i(i);
-    tot_like_impr += impr;
-  }
-
-  KALDI_LOG << "**Overall objf impr for N is " << (tot_like_impr/tot_count)
-            << " over " << tot_count << " frames";
-  return (tot_like_impr/tot_count);
-}
-
-void MleAmSgmm2Updater::RenormalizeN(const MleAmSgmm2Accs &accs,
-                                    const Vector<double> &gamma_i,
-                                    AmSgmm2 *model) {
-  KALDI_ASSERT(accs.R_.size() != 0);
-  double tot_count = gamma_i.Sum();
-  if (tot_count == 0) {
-    KALDI_WARN << "Not renormalizing N, since there are no counts.";
-    return;
-  }
-
-  SpMatrix<double> RTot(accs.spk_space_dim_);
-  //  for (int32 i = 0; i < accs.num_gaussians_; i++) {
-  //    RTot.AddSp(1.0, accs.R_[i]);
-  //  }
-  for (int32 i = 0; i < accs.num_gaussians_; i++) {
-    RTot.AddSp(gamma_i(i), accs.R_[i]);
-  }
-  RTot.Scale(1.0 / tot_count);
-  Matrix<double> U(accs.spk_space_dim_, accs.spk_space_dim_);
-  Vector<double> eigs(accs.spk_space_dim_);
-  RTot.SymPosSemiDefEig(&eigs, &U);
-  KALDI_LOG << "Renormalizing N, eigs are: " << (eigs);
-  Vector<double> sqrteigs(accs.spk_space_dim_);
-  for (int32 t = 0; t < accs.spk_space_dim_; t++) {
-    sqrteigs(t) = sqrt(eigs(t));
-  }
-  // e.g.   diag(eigs)^{-0.5} * U' * RTot * U * diag(eigs)^{-0.5}  = 1
-  // But inverse transpose of this transformation needs to take place on R,
-  // i.e. not (on left: diag(eigs)^{-0.5} * U')
-  // but: (inverse it: U . diag(eigs)^{0.5},
-  // transpose it: diag(eigs)^{0.5} U^T. Need to do this on the right to N
-  // (because N has the spk vecs on the right), so N := N U diag(eigs)^{0.5}
-  U.MulColsVec(sqrteigs);
-  Matrix<double> Ntmp(accs.feature_dim_, accs.spk_space_dim_);
-  for (int32 i = 0; i < accs.num_gaussians_; i++) {
-    Ntmp.AddMatMat(1.0, Matrix<double>(model->N_[i]), kNoTrans, U, kNoTrans, 0.0);
-    model->N_[i].CopyFromMat(Ntmp);
-  }
-}
-
-
-double MleAmSgmm2Updater::UpdateVars(const MleAmSgmm2Accs &accs,
-                                    const std::vector< SpMatrix<double> > &S_means,
-                                    const Vector<double> &gamma_i,
-                                    AmSgmm2 *model) {
-  KALDI_ASSERT(S_means.size() == static_cast<size_t>(accs.num_gaussians_));
-
-  SpMatrix<double> Sigma_i(accs.feature_dim_), Sigma_i_ml(accs.feature_dim_);
-  double tot_objf_impr = 0.0, tot_t = 0.0;
-  SpMatrix<double> covfloor(accs.feature_dim_);
-  Vector<double> objf_improv(accs.num_gaussians_);
-
-  // First pass over all (shared) Gaussian components to calculate the
-  // ML estimate of the covariances, and the total covariance for flooring.
-  for (int32 i = 0; i < accs.num_gaussians_; i++) {
-    // Eq. (75): Sigma_{i}^{ml} = 1/gamma_{i} [S_{i} + S_{i}^{(means)} - ...
-    //                                          Y_{i} M_{i}^T - M_{i} Y_{i}^T]
-    // Note the S_means already contains the Y_{i} M_{i}^T terms.
-    Sigma_i_ml.CopyFromSp(S_means[i]);
-    Sigma_i_ml.AddSp(1.0, accs.S_[i]);
-
-    covfloor.AddSp(1.0, Sigma_i_ml);
-    // inverting  small values e.g. 4.41745328e-40 seems to generate inf,
-    // although would be fixed up later.
-    if (gamma_i(i) > 1.0e-20) {
-      Sigma_i_ml.Scale(1 / (gamma_i(i) + 1.0e-20));
-    } else {
-      Sigma_i_ml.SetUnit();
-    }
-    KALDI_ASSERT(1.0 / Sigma_i_ml(0, 0) != 0.0);
-    // Eq. (76): Compute the objective function with the old parameter values
-    objf_improv(i) = model->SigmaInv_[i].LogPosDefDet() -
-        TraceSpSp(SpMatrix<double>(model->SigmaInv_[i]), Sigma_i_ml);
-
-    model->SigmaInv_[i].CopyFromSp(Sigma_i_ml);  // inverted in the next loop.
-  }
-
-  // Compute the covariance floor.
-  if (gamma_i.Sum() == 0) {  // If no count, use identity.
-    KALDI_WARN << "Updating variances: zero counts. Setting floor to unit.";
-    covfloor.SetUnit();
-  } else {  // else, use the global average covariance.
-    covfloor.Scale(options_.cov_floor / gamma_i.Sum());
-    int32 tmp;
-    if ((tmp = covfloor.LimitCondDouble(options_.max_cond)) != 0) {
-      KALDI_WARN << "Covariance flooring matrix is poorly conditioned. Fixed "
-                 << "up " << tmp << " eigenvalues.";
-    }
-  }
-
-  if (options_.cov_diag_ratio > 1000) {
-    KALDI_LOG << "Assuming you want to build a diagonal system since "
-              << "cov_diag_ratio is large: making diagonal covFloor.";
-    for (int32 i = 0; i < covfloor.NumRows(); i++)
-      for (int32 j = 0; j < i; j++)
-        covfloor(i, j) = 0.0;
-  }
-
-  // Second pass over all (shared) Gaussian components to calculate the
-  // floored estimate of the covariances, and update the model.
-  for (int32 i = 0; i < accs.num_gaussians_; i++) {
-    Sigma_i.CopyFromSp(model->SigmaInv_[i]);
-    Sigma_i_ml.CopyFromSp(Sigma_i);
-    // In case of insufficient counts, make the covariance matrix diagonal.
-    // cov_diag_ratio is 2 by default, set to very large to always get diag-cov
-    if (gamma_i(i) < options_.cov_diag_ratio * accs.feature_dim_) {
-      KALDI_WARN << "For Gaussian component " << i << ": Too low count "
-                 << gamma_i(i) << " for covariance matrix estimation. Setting to "
-                 << "diagonal";
-      for (int32 d = 0; d < accs.feature_dim_; d++)
-        for (int32 e = 0; e < d; e++)
-          Sigma_i(d, e) = 0.0;  // SpMatrix, can only set lower triangular part
-
-      int floored = Sigma_i.ApplyFloor(covfloor);
-      if (floored > 0) {
-        KALDI_WARN << "For Gaussian component " << i << ": Floored " << floored
-                   << " covariance eigenvalues.";
-      }
-      model->SigmaInv_[i].CopyFromSp(Sigma_i);
-      model->SigmaInv_[i].InvertDouble();
-    } else {  // Updating the full covariance matrix.
-      try {
-        int floored = Sigma_i.ApplyFloor(covfloor);
-        if (floored > 0) {
-          KALDI_WARN << "For Gaussian component " << i << ": Floored "
-                     << floored << " covariance eigenvalues.";
-        }
-        model->SigmaInv_[i].CopyFromSp(Sigma_i);
-        model->SigmaInv_[i].InvertDouble();
-
-        objf_improv(i) += Sigma_i.LogPosDefDet() +
-            TraceSpSp(SpMatrix<double>(model->SigmaInv_[i]), Sigma_i_ml);
-        objf_improv(i) *= (-0.5 * gamma_i(i));  // Eq. (76)
-
-        tot_objf_impr += objf_improv(i);
-        tot_t += gamma_i(i);
-        if (i < 5) {
-          KALDI_VLOG(2) << "objf impr from variance update =" << objf_improv(i)
-              / (gamma_i(i) + 1.0e-20) << " over " << (gamma_i(i))
-                        << " frames for i = " << (i);
-        }
-      } catch(...) {
-        KALDI_WARN << "Updating within-class covariance matrix i = " << (i)
-                   << ", numerical problem";
-        // This is a catch-all thing in case of unanticipated errors, but
-        // flooring should prevent this occurring for the most part.
-        model->SigmaInv_[i].SetUnit();  // Set to unit.
-      }
-    }
-  }
-  KALDI_LOG << "**Overall objf impr for variance update = "
-            << (tot_objf_impr / (tot_t+ 1.0e-20))
-            << " over " << tot_t << " frames";
-  return tot_objf_impr / (tot_t + 1.0e-20);
-}
-
-
-double MleAmSgmm2Updater::UpdateSubstateWeights(
-    const MleAmSgmm2Accs &accs, AmSgmm2 *model) {
-  KALDI_LOG << "Updating substate mixture weights";
-  // Also set the vector gamma_j which is a cache of the state occupancies.
-
-  double tot_gamma = 0.0, objf_impr = 0.0;
-  for (int32 j2 = 0; j2 < accs.num_pdfs_; j2++) {
-    double gamma_j_sm = 0.0;
-    int32 num_substates = model->NumSubstatesForPdf(j2);
-    const Vector<double> &occs(accs.gamma_c_[j2]);
-    Vector<double> smoothed_occs(occs);
-    smoothed_occs.Add(options_.tau_c);
-    gamma_j_sm += smoothed_occs.Sum();
-    tot_gamma += occs.Sum();
-
-    for (int32 m = 0; m < num_substates; m++) {
-      double cur_weight = model->c_[j2](m);
-      if (cur_weight <= 0) {
-        KALDI_WARN << "Zero or negative weight, flooring";
-        cur_weight = 1.0e-10;  // future work(arnab): remove magic numbers
-      }
-      model->c_[j2](m) = smoothed_occs(m) / gamma_j_sm;
-      objf_impr += Log(model->c_[j2](m) / cur_weight) * occs(m);
-    }
-  }
-  KALDI_LOG << "**Overall objf impr for c is " << (objf_impr/tot_gamma)
-            << ", over " << tot_gamma << " frames.";
-  return (objf_impr/tot_gamma);
-}
-
-
-MleSgmm2SpeakerAccs::MleSgmm2SpeakerAccs(const AmSgmm2 &model,
-                                         BaseFloat prune)
-    : rand_prune_(prune) {
-  KALDI_ASSERT(model.SpkSpaceDim() != 0);
-  H_spk_.resize(model.NumGauss());
-  for (int32 i = 0; i < model.NumGauss(); i++) {
-    // Eq. (82): H_{i}^{spk} = N_{i}^T \Sigma_{i}^{-1} N_{i}
-    H_spk_[i].Resize(model.SpkSpaceDim());
-    H_spk_[i].AddMat2Sp(1.0, Matrix<double>(model.N_[i]),
-                        kTrans, SpMatrix<double>(model.SigmaInv_[i]), 0.0);
-  }
-
-  model.GetNtransSigmaInv(&NtransSigmaInv_);
-
-  gamma_s_.Resize(model.NumGauss());
-  y_s_.Resize(model.SpkSpaceDim());
-  if (model.HasSpeakerDependentWeights())
-    a_s_.Resize(model.NumGauss());
-}
-
-void MleSgmm2SpeakerAccs::Clear() {
-  y_s_.SetZero();
-  gamma_s_.SetZero();
-  if (a_s_.Dim() != 0) a_s_.SetZero();
-}
-
-BaseFloat
-MleSgmm2SpeakerAccs::Accumulate(const AmSgmm2 &model,
-                               const Sgmm2PerFrameDerivedVars &frame_vars,
-                               int32 j2,
-                               BaseFloat weight,
-                               Sgmm2PerSpkDerivedVars *spk_vars) {
-  // Calculate Gaussian posteriors and collect statistics
-  Matrix<BaseFloat> posteriors;
-  BaseFloat log_like = model.ComponentPosteriors(frame_vars, j2, spk_vars,
-                                                 &posteriors);
-  posteriors.Scale(weight);
-  AccumulateFromPosteriors(model, frame_vars, posteriors, j2, spk_vars);
-  return log_like;
-}
-
-BaseFloat
-MleSgmm2SpeakerAccs::AccumulateFromPosteriors(const AmSgmm2 &model,
-                                             const Sgmm2PerFrameDerivedVars &frame_vars,
-                                             const Matrix<BaseFloat> &posteriors,
-                                             int32 j2,
-                                             Sgmm2PerSpkDerivedVars *spk_vars) {
-  double tot_count = 0.0;
-  int32 feature_dim = model.FeatureDim(),
-      spk_space_dim = model.SpkSpaceDim();
-  KALDI_ASSERT(spk_space_dim != 0);
-  const vector<int32> &gselect = frame_vars.gselect;
-
-  // Intermediate variables
-  Vector<double> xt_jmi(feature_dim), mu_jmi(feature_dim),
-      zt_jmi(spk_space_dim);
-  int32 num_substates = model.NumSubstatesForPdf(j2),
-      j1 = model.Pdf2Group(j2);
-  bool have_spk_dep_weights = (a_s_.Dim() != 0);
-
-  for (int32 m = 0; m < num_substates; m++) {
-    BaseFloat gammat_jm = 0.0;
-    for (int32 ki = 0; ki < static_cast<int32>(gselect.size()); ki++) {
-      int32 i = gselect[ki];
-      // Eq. (39): gamma_{jmi}(t) = p (j, m, i|t)
-      BaseFloat gammat_jmi = RandPrune(posteriors(ki, m), rand_prune_);
-      if (gammat_jmi != 0.0) {
-        gammat_jm += gammat_jmi;
-        tot_count += gammat_jmi;
-        model.GetSubstateMean(j1, m, i, &mu_jmi);
-        xt_jmi.CopyFromVec(frame_vars.xt);
-        xt_jmi.AddVec(-1.0, mu_jmi);
-        // Eq. (48): z{jmi}(t) = N_{i}^{T} \Sigma_{i}^{-1} x_{jmi}(t)
-        zt_jmi.AddMatVec(1.0, NtransSigmaInv_[i], kNoTrans, xt_jmi, 0.0);
-        // Eq. (49): \gamma_{i}^{(s)} = \sum_{t\in\Tau(s), j, m} gamma_{jmi}
-        gamma_s_(i) += gammat_jmi;
-        // Eq. (50): y^{(s)} = \sum_{t, j, m, i} gamma_{jmi}(t) z_{jmi}(t)
-        y_s_.AddVec(gammat_jmi, zt_jmi);
-      }
-    }
-    if (have_spk_dep_weights) {
-      KALDI_ASSERT(!model.w_jmi_.empty());
-      BaseFloat d_jms = model.GetDjms(j1, m, spk_vars);
-      if (d_jms == -1.0) d_jms = 1.0; // Explanation: d_jms is set to -1 when we didn't have
-      // speaker vectors in training.  We treat this the same as the speaker vector being
-      // zero, and d_jms becomes 1 in this case.
-      a_s_.AddVec(gammat_jm/d_jms, model.w_jmi_[j1].Row(m));
-    }
-  }
-  return tot_count;
-}
-
-void MleSgmm2SpeakerAccs::Update(const AmSgmm2 &model,
-                                BaseFloat min_count,
-                                Vector<BaseFloat> *v_s,
-                                BaseFloat *objf_impr_out,
-                                BaseFloat *count_out) {
-  double tot_gamma = gamma_s_.Sum();
-  if (tot_gamma < min_count) {
-    KALDI_WARN << "Updating speaker vectors, count is " << tot_gamma
-               << " < " << min_count << "not updating.";
-    if (objf_impr_out) *objf_impr_out = 0.0;
-    if (count_out) *count_out = 0.0;
-    return;
-  }
-  if (a_s_.Dim() == 0) // No speaker-dependent weights...
-    UpdateNoU(v_s, objf_impr_out, count_out);
-  else
-    UpdateWithU(model, v_s, objf_impr_out, count_out);
-}
-
-
-// Basic update, no SSGMM.
-void MleSgmm2SpeakerAccs::UpdateNoU(Vector<BaseFloat> *v_s,
-                                BaseFloat *objf_impr_out,
-                                BaseFloat *count_out) {
-  double tot_gamma = gamma_s_.Sum();
-  KALDI_ASSERT(y_s_.Dim() != 0);
-  int32 T = y_s_.Dim();  // speaker-subspace dim.
-  int32 num_gauss = gamma_s_.Dim();
-  if (v_s->Dim() != T) v_s->Resize(T);  // will set it to zero.
-
-  // Eq. (84): H^{(s)} = \sum_{i} \gamma_{i}(s) H_{i}^{spk}
-  SpMatrix<double> H_s(T);
-
-  for (int32 i = 0; i < num_gauss; i++)
-    H_s.AddSp(gamma_s_(i), H_spk_[i]);
-
-  // Don't make these options to SolveQuadraticProblem configurable...
-  // they really don't make a difference at all unless the matrix in
-  // question is singular, which wouldn't happen in this case.
-  Vector<double> v_s_dbl(*v_s);
-  double tot_objf_impr =
-      SolveQuadraticProblem(H_s, y_s_, SolverOptions("v_s"), &v_s_dbl);
-
-  v_s->CopyFromVec(v_s_dbl);
-
-  KALDI_LOG << "*Objf impr for speaker vector is " << (tot_objf_impr / tot_gamma)
-            << " over " << tot_gamma << " frames.";
-
-  if (objf_impr_out) *objf_impr_out = tot_objf_impr;
-  if (count_out) *count_out = tot_gamma;
-}
-
-// Basic update, no SSGMM.
-void MleSgmm2SpeakerAccs::UpdateWithU(const AmSgmm2 &model,
-                                     Vector<BaseFloat> *v_s_ptr,
-                                     BaseFloat *objf_impr_out,
-                                     BaseFloat *count_out) {
-  double tot_gamma = gamma_s_.Sum();
-  KALDI_ASSERT(y_s_.Dim() != 0);
-  int32 T = y_s_.Dim();  // speaker-subspace dim.
-  int32 num_gauss = gamma_s_.Dim();
-  if (v_s_ptr->Dim() != T) v_s_ptr->Resize(T);  // will set it to zero.
-
-  // Eq. (84): H^{(s)} = \sum_{i} \gamma_{i}(s) H_{i}^{spk}
-  SpMatrix<double> H_s(T);
-
-  for (int32 i = 0; i < num_gauss; i++)
-    H_s.AddSp(gamma_s_(i), H_spk_[i]);
-
-  Vector<double> v_s(*v_s_ptr);
-  int32 num_iters = 5, // don't set this to 1, as we discard last iter.
-      num_backtracks = 0,
-      max_backtracks = 10;
-  Vector<double> auxf(num_iters);
-  Matrix<double> v_s_per_iter(num_iters, T);
-  // The update for v^{(s)} is the one described in the technical report
-  // section 5.1 (eq. 33 and below).
-
-  for (int32 iter = 0; iter < num_iters; iter++) { // converges very fast,
-    // and each iteration is fast, so don't need to make this configurable.
-    v_s_per_iter.Row(iter).CopyFromVec(v_s);
-
-    SpMatrix<double> F(H_s); // the 2nd-order quadratic term on this iteration...
-    // F^{(p)} in the techerport.
-    Vector<double> g(y_s_); // g^{(p)} in the techreport.
-    g.AddSpVec(-1.0, H_s, v_s, 1.0);
-    Vector<double> log_b_is(num_gauss); // b_i^{(s)}, indexed by i.
-    log_b_is.AddMatVec(1.0, Matrix<double>(model.u_), kNoTrans, v_s, 0.0);
-    Vector<double> tilde_w_is(log_b_is);
-    Vector<double> log_a_s_(a_s_);
-    log_a_s_.ApplyLog();
-    tilde_w_is.AddVec(1.0, log_a_s_);
-    tilde_w_is.Add(-1.0 * tilde_w_is.LogSumExp()); // normalize.
-    // currently tilde_w_is is in log form.
-    auxf(iter) = VecVec(v_s, y_s_) - 0.5 * VecSpVec(v_s, H_s, v_s)
-        + VecVec(gamma_s_, tilde_w_is); // "new" term (weights)
-
-    if (iter > 0 && auxf(iter) < auxf(iter-1) &&
-        !ApproxEqual(auxf(iter), auxf(iter-1))) { // auxf did not improve.
-      // backtrack halfway, and do this iteration again.
-      KALDI_WARN << "Backtracking in speaker vector update, on iter "
-                 << iter << ", auxfs are " << auxf(iter-1) << " -> "
-                 << auxf(iter);
-      v_s.Scale(0.5);
-      v_s.AddVec(0.5, v_s_per_iter.Row(iter-1));
-      if (++num_backtracks >= max_backtracks) {
-        KALDI_WARN << "Backtracked " << max_backtracks
-                   << " times in speaker-vector update.";
-        // backtrack all the way, and terminate:
-        v_s_per_iter.Row(num_iters-1).CopyFromVec(v_s_per_iter.Row(iter-1));
-        // the following statement ensures we will get
-        // the appropriate auxiliary-function.
-        auxf(num_iters-1) = auxf(iter-1);
-        break;
-      }
-      iter--;
-    }
-    tilde_w_is.ApplyExp();
-    for (int32 i = 0; i < num_gauss; i++) {
-      g.AddVec(gamma_s_(i) - tot_gamma * tilde_w_is(i), model.u_.Row(i));
-      F.AddVec2(tot_gamma * tilde_w_is(i), model.u_.Row(i));
-    }
-    Vector<double> delta(v_s.Dim());
-    SolveQuadraticProblem(F, g, SolverOptions("v_s"), &delta);
-    v_s.AddVec(1.0, delta);
-  }
-  // so that we only accept things where the auxf has been checked, we
-  // actually take the penultimate speaker-vector. --> don't set
-  // num-iters = 1.
-  v_s_ptr->CopyFromVec(v_s_per_iter.Row(num_iters-1));
-
-  double auxf_change = auxf(num_iters-1) - auxf(0);
-  KALDI_LOG << "*Objf impr for speaker vector is " << (auxf_change / tot_gamma)
-            << " per frame, over " << tot_gamma << " frames.";
-
-  if (objf_impr_out) *objf_impr_out = auxf_change;
-  if (count_out) *count_out = tot_gamma;
-}
-
-
-MleAmSgmm2Accs::~MleAmSgmm2Accs() {
-  if (gamma_s_.Sum() != 0.0)
-    KALDI_ERR << "In destructor of MleAmSgmm2Accs: detected that you forgot to "
-        "call CommitStatsForSpk()";
-}
-
-
-}  // namespace kaldi
diff --git a/src/sgmm2/estimate-am-sgmm2.h b/src/sgmm2/estimate-am-sgmm2.h
deleted file mode 100644
index 3768008b3b7..00000000000
--- a/src/sgmm2/estimate-am-sgmm2.h
+++ /dev/null
@@ -1,478 +0,0 @@
-// sgmm2/estimate-am-sgmm2.h
-
-// Copyright 2009-2011  Microsoft Corporation;  Lukas Burget;
-//                      Saarland University (Author: Arnab Ghoshal);
-//                      Ondrej Glembek;  Yanmin Qian;
-// Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey)
-//                      Liang Lu;  Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_SGMM2_ESTIMATE_AM_SGMM2_H_
-#define KALDI_SGMM2_ESTIMATE_AM_SGMM2_H_ 1
-
-#include <string>
-#include <vector>
-
-#include "sgmm2/am-sgmm2.h"
-#include "gmm/model-common.h"
-#include "itf/options-itf.h"
-#include "util/kaldi-thread.h"
-
-namespace kaldi {
-
-/** \struct MleAmSgmm2Options
- *  Configuration variables needed in the SGMM estimation process.
- */
-struct MleAmSgmm2Options {
-  /// Smoothing constant for sub-state weights [count to add to each one].
-  BaseFloat tau_c;
-  /// Floor covariance matrices Sigma_i to this times average cov.
-  BaseFloat cov_floor;
-  /// ratio to dim below which we use diagonal. default 2, set to inf for diag.
-  BaseFloat cov_diag_ratio;
-  /// Max on condition of matrices in update beyond which we do not update.
-  /// Should probably be related to numerical properties of machine
-  /// or BaseFloat type.
-  BaseFloat max_cond;
-
-  bool renormalize_V;  // Renormalize the phonetic space.
-  bool renormalize_N;  // Renormalize the speaker space.
-
-  /// Number of iters when re-estimating weight projections "w".
-  int weight_projections_iters;
-
-  BaseFloat epsilon;  ///< very small value used to prevent SVD crashing.
-  BaseFloat max_impr_u; ///< max improvement per frame allowed in update of u.
-
-  BaseFloat tau_map_M;  ///< For MAP update of the phonetic subspace M
-  int map_M_prior_iters;  ///< num of iterations to update the prior of M
-  bool full_row_cov;  ///< Estimate row covariance instead of using I
-  bool full_col_cov;  ///< Estimate col covariance instead of using I
-
-  MleAmSgmm2Options() {
-    cov_floor = 0.025;
-    tau_c  = 2.0;
-    cov_diag_ratio = 2.0;  // set this to very large to get diagonal-cov models.
-    max_cond = 1.0e+05;
-    epsilon = 1.0e-40;
-    renormalize_V = true;
-    renormalize_N = false;  // default to false since will invalidate spk vectors
-    // on disk.
-    weight_projections_iters = 3;
-    max_impr_u = 0.25;
-
-    map_M_prior_iters = 5;
-    tau_map_M = 0.0;  // No MAP update by default (~500-1000 depending on prior)
-    full_row_cov = false;
-    full_col_cov = false;
-  }
-
-  void Register(OptionsItf *opts) {
-    std::string module = "MleAmSgmm2Options: ";
-    opts->Register("tau-c", &tau_c, module+
-                   "Count for smoothing weight update.");
-    opts->Register("cov-floor", &cov_floor, module+
-                   "Covariance floor (fraction of average covariance).");
-    opts->Register("cov-diag-ratio", &cov_diag_ratio, module+
-                   "Minimum occ/dim ratio below which use diagonal covariances.");
-    opts->Register("max-cond", &max_cond, module+"Maximum condition number used to "
-                   "regularize the solution of certain quadratic auxiliary functions.");
-    opts->Register("weight-projections-iters", &weight_projections_iters, module+
-                   "Number for iterations for weight projection estimation.");
-    opts->Register("renormalize-v", &renormalize_V, module+"If true, renormalize "
-                   "the phonetic-subspace vectors to have meaningful sizes.");
-    opts->Register("renormalize-n", &renormalize_N, module+"If true, renormalize "
-                   "the speaker subspace to have meaningful sizes.");
-    opts->Register("max-impr-u", &max_impr_u, module+"Maximum objective function "
-                   "improvement per frame allowed in update of u (to "
-                   "maintain stability.");
-
-    opts->Register("tau-map-M", &tau_map_M, module+"Smoothing for MAP estimate "
-                   "of M (0 means ML update).");
-    opts->Register("map-M-prior-iters", &map_M_prior_iters, module+
-                   "Number of iterations to estimate prior covariances for M.");
-    opts->Register("full-row-cov", &full_row_cov, module+
-                   "Estimate row covariance instead of using I.");
-    opts->Register("full-col-cov", &full_col_cov, module+
-                   "Estimate column covariance instead of using I.");
-  }
-};
-
-/** \class MleAmSgmm2Accs
- *  Class for the accumulators associated with the phonetic-subspace model
- *  parameters
- */
-class MleAmSgmm2Accs {
- public:
-  explicit MleAmSgmm2Accs(BaseFloat rand_prune = 1.0e-05)
-      : total_frames_(0.0), total_like_(0.0), feature_dim_(0),
-        phn_space_dim_(0), spk_space_dim_(0), num_gaussians_(0),
-        num_pdfs_(0), num_groups_(0), rand_prune_(rand_prune) {}
-
-  MleAmSgmm2Accs(const AmSgmm2 &model, SgmmUpdateFlagsType flags,
-                 bool have_spk_vecs,
-                 BaseFloat rand_prune = 1.0e-05)
-      : total_frames_(0.0), total_like_(0.0), rand_prune_(rand_prune) {
-    ResizeAccumulators(model, flags, have_spk_vecs);
-  }
-
-  ~MleAmSgmm2Accs();
-
-  void Read(std::istream &in_stream, bool binary, bool add);
-  void Write(std::ostream &out_stream, bool binary) const;
-
-  /// Checks the various accumulators for correct sizes given a model. With
-  /// wrong sizes, assertion failure occurs. When the show_properties argument
-  /// is set to true, dimensions and presence/absence of the various
-  /// accumulators are printed. For use when accumulators are read from file.
-  void Check(const AmSgmm2 &model, bool show_properties = true) const;
-
-  /// Resizes the accumulators to the correct sizes given the model. The flags
-  /// argument controls which accumulators to resize.
-  void ResizeAccumulators(const AmSgmm2 &model, SgmmUpdateFlagsType flags,
-                          bool have_spk_vecs);
-
-  /// Returns likelihood.
-  BaseFloat Accumulate(const AmSgmm2 &model,
-                       const Sgmm2PerFrameDerivedVars &frame_vars,
-                       int32 pdf_index, // == j2.
-                       BaseFloat weight,
-                       Sgmm2PerSpkDerivedVars *spk_vars);
-
-  /// Returns count accumulated (may differ from posteriors.Sum()
-  /// due to weight pruning).
-  BaseFloat AccumulateFromPosteriors(const AmSgmm2 &model,
-                                     const Sgmm2PerFrameDerivedVars &frame_vars,
-                                     const Matrix<BaseFloat> &posteriors,
-                                     int32 pdf_index, // == j2.
-                                     Sgmm2PerSpkDerivedVars *spk_vars);
-
-  /// Accumulates global stats for the current speaker (if applicable).  If
-  /// flags contains kSgmmSpeakerProjections (N), or
-  /// kSgmmSpeakerWeightProjections (u), must call this after finishing the
-  /// speaker's data.
-  void CommitStatsForSpk(const AmSgmm2 &model,
-                         const Sgmm2PerSpkDerivedVars &spk_vars);
-
-  /// Accessors
-  void GetStateOccupancies(Vector<BaseFloat> *occs) const;
-  int32 FeatureDim() const { return feature_dim_; }
-  int32 PhoneSpaceDim() const { return phn_space_dim_; }
-  int32 NumPdfs() const { return num_pdfs_; } // returns J2
-  int32 NumGroups() const { return num_groups_; } // returns J1
-  int32 NumGauss() const { return num_gaussians_; }
-
- private:
-  /// The stats which are not tied to any state.
-  /// Stats Y_{i} for phonetic-subspace projections M; Dim is [I][D][S].
-  std::vector< Matrix<double> > Y_;
-  /// Stats Z_{i} for speaker-subspace projections N. Dim is [I][D][T].
-  std::vector< Matrix<double> > Z_;
-  /// R_{i}, quadratic term for speaker subspace estimation. Dim is [I][T][T]
-  std::vector< SpMatrix<double> > R_;
-  /// S_{i}^{-}, scatter of adapted feature vectors x_{i}(t). Dim is [I][D][D].
-  std::vector< SpMatrix<double> > S_;
-
-  /// The SGMM state specific stats.
-  /// Statistics y_{jm} for state vectors v_{jm}. dimension is [J1][#mix][S].
-  std::vector< Matrix<double> > y_;
-  /// Gaussian occupancies gamma_{jmi} for each substate and Gaussian index,
-  /// pooled over groups. Dim is [J1][#mix][I].
-  std::vector< Matrix<double> > gamma_;
-
-  /// [SSGMM] These a_{jmi} quantities are dimensionally the same
-  /// as the gamma quantities.  They're needed to estimate the v_{jm}
-  /// and w_i quantities in the symmetric SGMM.  Dimension is [J1][#mix][S]
-  std::vector< Matrix<double> > a_;
-
-  /// [SSGMM] each row is one of the t_i quantities in the less-exact
-  /// version of the SSGMM update for the speaker weight projections.
-  /// Dimension is [I][T]
-  Matrix<double> t_;
-
-  /// [SSGMM], this is a per-speaker variable storing the a_i^{(s)}
-  /// quantities that we will use in order to compute the non-speaker-
-  /// specific quantities [see eqs. 53 and 54 in techreport].  Note:
-  /// there is a separate variable a_s_ in class MleSgmm2SpeakerAccs,
-  /// which is the same thing but for purposes of computing
-  /// the speaker-vector v^{(s)}.
-  Vector<double> a_s_;
-
-  /// the U_i quantities from the less-exact version of the SSGMM update for the
-  /// speaker weight projections.  Dimension is [I][T][T]
-  std::vector<SpMatrix<double> > U_;
-
-  /// Sub-state occupancies gamma_{jm}^{(c)} for each sub-state.  In the
-  /// SCTM version of the SGMM, for compactness we store two separate
-  /// sets of gamma statistics, one to estimate the v_{jm} quantities
-  /// and one to estimate the sub-state weights c_{jm}.
-  std::vector< Vector<double> > gamma_c_;
-
-  /// gamma_{i}^{(s)}.  Per-speaker counts for each Gaussian. Dimension is [I]
-  /// Needed for stats R_.  This can be viewed as a temporary variable; it
-  /// does not form part of the stats that we eventually dump to disk.
-  Vector<double> gamma_s_;
-
-  double total_frames_, total_like_;
-
-  /// Dimensionality of various subspaces
-  int32 feature_dim_, phn_space_dim_, spk_space_dim_;
-  int32 num_gaussians_, num_pdfs_, num_groups_;  ///< Other model specifications
-
-  BaseFloat rand_prune_;
-
-  KALDI_DISALLOW_COPY_AND_ASSIGN(MleAmSgmm2Accs);
-  friend class MleAmSgmm2Updater;
-  friend class EbwAmSgmm2Updater;
-};
-
-/** \class MleAmSgmmUpdater
- *  Contains the functions needed to update the SGMM parameters.
- */
-class MleAmSgmm2Updater {
- public:
-  explicit MleAmSgmm2Updater(const MleAmSgmm2Options &options)
-      : options_(options) {}
-  void Reconfigure(const MleAmSgmm2Options &options) {
-    options_ = options;
-  }
-
-  void Update(const MleAmSgmm2Accs &accs,
-              AmSgmm2 *model,
-              SgmmUpdateFlagsType flags);
-
- private:
-  friend class UpdateWClass;
-  friend class UpdatePhoneVectorsClass;
-  friend class EbwEstimateAmSgmm2;
-
-  ///  Compute the Q_i quantities (Eq. 64).
-  static void ComputeQ(const MleAmSgmm2Accs &accs,
-                       const AmSgmm2 &model,
-                       std::vector< SpMatrix<double> > *Q);
-
-  /// Compute the S_means quantities, minus sum: (Y_i M_i^T + M_i Y_I^T).
-  static void ComputeSMeans(const MleAmSgmm2Accs &accs,
-                            const AmSgmm2 &model,
-                            std::vector< SpMatrix<double> > *S_means);
-  friend class EbwAmSgmm2Updater;
-
-  MleAmSgmm2Options options_;
-
-  // Called from UpdatePhoneVectors; updates a subset of states
-  // (relates to multi-threading).
-  void UpdatePhoneVectorsInternal(const MleAmSgmm2Accs &accs,
-                                  const std::vector<SpMatrix<double> > &H,
-                                  const std::vector<Matrix<double> > &log_a,
-                                  AmSgmm2 *model,
-                                  double *auxf_impr,
-                                  int32 num_threads,
-                                  int32 thread_id) const;
-
-  double UpdatePhoneVectors(const MleAmSgmm2Accs &accs,
-                            const std::vector<SpMatrix<double> > &H,
-                            const std::vector<Matrix<double> > &log_a,
-                            AmSgmm2 *model) const;
-
-  double UpdateM(const MleAmSgmm2Accs &accs,
-                 const std::vector< SpMatrix<double> > &Q,
-                 const Vector<double> &gamma_i,
-                 AmSgmm2 *model);
-
-  void RenormalizeV(const MleAmSgmm2Accs &accs, AmSgmm2 *model,
-                    const Vector<double> &gamma_i,
-                    const std::vector<SpMatrix<double> > &H);
-
-  double UpdateN(const MleAmSgmm2Accs &accs, const Vector<double> &gamma_i,
-                 AmSgmm2 *model);
-  void RenormalizeN(const MleAmSgmm2Accs &accs, const Vector<double> &gamma_i,
-                    AmSgmm2 *model);
-  double UpdateVars(const MleAmSgmm2Accs &accs,
-                    const std::vector< SpMatrix<double> > &S_means,
-                    const Vector<double> &gamma_i,
-                    AmSgmm2 *model);
-  // Update for the phonetic-subspace weight projections w_i
-  double UpdateW(const MleAmSgmm2Accs &accs,
-                 const std::vector<Matrix<double> > &log_a,
-                 const Vector<double> &gamma_i,
-                 AmSgmm2 *model);
-  // Update for the speaker-subspace weight projections u_i [SSGMM]
-  double UpdateU(const MleAmSgmm2Accs &accs, const Vector<double> &gamma_i,
-                 AmSgmm2 *model);
-
-  /// Called, multithreaded, inside UpdateW
-  static
-  void UpdateWGetStats(const MleAmSgmm2Accs &accs,
-                       const AmSgmm2 &model,
-                       const Matrix<double> &w,
-                       const std::vector<Matrix<double> > &log_a,
-                       Matrix<double> *F_i,
-                       Matrix<double> *g_i,
-                       double *tot_like,
-                       int32 num_threads,
-                       int32 thread_id);
-
-  double UpdateSubstateWeights(const MleAmSgmm2Accs &accs,
-                               AmSgmm2 *model);
-
-  static void ComputeLogA(const MleAmSgmm2Accs &accs,
-                          std::vector<Matrix<double> > *log_a); // [SSGMM]
-
-  void ComputeMPrior(AmSgmm2 *model);  // TODO(arnab): Maybe make this static?
-  double MapUpdateM(const MleAmSgmm2Accs &accs,
-                    const std::vector< SpMatrix<double> > &Q,
-                    const Vector<double> &gamma_i, AmSgmm2 *model);
-
-  KALDI_DISALLOW_COPY_AND_ASSIGN(MleAmSgmm2Updater);
-  MleAmSgmm2Updater() {}  // Prevent unconfigured updater.
-};
-
-
-/** \class MleSgmm2SpeakerAccs
- *  Class for the accumulators required to update the speaker
- *  vectors v_s.
- *  Note: if you have multiple speakers you will want to initialize
- *  this just once and call Clear() after you're done with each speaker,
- *  rather than creating a new object for each speaker, since the
- *  initialization function does nontrivial work.
- */
-
-class MleSgmm2SpeakerAccs {
- public:
-  /// Initialize the object.  Error if speaker subspace not set up.
-  MleSgmm2SpeakerAccs(const AmSgmm2 &model,
-                      BaseFloat rand_prune_ = 1.0e-05);
-
-  /// Clear the statistics.
-  void Clear();
-
-  /// Accumulate statistics.  Returns per-frame log-likelihood.
-  BaseFloat Accumulate(const AmSgmm2 &model,
-                       const Sgmm2PerFrameDerivedVars &frame_vars,
-                       int32 pdf_index,
-                       BaseFloat weight,
-                       Sgmm2PerSpkDerivedVars *spk_vars);
-
-  /// Accumulate statistics, given posteriors.  Returns total
-  /// count accumulated, which may differ from posteriors.Sum()
-  /// due to randomized pruning.
-  BaseFloat AccumulateFromPosteriors(const AmSgmm2 &model,
-                                     const Sgmm2PerFrameDerivedVars &frame_vars,
-                                     const Matrix<BaseFloat> &posteriors,
-                                     int32 pdf_index,
-                                     Sgmm2PerSpkDerivedVars *spk_vars);
-
-  /// Update speaker vector.  If v_s was empty, will assume it started as zero
-  /// and will resize it to the speaker-subspace size.
-  void Update(const AmSgmm2 &model,
-              BaseFloat min_count,  // e.g. 100
-              Vector<BaseFloat> *v_s,
-              BaseFloat *objf_impr_out,
-              BaseFloat *count_out);
-
- private:
-  // Update without speaker-dependent weights (vectors u_i),
-  // i.e. not symmetric SGMM (SSGMM)
-  void UpdateNoU(Vector<BaseFloat> *v_s,
-                 BaseFloat *objf_impr_out,
-                 BaseFloat *count_out);
-  // Update for SSGMM
-  void UpdateWithU(const AmSgmm2 &model,
-                   Vector<BaseFloat> *v_s,
-                   BaseFloat *objf_impr_out,
-                   BaseFloat *count_out);
-
-
-  /// Statistics for speaker adaptation (vectors), stored per-speaker.
-  /// Per-speaker stats for vectors, y^{(s)}. Dimension [T].
-  Vector<double> y_s_;
-  /// gamma_{i}^{(s)}.  Per-speaker counts for each Gaussian. Dimension is [I]
-  Vector<double> gamma_s_;
-  /// a_i^{(s)}.  For SSGMM.
-  Vector<double> a_s_;
-
-  /// The following variable does not change per speaker, it just
-  /// relates to the speaker subspace.
-  /// Eq. (82): H_{i}^{spk} = N_{i}^T \Sigma_{i}^{-1} N_{i}
-  std::vector< SpMatrix<double> > H_spk_;
-
-  /// N_i^T \Sigma_{i}^{-1}. Needed for y^{(s)}
-  std::vector< Matrix<double> > NtransSigmaInv_;
-
-  /// small constant to randomly prune tiny posteriors
-  BaseFloat rand_prune_;
-};
-
-// This class, used in multi-core implementation of the updates of the "w_i"
-// quantities, was previously in estimate-am-sgmm.cc, but is being moved to the
-// header so it can be used in estimate-am-sgmm-ebw.cc.  It is responsible for
-// computing, in parallel, the F_i and g_i quantities used in the updates of
-// w_i.
-class UpdateWClass: public MultiThreadable {
- public:
-  UpdateWClass(const MleAmSgmm2Accs &accs,
-               const AmSgmm2 &model,
-               const Matrix<double> &w,
-               const std::vector<Matrix<double> > &log_a,
-               Matrix<double> *F_i,
-               Matrix<double> *g_i,
-               double *tot_like):
-      accs_(accs), model_(model), w_(w), log_a_(log_a),
-      F_i_ptr_(F_i), g_i_ptr_(g_i), tot_like_ptr_(tot_like) {
-    tot_like_ = 0.0;
-    F_i_.Resize(F_i->NumRows(), F_i->NumCols());
-    g_i_.Resize(g_i->NumRows(), g_i->NumCols());
-  }
-
-  UpdateWClass(const UpdateWClass &other) :
-      MultiThreadable(other),
-      accs_(other.accs_), model_(other.model_), w_(other.w_),
-      log_a_(other.log_a_), F_i_ptr_(other.F_i_ptr_), g_i_ptr_(other.g_i_ptr_),
-      F_i_(other.F_i_), g_i_(other.g_i_), tot_like_ptr_(other.tot_like_ptr_),
-      tot_like_(0.0) { }
-
-  ~UpdateWClass() {
-    F_i_ptr_->AddMat(1.0, F_i_, kNoTrans);
-    g_i_ptr_->AddMat(1.0, g_i_, kNoTrans);
-    *tot_like_ptr_ += tot_like_;
-  }
-
-  inline void operator() () {
-    // Note: give them local copy of the sums we're computing,
-    // which will be propagated to the total sums in the destructor.
-    MleAmSgmm2Updater::UpdateWGetStats(accs_, model_, w_, log_a_,
-                                      &F_i_, &g_i_, &tot_like_,
-                                      num_threads_, thread_id_);
-  }
- private:
-  const MleAmSgmm2Accs &accs_;
-  const AmSgmm2 &model_;
-  const Matrix<double> &w_;
-  const std::vector<Matrix<double> > &log_a_;
-  Matrix<double> *F_i_ptr_;
-  Matrix<double> *g_i_ptr_;
-  Matrix<double> F_i_;
-  Matrix<double> g_i_;
-  double *tot_like_ptr_;
-  double tot_like_;
-};
-
-
-}  // namespace kaldi
-
-
-#endif  // KALDI_SGMM2_ESTIMATE_AM_SGMM2_H_
diff --git a/src/sgmm2/fmllr-sgmm2-test.cc b/src/sgmm2/fmllr-sgmm2-test.cc
deleted file mode 100644
index ede25d76c68..00000000000
--- a/src/sgmm2/fmllr-sgmm2-test.cc
+++ /dev/null
@@ -1,243 +0,0 @@
-// sgmm2/fmllr-sgmm2-test.cc
-
-// Copyright 2009-2011  Saarland University (author:  Arnab Ghoshal)
-//           2012  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <vector>
-
-#include "base/kaldi-math.h"
-#include "gmm/model-test-common.h"
-#include "sgmm2/am-sgmm2.h"
-#include "sgmm2/fmllr-sgmm2.h"
-#include "util/kaldi-io.h"
-
-using kaldi::AmSgmm2;
-using kaldi::int32;
-using kaldi::BaseFloat;
-using kaldi::Vector;
-using kaldi::Matrix;
-using kaldi::Exp;
-
-namespace ut = kaldi::unittest;
-
-void ApplyFmllrXform(const kaldi::VectorBase<BaseFloat> &in,
-                     const Matrix<BaseFloat> &xf,
-                     Vector<BaseFloat> *out) {
-  int32 dim = in.Dim();
-  KALDI_ASSERT(xf.NumRows() == dim && xf.NumCols() == dim + 1);
-  Vector<BaseFloat> tmp(dim + 1);
-  tmp.Range(0, dim).CopyFromVec(in);
-  tmp(dim) = 1.0;
-  out->Resize(dim, kaldi::kSetZero);
-  out->AddMatVec(1.0, xf, kaldi::kNoTrans, tmp, 0.0);
-}
-
-// Tests the Read() and Write() methods for the accumulators, in both binary
-// and ASCII mode, as well as Check().
-void TestSgmm2FmllrAccsIO(const AmSgmm2 &sgmm,
-                         const kaldi::Matrix<BaseFloat> &feats) {
-  KALDI_LOG << "Test IO start.";
-  using namespace kaldi;
-  int32 dim = sgmm.FeatureDim();
-  kaldi::Sgmm2PerFrameDerivedVars frame_vars;
-  kaldi::Sgmm2PerSpkDerivedVars empty;
-  kaldi::Sgmm2FmllrGlobalParams fmllr_globals;
-  kaldi::Sgmm2GselectConfig sgmm_config;
-
-  frame_vars.Resize(sgmm.NumGauss(), dim, sgmm.PhoneSpaceDim());
-  sgmm_config.full_gmm_nbest = std::min(sgmm_config.full_gmm_nbest,
-                                        sgmm.NumGauss());
-  kaldi::Vector<BaseFloat> occs(sgmm.NumPdfs());
-  occs.Set(feats.NumRows());
-  sgmm.ComputeFmllrPreXform(occs, &fmllr_globals.pre_xform_,
-                            &fmllr_globals.inv_xform_,
-                            &fmllr_globals.mean_scatter_);
-  if (fmllr_globals.mean_scatter_.Min() == 0.0) {
-    KALDI_WARN << "Global covariances low rank!";
-    KALDI_WARN << "Diag-scatter = " << fmllr_globals.mean_scatter_;
-    return;
-  }
-
-//  std::cout << "Pre-Xform = " << fmllr_globals.pre_xform_;
-//  std::cout << "Inv-Xform = " << fmllr_globals.inv_xform_;
-
-  FmllrSgmm2Accs accs;
-  accs.Init(sgmm.FeatureDim(), sgmm.NumGauss());
-  BaseFloat loglike = 0.0;
-  std::vector<int32> gselect;
-  for (int32 i = 0; i < feats.NumRows(); i++) {
-    sgmm.GaussianSelection(sgmm_config, feats.Row(i), &gselect);
-    sgmm.ComputePerFrameVars(feats.Row(i), gselect, empty, &frame_vars);
-    loglike += accs.Accumulate(sgmm, feats.Row(i), frame_vars, 0, 1.0,
-                               &empty);
-  }
-
-  kaldi::Sgmm2FmllrConfig update_opts;
-  update_opts.fmllr_min_count = 999; // Make sure it doesn't
-  // divide 200, because the test can fail when we cross the boundary
-  // of 1000 due to roundoff.  Actually it's weird because 1000 should
-  // be exactly representable in float and in text.  But something's going wrong.
-  kaldi::Matrix<BaseFloat> xform_mat(dim, dim+1);
-  xform_mat.SetUnit();
-  BaseFloat frames, impr;
-  accs.Update(sgmm, fmllr_globals, update_opts, &xform_mat, &frames, &impr);
-
-  Vector<BaseFloat> xformed_feat(dim);
-  ApplyFmllrXform(feats.Row(0), xform_mat, &xformed_feat);
-  sgmm.GaussianSelection(sgmm_config, xformed_feat, &gselect);
-  sgmm.ComputePerFrameVars(xformed_feat, gselect, empty, &frame_vars);
-
-  Sgmm2LikelihoodCache like_cache(sgmm.NumGroups(), sgmm.NumPdfs());
-  BaseFloat loglike1 = sgmm.LogLikelihood(frame_vars, 0,
-                                          &like_cache, &empty);
-
-  bool binary_in;
-  // First, non-binary write
-  KALDI_LOG << "Test ASCII IO.";
-  accs.Write(kaldi::Output("tmpf", false).Stream(), false);
-  FmllrSgmm2Accs *accs1 = new FmllrSgmm2Accs();
-  // Non-binary read
-  kaldi::Input ki1("tmpf", &binary_in);
-  accs1->Read(ki1.Stream(), binary_in, false);
-  xform_mat.SetUnit();
-  accs1->Update(sgmm, fmllr_globals, update_opts, &xform_mat, NULL, NULL);
-  ApplyFmllrXform(feats.Row(0), xform_mat, &xformed_feat);
-  sgmm.GaussianSelection(sgmm_config, xformed_feat, &gselect);
-  sgmm.ComputePerFrameVars(xformed_feat, gselect, empty, &frame_vars);
-  like_cache.NextFrame();
-  BaseFloat loglike2 = sgmm.LogLikelihood(frame_vars, 0,
-                                          &like_cache, &empty);
-  std::cout << "LL1 = " << loglike1 << ", LL2 = " << loglike2 << std::endl;
-  
-  kaldi::AssertEqual(loglike1, loglike2, 1e-2);
-  delete accs1;
-
-  // Next, binary write
-  KALDI_LOG << "Test Binary IO.";
-  accs.Write(kaldi::Output("tmpfb", true).Stream(), true);
-  FmllrSgmm2Accs *accs2 = new FmllrSgmm2Accs();
-  // Binary read
-  kaldi::Input ki2("tmpfb", &binary_in);
-  accs2->Read(ki2.Stream(), binary_in, false);
-  xform_mat.SetUnit();
-  accs2->Update(sgmm, fmllr_globals, update_opts, &xform_mat, NULL, NULL);
-  ApplyFmllrXform(feats.Row(0), xform_mat, &xformed_feat);
-  sgmm.GaussianSelection(sgmm_config, xformed_feat, &gselect);
-  sgmm.ComputePerFrameVars(xformed_feat, gselect, empty,  &frame_vars);
-  BaseFloat loglike3 = sgmm.LogLikelihood(frame_vars, 0,
-                                          &like_cache, &empty);
-  std::cout << "LL1 = " << loglike1 << ", LL3 = " << loglike3 << std::endl;
-  kaldi::AssertEqual(loglike1, loglike3, 1e-4);
-  delete accs2;
-  
-  unlink("tmpf");
-  unlink("tmpfb");
-  KALDI_LOG << "Test IO end.";
-}
-
-void TestSgmm2FmllrSubspace(const AmSgmm2 &sgmm,
-                         const kaldi::Matrix<BaseFloat> &feats) {
-  KALDI_LOG << "Test Subspace start.";
-  using namespace kaldi;
-  int32 dim = sgmm.FeatureDim();
-  kaldi::Sgmm2PerFrameDerivedVars frame_vars;
-  kaldi::Sgmm2PerSpkDerivedVars empty;
-  kaldi::Sgmm2FmllrGlobalParams fmllr_globals;
-  kaldi::Sgmm2GselectConfig sgmm_config;
-
-  frame_vars.Resize(sgmm.NumGauss(), dim, sgmm.PhoneSpaceDim());
-  sgmm_config.full_gmm_nbest = std::min(sgmm_config.full_gmm_nbest,
-                                        sgmm.NumGauss());
-  kaldi::Vector<BaseFloat> occs(sgmm.NumPdfs());
-  occs.Set(feats.NumRows());
-  sgmm.ComputeFmllrPreXform(occs, &fmllr_globals.pre_xform_,
-                            &fmllr_globals.inv_xform_,
-                            &fmllr_globals.mean_scatter_);
-  if (fmllr_globals.mean_scatter_.Min() == 0.0) {
-    KALDI_WARN << "Global covariances low rank!";
-    KALDI_WARN << "Diag-scatter = " << fmllr_globals.mean_scatter_;
-    return;
-  }
-
-  FmllrSgmm2Accs accs;
-  accs.Init(sgmm.FeatureDim(), sgmm.NumGauss());
-  BaseFloat loglike = 0.0;
-  std::vector<int32> gselect;
-  for (int32 i = 0; i < feats.NumRows(); i++) {
-    sgmm.GaussianSelection(sgmm_config, feats.Row(i), &gselect);
-    sgmm.ComputePerFrameVars(feats.Row(i), gselect, empty, &frame_vars);
-    loglike += accs.Accumulate(sgmm, feats.Row(i), frame_vars, 0, 1.0,
-                               &empty);
-  }
-
-  SpMatrix<double> grad_scatter(dim * (dim+1));
-  accs.AccumulateForFmllrSubspace(sgmm, fmllr_globals, &grad_scatter);
-  kaldi::Sgmm2FmllrConfig update_opts;
-  EstimateSgmm2FmllrSubspace(grad_scatter, update_opts.num_fmllr_bases, dim,
-                            &fmllr_globals);
-//  update_opts.fmllr_min_count = 100;
-  kaldi::Matrix<BaseFloat> xform_mat(dim, dim+1);
-  xform_mat.SetUnit();
-  accs.Update(sgmm, fmllr_globals, update_opts, &xform_mat, NULL, NULL);
-  KALDI_LOG << "Test Subspace end.";
-}
-
-void TestSgmm2Fmllr() {
-  // srand(time(NULL));
-  int32 dim = 1 + kaldi::RandInt(0, 9);  // random dimension of the gmm
-  int32 num_comp = 2 + kaldi::RandInt(0, 9);  // random number of mixtures
-  kaldi::FullGmm full_gmm;
-  ut::InitRandFullGmm(dim, num_comp, &full_gmm);
-
-  AmSgmm2 sgmm;
-  kaldi::Sgmm2GselectConfig config;
-  std::vector<int32> pdf2group;
-  pdf2group.push_back(0);
-  sgmm.InitializeFromFullGmm(full_gmm, pdf2group, dim+1, dim, true, 0.9);
-  sgmm.ComputeNormalizers();
-
-  kaldi::Matrix<BaseFloat> feats;
-
-  {  // First, generate random means and variances
-    int32 num_feat_comp = num_comp + kaldi::RandInt(-num_comp/2, num_comp/2);
-    kaldi::Matrix<BaseFloat> means(num_feat_comp, dim),
-        vars(num_feat_comp, dim);
-    for (int32 m = 0; m < num_feat_comp; m++) {
-      for (int32 d= 0; d < dim; d++) {
-        means(m, d) = kaldi::RandGauss();
-        vars(m, d) = Exp(kaldi::RandGauss()) + 1e-2;
-      }
-    }
-    // Now generate random features with those means and variances.
-    feats.Resize(num_feat_comp * 200, dim);
-    for (int32 m = 0; m < num_feat_comp; m++) {
-      kaldi::SubMatrix<BaseFloat> tmp(feats, m*200, 200, 0, dim);
-      ut::RandDiagGaussFeatures(200, means.Row(m), vars.Row(m), &tmp);
-    }
-  }
-  TestSgmm2FmllrAccsIO(sgmm, feats);
-  TestSgmm2FmllrSubspace(sgmm, feats);
-}
-
-int main() {
-  kaldi::g_kaldi_verbose_level = 5;
-  for (int i = 0; i < 10; i++)
-    TestSgmm2Fmllr();
-  std::cout << "Test OK.\n";
-  return 0;
-}
diff --git a/src/sgmm2/fmllr-sgmm2.cc b/src/sgmm2/fmllr-sgmm2.cc
deleted file mode 100644
index 35658caec69..00000000000
--- a/src/sgmm2/fmllr-sgmm2.cc
+++ /dev/null
@@ -1,555 +0,0 @@
-// sgmm2/fmllr-sgmm2.cc
-
-// Copyright 2009-2012   Saarland University (author: Arnab Ghoshal)
-//                       Johns Hopkins University (author: Daniel Povey)    
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <string>
-#include <vector>
-using std::vector;
-
-#include "sgmm2/fmllr-sgmm2.h"
-#include "util/parse-options.h"
-
-namespace kaldi {
-
-static void ApplyPreXformToGradient(const Sgmm2FmllrGlobalParams &globals,
-                                    const Matrix<BaseFloat> &gradient_in,
-                                    Matrix<BaseFloat> *gradient_out) {
-  // Eq. (B.14): P' = A_{inv}^T P {W_{pre}^+}^T
-  int32 dim = gradient_in.NumRows();
-  Matrix<BaseFloat> Wpre_plus(dim + 1, dim + 1, kSetZero);
-  Wpre_plus.Range(0, dim, 0, dim + 1).CopyFromMat(globals.pre_xform_);
-  Wpre_plus(dim, dim) = 1;
-  SubMatrix<BaseFloat> Ainv(globals.inv_xform_, 0, dim, 0, dim);
-  Matrix<BaseFloat> AinvP(dim, dim + 1, kUndefined);
-  AinvP.AddMatMat(1.0, Ainv, kTrans, gradient_in, kNoTrans, 0.0);
-  gradient_out->AddMatMat(1.0, AinvP, kNoTrans, Wpre_plus, kTrans, 0.0);
-}
-
-static void ApplyInvPreXformToChange(const Sgmm2FmllrGlobalParams &globals,
-                                     const Matrix<BaseFloat> &delta_in,
-                                     Matrix<BaseFloat> *delta_out) {
-  // Eq. (B.25): \Delta = A_{inv} \Delta' W_{pre}^+
-  int32 dim = delta_in.NumRows();
-  Matrix<BaseFloat> Wpre_plus(dim + 1, dim + 1, kSetZero);
-  Wpre_plus.Range(0, dim, 0, dim + 1).CopyFromMat(globals.pre_xform_);
-  Wpre_plus(dim, dim) = 1;
-  SubMatrix<BaseFloat> Ainv(globals.inv_xform_, 0, dim, 0, dim);
-  Matrix<BaseFloat> AinvD(dim, dim + 1, kUndefined);
-  AinvD.AddMatMat(1.0, Ainv, kNoTrans, delta_in, kNoTrans, 0.0);
-  delta_out->AddMatMat(1.0, AinvD, kNoTrans, Wpre_plus, kNoTrans, 0.0);
-}
-
-static void ApplyHessianXformToGradient(const Sgmm2FmllrGlobalParams &globals,
-                                        const Matrix<BaseFloat> &gradient_in,
-                                        Matrix<BaseFloat> *gradient_out) {
-  int32 dim = gradient_in.NumRows();
-  const Vector<BaseFloat> &D = globals.mean_scatter_;
-  if (D.Min() <= 0.0)
-    KALDI_ERR << "Cannot estimate FMLLR: mean scatter has 0 eigenvalues.";
-  for (int32 r = 0; r < dim; r++) {
-    for (int32 c = 0; c < r; c++) {
-      // Eq. (B.15)
-      (*gradient_out)(r, c) = gradient_in(r, c) / std::sqrt(1 + D(c));
-      // Eq. (B.16)
-      (*gradient_out)(c, r) = gradient_in(c, r) / std::sqrt(1 + D(r) -
-          1 / (1 + D(c))) - gradient_in(r, c) / ((1 + D(c)) *
-              std::sqrt(1 + D(r) - 1 / (1 + D(c))));
-    }
-    // Eq. (B.17) & (B.18)
-    (*gradient_out)(r, r) = gradient_in(r, r) / std::sqrt(2 + D(r));
-    (*gradient_out)(r, dim) = gradient_in(r, dim);
-  }
-}
-
-static void ApplyInvHessianXformToChange(const Sgmm2FmllrGlobalParams &globals,
-                                         const Matrix<BaseFloat> &delta_in,
-                                         Matrix<BaseFloat> *delta_out) {
-  int32 dim = delta_in.NumRows();
-  const Vector<BaseFloat> &D = globals.mean_scatter_;
-  if (D.Min() <= 0.0)
-    KALDI_ERR << "Cannot estimate FMLLR: mean scatter has 0 eigenvalues.";
-  for (int32 r = 0; r < dim; r++) {
-    for (int32 c = 0; c < r; c++) {
-      // Eq. (B.21)
-      (*delta_out)(r, c) = delta_in(r, c) / std::sqrt(1 + D(c)) -
-          delta_in(c, r) / ((1 + D(c)) * std::sqrt(1 + D(r) - 1 / (1 + D(c))));
-      // Eq. (B.22)
-      (*delta_out)(c, r) = delta_in(c, r) / std::sqrt(1 + D(r) - 1/ (1 + D(c)));
-    }
-    // Eq. (B.23) & (B.24)
-    (*delta_out)(r, r) = delta_in(r, r) / std::sqrt(2 + D(r));
-    (*delta_out)(r, dim) = delta_in(r, dim);
-  }
-}
-
-
-void Sgmm2FmllrGlobalParams::Write(std::ostream &out, bool binary) const {
-  WriteToken(out, binary, "<SGMM_FMLLR_GLOBAL_PARAMS>");
-  WriteToken(out, binary, "<PRE_XFORM>");
-  pre_xform_.Write(out, binary);
-  WriteToken(out, binary, "<INV_XFORM>");
-  inv_xform_.Write(out, binary);
-  WriteToken(out, binary, "<MEAN_SCATTER>");
-  mean_scatter_.Write(out, binary);
-  if (fmllr_bases_.size() != 0) {
-    WriteToken(out, binary, "<FMLLR_BASIS>");
-    uint32 tmp = static_cast<uint32>(fmllr_bases_.size());
-    WriteBasicType(out, binary, tmp);
-    for (uint32 i = 0; i < tmp; i++) {
-      fmllr_bases_[i].Write(out, binary);
-    }
-  }
-  WriteToken(out, binary, "</SGMM_FMLLR_GLOBAL_PARAMS>");
-}
-
-void Sgmm2FmllrGlobalParams::Read(std::istream &in, bool binary) {
-  ExpectToken(in, binary, "<SGMM_FMLLR_GLOBAL_PARAMS>");
-  ExpectToken(in, binary, "<PRE_XFORM>");
-  pre_xform_.Read(in, binary);
-  ExpectToken(in, binary, "<INV_XFORM>");
-  inv_xform_.Read(in, binary);
-  ExpectToken(in, binary, "<MEAN_SCATTER>");
-  mean_scatter_.Read(in, binary);
-  std::string token;
-  ReadToken(in, binary, &token);
-  if (token == "<FMLLR_BASIS>") {
-    uint32 tmp;
-    ReadBasicType(in, binary, &tmp);
-    fmllr_bases_.resize(tmp);
-    for (uint32 i = 0; i < tmp; i++) {
-      fmllr_bases_[i].Read(in, binary);
-    }
-  } else {
-    if (token != "</SGMM_FMLLR_GLOBAL_PARAMS>")
-      KALDI_ERR << "Unexpected token '" << token << "' found.";
-  }
-}
-
-
-void FmllrSgmm2Accs::Init(int32 dim, int32 num_gaussians) {
-  if (dim == 0) {  // empty stats
-    dim_ = 0;  // non-zero dimension is meaningless in empty stats
-    stats_.Init(0, 0);  // clear the stats
-  } else {
-    dim_ = dim;
-    stats_.Init(dim, num_gaussians);
-  }
-}
-
-BaseFloat FmllrSgmm2Accs::Accumulate(const AmSgmm2 &model,
-                                     const VectorBase<BaseFloat> &data,
-                                     const Sgmm2PerFrameDerivedVars &frame_vars,
-                                     int32 pdf_index, BaseFloat weight,
-                                     Sgmm2PerSpkDerivedVars *spk) {
-  // Calulate Gaussian posteriors and collect statistics
-  Matrix<BaseFloat> posteriors;
-  BaseFloat log_like = model.ComponentPosteriors(frame_vars, pdf_index,
-                                                 spk, &posteriors);
-  posteriors.Scale(weight);
-  AccumulateFromPosteriors(model, *spk, data, frame_vars.gselect, posteriors,
-                           pdf_index);
-  return log_like;
-}
-
-void FmllrSgmm2Accs::AccumulateFromPosteriors(
-    const AmSgmm2 &model,
-    const Sgmm2PerSpkDerivedVars &spk,
-    const VectorBase<BaseFloat> &data,
-    const vector<int32> &gselect,
-    const Matrix<BaseFloat> &posteriors,
-    int32 j2) {
-  Vector<double> var_scaled_mean(dim_), extended_data(dim_+1);
-  extended_data.Range(0, dim_).CopyFromVec(data);
-  extended_data(dim_) = 1.0;
-  SpMatrix<double> scatter(dim_+1, kSetZero);
-  scatter.AddVec2(1.0, extended_data);
-  int32 j1 = model.Pdf2Group(j2);
-  for (int32 ki = 0, ki_max = gselect.size(); ki < ki_max; ki++) {
-    int32 i = gselect[ki];
-
-    for (int32 m = 0; m < model.NumSubstatesForGroup(j1); m++) {
-      // posterior gamma_{jkmi}(t)                             eq.(39)
-      BaseFloat gammat_jmi = posteriors(ki, m);
-
-      // Accumulate statistics for non-zero gaussian posterior
-      if (gammat_jmi > 0.0) {
-        stats_.beta_ += gammat_jmi;
-        model.GetVarScaledSubstateSpeakerMean(j1, m, i, spk,
-                                              &var_scaled_mean);
-        // Eq. (52): K += \gamma_{jmi} \Sigma_{i}^{-1} \mu_{jmi}^{(s)} x^{+T}
-        stats_.K_.AddVecVec(gammat_jmi, var_scaled_mean, extended_data);
-        // Eq. (53): G_{i} += \gamma_{jmi} x^{+} x^{+T}
-        stats_.G_[i].AddSp(gammat_jmi, scatter);
-      }  // non-zero posteriors
-    }  // loop over substates
-  }  // loop over selected Gaussians
-}
-
-void FmllrSgmm2Accs::AccumulateForFmllrSubspace(const AmSgmm2 &sgmm,
-    const Sgmm2FmllrGlobalParams &globals, SpMatrix<double> *grad_scatter) {
-  if (stats_.beta_ <= 0.0) {
-    KALDI_WARN << "Not committing any stats since no stats accumulated.";
-    return;
-  }
-  int32 dim = sgmm.FeatureDim();
-  Matrix<BaseFloat> xform(dim, dim + 1, kUndefined);
-  xform.SetUnit();
-  Matrix<BaseFloat> grad(dim, dim + 1, kSetZero);
-  this->FmllrObjGradient(sgmm, xform, &grad, NULL);
-  Matrix<BaseFloat> pre_xformed_grad(dim, dim + 1, kSetZero);
-  ApplyPreXformToGradient(globals, grad, &pre_xformed_grad);
-  Matrix<BaseFloat> hess_xformed_grad(dim, dim + 1, kSetZero);
-  ApplyHessianXformToGradient(globals, pre_xformed_grad, &hess_xformed_grad);
-  Vector<double> grad_vec(dim * (dim + 1));
-  grad_vec.CopyRowsFromMat(hess_xformed_grad);
-  grad_vec.Scale(1 / std::sqrt(stats_.beta_));
-  grad_scatter->AddVec2(1.0, grad_vec);
-  KALDI_LOG << "Frame counts for when committing fMLLR subspace stats are "
-            << stats_.beta_;
-}
-
-
-BaseFloat FmllrSgmm2Accs::FmllrObjGradient(const AmSgmm2 &sgmm,
-                                          const Matrix<BaseFloat> &xform,
-                                          Matrix<BaseFloat> *grad_out,
-                                          Matrix<BaseFloat> *G_out) const {
-  int32 dim = sgmm.FeatureDim(),
-      num_gauss = sgmm.NumGauss();
-  KALDI_ASSERT(stats_.G_.size() == static_cast<size_t>(num_gauss));
-  Matrix<double> xform_d(xform);
-  SubMatrix<double> A(xform_d, 0, dim, 0, dim);
-  Matrix<double> xform_g(dim, dim + 1), total_g(dim, dim + 1);
-  SpMatrix<double> inv_covar(dim);
-  double obj = stats_.beta_ * A.LogDet() +
-      TraceMatMat(xform_d, stats_.K_, kTrans);
-  for (int32 i = 0; i < num_gauss; i++) {
-    sgmm.GetInvCovars(i, &inv_covar);
-    xform_g.AddMatSp(1.0, xform_d, kNoTrans, stats_.G_[i], 0.0);
-    total_g.AddSpMat(1.0, inv_covar, xform_g, kNoTrans, 1.0);
-  }
-  obj -= 0.5 * TraceMatMat(xform_d, total_g, kTrans);
-  if (G_out != NULL) G_out->CopyFromMat(total_g);
-
-  // Compute the gradient: P = \beta [(A^{-1})^{T} , 0] + K - S
-  if (grad_out != NULL) {
-    Matrix<double> grad_d(dim, dim + 1, kSetZero);
-    grad_d.Range(0, dim, 0, dim).CopyFromMat(A);
-    grad_d.Range(0, dim, 0, dim).InvertDouble();
-    grad_d.Range(0, dim, 0, dim).Transpose();
-    grad_d.Scale(stats_.beta_);
-    grad_d.AddMat(-1.0, total_g, kNoTrans);
-    grad_d.AddMat(1.0, stats_.K_, kNoTrans);
-    grad_out->CopyFromMat(grad_d);
-  }
-
-  return obj;
-}
-
-
-void FmllrSgmm2Accs::Write(std::ostream &out, bool binary) const {
-  WriteToken(out, binary, "<FMLLRACCS>");
-  WriteToken(out, binary, "<DIMENSION>");
-  WriteBasicType(out, binary, dim_);
-  WriteToken(out, binary, "<STATS>");
-  stats_.Write(out, binary);
-  WriteToken(out, binary, "</FMLLRACCS>");
-}
-
-void FmllrSgmm2Accs::Read(std::istream &in, bool binary, bool add) {
-  ExpectToken(in, binary, "<FMLLRACCS>");
-  ExpectToken(in, binary, "<DIMENSION>");
-  ReadBasicType(in, binary, &dim_);
-  KALDI_ASSERT(dim_ > 0);
-  ExpectToken(in, binary, "<STATS>");
-  stats_.Read(in, binary, add);
-  ExpectToken(in, binary, "</FMLLRACCS>");
-}
-
-
-static BaseFloat CalcFmllrStepSize(const AffineXformStats &stats,
-                                   const AmSgmm2 &sgmm,
-                                   const MatrixBase<BaseFloat> &Delta,
-                                   const MatrixBase<BaseFloat> &A,
-                                   const Matrix<BaseFloat> &G,
-                                   int32 max_iters) {
-  int32 dim = sgmm.FeatureDim();
-  Matrix<double> Delta_d(Delta);
-  Matrix<double> G_d(G);
-  SubMatrix<double> Delta_C(Delta_d, 0, dim, 0, dim);
-
-  // Eq. (B.28): m = tr(\Delta K^T) - tr(\Delta S^T)
-  BaseFloat m = TraceMatMat(Delta_d, stats.K_, kTrans)
-                    - TraceMatMat(Delta_d, G_d, kTrans);
-  // Eq. (B.29): n = \sum_i tr(\Delta \Sigma_{i}^{-1} \Delta S_{i})
-  BaseFloat n = 0;
-  SpMatrix<double> inv_covar;
-  for (int32 i = 0, num_gauss = sgmm.NumGauss(); i < num_gauss; i++) {
-    sgmm.GetInvCovars(i, &inv_covar);
-    n += TraceMatSpMatSp(Delta_d, kTrans, inv_covar, Delta_d, kNoTrans,
-                         stats.G_[i]);
-  }
-
-  BaseFloat step_size = 0.0;
-  // initialize just to get rid of compile errors.
-  BaseFloat obj_step_old, obj_step_new = 0.0;
-  Matrix<double> new_A(dim, dim);
-  Matrix<double> B(dim, dim);
-  for (int32 iter_step = 0; iter_step < max_iters; iter_step++) {
-    if (iter_step == 0) {
-      obj_step_old = stats.beta_ * A.LogDet();  // Q_0 = \beta * log det(A)
-    } else {
-      obj_step_old = obj_step_new;
-    }
-
-    // Eq. (B.30); B = (A + k\Delta^{-C})^{-1} \Delta^{-C}
-    new_A.CopyFromMat(A);
-    new_A.AddMat(step_size, Delta_C, kNoTrans);
-    new_A.InvertDouble();
-    B.AddMatMat(1.0, new_A, kNoTrans, Delta_C, kNoTrans, 0.0);
-
-    BaseFloat d = m - step_size * n + stats.beta_ * TraceMat(B);
-    BaseFloat d2 = -n - stats.beta_ * TraceMatMat(B, B, kNoTrans);
-    if (std::fabs(d / d2) < 0.000001) { break; }  // converged
-
-    BaseFloat step_size_change = -(d / d2);
-    step_size += step_size_change;  // Eq. (B.33)
-
-    // Halve step size when the auxiliary function decreases.
-    do {
-      new_A.CopyFromMat(A);
-      new_A.AddMat(step_size, Delta_C, kNoTrans);
-      BaseFloat logdet = new_A.LogDet();
-      obj_step_new = stats.beta_ * logdet + step_size * m -
-          0.5 * step_size * step_size * n;
-
-      if (obj_step_new - obj_step_old < -0.001) {
-        KALDI_WARN << "Objective function decreased (" << obj_step_old << "->"
-                   << obj_step_new << "). Halving step size change ("
-                   << step_size << " -> " << (step_size - (step_size_change/2))
-                   << ")";
-        step_size_change /= 2;
-        step_size -= step_size_change;  // take away half of our step
-      }  // Facing numeric precision issues. Compute in double?
-    } while (obj_step_new - obj_step_old < -0.001 && step_size_change > 1e-05);
-  }
-  return step_size;
-}
-
-
-bool FmllrSgmm2Accs::Update(const AmSgmm2 &sgmm,
-                           const Sgmm2FmllrGlobalParams &globals,
-                           const Sgmm2FmllrConfig &opts,
-                           Matrix<BaseFloat> *out_xform,
-                           BaseFloat *frame_count, BaseFloat *auxf_out) const {
-  BaseFloat auxf_improv = 0.0, logdet = 0.0;
-  KALDI_ASSERT(out_xform->NumRows() == dim_ && out_xform->NumCols() == dim_+1);
-  BaseFloat mincount = (globals.HasBasis() ?
-      std::min(opts.fmllr_min_count_basis, opts.fmllr_min_count_full) :
-      opts.fmllr_min_count);
-  bool using_subspace = (globals.HasBasis() ?
-      (stats_.beta_ < opts.fmllr_min_count_full) : false);
-
-  if (globals.IsEmpty())
-    KALDI_ERR << "Must set up pre-transforms before estimating FMLLR.";
-
-  KALDI_VLOG(1) << "Mincount = " << mincount << "; Basis: "
-                << std::string(globals.HasBasis()? "yes; " : "no; ")
-                << "Using subspace: " << std::string(using_subspace? "yes; "
-                    : "no; ");
-
-  int32 num_bases = 0;
-  if (using_subspace) {
-    KALDI_ASSERT(globals.fmllr_bases_.size() != 0);
-    int32 max_bases = std::min(static_cast<int32>(globals.fmllr_bases_.size()),
-                               opts.num_fmllr_bases);
-    num_bases = (opts.bases_occ_scale <= 0.0)? max_bases :
-        std::min(max_bases, static_cast<int32>(std::floor(opts.bases_occ_scale
-                                                          * stats_.beta_)));
-    KALDI_VLOG(1) << "Have " << stats_.beta_ << " frames for speaker: Using "
-                  << num_bases << " fMLLR bases.";
-  }
-
-  // initialization just to get rid of compile errors.
-  BaseFloat auxf_old = 0, auxf_new = 0;
-  if (frame_count != NULL) *frame_count = stats_.beta_;
-
-  // If occupancy is greater than the min count, update the transform
-  if (stats_.beta_ >= mincount) {
-    for (int32 iter = 0; iter < opts.fmllr_iters; iter++) {
-      Matrix<BaseFloat> grad(dim_, dim_ + 1, kSetZero);
-      Matrix<BaseFloat> G(dim_, dim_ + 1, kSetZero);
-      auxf_new = this->FmllrObjGradient(sgmm, *out_xform, &grad, &G);
-
-      // For diagnostic purposes
-      KALDI_VLOG(3) << "Iter " << iter << ": Auxiliary function = "
-          << (auxf_new / stats_.beta_) << " per frame over " << stats_.beta_
-          << " frames";
-
-      if (iter > 0) {
-        // For diagnostic purposes
-        KALDI_VLOG(2) << "Iter " << iter << ": Auxiliary function improvement: "
-            << ((auxf_new - auxf_old) / stats_.beta_) << " per frame over "
-            << (stats_.beta_) << " frames";
-        auxf_improv += auxf_new - auxf_old;
-      }
-
-      Matrix<BaseFloat> pre_xformed_grad(dim_, dim_ + 1, kSetZero);
-      ApplyPreXformToGradient(globals, grad, &pre_xformed_grad);
-//      std::cout << "Pre-X Grad = " << pre_xformed_grad << std::endl;
-
-      // Transform P_sk with the Hessian
-      Matrix<BaseFloat> hess_xformed_grad(dim_, dim_ + 1, kSetZero);
-      ApplyHessianXformToGradient(globals, pre_xformed_grad,
-                                  &hess_xformed_grad);
-//      std::cout << "Hess-X Grad = " << hess_xformed_grad << std::endl;
-
-      // Update the actual FMLLR transform matrices
-      Matrix<BaseFloat> hess_xformed_delta(dim_, dim_ + 1, kUndefined);
-      if (using_subspace) {
-        // Note that in this case we can simply store the speaker-specific
-        // coefficients for each of the basis matrices. The current
-        // implementation stores the computed transform to simplify the code!
-        hess_xformed_delta.SetZero();
-        for (int32 b = 0; b < num_bases; b++) {  // Eq (B.20)
-          hess_xformed_delta.AddMat(TraceMatMat(globals.fmllr_bases_[b],
-                                                hess_xformed_grad, kTrans),
-                                    globals.fmllr_bases_[b], kNoTrans);
-        }
-        hess_xformed_delta.Scale(1 / stats_.beta_);
-      } else {
-        hess_xformed_delta.CopyFromMat(hess_xformed_grad);
-        hess_xformed_delta.Scale(1 / stats_.beta_);  // Eq. (B.19)
-      }
-
-//      std::cout << "Hess-X Delta = " << hess_xformed_delta << std::endl;
-
-      // Transform Delta with the Hessian
-      Matrix<BaseFloat> pre_xformed_delta(dim_, dim_ + 1, kSetZero);
-      ApplyInvHessianXformToChange(globals, hess_xformed_delta,
-                                   &pre_xformed_delta);
-
-      // Apply inverse pre-transform to Delta
-      Matrix<BaseFloat> delta(dim_, dim_ + 1, kSetZero);
-      ApplyInvPreXformToChange(globals, pre_xformed_delta, &delta);
-
-#ifdef KALDI_PARANOID
-      // Check whether co-ordinate transformation is correct.
-      {
-        BaseFloat tr1 = TraceMatMat(delta, grad, kTrans);
-        BaseFloat tr2 = TraceMatMat(pre_xformed_delta, pre_xformed_grad,
-                                    kTrans);
-        BaseFloat tr3 = TraceMatMat(hess_xformed_delta, hess_xformed_grad,
-                                    kTrans);
-        AssertEqual(tr1, tr2, 1e-5);
-        AssertEqual(tr2, tr3, 1e-5);
-      }
-#endif
-
-      // Calculate the optimal step size
-      SubMatrix<BaseFloat> A(*out_xform, 0, dim_, 0, dim_);
-      BaseFloat step_size = CalcFmllrStepSize(stats_, sgmm, delta, A, G,
-                                              opts.fmllr_iters);
-
-      // Update: W <-- W + k \Delta   Eq. (B.34)
-      out_xform->AddMat(step_size, delta, kNoTrans);
-      auxf_old = auxf_new;
-
-      // Check the objective function change for the last iteration
-      if (iter == opts.fmllr_iters - 1) {
-        auxf_new = this->FmllrObjGradient(sgmm, *out_xform, NULL, NULL);
-        logdet = A.LogDet();
-        // SubMatrix A points to the memory location of out_xform, and so will
-        // contain the updated value
-
-        KALDI_VLOG(2) << "Iter " << iter << ": Auxiliary function improvement: "
-            << ((auxf_new - auxf_old) / stats_.beta_) << " per frame over "
-            << (stats_.beta_) << " frames";
-        auxf_improv += auxf_new - auxf_old;
-      }
-    }
-    if (auxf_out != NULL) *auxf_out = auxf_improv;
-    auxf_improv /= (stats_.beta_ + 1.0e-10);
-
-    KALDI_LOG << "Auxiliary function improvement for FMLLR = " << auxf_improv
-        << " per frame over " << stats_.beta_ << " frames. Log-determinant = "
-        << logdet;
-    return true;
-  } else {
-    KALDI_ASSERT(stats_.beta_ < mincount);
-//    std::cerr.precision(10);
-//    std::cerr.setf(std::ios::fixed,std::ios::floatfield);
-    KALDI_WARN << "Not updating FMLLR because count is " << stats_.beta_
-        << " < " << (mincount);
-    if (auxf_out != NULL) *auxf_out = 0.0;
-    return false;
-  }  // Do not use the transform if it does not have enough counts
-  KALDI_ASSERT(false);  // Should never be reached.
-}
-
-void EstimateSgmm2FmllrSubspace(const SpMatrix<double> &fmllr_grad_scatter,
-                               int32 num_fmllr_bases, int32 feat_dim,
-                               Sgmm2FmllrGlobalParams *globals, double min_eig) {
-  KALDI_ASSERT(num_fmllr_bases > 0 && feat_dim > 0);
-  if (num_fmllr_bases >  feat_dim * (feat_dim + 1)) {
-    num_fmllr_bases = feat_dim * (feat_dim + 1);
-    KALDI_WARN << "Limiting number of fMLLR bases to be the same as transform "
-               << "dimension.";
-  }
-
-  vector< Matrix<BaseFloat> > &fmllr_bases(globals->fmllr_bases_);
-
-  Vector<double> s(fmllr_grad_scatter.NumRows());
-  Matrix<double> U(fmllr_grad_scatter.NumRows(),
-                      fmllr_grad_scatter.NumRows());
-  try {
-    fmllr_grad_scatter.Eig(&s, &U);
-    SortSvd(&s, &U);  // in case was not exactly sorted.
-    KALDI_VLOG(1) << "Eigenvalues (max 200) of CMLLR scatter are: "
-                  << (SubVector<double>(s, 0,
-                                        std::min(static_cast<MatrixIndexT>(200),
-                                                 s.Dim())));
-
-
-//    for (int32 b = 2; b < num_fmllr_bases; b++) {
-//      if (s(b) < min_eig) {
-//        num_fmllr_bases = b;
-//        KALDI_WARN << "Limiting number of fMLLR bases to " << num_fmllr_bases
-//                   << " because of small eigenvalues.";
-//        break;
-//      }
-//    }
-
-    U.Transpose();  // Now the rows of U correspond to the basis vectors.
-    fmllr_bases.resize(num_fmllr_bases);
-    for (int32 b = 0; b < num_fmllr_bases; b++) {
-      fmllr_bases[b].Resize(feat_dim, feat_dim + 1, kSetZero);
-      fmllr_bases[b].CopyRowsFromVec(U.Row(b));
-    }
-    KALDI_LOG << "Estimated " << num_fmllr_bases << " fMLLR basis matrices.";
-  } catch(const std::exception &e) {
-    KALDI_WARN << "Not estimating FMLLR bases because of a thrown exception:\n"
-               << e.what();
-    fmllr_bases.resize(0);
-  }
-}  // End of EstimateSgmm2FmllrSubspace
-
-
-}  // namespace kaldi
-
diff --git a/src/sgmm2/fmllr-sgmm2.h b/src/sgmm2/fmllr-sgmm2.h
deleted file mode 100644
index cfd716de534..00000000000
--- a/src/sgmm2/fmllr-sgmm2.h
+++ /dev/null
@@ -1,193 +0,0 @@
-// sgmm2/fmllr-sgmm2.h
-
-// Copyright 2009-2012     Saarland University (author: Arnab Ghoshal)
-//                         Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_SGMM2_FMLLR_SGMM2_H_
-#define KALDI_SGMM2_FMLLR_SGMM2_H_
-
-#include <string>
-#include <vector>
-
-#include "base/kaldi-common.h"
-#include "sgmm2/am-sgmm2.h"
-#include "transform/transform-common.h"
-#include "util/kaldi-table.h"
-#include "util/kaldi-holder.h"
-#include "itf/options-itf.h"
-
-namespace kaldi {
-
-/** \struct Sgmm2FmllrConfig
- *  Configuration variables needed in the estimation of FMLLR for SGMMs.
- */
-struct Sgmm2FmllrConfig {
-  int32 fmllr_iters;  ///< Number of iterations in FMLLR estimation.
-  int32 step_iters;  ///< Iterations to find optimal FMLLR step size.
-  /// Minimum occupancy count to estimate FMLLR using basis matrices.
-  BaseFloat fmllr_min_count_basis;
-  /// Minimum occupancy count to estimate FMLLR without basis matrices.
-  BaseFloat fmllr_min_count;
-  /// Minimum occupancy count to stop using FMLLR bases and switch to
-  /// regular FMLLR estimation.
-  BaseFloat fmllr_min_count_full;
-  /// Number of basis matrices to use for FMLLR estimation. Can only *reduce*
-  /// the number of bases present. Overridden by the 'bases_occ_scale' option.
-  int32 num_fmllr_bases;
-  /// Scale per-speaker count to determine number of CMLLR bases.
-  BaseFloat bases_occ_scale;
-
-  Sgmm2FmllrConfig() {
-    fmllr_iters = 5;
-    step_iters = 10;
-    fmllr_min_count_basis = 100.0;
-    fmllr_min_count = 1000.0;
-    fmllr_min_count_full = 5000.0;
-    num_fmllr_bases = 50;
-    bases_occ_scale = 0.2;
-  }
-
-  void Register(OptionsItf *opts);
-};
-
-inline void Sgmm2FmllrConfig::Register(OptionsItf *opts) {
-  std::string module = "Sgmm2FmllrConfig: ";
-  opts->Register("fmllr-iters", &fmllr_iters, module+
-                 "Number of iterations in FMLLR estimation.");
-  opts->Register("fmllr-step-iters", &step_iters, module+
-                 "Number of iterations to find optimal FMLLR step size.");
-  opts->Register("fmllr-min-count-bases", &fmllr_min_count_basis, module+
-                 "Minimum occupancy count to estimate FMLLR using basis matrices.");
-  opts->Register("fmllr-min-count", &fmllr_min_count, module+
-                 "Minimum occupancy count to estimate FMLLR (without bases).");
-  opts->Register("fmllr-min-count-full", &fmllr_min_count_full, module+
-                 "Minimum occupancy count to stop using basis matrices for FMLLR.");
-  opts->Register("fmllr-num-bases", &num_fmllr_bases, module+
-                 "Number of FMLLR basis matrices.");
-  opts->Register("fmllr-bases-occ-scale", &bases_occ_scale, module+
-                 "Scale per-speaker count to determine number of CMLLR bases.");
-}
-
-
-/** \class Sgmm2FmllrGlobalParams
- *  Global adaptation parameters.
- */
-class Sgmm2FmllrGlobalParams {
- public:
-  void Init(const AmSgmm2 &sgmm, const Vector<BaseFloat> &state_occs);
-  void Write(std::ostream &out_stream, bool binary) const;
-  void Read(std::istream &in_stream, bool binary);
-  bool IsEmpty() const {
-    return (pre_xform_.NumRows() == 0 || inv_xform_.NumRows() == 0 ||
-            mean_scatter_.Dim() == 0);
-  }
-  bool HasBasis() const { return fmllr_bases_.size() != 0; }
-
-  /// Pre-transform matrix. Dim is [D][D+1].
-  Matrix<BaseFloat> pre_xform_;
-  /// Inverse of pre-transform. Dim is [D][D+1].
-  Matrix<BaseFloat> inv_xform_;
-  /// Diagonal of mean-scatter matrix. Dim is [D]
-  Vector<BaseFloat> mean_scatter_;
-  /// \tilde{W}_b.  [b][d][d], dim is [B][D][D+1].
-  std::vector< Matrix<BaseFloat> > fmllr_bases_;
-};
-
-inline void Sgmm2FmllrGlobalParams::Init(const AmSgmm2 &sgmm,
-                                        const Vector<BaseFloat> &state_occs) {
-  sgmm.ComputeFmllrPreXform(state_occs, &pre_xform_, &inv_xform_,
-                            &mean_scatter_);
-}
-
-/** \class FmllrSgmm2Accs
- *  Class for computing the accumulators needed for the maximum-likelihood
- *  estimate of FMLLR transforms for a subspace GMM acoustic model.
- */
-class FmllrSgmm2Accs {
- public:
-  FmllrSgmm2Accs() : dim_(-1) {}
-  ~FmllrSgmm2Accs() {}
-
-  void Init(int32 dim, int32 num_gaussians);
-  void SetZero() { stats_.SetZero(); }
-
-  void Write(std::ostream &out_stream, bool binary) const;
-  void Read(std::istream &in_stream, bool binary, bool add);
-
-  /// Accumulation routine that computes the Gaussian posteriors and calls
-  /// the AccumulateFromPosteriors function with the computed posteriors.
-  /// The 'data' argument is not FMLLR-transformed and is needed in addition
-  /// to the the 'frame_vars' since the latter only contains a copy of the
-  /// transformed feature vector.
-  BaseFloat Accumulate(const AmSgmm2 &sgmm,                       
-                       const VectorBase<BaseFloat> &data,
-                       const Sgmm2PerFrameDerivedVars &frame_vars,
-                       int32 state_index,
-                       BaseFloat weight,
-                       Sgmm2PerSpkDerivedVars *spk);
-
-  void AccumulateFromPosteriors(const AmSgmm2 &sgmm,
-                                const Sgmm2PerSpkDerivedVars &spk,
-                                const VectorBase<BaseFloat> &data,
-                                const std::vector<int32> &gauss_select,
-                                const Matrix<BaseFloat> &posteriors,
-                                int32 state_index);
-
-  void AccumulateForFmllrSubspace(const AmSgmm2 &sgmm,
-                                  const Sgmm2FmllrGlobalParams &fmllr_globals,
-                                  SpMatrix<double> *grad_scatter);
-
-  BaseFloat FmllrObjGradient(const AmSgmm2 &sgmm,
-                             const Matrix<BaseFloat> &xform,
-                             Matrix<BaseFloat> *grad_out,
-                             Matrix<BaseFloat> *G_out) const;
-
-  /// Computes the FMLLR transform from the accumulated stats, using the
-  /// pre-transforms in fmllr_globals. Expects the transform matrix out_xform
-  /// to be initialized to the correct size. Returns true if the transform was
-  /// updated (i.e. had enough counts).
-  bool Update(const AmSgmm2 &model,
-              const Sgmm2FmllrGlobalParams &fmllr_globals,
-              const Sgmm2FmllrConfig &opts, Matrix<BaseFloat> *out_xform,
-              BaseFloat *frame_count, BaseFloat *auxf_improv) const;
-
-  /// Accessors
-  int32 Dim() const { return dim_; }
-  const AffineXformStats &stats() const { return stats_; }
-
- private:
-  AffineXformStats stats_;  ///< Accumulated stats
-  int32 dim_;  ///< Dimension of feature vectors
-
-  // Cannot have copy constructor and assigment operator
-  KALDI_DISALLOW_COPY_AND_ASSIGN(FmllrSgmm2Accs);
-};
-
-/// Computes the fMLLR basis matrices given the scatter of the vectorized
-/// gradients (eq: B.10). The result is stored in 'fmllr_globals'.
-/// The actual number of bases may be less than 'num_fmllr_bases' depending
-/// on the feature dimension and number of eigenvalues greater than 'min_eig'.
-void EstimateSgmm2FmllrSubspace(const SpMatrix<double> &fmllr_grad_scatter,
-                               int32 num_fmllr_bases, int32 feat_dim,
-                               Sgmm2FmllrGlobalParams *fmllr_globals,
-                               double min_eig = 0.0);
-
-}  // namespace kaldi
-
-#endif  // KALDI_SGMM2_FMLLR_SGMM2_H_
diff --git a/src/sgmm2bin/Makefile b/src/sgmm2bin/Makefile
deleted file mode 100644
index e973061ed8a..00000000000
--- a/src/sgmm2bin/Makefile
+++ /dev/null
@@ -1,26 +0,0 @@
-
-all:
-EXTRA_CXXFLAGS = -Wno-sign-compare
-include ../kaldi.mk
-
-BINFILES = sgmm2-init sgmm2-gselect sgmm2-acc-stats sgmm2-est sgmm2-sum-accs \
-         sgmm2-align-compiled sgmm2-est-spkvecs sgmm2-post-to-gpost \
-         sgmm2-acc-stats-gpost sgmm2-latgen-faster sgmm2-est-spkvecs-gpost \
-         sgmm2-rescore-lattice sgmm2-copy sgmm2-info sgmm2-est-ebw \
-         sgmm2-acc-stats2 sgmm2-comp-prexform sgmm2-est-fmllr sgmm2-project \
-         sgmm2-latgen-faster-parallel init-ubm
-
-OBJFILES =
-
-
-
-TESTFILES =
-
-
-ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a \
-          ../fstext/kaldi-fstext.a ../sgmm2/kaldi-sgmm2.a ../hmm/kaldi-hmm.a \
-          ../feat/kaldi-feat.a ../transform/kaldi-transform.a \
-          ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \
-          ../matrix/kaldi-matrix.a ../base/kaldi-base.a 
-
-include ../makefiles/default_rules.mk
diff --git a/src/sgmm2bin/init-ubm.cc b/src/sgmm2bin/init-ubm.cc
deleted file mode 100644
index 3a0d398b7f6..00000000000
--- a/src/sgmm2bin/init-ubm.cc
+++ /dev/null
@@ -1,95 +0,0 @@
-// sgmmbin/init-ubm.cc
-
-// Copyright 2009-2011   Saarland University
-// Author:  Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "util/kaldi-io.h"
-#include "gmm/diag-gmm.h"
-#include "gmm/full-gmm.h"
-#include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    typedef kaldi::int32 int32;
-    typedef kaldi::BaseFloat BaseFloat;
-
-    const char *usage =
-        "Cluster the Gaussians in a diagonal-GMM acoustic model\n"
-        "to a single full-covariance or diagonal-covariance GMM.\n"
-        "Usage: init-ubm [options] <model-file> <state-occs> <gmm-out>\n";
-
-    bool binary_write = true, fullcov_ubm = true;
-    kaldi::ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("fullcov-ubm", &fullcov_ubm, "Write out full covariance UBM.");
-    kaldi::UbmClusteringOptions ubm_opts;
-    ubm_opts.Register(&po);
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-    ubm_opts.Check();
-    
-    std::string model_in_filename = po.GetArg(1),
-        occs_in_filename = po.GetArg(2),
-        gmm_out_filename = po.GetArg(3);
-
-    kaldi::AmDiagGmm am_gmm;
-    kaldi::TransitionModel trans_model;
-    {
-      bool binary_read;
-      kaldi::Input ki(model_in_filename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_gmm.Read(ki.Stream(), binary_read);
-    }
-
-    kaldi::Vector<BaseFloat> state_occs;
-    state_occs.Resize(am_gmm.NumPdfs());
-    {
-      bool binary_read;
-      kaldi::Input ki(occs_in_filename, &binary_read);
-      state_occs.Read(ki.Stream(), binary_read);
-    }
-
-    kaldi::DiagGmm ubm;
-    ClusterGaussiansToUbm(am_gmm, state_occs, ubm_opts, &ubm);
-    if (fullcov_ubm) {
-      kaldi::FullGmm full_ubm;
-      full_ubm.CopyFromDiagGmm(ubm);
-      kaldi::Output ko(gmm_out_filename, binary_write);
-      full_ubm.Write(ko.Stream(), binary_write);
-    } else {
-      kaldi::Output ko(gmm_out_filename, binary_write);
-      ubm.Write(ko.Stream(), binary_write);
-    }
-
-    KALDI_LOG << "Written UBM to " << gmm_out_filename;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/sgmm2bin/sgmm2-acc-stats-gpost.cc b/src/sgmm2bin/sgmm2-acc-stats-gpost.cc
deleted file mode 100644
index 9c6fa5989c8..00000000000
--- a/src/sgmm2bin/sgmm2-acc-stats-gpost.cc
+++ /dev/null
@@ -1,181 +0,0 @@
-// sgmm2bin/sgmm2-acc-stats-gpost.cc
-
-// Copyright 2009-2012   Saarland University  Microsoft Corporation
-//                       Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm2/am-sgmm2.h"
-#include "hmm/transition-model.h"
-#include "sgmm2/estimate-am-sgmm2.h"
-
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  try {
-    const char *usage =
-        "Accumulate stats for SGMM training, given Gaussian-level posteriors\n"
-        "Usage: sgmm2-acc-stats-gpost [options] <model-in> <feature-rspecifier> "
-        "<gpost-rspecifier> <stats-out>\n"
-        "e.g.: sgmm2-acc-stats-gpost 1.mdl 1.ali scp:train.scp ark, s, cs:- 1.acc\n";
-
-    ParseOptions po(usage);
-    bool binary = true;
-    std::string spkvecs_rspecifier, utt2spk_rspecifier;
-    std::string update_flags_str = "vMNwcSt";
-    BaseFloat rand_prune = 1.0e-05;
-
-    po.Register("binary", &binary, "Write output in binary mode");
-    po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)");
-    po.Register("utt2spk", &utt2spk_rspecifier,
-                "rspecifier for utterance to speaker map");
-    po.Register("rand-prune", &rand_prune, "Pruning threshold for posteriors");
-    po.Register("update-flags", &update_flags_str, "Which SGMM parameters to update: subset of vMNwcS.");
-    po.Read(argc, argv);
-
-    kaldi::SgmmUpdateFlagsType acc_flags = StringToSgmmUpdateFlags(update_flags_str);
-
-    if (po.NumArgs() != 4) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_filename = po.GetArg(1),
-        feature_rspecifier = po.GetArg(2),
-        gpost_rspecifier = po.GetArg(3),
-        accs_wxfilename = po.GetArg(4);
-
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-
-    // Initialize the readers before the model, as this can avoid
-    // crashes on systems with low virtual memory.
-    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    RandomAccessSgmm2GauPostReader gpost_reader(gpost_rspecifier);
-    RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
-                                                           utt2spk_rspecifier);
-    RandomAccessTokenReader utt2spk_map(utt2spk_rspecifier);
-    
-    AmSgmm2 am_sgmm;
-    TransitionModel trans_model;
-    {
-      bool binary;
-      Input ki(model_filename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_sgmm.Read(ki.Stream(), binary);
-    }
-
-    Vector<double> transition_accs;
-    trans_model.InitStats(&transition_accs);
-    MleAmSgmm2Accs sgmm_accs(rand_prune);
-    sgmm_accs.ResizeAccumulators(am_sgmm, acc_flags, (spkvecs_rspecifier != ""));
-
-    double tot_t = 0.0;
-    kaldi::Sgmm2PerFrameDerivedVars per_frame_vars;
-    
-    int32 num_done = 0, num_err = 0;
-    std::string cur_spk;
-    Sgmm2PerSpkDerivedVars spk_vars;
-    
-    for (; !feature_reader.Done(); feature_reader.Next()) {
-      std::string utt = feature_reader.Key();
-      std::string spk = utt;
-
-      if (!utt2spk_rspecifier.empty()) {
-        if (!utt2spk_map.HasKey(utt)) {
-          KALDI_WARN << "utt2spk map does not have value for " << utt
-                     << ", ignoring this utterance.";
-          continue;
-        } else { spk = utt2spk_map.Value(utt); }
-      }
-
-      if (spk != cur_spk && cur_spk != "")
-        sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars);
-      
-      if (spk != cur_spk || spk_vars.Empty()) {
-        spk_vars.Clear();
-        if (spkvecs_reader.IsOpen()) {
-          if (spkvecs_reader.HasKey(utt)) {
-            spk_vars.SetSpeakerVector(spkvecs_reader.Value(utt));
-            am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
-          } else {
-            KALDI_WARN << "Cannot find speaker vector for " << utt;
-            num_err++;
-            continue;
-          }
-        } // else spk_vars is "empty"
-      }
-
-      cur_spk = spk;      
-      
-      const Matrix<BaseFloat> &mat = feature_reader.Value();
-      if (!gpost_reader.HasKey(utt) ||
-          gpost_reader.Value(utt).size() != mat.NumRows()) {
-        KALDI_WARN << "No Gaussian-posterior information for utterance "
-                   << utt << " (or wrong size).";
-        num_err++;
-        continue;
-      }
-      const Sgmm2GauPost &gpost = gpost_reader.Value(utt);
-      
-      num_done++;
-      BaseFloat tot_weight = 0.0;
-
-      for (size_t i = 0; i < gpost.size(); i++) {
-        const std::vector<int32> &gselect = gpost[i].gselect;
-        am_sgmm.ComputePerFrameVars(mat.Row(i), gselect, spk_vars,
-                                    &per_frame_vars);
-
-        for (size_t j = 0; j < gpost[i].tids.size(); j++) {
-          int32 tid = gpost[i].tids[j],  // transition identifier.
-              pdf_id = trans_model.TransitionIdToPdf(tid);
-          
-          BaseFloat weight = gpost[i].posteriors[j].Sum();
-          trans_model.Accumulate(weight, tid, &transition_accs);
-          sgmm_accs.AccumulateFromPosteriors(am_sgmm, per_frame_vars,
-                                             gpost[i].posteriors[j],
-                                             pdf_id, &spk_vars);
-          tot_weight += weight;
-        }
-      }
-
-      tot_t += tot_weight;
-      if (num_done % 50 == 0)
-        KALDI_LOG << "Processed " << num_done << " utterances";      
-    }
-    sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars); // for last speaker
-    
-    KALDI_LOG << "Overall number of frames is " << tot_t;
-    KALDI_LOG << "Done " << num_done << " files, "
-              << num_err << " with errors.";
-
-    {
-      Output ko(accs_wxfilename, binary);
-      transition_accs.Write(ko.Stream(), binary);
-      sgmm_accs.Write(ko.Stream(), binary);
-    }
-    KALDI_LOG << "Written accs.";
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/sgmm2bin/sgmm2-acc-stats.cc b/src/sgmm2bin/sgmm2-acc-stats.cc
deleted file mode 100644
index a083e169a2e..00000000000
--- a/src/sgmm2bin/sgmm2-acc-stats.cc
+++ /dev/null
@@ -1,223 +0,0 @@
-// sgmm2bin/sgmm2-acc-stats.cc
-
-// Copyright 2009-2012   Saarland University (Author:  Arnab Ghoshal),
-//                       Johns Hopkins University (Author:  Daniel Povey)
-//                2014   Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm2/am-sgmm2.h"
-#include "hmm/transition-model.h"
-#include "sgmm2/estimate-am-sgmm2.h"
-#include "hmm/posterior.h"
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  try {
-    const char *usage =
-        "Accumulate stats for SGMM training.\n"
-        "Usage: sgmm2-acc-stats [options] <model-in> <feature-rspecifier> "
-        "<posteriors-rspecifier> <stats-out>\n"
-        "e.g.: sgmm2-acc-stats --gselect=ark:gselect.ark 1.mdl 1.ali scp:train.scp 'ark:ali-to-post 1.ali ark:-|' 1.acc\n"
-        "(note: gselect option is mandatory)\n";
-        
-    ParseOptions po(usage);
-    bool binary = true;
-    std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier;
-    std::string update_flags_str = "vMNwcSt";
-    BaseFloat rand_prune = 1.0e-05;
-
-    po.Register("binary", &binary, "Write output in binary mode");
-    po.Register("gselect", &gselect_rspecifier, "Precomputed Gaussian indices (rspecifier)");
-    po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)");
-    po.Register("utt2spk", &utt2spk_rspecifier,
-                "rspecifier for utterance to speaker map");
-    po.Register("rand-prune", &rand_prune, "Pruning threshold for posteriors");
-    po.Register("update-flags", &update_flags_str, "Which SGMM parameters to accumulate "
-                "stats for: subset of vMNwcS.");
-
-    po.Read(argc, argv);
-
-    kaldi::SgmmUpdateFlagsType acc_flags = StringToSgmmUpdateFlags(update_flags_str);
-
-    if (po.NumArgs() != 4) {
-      po.PrintUsage();
-      exit(1);
-    }
-    if (gselect_rspecifier == "")
-      KALDI_ERR << "--gselect option is mandatory.";
-    
-    std::string model_filename = po.GetArg(1),
-        feature_rspecifier = po.GetArg(2),
-        posteriors_rspecifier = po.GetArg(3),
-        accs_wxfilename = po.GetArg(4);
-
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-
-    int32 num_done = 0, num_err = 0;
-    Vector<double> transition_accs;
-    MleAmSgmm2Accs sgmm_accs(rand_prune);
-
-    { // this anonymous scope is to ensure deallocation of unnecessary stuff
-      // while we're writing out the accs, which could be a long time for large
-      // models.
-      
-      // Initialize the readers before the model, as the model can
-      // be large, and we don't want to call fork() after reading it if
-      // virtual memory may be low.
-      SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-      RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);
-      RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
-      RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
-                                                             utt2spk_rspecifier);
-      RandomAccessTokenReader utt2spk_map(utt2spk_rspecifier);
-      
-      AmSgmm2 am_sgmm;
-      TransitionModel trans_model;
-      {
-        bool binary;
-        Input ki(model_filename, &binary);
-        trans_model.Read(ki.Stream(), binary);
-        am_sgmm.Read(ki.Stream(), binary);
-      }
-
-
-      trans_model.InitStats(&transition_accs);
-      sgmm_accs.ResizeAccumulators(am_sgmm, acc_flags, (spkvecs_rspecifier!=""));
-
-      double tot_like = 0.0;
-      double tot_t = 0;
-
-      kaldi::Sgmm2PerFrameDerivedVars per_frame_vars;
-      std::string cur_spk;
-      Sgmm2PerSpkDerivedVars spk_vars;
-              
-      for (; !feature_reader.Done(); feature_reader.Next()) {
-        std::string utt = feature_reader.Key();
-        std::string spk = utt;
-        if (!utt2spk_rspecifier.empty()) {
-          if (!utt2spk_map.HasKey(utt)) {
-            KALDI_WARN << "utt2spk map does not have value for " << utt
-                       << ", ignoring this utterance.";
-            continue;
-          } else { spk = utt2spk_map.Value(utt); }
-        }
-
-        if (spk != cur_spk && cur_spk != "")
-          sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars);        
-        
-        if (spk != cur_spk || spk_vars.Empty()) {
-          spk_vars.Clear();
-          if (spkvecs_reader.IsOpen()) {
-            if (spkvecs_reader.HasKey(utt)) {
-              spk_vars.SetSpeakerVector(spkvecs_reader.Value(utt));
-              am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
-            } else {
-              KALDI_WARN << "Cannot find speaker vector for " << utt;
-              num_err++;
-              continue;
-            }
-          } // else spk_vars is "empty"
-        }
-        
-        cur_spk = spk;
-        
-        const Matrix<BaseFloat> &features = feature_reader.Value();
-        if (!posteriors_reader.HasKey(utt) ||
-            posteriors_reader.Value(utt).size() != features.NumRows()) {
-          KALDI_WARN << "No posterior info available for utterance "
-                     << utt << " (or wrong size)";
-          num_err++;
-          continue;
-        }
-        const Posterior &posterior = posteriors_reader.Value(utt);
-      
-        if (!gselect_reader.HasKey(utt)
-            && gselect_reader.Value(utt).size() != features.NumRows()) {
-          KALDI_WARN << "No Gaussian-selection info available for utterance "
-                     << utt << " (or wrong size)";
-          num_err++;
-        }
-        const std::vector<std::vector<int32> > &gselect =
-            gselect_reader.Value(utt);
-
-        num_done++;
-      
-        BaseFloat tot_like_this_file = 0.0, tot_weight = 0.0;
-
-        Posterior pdf_posterior;
-        ConvertPosteriorToPdfs(trans_model, posterior, &pdf_posterior);
-        for (size_t i = 0; i < posterior.size(); i++) {
-          am_sgmm.ComputePerFrameVars(features.Row(i), gselect[i], spk_vars,
-                                      &per_frame_vars);
-          // Accumulates for SGMM.
-          for (size_t j = 0; j < pdf_posterior[i].size(); j++) {
-            int32 pdf_id = pdf_posterior[i][j].first;
-            BaseFloat weight = pdf_posterior[i][j].second;
-            tot_like_this_file += sgmm_accs.Accumulate(am_sgmm, per_frame_vars,
-                                                       pdf_id, weight, &spk_vars)
-                * weight;
-            tot_weight += weight;
-          }
-
-          // Accumulates for transitions.
-          for (size_t j = 0; j < posterior[i].size(); j++) {
-            int32 tid = posterior[i][j].first;
-            BaseFloat weight = posterior[i][j].second;
-            trans_model.Accumulate(weight, tid, &transition_accs);
-          }
-        }
-        
-        KALDI_VLOG(2) << "Average like for this file is "
-                      << (tot_like_this_file/tot_weight) << " over "
-                      << tot_weight <<" frames.";
-        tot_like += tot_like_this_file;
-        tot_t += tot_weight;
-        if (num_done % 50 == 0) {
-          KALDI_LOG << "Processed " << num_done << " utterances; for utterance "
-                    << utt << " avg. like is "
-                    << (tot_like_this_file/tot_weight)
-                    << " over " << tot_weight <<" frames.";
-        }
-      }
-      sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars); // commit stats for
-      // last speaker.
-      
-      KALDI_LOG << "Overall like per frame (Gaussian only) = "
-                << (tot_like/tot_t) << " over " << tot_t << " frames.";
-
-      KALDI_LOG << "Done " << num_done << " files, " << num_err
-                << " with errors.";
-    } 
-
-    {
-      Output ko(accs_wxfilename, binary);
-      transition_accs.Write(ko.Stream(), binary);
-      sgmm_accs.Write(ko.Stream(), binary);
-    }
-    KALDI_LOG << "Written accs.";
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/sgmm2bin/sgmm2-acc-stats2.cc b/src/sgmm2bin/sgmm2-acc-stats2.cc
deleted file mode 100644
index a2446df9d6e..00000000000
--- a/src/sgmm2bin/sgmm2-acc-stats2.cc
+++ /dev/null
@@ -1,240 +0,0 @@
-// sgmm2bin/sgmm2-acc-stats2.cc
-
-// Copyright 2009-2012   Saarland University (Author:  Arnab Ghoshal),
-//                       Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm2/am-sgmm2.h"
-#include "hmm/transition-model.h"
-#include "sgmm2/estimate-am-sgmm2.h"
-#include "hmm/posterior.h"
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  try {
-    const char *usage =
-        "Accumulate numerator and denominator stats for discriminative training\n"
-        "of SGMMs (input is posteriors of mixed sign)\n"
-        "Usage: sgmm2-acc-stats2 [options] <model-in> <feature-rspecifier> "
-        "<posteriors-rspecifier> <num-stats-out> <den-stats-out>\n"
-        "e.g.: sgmm2-acc-stats2 1.mdl 1.ali scp:train.scp ark:1.posts num.acc den.acc\n";
-
-    ParseOptions po(usage);
-    bool binary = true;
-    std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier;
-    std::string update_flags_str = "vMNwucSt";
-    BaseFloat rand_prune = 1.0e-05;
-
-    po.Register("binary", &binary, "Write output in binary mode");
-    po.Register("gselect", &gselect_rspecifier, "Precomputed Gaussian indices (rspecifier)");
-    po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)");
-    po.Register("utt2spk", &utt2spk_rspecifier,
-                "rspecifier for utterance to speaker map");
-    po.Register("rand-prune", &rand_prune, "Pruning threshold for posteriors");
-    po.Register("update-flags", &update_flags_str, "Which SGMM parameters to accumulate "
-                "stats for: subset of vMNwcS.");
-
-    po.Read(argc, argv);
-
-    kaldi::SgmmUpdateFlagsType acc_flags = StringToSgmmUpdateFlags(update_flags_str);
-    
-    if (po.NumArgs() != 5) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_filename = po.GetArg(1),
-        feature_rspecifier = po.GetArg(2),
-        posteriors_rspecifier = po.GetArg(3),
-        num_accs_wxfilename = po.GetArg(4),
-        den_accs_wxfilename = po.GetArg(5);
-    
-
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    // Initialize the readers before the model, as the model can
-    // be large, and we don't want to call fork() after reading it if
-    // virtual memory may be low.
-    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);
-    RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
-    RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
-                                                           utt2spk_rspecifier);
-    RandomAccessTokenReader utt2spk_map(utt2spk_rspecifier);    
-    
-    AmSgmm2 am_sgmm;
-    TransitionModel trans_model;
-    {
-      bool binary;
-      Input ki(model_filename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_sgmm.Read(ki.Stream(), binary);
-    }
-
-    if (acc_flags & kSgmmSpeakerWeightProjections && !am_sgmm.HasSpeakerDependentWeights()) {
-      acc_flags &= ~kSgmmSpeakerWeightProjections;
-      KALDI_WARN << "Removing speaker weight projections (u) from flags "
-          "as not present in model\n";
-    }
-    if (acc_flags & kSgmmSpeakerProjections && !am_sgmm.HasSpeakerSpace()) {
-      acc_flags &= ~kSgmmSpeakerProjections;
-      KALDI_WARN << "Removing speaker projections (N) from flags "
-          "as not present in model\n";
-    }
-    
-    Vector<double> num_transition_accs, den_transition_accs;
-    if (acc_flags & kaldi::kSgmmTransitions) {
-      trans_model.InitStats(&num_transition_accs);
-      trans_model.InitStats(&den_transition_accs);
-    }
-    MleAmSgmm2Accs num_sgmm_accs(rand_prune), den_sgmm_accs(rand_prune);
-    bool have_spk_vecs = (spkvecs_rspecifier != "");
-    num_sgmm_accs.ResizeAccumulators(am_sgmm, acc_flags, have_spk_vecs);
-    den_sgmm_accs.ResizeAccumulators(am_sgmm, acc_flags, have_spk_vecs);   
-
-    double tot_like = 0.0, tot_weight = 0.0, tot_abs_weight = 0.0;
-    int64 tot_frames = 0;
-
-    kaldi::Sgmm2PerFrameDerivedVars per_frame_vars;
-
-    int32 num_done = 0, num_err = 0;
-    std::string cur_spk;
-    Sgmm2PerSpkDerivedVars spk_vars;
-    
-    for (; !feature_reader.Done(); feature_reader.Next()) {
-      std::string utt = feature_reader.Key();
-      std::string spk = utt;
-      if (!utt2spk_rspecifier.empty()) {
-        if (!utt2spk_map.HasKey(utt)) {
-          KALDI_WARN << "utt2spk map does not have value for " << utt
-                     << ", ignoring this utterance.";
-          continue;
-        } else { spk = utt2spk_map.Value(utt); }
-      }
-      if (spk != cur_spk && cur_spk != "") {
-        num_sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars);
-        den_sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars);
-      }
-      if (spk != cur_spk || spk_vars.Empty()) {
-        spk_vars.Clear();
-        if (spkvecs_reader.IsOpen()) {
-          if (spkvecs_reader.HasKey(utt)) {
-            spk_vars.SetSpeakerVector(spkvecs_reader.Value(utt));
-            am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
-          } else {
-            KALDI_WARN << "Cannot find speaker vector for " << utt;
-            num_err++;
-            continue;
-          }
-        } // else spk_vars is "empty"
-      }
-      cur_spk = spk;
-      
-      const Matrix<BaseFloat> &features = feature_reader.Value();
-      if (!posteriors_reader.HasKey(utt) ||
-          posteriors_reader.Value(utt).size() != features.NumRows()) {
-        KALDI_WARN << "No posterior info available for utterance "
-                   << utt << " (or wrong size)";
-        num_err++;
-        continue;
-      }
-      
-      const Posterior &posterior = posteriors_reader.Value(utt);
-      if (!gselect_reader.HasKey(utt)
-          && gselect_reader.Value(utt).size() != features.NumRows()) {
-        KALDI_WARN << "No Gaussian-selection info available for utterance "
-                   << utt << " (or wrong size)";
-        num_err++;
-      }
-      const std::vector<std::vector<int32> > &gselect =
-          gselect_reader.Value(utt);
-
-      num_done++;
-      BaseFloat tot_like_this_file = 0.0, tot_weight_this_file = 0.0,
-          tot_abs_weight_this_file = 0.0;
-        
-      for (size_t i = 0; i < posterior.size(); i++) {
-        if (posterior[i].empty())
-          continue;
-        am_sgmm.ComputePerFrameVars(features.Row(i), gselect[i], spk_vars,
-                                    &per_frame_vars);
-        
-        for (size_t j = 0; j < posterior[i].size(); j++) {
-          int32 tid = posterior[i][j].first,  // transition identifier.
-              pdf_id = trans_model.TransitionIdToPdf(tid);
-          BaseFloat weight = posterior[i][j].second,
-              abs_weight = std::abs(weight);
-            
-          if (acc_flags & kaldi::kSgmmTransitions) {
-            trans_model.Accumulate(abs_weight, tid,  weight > 0 ?
-                                   &num_transition_accs : &den_transition_accs);
-          }
-          tot_like_this_file +=
-              (weight > 0 ? num_sgmm_accs : den_sgmm_accs).Accumulate(
-                  am_sgmm, per_frame_vars, pdf_id, abs_weight, &spk_vars)
-              * weight;
-          tot_weight_this_file += weight;
-          tot_abs_weight_this_file += abs_weight;
-        }
-      }
-      // Commit stats for the last speaker.
-      num_sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars);
-      den_sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars);
-      
-        
-      tot_like += tot_like_this_file;
-      tot_weight += tot_weight_this_file;
-      tot_abs_weight += tot_abs_weight_this_file;
-      tot_frames += posterior.size();
-      if (num_done % 50 == 0)
-        KALDI_LOG << "Processed " << num_done << " utterances.";
-    }
-    // Commit stats for last speaker.
-    num_sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars);
-    den_sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars);
-    
-    KALDI_LOG << "Overall weighted acoustic likelihood per frame was "
-              << (tot_like/tot_frames) << " over " << tot_frames << " frames; "
-              << "average weight per frame is " << (tot_weight/tot_frames)
-              << ", average abs(weight) per frame is "
-              << (tot_abs_weight/tot_frames);
-    
-    KALDI_LOG << "Done " << num_done << " files, " << num_err
-              << " with errors.";
-    
-    {
-      Output ko(num_accs_wxfilename, binary);
-      num_transition_accs.Write(ko.Stream(), binary);
-      num_sgmm_accs.Write(ko.Stream(), binary);
-    }
-    {
-      Output ko(den_accs_wxfilename, binary);
-      den_transition_accs.Write(ko.Stream(), binary);
-      den_sgmm_accs.Write(ko.Stream(), binary);
-    }
-    KALDI_LOG << "Written accs.";
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/sgmm2bin/sgmm2-align-compiled.cc b/src/sgmm2bin/sgmm2-align-compiled.cc
deleted file mode 100644
index 6b733fe9ab8..00000000000
--- a/src/sgmm2bin/sgmm2-align-compiled.cc
+++ /dev/null
@@ -1,183 +0,0 @@
-// sgmm2bin/sgmm2-align-compiled.cc
-
-// Copyright 2009-2012  Microsoft Corporation;  Saarland University
-//           2012-2014 Johns Hopkins University (Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm2/am-sgmm2.h"
-#include "hmm/transition-model.h"
-#include "hmm/hmm-utils.h"
-#include "fstext/fstext-lib.h"
-#include "decoder/decoder-wrappers.h"
-#include "decoder/training-graph-compiler.h"
-#include "sgmm2/decodable-am-sgmm2.h"
-#include "lat/kaldi-lattice.h" // for {Compact}LatticeArc
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-    using fst::SymbolTable;
-    using fst::VectorFst;
-    using fst::StdArc;
-
-    const char *usage =
-        "Align features given [SGMM-based] models.\n"
-        "Usage: sgmm2-align-compiled [options] <model-in> <graphs-rspecifier> "
-        "<feature-rspecifier> <alignments-wspecifier>\n"
-        "e.g.: sgmm2-align-compiled 1.mdl ark:graphs.fsts scp:train.scp ark:1.ali\n";
-
-    ParseOptions po(usage);
-    bool binary = true;
-    AlignConfig align_config;
-    BaseFloat acoustic_scale = 1.0;
-    BaseFloat transition_scale = 1.0;
-    BaseFloat self_loop_scale = 1.0;
-    BaseFloat log_prune = 5.0;
-    std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier;
-    std::string per_frame_acwt_wspecifier;
-
-    align_config.Register(&po);
-    po.Register("binary", &binary, "Write output in binary mode");
-    po.Register("log-prune", &log_prune, "Pruning beam used to reduce number "
-                "of exp() evaluations.");
-    po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)");
-    po.Register("utt2spk", &utt2spk_rspecifier,
-                "rspecifier for utterance to speaker map");
-    po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic "
-                "likelihoods");
-    po.Register("transition-scale", &transition_scale, "Scaling factor for "
-                "some transition probabilities [see also self-loop-scale].");
-    po.Register("self-loop-scale", &self_loop_scale, "Scaling factor for "
-                "self-loop versus non-self-loop probability mass [controls "
-                "most transition probabilities.]");
-    po.Register("write-per-frame-acoustic-loglikes", &per_frame_acwt_wspecifier,
-                "Wspecifier for table of vectors containing the acoustic log-likelihoods "
-                "per frame for each utterance. E.g. ark:foo/per_frame_logprobs.1.ark");
-    po.Register("gselect", &gselect_rspecifier, "Precomputed Gaussian indices "
-                "(rspecifier)");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 4) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    if (gselect_rspecifier == "")
-      KALDI_ERR << "--gselect option is mandatory.";
-
-    std::string model_in_filename = po.GetArg(1),
-        fst_rspecifier = po.GetArg(2),
-        feature_rspecifier = po.GetArg(3),
-        alignment_wspecifier = po.GetArg(4);
-
-    TransitionModel trans_model;
-    AmSgmm2 am_sgmm;
-    {
-      bool binary;
-      Input ki(model_in_filename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_sgmm.Read(ki.Stream(), binary);
-    }
-
-    SequentialTableReader<fst::VectorFstHolder> fst_reader(fst_rspecifier);
-    RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
-
-    RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
-                                                           utt2spk_rspecifier);
-
-    Int32VectorWriter alignment_writer(alignment_wspecifier);
-    BaseFloatVectorWriter per_frame_acwt_writer(per_frame_acwt_wspecifier);
-
-    int num_done = 0, num_err = 0, num_retry = 0;
-    double tot_like = 0.0;
-    kaldi::int64 frame_count = 0;
-
-    for (; !fst_reader.Done(); fst_reader.Next()) {
-      std::string utt = fst_reader.Key();
-      if (!feature_reader.HasKey(utt)) {
-        KALDI_WARN << "No feature found for utterance " << utt;
-        num_err++;
-        continue;
-      }
-      VectorFst<StdArc> decode_fst(fst_reader.Value());
-      // stops copy-on-write of the fst by deleting the fst inside the reader,
-      // since we're about to mutate the fst by adding transition probs.
-      fst_reader.FreeCurrent();
-
-      const Matrix<BaseFloat> &features = feature_reader.Value(utt);
-      if (features.NumRows() == 0) {
-        KALDI_WARN << "Zero-length utterance: " << utt;
-        num_err++;
-        continue;
-      }
-
-      Sgmm2PerSpkDerivedVars spk_vars;
-      if (spkvecs_reader.IsOpen()) {
-        if (spkvecs_reader.HasKey(utt)) {
-          spk_vars.SetSpeakerVector(spkvecs_reader.Value(utt));
-          am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
-        } else {
-          KALDI_WARN << "Cannot find speaker vector for " << utt;
-          num_err++;
-          continue;
-        }
-      }  // else spk_vars is "empty"
-
-      if (!gselect_reader.HasKey(utt)
-          && gselect_reader.Value(utt).size() != features.NumRows()) {
-        KALDI_WARN << "No Gaussian-selection info available for utterance "
-                   << utt << " (or wrong size)";
-        num_err++;
-      }
-      const std::vector<std::vector<int32> > &gselect =
-          gselect_reader.Value(utt);
-
-      {  // Add transition-probs to the FST.
-        std::vector<int32> disambig_syms;  // empty.
-        AddTransitionProbs(trans_model, disambig_syms,
-                           transition_scale, self_loop_scale,
-                           &decode_fst);
-      }
-
-      DecodableAmSgmm2Scaled sgmm_decodable(am_sgmm, trans_model, features, gselect,
-                                            log_prune, acoustic_scale, &spk_vars);
-
-      AlignUtteranceWrapper(align_config, utt,
-                            acoustic_scale, &decode_fst, &sgmm_decodable,
-                            &alignment_writer, NULL,
-                            &num_done, &num_err, &num_retry,
-                            &tot_like, &frame_count, &per_frame_acwt_writer);
-
-    }
-
-    KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count)
-              << " over " << frame_count<< " frames.";
-    KALDI_LOG << "Retried " << num_retry << " out of "
-              << (num_done + num_err) << " utterances.";
-    KALDI_LOG << "Done " << num_done << ", errors on " << num_err;
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/sgmm2bin/sgmm2-comp-prexform.cc b/src/sgmm2bin/sgmm2-comp-prexform.cc
deleted file mode 100644
index a216300fab7..00000000000
--- a/src/sgmm2bin/sgmm2-comp-prexform.cc
+++ /dev/null
@@ -1,84 +0,0 @@
-// sgmm2bin/sgmm2-comp-prexform.cc
-
-// Copyright 2009-2012  Saarland University (author: Arnab Ghoshal)
-//                      Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "util/common-utils.h"
-#include "sgmm2/am-sgmm2.h"
-#include "sgmm2/fmllr-sgmm2.h"
-#include "hmm/transition-model.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Compute \"pre-transform\" parameters required for estimating fMLLR with\n"
-        "SGMMs, and write to a model file, after the SGMM.\n"
-        "Usage: sgmm2-comp-prexform [options] <sgmm2-in> <occs-in> <sgmm-out>\n";
-
-    bool binary = true;
-    kaldi::ParseOptions po(usage);
-    po.Register("binary", &binary, "Write output in binary mode");
-    po.Read(argc, argv);
-
-    if (po.NumArgs() < 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string sgmm_in_filename = po.GetArg(1),
-        occs_filename = po.GetArg(2),
-        sgmm_out_filename = po.GetArg(3);
-
-    kaldi::AmSgmm2 sgmm_in;
-    kaldi::TransitionModel trans_model;
-    {
-      bool binary_read;
-      kaldi::Input ki(sgmm_in_filename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      sgmm_in.Read(ki.Stream(), binary_read);
-    }
-
-    kaldi::Vector<kaldi::BaseFloat> occs;
-    {
-      bool binary_read;
-      kaldi::Input ki(occs_filename, &binary_read);
-      occs.Read(ki.Stream(), binary_read);
-    }
-
-    kaldi::Sgmm2FmllrGlobalParams fmllr_globals;
-    sgmm_in.ComputeFmllrPreXform(occs, &fmllr_globals.pre_xform_,
-                                 &fmllr_globals.inv_xform_,
-                                 &fmllr_globals.mean_scatter_);
-
-    {
-      kaldi::Output ko(sgmm_out_filename, binary);
-      trans_model.Write(ko.Stream(), binary);
-      sgmm_in.Write(ko.Stream(), binary, kaldi::kSgmmWriteAll);
-      fmllr_globals.Write(ko.Stream(), binary);
-    }
-
-    KALDI_LOG << "Written model to " << sgmm_out_filename;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/sgmm2bin/sgmm2-copy.cc b/src/sgmm2bin/sgmm2-copy.cc
deleted file mode 100644
index b3271e0938f..00000000000
--- a/src/sgmm2bin/sgmm2-copy.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-// sgmm2bin/sgmm2-copy.cc
-
-// Copyright 2009-2012  Microsoft Corporation
-//                      Johns Hopkins University (author: Daniel Povey).
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-
-#include "sgmm2/am-sgmm2.h"
-#include "hmm/transition-model.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-    const char *usage =
-        "Copy SGMM (possibly changing binary/text format)\n"
-        "Usage: sgmm2-copy [options] <model-in> <model-out>\n"
-        "e.g.: sgmm2-copy --binary=false 1.mdl 1_text.mdl\n";
-
-    bool binary_write = true;
-
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-
-    po.Read(argc, argv);
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-    std::string model_in_filename = po.GetArg(1),
-        model_out_filename = po.GetArg(2);
-
-    AmSgmm2 am_sgmm;
-    TransitionModel trans_model;
-    {
-      bool binary;
-      Input ki(model_in_filename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_sgmm.Read(ki.Stream(), binary);
-    }
-
-    {
-      Output ko(model_out_filename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-      am_sgmm.Write(ko.Stream(), binary_write, kSgmmWriteAll);
-    }
-    
-    
-    KALDI_LOG << "Written model to " << model_out_filename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/sgmm2bin/sgmm2-est-ebw.cc b/src/sgmm2bin/sgmm2-est-ebw.cc
deleted file mode 100644
index bff0e8ff04b..00000000000
--- a/src/sgmm2bin/sgmm2-est-ebw.cc
+++ /dev/null
@@ -1,118 +0,0 @@
-// sgmm2bin/sgmm2-est-ebw.cc
-
-// Copyright 2012  Johns Hopkins Univerity (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "util/kaldi-thread.h"
-#include "hmm/transition-model.h"
-#include "sgmm2/estimate-am-sgmm2-ebw.h"
-
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  typedef kaldi::int32 int32;
-  using std::string;
-  try {
-    const char *usage =
-        "Estimate SGMM model parameters discriminatively using Extended\n"
-        "Baum-Welch style of update\n"
-        "Usage: sgmm2-est-ebw [options] <model-in> <num-stats-in> <den-stats-in> <model-out>\n";
-
-
-    string update_flags_str = "vMNwcSt";
-    bool binary_write = true;
-    string write_flags_str = "gsnu";
-    EbwAmSgmm2Options opts;
-
-    
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("update-flags", &update_flags_str, "Which SGMM parameters to "
-                "update: subset of vMNwcSt.");
-    po.Register("write-flags", &write_flags_str, "Which SGMM parameters to "
-                "write: subset of gsnu");
-    po.Register("num-threads", &g_num_threads, "Number of threads to use in "
-                "weight update and normalizer computation");
-    opts.Register(&po);
-
-    po.Read(argc, argv);
-    if (po.NumArgs() != 4) {
-      po.PrintUsage();
-      exit(1);
-    }
-    string model_in_filename = po.GetArg(1),
-        num_stats_filename = po.GetArg(2),
-        den_stats_filename = po.GetArg(3),
-        model_out_filename = po.GetArg(4);
-    
-    SgmmUpdateFlagsType update_flags = StringToSgmmUpdateFlags(update_flags_str);
-    SgmmWriteFlagsType write_flags = StringToSgmmWriteFlags(write_flags_str);
-
-    AmSgmm2 am_sgmm;
-    TransitionModel trans_model;
-    {
-      bool binary;
-      Input ki(model_in_filename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_sgmm.Read(ki.Stream(), binary);
-    }
-
-    MleAmSgmm2Accs sgmm_num_accs;
-    {
-      bool binary;
-      Vector<double> transition_accs; // won't be used.
-      Input ki(num_stats_filename, &binary);
-      transition_accs.Read(ki.Stream(), binary);
-      sgmm_num_accs.Read(ki.Stream(), binary, false);  // false == add; doesn't matter.
-    }
-    MleAmSgmm2Accs sgmm_den_accs;
-    {
-      bool binary;
-      Vector<double> transition_accs; // won't be used.
-      Input ki(den_stats_filename, &binary);
-      transition_accs.Read(ki.Stream(), binary);
-      sgmm_den_accs.Read(ki.Stream(), binary, false);  // false == add; doesn't matter.
-    }
-    
-    sgmm_num_accs.Check(am_sgmm, true); // Will check consistency and print some diagnostics.
-    sgmm_den_accs.Check(am_sgmm, true); // Will check consistency and print some diagnostics.    
-
-    {  // Update SGMM.
-      BaseFloat auxf_impr, count;
-      kaldi::EbwAmSgmm2Updater sgmm_updater(opts);
-      sgmm_updater.Update(sgmm_num_accs, sgmm_den_accs, &am_sgmm,
-                          update_flags, &auxf_impr, &count);
-      KALDI_LOG << "Overall auxf impr/frame from SGMM update is " << (auxf_impr/count)
-                << " over " << count << " frames.";
-    }
-
-    {
-      Output ko(model_out_filename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-      am_sgmm.Write(ko.Stream(), binary_write, write_flags);
-    }
-    
-    KALDI_LOG << "Wrote model to " << model_out_filename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/sgmm2bin/sgmm2-est-fmllr.cc b/src/sgmm2bin/sgmm2-est-fmllr.cc
deleted file mode 100644
index 8d2b28ee325..00000000000
--- a/src/sgmm2bin/sgmm2-est-fmllr.cc
+++ /dev/null
@@ -1,302 +0,0 @@
-// sgmm2bin/sgmm2-est-fmllr.cc
-
-// Copyright 2009-2012  Saarland University   Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
-//                2014  Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-using std::string;
-#include <vector>
-using std::vector;
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm2/am-sgmm2.h"
-#include "sgmm2/fmllr-sgmm2.h"
-#include "hmm/transition-model.h"
-#include "hmm/posterior.h"
-
-namespace kaldi {
-
-void AccumulateForUtterance(const Matrix<BaseFloat> &feats,
-                            const Matrix<BaseFloat> &transformed_feats, // if already fMLLR
-                            const std::vector<std::vector<int32> > &gselect,
-                            const Posterior &post,
-                            const TransitionModel &trans_model,
-                            const AmSgmm2 &am_sgmm,
-                            BaseFloat logdet,
-                            Sgmm2PerSpkDerivedVars *spk_vars,
-                            FmllrSgmm2Accs *spk_stats) {
-  kaldi::Sgmm2PerFrameDerivedVars per_frame_vars;
-
-  Posterior pdf_post;
-  ConvertPosteriorToPdfs(trans_model, post, &pdf_post);
-  for (size_t t = 0; t < post.size(); t++) {
-    // per-frame vars only used for computing posteriors... use the
-    // transformed feats for this, if available.
-    am_sgmm.ComputePerFrameVars(transformed_feats.Row(t), gselect[t],
-                                *spk_vars, &per_frame_vars);
-    
-
-    for (size_t j = 0; j < pdf_post[t].size(); j++) {
-      int32 pdf_id = pdf_post[t][j].first;
-      Matrix<BaseFloat> posteriors;
-      am_sgmm.ComponentPosteriors(per_frame_vars, pdf_id,
-                                  spk_vars, &posteriors);
-      posteriors.Scale(pdf_post[t][j].second);
-      spk_stats->AccumulateFromPosteriors(am_sgmm, *spk_vars, feats.Row(t),
-                                          gselect[t], posteriors, pdf_id);
-    }
-  }
-}
-
-}  // end namespace kaldi
-
-int main(int argc, char *argv[]) {
-  try {
-    typedef kaldi::int32 int32;
-    using namespace kaldi;
-    const char *usage =
-        "Estimate FMLLR transform for SGMMs, either per utterance or for the "
-        "supplied set of speakers (with spk2utt option).\n"
-        "Reads state-level posteriors. Writes to a table of matrices.\n"
-        "--gselect option is mandatory.\n"
-        "Usage: sgmm2-est-fmllr [options] <model-in> <feature-rspecifier> "
-        "<post-rspecifier> <mats-wspecifier>\n";
-    
-    ParseOptions po(usage);
-    string spk2utt_rspecifier, spkvecs_rspecifier, fmllr_rspecifier,
-        gselect_rspecifier;
-    BaseFloat min_count = 100;
-    Sgmm2FmllrConfig fmllr_opts;
-    
-    po.Register("spk2utt", &spk2utt_rspecifier,
-                "File to read speaker to utterance-list map from.");
-    po.Register("spkvec-min-count", &min_count,
-                "Minimum count needed to estimate speaker vectors");
-    po.Register("spk-vecs", &spkvecs_rspecifier,
-                "Speaker vectors to use during aligment (rspecifier)");
-    po.Register("input-fmllr", &fmllr_rspecifier,
-                "Initial FMLLR transform per speaker (rspecifier)");
-    po.Register("gselect", &gselect_rspecifier,
-                "Precomputed Gaussian indices (rspecifier)");
-    fmllr_opts.Register(&po);
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 4) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    string model_rxfilename = po.GetArg(1),
-        feature_rspecifier = po.GetArg(2),
-        post_rspecifier = po.GetArg(3),
-        fmllr_wspecifier = po.GetArg(4);
-
-    TransitionModel trans_model;
-    AmSgmm2 am_sgmm;
-    Sgmm2FmllrGlobalParams fmllr_globals;
-    {
-      bool binary;
-      Input ki(model_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_sgmm.Read(ki.Stream(), binary);
-      fmllr_globals.Read(ki.Stream(), binary);
-    }
-    if (gselect_rspecifier == "")
-      KALDI_ERR << "--gselect option is required.";
-    
-    RandomAccessPosteriorReader post_reader(post_rspecifier);
-    RandomAccessBaseFloatVectorReader spkvecs_reader(spkvecs_rspecifier);
-    RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
-    RandomAccessBaseFloatMatrixReader fmllr_reader(fmllr_rspecifier);
-
-    BaseFloatMatrixWriter fmllr_writer(fmllr_wspecifier);
-
-    int32 dim = am_sgmm.FeatureDim();
-    FmllrSgmm2Accs spk_stats;
-    spk_stats.Init(dim, am_sgmm.NumGauss());
-    Matrix<BaseFloat> fmllr_xform(dim, dim + 1);
-    BaseFloat logdet = 0.0;
-    double tot_impr = 0.0, tot_t = 0.0;
-    int32 num_done = 0, num_err = 0;
-    std::vector<std::vector<int32> > empty_gselect;
-
-    if (!spk2utt_rspecifier.empty()) {  // per-speaker adaptation
-      SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
-      RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
-
-      for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
-        spk_stats.SetZero();
-        string spk = spk2utt_reader.Key();
-        const vector<string> &uttlist = spk2utt_reader.Value();
-
-        Sgmm2PerSpkDerivedVars spk_vars;
-        if (spkvecs_reader.IsOpen()) {
-          if (spkvecs_reader.HasKey(spk)) {
-            spk_vars.SetSpeakerVector(spkvecs_reader.Value(spk));
-            am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
-          } else {
-            KALDI_WARN << "Cannot find speaker vector for " << spk;
-            num_err++;
-            continue;
-          }
-        }  // else spk_vars is "empty"
-
-        if (fmllr_reader.IsOpen()) {
-          if (fmllr_reader.HasKey(spk)) {
-            fmllr_xform.CopyFromMat(fmllr_reader.Value(spk));
-            logdet = fmllr_xform.Range(0, dim, 0, dim).LogDet();
-          } else {
-            KALDI_WARN << "Cannot find FMLLR transform for " << spk;
-            fmllr_xform.SetUnit();
-            logdet = 0.0;
-          }
-        } else {
-          fmllr_xform.SetUnit();
-          logdet = 0.0;
-        }
-
-        for (size_t i = 0; i < uttlist.size(); i++) {
-          std::string utt = uttlist[i];
-          if (!feature_reader.HasKey(utt)) {
-            KALDI_WARN << "Did not find features for utterance " << utt;
-            num_err++;
-            continue;
-          }
-          const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
-          if (!post_reader.HasKey(utt) ||
-              post_reader.Value(utt).size() != feats.NumRows()) {
-            KALDI_WARN << "Did not find posteriors for utterance " << utt
-                       << " (or wrong size).";
-            num_err++;
-            continue;
-          }
-          const Posterior &post = post_reader.Value(utt);
-          if (!gselect_reader.HasKey(utt) ||
-              gselect_reader.Value(utt).size() != feats.NumRows()) {
-            KALDI_WARN << "Did not find gselect info for utterance " << utt
-                       << " (or wrong size).";
-            num_err++;
-            continue;
-          }
-          const std::vector<std::vector<int32> > &gselect =
-              gselect_reader.Value(utt);
-          
-          Matrix<BaseFloat> transformed_feats(feats);
-          for (int32 r = 0; r < transformed_feats.NumRows(); r++) {
-            SubVector<BaseFloat> row(transformed_feats, r);
-            ApplyAffineTransform(fmllr_xform, &row);
-          }
-          AccumulateForUtterance(feats, transformed_feats, gselect,
-                                 post, trans_model, am_sgmm,
-                                 logdet, &spk_vars, &spk_stats);
-          num_done++;
-        }  // end looping over all utterances of the current speaker
-        
-        BaseFloat impr, spk_frame_count;
-        // Compute the FMLLR transform and write it out.
-        spk_stats.Update(am_sgmm, fmllr_globals, fmllr_opts, &fmllr_xform,
-                         &spk_frame_count, &impr);
-        fmllr_writer.Write(spk, fmllr_xform);
-        tot_impr += impr;
-        tot_t += spk_frame_count;
-      }  // end looping over speakers
-    } else {  // per-utterance adaptation
-      SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-      for (; !feature_reader.Done(); feature_reader.Next()) {
-        string utt = feature_reader.Key();
-        const Matrix<BaseFloat> &feats = feature_reader.Value();
-
-        if (!post_reader.HasKey(utt) ||
-            post_reader.Value(utt).size() != feats.NumRows()) {
-          KALDI_WARN << "Did not find posteriors for utterance " << utt
-                     << " (or wrong size).";
-          num_err++;
-          continue;
-        }
-        const Posterior &post = post_reader.Value(utt);
-        if (!gselect_reader.HasKey(utt) ||
-            gselect_reader.Value(utt).size() != feats.NumRows()) {
-          KALDI_WARN << "Did not find gselect info for utterance " << utt
-                     << " (or wrong size).";
-          num_err++;
-          continue;
-        }
-        const std::vector<std::vector<int32> > &gselect =
-            gselect_reader.Value(utt);
-        
-        if (fmllr_reader.IsOpen()) {
-          if (fmllr_reader.HasKey(utt)) {
-            fmllr_xform.CopyFromMat(fmllr_reader.Value(utt));
-            logdet = fmllr_xform.Range(0, dim, 0, dim).LogDet();
-          } else {
-            KALDI_WARN << "Cannot find FMLLR transform for " << utt;
-            fmllr_xform.SetUnit();
-            logdet = 0.0;
-          }
-        } else {
-          fmllr_xform.SetUnit();
-          logdet = 0.0;
-        }
-        
-        Matrix<BaseFloat> transformed_feats(feats);
-        for (int32 r = 0; r < transformed_feats.NumRows(); r++) {
-          SubVector<BaseFloat> row(transformed_feats, r);
-          ApplyAffineTransform(fmllr_xform, &row);
-        }
-        
-        Sgmm2PerSpkDerivedVars spk_vars;
-        if (spkvecs_reader.IsOpen()) {
-          if (spkvecs_reader.HasKey(utt)) {
-            spk_vars.SetSpeakerVector(spkvecs_reader.Value(utt));
-            am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
-          } else {
-            KALDI_WARN << "Cannot find speaker vector for " << utt;
-            num_err++;
-            continue;
-          }
-        }  // else spk_vars is "empty"
-
-        spk_stats.SetZero();
-
-        AccumulateForUtterance(feats, transformed_feats, gselect,
-                               post, trans_model, am_sgmm,
-                               logdet, &spk_vars, &spk_stats);
-        num_done++;
-        
-        BaseFloat impr, spk_frame_count;
-        // Compute the FMLLR transform and write it out.
-        spk_stats.Update(am_sgmm, fmllr_globals, fmllr_opts, &fmllr_xform,
-                         &spk_frame_count, &impr);
-        fmllr_writer.Write(utt, fmllr_xform);
-        tot_impr += impr;
-        tot_t += spk_frame_count;
-      }
-    }
-
-    KALDI_LOG << "Done " << num_done << " files, " << num_err << " with errors.";
-    KALDI_LOG << "Overall auxf impr per frame is " << (tot_impr / tot_t)
-              << " per frame, over " << tot_t << " frames.";
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
diff --git a/src/sgmm2bin/sgmm2-est-spkvecs-gpost.cc b/src/sgmm2bin/sgmm2-est-spkvecs-gpost.cc
deleted file mode 100644
index e4b680cfd3f..00000000000
--- a/src/sgmm2bin/sgmm2-est-spkvecs-gpost.cc
+++ /dev/null
@@ -1,218 +0,0 @@
-// sgmm2bin/sgmm2-est-spkvecs-gpost.cc
-
-// Copyright 2009-2011   Saarland University;  Microsoft Corporation
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-using std::string;
-#include <vector>
-using std::vector;
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm2/am-sgmm2.h"
-#include "sgmm2/estimate-am-sgmm2.h"
-#include "hmm/transition-model.h"
-
-namespace kaldi {
-
-void AccumulateForUtterance(const Matrix<BaseFloat> &feats,
-                            const Sgmm2GauPost &gpost,
-                            const TransitionModel &trans_model,
-                            const AmSgmm2 &am_sgmm,
-                            Sgmm2PerSpkDerivedVars *spk_vars,
-                            MleSgmm2SpeakerAccs *spk_stats) {
-  kaldi::Sgmm2PerFrameDerivedVars per_frame_vars;
-
-  for (size_t i = 0; i < gpost.size(); i++) {
-    am_sgmm.ComputePerFrameVars(feats.Row(i),
-                                gpost[i].gselect, *spk_vars,
-                                &per_frame_vars);
-
-    for (size_t j = 0; j < gpost[i].tids.size(); j++) {
-      int32 pdf_id = trans_model.TransitionIdToPdf(gpost[i].tids[j]);
-      spk_stats->AccumulateFromPosteriors(am_sgmm, per_frame_vars,
-                                          gpost[i].posteriors[j], pdf_id,
-                                          spk_vars);
-    }
-  }
-}
-
-}  // end namespace kaldi
-
-int main(int argc, char *argv[]) {
-  try {
-    typedef kaldi::int32 int32;
-    using namespace kaldi;
-    const char *usage =
-        "Estimate SGMM speaker vectors, either per utterance or for the "
-        "supplied set of speakers (with spk2utt option).\n"
-        "Reads Gaussian-level posteriors. Writes to a table of vectors.\n"
-        "Usage: sgmm2-est-spkvecs-gpost [options] <model-in> <feature-rspecifier> "
-        "<gpost-rspecifier> <vecs-wspecifier>\n";
-
-    ParseOptions po(usage);
-    string spk2utt_rspecifier, spkvecs_rspecifier;
-    BaseFloat min_count = 100;
-    BaseFloat rand_prune = 1.0e-05;
-
-    po.Register("spk2utt", &spk2utt_rspecifier,
-        "File to read speaker to utterance-list map from.");
-    po.Register("spkvec-min-count", &min_count,
-        "Minimum count needed to estimate speaker vectors");
-    po.Register("rand-prune", &rand_prune, "Randomized pruning parameter for posteriors (more->faster).");
-    po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors to use during aligment (rspecifier)");
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 4) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    string model_rxfilename = po.GetArg(1),
-        feature_rspecifier = po.GetArg(2),
-        gpost_rspecifier = po.GetArg(3),
-        vecs_wspecifier = po.GetArg(4);
-
-    TransitionModel trans_model;
-    AmSgmm2 am_sgmm;
-    {
-      bool binary;
-      Input ki(model_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_sgmm.Read(ki.Stream(), binary);
-    }
-    MleSgmm2SpeakerAccs spk_stats(am_sgmm, rand_prune);
-
-    RandomAccessSgmm2GauPostReader gpost_reader(gpost_rspecifier);
-
-    RandomAccessBaseFloatVectorReader spkvecs_reader(spkvecs_rspecifier);
-
-    BaseFloatVectorWriter vecs_writer(vecs_wspecifier);
-
-    double tot_impr = 0.0, tot_t = 0.0;
-    int32 num_done = 0, num_err = 0;
-
-    if (!spk2utt_rspecifier.empty()) {  // per-speaker adaptation
-      SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
-      RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
-
-      for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
-        spk_stats.Clear();
-        string spk = spk2utt_reader.Key();
-        const vector<string> &uttlist = spk2utt_reader.Value();
-
-        Sgmm2PerSpkDerivedVars spk_vars;
-        if (spkvecs_reader.IsOpen()) {
-          if (spkvecs_reader.HasKey(spk)) {
-            spk_vars.SetSpeakerVector(spkvecs_reader.Value(spk));
-            am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
-          } else {
-            KALDI_WARN << "Cannot find speaker vector for " << spk;
-          }
-        }  // else spk_vars is "empty"
-
-        for (size_t i = 0; i < uttlist.size(); i++) {
-          std::string utt = uttlist[i];
-          if (!feature_reader.HasKey(utt)) {
-            KALDI_WARN << "Did not find features for utterance " << utt;
-            continue;
-          }
-          const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
-          if (!gpost_reader.HasKey(utt) ||
-              gpost_reader.Value(utt).size() != feats.NumRows()) {
-            KALDI_WARN << "Did not find posteriors for utterance " << utt
-                       << " (or wrong size).";
-            num_err++;
-            continue;
-          }
-          const Sgmm2GauPost &gpost = gpost_reader.Value(utt);
-          
-          AccumulateForUtterance(feats, gpost, trans_model, am_sgmm,
-                                 &spk_vars, &spk_stats);
-          num_done++;
-        }  // end looping over all utterances of the current speaker
-
-        BaseFloat impr, spk_tot_t;
-        {  // Compute the spk_vec and write it out.
-          Vector<BaseFloat> spk_vec(am_sgmm.SpkSpaceDim(), kSetZero);
-          if (spk_vars.GetSpeakerVector().Dim() != 0)
-            spk_vec.CopyFromVec(spk_vars.GetSpeakerVector());
-          spk_stats.Update(am_sgmm, min_count, &spk_vec, &impr, &spk_tot_t);
-          vecs_writer.Write(spk, spk_vec);
-        }
-        KALDI_LOG << "For speaker " << spk << ", auxf-impr from speaker vector is "
-                  << (impr/spk_tot_t) << ", over " << spk_tot_t << " frames.\n";
-        tot_impr += impr;
-        tot_t += spk_tot_t;
-      }  // end looping over speakers
-    } else {  // per-utterance adaptation
-      SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-      for (; !feature_reader.Done(); feature_reader.Next()) {
-        string utt = feature_reader.Key();
-        const Matrix<BaseFloat> &feats = feature_reader.Value();
-        if (!gpost_reader.HasKey(utt) ||
-            gpost_reader.Value(utt).size() != feats.NumRows()) {
-          KALDI_WARN << "Did not find posts for utterance "
-                     << utt;
-          num_err++;
-          continue;
-        }
-        const Sgmm2GauPost &gpost = gpost_reader.Value(utt);
-
-        Sgmm2PerSpkDerivedVars spk_vars;
-        if (spkvecs_reader.IsOpen()) {
-          if (spkvecs_reader.HasKey(utt)) {
-            spk_vars.SetSpeakerVector(spkvecs_reader.Value(utt));
-            am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
-          } else {
-            KALDI_WARN << "Cannot find speaker vector for " << utt;
-          }
-        }  // else spk_vars is "empty"
-        
-        num_done++;
-        spk_stats.Clear();
-
-        AccumulateForUtterance(feats, gpost, trans_model, am_sgmm,
-                               &spk_vars, &spk_stats);
-
-        BaseFloat impr, utt_tot_t;
-        {  // Compute the spk_vec and write it out.
-          Vector<BaseFloat> spk_vec(am_sgmm.SpkSpaceDim(), kSetZero);
-          if (spk_vars.GetSpeakerVector().Dim() != 0)
-            spk_vec.CopyFromVec(spk_vars.GetSpeakerVector());
-          spk_stats.Update(am_sgmm, min_count, &spk_vec, &impr, &utt_tot_t);
-          vecs_writer.Write(utt, spk_vec);
-        }
-        KALDI_LOG << "For utterance " << utt << ", auxf-impr from speaker vectors is "
-                  << (impr/utt_tot_t) << ", over " << utt_tot_t << " frames.";
-        tot_impr += impr;
-        tot_t += utt_tot_t;
-      }
-    }
-
-    KALDI_LOG << "Done " << num_done << " files, " << num_err
-              << " with errors.";
-    KALDI_LOG << "Overall auxf impr per frame is " << (tot_impr / tot_t)
-              << " over " << tot_t << " frames.";
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
diff --git a/src/sgmm2bin/sgmm2-est-spkvecs.cc b/src/sgmm2bin/sgmm2-est-spkvecs.cc
deleted file mode 100644
index dc979d4cbd1..00000000000
--- a/src/sgmm2bin/sgmm2-est-spkvecs.cc
+++ /dev/null
@@ -1,259 +0,0 @@
-// sgmm2bin/sgmm2-est-spkvecs.cc
-
-// Copyright 2009-2012  Saarland University  Microsoft Corporation
-//                      Johns Hopkins University (Author: Daniel Povey)
-//                2014  Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-using std::string;
-#include <vector>
-using std::vector;
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm2/am-sgmm2.h"
-#include "sgmm2/estimate-am-sgmm2.h"
-#include "hmm/transition-model.h"
-#include "hmm/posterior.h"
-
-namespace kaldi {
-
-void AccumulateForUtterance(const Matrix<BaseFloat> &feats,
-                            const Posterior &post,
-                            const TransitionModel &trans_model,
-                            const AmSgmm2 &am_sgmm,
-                            const vector< vector<int32> > &gselect,
-                            Sgmm2PerSpkDerivedVars *spk_vars,
-                            MleSgmm2SpeakerAccs *spk_stats) {
-  kaldi::Sgmm2PerFrameDerivedVars per_frame_vars;
-
-  KALDI_ASSERT(gselect.size() == feats.NumRows());
-  Posterior pdf_post;
-  ConvertPosteriorToPdfs(trans_model, post, &pdf_post);
-  for (size_t i = 0; i < post.size(); i++) {
-    am_sgmm.ComputePerFrameVars(feats.Row(i), gselect[i],
-                                *spk_vars, &per_frame_vars);
-    
-    for (size_t j = 0; j < pdf_post[i].size(); j++) {
-      int32 pdf_id = pdf_post[i][j].first;
-      spk_stats->Accumulate(am_sgmm, per_frame_vars, pdf_id,
-                            pdf_post[i][j].second, spk_vars);
-    }
-  }
-}
-
-}  // end namespace kaldi
-
-int main(int argc, char *argv[]) {
-  try {
-    typedef kaldi::int32 int32;
-    using namespace kaldi;
-    const char *usage =
-        "Estimate SGMM speaker vectors, either per utterance or for the "
-        "supplied set of speakers (with spk2utt option).\n"
-        "Reads Gaussian-level posteriors. Writes to a table of vectors.\n"
-        "Usage: sgmm2-est-spkvecs [options] <model-in> <feature-rspecifier> "
-        "<post-rspecifier> <vecs-wspecifier>\n"
-        "note: --gselect option is required.";
-    
-    ParseOptions po(usage);
-    string gselect_rspecifier, spk2utt_rspecifier, spkvecs_rspecifier;
-    BaseFloat min_count = 100;
-    BaseFloat rand_prune = 1.0e-05;
-
-    po.Register("gselect", &gselect_rspecifier,
-                "rspecifier for precomputed per-frame Gaussian indices from.");
-    po.Register("spk2utt", &spk2utt_rspecifier,
-        "File to read speaker to utterance-list map from.");
-    po.Register("spkvec-min-count", &min_count,
-        "Minimum count needed to estimate speaker vectors");
-    po.Register("rand-prune", &rand_prune, "Pruning threshold for posteriors");
-    po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors to use during aligment (rspecifier)");
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 4) {
-      po.PrintUsage();
-      exit(1);
-    }
-    if (gselect_rspecifier == "")
-      KALDI_ERR << "--gselect option is mandatory.";
-    
-    string model_rxfilename = po.GetArg(1),
-        feature_rspecifier = po.GetArg(2),
-        post_rspecifier = po.GetArg(3),
-        vecs_wspecifier = po.GetArg(4);
-
-    TransitionModel trans_model;
-    AmSgmm2 am_sgmm;
-    {
-      bool binary;
-      Input ki(model_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_sgmm.Read(ki.Stream(), binary);
-    }
-    MleSgmm2SpeakerAccs spk_stats(am_sgmm, rand_prune);
-
-    RandomAccessPosteriorReader post_reader(post_rspecifier);
-    RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
-    RandomAccessBaseFloatVectorReader spkvecs_reader(spkvecs_rspecifier);
-
-    BaseFloatVectorWriter vecs_writer(vecs_wspecifier);
-
-    double tot_impr = 0.0, tot_t = 0.0;
-    int32 num_done = 0, num_err = 0;
-    std::vector<std::vector<int32> > empty_gselect;
-
-    if (!spk2utt_rspecifier.empty()) {  // per-speaker adaptation
-      SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
-      RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
-
-      for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
-        spk_stats.Clear();
-        string spk = spk2utt_reader.Key();
-        const vector<string> &uttlist = spk2utt_reader.Value();
-
-        Sgmm2PerSpkDerivedVars spk_vars;
-        if (spkvecs_reader.IsOpen()) {
-          if (spkvecs_reader.HasKey(spk)) {
-            spk_vars.SetSpeakerVector(spkvecs_reader.Value(spk));
-            am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
-          } else {
-            KALDI_WARN << "Cannot find speaker vector for speaker " << spk
-                       << ", not processing this speaker.";
-            num_err++; // standard Kaldi behavior is to not process data
-            // when errors like this happen, as it's generally a script error;
-            continue;
-          }
-        }  // else spk_vars is "empty"
-
-        for (size_t i = 0; i < uttlist.size(); i++) {
-          std::string utt = uttlist[i];
-          if (!feature_reader.HasKey(utt)) {
-            KALDI_WARN << "Did not find features for utterance " << utt;
-            continue;
-          }
-          if (!post_reader.HasKey(utt)) {
-            KALDI_WARN << "Did not find posteriors for utterance " << utt;
-            num_err++;
-            continue;
-          }
-          const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
-          const Posterior &post = post_reader.Value(utt);
-          if (static_cast<int32>(post.size()) != feats.NumRows()) {
-            KALDI_WARN << "Posterior vector has wrong size " << (post.size())
-                       << " vs. " << (feats.NumRows());
-            num_err++;
-            continue;
-          }
-          if (!gselect_reader.HasKey(utt) ||
-              gselect_reader.Value(utt).size() != feats.NumRows()) {
-            KALDI_WARN << "No Gaussian-selection info available for utterance "
-                       << utt << " (or wrong size)";
-            num_err++;
-            continue;
-          }
-          const std::vector<std::vector<int32> > &gselect =
-              gselect_reader.Value(utt);
-          
-          AccumulateForUtterance(feats, post, trans_model, am_sgmm,
-                                 gselect, &spk_vars, &spk_stats);
-          num_done++;
-        }  // end looping over all utterances of the current speaker
-
-        BaseFloat impr, spk_tot_t;
-        {  // Compute the spk_vec and write it out.
-          Vector<BaseFloat> spk_vec(am_sgmm.SpkSpaceDim(), kSetZero);
-          if (spk_vars.GetSpeakerVector().Dim() != 0)
-            spk_vec.CopyFromVec(spk_vars.GetSpeakerVector());
-          spk_stats.Update(am_sgmm, min_count, &spk_vec, &impr, &spk_tot_t);
-          vecs_writer.Write(spk, spk_vec);
-        }
-        KALDI_LOG << "For speaker " << spk << ", auxf-impr from speaker vector is "
-                  << (impr/spk_tot_t) << ", over " << spk_tot_t << " frames.";
-        tot_impr += impr;
-        tot_t += spk_tot_t;
-      }  // end looping over speakers
-    } else {  // per-utterance adaptation
-      SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-      for (; !feature_reader.Done(); feature_reader.Next()) {
-        string utt = feature_reader.Key();        
-        const Matrix<BaseFloat> &feats = feature_reader.Value();
-        if (!post_reader.HasKey(utt) ||
-            post_reader.Value(utt).size() != feats.NumRows()) {
-          KALDI_WARN << "Did not find posts for utterance "
-                     << utt << " (or wrong size).";
-          num_err++;
-          continue;
-        }
-        const Posterior &post = post_reader.Value(utt);
-
-        Sgmm2PerSpkDerivedVars spk_vars;
-        if (spkvecs_reader.IsOpen()) {
-          if (spkvecs_reader.HasKey(utt)) {
-            spk_vars.SetSpeakerVector(spkvecs_reader.Value(utt));
-            am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
-          } else {
-            KALDI_WARN << "Cannot find speaker vector for utterance " << utt
-                       << ", not processing it.";
-            num_err++;
-            continue;
-          }
-        }  // else spk_vars is "empty"
-        
-        num_done++;
-
-        if (!gselect_reader.HasKey(utt) ||
-            gselect_reader.Value(utt).size() != feats.NumRows()) {
-          KALDI_WARN << "No Gaussian-selection info available for utterance "
-                     << utt << " (or wrong size)";
-          num_err++;
-          continue;
-        }
-        const std::vector<std::vector<int32> > &gselect =
-            gselect_reader.Value(utt);
-
-        spk_stats.Clear();
-        
-        AccumulateForUtterance(feats, post, trans_model, am_sgmm,
-                               gselect, &spk_vars, &spk_stats);
-
-        BaseFloat impr, utt_tot_t;
-        {  // Compute the spk_vec and write it out.
-          Vector<BaseFloat> spk_vec(am_sgmm.SpkSpaceDim(), kSetZero);
-          if (spk_vars.GetSpeakerVector().Dim() != 0)
-            spk_vec.CopyFromVec(spk_vars.GetSpeakerVector());
-          spk_stats.Update(am_sgmm, min_count, &spk_vec, &impr, &utt_tot_t);
-          vecs_writer.Write(utt, spk_vec);
-        }
-        KALDI_LOG << "For utterance " << utt << ", auxf-impr from speaker vectors is "
-                  << (impr/utt_tot_t) << ", over " << utt_tot_t << " frames.";
-        tot_impr += impr;
-        tot_t += utt_tot_t;
-      }
-    }
-
-    KALDI_LOG << "Overall auxf impr per frame is "
-              << (tot_impr / tot_t) << " over " << tot_t << " frames.";
-    KALDI_LOG << "Done " << num_done << " files, " << num_err << " with errors.";
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
diff --git a/src/sgmm2bin/sgmm2-est.cc b/src/sgmm2bin/sgmm2-est.cc
deleted file mode 100644
index 0080f72fea6..00000000000
--- a/src/sgmm2bin/sgmm2-est.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-// sgmm2bin/sgmm2-est.cc
-
-// Copyright 2009-2012  Saarland University (Author:  Arnab Ghoshal)
-//                      Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "util/kaldi-thread.h"
-#include "sgmm2/am-sgmm2.h"
-#include "hmm/transition-model.h"
-#include "sgmm2/estimate-am-sgmm2.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-    const char *usage =
-        "Estimate SGMM model parameters from accumulated stats.\n"
-        "Usage: sgmm2-est [options] <model-in> <stats-in> <model-out>\n";
-
-    bool binary_write = true;
-    std::string update_flags_str = "vMNwucSt";
-    std::string write_flags_str = "gsnu";
-    kaldi::MleTransitionUpdateConfig tcfg;
-    kaldi::MleAmSgmm2Options sgmm_opts;
-    kaldi::Sgmm2SplitSubstatesConfig split_opts;
-    int32 increase_phn_dim = 0;
-    int32 increase_spk_dim = 0;
-    bool remove_speaker_space = false;
-    bool spk_dep_weights = false;
-    std::string occs_out_filename;
-
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("increase-phn-dim", &increase_phn_dim, "Increase phone-space "
-                "dimension as far as allowed towards this target.");
-    po.Register("increase-spk-dim", &increase_spk_dim, "Increase speaker-space "
-                "dimension as far as allowed towards this target.");
-    po.Register("spk-dep-weights", &spk_dep_weights, "If true, have speaker-"
-                "dependent weights (symmetric SGMM)-- this option only makes"
-                "a difference if you use the --increase-spk-dim option and "
-                "are increasing the speaker dimension from zero.");
-    po.Register("remove-speaker-space", &remove_speaker_space, "Remove speaker-specific "
-                "projections N");
-    po.Register("write-occs", &occs_out_filename, "File to write pdf "
-                "occupantion counts to.");
-    po.Register("update-flags", &update_flags_str, "Which SGMM parameters to "
-                "update: subset of vMNwcSt.");
-    po.Register("write-flags", &write_flags_str, "Which SGMM parameters to "
-                "write: subset of gsnu");
-    po.Register("num-threads", &g_num_threads, "Number of threads to use in "
-                "weight update and normalizer computation");
-    tcfg.Register(&po);
-    sgmm_opts.Register(&po);
-    split_opts.Register(&po);
-
-    po.Read(argc, argv);
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-    std::string model_in_filename = po.GetArg(1),
-        stats_filename = po.GetArg(2),
-        model_out_filename = po.GetArg(3);
-
-    kaldi::SgmmUpdateFlagsType update_flags =
-        StringToSgmmUpdateFlags(update_flags_str);
-    kaldi::SgmmWriteFlagsType write_flags =
-        StringToSgmmWriteFlags(write_flags_str);
-    
-    AmSgmm2 am_sgmm;
-    TransitionModel trans_model;
-    {
-      bool binary;
-      Input ki(model_in_filename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_sgmm.Read(ki.Stream(), binary);
-    }
-
-    Vector<double> transition_accs;
-    MleAmSgmm2Accs sgmm_accs;
-    {
-      bool binary;
-      Input ki(stats_filename, &binary);
-      transition_accs.Read(ki.Stream(), binary);
-      sgmm_accs.Read(ki.Stream(), binary, true);  // true == add; doesn't matter here.
-    }
-
-    if (update_flags & kSgmmTransitions) {  // Update transition model.
-      BaseFloat objf_impr, count;
-      trans_model.MleUpdate(transition_accs, tcfg, &objf_impr, &count);
-      KALDI_LOG << "Transition model update: Overall " << (objf_impr/count)
-                << " log-like improvement per frame over " << (count)
-                << " frames.";
-    }
-
-    sgmm_accs.Check(am_sgmm, true); // Will check consistency and print some diagnostics.
-
-    { // Do the update.
-      kaldi::MleAmSgmm2Updater updater(sgmm_opts);
-      updater.Update(sgmm_accs, &am_sgmm, update_flags);
-    }
-
-    Vector<BaseFloat> pdf_occs;
-    sgmm_accs.GetStateOccupancies(&pdf_occs);
-
-    if (split_opts.split_substates != 0)
-      am_sgmm.SplitSubstates(pdf_occs, split_opts);
-
-    if (!occs_out_filename.empty()) {
-      kaldi::Output ko(occs_out_filename, binary_write);
-      pdf_occs.Write(ko.Stream(), binary_write);
-    }
-
-    if (increase_phn_dim != 0 || increase_spk_dim != 0) {
-      // Feature normalizing transform matrix used to initialize the new columns
-      // of the phonetic- or speaker-space projection matrices.
-      kaldi::Matrix<BaseFloat> norm_xform;
-      ComputeFeatureNormalizingTransform(am_sgmm.full_ubm(), &norm_xform);
-      if (increase_phn_dim != 0)
-        am_sgmm.IncreasePhoneSpaceDim(increase_phn_dim, norm_xform);
-      if (increase_spk_dim != 0)
-        am_sgmm.IncreaseSpkSpaceDim(increase_spk_dim, norm_xform,
-                                    spk_dep_weights);
-    }
-    if (remove_speaker_space) {
-      KALDI_LOG << "Removing speaker space (projections N_)";
-      am_sgmm.RemoveSpeakerSpace();
-    }
-
-    am_sgmm.ComputeDerivedVars(); // recompute normalizers, and possibly
-    // weights.
-    
-    {
-      Output ko(model_out_filename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-      am_sgmm.Write(ko.Stream(), binary_write, write_flags);
-    }
-    
-    
-    KALDI_LOG << "Written model to " << model_out_filename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/sgmm2bin/sgmm2-gselect.cc b/src/sgmm2bin/sgmm2-gselect.cc
deleted file mode 100644
index ded53b68cb2..00000000000
--- a/src/sgmm2bin/sgmm2-gselect.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-// sgmm2bin/sgmm2-gselect.cc
-
-// Copyright 2009-2012   Saarland University  Microsoft Corporation
-//                       Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm2/am-sgmm2.h"
-#include "hmm/transition-model.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    const char *usage =
-        "Precompute Gaussian indices for SGMM training "
-        "Usage: sgmm2-gselect [options] <model-in> <feature-rspecifier> <gselect-wspecifier>\n"
-        "e.g.: sgmm2-gselect 1.sgmm \"ark:feature-command |\" ark:1.gs\n"
-        "Note: you can do the same thing by combining the programs sgmm2-write-ubm, fgmm-global-to-gmm,\n"
-        "gmm-gselect and fgmm-gselect\n";
-
-    ParseOptions po(usage);
-    kaldi::Sgmm2GselectConfig sgmm_opts;
-    std::string preselect_rspecifier;
-    std::string likelihood_wspecifier;
-    po.Register("write-likes", &likelihood_wspecifier, "Wspecifier for likelihoods per "
-                "utterance");
-    sgmm_opts.Register(&po);
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_filename = po.GetArg(1),
-        feature_rspecifier = po.GetArg(2),
-        gselect_wspecifier = po.GetArg(3);
-
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-
-    AmSgmm2 am_sgmm;
-    {
-      bool binary;
-      Input ki(model_filename, &binary);
-      TransitionModel trans_model;
-      trans_model.Read(ki.Stream(), binary);
-      am_sgmm.Read(ki.Stream(), binary);
-    }
-
-    double tot_like = 0.0;
-    kaldi::int64 tot_t = 0;
-
-    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    Int32VectorVectorWriter gselect_writer(gselect_wspecifier);
-    BaseFloatWriter likelihood_writer(likelihood_wspecifier);
-
-    int32 num_done = 0, num_err = 0;
-    for (; !feature_reader.Done(); feature_reader.Next()) {
-      int32 tot_t_this_file = 0; double tot_like_this_file = 0;
-      std::string utt = feature_reader.Key();
-      const Matrix<BaseFloat> &mat = feature_reader.Value();
-      std::vector<std::vector<int32> > gselect_vec(mat.NumRows());
-      tot_t_this_file += mat.NumRows();
-      for (int32 i = 0; i < mat.NumRows(); i++)
-        tot_like_this_file += am_sgmm.GaussianSelection(sgmm_opts, mat.Row(i), &(gselect_vec[i]));
-
-      gselect_writer.Write(utt, gselect_vec);
-      if (num_done % 10 == 0)
-        KALDI_LOG << "For " << num_done << "'th file, average UBM likelihood over "
-                  << tot_t_this_file << " frames is "
-                  << (tot_like_this_file/tot_t_this_file);
-      tot_t += tot_t_this_file;
-      tot_like += tot_like_this_file;
-
-      if(likelihood_wspecifier != "")
-        likelihood_writer.Write(utt, tot_like_this_file);
-      num_done++;
-    }
-
-    KALDI_LOG << "Done " << num_done << " files, " << num_err
-              << " with errors, average UBM log-likelihood is "
-              << (tot_like/tot_t) << " over " << tot_t << " frames.";
-
-
-    if (num_done != 0) return 0;
-    else return 1;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/sgmm2bin/sgmm2-info.cc b/src/sgmm2bin/sgmm2-info.cc
deleted file mode 100644
index 6b9ce2f30d1..00000000000
--- a/src/sgmm2bin/sgmm2-info.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-// sgmm2bin/sgmm2-info.cc
-
-// Copyright 2012  Arnab Ghoshal  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <iomanip>
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm2/am-sgmm2.h"
-#include "hmm/transition-model.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-    const char *usage =
-        "Print various information about an SGMM.\n"
-        "Usage: sgmm2-info [options] <model-in> [model-in2 ... ]\n";
-
-    bool sgmm_detailed = false;
-    bool trans_detailed = false;
-
-    ParseOptions po(usage);
-    po.Register("sgmm2-detailed", &sgmm_detailed,
-                "Print detailed information about substates.");
-    po.Register("trans-detailed", &trans_detailed,
-                "Print detailed information about transition model.");
-
-    po.Read(argc, argv);
-    if (po.NumArgs() < 1) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    for (int i = 1, max = po.NumArgs(); i <= max; ++i) {
-      std::string model_in_filename = po.GetArg(i);
-      AmSgmm2 am_sgmm;
-      TransitionModel trans_model;
-      {
-        bool binary;
-        Input ki(model_in_filename, &binary);
-        trans_model.Read(ki.Stream(), binary);
-        am_sgmm.Read(ki.Stream(), binary);
-      }
-
-      {
-        using namespace std;
-        cout.setf(ios::left);
-        cout << "\nModel file: " << model_in_filename << endl;
-        cout << " SGMM information:\n"
-          << setw(40) << "  # of HMM states" << am_sgmm.NumPdfs() << endl
-          << setw(40) << "  # of Gaussians per state" << am_sgmm.NumGauss() << endl
-          << setw(40) << "  Dimension of phone vector space"
-          << am_sgmm.PhoneSpaceDim() << endl
-          << setw(40) << "  Dimension of speaker vector space"
-          << am_sgmm.SpkSpaceDim() << endl
-          << setw(40) << "  Dimension of feature vectors"
-             << am_sgmm.FeatureDim() << endl;
-        int32 total_mixweights = 0;
-        for (int32 j2 = 0; j2 < am_sgmm.NumPdfs(); j2++) {
-          total_mixweights += am_sgmm.NumSubstatesForPdf(j2);
-          if (sgmm_detailed) {
-            cout << "  # of substates for state " << setw(13) << j2
-                 << am_sgmm.NumSubstatesForPdf(j2) << endl;
-          }
-        }
-        cout << setw(40) << "  Total # of mixture weights " << total_mixweights << endl;
-        int32 total_groups = am_sgmm.NumGroups();
-        cout << setw(40) << "  Total # of groups of pdfs " << total_groups << endl;
-        int32 total_substates = 0;
-        for (int32 j1 = 0; j1 < am_sgmm.NumGroups(); j1++) {
-          total_substates += am_sgmm.NumSubstatesForGroup(j1);
-        }
-        cout << setw(40) << "  Total # of substates " << total_substates << endl;        
-        cout << "\nTransition model information:\n"
-             << setw(40) << " # of HMM states" << trans_model.NumPdfs() << endl
-             << setw(40) << " # of transition states"
-             << trans_model.NumTransitionStates() << endl;
-          int32 total_indices = 0;
-          for (int32 s = 0; s < trans_model.NumTransitionStates(); s++) {
-            total_indices += trans_model.NumTransitionIndices(s);
-            if (trans_detailed) {
-              cout << "  # of transition ids for state " << setw(8) << s
-                   << trans_model.NumTransitionIndices(s) << endl;
-            }
-          }
-          cout << setw(40) << "  Total # of transition ids " << total_indices
-               << endl;
-      }
-    }
-
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/sgmm2bin/sgmm2-init.cc b/src/sgmm2bin/sgmm2-init.cc
deleted file mode 100644
index 4aaa400c511..00000000000
--- a/src/sgmm2bin/sgmm2-init.cc
+++ /dev/null
@@ -1,132 +0,0 @@
-// sgmm2bin/sgmm2-init.cc
-
-// Copyright 2012   Arnab Ghoshal  Johns Hopkins University (author: Daniel Povey)
-// Copyright 2009-2011   Saarland University
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "util/common-utils.h"
-#include "gmm/am-diag-gmm.h"
-#include "sgmm2/am-sgmm2.h"
-#include "hmm/transition-model.h"
-#include "tree/context-dep.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Initialize an SGMM from a trained full-covariance UBM and a specified"
-        " model topology.\n"
-        "Usage: sgmm2-init [options] <topology> <tree> <init-model> <sgmm-out>\n"
-        "The <init-model> argument can be a UBM (the default case) or another\n"
-        "SGMM (if the --init-from-sgmm flag is used).\n"
-        "For systems with two-level tree, use --pdf-map argument.";
-    
-    bool binary = true, init_from_sgmm = false, spk_dep_weights = false; // will
-    // make it true later.
-    int32 phn_space_dim = 0, spk_space_dim = 0;
-    std::string pdf_map_rxfilename;
-    double self_weight = 1.0;
-    
-    kaldi::ParseOptions po(usage);
-    po.Register("binary", &binary, "Write output in binary mode");
-    po.Register("phn-space-dim", &phn_space_dim, "Phonetic space dimension.");
-    po.Register("spk-space-dim", &spk_space_dim, "Speaker space dimension.");
-    po.Register("spk-dep-weights", &spk_dep_weights, "If true, have speaker-"
-                "dependent weights (symmetric SGMM)");
-    po.Register("init-from-sgmm", &init_from_sgmm,
-                "Initialize from another SGMM (instead of a UBM).");
-    po.Register("self-weight", &self_weight,
-                "If < 1.0, will be the weight of a pdf with its \"own\" mixture, "
-                "where we initialize each group with a number of mixtures.  If"
-                "1.0, we initialize each group with just one mixture component.");
-    po.Register("pdf-map", &pdf_map_rxfilename,
-                "For systems with 2-level trees [SCTM systems], the file that "
-                "maps from pdfs to groups (from build-tree-two-level)");
-    
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 4) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string topo_in_filename = po.GetArg(1),
-        tree_in_filename = po.GetArg(2),
-        init_model_filename = po.GetArg(3),
-        sgmm_out_filename = po.GetArg(4);
-
-    ContextDependency ctx_dep;
-    {
-      bool binary_in;
-      Input ki(tree_in_filename.c_str(), &binary_in);
-      ctx_dep.Read(ki.Stream(), binary_in);
-    }
-
-    std::vector<int32> pdf2group;
-    if (pdf_map_rxfilename != "") {
-      bool binary_in;
-      Input ki(pdf_map_rxfilename, &binary_in);
-      ReadIntegerVector(ki.Stream(), binary_in, &pdf2group);
-    } else {
-      for (int32 i = 0; i < ctx_dep.NumPdfs(); i++) pdf2group.push_back(i);
-    }
-
-    
-    HmmTopology topo;
-    ReadKaldiObject(topo_in_filename, &topo);
-
-    TransitionModel trans_model(ctx_dep, topo);
-    
-    kaldi::AmSgmm2 sgmm;
-    if (init_from_sgmm) {
-      kaldi::AmSgmm2 init_sgmm;
-      {
-        bool binary_read;
-        kaldi::Input ki(init_model_filename, &binary_read);
-        init_sgmm.Read(ki.Stream(), binary_read);
-      }
-      sgmm.CopyGlobalsInitVecs(init_sgmm, pdf2group, self_weight);
-    } else {
-      kaldi::FullGmm ubm;
-      {
-        bool binary_read;
-        kaldi::Input ki(init_model_filename, &binary_read);
-        ubm.Read(ki.Stream(), binary_read);
-      }
-      sgmm.InitializeFromFullGmm(ubm, pdf2group, phn_space_dim,
-                                 spk_space_dim, spk_dep_weights,
-                                 self_weight);
-    }
-    sgmm.ComputeNormalizers();
-
-    {
-      kaldi::Output ko(sgmm_out_filename, binary);
-      trans_model.Write(ko.Stream(), binary);
-      sgmm.Write(ko.Stream(), binary, kaldi::kSgmmWriteAll);
-    }
-
-    KALDI_LOG << "Written model to " << sgmm_out_filename;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/sgmm2bin/sgmm2-latgen-faster-parallel.cc b/src/sgmm2bin/sgmm2-latgen-faster-parallel.cc
deleted file mode 100644
index 31ed135efa7..00000000000
--- a/src/sgmm2bin/sgmm2-latgen-faster-parallel.cc
+++ /dev/null
@@ -1,291 +0,0 @@
-// sgmm2bin/sgmm2-latgen-faster-parallel.cc
-
-// Copyright 2009-2013  Saarland University;  Microsoft Corporation;
-//                      Johns Hopkins University (author: Daniel Povey)
-//                2014  Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-using std::string;
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm2/am-sgmm2.h"
-#include "hmm/transition-model.h"
-#include "fstext/fstext-lib.h"
-#include "decoder/decoder-wrappers.h"
-#include "sgmm2/decodable-am-sgmm2.h"
-#include "util/kaldi-thread.h"
-#include "base/timer.h"
-
-namespace kaldi {
-
-// the reference arguments at the beginning are not const as the style guide
-// requires, but are best viewed as inputs.
-void ProcessUtterance(const AmSgmm2 &am_sgmm,
-                      const TransitionModel &trans_model,
-                      double log_prune,
-                      double acoustic_scale,
-                      const Matrix<BaseFloat> &features,
-                      RandomAccessInt32VectorVectorReader &gselect_reader,
-                      RandomAccessBaseFloatVectorReaderMapped &spkvecs_reader,
-                      const fst::SymbolTable *word_syms,
-                      const std::string &utt,
-                      bool determinize,
-                      bool allow_partial,
-                      Int32VectorWriter *alignments_writer,
-                      Int32VectorWriter *words_writer,
-                      CompactLatticeWriter *compact_lattice_writer,
-                      LatticeWriter *lattice_writer,
-                      LatticeFasterDecoder *decoder, // Takes ownership of this.
-                      double *like_sum,
-                      int64 *frame_sum,
-                      int32 *num_done,
-                      int32 *num_err,
-                      TaskSequencer<DecodeUtteranceLatticeFasterClass> *sequencer) {
-  using fst::Fst;
-  using std::vector;
-
-  Sgmm2PerSpkDerivedVars *spk_vars = new Sgmm2PerSpkDerivedVars; // decodable
-  // will take ownership.
-  if (spkvecs_reader.IsOpen()) {
-    if (spkvecs_reader.HasKey(utt)) {
-      spk_vars->SetSpeakerVector(spkvecs_reader.Value(utt));
-      am_sgmm.ComputePerSpkDerivedVars(spk_vars);
-    } else {
-      KALDI_WARN << "Cannot find speaker vector for " << utt << ", not decoding this utterance";
-      delete spk_vars;
-      (*num_err)++;
-      return;
-    }
-  }
-  if (!gselect_reader.HasKey(utt) ||
-      gselect_reader.Value(utt).size() != features.NumRows()) {
-    KALDI_WARN << "No Gaussian-selection info available for utterance "
-               << utt << " (or wrong size)";
-  }
-
-  // decodable will take ownership.
-  vector<vector<int32> > *gselect = new std::vector<vector<int32> >(
-      gselect_reader.Value(utt));
-
-  Matrix<BaseFloat> *new_feats = new Matrix<BaseFloat>(features); // decodable
-  // will take ownership of this.
-
-  // This takes ownership of new_feats, gselect, and spk_vars
-  DecodableAmSgmm2Scaled *sgmm_decodable = new DecodableAmSgmm2Scaled(
-      am_sgmm, trans_model, new_feats, gselect,
-      spk_vars, log_prune, acoustic_scale);
-
-  // takes ownership of decoder and sgmm_decodable.
-  DecodeUtteranceLatticeFasterClass *task =
-      new DecodeUtteranceLatticeFasterClass(
-          decoder, sgmm_decodable, trans_model, word_syms, utt, acoustic_scale,
-          determinize, allow_partial, alignments_writer, words_writer,
-          compact_lattice_writer, lattice_writer, like_sum, frame_sum, num_done,
-          num_err, NULL);
-
-  sequencer->Run(task); // takes ownership.
-}
-
-} // end namespace kaldi
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-    using fst::SymbolTable;
-    using fst::Fst;
-    using fst::VectorFst;
-    using fst::StdArc;
-
-    const char *usage =
-        "Decode features using SGMM-based model.  This version accepts the --num-threads\n"
-        "option but otherwise behaves identically to sgmm2-latgen-faster\n"
-        "Usage:  sgmm2-latgen-faster-parallel [options] <model-in> (<fst-in>|<fsts-rspecifier>) "
-        "<features-rspecifier> <lattices-wspecifier> [<words-wspecifier> [<alignments-wspecifier>] ]\n";
-    ParseOptions po(usage);
-    BaseFloat acoustic_scale = 0.1;
-    bool allow_partial = false;
-    BaseFloat log_prune = 5.0;
-    string word_syms_filename, gselect_rspecifier, spkvecs_rspecifier,
-        utt2spk_rspecifier;
-
-    LatticeFasterDecoderConfig decoder_opts;
-    TaskSequencerConfig sequencer_config; // has --num-threads option
-    decoder_opts.Register(&po);
-    sequencer_config.Register(&po);
-
-    po.Register("acoustic-scale", &acoustic_scale,
-        "Scaling factor for acoustic likelihoods");
-    po.Register("log-prune", &log_prune,
-                "Pruning beam used to reduce number of exp() evaluations.");
-    po.Register("word-symbol-table", &word_syms_filename,
-        "Symbol table for words [for debug output]");
-    po.Register("allow-partial", &allow_partial,
-                "Produce output even when final state was not reached");
-    po.Register("gselect", &gselect_rspecifier,
-                "rspecifier for precomputed per-frame Gaussian indices.");
-    po.Register("spk-vecs", &spkvecs_rspecifier,
-                "rspecifier for speaker vectors");
-    po.Register("utt2spk", &utt2spk_rspecifier,
-                "rspecifier for utterance to speaker map");
-    po.Read(argc, argv);
-
-    if (po.NumArgs() < 4 || po.NumArgs() > 6) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    if (gselect_rspecifier == "")
-      KALDI_ERR << "--gselect option is required.";
-
-    std::string model_in_filename = po.GetArg(1),
-        fst_in_str = po.GetArg(2),
-        feature_rspecifier = po.GetArg(3),
-        lattice_wspecifier = po.GetArg(4),
-        words_wspecifier = po.GetOptArg(5),
-        alignment_wspecifier = po.GetOptArg(6);
-
-    double tot_like = 0.0;
-    kaldi::int64 frame_count = 0;    
-    int num_done = 0, num_err = 0;
-    Timer timer;
-    Fst<StdArc> *decode_fst = NULL;
-    fst::SymbolTable *word_syms = NULL;
-    
-    TaskSequencer<DecodeUtteranceLatticeFasterClass> sequencer(
-        sequencer_config);
-    TransitionModel trans_model;
-    kaldi::AmSgmm2 am_sgmm;
-    {
-      bool binary;
-      Input ki(model_in_filename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_sgmm.Read(ki.Stream(), binary);
-    }
-
-    CompactLatticeWriter compact_lattice_writer;
-    LatticeWriter lattice_writer;
-    
-    bool determinize = decoder_opts.determinize_lattice;    
-    if (! (determinize ? compact_lattice_writer.Open(lattice_wspecifier)
-           : lattice_writer.Open(lattice_wspecifier)))
-      KALDI_ERR << "Could not open table for writing lattices: "
-                << lattice_wspecifier;
-
-    Int32VectorWriter words_writer(words_wspecifier);
-    Int32VectorWriter alignment_writer(alignment_wspecifier);
-
-    if (word_syms_filename != "") 
-      if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename)))
-        KALDI_ERR << "Could not read symbol table from file "
-                  << word_syms_filename;
-
-    RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
-    RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
-                                                           utt2spk_rspecifier);
-        
-    if (ClassifyRspecifier(fst_in_str, NULL, NULL) == kNoRspecifier) { // a single FST.
-      SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-      // It's important that we initialize decode_fst after feature_reader, as it
-      // can prevent crashes on systems installed without enough virtual memory.
-      // It has to do with what happens on UNIX systems if you call fork() on a
-      // large process: the page-table entries are duplicated, which requires a
-      // lot of virtual memory.
-      decode_fst = fst::ReadFstKaldiGeneric(fst_in_str);
-      timer.Reset(); // exclude graph loading time.
-      
-      {
-        for (; !feature_reader.Done(); feature_reader.Next()) {
-          string utt = feature_reader.Key();
-          const Matrix<BaseFloat> &features(feature_reader.Value());
-          if (features.NumRows() == 0) {
-            KALDI_WARN << "Zero-length utterance: " << utt;
-            num_err++;
-            continue;
-          }
-
-          // ProcessUtterance will take ownership of this.
-          LatticeFasterDecoder *decoder = new LatticeFasterDecoder(
-              *decode_fst, decoder_opts);
-
-          ProcessUtterance(am_sgmm, trans_model, log_prune, acoustic_scale,
-                           features, gselect_reader, spkvecs_reader, word_syms,
-                           utt, determinize, allow_partial,
-                           &alignment_writer, &words_writer, &compact_lattice_writer,
-                           &lattice_writer, decoder, &tot_like, &frame_count,
-                           &num_done, &num_err, &sequencer);
-        }
-      }
-    } else { // We have different FSTs for different utterances.
-      SequentialTableReader<fst::VectorFstHolder> fst_reader(fst_in_str);
-      RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);          
-      for (; !fst_reader.Done(); fst_reader.Next()) {
-        std::string utt = fst_reader.Key();
-        if (!feature_reader.HasKey(utt)) {
-          KALDI_WARN << "Not decoding utterance " << utt
-                     << " because no features available.";
-          num_err++;
-          continue;
-        }
-        const Matrix<BaseFloat> &features = feature_reader.Value(utt);
-        if (features.NumRows() == 0) {
-          KALDI_WARN << "Zero-length utterance: " << utt;
-          num_err++;
-          continue;
-        }
-        VectorFst<StdArc> *fst = fst_reader.Value().Copy(); // Note: this does
-        // a shallow copy because OpenFst is "smart" about these things and
-        // does reference counting.  The constructor of LatticeFasterDecoder
-        // takes ownership of this FST (note: LatticeFasterDecoder has 2
-        // constructors, one of which takes ownership and one of which does not).
-        LatticeFasterDecoder *decoder = new LatticeFasterDecoder(decoder_opts,
-                                                                 fst);
-
-        // ProcessUtterance takes ownership of "decoder".
-        ProcessUtterance(am_sgmm, trans_model, log_prune, acoustic_scale,
-                         features, gselect_reader, spkvecs_reader, word_syms,
-                         utt, determinize, allow_partial,
-                         &alignment_writer, &words_writer, &compact_lattice_writer,
-                         &lattice_writer, decoder, &tot_like, &frame_count,
-                         &num_done, &num_err, &sequencer);
-      }
-    }
-    sequencer.Wait(); // Wait till all tasks are done.
-    
-    delete decode_fst; 
-    delete word_syms;
-    
-    double elapsed = timer.Elapsed();
-    KALDI_LOG << "Decoded with " << sequencer_config.num_threads << " threads.";
-    KALDI_LOG << "Time taken [excluding initialization] "<< elapsed
-              << "s: real-time factor per thread assuming 100 frames/sec is "
-              << (sequencer_config.num_threads * elapsed * 100.0 / frame_count);
-    KALDI_LOG << "Done " << num_done << " utterances, failed for "
-              << num_err;
-    KALDI_LOG << "Overall log-likelihood per frame = " << (tot_like/frame_count)
-              << " over " << frame_count << " frames.";
-
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/sgmm2bin/sgmm2-latgen-faster.cc b/src/sgmm2bin/sgmm2-latgen-faster.cc
deleted file mode 100644
index 39eccc4a6b6..00000000000
--- a/src/sgmm2bin/sgmm2-latgen-faster.cc
+++ /dev/null
@@ -1,268 +0,0 @@
-// sgmm2bin/sgmm2-latgen-faster.cc
-
-// Copyright 2009-2012  Saarland University;  Microsoft Corporation;
-//                      Johns Hopkins University (author: Daniel Povey)
-//                2014  Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-using std::string;
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm2/am-sgmm2.h"
-#include "hmm/transition-model.h"
-#include "fstext/fstext-lib.h"
-#include "decoder/decoder-wrappers.h"
-#include "sgmm2/decodable-am-sgmm2.h"
-#include "base/timer.h"
-
-namespace kaldi {
-
-// the reference arguments at the beginning are not const as the style guide
-// requires, but are best viewed as inputs.
-bool ProcessUtterance(LatticeFasterDecoder &decoder,
-                      const AmSgmm2 &am_sgmm,
-                      const TransitionModel &trans_model,
-                      double log_prune,
-                      double acoustic_scale,
-                      const Matrix<BaseFloat> &features,
-                      RandomAccessInt32VectorVectorReader &gselect_reader,
-                      RandomAccessBaseFloatVectorReaderMapped &spkvecs_reader,
-                      const fst::SymbolTable *word_syms,
-                      const std::string &utt,
-                      bool determinize,
-                      bool allow_partial,
-                      Int32VectorWriter *alignments_writer,
-                      Int32VectorWriter *words_writer,
-                      CompactLatticeWriter *compact_lattice_writer,
-                      LatticeWriter *lattice_writer,
-                      double *like_ptr) { // puts utterance's like in like_ptr on success.
-  using fst::Fst;
-
-  Sgmm2PerSpkDerivedVars spk_vars;
-  if (spkvecs_reader.IsOpen()) {
-    if (spkvecs_reader.HasKey(utt)) {
-      spk_vars.SetSpeakerVector(spkvecs_reader.Value(utt));
-      am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
-    } else {
-      KALDI_WARN << "Cannot find speaker vector for " << utt << ", not decoding this utterance";
-      return false; // We could use zero, but probably the user would want to know about this
-      // (this would normally be a script error or some kind of failure).
-    }
-  }
-  if (!gselect_reader.HasKey(utt) ||
-      gselect_reader.Value(utt).size() != features.NumRows()) {
-    KALDI_WARN << "No Gaussian-selection info available for utterance "
-               << utt << " (or wrong size)";
-  }
-
-  const std::vector<std::vector<int32> > &gselect =
-      gselect_reader.Value(utt);
-  
-  DecodableAmSgmm2Scaled sgmm_decodable(am_sgmm, trans_model, features, gselect,
-                                        log_prune, acoustic_scale, &spk_vars);
-
-  return DecodeUtteranceLatticeFaster(
-      decoder, sgmm_decodable, trans_model, word_syms, utt, acoustic_scale,
-      determinize, allow_partial, alignments_writer, words_writer,
-      compact_lattice_writer, lattice_writer, like_ptr);
-}
-
-} // end namespace kaldi
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-    using fst::SymbolTable;
-    using fst::Fst;
-    using fst::StdArc;
-
-    const char *usage =
-        "Decode features using SGMM-based model.\n"
-        "Usage:  sgmm2-latgen-faster [options] <model-in> (<fst-in>|<fsts-rspecifier>) "
-        "<features-rspecifier> <lattices-wspecifier> [<words-wspecifier> [<alignments-wspecifier>] ]\n";
-    ParseOptions po(usage);
-    BaseFloat acoustic_scale = 0.1;
-    bool allow_partial = false;
-    BaseFloat log_prune = 5.0;
-    string word_syms_filename, gselect_rspecifier, spkvecs_rspecifier,
-        utt2spk_rspecifier;
-
-    LatticeFasterDecoderConfig decoder_opts;
-    decoder_opts.Register(&po);    
-
-    po.Register("acoustic-scale", &acoustic_scale,
-        "Scaling factor for acoustic likelihoods");
-    po.Register("log-prune", &log_prune,
-                "Pruning beam used to reduce number of exp() evaluations.");
-    po.Register("word-symbol-table", &word_syms_filename,
-        "Symbol table for words [for debug output]");
-    po.Register("allow-partial", &allow_partial,
-                "Produce output even when final state was not reached");
-    po.Register("gselect", &gselect_rspecifier,
-                "rspecifier for precomputed per-frame Gaussian indices.");
-    po.Register("spk-vecs", &spkvecs_rspecifier,
-                "rspecifier for speaker vectors");
-    po.Register("utt2spk", &utt2spk_rspecifier,
-                "rspecifier for utterance to speaker map");
-    po.Read(argc, argv);
-
-    if (po.NumArgs() < 4 || po.NumArgs() > 6) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    if (gselect_rspecifier == "")
-      KALDI_ERR << "--gselect option is required.";
-
-    std::string model_in_filename = po.GetArg(1),
-        fst_in_str = po.GetArg(2),
-        feature_rspecifier = po.GetArg(3),
-        lattice_wspecifier = po.GetArg(4),
-        words_wspecifier = po.GetOptArg(5),
-        alignment_wspecifier = po.GetOptArg(6);
-
-    TransitionModel trans_model;
-    kaldi::AmSgmm2 am_sgmm;
-    {
-      bool binary;
-      Input ki(model_in_filename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_sgmm.Read(ki.Stream(), binary);
-    }
-
-    CompactLatticeWriter compact_lattice_writer;
-    LatticeWriter lattice_writer;
-    bool determinize = decoder_opts.determinize_lattice;    
-    if (! (determinize ? compact_lattice_writer.Open(lattice_wspecifier)
-           : lattice_writer.Open(lattice_wspecifier)))
-      KALDI_ERR << "Could not open table for writing lattices: "
-                 << lattice_wspecifier;
-    
-    Int32VectorWriter words_writer(words_wspecifier);
-
-    Int32VectorWriter alignment_writer(alignment_wspecifier);
-
-    fst::SymbolTable *word_syms = NULL;
-    if (word_syms_filename != "") 
-      if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename)))
-        KALDI_ERR << "Could not read symbol table from file "
-                   << word_syms_filename;
-
-    RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
-    RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
-                                                           utt2spk_rspecifier);
-
-    BaseFloat tot_like = 0.0;
-    kaldi::int64 frame_count = 0;
-    int num_success = 0, num_err = 0;
-
-    Timer timer;
-        
-    if (ClassifyRspecifier(fst_in_str, NULL, NULL) == kNoRspecifier) { // a single FST.
-      SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-      // It's important that we initialize decode_fst after feature_reader, as it
-      // can prevent crashes on systems installed without enough virtual memory.
-      // It has to do with what happens on UNIX systems if you call fork() on a
-      // large process: the page-table entries are duplicated, which requires a
-      // lot of virtual memory.
-      Fst<StdArc> *decode_fst = fst::ReadFstKaldiGeneric(fst_in_str);
-      timer.Reset(); // exclude graph loading time.
-      
-      {
-        LatticeFasterDecoder decoder(*decode_fst, decoder_opts);
-    
-        const std::vector<std::vector<int32> > empty_gselect;
-
-        for (; !feature_reader.Done(); feature_reader.Next()) {
-          string utt = feature_reader.Key();
-          const Matrix<BaseFloat> &features(feature_reader.Value());
-          if (features.NumRows() == 0) {
-            KALDI_WARN << "Zero-length utterance: " << utt;
-            num_err++;
-            continue;
-          }
-          double like;
-          if (ProcessUtterance(decoder, am_sgmm, trans_model, log_prune, acoustic_scale,
-                               features, gselect_reader, spkvecs_reader, word_syms,
-                               utt, determinize, allow_partial,
-                               &alignment_writer, &words_writer, &compact_lattice_writer,
-                               &lattice_writer, &like)) {
-            tot_like += like;
-            frame_count += features.NumRows();
-            KALDI_LOG << "Log-like per frame for utterance " << utt << " is "
-                      << (like / features.NumRows()) << " over "
-                      << features.NumRows() << " frames.";
-            num_success++;
-          } else { num_err++; }
-        }
-      }
-      delete decode_fst; // only safe to do this after decoder goes out of scope.
-    } else { // We have different FSTs for different utterances.
-      SequentialTableReader<fst::VectorFstHolder> fst_reader(fst_in_str);
-      RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);          
-      for (; !fst_reader.Done(); fst_reader.Next()) {
-        std::string utt = fst_reader.Key();
-        if (!feature_reader.HasKey(utt)) {
-          KALDI_WARN << "Not decoding utterance " << utt
-                     << " because no features available.";
-          num_err++;
-          continue;
-        }
-        const Matrix<BaseFloat> &features = feature_reader.Value(utt);
-        if (features.NumRows() == 0) {
-          KALDI_WARN << "Zero-length utterance: " << utt;
-          num_err++;
-          continue;
-        }
-        LatticeFasterDecoder decoder(fst_reader.Value(), decoder_opts);
-        double like;
-
-        if (ProcessUtterance(decoder, am_sgmm, trans_model, log_prune, acoustic_scale,
-                             features, gselect_reader, spkvecs_reader, word_syms,
-                             utt, determinize, allow_partial,
-                             &alignment_writer, &words_writer, &compact_lattice_writer,
-                             &lattice_writer, &like)) {
-          tot_like += like;
-          frame_count += features.NumRows();
-          KALDI_LOG << "Log-like per frame for utterance " << utt << " is "
-                    << (like / features.NumRows()) << " over "
-                    << features.NumRows() << " frames.";
-          num_success++;
-        } else { num_err++; }
-      }
-    }
-    double elapsed = timer.Elapsed();
-    KALDI_LOG << "Time taken [excluding initialization] "<< elapsed
-              << "s: real-time factor assuming 100 frames/sec is "
-              << (elapsed*100.0/frame_count);
-    KALDI_LOG << "Done " << num_success << " utterances, failed for "
-              << num_err;
-    KALDI_LOG << "Overall log-likelihood per frame = " << (tot_like/frame_count)
-              << " over " << frame_count << " frames.";
-
-    delete word_syms;
-    return (num_success != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/sgmm2bin/sgmm2-post-to-gpost.cc b/src/sgmm2bin/sgmm2-post-to-gpost.cc
deleted file mode 100644
index 2dfbe436fb3..00000000000
--- a/src/sgmm2bin/sgmm2-post-to-gpost.cc
+++ /dev/null
@@ -1,186 +0,0 @@
-// sgmm2bin/sgmm2-post-to-gpost.cc
-
-// Copyright 2009-2012   Saarland University  Microsoft Corporation
-//                       Johns Hopkins University (Author: Daniel Povey)
-//                2014   Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm2/am-sgmm2.h"
-#include "hmm/transition-model.h"
-#include "sgmm2/estimate-am-sgmm2.h"
-#include "hmm/posterior.h"
-
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  try {
-    const char *usage =
-        "Convert posteriors to Gaussian-level posteriors for SGMM training.\n"
-        "Usage: sgmm2-post-to-gpost [options] <model-in> <feature-rspecifier> "
-        "<posteriors-rspecifier> <gpost-wspecifier>\n"
-        "e.g.: sgmm2-post-to-gpost 1.mdl 1.ali scp:train.scp 'ark:ali-to-post ark:1.ali ark:-|' ark:-";
-
-    ParseOptions po(usage);
-    std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier;
-
-    po.Register("gselect", &gselect_rspecifier, "Precomputed Gaussian indices (rspecifier)");
-    po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)");
-    po.Register("utt2spk", &utt2spk_rspecifier,
-                "rspecifier for utterance to speaker map");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 4) {
-      po.PrintUsage();
-      exit(1);
-    }
-    if (gselect_rspecifier == "")
-      KALDI_ERR << "--gselect option is required";
-    
-    std::string model_filename = po.GetArg(1),
-        feature_rspecifier = po.GetArg(2),
-        posteriors_rspecifier = po.GetArg(3),
-        gpost_wspecifier = po.GetArg(4);
-
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-
-    AmSgmm2 am_sgmm;
-    TransitionModel trans_model;
-    {
-      bool binary;
-      Input ki(model_filename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_sgmm.Read(ki.Stream(), binary);
-    }
-
-    double tot_like = 0.0;
-    kaldi::int64 tot_t = 0;
-
-    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);
-    RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
-    RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
-                                                           utt2spk_rspecifier);
-
-    Sgmm2PerFrameDerivedVars per_frame_vars;
-    
-    Sgmm2GauPostWriter gpost_writer(gpost_wspecifier);
-    
-    int32 num_done = 0, num_err = 0;
-    for (; !feature_reader.Done(); feature_reader.Next()) {
-      const Matrix<BaseFloat> &mat = feature_reader.Value();
-      std::string utt = feature_reader.Key();
-      if (!posteriors_reader.HasKey(utt)
-          || posteriors_reader.Value(utt).size() != mat.NumRows()) {
-        KALDI_WARN << "No posteriors available for utterance " << utt
-                   << " (or wrong size)";
-        num_err++;
-        continue;
-      }
-      Posterior posterior = posteriors_reader.Value(utt);
-
-      if (!gselect_reader.HasKey(utt) ||
-          gselect_reader.Value(utt).size() != mat.NumRows()) {
-        KALDI_WARN << "No Gaussian-selection info available for utterance "
-                   << utt << " (or wrong size)";
-        num_err++;
-        continue;
-      }
-      const std::vector<std::vector<int32> > &gselect =
-          gselect_reader.Value(utt);
-
-      Sgmm2PerSpkDerivedVars spk_vars;
-      if (spkvecs_reader.IsOpen()) {
-        if (spkvecs_reader.HasKey(utt)) {
-          spk_vars.SetSpeakerVector(spkvecs_reader.Value(utt));
-          am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
-        } else {
-          KALDI_WARN << "Cannot find speaker vector for " << utt;
-          num_err++;
-          continue;
-        }
-      } // else spk_vars is "empty"
-
-      num_done++;
-      BaseFloat tot_like_this_file = 0.0, tot_weight = 0.0;
-
-      Sgmm2GauPost gpost(posterior.size());  // posterior.size() == T.
-
-      SortPosteriorByPdfs(trans_model, &posterior);
-      int32 prev_pdf_id = -1;
-      BaseFloat prev_like = 0;
-      Matrix<BaseFloat> prev_posterior;
-      for (size_t i = 0; i < posterior.size(); i++) {
-        am_sgmm.ComputePerFrameVars(mat.Row(i), gselect[i],
-                                    spk_vars, &per_frame_vars);
-
-        gpost[i].gselect = gselect[i];
-        gpost[i].tids.resize(posterior[i].size());
-        gpost[i].posteriors.resize(posterior[i].size());
-
-        prev_pdf_id = -1;       // Only cache for the same frame.
-        for (size_t j = 0; j < posterior[i].size(); j++) {
-          int32 tid = posterior[i][j].first,  // transition identifier.
-              pdf_id = trans_model.TransitionIdToPdf(tid);
-          BaseFloat weight = posterior[i][j].second;
-          gpost[i].tids[j] = tid;
-
-          if (pdf_id != prev_pdf_id) {
-            // First time see this pdf-id for this frame, update the cached
-            // variables.
-            prev_pdf_id = pdf_id;
-            prev_like = am_sgmm.ComponentPosteriors(per_frame_vars, pdf_id,
-                                                    &spk_vars,
-                                                    &prev_posterior);
-          }
-
-          gpost[i].posteriors[j] = prev_posterior;
-          tot_like_this_file += prev_like * weight;
-          tot_weight += weight;
-          gpost[i].posteriors[j].Scale(weight);
-        }
-      }
-
-      KALDI_VLOG(2) << "Average like for this file is "
-                    << (tot_like_this_file/posterior.size()) << " over "
-                    << posterior.size() <<" frames.";
-      tot_like += tot_like_this_file;
-      tot_t += posterior.size();
-      if (num_done % 10 == 0)
-        KALDI_LOG << "Avg like per frame so far is "
-                  << (tot_like/tot_t);
-      gpost_writer.Write(utt, gpost);
-    }
-    
-    KALDI_LOG << "Overall like per frame (Gaussian only) = "
-              << (tot_like/tot_t) << " over " << tot_t << " frames.";
-
-    KALDI_LOG << "Done " << num_done << " files, " << num_err
-              << " with errors.";
-
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/sgmm2bin/sgmm2-project.cc b/src/sgmm2bin/sgmm2-project.cc
deleted file mode 100644
index 7b3d5c412d5..00000000000
--- a/src/sgmm2bin/sgmm2-project.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-// sgmm2bin/sgmm2-project.cc
-
-// Copyright 2012    Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "util/kaldi-thread.h"
-#include "hmm/transition-model.h"
-#include "sgmm2/am-sgmm2-project.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-    const char *usage =
-        "Compute SGMM model projection that only models a part of a pre-LDA space.\n"
-        "Used in predictive SGMMs.  Takes as input an LDA+MLLT transform,\n"
-        "and outputs a transform from the pre-LDA+MLLT space to the space that\n"
-        "we want to model\n"
-        "Usage: sgmm2-project [options] <model-in> <lda-mllt-mat-in> <model-out> <new-projection-out>\n"
-        "e.g.: sgmm2-project --start-dim=0 --end-dim=52 final.mdl final.inv_full_mat final_proj1.mdl proj1.mat\n";
-    
-    std::string write_flags_str = "gsnu";
-
-    bool binary_write = false;
-    int32 start_dim = 0;
-    int32 end_dim = 0;
-
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("start-dim", &start_dim, "Starting dimension to keep in "
-                "pre-LDA-MLLT space.");
-    po.Register("end-dim", &end_dim, "Ending dimension to keep in "
-                "pre-LDA-MLLT space (equals last retained dimension plus one)");
-
-    po.Read(argc, argv);
-    if (po.NumArgs() != 4) {
-      po.PrintUsage();
-      exit(1);
-    }
-    std::string model_rxfilename = po.GetArg(1),
-        lda_mllt_rxfilename = po.GetArg(2),
-        model_wxfilename = po.GetArg(3),
-        proj_wxfilename = po.GetArg(4);
-
-    kaldi::SgmmWriteFlagsType write_flags =
-        StringToSgmmWriteFlags(write_flags_str);
-    
-    AmSgmm2 am_sgmm;
-    TransitionModel trans_model;
-    {
-      bool binary;
-      Input ki(model_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_sgmm.Read(ki.Stream(), binary);
-    }
-
-
-    Matrix<BaseFloat> lda_mllt_mat;
-    ReadKaldiObject(lda_mllt_rxfilename, &lda_mllt_mat);
-
-    // Need the full LDA+MLLT matrix, including the extra rows.
-    // See featbin/extend-transform.cc
-    KALDI_ASSERT(lda_mllt_mat.NumRows() == lda_mllt_mat.NumCols());
-
-    Matrix<BaseFloat> inv_lda_mllt_mat(lda_mllt_mat);
-    inv_lda_mllt_mat.Invert();
-
-    Matrix<BaseFloat> projection;
-    Sgmm2Project sgmm_project;
-    sgmm_project.ComputeProjection(am_sgmm, inv_lda_mllt_mat, start_dim, end_dim,
-                                   &projection);
-
-    Matrix<BaseFloat> total_projection(projection.NumRows(), projection.NumCols());
-    total_projection.AddMatMat(1.0, projection, kNoTrans,
-                               inv_lda_mllt_mat, kNoTrans, 0.0);
-    
-    sgmm_project.ApplyProjection(total_projection, &am_sgmm);
-    
-    am_sgmm.ComputeDerivedVars(); // recompute normalizers, and possibly
-    // weights.
-    
-    {
-      Output ko(model_wxfilename, binary_write);
-      trans_model.Write(ko.Stream(), binary_write);
-      am_sgmm.Write(ko.Stream(), binary_write, write_flags);
-    }
-    KALDI_LOG << "Wrote model to " << model_wxfilename;
-
-    WriteKaldiObject(projection, proj_wxfilename, binary_write);
-    KALDI_LOG << "Wrote projection matrix to " << proj_wxfilename;
-    
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/sgmm2bin/sgmm2-rescore-lattice.cc b/src/sgmm2bin/sgmm2-rescore-lattice.cc
deleted file mode 100644
index 95daab11c99..00000000000
--- a/src/sgmm2bin/sgmm2-rescore-lattice.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-// sgmm2bin/sgmm2-rescore-lattice.cc
-
-// Copyright 2009-2012   Saarland University (Author: Arnab Ghoshal)
-//                       Johns Hopkins University (Author: Daniel Povey)
-//                       Cisco Systems (Author: Neha Agrawal)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "util/stl-utils.h"
-#include "sgmm2/am-sgmm2.h"
-#include "hmm/transition-model.h"
-#include "fstext/fstext-lib.h"
-#include "lat/kaldi-lattice.h"
-#include "lat/lattice-functions.h"
-#include "sgmm2/decodable-am-sgmm2.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-    using fst::SymbolTable;
-    using fst::VectorFst;
-    using fst::StdArc;
-
-    const char *usage =
-      "Replace the acoustic scores on a lattice using a new model.\n"
-      "Usage: sgmm2-rescore-lattice [options] <model-in> <lattice-rspecifier> "
-      "<feature-rspecifier> <lattice-wspecifier>\n"
-      " e.g.: sgmm2-rescore-lattice 1.mdl ark:1.lats scp:trn.scp ark:2.lats\n";
-
-    kaldi::BaseFloat old_acoustic_scale = 0.0;
-    bool speedup = false;
-    BaseFloat log_prune = 5.0;
-    std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier;
-
-    kaldi::ParseOptions po(usage);
-    po.Register("old-acoustic-scale", &old_acoustic_scale,
-                "Add the current acoustic scores with some scale.");
-    po.Register("log-prune", &log_prune,
-                "Pruning beam used to reduce number of exp() evaluations.");
-    po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)");
-    po.Register("utt2spk", &utt2spk_rspecifier,
-                "rspecifier for utterance to speaker map");
-    po.Register("gselect", &gselect_rspecifier,
-                "Precomputed Gaussian indices (rspecifier)");
-    po.Register("speedup", &speedup,
-                "If true, enable a faster version of the computation that "
-                "saves times when there is only one pdf-id on a single frame "
-                "by only sometimes (randomly) computing the probabilities, and "
-                "then scaling them up to preserve corpus-level diagnostics.");
-
-    
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 4) {
-      po.PrintUsage();
-      exit(1);
-    }
-    if (gselect_rspecifier == "")
-      KALDI_ERR << "--gselect-rspecifier option is required.";
-
-    std::string model_filename = po.GetArg(1),
-        lats_rspecifier = po.GetArg(2),
-        feature_rspecifier = po.GetArg(3),
-        lats_wspecifier = po.GetArg(4);
-
-    AmSgmm2 am_sgmm;
-    TransitionModel trans_model;
-    {
-      bool binary;
-      Input ki(model_filename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_sgmm.Read(ki.Stream(), binary);
-    }
-
-    RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
-    RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
-                                                           utt2spk_rspecifier);
-    RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    // Read as compact lattice
-    SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier);
-    // Write as compact lattice.
-    CompactLatticeWriter compact_lattice_writer(lats_wspecifier);
-
-    int32 num_done = 0, num_err = 0;
-    for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) {
-      std::string utt = compact_lattice_reader.Key();
-      if (!feature_reader.HasKey(utt)) {
-        KALDI_WARN << "No feature found for utterance " << utt;
-        num_err++;
-        continue;
-      }
-
-      CompactLattice clat = compact_lattice_reader.Value();
-      compact_lattice_reader.FreeCurrent();
-      if (old_acoustic_scale != 1.0)
-        fst::ScaleLattice(fst::AcousticLatticeScale(old_acoustic_scale), &clat);
-
-      const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
-
-      // Get speaker vectors
-      Sgmm2PerSpkDerivedVars spk_vars;
-      if (spkvecs_reader.IsOpen()) {
-        if (spkvecs_reader.HasKey(utt)) {
-          spk_vars.SetSpeakerVector(spkvecs_reader.Value(utt));
-          am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
-        } else {
-          KALDI_WARN << "Cannot find speaker vector for " << utt;
-          num_err++;
-          continue;
-        }
-      }  // else spk_vars is "empty"
-
-      if (!gselect_reader.HasKey(utt) ||
-          gselect_reader.Value(utt).size() != feats.NumRows()) {
-        KALDI_WARN << "No Gaussian-selection info available for utterance "
-                   << utt << " (or wrong size)";
-        num_err++;
-        continue;
-      }
-      const std::vector<std::vector<int32> > &gselect =
-          gselect_reader.Value(utt);
-
-      DecodableAmSgmm2 sgmm2_decodable(am_sgmm, trans_model, feats,
-                                       gselect, log_prune, &spk_vars);
-
-      if (!speedup) {
-        if (kaldi::RescoreCompactLattice(&sgmm2_decodable, &clat)) {
-          compact_lattice_writer.Write(utt, clat);
-          num_done++;
-        } else num_err++;
-      } else {
-        BaseFloat speedup_factor = 100.0; 
-        if (kaldi::RescoreCompactLatticeSpeedup(trans_model, speedup_factor,
-                                                &sgmm2_decodable,
-                                                &clat)) {
-          compact_lattice_writer.Write(utt, clat);
-          num_done++;
-        } else num_err++;
-      }        
-    }
-
-    KALDI_LOG << "Done " << num_done << " lattices, errors on "
-              << num_err;
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/sgmm2bin/sgmm2-sum-accs.cc b/src/sgmm2bin/sgmm2-sum-accs.cc
deleted file mode 100644
index 8259702ba49..00000000000
--- a/src/sgmm2bin/sgmm2-sum-accs.cc
+++ /dev/null
@@ -1,94 +0,0 @@
-// sgmm2bin/sgmm2-sum-accs.cc
-
-// Copyright 2009-2012   Saarland University;  Microsoft Corporation
-//                       Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "util/common-utils.h"
-#include "sgmm2/estimate-am-sgmm2.h"
-#include "hmm/transition-model.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Sum multiple accumulated stats files for SGMM training.\n"
-        "Usage: sgmm2-sum-accs [options] stats-out stats-in1 stats-in2 ...\n";
-
-    bool binary = true;
-    bool parallel = false;
-    kaldi::ParseOptions po(usage);
-    po.Register("binary", &binary, "Write output in binary mode");
-    po.Register("parallel", &parallel, "If true, the program makes sure to open all "
-                "filehandles before reading for any (useful when summing accs from "
-                "long processes)");
-    po.Read(argc, argv);
-
-    if (po.NumArgs() < 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string stats_out_filename = po.GetArg(1);
-    kaldi::Vector<double> transition_accs;
-    kaldi::MleAmSgmm2Accs sgmm_accs;
-
-    if (parallel) {
-      std::vector<kaldi::Input*> inputs(po.NumArgs() - 1);
-      for (int i = 0; i < po.NumArgs() - 1; i++) {
-        std::string stats_in_filename = po.GetArg(i + 2);
-        inputs[i] = new kaldi::Input(stats_in_filename); // Don't try
-        // to work out binary status yet; this would cause us to wait
-        // for the output of that process.  We delay it till later.
-      }
-      for (size_t i = 0; i < po.NumArgs() - 1; i++) {
-        bool b;
-        if (kaldi::InitKaldiInputStream(inputs[i]->Stream(), &b)) {
-          transition_accs.Read(inputs[i]->Stream(), b, true /* add values */);
-          sgmm_accs.Read(inputs[i]->Stream(), b, true /* add values */);
-          delete inputs[i];
-        } else {
-          KALDI_ERR << "Failed to read input stats file " << po.GetArg(i + 2);
-        }
-      }      
-    } else {
-      for (int i = 2, max = po.NumArgs(); i <= max; i++) {
-        std::string stats_in_filename = po.GetArg(i);
-        bool binary_read;
-        kaldi::Input ki(stats_in_filename, &binary_read);
-        transition_accs.Read(ki.Stream(), binary_read, true /* add values */);
-        sgmm_accs.Read(ki.Stream(), binary_read, true /* add values */);
-      }
-    }
-
-    // Write out the accs
-    {
-      kaldi::Output ko(stats_out_filename, binary);
-      transition_accs.Write(ko.Stream(), binary);
-      sgmm_accs.Write(ko.Stream(), binary);
-    }
-
-    KALDI_LOG << "Written stats to " << stats_out_filename;
-  } catch(const std::exception &e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
diff --git a/src/transform/Makefile b/src/transform/Makefile
index a265db6ac37..bd745599032 100644
--- a/src/transform/Makefile
+++ b/src/transform/Makefile
@@ -2,19 +2,17 @@ all:
 
 include ../kaldi.mk
 
-TESTFILES = regtree-fmllr-diag-gmm-test lda-estimate-test \
-      regression-tree-test fmllr-diag-gmm-test \
-      regtree-mllr-diag-gmm-test fmpe-test fmllr-raw-test
+TESTFILES = lda-estimate-test fmllr-diag-gmm-test
 
-OBJFILES = regression-tree.o regtree-mllr-diag-gmm.o lda-estimate.o \
-    regtree-fmllr-diag-gmm.o cmvn.o transform-common.o fmllr-diag-gmm.o \
-    lvtln.o mllt.o fmpe.o basis-fmllr-diag-gmm.o \
-    compressed-transform-stats.o fmllr-raw.o decodable-am-diag-gmm-regtree.o
+OBJFILES = lda-estimate.o \
+    cmvn.o transform-common.o fmllr-diag-gmm.o \
+    lvtln.o mllt.o basis-fmllr-diag-gmm.o \
+    compressed-transform-stats.o fmllr-raw.o
 
 
 LIBNAME = kaldi-transform
 
 ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \
-          ../matrix/kaldi-matrix.a ../base/kaldi-base.a 
+          ../matrix/kaldi-matrix.a ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
diff --git a/src/transform/decodable-am-diag-gmm-regtree.cc b/src/transform/decodable-am-diag-gmm-regtree.cc
deleted file mode 100644
index 536fb8ed1bc..00000000000
--- a/src/transform/decodable-am-diag-gmm-regtree.cc
+++ /dev/null
@@ -1,234 +0,0 @@
-// transform/decodable-am-diag-gmm-regtree.cc
-
-// Copyright 2009-2011  Saarland University;  Lukas Burget
-//                2013  Johns Hopkins Universith (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <vector>
-using std::vector;
-
-#include "transform/decodable-am-diag-gmm-regtree.h"
-
-namespace kaldi {
-
-
-BaseFloat DecodableAmDiagGmmRegtreeFmllr::LogLikelihoodZeroBased(int32 frame,
-                                                          int32 state) {
-  KALDI_ASSERT(frame < NumFramesReady() && frame >= 0);
-  KALDI_ASSERT(state < NumIndices() && state >= 0);
-
-  if (!valid_logdets_) {
-    logdets_.Resize(fmllr_xform_.NumRegClasses());
-    fmllr_xform_.GetLogDets(&logdets_);
-    valid_logdets_ = true;
-  }
-
-  if (log_like_cache_[state].hit_time == frame) {
-    return log_like_cache_[state].log_like;  // return cached value, if found
-  }
-
-  const DiagGmm &pdf = acoustic_model_.GetPdf(state);
-  const VectorBase<BaseFloat> &data = feature_matrix_.Row(frame);
-
-  // check if everything is in order
-  if (pdf.Dim() != data.Dim()) {
-    KALDI_ERR << "Dim mismatch: data dim = "  << data.Dim()
-        << " vs. model dim = " << pdf.Dim();
-  }
-  if (!pdf.valid_gconsts()) {
-    KALDI_ERR << "State "  << (state)  << ": Must call ComputeGconsts() "
-        "before computing likelihood.";
-  }
-
-  if (frame != previous_frame_) {  // cache the transformed & squared stats.
-    fmllr_xform_.TransformFeature(data, &xformed_data_);
-    xformed_data_squared_ = xformed_data_;
-    vector< Vector <BaseFloat> >::iterator it = xformed_data_squared_.begin(),
-        end = xformed_data_squared_.end();
-    for (; it != end; ++it) { it->ApplyPow(2.0); }
-    previous_frame_ = frame;
-  }
-
-  Vector<BaseFloat> loglikes(pdf.gconsts());  // need to recreate for each pdf
-  int32 baseclass, regclass;
-  for (int32 comp_id = 0, num_comp = pdf.NumGauss(); comp_id < num_comp;
-      ++comp_id) {
-    baseclass = regtree_.Gauss2BaseclassId(state, comp_id);
-    regclass = fmllr_xform_.Base2RegClass(baseclass);
-    // loglikes +=  means * inv(vars) * data.
-    loglikes(comp_id) += VecVec(pdf.means_invvars().Row(comp_id),
-                                xformed_data_[regclass]);
-    // loglikes += -0.5 * inv(vars) * data_sq.
-    loglikes(comp_id) -= 0.5 * VecVec(pdf.inv_vars().Row(comp_id),
-                                      xformed_data_squared_[regclass]);
-    loglikes(comp_id) += logdets_(regclass);
-  }
-
-  BaseFloat log_sum = loglikes.LogSumExp(log_sum_exp_prune_);
-  if (KALDI_ISNAN(log_sum) || KALDI_ISINF(log_sum))
-    KALDI_ERR << "Invalid answer (overflow or invalid variances/features?)";
-
-  log_like_cache_[state].log_like = log_sum;
-  log_like_cache_[state].hit_time = frame;
-
-  return log_sum;
-}
-
-DecodableAmDiagGmmRegtreeMllr::~DecodableAmDiagGmmRegtreeMllr() {
-  DeletePointers(&xformed_mean_invvars_);
-  DeletePointers(&xformed_gconsts_);
-}
-
-
-void DecodableAmDiagGmmRegtreeMllr::InitCache() {
-  if (xformed_mean_invvars_.size() != 0)
-    DeletePointers(&xformed_mean_invvars_);
-  if (xformed_gconsts_.size() != 0)
-    DeletePointers(&xformed_gconsts_);
-  int32 num_pdfs = acoustic_model_.NumPdfs();
-  xformed_mean_invvars_.resize(num_pdfs);
-  xformed_gconsts_.resize(num_pdfs);
-  is_cached_.resize(num_pdfs, false);
-  ResetLogLikeCache();
-}
-
-
-// This is almost the same code as DiagGmm::ComputeGconsts, except that
-// means are used instead of means * inv(vars). This saves some computation.
-static void ComputeGconsts(const VectorBase<BaseFloat> &weights,
-                           const MatrixBase<BaseFloat> &means,
-                           const MatrixBase<BaseFloat> &inv_vars,
-                           VectorBase<BaseFloat> *gconsts_out) {
-  int32 num_gauss = weights.Dim();
-  int32 dim = means.NumCols();
-  KALDI_ASSERT(means.NumRows() == num_gauss
-      && inv_vars.NumRows() == num_gauss && inv_vars.NumCols() == dim);
-  KALDI_ASSERT(gconsts_out->Dim() == num_gauss);
-
-  BaseFloat offset = -0.5 * M_LOG_2PI * dim;  // constant term in gconst.
-  int32 num_bad = 0;
-
-  for (int32 gauss = 0; gauss < num_gauss; gauss++) {
-    KALDI_ASSERT(weights(gauss) >= 0);  // Cannot have negative weights.
-    BaseFloat gc = Log(weights(gauss)) + offset;  // May be -inf if weights == 0
-    for (int32 d = 0; d < dim; d++) {
-      gc += 0.5 * Log(inv_vars(gauss, d)) - 0.5 * means(gauss, d)
-        * means(gauss, d) * inv_vars(gauss, d);  // diff from DiagGmm version.
-    }
-
-    if (KALDI_ISNAN(gc)) {  // negative infinity is OK but NaN is not acceptable
-      KALDI_ERR << "At component "  << gauss
-                << ", not a number in gconst computation";
-    }
-    if (KALDI_ISINF(gc)) {
-      num_bad++;
-      // If positive infinity, make it negative infinity.
-      // Want to make sure the answer becomes -inf in the end, not NaN.
-      if (gc > 0) gc = -gc;
-    }
-    (*gconsts_out)(gauss) = gc;
-  }
-  if (num_bad > 0)
-    KALDI_WARN << num_bad << " unusable components found while computing "
-               << "gconsts.";
-}
-
-
-const Matrix<BaseFloat>& DecodableAmDiagGmmRegtreeMllr::GetXformedMeanInvVars(
-    int32 state) {
-  if (is_cached_[state]) {  // found in cache
-    KALDI_ASSERT(xformed_mean_invvars_[state] != NULL);
-    KALDI_VLOG(3) << "For PDF index " << state << ": transformed means "
-                  << "found in cache.";
-    return *xformed_mean_invvars_[state];
-  } else {  // transform the means and cache them
-    KALDI_ASSERT(xformed_mean_invvars_[state] == NULL);
-    KALDI_VLOG(3) << "For PDF index " << state << ": transforming means.";
-    int32 num_gauss = acoustic_model_.GetPdf(state).NumGauss(),
-        dim = acoustic_model_.Dim();
-    const Vector<BaseFloat> &weights = acoustic_model_.GetPdf(state).weights();
-    const Matrix<BaseFloat> &invvars = acoustic_model_.GetPdf(state).inv_vars();
-    xformed_mean_invvars_[state] = new Matrix<BaseFloat>(num_gauss, dim);
-    mllr_xform_.GetTransformedMeans(regtree_, acoustic_model_, state,
-                                    xformed_mean_invvars_[state]);
-    xformed_gconsts_[state] = new Vector<BaseFloat>(num_gauss);
-    // At this point, the transformed means haven't been multiplied with
-    // the inv vars, and they are used to compute gconsts first.
-    ComputeGconsts(weights, *xformed_mean_invvars_[state], invvars,
-                   xformed_gconsts_[state]);
-    // Finally, multiply the transformed means with the inv vars.
-    xformed_mean_invvars_[state]->MulElements(invvars);
-    is_cached_[state] = true;
-    return *xformed_mean_invvars_[state];
-  }
-}
-
-const Vector<BaseFloat>& DecodableAmDiagGmmRegtreeMllr::GetXformedGconsts(
-    int32 state) {
-  if (!is_cached_[state]) {
-    KALDI_ERR << "GConsts not cached for state: " << state << ". Must call "
-              << "GetXformedMeanInvVars() first.";
-  }
-  KALDI_ASSERT(xformed_gconsts_[state] != NULL);
-  return *xformed_gconsts_[state];
-}
-
-BaseFloat DecodableAmDiagGmmRegtreeMllr::LogLikelihoodZeroBased(int32 frame,
-                                                                int32 state) {
-//  KALDI_ERR << "Function not completely implemented yet.";
-  KALDI_ASSERT(frame < NumFramesReady() && frame >= 0);
-  KALDI_ASSERT(state < NumIndices() && state >= 0);
-
-  if (log_like_cache_[state].hit_time == frame) {
-    return log_like_cache_[state].log_like;  // return cached value, if found
-  }
-
-  const DiagGmm &pdf = acoustic_model_.GetPdf(state);
-  const VectorBase<BaseFloat> &data = feature_matrix_.Row(frame);
-
-  // check if everything is in order
-  if (pdf.Dim() != data.Dim()) {
-    KALDI_ERR << "Dim mismatch: data dim = "  << data.Dim()
-        << " vs. model dim = " << pdf.Dim();
-  }
-
-  if (frame != previous_frame_) {  // cache the squared stats.
-    data_squared_.CopyFromVec(feature_matrix_.Row(frame));
-    data_squared_.ApplyPow(2.0);
-    previous_frame_ = frame;
-  }
-
-  const Matrix<BaseFloat> &means_invvars = GetXformedMeanInvVars(state);
-  const Vector<BaseFloat> &gconsts = GetXformedGconsts(state);
-
-  Vector<BaseFloat> loglikes(gconsts);  // need to recreate for each pdf
-  // loglikes +=  means * inv(vars) * data.
-  loglikes.AddMatVec(1.0, means_invvars, kNoTrans, data, 1.0);
-  // loglikes += -0.5 * inv(vars) * data_sq.
-  loglikes.AddMatVec(-0.5, pdf.inv_vars(), kNoTrans, data_squared_, 1.0);
-
-  BaseFloat log_sum = loglikes.LogSumExp(log_sum_exp_prune_);
-  if (KALDI_ISNAN(log_sum) || KALDI_ISINF(log_sum))
-    KALDI_ERR << "Invalid answer (overflow or invalid variances/features?)";
-
-  log_like_cache_[state].log_like = log_sum;
-  log_like_cache_[state].hit_time = frame;
-
-  return log_sum;
-}
-
-}  // namespace kaldi
diff --git a/src/transform/decodable-am-diag-gmm-regtree.h b/src/transform/decodable-am-diag-gmm-regtree.h
deleted file mode 100644
index b6e7888ffdc..00000000000
--- a/src/transform/decodable-am-diag-gmm-regtree.h
+++ /dev/null
@@ -1,141 +0,0 @@
-// transform/decodable-am-diag-gmm-regtree.h
-
-// Copyright 2009-2011  Saarland University;  Microsoft Corporation;
-//                      Lukas Burget
-//                2013  Johns Hopkins Universith (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_TRANSFORM_DECODABLE_AM_DIAG_GMM_REGTREE_H_
-#define KALDI_TRANSFORM_DECODABLE_AM_DIAG_GMM_REGTREE_H_
-
-#include <vector>
-
-#include "base/kaldi-common.h"
-#include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
-#include "itf/decodable-itf.h"
-#include "transform/regression-tree.h"
-#include "gmm/decodable-am-diag-gmm.h"
-#include "transform/regtree-fmllr-diag-gmm.h"
-#include "transform/regtree-mllr-diag-gmm.h"
-
-namespace kaldi {
-
-class DecodableAmDiagGmmRegtreeFmllr: public DecodableAmDiagGmmUnmapped {
- public:
-  DecodableAmDiagGmmRegtreeFmllr(const AmDiagGmm &am,
-                                 const TransitionModel &tm,
-                                 const Matrix<BaseFloat> &feats,
-                                 const RegtreeFmllrDiagGmm &fmllr_xform,
-                                 const RegressionTree &regtree,
-                                 BaseFloat scale,
-                                 BaseFloat log_sum_exp_prune = -1.0)
-    : DecodableAmDiagGmmUnmapped(am, feats, log_sum_exp_prune), trans_model_(tm),
-      scale_(scale), fmllr_xform_(fmllr_xform), regtree_(regtree),
-      valid_logdets_(false) {}
-
-  // Note, frames are numbered from zero but transition-ids (tid) from one.
-  virtual BaseFloat LogLikelihood(int32 frame, int32 tid) {
-    return scale_*LogLikelihoodZeroBased(frame,
-                                         trans_model_.TransitionIdToPdfFast(tid));
-  }
-
-  virtual int32 NumFramesReady() const { return feature_matrix_.NumRows(); }
-
-  // Indices are one-based!  This is for compatibility with OpenFst.
-  virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); }
-
- protected:
-  virtual BaseFloat LogLikelihoodZeroBased(int32 frame, int32 state_index);
-
-  const TransitionModel *TransModel() { return &trans_model_; }
-
- private:
-  const TransitionModel &trans_model_;  // for transition-id to pdf mapping
-  BaseFloat scale_;
-  const RegtreeFmllrDiagGmm &fmllr_xform_;
-  const RegressionTree &regtree_;
-  std::vector< Vector<BaseFloat> > xformed_data_;
-  std::vector< Vector<BaseFloat> > xformed_data_squared_;
-  Vector<BaseFloat> logdets_;
-  bool valid_logdets_;
-
-  KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmDiagGmmRegtreeFmllr);
-};
-
-class DecodableAmDiagGmmRegtreeMllr: public DecodableAmDiagGmmUnmapped {
- public:
-  DecodableAmDiagGmmRegtreeMllr(const AmDiagGmm &am,
-                                const TransitionModel &tm,
-                                const Matrix<BaseFloat> &feats,
-                                const RegtreeMllrDiagGmm &mllr_xform,
-                                const RegressionTree &regtree,
-                                BaseFloat scale,
-                                BaseFloat log_sum_exp_prune = -1.0):
-      DecodableAmDiagGmmUnmapped(am, feats, log_sum_exp_prune),
-      trans_model_(tm), scale_(scale), mllr_xform_(mllr_xform),
-      regtree_(regtree), data_squared_(feats.NumCols()) { InitCache(); }
-  ~DecodableAmDiagGmmRegtreeMllr();
-
-  // Note, frames are numbered from zero but transition-ids (tid) from one.
-  virtual BaseFloat LogLikelihood(int32 frame, int32 tid) {
-    return scale_*LogLikelihoodZeroBased(frame,
-                                         trans_model_.TransitionIdToPdfFast(tid));
-  }
-
-  virtual int32 NumFramesReady() const { return feature_matrix_.NumRows(); }
-
-  // Indices are one-based!  This is for compatibility with OpenFst.
-  virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); }
-
-  const TransitionModel *TransModel() { return &trans_model_; }
-
- protected:
-  virtual BaseFloat LogLikelihoodZeroBased(int32 frame, int32 state_index);
-
- private:
-  /// Initializes the mean & gconst caches
-  void InitCache();
-  /// Get the transformed means times inverse variances for a given pdf, and
-  /// cache them. The 'state_index' is 0-based.
-  const Matrix<BaseFloat>& GetXformedMeanInvVars(int32 state_index);
-  /// Get the cached (while computing transformed means) gconsts for
-  /// likelihood calculation. The 'state_index' is 0-based.
-  const Vector<BaseFloat>& GetXformedGconsts(int32 state_index);
-
-  const TransitionModel &trans_model_;  // for transition-id to pdf mapping
-  BaseFloat scale_;
-  const RegtreeMllrDiagGmm &mllr_xform_;
-  const RegressionTree &regtree_;
-  // we want it public to have access to the pdf ids
-
-  /// Cache of transformed means time inverse variances for each state.
-  std::vector< Matrix<BaseFloat>* > xformed_mean_invvars_;
-  /// Cache of transformed gconsts for each state.
-  std::vector< Vector<BaseFloat>* > xformed_gconsts_;
-  /// Boolean variable per state to indicate whether the transformed means for
-  /// that state are cached.
-  std::vector<bool> is_cached_;
-
-  Vector<BaseFloat> data_squared_;  ///< Cached for fast likelihood calculation
-
-  KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmDiagGmmRegtreeMllr);
-};
-
-}  // namespace kaldi
-
-#endif  // KALDI_TRANSFORM_DECODABLE_AM_DIAG_GMM_REGTREE_H_
diff --git a/src/transform/fmllr-raw-test.cc b/src/transform/fmllr-raw-test.cc
deleted file mode 100644
index 10fa3bae188..00000000000
--- a/src/transform/fmllr-raw-test.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-// transform/fmllr-raw-test.cc
-
-// Copyright  2009-2011 Microsoft Corporation
-//            2013  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "util/common-utils.h"
-#include "gmm/diag-gmm.h"
-#include "transform/fmllr-diag-gmm.h"
-#include "transform/fmllr-raw.h"
-
-namespace kaldi {
-
-
-void InitRandomGmm (DiagGmm *gmm_in) {
-  int32 num_gauss = 5 + rand () % 4;
-  int32 dim = 6 + Rand() % 5;
-  DiagGmm &gmm(*gmm_in);
-  gmm.Resize(num_gauss, dim);
-  Matrix<BaseFloat> inv_vars(num_gauss, dim),
-      means(num_gauss, dim);
-  Vector<BaseFloat> weights(num_gauss);
-  for (int32 i = 0; i < num_gauss; i++) {
-    for (int32 j = 0; j < dim; j++) {
-      inv_vars(i, j) = Exp(RandGauss() * (1.0 / (1 + j)));
-      means(i, j) = RandGauss() * (1.0 / (1 + j));
-    }
-    weights(i) = Exp(RandGauss());
-  }
-  weights.Scale(1.0 / weights.Sum());
-  gmm.SetWeights(weights);
-  gmm.SetInvVarsAndMeans(inv_vars, means);
-  gmm.ComputeGconsts();
-}
-
-void UnitTestFmllrRaw(bool use_offset) {
-  using namespace kaldi;
-  DiagGmm gmm;
-  InitRandomGmm(&gmm);
-  int32 model_dim =  gmm.Dim();
-
-  int32 raw_dim = 5 + Rand() % 3;
-  int32 num_splice = 1 + Rand() % 5;
-  while (num_splice * raw_dim < model_dim) {
-    num_splice++;
-  }
-
-  int32 full_dim = num_splice * raw_dim;
-  int32 npoints = raw_dim*(raw_dim+1)*10;
-
-  Matrix<BaseFloat> rand_points(npoints, full_dim);
-  rand_points.SetRandn();
-
-  Matrix<BaseFloat> lda_mllt(full_dim, full_dim + (use_offset ? 1 : 0)); // This is the full LDA+MLLT
-  // matrix.  TODO: test with offset.
-  lda_mllt.SetRandn();
-
-  FmllrRawAccs accs(raw_dim, model_dim, lda_mllt);
-
-  BaseFloat prev_objf_impr;
-  for (int32 iter = 0; iter < 4; iter++) {
-
-    for (int32 i = 0; i < npoints; i++) {
-      SubVector<BaseFloat> sample(rand_points, i);
-      accs.AccumulateForGmm(gmm, sample, 1.0);
-    }
-
-    Matrix<BaseFloat> fmllr_mat(raw_dim, raw_dim + 1);
-    fmllr_mat.SetUnit(); // sets diagonal elements to one.
-
-    FmllrRawOptions opts;
-    BaseFloat objf_impr, count;
-    accs.Update(opts, &fmllr_mat, &objf_impr, &count);
-
-    KALDI_ASSERT(objf_impr > 0.0);
-
-    if (iter != 0) {
-      // This is not something provable, but is always true
-      // in practice.
-      KALDI_ASSERT(objf_impr < prev_objf_impr);
-    }
-    prev_objf_impr = objf_impr;
-
-
-    // Now transform the raw features.
-    for (int32 splice = 0; splice < num_splice; splice++) {
-      SubMatrix<BaseFloat> raw_feats(rand_points,
-                                     0, npoints,
-                                     splice * raw_dim, raw_dim);
-      for (int32 t = 0; t < npoints; t++) {
-        SubVector<BaseFloat> this_feat(raw_feats, t);
-        ApplyAffineTransform(fmllr_mat, &this_feat);
-      }
-    }
-    accs.SetZero();
-  }
-}
-
-
-}  // namespace kaldi ends here
-
-int main() {
-  kaldi::g_kaldi_verbose_level = 5;
-
-  for (int i = 0; i < 2; i++) {  // did more iterations when first testing...
-    kaldi::UnitTestFmllrRaw(i % 2 == 0);
-  }
-  std::cout << "Test OK.\n";
-}
diff --git a/src/transform/fmllr-raw.cc b/src/transform/fmllr-raw.cc
deleted file mode 100644
index 6f52f9c630d..00000000000
--- a/src/transform/fmllr-raw.cc
+++ /dev/null
@@ -1,546 +0,0 @@
-// transform/fmllr-raw.cc
-
-// Copyright 2013  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <utility>
-#include <vector>
-using std::vector;
-
-#include "transform/fmllr-raw.h"
-#include "transform/fmllr-diag-gmm.h"
-
-namespace kaldi {
-
-FmllrRawAccs::FmllrRawAccs(int32 raw_dim,
-                           int32 model_dim,
-                           const Matrix<BaseFloat> &full_transform):
-    raw_dim_(raw_dim),
-    model_dim_(model_dim) {
-  if (full_transform.NumCols() != full_transform.NumRows() &&
-      full_transform.NumCols() != full_transform.NumRows() + 1) {
-    KALDI_ERR << "Expecting full LDA+MLLT transform to be square or d by d+1 "
-              << "(make sure you are including rejected rows).";
-  }
-  if (raw_dim <= 0 || full_transform.NumRows() % raw_dim != 0)
-    KALDI_ERR << "Raw feature dimension is invalid " << raw_dim
-              << "(must be positive and divide feature dimension)";
-  int32 full_dim = full_transform.NumRows();
-  full_transform_ = full_transform.Range(0, full_dim, 0, full_dim);
-  transform_offset_.Resize(full_dim);
-  if (full_transform_.NumCols() == full_dim + 1)
-    transform_offset_.CopyColFromMat(full_transform_, full_dim);
-  
-  int32 full_dim2 = ((full_dim+1)*(full_dim+2))/2;
-  count_ = 0.0;
-
-  temp_.Resize(full_dim + 1);
-  Q_.Resize(model_dim + 1, full_dim + 1);
-  S_.Resize(model_dim + 1, full_dim2);
-
-  single_frame_stats_.s.Resize(full_dim + 1);
-  single_frame_stats_.transformed_data.Resize(full_dim);
-  single_frame_stats_.count = 0.0;
-  single_frame_stats_.a.Resize(model_dim);
-  single_frame_stats_.b.Resize(model_dim);
-}
-
-
-bool FmllrRawAccs::DataHasChanged(const VectorBase<BaseFloat> &data) const {
-  KALDI_ASSERT(data.Dim() == FullDim());
-  return !data.ApproxEqual(single_frame_stats_.s.Range(0, FullDim()), 0.0);
-}
-
-void FmllrRawAccs::CommitSingleFrameStats() {
-  // Commit the stats for this from (in SingleFrameStats).
-  int32 model_dim = ModelDim(), full_dim = FullDim();
-  SingleFrameStats &stats = single_frame_stats_;
-  if (stats.count == 0.0) return;
-
-  count_ += stats.count;
-
-  // a_ext and b_ext are a and b extended with the count,
-  // which we'll later use to reconstruct the full stats for
-  // the rejected dimensions.
-  Vector<double> a_ext(model_dim + 1), b_ext(model_dim + 1);
-  a_ext.Range(0, model_dim).CopyFromVec(stats.a);
-  b_ext.Range(0, model_dim).CopyFromVec(stats.b);
-  a_ext(model_dim) = stats.count;
-  b_ext(model_dim) = stats.count;
-  Q_.AddVecVec(1.0, a_ext, Vector<double>(stats.s));
-
-  temp_.SetZero();
-  temp_.AddVec2(1.0, stats.s);
-  int32 full_dim2 = ((full_dim + 1) * (full_dim + 2)) / 2;
-  SubVector<double> temp_vec(temp_.Data(), full_dim2);
-  S_.AddVecVec(1.0, b_ext, temp_vec);
-}
-
-void FmllrRawAccs::InitSingleFrameStats(const VectorBase<BaseFloat> &data) {
-  SingleFrameStats &stats = single_frame_stats_;
-  int32 full_dim = FullDim();
-  KALDI_ASSERT(data.Dim() == full_dim);
-  stats.s.Range(0, full_dim).CopyFromVec(data);
-  stats.s(full_dim) = 1.0;
-  stats.transformed_data.AddMatVec(1.0, full_transform_, kNoTrans, data, 0.0);
-  stats.transformed_data.AddVec(1.0, transform_offset_);
-  stats.count = 0.0;
-  stats.a.SetZero();
-  stats.b.SetZero();
-}
-
-
-BaseFloat FmllrRawAccs::AccumulateForGmm(const DiagGmm &gmm,
-                                         const VectorBase<BaseFloat> &data,
-                                         BaseFloat weight) {
-  int32 model_dim = ModelDim(), full_dim = FullDim();
-  KALDI_ASSERT(data.Dim() == full_dim &&
-               "Expect raw, spliced data, which should have same dimension as "
-               "full transform.");
-  if (DataHasChanged(data)) {
-    // this is part of our mechanism to accumulate certain sub-parts of
-    // the computation for each frame, to avoid excessive compute.
-    CommitSingleFrameStats();
-    InitSingleFrameStats(data);
-  }
-  SingleFrameStats &stats = single_frame_stats_;
-
-  SubVector<BaseFloat> projected_data(stats.transformed_data, 0, model_dim);
-
-  int32 num_gauss = gmm.NumGauss();
-  Vector<BaseFloat> posterior(num_gauss);
-  BaseFloat log_like = gmm.ComponentPosteriors(projected_data, &posterior);
-  posterior.Scale(weight);
-  // Note: AccumulateFromPosteriors takes the original, spliced data,
-  // and returns the log-like of the rejected dimensions.
-  AccumulateFromPosteriors(gmm, data, posterior);
-
-  // Add the likelihood of the rejected dimensions to the objective function
-  // (assume zero-mean, unit-variance Gaussian; the LDA should have any offset
-  // required to ensure this).
-  if (full_dim > model_dim) {
-    SubVector<BaseFloat> rejected_data(stats.transformed_data,
-                                       model_dim, full_dim - model_dim);
-    log_like += -0.5 * (VecVec(rejected_data, rejected_data)
-                        + (full_dim - model_dim) * M_LOG_2PI);
-  }
-  return log_like;
-}
-
-/*
-  // Extended comment here.
-  //
-  // Let x_t(i) be the fully processed feature, dimension i (with fMLLR transform
-  //  and LDA transform), but *without* any offset term from the LDA, which
-  //  it's more convenient to view as an offset in the model.
-  //
-  //
-  // For a given dimension i (either accepted or rejected), the auxf can
-  // be expressed as a quadratic function of x_t(i).  We ultimately will want to
-  // express x_t(i) as a linear function of the parameters of the linearized
-  // fMLLR transform matrix.  Some notation:
-  //    Let l be the linearized transform matrix, i.e. the concatenation of the
-  //       m rows, each of length m+1, of the fMLLR transform.
-  //    Let n be the number of frames we splice together each time.
-  //    Let s_t be the spliced-together features on time t, with a one appended;
-  //       it will have n blocks each of size m, followed by a 1.  (dim is n*m + 1).
-  //     
-  // x(i) [note, this is the feature without any LDA offset], is bilinear in the
-  //      transform matrix and the features, so:
-  //
-  // x(i) = l^T M_i s_t, where s_t is the spliced features on time t,
-  //          with a 1 appended
-  //   [we need to compute M_i but we know the function is bilinear so it exists].
-  //
-  // The auxf can be written as:
-  // F = sum_i sum_t  a_{ti} x(i) - 0.5  b_{ti} x(i)^2 
-  //   = sum_i sum_t  a_{ti} x(i) - 0.5  b_{ti} x(i)^2
-  //   = sum_i sum_t  a_{ti} (l^T M_i s_t)  -  0.5 b_{ti} (l^T M_i s_t )^2
-  //   = sum_i l^T M_i q_i  +  l^T M_i S_i M_i^T l 
-  //  where
-  //     q_i = sum_t a_{ti} s_t, and
-  //     S_i = sum_t b_{ti} s_t s_t^T
-  //   [Note that we only need store S_i for the model-dim plus one, because
-  //    all the rejected dimensions have the same value]
-  //
-  //     We define a matrix Q whose rows are q_d, with
-  //       Q = \sum_t d_t s_t^T
-  //    [The Q we actually store as stats will use a modified form of d that
-  //     has a 1 for all dimensions past the model dim, to avoid redundancy;
-  //     we'll reconstruct the true Q from this later on.]
-  //     
-  //
-  // What is M_i?  Working it out is a little tedious.
-  //  Note: each M_i (from i = 0 ... full_dim) is of
-  //    dimension (raw_dim*(raw_dim+1)) by full_dim + 1
-  // 
-  // We want to express x(i) [we forget the subscript "t" sometimes],
-  // as a bilinear function of l and s_t.
-  //    We have x(i) = l^T M_i s.
-  //
-  // The (j,k)'th component of M_i is the term in x(i) that corresponds to the j'th
-  // component of l and the k'th of s.
-
-  // Before defining M_i, let us define N_i, where l^t N_i s will equal the spliced and
-  // transformed pre-LDA features of dimension i.  the N's have the same dimensions as the
-  // M's.
-  //
-  // We'll first define the j,k'th component of N_i, as this is easier; we'll then define the M_i
-  // as combinations of N_i.
-  //
-  // For a given i, j and k, the value of n_{i,j,k} will be as follows:
-  //   We first decompose index j into j1, j2 (both functions of
-  //    the original index j), where
-  //    j1 corresponds to the row-index of the fMLLR transform, j2 to the col-index.
-  //   We next decompose i into i1, i2, where i1 corresponds to the splicing number
-  //   (0...n-1), and i2 corresponds to the cepstral index.
-  //
-  //   If (j1 != i2) then n_{ijk} == 0.
-  //
-  //   Elsif k corresponds to the last element [i.e. k == m * n], then this m_{ijk} corresponds
-  //   to the effect of the j'th component of l for zero input, so:
-  //     If j2 == m (i.e. this the offset term in the fMLLR matrix), then
-  //       n_{ijk} = 1.0,
-  //     Else
-  //       n_{ijk} = 0.0
-  //     Fi
-  //
-  //   Else:
-  //     Decompose k into k1, k2, where k1 = 0.. n-1 is the splicing index, and k2 = 0...m-1 is
-  //      the cepstral index.
-  //     If k1 != i1 then
-  //       n_{ijk} = 0.0
-  //     elsif k2 != j2 then
-  //       n_{ijk} = 0.0
-  //     else
-  //       n_{ijk} = 1.0
-  //     fi
-  //    Endif
-  //    Now,  M_i will be defined as sum_i T_{ij} N_j, where T_{ij} are the elements of the
-  //     LDA+MLLT transform (but excluding any linear offset, which gets accounted for by
-  //     c_i, above).
-  //
-  //  Now suppose we want to express the auxiliary function in a simpler form
-  //  as l^T v - 0.5 l^T W l, where v and W are the "simple" linear and quadratic stats,
-  //  we can do so with:
-  //     v = \sum_i M_i q_i   
-  //  and
-  //     W = \sum_i M_i S_i M_i^T
-  //
-  */
-
-void FmllrRawAccs::AccumulateFromPosteriors(
-    const DiagGmm &diag_gmm,
-    const VectorBase<BaseFloat> &data,
-    const VectorBase<BaseFloat> &posterior) {
-  // The user may call this function directly, even though we also
-  // call it from AccumulateForGmm(), so check again:
-  if (DataHasChanged(data)) { 
-    CommitSingleFrameStats();
-    InitSingleFrameStats(data);
-  }
-  
-  int32  model_dim = ModelDim();
-
-  SingleFrameStats &stats = single_frame_stats_;
-  
-  // The quantities a and b describe the diagonal auxiliary function
-  // for each of the retained dimensions in the transformed space--
-  // in the format F = \sum_d alpha(d) x(d)  -0.5 beta(d) x(d)^2,
-  // where x(d) is the d'th dimensional fully processed feature.
-  // For d, see the comment-- it's alpha processed to take into
-  // account any offset in the LDA.  Note that it's a reference.
-  //
-  Vector<double> a(model_dim), b(model_dim);
-  
-  int32 num_comp = diag_gmm.NumGauss();
-  
-  double count = 0.0; // data-count contribution from this frame.
-
-  // Note: we could do this using matrix-matrix operations instead of
-  // row by row.  In the end it won't really matter as this is not
-  // the slowest part of the computation.
-  for (size_t m = 0; m < num_comp; m++) {
-    BaseFloat this_post = posterior(m);
-    if (this_post != 0.0) {
-      count += this_post;
-      a.AddVec(this_post, diag_gmm.means_invvars().Row(m));
-      b.AddVec(this_post, diag_gmm.inv_vars().Row(m));
-    }
-  }
-  // Correct "a" for any offset term in the LDA transform-- we view it as
-  // the opposite offset in the model [note: we'll handle the rejected dimensions
-  // in update time.]  Here, multiplying the element of "b" (which is the
-  // weighted inv-vars) by transform_offset_, and subtracting the result from
-  // a, is like subtracting the transform-offset from the original means
-  // (because a contains the means times inv-vars_.
-  Vector<double> offset(transform_offset_.Range(0, model_dim));
-  a.AddVecVec(-1.0, b, offset, 1.0);
-  stats.a.AddVec(1.0, a);
-  stats.b.AddVec(1.0, b);
-  stats.count += count;
-}
-
-
-void FmllrRawAccs::Update(const FmllrRawOptions &opts,
-                          MatrixBase<BaseFloat> *raw_fmllr_mat,
-                          BaseFloat *objf_impr,
-                          BaseFloat *count) {
-  // First commit any pending stats from the last frame.
-  if (single_frame_stats_.count != 0.0)
-    CommitSingleFrameStats();
-  
-  if (this->count_ < opts.min_count) {
-    KALDI_WARN << "Not updating (raw) fMLLR since count " << this->count_
-               << " is less than min count " << opts.min_count;
-    *objf_impr = 0.0;
-    *count = this->count_;
-    return;
-  }
-  KALDI_ASSERT(raw_fmllr_mat->NumRows() == RawDim() &&
-               raw_fmllr_mat->NumCols() == RawDim() + 1 &&
-               !raw_fmllr_mat->IsZero());
-  Matrix<double> fmllr_mat(*raw_fmllr_mat); // temporary, double-precision version
-                                            // of matrix.
-
-
-  Matrix<double> linear_stats; // like K in diagonal update.
-  std::vector<SpMatrix<double> > diag_stats; // like G in diagonal update.
-                                             // Note: we will invert these.
-  std::vector<std::vector<Matrix<double> > > off_diag_stats; // these will
-  // contribute to the linear term.
-
-  Vector<double> simple_linear_stats;
-  SpMatrix<double> simple_quadratic_stats;
-  ConvertToSimpleStats(&simple_linear_stats, &simple_quadratic_stats);
-  
-  ConvertToPerRowStats(simple_linear_stats, simple_quadratic_stats,
-                       &linear_stats, &diag_stats, &off_diag_stats);
-
-  try {
-    for (size_t i = 0; i < diag_stats.size(); i++) {
-      diag_stats[i].Invert();
-    }
-  } catch (...) {
-    KALDI_WARN << "Error inverting stats matrices for fMLLR "
-               << "[min-count too small?  Bad data?], not updating.";
-    return;
-  }
-  
-  int32 raw_dim = RawDim(), splice_width = SpliceWidth();
-  
-  double effective_beta = count_ * splice_width; // We "count" the determinant
-  // splice_width times in the objective function.
-
-  double auxf_orig = GetAuxf(simple_linear_stats, simple_quadratic_stats,
-                             fmllr_mat);
-  for (int32 iter = 0; iter < opts.num_iters; iter++) {
-    for (int32 row = 0; row < raw_dim; row++) {
-      SubVector<double> this_row(fmllr_mat, row);
-      Vector<double> this_linear(raw_dim + 1);  // Here, k_i is the linear term
-      // in the auxf expressed as a function of this row.
-      this_linear.CopyFromVec(linear_stats.Row(row));
-      for (int32 row2 = 0; row2 < raw_dim; row2++) {
-        if (row2 != row) {
-          if (row2 < row) {
-            this_linear.AddMatVec(-1.0, off_diag_stats[row][row2], kNoTrans,
-                                  fmllr_mat.Row(row2), 1.0);
-          } else {
-            // We won't have the element [row][row2] stored, but use symmetry.
-            this_linear.AddMatVec(-1.0, off_diag_stats[row2][row], kTrans,
-                                  fmllr_mat.Row(row2), 1.0);
-          }
-        }
-      }
-      FmllrInnerUpdate(diag_stats[row],
-                       this_linear,
-                       effective_beta,
-                       row,
-                       &fmllr_mat);
-    }
-    if (GetVerboseLevel() >= 2) {
-      double cur_auxf = GetAuxf(simple_linear_stats, simple_quadratic_stats,
-                                 fmllr_mat),
-          auxf_change = cur_auxf - auxf_orig;
-      KALDI_VLOG(2) << "Updating raw fMLLR: objf improvement per frame was "
-                    << (auxf_change / this->count_) << " over "
-                    << this->count_ << " frames, by the " << iter
-                    << "'th iteration";
-    }
-  }
-  double auxf_final = GetAuxf(simple_linear_stats, simple_quadratic_stats,
-                              fmllr_mat),
-      auxf_change = auxf_final - auxf_orig;
-  *count = this->count_;
-  KALDI_VLOG(1) << "Updating raw fMLLR: objf improvement per frame was "
-                << (auxf_change / this->count_) << " over "
-                << this->count_ << " frames.";
-  if (auxf_final > auxf_orig) {
-    *objf_impr = auxf_change;
-    *count = this->count_;
-    raw_fmllr_mat->CopyFromMat(fmllr_mat);
-  } else {
-    *objf_impr = 0.0;
-    // don't update "raw_fmllr_mat"
-  }
-}
-
-void FmllrRawAccs::SetZero() {
-  count_ = 0.0;
-  single_frame_stats_.count = 0.0;
-  single_frame_stats_.s.SetZero();
-  Q_.SetZero();
-  S_.SetZero();
-}
-
-// Compute the M_i quantities, needed in the update.  This function could be
-// greatly speeded up but I don't think it's the limiting factor.
-void FmllrRawAccs::ComputeM(std::vector<Matrix<double> > *M) const {
-  int32 full_dim = FullDim(), raw_dim = RawDim(),
-      raw_dim2 = raw_dim * (raw_dim + 1);
-  M->resize(full_dim);
-  for (int32 i = 0; i < full_dim; i++)
-    (*M)[i].Resize(raw_dim2, full_dim + 1);  
-
-  // the N's are simpler matrices from which we'll interpolate the M's.
-  // In this loop we imagine w are computing the vector of N's, but
-  // when we get each element, if it's nonzero we propagate it straight
-  // to the M's.
-  for (int32 i = 0; i < full_dim; i++) {
-    // i is index after fMLLR transform; i1 is splicing index,
-    // i2 is cepstral index.
-    int32 i1 = i / raw_dim, i2 = i % raw_dim;
-    for (int32 j = 0; j < raw_dim2; j++) {
-      // j1 is row-index of fMLLR transform, j2 is column-index
-      int32 j1 = j / (raw_dim + 1), j2 = j % (raw_dim + 1);
-      for (int32 k = 0; k < full_dim + 1; k++) {
-        BaseFloat n_ijk;
-        if (j1 != i2) {
-          n_ijk = 0.0;
-        } else if (k == full_dim) {
-          if (j2 == raw_dim) // offset term in fMLLR matrix.
-            n_ijk = 1.0;
-          else
-            n_ijk = 0.0;
-        } else {
-          // k1 is splicing index, k2 is cepstral idnex.
-          int32 k1 = k / raw_dim, k2 = k % raw_dim;
-          if (k1 != i1 || k2 != j2)
-            n_ijk = 0.0;
-          else
-            n_ijk = 1.0;
-        }
-        if (n_ijk != 0.0)
-          for (int32 l = 0; l < full_dim; l++)
-            (*M)[l](j, k) += n_ijk * full_transform_(l, i);
-      }
-    }
-  }
-}
-
-void FmllrRawAccs::ConvertToSimpleStats(
-    Vector<double> *simple_linear_stats,
-    SpMatrix<double> *simple_quadratic_stats) const {
-  std::vector<Matrix<double> > M;
-  ComputeM(&M);
-
-  int32 full_dim = FullDim(), raw_dim = RawDim(), model_dim = ModelDim(),
-      raw_dim2 = raw_dim * (raw_dim + 1),
-      full_dim2 = ((full_dim+1)*(full_dim+2))/2;
-  simple_linear_stats->Resize(raw_dim2);
-  simple_quadratic_stats->Resize(raw_dim2);
-  for (int32 i = 0; i < full_dim; i++) {
-    Vector<double> q_i(full_dim + 1);
-    SpMatrix<double> S_i(full_dim + 1);
-    SubVector<double> S_i_vec(S_i.Data(), full_dim2);
-    if (i < model_dim) {
-      q_i.CopyFromVec(Q_.Row(i));
-      S_i_vec.CopyFromVec(S_.Row(i));
-    } else {
-      q_i.CopyFromVec(Q_.Row(model_dim)); // The last row contains stats proportional
-      // to "count", which we need to modify to be correct.
-      q_i.Scale(-transform_offset_(i)); // These stats are zero (corresponding to
-      // a zero-mean model) if there is no offset in the LDA transform.  Note:
-      // the two statements above are the equivalent, for the rejected dims,
-      // of the statement "a.AddVecVec(-1.0, b, offset);" for the kept ones.
-      // 
-      S_i_vec.CopyFromVec(S_.Row(model_dim)); // these are correct, and
-      // all the same (corresponds to unit variance).
-    }
-    // The equation v = \sum_i M_i q_i:
-    simple_linear_stats->AddMatVec(1.0, M[i], kNoTrans, q_i, 1.0);
-    // The equation W = \sum_i M_i S_i M_i^T
-    // Here, M[i] is quite sparse, so AddSmat2Sp will be faster.
-    simple_quadratic_stats->AddSmat2Sp(1.0, M[i], kNoTrans, S_i, 1.0);
-  }
-}
-
-// See header for comment.
-void FmllrRawAccs::ConvertToPerRowStats(
-    const Vector<double> &simple_linear_stats,
-    const SpMatrix<double> &simple_quadratic_stats_sp,
-    Matrix<double> *linear_stats,
-    std::vector<SpMatrix<double> > *diag_stats,
-    std::vector<std::vector<Matrix<double> > > *off_diag_stats) const {
-
-  // get it as a Matrix, which makes it easier to extract sub-parts.
-  Matrix<double> simple_quadratic_stats(simple_quadratic_stats_sp);
-
-  linear_stats->Resize(RawDim(), RawDim() + 1);
-  linear_stats->CopyRowsFromVec(simple_linear_stats);
-  diag_stats->resize(RawDim());
-  off_diag_stats->resize(RawDim());
-
-  // Set *diag_stats
-  int32 rd1 = RawDim() + 1;
-  for (int32 i = 0; i < RawDim(); i++) {
-    SubMatrix<double> this_diag(simple_quadratic_stats,
-                                i * rd1, rd1,
-                                i * rd1, rd1);
-    (*diag_stats)[i].Resize(RawDim() + 1);
-    (*diag_stats)[i].CopyFromMat(this_diag, kTakeMean);
-  }    
-  
-  for (int32 i = 0; i < RawDim(); i++) {
-    (*off_diag_stats)[i].resize(i);
-    for (int32 j = 0; j < i; j++) {
-      SubMatrix<double> this_off_diag(simple_quadratic_stats,
-                                      i * rd1, rd1,
-                                      j * rd1, rd1);
-      (*off_diag_stats)[i][j] = this_off_diag;
-    }
-  }
-}
-
-double FmllrRawAccs::GetAuxf(const Vector<double> &simple_linear_stats,
-                             const SpMatrix<double> &simple_quadratic_stats,
-                             const Matrix<double> &fmllr_mat) const {
-  // linearize transform...
-  int32 raw_dim = RawDim(), spice_width = SpliceWidth();
-  Vector<double> fmllr_vec(raw_dim * (raw_dim + 1));
-  fmllr_vec.CopyRowsFromMat(fmllr_mat);
-  SubMatrix<double> square_part(fmllr_mat, 0, raw_dim,
-                                0, raw_dim);
-  double logdet = square_part.LogDet();
-  return VecVec(fmllr_vec, simple_linear_stats) -
-      0.5 * VecSpVec(fmllr_vec, simple_quadratic_stats, fmllr_vec) +
-      logdet * spice_width * count_;
-}
-
-
-
-} // namespace kaldi
diff --git a/src/transform/fmllr-raw.h b/src/transform/fmllr-raw.h
deleted file mode 100644
index cc2d33f4830..00000000000
--- a/src/transform/fmllr-raw.h
+++ /dev/null
@@ -1,206 +0,0 @@
-// transform/fmllr-raw.h
-
-// Copyright 2013  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_TRANSFORM_FMLLR_RAW_H_
-#define KALDI_TRANSFORM_FMLLR_RAW_H_
-
-#include <vector>
-
-#include "base/kaldi-common.h"
-#include "gmm/am-diag-gmm.h"
-#include "gmm/mle-full-gmm.h"
-#include "transform/transform-common.h"
-#include "util/kaldi-table.h"
-#include "util/kaldi-holder.h"
-
-namespace kaldi {
-
-
-/*
-  This header contains classes and functions related to computing Constrained
-  MLLR (equivalently, fMLLR) on the raw MFCCs or similar, when they have been
-  spliced and projected with something like LDA+MLLT, but where our model is
-  built on top of the spliced and projected features.  We back-project the
-  model estimation back to the original MFCCs so our transform optimizes the
-  data likelihood given our model in the projected space.  We have to include
-  the rejected dimensions in this likelihood, too.  The objective function
-  includes N times the log-determinant of the square part of the transform,
-  where N is the number of times we spliced consecutive features (e.g. N = 9,
-  if we spliced +- 4 frames of context).
-
-  For concreteness (but without losing generality), assuming we spliced
-  13-dimensional MFCCs across 9 frames to get 117-dimensional features.
-   
-  Each of the 117-dim features is a linear function of the 13(13+1) transform
-  parameters.  We have a particular vectorization of these parameters, from
-  which (with the transform) we work out the full quadratic auxiliary function
-  w.r.t. the parameters.
-
-  This gives us a generic quadratic scalar function of the 13(13+1) parameters.
-  How to get this quadratic w.r.t. one row?  Always keep updated the current
-  derivative w.r.t. one row.  The quadratic w.r.t. that row can be read off.
-  The log-determinant is easy to work out from the cofactor.
-
-  So the full stats will be a (13(13+1)) by (13(13+1)) SpMatrix, plus
-  a bias term.
-
-  The update will iterate row by row, and work out the quadratic function
-  of the row.
-*/
-
-
-struct FmllrRawOptions {
-  BaseFloat min_count;
-  int32 num_iters;
-  FmllrRawOptions(): min_count(100.0), num_iters(20) { }
-  void Register(OptionsItf *opts) {
-    opts->Register("fmllr-min-count", &min_count,
-                   "Minimum count required to update fMLLR");
-    opts->Register("fmllr-num-iters", &num_iters,
-                   "Number of iterations in fMLLR update phase.");
-  }
-};
-
-class FmllrRawAccs {
- public:
-  FmllrRawAccs() { }
-
-  /// Dimension of raw MFCC (etc.) features
-  int32 RawDim() const { return raw_dim_; }
-  /// Full feature dimension after splicing.
-  int32 FullDim() const { return full_transform_.NumRows(); }
-  /// Number of frames that are spliced together each time.
-  int32 SpliceWidth() const { return FullDim() / RawDim(); }
-  /// Dimension of the model.
-  int32 ModelDim() const { return model_dim_; }
-  
-  // Initializer takes the raw dimension of the features (e.g. 13 for typicaly
-  // MFCC features, and the full transform (e.g. an LDA+MLLT transform).  This
-  // full transform is the transform extended with the "rejected rows" that
-  // we would normally discard; we need them for this type of estimation.
-  FmllrRawAccs(int32 raw_dim,
-               int32 model_dim,
-               const Matrix<BaseFloat> &full_transform);
-
-  
-  /// Accumulate stats for a single GMM in the model; returns log likelihood.
-  /// Here, "data" will typically be of larger dimension than the model.
-  /// Note: "data" is the original, spliced features-- before LDA+MLLT.
-  /// Returns log-like for this data given this GMM, including rejected
-  /// dimensions (not multiplied by weight).
-  BaseFloat AccumulateForGmm(const DiagGmm &gmm,
-                             const VectorBase<BaseFloat> &data,
-                             BaseFloat weight);
-  
-  /// Accumulate stats for a GMM, given supplied posteriors.  Note: "data" is
-  /// the original, spliced features-- before LDA+MLLT. 
-  void AccumulateFromPosteriors(const DiagGmm &gmm,
-                                const VectorBase<BaseFloat> &data,
-                                const VectorBase<BaseFloat> &posteriors);
-
-  /// Update "raw_fmllr_mat"; it should have the correct dimension and
-  /// reasonable values at entry (see the function InitFmllr in fmllr-diag-gmm.h
-  /// for how to initialize it.)
-  /// The only reason this function is not const is because we may have
-  /// to call CommitSingleFrameStats().
-  void Update(const FmllrRawOptions &opts,
-              MatrixBase<BaseFloat> *raw_fmllr_mat,
-              BaseFloat *objf_impr,
-              BaseFloat *count);
-
-  void SetZero();
- private:
-  struct SingleFrameStats {
-    Vector<BaseFloat> s; // [FullDim() + 1]-dimensional spliced data, plus 1.0
-    Vector<BaseFloat> transformed_data; // [FullDim()] Data times full transform, with offset.
-    double count;
-    Vector<double> a; // linear term in per-frame auxf; dim is model-dim.
-    Vector<double> b; // quadratic term in per-frame auxf; dim is model-dim.
-  };
-  
-  void CommitSingleFrameStats();
-
-  void InitSingleFrameStats(const VectorBase<BaseFloat> &data);
-  
-  bool DataHasChanged(const VectorBase<BaseFloat> &data) const; // compares it to the
-  // data in single_frame_stats_, returns true if it's different.
-
-  
-  /// Compute the auxiliary function for this matrix.
-  double GetAuxf(const Vector<double> &simple_linear_stats,
-                 const SpMatrix<double> &simple_quadratic_stats,
-                 const Matrix<double> &fmllr_mat) const;
-
-  /// Converts from the Q and S stats to a simple objective function
-  /// of the form l . simple_linear_stats -0.5 l^t simple_quadratic_stats l,
-  /// plus the determinant term, where l is the linearized transform.
-  void ConvertToSimpleStats(
-      Vector<double> *simple_linear_stats,
-      SpMatrix<double> *simple_quadratic_stats) const;
-
-  /// Computes the M_i matrices used in the update, see the extended comment in
-  /// fmllr-raw.cc for explanation.
-  void ComputeM(
-      std::vector<Matrix<double> > *M) const;
-  
-  /// Transform stats into a convenient format for the update.
-  /// linear_stats is of dim RawDim() by RawDim() + 1, it's the linear term.
-  /// diag_stats (of dimension RawDim(), each element of dimension RawDim() + 1
-  /// is the quadratic terms w.r.t. the diagonals.  off_diag_stats contains the
-  /// cross-terms between different rows; it is indexed [i][j], with
-  /// 0 <= i < RawDim(), and j < i, and each element is of dimension RawDim() + 1
-  /// by RawDim() + 1.  The [i][j]'th element is interpreted as follows:
-  /// the inner product with the [i'th row] [element [i][j]] [j'th row] is the
-  /// term in the objective function.
-  /// This function resizes its output.
-  void ConvertToPerRowStats(
-      const Vector<double> &simple_linear_stats,
-      const SpMatrix<double> &simple_quadratic_stats_sp,
-      Matrix<double> *linear_stats,
-      std::vector<SpMatrix<double> > *diag_stats,
-      std::vector<std::vector<Matrix<double> > > *off_diag_stats) const;
-  
-  int32 raw_dim_; // Raw MFCC dimension.
-  int32 model_dim_; // Model dimension
-
-  Matrix<BaseFloat> full_transform_; // Does not include any offset term
-  // (last column).
-  Vector<BaseFloat> transform_offset_; // The offset term (or zero).
-  
-
-  SingleFrameStats single_frame_stats_;
-  
-  double count_; // The data-count.  Note: in accounting for the determinant, we will
-                 // have to multiply this by the number of times the data is spliced
-                 // together on each frame.
-
-  SpMatrix<double> temp_; // [full_dim + 1][full_dim + 1], outer product of s.
-  Matrix<double> Q_; // linear stats, indexed [model_dim + 1][full_dim + 1]
-  Matrix<double> S_; // quadratic stats, indexed
-                     // [model_dim + 1][((full_dim+1)*(full_dim+2))/2]
-  
-  KALDI_DISALLOW_COPY_AND_ASSIGN(FmllrRawAccs);
-};
-
-
-
-} // namespace kaldi
-
-#endif  // KALDI_TRANSFORM_FMLLR_RAW_H_
diff --git a/src/transform/fmpe-test.cc b/src/transform/fmpe-test.cc
deleted file mode 100644
index ec76bd1ef20..00000000000
--- a/src/transform/fmpe-test.cc
+++ /dev/null
@@ -1,177 +0,0 @@
-// transform/fmpe-test.cc
-
-// Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "util/common-utils.h"
-#include "gmm/diag-gmm.h"
-#include "gmm/diag-gmm-normal.h"
-#include "gmm/model-test-common.h"
-#include "transform/fmpe.h"
-
-namespace kaldi {
-
-
-// Compute derivative of GMM log-likelihood w.r.t. features.
-// Note: this code copied from gmm-get-feat-deriv.cc; had
-// to simplify a bit.
-void GetFeatDeriv(const DiagGmm &gmm,
-                  const Matrix<BaseFloat> &feats,
-                  Matrix<BaseFloat> *deriv) {
-  
-  deriv->Resize(feats.NumRows(), feats.NumCols());
-
-  Vector<BaseFloat> gauss_posteriors;
-  Vector<BaseFloat> temp_vec(feats.NumCols());
-  for (int32 i = 0; i < feats.NumRows(); i++) {
-    SubVector<BaseFloat> this_feat(feats, i);
-    SubVector<BaseFloat> this_deriv(*deriv, i);
-    gmm.ComponentPosteriors(this_feat, &gauss_posteriors);
-    BaseFloat weight = 1.0;
-    gauss_posteriors.Scale(weight);
-    // The next line does: to i'th row of deriv, add
-    // means_invvars^T * gauss_posteriors,
-    // where each row of means_invvars is the mean times
-    // diagonal inverse covariance... after transposing,
-    // this becomes a weighted of these rows, weighted by
-    // the posteriors.  This comes from the term
-    //  feat^T * inv_var * mean
-    // in the objective function.
-    this_deriv.AddMatVec(1.0, gmm.means_invvars(), kTrans,
-                         gauss_posteriors, 1.0);
-
-    // next line does temp_vec == inv_vars^T * gauss_posteriors,
-    // which sets temp_vec to a weighted sum of the inv_vars,
-    // weighed by Gaussian posterior.
-    temp_vec.AddMatVec(1.0, gmm.inv_vars(), kTrans,
-                       gauss_posteriors, 0.0);
-    // Add to the derivative, -(this_feat .* temp_vec),
-    // which is the term that comes from the -0.5 * inv_var^T feat_sq,
-    // in the objective function (where inv_var is a vector, and feat_sq
-    // is a vector of squares of the feature values).
-    this_deriv.AddVecVec(-1.0, this_feat, temp_vec, 1.0);
-  }
-}
-
-// Gets total log-likelihood, summed over all frames.
-BaseFloat GetGmmLike(const DiagGmm &gmm,
-                     const Matrix<BaseFloat> &feats) {
-  BaseFloat ans = 0.0;
-  for (int32 i = 0; i < feats.NumRows(); i++)
-    ans += gmm.LogLikelihood(feats.Row(i));
-  return ans;
-}
-
-void TestFmpe() {
-  int32 dim = 10 + (Rand() % 10);
-  int32 num_comp = 10 + (Rand() % 10);
-  DiagGmm gmm;
-  unittest::InitRandDiagGmm(dim, num_comp, &gmm);
-  
-  int32 num_frames = 20;
-  Matrix<BaseFloat> feats(num_frames, dim);
-
-  for (int32 i = 0; i < num_frames; i++)
-    for (int32 j = 0; j < dim; j++)
-      feats(i, j) = RandGauss();
-
-  FmpeOptions opts; // Default.
-  {
-    Fmpe fmpe(gmm, opts);
-    {
-      bool binary = (Rand() % 2 == 1);
-      Output ko("tmpf", binary);
-      fmpe.Write(ko.Stream(), binary);
-    }
-  }
-  Fmpe fmpe(gmm, opts);
-  {
-    bool binary_in;
-    Input ki("tmpf", &binary_in);
-    fmpe.Read(ki.Stream(), binary_in);
-  }
-
-  // We'll first be testing that the feature derivative is
-  // accurate, by measuring a small random offset in feature space.
-  {
-    Matrix<BaseFloat> deriv;
-    Matrix<BaseFloat> random_offset(feats.NumRows(), feats.NumCols());
-    for (int32 i = 0; i < feats.NumRows(); i++)
-      for (int32 j = 0; j < feats.NumCols(); j++)
-        random_offset(i, j) = 1.0e-03 * RandGauss();
-    BaseFloat like_before = GetGmmLike(gmm, feats);
-    feats.AddMat(1.0, random_offset);
-    BaseFloat like_after = GetGmmLike(gmm, feats);
-    feats.AddMat(-1.0, random_offset); // undo the change.
-    GetFeatDeriv(gmm, feats, &deriv);
-    BaseFloat change1 = like_after - like_before,
-        change2 = TraceMatMat(random_offset, deriv, kTrans);
-    KALDI_LOG << "Random offset led to like change "
-              << change1 << " (manually), and " << change2
-              << " (derivative)";
-    // note: not making this threshold smaller, as don't want
-    // spurious failures.  Seems to be OK though.
-    KALDI_ASSERT( fabs(change1-change2) < 0.15*fabs(change1+change2));
-  }
-
-  std::vector<std::vector<int32> > gselect(feats.NumRows()); // make it have all Gaussians...
-  for (int32 i = 0; i < feats.NumRows(); i++)
-    for (int32 j = 0; j < gmm.NumGauss(); j++)
-      gselect[i].push_back(j);
-
-  Matrix<BaseFloat> fmpe_offset;
-  // Check that the fMPE feature offset is zero.
-  fmpe.ComputeFeatures(feats, gselect, &fmpe_offset);
-  KALDI_ASSERT(fmpe_offset.IsZero());
-  
-  // Note: we're just using the ML objective function here.
-  // This is just to make sure the derivatives are all computed
-  // correctly.
-  BaseFloat like_before_update = GetGmmLike(gmm, feats);
-  // Now get stats for update.
-  FmpeStats stats(fmpe);
-  Matrix<BaseFloat> deriv;
-  GetFeatDeriv(gmm, feats, &deriv);
-  fmpe.AccStats(feats, gselect, deriv, NULL, &stats);
-  FmpeUpdateOptions update_opts;
-  update_opts.learning_rate = 0.001; // so linear assumption is more valid.
-  BaseFloat delta = fmpe.Update(update_opts, stats);
-
-  fmpe.ComputeFeatures(feats, gselect, &fmpe_offset);
-  feats.AddMat(1.0, fmpe_offset);
-
-  BaseFloat like_after_update = GetGmmLike(gmm, feats);
-
-  BaseFloat delta2 = like_after_update - like_before_update;
-  KALDI_LOG << "Change predicted by fMPE Update function is "
-            << delta << ", change computed directly is "
-            << delta2;
-  KALDI_ASSERT(fabs(delta-delta2) < 0.15 * fabs(delta+delta2));
-  
-  unlink("tmpf");
-}
-
-}
-
-
-int main() {
-  kaldi::g_kaldi_verbose_level = 5;
-  for (int i = 0; i <= 10; i++)
-    kaldi::TestFmpe();
-  std::cout << "Test OK.\n";
-}
-
diff --git a/src/transform/fmpe.cc b/src/transform/fmpe.cc
deleted file mode 100644
index 9a49bf53678..00000000000
--- a/src/transform/fmpe.cc
+++ /dev/null
@@ -1,691 +0,0 @@
-// transform/fmpe.cc
-
-// Copyright 2011-2012  Yanmin Qian  Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "transform/fmpe.h"
-#include "util/text-utils.h"
-#include "gmm/diag-gmm-normal.h"
-#include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
-
-namespace kaldi {
-
-void Fmpe::SetContexts(std::string context_str) {
-  // sets the contexts_ variable.
-  using std::vector;
-  using std::string;
-  contexts_.clear();
-  vector<string> ctx_vec; // splitting context_str on ":"
-  SplitStringToVector(context_str, ":", false, &ctx_vec);
-  contexts_.resize(ctx_vec.size());
-  for (size_t i = 0; i < ctx_vec.size(); i++) {
-    vector<string> pair_vec; // splitting ctx_vec[i] on ";"
-    SplitStringToVector(ctx_vec[i], ";", false, &pair_vec);
-    KALDI_ASSERT(pair_vec.size() != 0 && "empty context!");
-    for (size_t j = 0; j < pair_vec.size(); j++) {
-      vector<string> one_pair;
-      SplitStringToVector(pair_vec[j], ",", false, &one_pair);
-      KALDI_ASSERT(one_pair.size() == 2 &&
-                   "Mal-formed context string: bad --context-expansion option?");
-      int32 pos = 0;
-      BaseFloat weight = BaseFloat(0);
-      bool ok = ConvertStringToInteger(one_pair[0], &pos);
-      ok = ConvertStringToReal(one_pair[1], &weight) && ok;
-      if (!ok)
-        KALDI_ERR << "Mal-formed context string: bad --context-expansion option?";
-      contexts_[i].push_back(std::make_pair(pos, weight));
-    }
-  }
-}
-
-void Fmpe::ComputeC() {
-  KALDI_ASSERT(gmm_.NumGauss() != 0.0);
-  int32 dim = gmm_.Dim();
-
-  // Getting stats from the GMM... assume the model is
-  // correct.
-  SpMatrix<double> x2_stats(dim);
-  Vector<double> x_stats(dim);
-  double tot_count = 0.0;
-  DiagGmmNormal ngmm(gmm_);
-  for (int32 pdf = 0; pdf < ngmm.NumGauss(); pdf++) {
-    x2_stats.AddVec2(ngmm.weights_(pdf), ngmm.means_.Row(pdf));
-    x2_stats.AddDiagVec(ngmm.weights_(pdf), ngmm.vars_.Row(pdf)); // add diagonal
-    // covar to diagonal elements of x2_stats.
-    x_stats.AddVec(ngmm.weights_(pdf), ngmm.means_.Row(pdf));
-    tot_count += ngmm.weights_(pdf);
-  }
-  KALDI_ASSERT(tot_count != 0.0);
-  x2_stats.Scale(1.0 / tot_count);
-  x_stats.Scale(1.0 / tot_count);
-  x2_stats.AddVec2(-1.0, x_stats); // subtract outer product of mean,
-  // to get centered covariance.
-  C_.Resize(dim);
-  try {
-    TpMatrix<double> Ctmp(dim); Ctmp.Cholesky(x2_stats);
-    C_.CopyFromTp(Ctmp);
-  } catch (...) {
-    KALDI_ERR << "Error initializing fMPE object: cholesky of "
-        "feature variance failed.  Probably code error, or NaN/inf in model";
-  }
-}
-
-void Fmpe::ComputeStddevs() {
-  const Matrix<BaseFloat> &inv_vars = gmm_.inv_vars();
-  stddevs_.Resize(inv_vars.NumRows(), inv_vars.NumCols());
-  stddevs_.CopyFromMat(inv_vars);
-  stddevs_.ApplyPow(-0.5);
-}
-
-
-void Fmpe::ApplyContext(const MatrixBase<BaseFloat> &intermed_feat,
-                        MatrixBase<BaseFloat> *feat_out) const {
-  // Applies the temporal-context part of the transformation.
-  int32 dim = FeatDim(), ncontexts = NumContexts(),
-      T = intermed_feat.NumRows();
-  KALDI_ASSERT(intermed_feat.NumCols() == dim * ncontexts &&
-               intermed_feat.NumRows() == feat_out->NumRows()
-               && feat_out->NumCols() == dim);
-  // note: ncontexts == contexts_.size().
-  for (int32 i = 0; i < ncontexts; i++) {
-    // this_intermed_feat is the chunk of the "intermediate features"
-    // that corresponds to this "context"
-    SubMatrix<BaseFloat> this_intermed_feat(intermed_feat, 0, T,
-                                            dim*i, dim);
-    for (int32 j = 0; j < static_cast<int32>(contexts_[i].size()); j++) {
-      int32 t_offset = contexts_[i][j].first;
-      BaseFloat weight = contexts_[i][j].second;
-      // Note: we could do this more efficiently using matrix operations,
-      // but this doesn't dominate the computation and I think this is
-      // clearer.
-      for (int32 t_out = 0; t_out < T; t_out++) { // t_out indexes the output
-        int32 t_in = t_out + t_offset; // t_in indexes the input.
-        if (t_in >= 0 && t_in < T) // Discard frames outside range.
-          feat_out->Row(t_out).AddVec(weight, this_intermed_feat.Row(t_in));
-      }
-    }
-  }
-}
-
-void Fmpe::ApplyContextReverse(const MatrixBase<BaseFloat> &feat_deriv,
-                               MatrixBase<BaseFloat> *intermed_feat_deriv)
-    const {
-  // Applies the temporal-context part of the transformation,
-  // in reverse, for getting derivatives for training.
-  int32 dim = FeatDim(), ncontexts = NumContexts(),
-      T = feat_deriv.NumRows();
-  KALDI_ASSERT(intermed_feat_deriv->NumCols() == dim * ncontexts &&
-               intermed_feat_deriv->NumRows() == feat_deriv.NumRows()
-               && feat_deriv.NumCols() == dim);
-  // note: ncontexts == contexts_.size().
-  for (int32 i = 0; i < ncontexts; i++) {
-    // this_intermed_feat is the chunk of the derivative of
-    // "intermediate features" that corresponds to this "context"
-    // (this is output, in this routine).
-    SubMatrix<BaseFloat> this_intermed_feat_deriv(*intermed_feat_deriv, 0, T,
-                                                  dim*i, dim);
-    for (int32 j = 0; j < static_cast<int32>(contexts_[i].size()); j++) {
-      int32 t_offset = contexts_[i][j].first;
-      BaseFloat weight = contexts_[i][j].second;
-      // Note: we could do this more efficiently using matrix operations,
-      // but this doesn't dominate the computation and I think this is
-      // clearer.
-      for (int32 t_out = 0; t_out < T; t_out++) { // t_out indexes the output
-        int32 t_in = t_out + t_offset; // t_in indexes the input.
-        if (t_in >= 0 && t_in < T) // Discard frames outside range.
-          this_intermed_feat_deriv.Row(t_in).AddVec(weight,
-                                                    feat_deriv.Row(t_out));
-        // Note: the line above is where the work happens; it's the same
-        // as in ApplyContext except reversing the input and output.
-      }
-    }
-  }
-}
-
-void Fmpe::ApplyC(MatrixBase<BaseFloat> *feat_out, bool reverse) const {
-  int32 T = feat_out->NumRows();
-  Vector<BaseFloat> tmp(feat_out->NumCols());
-  for (int32 t = 0; t < T; t++) {
-    SubVector<BaseFloat> row(*feat_out, t);
-    // Next line does: tmp = C_ * row
-    tmp.AddTpVec(1.0, C_, (reverse ? kTrans : kNoTrans), row, 0.0);
-    row.CopyFromVec(tmp);
-  }
-}
-
-// Constructs the high-dim features and applies the main projection matrix
-// projT_.  This projects from dimension ngauss*(dim+1) to dim*ncontexts.  Note:
-// because the input vector of size ngauss*(dim+1) is sparse in a blocky way
-// (i.e. each frame only has a couple of nonzero posteriors), we deal with
-// sub-matrices of the projection matrix projT_.  We actually further optimize
-// the code by taking all frames in a file that had nonzero posteriors for a
-// particular Gaussian, and forming a matrix out of the corresponding
-// high-dimensional features; we can then use a matrix-matrix multiply rather
-// than using vector-matrix operations.
-
-void Fmpe::ApplyProjection(const MatrixBase<BaseFloat> &feat_in,
-                           const std::vector<std::vector<int32> > &gselect,
-                           MatrixBase<BaseFloat> *intermed_feat) const {
-  int32 dim = FeatDim(), ncontexts = NumContexts();  
-  
-  Vector<BaseFloat> post; // will be posteriors of selected Gaussians.
-  Vector<BaseFloat> input_chunk(dim+1); // will be a segment of
-  // the high-dimensional features.
-
-  // "all_posts" is a vector of ((gauss-index, time-index), gaussian
-  // posterior).
-  // We'll compute the posterior information, sort it, and then
-  // go through it in sorted order, which maintains memory locality
-  // when accessing the projection matrix.
-  // Note: if we really cared we could make this use level-3 BLAS
-  // (matrix-matrix multiply), but we'd need to have a temporary
-  // matrix for the output and input.
-  std::vector<std::pair<std::pair<int32, int32>, BaseFloat> > all_posts;
-  
-  for (int32 t = 0; t < feat_in.NumRows(); t++) {
-    SubVector<BaseFloat> this_feat(feat_in, t);
-    gmm_.LogLikelihoodsPreselect(this_feat, gselect[t], &post);
-    // At this point, post will contain log-likes of the selected
-    // Gaussians.
-    post.ApplySoftMax(); // Now they are posteriors (which sum to one).
-    for (int32 i = 0; i < post.Dim(); i++) {
-      int32 gauss = gselect[t][i];
-      all_posts.push_back(std::make_pair(std::make_pair(gauss, t), post(i)));
-    }
-  }
-  std::sort(all_posts.begin(), all_posts.end());
-  
-  bool optimize = true;
-
-  if (!optimize) { // Why do we keep this un-optimized code around?
-    // For clarity, so you can see what's going on, and for easier
-    // comparision with ApplyProjectionReverse which is similar to this
-    // un-optimized segment.  Both un-optimized and optimized versions
-    // should give identical transforms (up to tiny roundoff differences).
-    for (size_t i = 0; i < all_posts.size(); i++) {
-      int32 gauss = all_posts[i].first.first, t = all_posts[i].first.second;
-      SubVector<BaseFloat> this_feat(feat_in, t);
-      SubVector<BaseFloat> this_intermed_feat(*intermed_feat, t);
-      BaseFloat this_post = all_posts[i].second;
-      SubVector<BaseFloat> this_stddev(stddevs_, gauss);
-
-      // The next line is equivalent to setting input_chunk to
-      // -this_post * the gaussian mean / (gaussian stddev).  Note: we use
-      // the fact that mean * inv_var *  stddev == mean / stddev.
-      input_chunk.Range(0, dim).AddVecVec(-this_post, gmm_.means_invvars().Row(gauss),
-                                          this_stddev, 0.0);
-      // The next line is equivalent to adding (feat / gaussian stddev) to
-      // input_chunk, so now it contains (feat - mean) / stddev, which is
-      // our "normalized" feature offset.
-      input_chunk.Range(0, dim).AddVecDivVec(this_post, this_feat, this_stddev,
-                                             1.0);
-      // The last element of this input_chunk is the posterior itself
-      // (between 0 and 1).
-      input_chunk(dim) = this_post * config_.post_scale;
-
-      // this_intermed_feat += [appropriate chjunk of projT_] * input_chunk.
-      this_intermed_feat.AddMatVec(1.0, projT_.Range(gauss*(dim+1), dim+1,
-                                                     0, dim*ncontexts),
-                                   kTrans, input_chunk, 1.0);
-    }
-  } else {
-    size_t i = 0;
-    // We process the "posts" vector in chunks, where each chunk corresponds to
-    // the same Gaussian index (but different times).
-    while (i < all_posts.size()) {
-      int32 gauss = all_posts[i].first.first;
-      SubVector<BaseFloat> this_stddev(stddevs_, gauss),
-          this_mean_invvar(gmm_.means_invvars(), gauss);
-      SubMatrix<BaseFloat> this_projT_chunk(projT_, gauss*(dim+1), dim+1,
-                                            0, dim*ncontexts);
-      int32 batch_size; // number of posteriors with same Gaussian..
-      for (batch_size = 0;
-           batch_size+i < static_cast<int32>(all_posts.size()) &&
-               all_posts[batch_size+i].first.first == gauss;
-           batch_size++); // empty loop body.
-      Matrix<BaseFloat> input_chunks(batch_size, dim+1);
-      Matrix<BaseFloat> intermed_temp(batch_size, dim*ncontexts);
-      for (int32 j = 0; j < batch_size; j++) { // set up "input_chunks".
-        // To understand this code, first examine code and comments in "non-optimized"
-        // code chunk above (the other branch of the if/else statement).
-        int32 t = all_posts[i+j].first.second;
-        SubVector<BaseFloat> this_feat(feat_in, t);
-        SubVector<BaseFloat> this_input_chunk(input_chunks, j);
-        BaseFloat this_post = all_posts[i+j].second;
-        this_input_chunk.Range(0, dim).AddVecVec(-this_post,
-                                                 this_mean_invvar,
-                                                 this_stddev, 0.0);
-        this_input_chunk.Range(0, dim).AddVecDivVec(this_post, this_feat,
-                                                    this_stddev, 1.0);
-        this_input_chunk(dim) = this_post * config_.post_scale;
-      }
-      // The next line is where most of the computation will happen,
-      // during the feature computation phase.  We have rearranged
-      // stuff so it's a matrix-matrix operation, for greater
-      // efficiency (when using optimized libraries like ATLAS).
-      intermed_temp.AddMatMat(1.0, input_chunks, kNoTrans,
-                              this_projT_chunk, kNoTrans, 0.0);
-      for (int32 j = 0; j < batch_size; j++) { // add data from
-        // intermed_temp to the output "intermed_feat"
-        int32 t = all_posts[i+j].first.second;
-        SubVector<BaseFloat> this_intermed_feat(*intermed_feat, t);
-        SubVector<BaseFloat> this_intermed_temp(intermed_temp, j);
-        // this_intermed_feat += this_intermed_temp.
-        this_intermed_feat.AddVec(1.0, this_intermed_temp);
-      }
-      i += batch_size;
-    }
-  }
-}      
-
-
-
-// This function does the reverse to ApplyProjection, for the case
-// where we want the derivatives w.r.t. the projection matrix.
-// It stores the positive and negative parts of this separately.
-void Fmpe::ApplyProjectionReverse(const MatrixBase<BaseFloat> &feat_in,
-                                  const std::vector<std::vector<int32> > &gselect,
-                                  const MatrixBase<BaseFloat> &intermed_feat_deriv,
-                                  MatrixBase<BaseFloat> *proj_deriv_plus,
-                                  MatrixBase<BaseFloat> *proj_deriv_minus) const {
-  int32 dim = FeatDim(), ncontexts = NumContexts();  
-  
-  Vector<BaseFloat> post; // will be posteriors of selected Gaussians.
-  Vector<BaseFloat> input_chunk(dim+1); // will be a segment of
-  // the high-dimensional features.
-
-  // "all_posts" is a vector of ((gauss-index, time-index), gaussian
-  // posterior).
-  // We'll compute the posterior information, sort it, and then
-  // go through it in sorted order, which maintains memory locality
-  // when accessing the projection matrix.
-  std::vector<std::pair<std::pair<int32, int32>, BaseFloat> > all_posts;
-  
-  for (int32 t = 0; t < feat_in.NumRows(); t++) {
-    SubVector<BaseFloat> this_feat(feat_in, t);
-    gmm_.LogLikelihoodsPreselect(this_feat, gselect[t], &post);
-    // At this point, post will contain log-likes of the selected
-    // Gaussians.
-    post.ApplySoftMax(); // Now they are posteriors (which sum to one).
-    for (int32 i = 0; i < post.Dim(); i++) {
-      // The next few lines (where we set up "input_chunk") are identical
-      // to ApplyProjection.
-      int32 gauss = gselect[t][i];
-      all_posts.push_back(std::make_pair(std::make_pair(gauss, t), post(i)));
-    }
-  }
-  std::sort(all_posts.begin(), all_posts.end());
-  for (size_t i = 0; i < all_posts.size(); i++) {
-    int32 gauss = all_posts[i].first.first, t = all_posts[i].first.second;
-    BaseFloat this_post = all_posts[i].second;
-    SubVector<BaseFloat> this_feat(feat_in, t);    
-    SubVector<BaseFloat> this_intermed_feat_deriv(intermed_feat_deriv, t);
-    SubVector<BaseFloat> this_stddev(stddevs_, gauss);
-    input_chunk.Range(0, dim).AddVecVec(-this_post, gmm_.means_invvars().Row(gauss),
-                                        this_stddev, 0.0);
-    input_chunk.Range(0, dim).AddVecDivVec(this_post, this_feat, this_stddev,
-                                           1.0);
-    input_chunk(dim) = this_post * config_.post_scale;
-
-    // If not for accumulating the + and - parts separately, we would be
-    // doing something like:
-    // proj_deriv_.Range(0, dim*ncontexts, gauss*(dim+1), dim+1).AddVecVec(
-    //                    1.0, this_intermed_feat_deriv, input_chunk);
-
-
-    SubMatrix<BaseFloat> plus_chunk(*proj_deriv_plus, 
-                                    gauss*(dim+1), dim+1,
-                                    0, dim*ncontexts),
-        minus_chunk(*proj_deriv_minus, 
-                    gauss*(dim+1), dim+1,
-                    0, dim*ncontexts);
-          
-    // This next function takes the rank-one matrix
-    //  (input_chunk * this_intermed_deriv'), and adds the positive
-    // part to proj_deriv_plus, and minus the negative part to
-    // proj_deriv_minus.
-    AddOuterProductPlusMinus(static_cast<BaseFloat>(1.0),
-                             input_chunk,
-                             this_intermed_feat_deriv,
-                             &plus_chunk, &minus_chunk);
-  }
-}      
-
-void Fmpe::ComputeFeatures(const MatrixBase<BaseFloat> &feat_in,
-                           const std::vector<std::vector<int32> > &gselect,
-                           Matrix<BaseFloat> *feat_out) const {
-  int32 dim = FeatDim();
-  KALDI_ASSERT(feat_in.NumRows() != 0 && feat_in.NumCols() == dim);
-  KALDI_ASSERT(feat_in.NumRows() == static_cast<int32>(gselect.size()));
-  feat_out->Resize(feat_in.NumRows(), feat_in.NumCols()); // will zero it.
-  
-  // Intermediate-dimension features
-  Matrix<BaseFloat> intermed_feat(feat_in.NumRows(),
-                                  dim * NumContexts());
-
-  // Apply the main projection, from high-dim to intermediate
-  // dimension (dim * NumContexts()).
-  ApplyProjection(feat_in, gselect, &intermed_feat);
-
-  // Apply the temporal context and reduces from
-  // dimension dim*ncontexts to dim.
-  ApplyContext(intermed_feat, feat_out);
-
-  // Lastly, apply the the "C" matrix-- linear transform on the offsets.
-  ApplyC(feat_out);
-}
-
-
-void Fmpe::AccStats(const MatrixBase<BaseFloat> &feat_in,
-                    const std::vector<std::vector<int32> > &gselect,
-                    const MatrixBase<BaseFloat> &direct_feat_deriv,
-                    const MatrixBase<BaseFloat> *indirect_feat_deriv, // may be NULL
-                    FmpeStats *fmpe_stats) const {
-  SubMatrix<BaseFloat> stats_plus(fmpe_stats->DerivPlus());
-  SubMatrix<BaseFloat> stats_minus(fmpe_stats->DerivMinus());
-  int32 dim = FeatDim(), ncontexts = NumContexts();
-  KALDI_ASSERT(feat_in.NumRows() != 0 && feat_in.NumCols() == dim);
-  KALDI_ASSERT(feat_in.NumRows() == static_cast<int32>(gselect.size()));
-  KALDI_ASSERT(SameDim(stats_plus, projT_) && SameDim(stats_minus, projT_) &&
-               SameDim(feat_in, direct_feat_deriv));
-
-  if (indirect_feat_deriv != NULL)
-    fmpe_stats->AccumulateChecks(feat_in, direct_feat_deriv, *indirect_feat_deriv);
-  
-  Matrix<BaseFloat> feat_deriv(direct_feat_deriv); // "feat_deriv" is initially direct+indirect.
-  if (indirect_feat_deriv != NULL)
-    feat_deriv.AddMat(1.0, *indirect_feat_deriv);
-  
-  // We do the "*Reverse" version of each stage now, in reverse order.
-  ApplyCReverse(&feat_deriv);
-  
-  Matrix<BaseFloat> intermed_feat_deriv(feat_in.NumRows(), dim*ncontexts);
-  ApplyContextReverse(feat_deriv, &intermed_feat_deriv);
-  
-  ApplyProjectionReverse(feat_in, gselect, intermed_feat_deriv,
-                         &stats_plus, &stats_minus);
-}
-
-
-void FmpeOptions::Write(std::ostream &os, bool binary) const {
-  WriteToken(os, binary, context_expansion);
-  WriteBasicType(os, binary, post_scale);
-}
-void FmpeOptions::Read(std::istream &is, bool binary) {
-  ReadToken(is, binary, &context_expansion);
-  ReadBasicType(is, binary, &post_scale);
-}
-
-Fmpe::Fmpe(const DiagGmm &gmm, const FmpeOptions &config): gmm_(gmm),
-                                                          config_(config) {
-  SetContexts(config.context_expansion);
-  ComputeC();
-  ComputeStddevs();
-  projT_.Resize(NumGauss() * (FeatDim()+1), FeatDim() * NumContexts());
-}
-
-BaseFloat Fmpe::Update(const FmpeUpdateOptions &config,
-                       const FmpeStats &stats) {
-  SubMatrix<BaseFloat> proj_deriv_plus = stats.DerivPlus(),
-      proj_deriv_minus = stats.DerivMinus();
-  // tot_linear_objf_impr is the change in the actual
-  // objective function if it were linear, i.e.
-  //   objf-gradient . parameter-change
-  // Note: none of this is normalized by the #frames (we don't have
-  // this info here), so that is done at the script level.
-  BaseFloat tot_linear_objf_impr = 0.0;
-  int32 changed = 0; // Keep track of how many elements change sign.
-  KALDI_ASSERT(SameDim(proj_deriv_plus, projT_) && SameDim(proj_deriv_minus, projT_));
-  KALDI_ASSERT(proj_deriv_plus.Min() >= 0);
-  KALDI_ASSERT(proj_deriv_minus.Min() >= 0);
-  BaseFloat learning_rate = config.learning_rate,
-      l2_weight = config.l2_weight;
-  
-  for (int32 i = 0; i < projT_.NumRows(); i++) {
-    for (int32 j = 0; j < projT_.NumCols(); j++) {
-      BaseFloat p = proj_deriv_plus(i, j), n = proj_deriv_minus(i, j),
-          x = projT_(i, j);
-      // Suppose the basic update (before regularization) is:
-      // z <-- x  +   learning_rate * (p - n) / (p + n),
-      // where z is the new parameter and x is the old one.
-      // Here, we view (learning_rate / (p + n)) as a parameter-specific
-      // learning rate.  In fact we view this update as the maximization
-      // of an auxiliary function of the form:
-      //  (z-x).(p-n)    - 0.5 (z - x)^2 (p+n)/learning_rate
-      // and taking the derivative w.r.t z, we get:
-      // Q'(z) =  (p-n) - (z - x) (p+n) / learning_rate
-      // which we set to zero and solve for z, to get z = x + learning_rate.(p-n)/(p+n)
-      // At this point we add regularization, a term of the form -l2_weight * z^2.
-      // Our new auxiliary function derivative is:
-      // Q(z) = -2.l2_weight.z + (p-n) - (z - x) (p+n) / learning_rate
-      // We can write this as:
-      // Q(z) = z . (-2.l2_weight - (p+n)/learning_rate)
-      //        + (p-n) + x(p+n)/learning_rate
-      // solving for z, we get:
-      //      z = ((p-n) + x (p+n)/learning_rate) / (2.l2_weight + (p+n)/learning_rate)
-
-      BaseFloat z = ((p-n) + x*(p+n)/learning_rate) / (2*l2_weight + (p+n)/learning_rate);
-      // z is the new parameter value.
-
-      tot_linear_objf_impr += (z-x) * (p-n); // objf impr based on linear assumption.
-      projT_(i, j) = z;
-      if (z*x < 0) changed++;
-    }
-  }
-  KALDI_LOG << "Objf impr (assuming linear) is " << tot_linear_objf_impr;
-  KALDI_LOG << ((100.0*changed)/(projT_.NumRows()*projT_.NumCols()))
-            << "% of matrix elements changed sign.";
-  return tot_linear_objf_impr;
-}
-
-// Note: we write the GMM first, without any other header.
-// This way, the gselect code can treat the form on disk as
-// a normal GMM object.
-void Fmpe::Write(std::ostream &os, bool binary) const {
-  if (gmm_.NumGauss() == 0)
-    KALDI_ERR << "Fmpe::Write, object not initialized.";
-  gmm_.Write(os, binary);
-  config_.Write(os, binary);
-  // stddevs_ are derived, don't write them.
-  projT_.Write(os, binary);
-  C_.Write(os, binary);
-  // contexts_ are derived from config, don't write them.
-}
-
-
-void Fmpe::Read(std::istream &is, bool binary) {
-  gmm_.Read(is, binary);
-  config_.Read(is, binary);
-  ComputeStddevs(); // computed from gmm.
-  projT_.Read(is, binary);
-  C_.Read(is, binary);
-  SetContexts(config_.context_expansion);
-}
-
-
-BaseFloat ComputeAmGmmFeatureDeriv(const AmDiagGmm &am_gmm,
-                                   const TransitionModel &trans_model,
-                                   const Posterior &posterior,
-                                   const MatrixBase<BaseFloat> &features,
-                                   Matrix<BaseFloat> *direct_deriv,
-                                   const AccumAmDiagGmm *model_diff,
-                                   Matrix<BaseFloat> *indirect_deriv) {
-  KALDI_ASSERT((model_diff != NULL) == (indirect_deriv != NULL));
-  BaseFloat ans = 0.0;
-  KALDI_ASSERT(posterior.size() == static_cast<size_t>(features.NumRows()));
-  direct_deriv->Resize(features.NumRows(), features.NumCols());
-  if (indirect_deriv != NULL)
-    indirect_deriv->Resize(features.NumRows(), features.NumCols());
-  Vector<BaseFloat> temp_vec(features.NumCols());
-  Vector<double> temp_vec_dbl(features.NumCols());
-  for (size_t i = 0; i < posterior.size(); i++) {
-    for (size_t j = 0; j < posterior[i].size(); j++) {
-      int32 tid = posterior[i][j].first,  // transition identifier.
-          pdf_id = trans_model.TransitionIdToPdf(tid);
-      BaseFloat weight = posterior[i][j].second;
-      const DiagGmm &gmm = am_gmm.GetPdf(pdf_id);
-      Vector<BaseFloat> gauss_posteriors;
-      SubVector<BaseFloat> this_feat(features, i);
-      SubVector<BaseFloat> this_direct_deriv(*direct_deriv, i);
-      ans += weight * 
-          gmm.ComponentPosteriors(this_feat, &gauss_posteriors);
-      
-      gauss_posteriors.Scale(weight);
-      // The next line does: to i'th row of deriv, add
-      // means_invvars^T * gauss_posteriors,
-      // where each row of means_invvars is the mean times
-      // diagonal inverse covariance... after transposing,
-      // this becomes a weighted of these rows, weighted by
-      // the posteriors.  This comes from the term
-      //  feat^T * inv_var * mean
-      // in the objective function.
-      this_direct_deriv.AddMatVec(1.0, gmm.means_invvars(), kTrans,
-                                  gauss_posteriors, 1.0);      
-
-      // next line does temp_vec == inv_vars^T * gauss_posteriors,
-      // which sets temp_vec to a weighted sum of the inv_vars,
-      // weighed by Gaussian posterior.
-      temp_vec.AddMatVec(1.0, gmm.inv_vars(), kTrans,
-                         gauss_posteriors, 0.0);
-      // Add to the derivative, -(this_feat .* temp_vec),
-      // which is the term that comes from the -0.5 * inv_var^T feat_sq,
-      // in the objective function (where inv_var is a vector, and feat_sq
-      // is a vector of squares of the feature values).
-      // Note: we have to do some messing about with double-precision here
-      // because the stats only come in double precision.
-      this_direct_deriv.AddVecVec(-1.0, this_feat, temp_vec, 1.0);
-      if (model_diff != NULL && weight > 0.0) { // We need to get the indirect diff.
-        // This "weight > 0.0" checks that this is the numerator stats, as the
-        // fMPE indirect diff applies only to the ML stats-- CAUTION, this
-        // code will only work as-is for fMMI (and the stats should not be
-        // canceled), due to the assumption that ML stats == num stats.
-        Vector<double> gauss_posteriors_dbl(gauss_posteriors);
-        const AccumDiagGmm &deriv_acc = model_diff->GetAcc(pdf_id);
-        // part of the derivative.  Note: we could just store the direct and
-        // indirect derivatives together in one matrix, but it makes it easier
-        // to accumulate certain diagnostics if we store them separately.
-        SubVector<BaseFloat> this_indirect_deriv(*indirect_deriv, i);
-        // note: deriv_acc.mean_accumulator() contains the derivative of
-        // the objective function w.r.t. the "x stats" accumulated for
-        // this GMM.  variance_accumulator() is the same for the "x^2 stats".
-        temp_vec_dbl.AddMatVec(1.0, deriv_acc.mean_accumulator(), kTrans,
-                               gauss_posteriors_dbl, 0.0);
-        this_indirect_deriv.AddVec(1.0, temp_vec_dbl);
-        temp_vec_dbl.AddMatVec(1.0, deriv_acc.variance_accumulator(), kTrans,
-                               gauss_posteriors_dbl, 0.0);
-        temp_vec.CopyFromVec(temp_vec_dbl); // convert to float.
-        // next line because d(x^2 stats for Gaussian)/d(feature) =
-        // 2 * (gaussian posterior) * feature.
-        this_indirect_deriv.AddVecVec(2.0, this_feat, temp_vec, 1.0);
-      }
-    }
-  }
-  return ans;
-}
-
-
-SubMatrix<BaseFloat> FmpeStats::DerivPlus() const { // const-ness not preserved.
-  KALDI_ASSERT(deriv.NumRows() != 0);
-  int32 proj_num_rows = deriv.NumRows(),
-      proj_num_cols = deriv.NumCols()/2;
-  return SubMatrix<BaseFloat>(deriv, 0, proj_num_rows,
-                              0, proj_num_cols);
-}
-SubMatrix<BaseFloat> FmpeStats::DerivMinus() const { // const-ness not preserved.
-  KALDI_ASSERT(deriv.NumRows() != 0);
-  int32 proj_num_rows = deriv.NumRows(),
-      proj_num_cols = deriv.NumCols()/2;
-  return SubMatrix<BaseFloat>(deriv, 0, proj_num_rows,
-                              proj_num_cols, proj_num_cols);
-}
-
-void FmpeStats::Init(const Fmpe &fmpe) {
-  int32 num_rows = fmpe.ProjectionTNumRows(),
-      num_cols = fmpe.ProjectionTNumCols();
-  deriv.Resize(num_rows, num_cols*2);
-
-  int32 feat_dim = fmpe.FeatDim();
-  checks.Resize(8, feat_dim);
-}
-
-void FmpeStats::AccumulateChecks(const MatrixBase<BaseFloat> &feats,
-                                 const MatrixBase<BaseFloat> &direct_deriv,
-                                 const MatrixBase<BaseFloat> &indirect_deriv) {
-  int32 T = feats.NumRows(), dim = feats.NumCols();
-  KALDI_ASSERT(direct_deriv.NumRows() == T && direct_deriv.NumCols() == dim &&
-               indirect_deriv.NumRows() == T && indirect_deriv.NumCols() == dim);
-  KALDI_ASSERT(checks.NumRows() == 8 && checks.NumCols() == dim);
-  for (int32 t = 0; t < T; t++) {
-    for (int32 d = 0; d < dim; d++) {
-      BaseFloat zero = 0.0;
-      checks(0, d) += std::max(zero, direct_deriv(t, d));
-      checks(1, d) += std::max(zero, -direct_deriv(t, d));
-      checks(2, d) += std::max(zero, indirect_deriv(t, d));
-      checks(3, d) += std::max(zero, -indirect_deriv(t, d));
-      checks(4, d) += std::max(zero, feats(t, d)*direct_deriv(t, d));
-      checks(5, d) += std::max(zero, -feats(t, d)*direct_deriv(t, d));
-      checks(6, d) += std::max(zero, feats(t, d)*indirect_deriv(t, d));
-      checks(7, d) += std::max(zero, -feats(t, d)*indirect_deriv(t, d));
-    }
-  }
-}
-
-void FmpeStats::DoChecks() {
-  if (checks.IsZero()) {
-    KALDI_LOG << "No checks will be done, probably indirect derivative was not used.";
-    return;
-  }
-  int32 dim = checks.NumCols();
-  Vector<double> shift_check(dim), shift_check2(dim), scale_check(dim), scale_check2(dim);
-  for (int32 d = 0; d < dim; d++) {
-    // shiftnumerator = direct+indirect deriv-- should be zero.
-    double shift_num = checks(0, d) - checks(1, d) + checks(2, d) - checks(3, d),
-        shift_den = checks(0, d) + checks(1, d) + checks(2, d) + checks(3, d),
-        shift_den2 = fabs(checks(0, d) - checks(1, d)) + fabs(checks(2, d) - checks(3, d));
-    shift_check(d) = shift_num / shift_den;
-    shift_check2(d) = shift_num / shift_den2;
-    double scale_num = checks(4, d) - checks(5, d) + checks(6, d) - checks(7, d),
-        scale_den = checks(4, d) + checks(5, d) + checks(6, d) + checks(7, d),
-        scale_den2 = fabs(checks(4, d) - checks(5, d)) + fabs(checks(6, d) - checks(7, d));
-    scale_check(d) = scale_num / scale_den;
-    scale_check2(d) = scale_num / scale_den2;
-  }
-
-  KALDI_LOG << "Shift-check is as follows (should be in range +- 0.01 or less)."
-            << shift_check;
-  KALDI_LOG << "Scale-check is as follows (should be in range +- 0.01 or less)."
-            << scale_check;
-  KALDI_LOG << "Shift-check(2) is as follows: most elements should be in range +-0.1: "
-            << shift_check2;
-  KALDI_LOG << "Scale-check(2) is as follows: most elements should be in range +-0.1: "
-            << scale_check2;
-}
-
-void FmpeStats::Write(std::ostream &os, bool binary) const {
-  deriv.Write(os, binary);
-  checks.Write(os, binary);
-}
-
-void FmpeStats::Read(std::istream &is, bool binary, bool add) {
-  deriv.Read(is, binary, add);
-  checks.Read(is, binary, add);
-}
-
-
-}  // End of namespace kaldi
diff --git a/src/transform/fmpe.h b/src/transform/fmpe.h
deleted file mode 100644
index f5f95938246..00000000000
--- a/src/transform/fmpe.h
+++ /dev/null
@@ -1,271 +0,0 @@
-// transform/fmpe.h
-
-// Copyright 2011-2012  Yanmin Qian  Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_TRANSFORM_FMPE_H_
-#define KALDI_TRANSFORM_FMPE_H_ 1
-
-#include <vector>
-
-#include "gmm/am-diag-gmm.h"
-#include "gmm/mle-am-diag-gmm.h"
-#include "hmm/transition-model.h"
-#include "hmm/posterior.h"
-
-namespace kaldi {
-
-
-struct FmpeOptions {
-  // Probably the easiest place to start, to understand fMPE, is the
-  // paper "Improvements to fMPE for discriminative training of features".
-  // We are simplifying a few things here.  We are getting rid of the
-  // "indirect differential"; we are adding a linear transform after the
-  // high->low dimension projection whose function is to "un-whiten" the
-  // transformed features (i.e. project from a nominally Gaussian-distributed
-  // space into our actual feature space), in order to make it unnecessary to
-  // take into account the per-dim variance during the update phase of fMPE;
-  // and the update equations are rather simpler than described in
-  // the paper; we take away some stuff, but add in the capability to
-  // do l2 regularization during the update phase.
-  
-  std::string context_expansion; // This string describes the various contexts...
-  // the easiest way to think of it is, we first generate the high-dimensional
-  // features without context expansion, and we then append the left and right
-  // frames, and also weighted averages of further-out frames, as specified by
-  // this string.  Suppose there are 1024 Gaussians and the feature dimension is
-  // 40.  In the simple way to describe it, supposing there are 9 contexts (the
-  // central frame, the left and right frames, and 6 averages of more distant
-  // frames), we generate the "offset features" of dimension (1024 * 41), then
-  // add left and right temporal context to the high-dim features so the
-  // dimension is (1024 * 41 * 9), and then project down to 40, so we train a
-  // matrix of 40 x (1024 * 41 * 9).  As described in the paper, though, we
-  // reorganize the computation for efficiency (it has to do with preserving
-  // sparsity), and we train a matrix of dimension (40 * 9) x (1024 * 41).  The
-  // (40 x 9) -> 40 transformation, which involves time as well, is dictated by
-  // these contexts.
-
-  // You probably won't want to mess with this "context_expansion" string.
-  // The most important parameter to tune is the number of Gaussians in
-  // the UBM.  Typically this will be in the range 300 to 1000.
-
-  BaseFloat post_scale; // Scale on the posterior component of the high-dim
-  // features (1 of these for every [feat-dim] of the offset features).
-  // Typically 5.0-- this just gives a bit more emphasis to these posteriors
-  // during training, like a faster learning rate.
-  
-  FmpeOptions(): context_expansion("0,1.0:-1,1.0:1,1.0:-2,0.5;-3,0.5:2,0.5;3,0.5:-4,0.5;-5,0.5:4,0.5;5,0.5:-6,0.333;-7,0.333;-8,0.333:6,0.333;7,0.333;8,0.333"),
-                post_scale(5.0) { }
-
-  void Register(OptionsItf *opts) {
-    opts->Register("post-scale", &post_scale, "Scaling constant on posterior "
-                   "element of offset features, to give it a faster learning "
-                   "rate.");
-    opts->Register("context-expansion", &context_expansion, "Specifies the "
-                   "temporal context-splicing of high-dimensional features.");
-  }
-  // We include write and read functions, since this
-  // object is included as a member of the fMPE object.
-  void Write(std::ostream &os, bool binary) const;
-  void Read(std::istream &is, bool binary);
-};
-
-struct FmpeUpdateOptions {
-  BaseFloat learning_rate; // Learning rate constant.  Like inverse of E
-  // in the papers.
-  BaseFloat l2_weight; // Weight on l2 regularization term
-  
-  FmpeUpdateOptions(): learning_rate(0.1), l2_weight(100.0) { }
-
-  void Register(OptionsItf *opts) {
-    opts->Register("learning-rate", &learning_rate,
-                   "Learning rate constant (like inverse of E in fMPE papers)");
-    opts->Register("l2-weight", &l2_weight,
-                   "Weight on l2 regularization term in objective function.");
-  }  
-};
-
-class Fmpe;
-
-struct FmpeStats {
-  FmpeStats() { };
-  void Init(const Fmpe &fmpe);
-  FmpeStats(const Fmpe &fmpe) { Init(fmpe); }
-
-  void Write(std::ostream &os, bool binary) const;
-  void Read(std::istream &is, bool binary, bool add = false);
-  
-  SubMatrix<BaseFloat> DerivPlus() const;
-  SubMatrix<BaseFloat> DerivMinus() const;
-
-  /// If we're using the indirect differential, accumulates certain quantities
-  /// that will be used in the update phase to verify that the computation
-  /// of the indirect differential was done correctly
-  void AccumulateChecks(const MatrixBase<BaseFloat> &feats,
-                        const MatrixBase<BaseFloat> &direct_deriv,
-                        const MatrixBase<BaseFloat> &indirect_deriv);
-  void DoChecks(); // Will check that stuff cancels.  Just prints
-  // messages for now.
- private:
-  Matrix<BaseFloat> deriv; // contains positive and negative parts of derivatives
-  // separately as sub-parts of the matrix, to ensure memory locality.
-
-  // checks() is an 8 x fmpe.FeatDim() matrix that stores:
-  //  (0-1) summed-deriv from direct, +ve and -ve part.
-  //  (2-3) summed-deriv from indirect, +ve and -ve part.
-  //  (4-5) (summed-deriv from direct * features), +ve and -ve part.
-  //  (6-7) (summed-deriv from indirect * features), +ve and -ve part.
-  Matrix<double> checks; // contains quantities we use to check the
-  // indirect and direct derivatives are canceling as they should.
-
-};
-
-class Fmpe {
- public:
-  Fmpe() {}
-  Fmpe(const DiagGmm &gmm, const FmpeOptions &config);
-
-  int32 FeatDim() const { return gmm_.Dim(); }
-  int32 NumGauss() const { return gmm_.NumGauss(); }
-  int32 NumContexts() const { return static_cast<int32>(contexts_.size()); }
-
-  // Note: this returns the number of rows and columns in projT_,
-  // which is the transpose of the high->intermediate dimensional
-  // projection matrix.  This is the dimension we want for the
-  // stats.
-  int32 ProjectionTNumRows() const { return (FeatDim()+1) * NumGauss(); }
-  int32 ProjectionTNumCols() const { return FeatDim() * NumContexts(); }
-
-  
-  // Computes the fMPE feature offsets and outputs them.
-  // You can add feat_in to this afterwards, if you want.
-  // Requires the Gaussian-selection info, which would normally
-  // be computed by a separate program-- this consists of
-  // lists of the top-scoring Gaussians for these features.
-  void ComputeFeatures(const MatrixBase<BaseFloat> &feat_in,
-                       const std::vector<std::vector<int32> > &gselect,
-                       Matrix<BaseFloat> *feat_out) const;
-
-  // For training-- compute the derivative w.r.t the projection matrix
-  // (we keep the positive and negative parts separately to help
-  // set the learning rates).
-  void AccStats(const MatrixBase<BaseFloat> &feat_in,
-                const std::vector<std::vector<int32> > &gselect,
-                const MatrixBase<BaseFloat> &direct_feat_deriv,
-                const MatrixBase<BaseFloat> *indirect_feat_deriv, // may be NULL
-                FmpeStats *stats) const;
-  
-  // Note: the form on disk starts with the GMM; that way,
-  // the gselect program can treat the fMPE object as if it
-  // is a GMM.
-  void Write(std::ostream &os, bool binary) const;
-  void Read(std::istream &is, bool binary);
-
-  // Returns total objf improvement, based on linear assumption.
-  BaseFloat Update(const FmpeUpdateOptions &config,
-                   const FmpeStats &stats);
-  
- private:
-  void SetContexts(std::string context_str);
-  void ComputeC(); // Computes the Cholesky factor C, from the GMM.
-  void ComputeStddevs();
-
-  // Constructs the high-dim features and applies the main projection matrix proj_.
-  void ApplyProjection(const MatrixBase<BaseFloat> &feat_in,
-                       const std::vector<std::vector<int32> > &gselect,
-                       MatrixBase<BaseFloat> *intermed_feat) const;
-
-  // The same in reverse, for computing derivatives.
-  void ApplyProjectionReverse(const MatrixBase<BaseFloat> &feat_in,
-                              const std::vector<std::vector<int32> > &gselect,
-                              const MatrixBase<BaseFloat> &intermed_feat_deriv,
-                              MatrixBase<BaseFloat> *proj_deriv_plus,
-                              MatrixBase<BaseFloat> *proj_deriv_minus) const;
-
-  // Applies the temporal context splicing from the intermediate
-  // features-- adds the result to feat_out which at this point
-  // will typically be zero.
-  void ApplyContext(const MatrixBase<BaseFloat> &intermed_feat,
-                    MatrixBase<BaseFloat> *feat_out) const;
-
-  // This is as ApplyContext but for back-propagating the derivative.
-  // Result is added to intermediate_feat_deriv which at this point will
-  // typically be zero.
-  void ApplyContextReverse(const MatrixBase<BaseFloat> &feat_deriv,
-                           MatrixBase<BaseFloat> *intermed_feat_deriv) const;
-
-  // Multiplies the feature offsets by the Cholesky matrix C.
-  void ApplyC(MatrixBase<BaseFloat> *feat_out, bool reverse = false) const;
-
-  // For computing derivatives-- multiply the derivatives by C^T,
-  // which is the "reverse" of the forward pass of multiplying
-  // by C (this is how derivatives behave...)
-  void ApplyCReverse(MatrixBase<BaseFloat> *deriv) const { ApplyC(deriv, true); }
-
-  
-  
-  DiagGmm gmm_; // The GMM used to get posteriors.
-  FmpeOptions config_;
-  Matrix<BaseFloat> stddevs_; // The standard deviations of the
-  // variances of the GMM -- computed to avoid taking a square root
-  // in the fMPE computation.   Derived variable-- not stored on
-  // disk.
-  Matrix<BaseFloat> projT_; // The transpose of the projection matrix;
-  // this is of dimension
-  // (NumGauss() * (FeatDim()+1)) * (FeatDim() * NumContexts()).
-  
-  TpMatrix<BaseFloat> C_; // Cholesky factor of the variance Sigma of
-  // features around their mean (as estimated from GMM)... applied
-  // to fMPE offset just before we add it to the features.  This allows
-  // us to simplify the fMPE update and not have to worry about
-  // the features having non-unit variance, and what effect this should
-  // have on the learning rate..
-  
-  // The following variable dictates how we use temporal context.
-  // e.g. contexts = { { (0, 1.0) }, { (-1, 1.0) }, { (1, 1.0) },
-  //                   { (-2, 0.5 ), (-3, 0.5) }, ...  }
-  std::vector<std::vector<std::pair<int32, BaseFloat> > > contexts_;
-  
-};
-
-/// Computes derivatives of the likelihood of these states (weighted),
-/// w.r.t. the feature values.  Used in fMPE training.  Note, the
-/// weights "posterior" may be positive or negative-- for MMI, MPE,
-/// etc., they will typically be of both signs.  Will resize "deriv".
-/// Returns the sum of (GMM likelihood * weight), which may be used
-/// as an approximation to the objective function.
-/// Last two parameters are optional.  See GetStatsDerivative() for
-/// or fMPE paper (ICASSP, 2005) more info on indirect derivative.
-/// Caution: if you supply the last two parameters, this function only
-/// works in the MMI case as it assumes the stats with positive weight
-/// are numerator == ml stats-- this is only the same thing in the MMI
-/// case, not fMPE.
-BaseFloat ComputeAmGmmFeatureDeriv(const AmDiagGmm &am_gmm,
-                                   const TransitionModel &trans_model,
-                                   const Posterior &posterior,
-                                   const MatrixBase<BaseFloat> &features,
-                                   Matrix<BaseFloat> *direct_deriv,
-                                   const AccumAmDiagGmm *model_diff = NULL,
-                                   Matrix<BaseFloat> *indirect_deriv = NULL);
-
-
-
-}  // End namespace kaldi
-
-
-#endif
diff --git a/src/transform/regtree-fmllr-diag-gmm-test.cc b/src/transform/regtree-fmllr-diag-gmm-test.cc
deleted file mode 100644
index 7f471d83966..00000000000
--- a/src/transform/regtree-fmllr-diag-gmm-test.cc
+++ /dev/null
@@ -1,320 +0,0 @@
-// transform/regtree-fmllr-diag-gmm-test.cc
-
-// Copyright 2009-2011  Georg Stemmer;  Saarland University
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "util/common-utils.h"
-#include "gmm/diag-gmm.h"
-#include "gmm/mle-diag-gmm.h"
-#include "gmm/mle-am-diag-gmm.h"
-#include "gmm/model-test-common.h"
-#include "transform/regtree-fmllr-diag-gmm.h"
-
-namespace kaldi {
-
-static void
-RandFullCova(Matrix<BaseFloat> *matrix) {
-  size_t dim = matrix->NumCols();
-  KALDI_ASSERT(matrix->NumCols() == matrix->NumRows());
-
-  size_t iter = 0;
-  size_t max_iter = 10000;
-  // generate random (non-singular) matrix
-  // until condition
-  Matrix<BaseFloat> tmp(dim, dim);
-  SpMatrix<BaseFloat> tmp2(dim);
-  while (iter < max_iter) {
-    tmp.SetRandn();
-    if (tmp.Cond() < 100) break;
-    iter++;
-  }
-  if (iter >= max_iter) {
-    KALDI_ERR << "Internal error: found no random covariance matrix.";
-  }
-  // tmp * tmp^T will give positive definite matrix
-  tmp2.AddMat2(1.0, tmp, kNoTrans, 0.0);
-  matrix->CopyFromSp(tmp2);
-}
-
-
-/// Generate features for a certain covariance type
-/// covariance_type == 0: full covariance
-/// covariance_type == 1: diagonal covariance
-
-enum cova_type {
-  full,
-  diag
-};
-
-static void
-generate_features(cova_type covariance_type,
-                  size_t n_gaussians,
-                  size_t dim,
-                  Matrix<BaseFloat> &trans_mat,
-                  size_t frames_per_gaussian,
-                  std::vector<Vector<BaseFloat>*> & train_feats,
-                  std::vector<Vector<BaseFloat>*> & adapt_feats
-                  ) {
-  // compute inverse of the transformation matrix
-  Matrix<BaseFloat> inv_trans_mat(dim, dim);
-  inv_trans_mat.CopyFromMat(trans_mat, kNoTrans);
-  inv_trans_mat.Invert();
-  // the untransformed means are random
-  Matrix<BaseFloat> untransformed_means(dim, n_gaussians);
-  untransformed_means.SetRandn();
-  untransformed_means.Scale(10);
-
-  // the actual means result from
-  // transformation with inv_trans_mat
-  Matrix<BaseFloat> actual_means(dim, n_gaussians);
-
-  // actual_means = inv_trans_mat * untransformed_means
-  actual_means.AddMatMat(1.0, inv_trans_mat, kNoTrans,
-                         untransformed_means, kNoTrans, 0.0);
-
-  size_t train_counter = 0;
-
-  // temporary variables
-  Vector<BaseFloat> randomvec(dim);
-  Matrix<BaseFloat> Sj(dim, dim);
-
-  // loop over all gaussians
-  for (size_t j = 0; j < n_gaussians; j++) {
-    if (covariance_type == diag) {
-      // random diagonal covariance for gaussian j
-      Sj.SetZero();
-      for (size_t d = 0; d < dim; d++) {
-        Sj(d, d) = 2*Exp(RandGauss());
-      }
-    }
-    if (covariance_type == full) {
-      // random full covariance for gaussian j
-      RandFullCova(&Sj);
-    }
-    // compute inv_trans_mat * Sj
-    Matrix<BaseFloat> tmp_matrix(dim, dim);
-    tmp_matrix.AddMatMat(1.0, inv_trans_mat, kNoTrans, Sj, kNoTrans, 0.0);
-
-    // compute features
-    for (size_t i = 0; i < frames_per_gaussian; i++) {
-      train_feats[train_counter] = new Vector<BaseFloat>(dim);
-      adapt_feats[train_counter] = new Vector<BaseFloat>(dim);
-
-      // initalize feature vector with mean of class j
-      train_feats[train_counter]->CopyColFromMat(untransformed_means, j);
-      adapt_feats[train_counter]->CopyColFromMat(actual_means, j);
-
-      // determine random vector and
-      // multiply the random vector with SJ
-      // and add it to train_feats:
-      // train_feats = train_feats + SJ * random
-      // for adapt_feats we include the invtrans_mat:
-      // adapt_feats = adapt_feats + invtrans_mat * SJ * random
-      for (size_t d = 0; d < dim; d++) {
-        randomvec(d) = RandGauss();
-      }
-      train_feats[train_counter]->AddMatVec(1.0, Sj, kNoTrans,
-                                            randomvec, 1.0);
-      adapt_feats[train_counter]->AddMatVec(1.0, tmp_matrix, kNoTrans,
-                                            randomvec, 1.0);
-      train_counter++;
-    }
-  }
-  return;
-}
-
-void UnitTestRegtreeFmllrDiagGmm(cova_type feature_type, size_t max_bclass) {
-  // dimension of the feature space
-  size_t dim = 5 + Rand() % 3;
-
-  // number of components in the data
-  size_t n_gaussians = 8;
-
-  // number of data points to generate for every gaussian
-  size_t frames_per_gaussian = 100;
-
-  // generate random transformation matrix trans_mat
-  Matrix<BaseFloat> trans_mat(dim, dim);
-  int i = 0;
-  while (i < 10000) {
-    trans_mat.SetRandn();
-    if (trans_mat.Cond() < 100) break;
-    i++;
-  }
-  std::cout << "Condition of original Trans_Mat: " << trans_mat.Cond() << '\n';
-
-  // generate many feature vectors for each of the mixture components
-  std::vector<Vector<BaseFloat>*>
-      train_feats(n_gaussians * frames_per_gaussian);
-  std::vector<Vector<BaseFloat>*>
-      adapt_feats(n_gaussians * frames_per_gaussian);
-
-  generate_features(feature_type,
-                    n_gaussians,
-                    dim,
-                    trans_mat,
-                    frames_per_gaussian,
-                    train_feats,
-                    adapt_feats);
-
-  // initial values for a GMM
-  Vector<BaseFloat> weights(1);
-  Matrix<BaseFloat> means(1, dim), vars(1, dim), invvars(1, dim);
-  for (size_t d= 0; d < dim; d++) {
-    means(0, d) = 0.0F;
-    vars(0, d) = 1.0F;
-  }
-  weights(0) = 1.0F;
-  invvars.CopyFromMat(vars);
-  invvars.InvertElements();
-
-  // new HMM with 1 state
-  DiagGmm *gmm = new DiagGmm();
-  gmm->Resize(1, dim);
-  gmm->SetWeights(weights);
-  gmm->SetInvVarsAndMeans(invvars, means);
-  gmm->ComputeGconsts();
-  GmmFlagsType flags = kGmmAll;
-  MleDiagGmmOptions opts;
-
-  AmDiagGmm *am = new AmDiagGmm();
-  am->AddPdf(*gmm);
-  AccumAmDiagGmm *est_am = new AccumAmDiagGmm();
-
-  // train HMM
-  size_t iteration = 0;
-  size_t maxiterations = 10;
-  int32 maxcomponents = n_gaussians;
-  BaseFloat loglike = 0;
-  while (iteration < maxiterations) {
-    est_am->Init(*am, flags);
-
-    loglike = 0;
-    for (size_t j = 0; j < train_feats.size(); j++) {
-      loglike += est_am->AccumulateForGmm(*am, *train_feats[j], 0, 1.0);
-    }
-    MleAmDiagGmmUpdate(opts, *est_am, flags, am, NULL, NULL);
-
-    std::cout << "Loglikelihood before iteration " << iteration << " : "
-              << std::scientific << loglike << " number of components: "
-              << am->NumGaussInPdf(0) << '\n';
-
-    if ((iteration % 3 == 1) &&
-        (am->NumGaussInPdf(0) * 2 <= maxcomponents)) {
-      size_t n = am->NumGaussInPdf(0)*2;
-      am->SplitPdf(0, n, 0.001);
-    }
-    iteration++;
-  }
-
-  // adapt HMM to the transformed feature vectors
-  iteration = 0;
-  RegtreeFmllrDiagGmmAccs * fmllr_accs = new RegtreeFmllrDiagGmmAccs();
-  RegressionTree regtree;
-
-  RegtreeFmllrOptions xform_opts;
-  xform_opts.min_count = 100 * (1 + Rand() % 10);
-  xform_opts.use_regtree = (RandUniform() < 0.5)? false : true;
-
-  size_t num_pdfs = 1;
-  Vector<BaseFloat> occs(num_pdfs);
-  for (int32 i = 0; i < static_cast<int32>(num_pdfs); i++) {
-    occs(i) = 1.0/static_cast<BaseFloat>(num_pdfs);
-  }
-  std::vector<int32> silphones;
-  regtree.BuildTree(occs, silphones, *am, max_bclass);
-  maxiterations = 10;
-  std::vector<Vector<BaseFloat>*> logdet(adapt_feats.size());
-  for (size_t j = 0; j < adapt_feats.size(); j++) {
-    logdet[j] = new Vector<BaseFloat>(1);
-    logdet[j]->operator()(0) = 0.0;
-  }
-  while (iteration < maxiterations) {
-    fmllr_accs->Init(regtree.NumBaseclasses(), dim);
-    fmllr_accs->SetZero();
-    RegtreeFmllrDiagGmm *new_fmllr = new RegtreeFmllrDiagGmm();
-    loglike = 0;
-    for (size_t j = 0; j < adapt_feats.size(); j++) {
-      loglike += fmllr_accs->AccumulateForGmm(regtree, *am, *adapt_feats[j], 0, 1.0);
-      loglike += logdet[j]->operator()(0);
-    }
-    std::cout << "FMLLR: Loglikelihood before iteration " << iteration << " : "
-              << std::scientific << loglike << '\n';
-
-    fmllr_accs->Update(regtree, xform_opts, new_fmllr, NULL, NULL);
-    std::cout << "Got " << new_fmllr->NumBaseClasses() << " baseclasses\n";
-    bool binary = (RandUniform() < 0.5)? true : false;
-    std::cout << "Writing the transform to disk.\n";
-    new_fmllr->Write(Output("tmpf", binary).Stream(), binary);
-    RegtreeFmllrDiagGmm *fmllr_read = new RegtreeFmllrDiagGmm();
-    bool binary_in;
-    Input ki("tmpf", &binary_in);
-    std::cout << "Reading the transform from disk.\n";
-    fmllr_read->Read(ki.Stream(), binary_in);
-    fmllr_read->Validate();
-
-    // transform features
-    std::vector<Vector<BaseFloat> > trans_feats(1);
-    Vector<BaseFloat> trans_logdet;
-//    new_fmllr->ComputeLogDets();
-    trans_logdet.Resize(fmllr_read->NumRegClasses());
-    fmllr_read->GetLogDets(&trans_logdet);
-    for (size_t j = 0; j < adapt_feats.size(); j++) {
-      fmllr_read->TransformFeature(*adapt_feats[j], &trans_feats);
-      logdet[j]->operator()(0) += trans_logdet(0);
-      adapt_feats[j]->CopyFromVec(trans_feats[0]);
-    }
-    iteration++;
-    delete new_fmllr;
-    delete fmllr_read;
-    
-    unlink("tmpf");
-  }
-
-//  // transform features with empty transform
-//  std::vector<Vector<BaseFloat> > trans_feats(1);
-//  RegtreeFmllrDiagGmm *empty_fmllr = new RegtreeFmllrDiagGmm();
-//  empty_fmllr->Init(0, 0);
-//  for (size_t j = 0; j < adapt_feats.size(); j++) {
-//    empty_fmllr->TransformFeature(*adapt_feats[j], &trans_feats);
-//  }
-//  delete empty_fmllr;
-
-  // clean up
-  delete fmllr_accs;
-  delete est_am;
-  delete am;
-  delete gmm;
-  DeletePointers(&logdet);
-  DeletePointers(&train_feats);
-  DeletePointers(&adapt_feats);
-}
-}  // namespace kaldi ends here
-
-int main() {
-  for (int i = 0; i <= 8; i+=2) {  // test is too slow so can't do too many
-    std::cout << "--------------------------------------" << '\n';
-    std::cout << "Test number " << i << '\n';
-    std::cout << "--\nfeatures = full\n";
-    kaldi::UnitTestRegtreeFmllrDiagGmm(kaldi::full, (i%10+1));
-    std::cout << "--\nfeatures = diag\n";
-    kaldi::UnitTestRegtreeFmllrDiagGmm(kaldi::diag, (i%10+1));
-    std::cout << "--------------------------------------" << '\n';
-  }
-  std::cout << "Test OK.\n";
-}
-
diff --git a/src/transform/regtree-fmllr-diag-gmm.cc b/src/transform/regtree-fmllr-diag-gmm.cc
deleted file mode 100644
index f34b9987183..00000000000
--- a/src/transform/regtree-fmllr-diag-gmm.cc
+++ /dev/null
@@ -1,407 +0,0 @@
-// transform/regtree-fmllr-diag-gmm.cc
-
-// Copyright 2009-2011  Saarland University;  Georg Stemmer;
-//                      Microsoft Corporation
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <utility>
-#include <vector>
-using std::vector;
-
-#include "itf/optimizable-itf.h"
-#include "transform/fmllr-diag-gmm.h"
-#include "transform/regtree-fmllr-diag-gmm.h"
-
-namespace kaldi {
-
-void RegtreeFmllrDiagGmm::Init(size_t num_xforms, size_t dim) {
-  if (num_xforms == 0) {  // empty transform
-    xform_matrices_.clear();
-    logdet_.Resize(0);
-    valid_logdet_ = false;
-    dim_ = 0;  // non-zero dimension is meaningless with empty transform
-    num_xforms_ = 0;
-  } else {
-    KALDI_ASSERT(dim != 0);  // if not empty, dim = 0 is meaningless
-    dim_ = dim;
-    num_xforms_ = num_xforms;
-    xform_matrices_.resize(num_xforms);
-    logdet_.Resize(num_xforms);
-    vector< Matrix<BaseFloat> >::iterator xform_itr = xform_matrices_.begin(),
-        xform_itr_end = xform_matrices_.end();
-    for (; xform_itr != xform_itr_end; ++xform_itr) {
-      xform_itr->Resize(dim, dim+1);
-      xform_itr->SetUnit();
-    }
-    valid_logdet_ = true;
-  }
-}
-
-void RegtreeFmllrDiagGmm::SetUnit() {
-  KALDI_ASSERT(num_xforms_ > 0 && dim_ > 0);
-  vector< Matrix<BaseFloat> >::iterator xform_itr = xform_matrices_.begin(),
-      xform_itr_end = xform_matrices_.end();
-  for (; xform_itr != xform_itr_end; ++xform_itr) {
-    xform_itr->SetUnit();
-  }
-}
-
-void RegtreeFmllrDiagGmm::Validate() {
-  if (dim_ < 0 || num_xforms_ < 0) {  // uninitialized case
-    KALDI_ERR <<"Do not call Validate() with an uninitialized object (dim = "
-              << (dim_) << ", # transforms = " << (num_xforms_);
-  } else if (dim_ * num_xforms_ == 0) {  // empty case
-    KALDI_ASSERT(num_xforms_ == 0 && dim_ == 0);
-    if (xform_matrices_.size() != 0 || logdet_.Dim() != 0) {
-      KALDI_ERR << "Number of transforms = " << (xform_matrices_.size())
-                << ", number of log-determinant terms = " << (logdet_.Dim())
-                << ". Expected number = 0";
-    }
-    return;
-  }
-
-  // non-empty case: typical usage scenario
-  if (xform_matrices_.size() != static_cast<size_t>(num_xforms_)
-      || logdet_.Dim() != num_xforms_) {
-    KALDI_ERR << "Number of transforms = " << (xform_matrices_.size())
-              << ", number of log-determinant terms = " << (logdet_.Dim())
-              << ". `Expected number = " << (num_xforms_);
-  }
-  for (int32 i = 0; i < num_xforms_; i++) {
-    if (xform_matrices_[i].NumRows() != dim_ ||
-        xform_matrices_[i].NumCols() != (dim_+1)) {
-      KALDI_ERR << "For transform " << (i) << ": inconsistent size: rows = "
-                << (xform_matrices_[i].NumRows()) << ", cols = "
-                << xform_matrices_[i].NumCols() << ", dim = " << (dim_);
-    }
-  }
-  if (bclass2xforms_.size() > 0) {
-    for (int32 i = 0, maxi = bclass2xforms_.size(); i < maxi; i++) {
-      if (bclass2xforms_[i] >= num_xforms_) {
-        KALDI_ERR << "For baseclass " << (i) << ", transform index "
-                  << (bclass2xforms_[i]) << " exceeds total transforms "
-                  << (num_xforms_);
-      }
-    }
-  } else {
-    if (num_xforms_ > 1) {
-      KALDI_WARN << "Multiple FMLLR transforms found without baseclass info.";
-    }
-  }
-}
-
-void RegtreeFmllrDiagGmm::ComputeLogDets() {
-  logdet_.Resize(num_xforms_);
-  for (int32 r = 0; r < num_xforms_; r++) {
-    SubMatrix<BaseFloat> tmp_a(xform_matrices_[r], 0, dim_, 0,
-                               dim_);
-    logdet_(r) = tmp_a.LogDet();
-    KALDI_ASSERT(!KALDI_ISNAN(logdet_(r)));
-  }
-  valid_logdet_ = true;
-}
-
-void RegtreeFmllrDiagGmm::TransformFeature(const VectorBase<BaseFloat> &in,
-                                    vector<Vector<BaseFloat> > *out) const {
-  KALDI_ASSERT(out != NULL);
-
-  if (xform_matrices_.size() == 0) {  // empty transform
-    KALDI_ASSERT(num_xforms_ == 0 && dim_ == 0 && logdet_.Dim() == 0);
-    KALDI_WARN << "Asked to apply empty feature transform. Copying instead.";
-    out->resize(1);
-    (*out)[0].Resize(in.Dim());
-    (*out)[0].CopyFromVec(in);
-    return;
-  } else {
-    KALDI_ASSERT(in.Dim() == dim_);
-    // if (!valid_logdet_)
-    // KALDI_ERR << "Must call ComputeLogDets() before transforming data.";
-    // [no need for this check].
-    Vector<BaseFloat> extended_feat(dim_ + 1);
-    extended_feat.Range(0, dim_).CopyFromVec(in);
-    extended_feat(dim_) = 1.0;
-    KALDI_ASSERT(num_xforms_ > 0);
-    out->resize(num_xforms_);
-    for (int32 xform_index = 0; xform_index < num_xforms_;
-         ++xform_index) {
-      (*out)[xform_index].Resize(dim_);
-      (*out)[xform_index].AddMatVec(1.0, xform_matrices_[xform_index],
-                                    kNoTrans, extended_feat, 0.0);
-    }
-  }
-}
-
-void RegtreeFmllrDiagGmm::Write(std::ostream &out, bool binary) const {
-  WriteToken(out, binary, "<FMLLRXFORM>");
-  WriteToken(out, binary, "<NUMXFORMS>");
-  WriteBasicType(out, binary, num_xforms_);
-  WriteToken(out, binary, "<DIMENSION>");
-  WriteBasicType(out, binary, dim_);
-
-  vector< Matrix<BaseFloat> >::const_iterator xform_itr =
-      xform_matrices_.begin(), xform_itr_end = xform_matrices_.end();
-  for (; xform_itr != xform_itr_end; ++xform_itr) {
-    WriteToken(out, binary, "<XFORM>");
-    xform_itr->Write(out, binary);
-  }
-
-  WriteToken(out, binary, "<BCLASS2XFORMS>");
-  WriteIntegerVector(out, binary, bclass2xforms_);
-  WriteToken(out, binary, "</FMLLRXFORM>");
-}
-
-
-void RegtreeFmllrDiagGmm::Read(std::istream &in, bool binary) {
-  ExpectToken(in, binary, "<FMLLRXFORM>");
-  ExpectToken(in, binary, "<NUMXFORMS>");
-  ReadBasicType(in, binary, &num_xforms_);
-  ExpectToken(in, binary, "<DIMENSION>");
-  ReadBasicType(in, binary, &dim_);
-  KALDI_ASSERT(num_xforms_ >= 0 && dim_ >= 0);  // can be 0 for empty xform
-
-  xform_matrices_.resize(num_xforms_);
-  vector< Matrix<BaseFloat> >::iterator xform_itr = xform_matrices_.begin(),
-      xform_itr_end = xform_matrices_.end();
-  for (; xform_itr != xform_itr_end; ++xform_itr) {
-    ExpectToken(in, binary, "<XFORM>");
-    xform_itr->Read(in, binary);
-    KALDI_ASSERT(xform_itr->NumRows() == (xform_itr->NumCols() - 1)
-           && xform_itr->NumRows() == dim_);
-  }
-
-  ExpectToken(in, binary, "<BCLASS2XFORMS>");
-  ReadIntegerVector(in, binary, &bclass2xforms_);
-  ExpectToken(in, binary, "</FMLLRXFORM>");
-  ComputeLogDets();  // so that the transforms can be used.
-}
-
-// ************************************************************************
-
-
-
-
-void RegtreeFmllrDiagGmmAccs::Init(size_t num_bclass, size_t dim) {
-  if (num_bclass == 0) {  // empty stats
-    DeletePointers(&baseclass_stats_);
-    baseclass_stats_.clear();
-    num_baseclasses_ = 0;
-    dim_ = 0;  // non-zero dimension is meaningless in empty stats
-  } else {
-    KALDI_ASSERT(dim != 0);  // if not empty, dim = 0 is meaningless
-    num_baseclasses_ = num_bclass;
-    dim_ = dim;
-    DeletePointers(&baseclass_stats_);
-    baseclass_stats_.resize(num_bclass);
-    for (vector<AffineXformStats*>::iterator it = baseclass_stats_.begin(),
-             end = baseclass_stats_.end(); it != end; ++it) {
-      *it = new AffineXformStats();
-      (*it)->Init(dim, dim);
-    }
-  }
-}
-
-void RegtreeFmllrDiagGmmAccs::SetZero() {
-  for (vector<AffineXformStats*>::iterator it = baseclass_stats_.begin(),
-           end = baseclass_stats_.end(); it != end; ++it) {
-    (*it)->SetZero();
-  }
-}
-
-BaseFloat RegtreeFmllrDiagGmmAccs::AccumulateForGmm(
-    const RegressionTree &regtree, const AmDiagGmm &am,
-    const VectorBase<BaseFloat> &data, size_t pdf_index, BaseFloat weight) {
-  const DiagGmm &pdf = am.GetPdf(pdf_index);
-  int32 num_comp = pdf.NumGauss();
-  Vector<BaseFloat> posterior(num_comp);
-  BaseFloat loglike = pdf.ComponentPosteriors(data, &posterior);
-  posterior.Scale(weight);
-  Vector<double> posterior_d(posterior);
-
-  Vector<double> extended_data(dim_+1);
-  extended_data.Range(0, dim_).CopyFromVec(data);
-  extended_data(dim_) = 1.0;
-  SpMatrix<double> scatter(dim_+1);
-  scatter.AddVec2(1.0, extended_data);
-
-  Vector<double> inv_var_mean(dim_);
-  Matrix<double> g_scale(baseclass_stats_.size(), dim_);  // scale on "scatter" for each dim.
-  for (int32 m = 0; m < num_comp; m++) {
-    inv_var_mean.CopyRowFromMat(pdf.means_invvars(), m);
-    int32 bclass = regtree.Gauss2BaseclassId(pdf_index, m);
-
-    baseclass_stats_[bclass]->beta_ += posterior_d(m);
-    baseclass_stats_[bclass]->K_.AddVecVec(posterior_d(m), inv_var_mean,
-                                           extended_data);
-    for (int32 d = 0; d < dim_; d++)
-      g_scale(bclass, d) +=  posterior(m) * pdf.inv_vars()(m, d);
-  }
-  for (size_t bclass = 0; bclass < baseclass_stats_.size(); bclass++) {
-    vector< SpMatrix<double> > &G = baseclass_stats_[bclass]->G_;
-    for (int32 d = 0; d < dim_; d++)
-      if (g_scale(bclass, d) != 0.0)
-        G[d].AddSp(g_scale(bclass, d), scatter);
-  }
-  return loglike;
-}
-
-void RegtreeFmllrDiagGmmAccs::AccumulateForGaussian(
-    const RegressionTree &regtree, const AmDiagGmm &am,
-    const VectorBase<BaseFloat> &data, size_t pdf_index, size_t gauss_index,
-    BaseFloat weight) {
-  const DiagGmm &pdf = am.GetPdf(pdf_index);
-  size_t dim = static_cast<size_t>(dim_);
-  Vector<double> extended_data(dim+1);
-  extended_data.Range(0, dim).CopyFromVec(data);
-  extended_data(dim) = 1.0;
-  SpMatrix<double> scatter(dim+1);
-  scatter.AddVec2(1.0, extended_data);
-  double weight_d = static_cast<double>(weight);
-
-  unsigned bclass = regtree.Gauss2BaseclassId(pdf_index, gauss_index);
-  Vector<double> inv_var_mean(dim_);
-  inv_var_mean.CopyRowFromMat(pdf.means_invvars(), gauss_index);
-
-  baseclass_stats_[bclass]->beta_ += weight_d;
-  baseclass_stats_[bclass]->K_.AddVecVec(weight_d, inv_var_mean, extended_data);
-  vector< SpMatrix<double> > &G = baseclass_stats_[bclass]->G_;
-  for (size_t d = 0; d < dim; d++)
-    G[d].AddSp((weight_d * pdf.inv_vars()(gauss_index, d)), scatter);
-}
-
-void RegtreeFmllrDiagGmmAccs::Write(std::ostream &out, bool binary) const {
-  WriteToken(out, binary, "<FMLLRACCS>");
-  WriteToken(out, binary, "<NUMBASECLASSES>");
-  WriteBasicType(out, binary, num_baseclasses_);
-  WriteToken(out, binary, "<DIMENSION>");
-  WriteBasicType(out, binary, dim_);
-  WriteToken(out, binary, "<STATS>");
-  vector<AffineXformStats*>::const_iterator itr = baseclass_stats_.begin(),
-      end = baseclass_stats_.end();
-  for ( ; itr != end; ++itr)
-    (*itr)->Write(out, binary);
-  WriteToken(out, binary, "</FMLLRACCS>");
-}
-
-void RegtreeFmllrDiagGmmAccs::Read(std::istream &in, bool binary, bool add) {
-  ExpectToken(in, binary, "<FMLLRACCS>");
-  ExpectToken(in, binary, "<NUMBASECLASSES>");
-  ReadBasicType(in, binary, &num_baseclasses_);
-  ExpectToken(in, binary, "<DIMENSION>");
-  ReadBasicType(in, binary, &dim_);
-  KALDI_ASSERT(num_baseclasses_ > 0 && dim_ > 0);
-  baseclass_stats_.resize(num_baseclasses_);
-  ExpectToken(in, binary, "<STATS>");
-  vector<AffineXformStats*>::iterator itr = baseclass_stats_.begin(),
-      end = baseclass_stats_.end();
-  for ( ; itr != end; ++itr) {
-    *itr = new AffineXformStats();
-    (*itr)->Init(dim_, dim_);
-    (*itr)->Read(in, binary, add);
-  }
-  ExpectToken(in, binary, "</FMLLRACCS>");
-}
-
-
-void RegtreeFmllrDiagGmmAccs::Update(const RegressionTree &regtree,
-                              const RegtreeFmllrOptions &opts,
-                              RegtreeFmllrDiagGmm *out_fmllr,
-                              BaseFloat *auxf_impr_out,
-                              BaseFloat *tot_t_out) const {
-  BaseFloat tot_auxf_impr = 0.0, tot_t = 0.0;
-  Matrix<BaseFloat> xform_mat(dim_, dim_+1);
-  if (opts.use_regtree) {  // estimate transforms using a regression tree
-    vector<AffineXformStats*> regclass_stats;
-    vector<int32> base2regclass;
-    bool update_xforms = regtree.GatherStats(baseclass_stats_, opts.min_count,
-                                             &base2regclass, &regclass_stats);
-    out_fmllr->set_bclass2xforms(base2regclass);
-    // If update_xforms == true, none should be negative, else all should be -1
-    if (update_xforms) {
-      out_fmllr->Init(regclass_stats.size(), dim_);
-      size_t num_rclass = regclass_stats.size();
-      for (size_t rclass_index = 0;
-           rclass_index < num_rclass; ++rclass_index) {
-        KALDI_ASSERT(regclass_stats[rclass_index]->beta_ >= opts.min_count);
-        xform_mat.SetUnit();
-        tot_t += regclass_stats[rclass_index]->beta_;
-
-        tot_auxf_impr +=
-            ComputeFmllrMatrixDiagGmmFull(xform_mat, *(regclass_stats[rclass_index]),
-                                          opts.num_iters, &xform_mat);
-        
-        out_fmllr->SetParameters(xform_mat, rclass_index);
-      }
-      KALDI_LOG << "Estimated " << num_rclass << " regression classes.";
-    } else {
-      out_fmllr->Init(1, dim_);  // Use a unit transform at the root.
-    }
-    DeletePointers(&regclass_stats);
-    // end of estimation using regression tree
-  } else {  // No regtree: estimate 1 transform per baseclass (if enough count)
-    for (int32 bclass_index = 0; bclass_index < num_baseclasses_;
-         ++bclass_index) {
-      tot_t += baseclass_stats_[bclass_index]->beta_;
-    }
-
-    out_fmllr->Init(num_baseclasses_, dim_);
-    vector<int32> base2regclass(num_baseclasses_);
-    for (int32 bclass_index = 0; bclass_index < num_baseclasses_;
-         ++bclass_index) {
-      if (baseclass_stats_[bclass_index]->beta_ >= opts.min_count) {
-        xform_mat.SetUnit();
-
-        if (opts.update_type == "full") {
-          tot_auxf_impr +=
-              ComputeFmllrMatrixDiagGmmFull(xform_mat,
-                                            *(baseclass_stats_[bclass_index]),
-                                            opts.num_iters, &xform_mat);
-        } else if (opts.update_type == "diag")
-          tot_auxf_impr +=
-              ComputeFmllrMatrixDiagGmmDiagonal(xform_mat,
-                                                *(baseclass_stats_[bclass_index]),
-                                                &xform_mat);
-        else if (opts.update_type == "offset")
-          tot_auxf_impr +=
-              ComputeFmllrMatrixDiagGmmOffset(xform_mat,
-                                              *(baseclass_stats_[bclass_index]),
-                                              &xform_mat);
-        else if (opts.update_type == "none")
-          tot_auxf_impr = 0.0;
-        else
-          KALDI_ERR << "Unknown fMLLR update type " << opts.update_type
-                    << ", fmllr-update-type must be one of \"full\"|\"diag\"|\"offset\"|\"none\"";
-
-        out_fmllr->SetParameters(xform_mat, bclass_index);
-        base2regclass[bclass_index] = bclass_index;
-      } else {
-        KALDI_WARN << "For baseclass " << (bclass_index) << " count = "
-                   << (baseclass_stats_[bclass_index]->beta_) << " < "
-                   << opts.min_count << ": not updating FMLLR";
-        base2regclass[bclass_index] = -1;
-      }
-      out_fmllr->set_bclass2xforms(base2regclass);
-    }  // end looping over all baseclasses
-  }  // end of estimating one transform per baseclass without regtree
-  if (auxf_impr_out) *auxf_impr_out = tot_auxf_impr;
-  if (tot_t_out) *tot_t_out = tot_t;
-}
-
-
-
-
-}  // namespace kaldi
-
diff --git a/src/transform/regtree-fmllr-diag-gmm.h b/src/transform/regtree-fmllr-diag-gmm.h
deleted file mode 100644
index 9130850ab8c..00000000000
--- a/src/transform/regtree-fmllr-diag-gmm.h
+++ /dev/null
@@ -1,204 +0,0 @@
-// transform/regtree-fmllr-diag-gmm.h
-
-// Copyright 2009-2011  Saarland University;  Georg Stemmer;
-//                      Microsoft Corporation
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_TRANSFORM_REGTREE_FMLLR_DIAG_GMM_H_
-#define KALDI_TRANSFORM_REGTREE_FMLLR_DIAG_GMM_H_
-
-#include <vector>
-
-#include "base/kaldi-common.h"
-#include "gmm/am-diag-gmm.h"
-#include "transform/transform-common.h"
-#include "transform/regression-tree.h"
-#include "util/kaldi-table.h"
-#include "util/kaldi-holder.h"
-
-namespace kaldi {
-
-
-///  Configuration variables for FMLLR transforms
-struct RegtreeFmllrOptions {
-  std::string update_type;  ///< "full", "diag", "offset", "none"
-  BaseFloat min_count;  ///< Minimum occupancy for computing a transform
-  int32 num_iters;      ///< Number of iterations (if using an iterative update)
-  bool use_regtree;     ///< If 'true', find transforms to generate using regression tree.
-                        ///< If 'false', generate transforms for each baseclass.
-
-  RegtreeFmllrOptions(): update_type("full"), min_count(1000.0),
-                         num_iters(10), use_regtree(true) { }
-
-  void Register(OptionsItf *opts) {
-    opts->Register("fmllr-update-type", &update_type,
-                   "Update type for fMLLR (\"full\"|\"diag\"|\"offset\"|\"none\")");
-    opts->Register("fmllr-min-count", &min_count,
-                   "Minimum count to estimate an fMLLR transform.");
-    opts->Register("fmllr-num-iters", &num_iters,
-                   "Number of fMLLR iterations (if using an iterative update).");
-    opts->Register("fmllr-use-regtree", &use_regtree,
-                   "Use a regression-class tree for fMLLR.");
-  }
-};
-
-
-/** An FMLLR (feature-space MLLR) transformation, also called CMLLR
- *  (constrained MLLR) is an affine transformation of the feature vectors.
- *  This class supports multiple transforms, and a regression tree.
- *  For a single, feature-level transformation see fmllr-diag-gmm-global.h
- *  Note: the "regression classes" are the classes after tree-clustering,
- *  which are smaller in number than the "base classes"  (these correspond
- *  to the leaves of the tree).
- */
-class RegtreeFmllrDiagGmm {
- public:
-  RegtreeFmllrDiagGmm() : dim_(-1), num_xforms_(-1), valid_logdet_(false) {}
-  explicit RegtreeFmllrDiagGmm(const RegtreeFmllrDiagGmm &other)
-      : dim_(other.dim_), num_xforms_(other.num_xforms_),
-        xform_matrices_(other.xform_matrices_), logdet_(other.logdet_),
-        valid_logdet_(other.valid_logdet_),
-        bclass2xforms_(other.bclass2xforms_) {}
-  ~RegtreeFmllrDiagGmm() {}
-  /// Allocates memory for transform matrix & bias vector
-  void Init(size_t num_xforms, size_t dim);
-  void Validate();  ///< Checks whether the various parameters are consistent
-  /// Sets transform matrix to identity and bias vector to zero
-  void SetUnit();
-  /// Computes the log-determinant of the Jacobians for each transform
-  void ComputeLogDets();
-  /// Get the transformed features for each of the transforms.
-  void TransformFeature(const VectorBase<BaseFloat> &in,
-                        std::vector< Vector<BaseFloat> > *out) const;
-  void Write(std::ostream &out_stream, bool binary) const;
-  void Read(std::istream &in_stream, bool binary);
-
-  /// Accessors
-  int32 Dim() const { return dim_; }
-  int32 NumBaseClasses() const { return bclass2xforms_.size(); }
-  int32 NumRegClasses() const { return num_xforms_; }
-  void GetXformMatrix(int32 xform_index, Matrix<BaseFloat> *out) const;
-  void GetLogDets(VectorBase<BaseFloat> *out) const;
-  int32 Base2RegClass(int32 bclass) const { return bclass2xforms_[bclass]; }
-
-  /// Mutators
-  void SetParameters(const MatrixBase<BaseFloat> &mat, size_t regclass);
-  void set_bclass2xforms(const std::vector<int32> &in) { bclass2xforms_ = in; }
-
- private:
-  int32 dim_;             ///< Dimension of feature vectors
-  int32 num_xforms_;            ///< Number of transform matrices
-  std::vector< Matrix<BaseFloat> > xform_matrices_;  ///< Transform matrices
-  Vector<BaseFloat> logdet_;    ///< Log-determinants of the Jacobians
-  bool valid_logdet_;           ///< Whether logdets are for current transforms
-  /// For each baseclass index of which transform to use; -1 => no xform
-  std::vector<int32> bclass2xforms_;
-
-  void operator = (const RegtreeFmllrDiagGmm&);  // Disallow assignment operator
-};
-
-inline void RegtreeFmllrDiagGmm::GetXformMatrix(int32 xform_index,
-                                              Matrix<BaseFloat> *out) const {
-  if (xform_index >= num_xforms_) {
-    KALDI_ERR << "Index (" << xform_index << ") out of range [0, "
-        << num_xforms_ << "]";
-  }
-  out->Resize(dim_, dim_ + 1);
-  out->CopyFromMat(xform_matrices_[xform_index], kNoTrans);
-}
-
-inline void RegtreeFmllrDiagGmm::SetParameters(const MatrixBase<BaseFloat> &mat,
-                                        size_t regclass) {
-  xform_matrices_[regclass].CopyFromMat(mat, kNoTrans);
-  valid_logdet_ = false;
-}
-
-inline void RegtreeFmllrDiagGmm::GetLogDets(VectorBase<BaseFloat> *out) const {
-  KALDI_ASSERT(valid_logdet_ && out->Dim() == logdet_.Dim());
-  out->CopyFromVec(logdet_);
-}
-
-typedef TableWriter< KaldiObjectHolder<RegtreeFmllrDiagGmm> >  RegtreeFmllrDiagGmmWriter;
-typedef RandomAccessTableReader< KaldiObjectHolder<RegtreeFmllrDiagGmm> >
-            RandomAccessRegtreeFmllrDiagGmmReader;
-typedef RandomAccessTableReaderMapped< KaldiObjectHolder<RegtreeFmllrDiagGmm> >
-            RandomAccessRegtreeFmllrDiagGmmReaderMapped;
-typedef SequentialTableReader< KaldiObjectHolder<RegtreeFmllrDiagGmm> >  RegtreeFmllrDiagGmmSeqReader;
-
-/** \class RegtreeFmllrDiagGmmAccs
- *  Class for computing the accumulators needed for the maximum-likelihood
- *  estimate of FMLLR transforms for an acoustic model that uses diagonal
- *  Gaussian mixture models as emission densities.
- */
-class RegtreeFmllrDiagGmmAccs {
- public:
-  RegtreeFmllrDiagGmmAccs() : num_baseclasses_(-1), dim_(-1) {}
-  ~RegtreeFmllrDiagGmmAccs() { DeletePointers(&baseclass_stats_); }
-
-  void Init(size_t num_bclass, size_t dim);
-  void SetZero();
-
-  /// Accumulate stats for a single GMM in the model; returns log likelihood.
-  /// This does not work if the features have already been transformed
-  /// with multiple feature transforms (so you can't use use this to
-  /// do a 2nd pass of regression-tree fMLLR estimation, which as I write
-  /// (Dan, 2016) I'm not sure that this framework even supports.
-  BaseFloat AccumulateForGmm(const RegressionTree &regtree,
-                             const AmDiagGmm &am,
-                             const VectorBase<BaseFloat> &data,
-                             size_t pdf_index, BaseFloat weight);
-
-  /// Accumulate stats for a single Gaussian component in the model.
-  void AccumulateForGaussian(const RegressionTree &regtree,
-                             const AmDiagGmm &am,
-                             const VectorBase<BaseFloat> &data,
-                             size_t pdf_index, size_t gauss_index,
-                             BaseFloat weight);
-
-  void Update(const RegressionTree &regtree, const RegtreeFmllrOptions &opts,
-              RegtreeFmllrDiagGmm *out_fmllr, BaseFloat *auxf_impr,
-              BaseFloat *tot_t) const;
-
-  void Write(std::ostream &out_stream, bool binary) const;
-  void Read(std::istream &in_stream, bool binary, bool add);
-
-  /// Accessors
-  int32 Dim() const { return dim_; }
-  int32 NumBaseClasses() const { return num_baseclasses_; }
-  const std::vector<AffineXformStats*> &baseclass_stats() const {
-    return baseclass_stats_;
-  }
-
- private:
-  /// Per-baseclass stats; used for accumulation
-  std::vector<AffineXformStats*> baseclass_stats_;
-  /// Number of baseclasses
-  int32 num_baseclasses_;
-  /// Dimension of feature vectors
-  int32 dim_;
-
-  // Cannot have copy constructor and assigment operator
-  KALDI_DISALLOW_COPY_AND_ASSIGN(RegtreeFmllrDiagGmmAccs);
-};
-
-
-
-
-}  // namespace kaldi
-
-#endif  // KALDI_TRANSFORM_REGTREE_FMLLR_DIAG_GMM_H_
diff --git a/src/transform/regtree-mllr-diag-gmm-test.cc b/src/transform/regtree-mllr-diag-gmm-test.cc
deleted file mode 100644
index 812a78d56d2..00000000000
--- a/src/transform/regtree-mllr-diag-gmm-test.cc
+++ /dev/null
@@ -1,194 +0,0 @@
-// transform/regtree-mllr-diag-gmm-test.cc
-
-// Copyright 2009-2011   Saarland University
-// Author:  Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "util/common-utils.h"
-#include "gmm/diag-gmm.h"
-#include "gmm/mle-diag-gmm.h"
-#include "gmm/mle-am-diag-gmm.h"
-#include "gmm/model-test-common.h"
-#include "transform/regtree-mllr-diag-gmm.h"
-
-using kaldi::int32;
-using kaldi::BaseFloat;
-using kaldi::RegtreeMllrDiagGmmAccs;
-namespace ut = kaldi::unittest;
-
-void TestMllrAccsIO(const kaldi::AmDiagGmm &am_gmm,
-                    const kaldi::RegressionTree &regtree,
-                    const RegtreeMllrDiagGmmAccs &accs,
-                    const kaldi::Matrix<BaseFloat> adapt_data) {
-  // First, non-binary write
-  accs.Write(kaldi::Output("tmpf", false).Stream(), false);
-
-  kaldi::RegtreeMllrDiagGmm mllr;
-  kaldi::RegtreeMllrOptions opts;
-  opts.min_count = 100;
-  opts.use_regtree = false;
-  accs.Update(regtree, opts, &mllr, NULL, NULL);
-  kaldi::AmDiagGmm am1;
-  am1.CopyFromAmDiagGmm(am_gmm);
-  mllr.TransformModel(regtree, &am1);
-
-  BaseFloat loglike = 0;
-  int32 npoints = adapt_data.NumRows();
-  for (int32 j = 0; j < npoints; j++) {
-    loglike += am1.LogLikelihood(0, adapt_data.Row(j));
-  }
-  KALDI_LOG << "Per-frame loglike after adaptation = " << (loglike/npoints)
-            << " over " << npoints << " frames.";
-
-  size_t num_comp2 = 1 + kaldi::RandInt(0, 9);  // random number of mixtures
-  int32 dim = am_gmm.Dim();
-  kaldi::DiagGmm gmm2;
-  ut::InitRandDiagGmm(dim, num_comp2, &gmm2);
-  kaldi::Vector<BaseFloat> data(dim);
-  gmm2.Generate(&data);
-  BaseFloat loglike1 = am1.LogLikelihood(0, data);
-//  KALDI_LOG << "LL0 = " << loglike0 << "; LL1 = " << loglike1;
-
-  KALDI_LOG << "Test ASCII IO.";
-  bool binary_in;
-  kaldi::RegtreeMllrDiagGmm mllr1;
-  RegtreeMllrDiagGmmAccs *accs1 = new RegtreeMllrDiagGmmAccs();
-  // Non-binary read
-  kaldi::Input ki1("tmpf", &binary_in);
-  accs1->Read(ki1.Stream(), binary_in, false);
-  accs1->Update(regtree, opts, &mllr1, NULL, NULL);
-  delete accs1;
-  kaldi::AmDiagGmm am2;
-  am2.CopyFromAmDiagGmm(am_gmm);
-  mllr.TransformModel(regtree, &am2);
-  BaseFloat loglike2 = am2.LogLikelihood(0, data);
-//  KALDI_LOG << "LL1 = " << loglike1 << "; LL2 = " << loglike2;
-  kaldi::AssertEqual(loglike1, loglike2, 1e-6);
-
-  kaldi::RegtreeMllrDiagGmm mllr2;
-  // Next, binary write
-  KALDI_LOG << "Test Binary IO.";
-  accs.Write(kaldi::Output("tmpfb", true).Stream(), true);
-  RegtreeMllrDiagGmmAccs *accs2 = new RegtreeMllrDiagGmmAccs();
-  // Binary read
-  kaldi::Input ki2("tmpfb", &binary_in);
-  accs2->Read(ki2.Stream(), binary_in, false);
-  accs2->Update(regtree, opts, &mllr2, NULL, NULL);
-  delete accs2;
-  kaldi::AmDiagGmm am3;
-  am3.CopyFromAmDiagGmm(am_gmm);
-  mllr.TransformModel(regtree, &am3);
-  BaseFloat loglike3 = am3.LogLikelihood(0, data);
-//  KALDI_LOG << "LL1 = " << loglike1 << "; LL3 = " << loglike3;
-  kaldi::AssertEqual(loglike1, loglike3, 1e-6);
-  
-  unlink("tmpf");
-  unlink("tmpfb");
-}
-
-void TestXformMean(const kaldi::AmDiagGmm &am_gmm,
-                   const kaldi::RegressionTree &regtree,
-                   const RegtreeMllrDiagGmmAccs &accs,
-                   const kaldi::Matrix<BaseFloat> adapt_data) {
-  kaldi::RegtreeMllrDiagGmm mllr;
-  kaldi::RegtreeMllrOptions opts;
-  opts.min_count = 100;
-  opts.use_regtree = false;
-  accs.Update(regtree, opts, &mllr, NULL, NULL);
-
-  kaldi::AmDiagGmm am1;
-  am1.CopyFromAmDiagGmm(am_gmm);
-  mllr.TransformModel(regtree, &am1);
-
-  kaldi::DiagGmm tmp_pdf;
-  tmp_pdf.CopyFromDiagGmm(am_gmm.GetPdf(0));
-  kaldi::Matrix<BaseFloat> tmp_means(am_gmm.GetPdf(0).NumGauss(), am_gmm.Dim());
-  mllr.GetTransformedMeans(regtree, am_gmm, 0, &tmp_means);
-  tmp_pdf.SetInvVarsAndMeans(tmp_pdf.inv_vars(), tmp_means);
-  tmp_pdf.ComputeGconsts();
-
-  BaseFloat loglike0 = 0, loglike = 0;
-  int32 npoints = adapt_data.NumRows();
-  for (int32 j = 0; j < npoints; j++) {
-    loglike0 += am1.LogLikelihood(0, adapt_data.Row(j));
-    loglike += tmp_pdf.LogLikelihood(adapt_data.Row(j));
-  }
-  KALDI_LOG << "Per-frame loglike after adaptation = " << (loglike0/npoints)
-            << " over " << npoints << " frames.";
-//  KALDI_LOG << "LL0 = " << loglike0 << "; LL = " << loglike;
-  kaldi::AssertEqual(loglike0, loglike, 1e-6);
-
-  kaldi::Matrix<BaseFloat> tmp_means2(am_gmm.GetPdf(0).NumGauss(), am_gmm.Dim());
-  mllr.GetTransformedMeans(regtree, am_gmm, 0, &tmp_means2);
-  tmp_pdf.SetInvVarsAndMeans(tmp_pdf.inv_vars(), tmp_means2);
-  tmp_pdf.ComputeGconsts();
-
-  BaseFloat loglike1 = 0;
-  for (int32 j = 0; j < npoints; j++) {
-    loglike1 += tmp_pdf.LogLikelihood(adapt_data.Row(j));
-  }
-//  KALDI_LOG << "LL = " << loglike << "; LL1 = " << loglike1;
-  kaldi::AssertEqual(loglike, loglike1, 1e-6);
-}
-
-
-void UnitTestRegtreeMllrDiagGmm() {
-  size_t dim = 1 + kaldi::RandInt(1, 9);  // random dimension of the gmm
-  size_t num_comp = 1 + kaldi::RandInt(0, 5);  // random number of mixtures
-  kaldi::DiagGmm gmm;
-  ut::InitRandDiagGmm(dim, num_comp, &gmm);
-  kaldi::AmDiagGmm am_gmm;
-  am_gmm.Init(gmm, 1);
-
-  size_t num_comp2 = 1 + kaldi::RandInt(0, 5);  // random number of mixtures
-  kaldi::DiagGmm gmm2;
-  ut::InitRandDiagGmm(dim, num_comp2, &gmm2);
-  int32 npoints = dim*(dim+1)*10 + 500;
-  kaldi::Matrix<BaseFloat> adapt_data(npoints, dim);
-  for (int32 j = 0; j < npoints; j++) {
-    kaldi::SubVector<BaseFloat> row(adapt_data, j);
-    gmm2.Generate(&row);
-  }
-
-  kaldi::RegressionTree regtree;
-  std::vector<int32> sil_indices;
-  kaldi::Vector<BaseFloat> state_occs(1);
-  state_occs(0) = npoints;
-  regtree.BuildTree(state_occs, sil_indices, am_gmm, 1);
-  int32 num_bclass = regtree.NumBaseclasses();
-
-  kaldi::RegtreeMllrDiagGmmAccs accs;
-  BaseFloat loglike = 0;
-  accs.Init(num_bclass, dim);
-  for (int32 j = 0; j < npoints; j++) {
-    loglike += accs.AccumulateForGmm(regtree, am_gmm, adapt_data.Row(j),
-                                     0, 1.0);
-  }
-  KALDI_LOG << "Per-frame loglike during accumulations = " << (loglike/npoints)
-            << " over " << npoints << " frames.";
-
-  TestMllrAccsIO(am_gmm, regtree, accs, adapt_data);
-  TestXformMean(am_gmm, regtree, accs, adapt_data);
-}
-
-int main() {
-  kaldi::g_kaldi_verbose_level = 5;
-  for (int i = 0; i <= 10; i++)
-    UnitTestRegtreeMllrDiagGmm();
-  std::cout << "Test OK.\n";
-}
-
diff --git a/src/transform/regtree-mllr-diag-gmm.cc b/src/transform/regtree-mllr-diag-gmm.cc
deleted file mode 100644
index 7c30d66ff56..00000000000
--- a/src/transform/regtree-mllr-diag-gmm.cc
+++ /dev/null
@@ -1,398 +0,0 @@
-// transform/regtree-mllr-diag-gmm.cc
-
-// Copyright 2009-2011  Saarland University;  Jan Silovsky
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <utility>
-using std::pair;
-#include <vector>
-using std::vector;
-
-#include "transform/regtree-mllr-diag-gmm.h"
-
-namespace kaldi {
-
-void RegtreeMllrDiagGmm::Init(int32 num_xforms, int32 dim) {
-  if (num_xforms == 0) {  // empty transform
-    xform_matrices_.clear();
-    dim_ = 0;  // non-zero dimension is meaningless with empty transform
-    num_xforms_ = 0;
-    bclass2xforms_.clear();
-  } else {
-    KALDI_ASSERT(dim != 0);  // if not empty, dim = 0 is meaningless
-    dim_ = dim;
-    num_xforms_ = num_xforms;
-    xform_matrices_.resize(num_xforms);
-    vector< Matrix<BaseFloat> >::iterator xform_itr = xform_matrices_.begin(),
-                                      xform_itr_end = xform_matrices_.end();
-    for (; xform_itr != xform_itr_end; ++xform_itr) {
-      xform_itr->Resize(dim, dim+1);
-      xform_itr->SetUnit();
-    }
-  }
-}
-
-void RegtreeMllrDiagGmm::SetUnit() {
-  vector< Matrix<BaseFloat> >::iterator xform_itr = xform_matrices_.begin(),
-                                    xform_itr_end = xform_matrices_.end();
-  for (; xform_itr != xform_itr_end; ++xform_itr) {
-    xform_itr->SetUnit();
-  }
-}
-
-void RegtreeMllrDiagGmm::TransformModel(const RegressionTree &regtree,
-                                        AmDiagGmm *am) {
-  KALDI_ASSERT(static_cast<int32>(bclass2xforms_.size()) ==
-               regtree.NumBaseclasses());
-  Vector<BaseFloat> extended_mean(dim_+1), xformed_mean(dim_);
-  for (int32 bclass_index = 0, num_bclasses = regtree.NumBaseclasses();
-       bclass_index < num_bclasses; ++bclass_index) {
-    int32 xform_index;
-    if ((xform_index = bclass2xforms_[bclass_index]) > -1) {
-      KALDI_ASSERT(xform_index < num_xforms_);
-      const vector< pair<int32, int32> > &bclass =
-          regtree.GetBaseclass(bclass_index);
-      for (vector< pair<int32, int32> >::const_iterator itr = bclass.begin(),
-          end = bclass.end(); itr != end; ++itr) {
-        SubVector<BaseFloat> tmp_mean(extended_mean.Range(0, dim_));
-        am->GetGaussianMean(itr->first, itr->second, &tmp_mean);
-        extended_mean(dim_) = 1.0;
-        xformed_mean.AddMatVec(1.0, xform_matrices_[xform_index], kNoTrans,
-                               extended_mean, 0.0);
-        am->SetGaussianMean(itr->first, itr->second, xformed_mean);
-      }  // end iterating over Gaussians in baseclass
-    }  // else keep the means untransformed
-  }  // end iterating over all baseclasses
-  am->ComputeGconsts();
-}
-
-
-void RegtreeMllrDiagGmm::GetTransformedMeans(const RegressionTree &regtree,
-                                             const AmDiagGmm &am,
-                                             int32 pdf_index,
-                                             MatrixBase<BaseFloat> *out) const {
-  KALDI_ASSERT(static_cast<int32>(bclass2xforms_.size()) ==
-               regtree.NumBaseclasses());
-  int32 num_gauss = am.GetPdf(pdf_index).NumGauss();
-  KALDI_ASSERT(out->NumRows() == num_gauss && out->NumCols() == dim_);
-
-  Vector<BaseFloat> extended_mean(dim_+1);
-  extended_mean(dim_) = 1.0;
-
-  for (int32 gauss_index = 0; gauss_index < num_gauss; gauss_index++) {
-    int32 bclass_index = regtree.Gauss2BaseclassId(pdf_index, gauss_index);
-    int32 xform_index = bclass2xforms_[bclass_index];
-    if (xform_index > -1) {  // use a transform
-      KALDI_ASSERT(xform_index < num_xforms_);
-      SubVector<BaseFloat> tmp_mean(extended_mean.Range(0, dim_));
-      am.GetGaussianMean(pdf_index, gauss_index, &tmp_mean);
-      SubVector<BaseFloat> out_row(out->Row(gauss_index));
-      out_row.AddMatVec(1.0, xform_matrices_[xform_index], kNoTrans,
-                        extended_mean, 0.0);
-    } else {  // Copy untransformed mean
-      SubVector<BaseFloat> out_row(out->Row(gauss_index));
-      am.GetGaussianMean(pdf_index, gauss_index, &out_row);
-    }
-  }
-}
-
-
-void RegtreeMllrDiagGmm::Write(std::ostream &out, bool binary) const {
-  WriteToken(out, binary, "<MLLRXFORM>");
-  WriteToken(out, binary, "<NUMXFORMS>");
-  WriteBasicType(out, binary, num_xforms_);
-  WriteToken(out, binary, "<DIMENSION>");
-  WriteBasicType(out, binary, dim_);
-
-  vector< Matrix<BaseFloat> >::const_iterator xform_itr =
-      xform_matrices_.begin(), xform_itr_end = xform_matrices_.end();
-  for (; xform_itr != xform_itr_end; ++xform_itr) {
-    WriteToken(out, binary, "<XFORM>");
-    xform_itr->Write(out, binary);
-  }
-
-  WriteToken(out, binary, "<BCLASS2XFORMS>");
-  WriteIntegerVector(out, binary, bclass2xforms_);
-  WriteToken(out, binary, "</MLLRXFORM>");
-}
-
-
-void RegtreeMllrDiagGmm::Read(std::istream &in, bool binary) {
-  ExpectToken(in, binary, "<MLLRXFORM>");
-  ExpectToken(in, binary, "<NUMXFORMS>");
-  ReadBasicType(in, binary, &num_xforms_);
-  ExpectToken(in, binary, "<DIMENSION>");
-  ReadBasicType(in, binary, &dim_);
-  KALDI_ASSERT(num_xforms_ >= 0 && dim_ >= 0);  // can be 0 for empty xform
-
-  xform_matrices_.resize(num_xforms_);
-  vector< Matrix<BaseFloat> >::iterator xform_itr = xform_matrices_.begin(),
-                                    xform_itr_end = xform_matrices_.end();
-  for (; xform_itr != xform_itr_end; ++xform_itr) {
-    ExpectToken(in, binary, "<XFORM>");
-    xform_itr->Read(in, binary);
-    KALDI_ASSERT(xform_itr->NumRows() == (xform_itr->NumCols() - 1)
-                 && xform_itr->NumRows() == dim_);
-  }
-
-  ExpectToken(in, binary, "<BCLASS2XFORMS>");
-  ReadIntegerVector(in, binary, &bclass2xforms_);
-  ExpectToken(in, binary, "</MLLRXFORM>");
-}
-
-// ************************************************************************
-
-void RegtreeMllrDiagGmmAccs::Init(int32 num_bclass, int32 dim) {
-  if (num_bclass == 0) {  // empty stats
-    DeletePointers(&baseclass_stats_);
-    baseclass_stats_.clear();
-    num_baseclasses_ = 0;
-    dim_ = 0;  // non-zero dimension is meaningless in empty stats
-  } else {
-    KALDI_ASSERT(dim != 0);  // if not empty, dim = 0 is meaningless
-    num_baseclasses_ = num_bclass;
-    dim_ = dim;
-    baseclass_stats_.resize(num_baseclasses_);
-    for (vector<AffineXformStats*>::iterator it = baseclass_stats_.begin(),
-        end = baseclass_stats_.end(); it != end; ++it) {
-      *it = new AffineXformStats();
-      (*it)->Init(dim_, dim_);
-    }
-  }
-}
-
-void RegtreeMllrDiagGmmAccs::SetZero() {
-  for (vector<AffineXformStats*>::iterator it = baseclass_stats_.begin(),
-      end = baseclass_stats_.end(); it != end; ++it) {
-    (*it)->SetZero();
-  }
-}
-
-BaseFloat RegtreeMllrDiagGmmAccs::AccumulateForGmm(
-    const RegressionTree &regtree, const AmDiagGmm &am,
-    const VectorBase<BaseFloat> &data, int32 pdf_index, BaseFloat weight) {
-  const DiagGmm &pdf = am.GetPdf(pdf_index);
-  int32 num_comp = static_cast<int32>(pdf.NumGauss());
-  Vector<BaseFloat> posterior(num_comp);
-  BaseFloat loglike = pdf.ComponentPosteriors(data, &posterior);
-  posterior.Scale(weight);
-  Vector<double> posterior_d(posterior);
-
-  Vector<double> data_d(data);
-  Vector<double> inv_var_x(dim_);
-  Vector<double> extended_mean(dim_+1);
-  SpMatrix<double> mean_scatter(dim_+1);
-
-  for (int32 m = 0; m < num_comp; m++) {
-    unsigned bclass = regtree.Gauss2BaseclassId(pdf_index, m);
-    inv_var_x.CopyFromVec(pdf.inv_vars().Row(m));
-    inv_var_x.MulElements(data_d);
-
-    // Using SubVector to stop compiler warning
-    SubVector<double> tmp_mean(extended_mean, 0, dim_);
-    pdf.GetComponentMean(m, &tmp_mean);  // modifies extended_mean
-    extended_mean(dim_) = 1.0;
-    mean_scatter.SetZero();
-    mean_scatter.AddVec2(1.0, extended_mean);
-
-    baseclass_stats_[bclass]->beta_ += posterior_d(m);
-    baseclass_stats_[bclass]->K_.AddVecVec(posterior_d(m), inv_var_x,
-                                           extended_mean);
-    vector< SpMatrix<double> > &G = baseclass_stats_[bclass]->G_;
-    for (int32 d = 0; d < dim_; d++)
-      G[d].AddSp((posterior_d(m) * pdf.inv_vars()(m, d)), mean_scatter);
-  }
-  return loglike;
-}
-
-void RegtreeMllrDiagGmmAccs::AccumulateForGaussian(
-    const RegressionTree &regtree, const AmDiagGmm &am,
-    const VectorBase<BaseFloat> &data, int32 pdf_index, int32 gauss_index,
-    BaseFloat weight) {
-  const DiagGmm &pdf = am.GetPdf(pdf_index);
-  Vector<double> data_d(data);
-  Vector<double> inv_var_x(dim_);
-  Vector<double> extended_mean(dim_+1);
-  double weight_d = static_cast<double>(weight);
-
-  unsigned bclass = regtree.Gauss2BaseclassId(pdf_index, gauss_index);
-  inv_var_x.CopyFromVec(pdf.inv_vars().Row(gauss_index));
-  inv_var_x.MulElements(data_d);
-
-  // Using SubVector to stop compiler warning
-  SubVector<double> tmp_mean(extended_mean, 0, dim_);
-  pdf.GetComponentMean(gauss_index, &tmp_mean);  // modifies extended_mean
-  extended_mean(dim_) = 1.0;
-  SpMatrix<double> mean_scatter(dim_+1);
-  mean_scatter.AddVec2(1.0, extended_mean);
-
-  baseclass_stats_[bclass]->beta_ += weight_d;
-  baseclass_stats_[bclass]->K_.AddVecVec(weight_d, inv_var_x, extended_mean);
-  vector< SpMatrix<double> > &G = baseclass_stats_[bclass]->G_;
-  for (int32 d = 0; d < dim_; d++)
-    G[d].AddSp((weight_d * pdf.inv_vars()(gauss_index, d)), mean_scatter);
-}
-
-void RegtreeMllrDiagGmmAccs::Write(std::ostream &out, bool binary) const {
-  WriteToken(out, binary, "<MLLRACCS>");
-  WriteToken(out, binary, "<NUMBASECLASSES>");
-  WriteBasicType(out, binary, num_baseclasses_);
-  WriteToken(out, binary, "<DIMENSION>");
-  WriteBasicType(out, binary, dim_);
-  WriteToken(out, binary, "<STATS>");
-  vector<AffineXformStats*>::const_iterator itr = baseclass_stats_.begin(),
-                                            end = baseclass_stats_.end();
-  for ( ; itr != end; ++itr)
-    (*itr)->Write(out, binary);
-  WriteToken(out, binary, "</MLLRACCS>");
-}
-
-void RegtreeMllrDiagGmmAccs::Read(std::istream &in, bool binary, bool add) {
-  ExpectToken(in, binary, "<MLLRACCS>");
-  ExpectToken(in, binary, "<NUMBASECLASSES>");
-  ReadBasicType(in, binary, &num_baseclasses_);
-  ExpectToken(in, binary, "<DIMENSION>");
-  ReadBasicType(in, binary, &dim_);
-  KALDI_ASSERT(num_baseclasses_ > 0 && dim_ > 0);
-  baseclass_stats_.resize(num_baseclasses_);
-  ExpectToken(in, binary, "<STATS>");
-  vector<AffineXformStats*>::iterator itr = baseclass_stats_.begin(),
-                                      end = baseclass_stats_.end();
-  for ( ; itr != end; ++itr) {
-    *itr = new AffineXformStats();
-    (*itr)->Init(dim_, dim_);
-    (*itr)->Read(in, binary, add);
-  }
-  ExpectToken(in, binary, "</MLLRACCS>");
-}
-
-static void ComputeMllrMatrix(const Matrix<double> &K,
-                              const vector< SpMatrix<double> > &G,
-                              Matrix<BaseFloat> *out) {
-  int32 dim = G.size();
-  Matrix<double> tmp_out(dim, dim+1);
-  for (int32 d = 0; d < dim; d++) {
-    if (G[d].Cond() > 1.0e+9) {
-      KALDI_WARN << "Dim " << d << ": Badly conditioned stats. Setting MLLR "
-                 << "transform to unit.";
-      tmp_out.SetUnit();
-      break;
-    }
-    SpMatrix<double> inv_g(G[d]);
-//    KALDI_LOG << "Dim " << d << ": G: max = " << inv_g.Max() << ", min = "
-//              << inv_g.Min() << ", log det = " << inv_g.LogDet(NULL)
-//              << ", cond = " << inv_g.Cond();
-    inv_g.Invert();
-//    KALDI_LOG << "Inv G: max = " << inv_g.Max() << ", min = " << inv_g.Min()
-//              << ", log det = " << inv_g.LogDet(NULL) << ", cond = "
-//              << inv_g.Cond();
-    tmp_out.Row(d).AddSpVec(1.0, inv_g, K.Row(d), 0.0);
-  }
-  out->CopyFromMat(tmp_out, kNoTrans);
-}
-
-static BaseFloat MllrAuxFunction(const Matrix<BaseFloat> &xform,
-                                 const AffineXformStats &stats) {
-  int32 dim = stats.G_.size();
-  Matrix<double> xform_d(xform);
-  Vector<double> xform_row_g(dim + 1);
-  SubMatrix<double> A(xform_d, 0, dim, 0, dim);
-  double obj = TraceMatMat(xform_d, stats.K_, kTrans);
-  for (int32 d = 0; d < dim; d++) {
-    xform_row_g.AddSpVec(1.0, stats.G_[d], xform_d.Row(d), 0.0);
-    obj -= 0.5 * VecVec(xform_row_g, xform_d.Row(d));
-  }
-  return obj;
-}
-
-void RegtreeMllrDiagGmmAccs::Update(const RegressionTree &regtree,
-                                    const RegtreeMllrOptions &opts,
-                                    RegtreeMllrDiagGmm *out_mllr,
-                                    BaseFloat *auxf_impr,
-                                    BaseFloat *t) const {
-  BaseFloat tot_auxf_impr = 0, tot_t = 0;
-  Matrix<BaseFloat> xform_mat(dim_, dim_ + 1);
-  if (opts.use_regtree) {  // estimate transforms using a regression tree
-    vector<AffineXformStats*> regclass_stats;
-    vector<int32> base2regclass;
-    bool update_xforms = regtree.GatherStats(baseclass_stats_, opts.min_count,
-                                             &base2regclass, &regclass_stats);
-    out_mllr->set_bclass2xforms(base2regclass);
-    // If update_xforms == true, none should be negative, else all should be -1
-    if (update_xforms) {
-      out_mllr->Init(regclass_stats.size(), dim_);
-      for (int32 rclass_index = 0, num_rclass = regclass_stats.size();
-           rclass_index < num_rclass; ++rclass_index) {
-        KALDI_ASSERT(regclass_stats[rclass_index]->beta_ >= opts.min_count);
-        xform_mat.SetUnit();
-        BaseFloat obj_old = MllrAuxFunction(xform_mat,
-                                            *(regclass_stats[rclass_index]));
-        ComputeMllrMatrix(regclass_stats[rclass_index]->K_,
-                          regclass_stats[rclass_index]->G_, &xform_mat);
-        out_mllr->SetParameters(xform_mat, rclass_index);
-        BaseFloat obj_new = MllrAuxFunction(xform_mat,
-                                            *(regclass_stats[rclass_index]));
-        KALDI_LOG << "MLLR: regclass " << (rclass_index)
-                  << ": Objective function impr per frame is "
-                  << ((obj_new - obj_old)/regclass_stats[rclass_index]->beta_)
-                  << " over " << regclass_stats[rclass_index]->beta_
-                  << " frames.";
-        KALDI_ASSERT(obj_new >= obj_old - (std::abs(obj_new)+std::abs(obj_old))*1.0e-05);
-        tot_t += regclass_stats[rclass_index]->beta_;
-        tot_auxf_impr += obj_new - obj_old;
-      }
-    } else {
-      out_mllr->Init(1, dim_);  // Use a unit transform at the root.
-    }
-    DeletePointers(&regclass_stats);
-    // end of estimation using regression tree
-  } else {  // estimate 1 transform per baseclass (if enough count)
-    out_mllr->Init(num_baseclasses_, dim_);
-    vector<int32> base2xforms(num_baseclasses_, -1);
-    for (int32 bclass_index = 0; bclass_index < num_baseclasses_;
-         ++bclass_index) {
-      if (baseclass_stats_[bclass_index]->beta_ > opts.min_count) {
-        base2xforms[bclass_index] = bclass_index;
-        xform_mat.SetUnit();
-        BaseFloat obj_old = MllrAuxFunction(xform_mat,
-                                            *(baseclass_stats_[bclass_index]));
-        ComputeMllrMatrix(baseclass_stats_[bclass_index]->K_,
-                          baseclass_stats_[bclass_index]->G_, &xform_mat);
-        out_mllr->SetParameters(xform_mat, bclass_index);
-        BaseFloat obj_new = MllrAuxFunction(xform_mat,
-                                            *(baseclass_stats_[bclass_index]));
-        KALDI_LOG << "MLLR: base-class " << (bclass_index)
-                  << ": Auxiliary function impr per frame is "
-                  << ((obj_new-obj_old)/baseclass_stats_[bclass_index]->beta_);
-        KALDI_ASSERT(obj_new >= obj_old - (std::abs(obj_new)+std::abs(obj_old))*1.0e-05);
-        tot_t += baseclass_stats_[bclass_index]->beta_;
-        tot_auxf_impr += obj_new - obj_old;
-      } else {
-        KALDI_WARN << "For baseclass "  << (bclass_index) << " count = "
-                   << (baseclass_stats_[bclass_index]->beta_) << " < "
-                   << opts.min_count << ": not updating MLLR";
-        tot_t += baseclass_stats_[bclass_index]->beta_;
-      }
-    }  // end looping over all baseclasses
-    out_mllr->set_bclass2xforms(base2xforms);
-  }  // end of estimating one transform per baseclass
-  if (auxf_impr != NULL) *auxf_impr = tot_auxf_impr;
-  if (t != NULL) *t = tot_t;
-}
-
-}  // namespace kaldi
-
diff --git a/src/transform/regtree-mllr-diag-gmm.h b/src/transform/regtree-mllr-diag-gmm.h
deleted file mode 100644
index 49a8cc41dd0..00000000000
--- a/src/transform/regtree-mllr-diag-gmm.h
+++ /dev/null
@@ -1,164 +0,0 @@
-// transform/regtree-mllr-diag-gmm.h
-
-// Copyright 2009-2011  Saarland University;  Jan Silovsky
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_TRANSFORM_REGTREE_MLLR_DIAG_GMM_H_
-#define KALDI_TRANSFORM_REGTREE_MLLR_DIAG_GMM_H_
-
-#include <vector>
-
-#include "base/kaldi-common.h"
-#include "gmm/am-diag-gmm.h"
-#include "transform/transform-common.h"
-#include "transform/regression-tree.h"
-#include "util/common-utils.h"
-
-namespace kaldi {
-
-
-///  Configuration variables for FMLLR transforms
-struct RegtreeMllrOptions {
-  BaseFloat min_count;  ///< Minimum occupancy for computing a transform
-
-  /// If 'true', find transforms to generate using regression tree.
-  /// If 'false', generate transforms for each baseclass.
-  bool use_regtree;
-
-  RegtreeMllrOptions(): min_count(1000.0), use_regtree(true) { }
-
-  void Register(OptionsItf *opts) {
-    opts->Register("mllr-min-count", &min_count,
-                   "Minimum count to estimate an MLLR transform.");
-    opts->Register("mllr-use-regtree", &use_regtree,
-                   "Use a regression-class tree for MLLR.");
-  }
-};
-
-/// An MLLR mean transformation is an affine transformation of Gaussian means.
-class RegtreeMllrDiagGmm {
- public:
-  RegtreeMllrDiagGmm() {}
-
-  /// Allocates memory for transform matrix & bias vector
-  void Init(int32 num_xforms, int32 dim);
-
-  /// Initialize transform matrix to identity and bias vector to zero
-  void SetUnit();
-
-  /// Apply the transform(s) to all the Gaussian means in the model
-  void TransformModel(const RegressionTree &regtree, AmDiagGmm *am);
-
-  /// Get all the transformed means for a given pdf.
-  void GetTransformedMeans(const RegressionTree &regtree, const AmDiagGmm &am,
-                           int32 pdf_index, MatrixBase<BaseFloat> *out) const;
-
-  void Write(std::ostream &out_stream, bool binary) const;
-  void Read(std::istream &in_stream, bool binary);
-
-  /// Mutators
-  void SetParameters(const MatrixBase<BaseFloat> &mat, int32 regclass);
-  void set_bclass2xforms(const std::vector<int32> &in) { bclass2xforms_ = in; }
-
-  /// Accessors
-  const std::vector< Matrix<BaseFloat> > xform_matrices() const {
-    return xform_matrices_;
-  }
-
- private:
-  /// Transform matrices: size() = num_xforms_
-  std::vector< Matrix<BaseFloat> > xform_matrices_;
-  int32 num_xforms_;  ///< Number of transforms == xform_matrices_.size()
-  /// For each baseclass index of which transform to use; -1 => no xform
-  std::vector<int32> bclass2xforms_;
-  int32 dim_;  ///< Dimension of feature vectors
-
-  // Cannot have copy constructor and assigment operator
-  KALDI_DISALLOW_COPY_AND_ASSIGN(RegtreeMllrDiagGmm);
-};
-
-inline void RegtreeMllrDiagGmm::SetParameters(const MatrixBase<BaseFloat> &mat,
-                                              int32 regclass) {
-  xform_matrices_[regclass].CopyFromMat(mat, kNoTrans);
-}
-
-/** Class for computing the maximum-likelihood estimates of the parameters of
- *  an acoustic model that uses diagonal Gaussian mixture models as emission
- *  densities.
- */
-class RegtreeMllrDiagGmmAccs {
- public:
-  RegtreeMllrDiagGmmAccs() {}
-  ~RegtreeMllrDiagGmmAccs() { DeletePointers(&baseclass_stats_); }
-
-  void Init(int32 num_bclass, int32 dim);
-  void SetZero();
-
-  /// Accumulate stats for a single GMM in the model; returns log likelihood.
-  /// This does not work with multiple feature transforms.
-  BaseFloat AccumulateForGmm(const RegressionTree &regtree,
-                             const AmDiagGmm &am,
-                             const VectorBase<BaseFloat> &data,
-                             int32 pdf_index, BaseFloat weight);
-
-  /// Accumulate stats for a single Gaussian component in the model.
-  void AccumulateForGaussian(const RegressionTree &regtree,
-                             const AmDiagGmm &am,
-                             const VectorBase<BaseFloat> &data,
-                             int32 pdf_index, int32 gauss_index,
-                             BaseFloat weight);
-
-  void Update(const RegressionTree &regtree, const RegtreeMllrOptions &opts,
-              RegtreeMllrDiagGmm *out_mllr, BaseFloat *auxf_impr,
-              BaseFloat *t) const;
-
-  void Write(std::ostream &out_stream, bool binary) const;
-  void Read(std::istream &in_stream, bool binary, bool add);
-
-  /// Accessors
-  int32 Dim() const { return dim_; }
-  int32 NumBaseClasses() const { return num_baseclasses_; }
-  const std::vector<AffineXformStats*> &baseclass_stats() const {
-    return baseclass_stats_;
-  }
-
- private:
-  /// Per-baseclass stats; used for accumulation
-  std::vector<AffineXformStats*> baseclass_stats_;
-  int32 num_baseclasses_;    ///< Number of baseclasses
-  int32 dim_;    ///< Dimension of feature vectors
-
-  /// Returns the MLLR objective function for a given transform and baseclass.
-  BaseFloat MllrObjFunction(const Matrix<BaseFloat> &xform,
-                            int32 bclass_id) const;
-
-  // Cannot have copy constructor and assigment operator
-  KALDI_DISALLOW_COPY_AND_ASSIGN(RegtreeMllrDiagGmmAccs);
-};
-
-typedef TableWriter< KaldiObjectHolder<RegtreeMllrDiagGmm> >
-            RegtreeMllrDiagGmmWriter;
-typedef RandomAccessTableReader< KaldiObjectHolder<RegtreeMllrDiagGmm> >
-            RandomAccessRegtreeMllrDiagGmmReader;
-typedef RandomAccessTableReaderMapped< KaldiObjectHolder<RegtreeMllrDiagGmm> >
-            RandomAccessRegtreeMllrDiagGmmReaderMapped;
-typedef SequentialTableReader< KaldiObjectHolder<RegtreeMllrDiagGmm> >
-            RegtreeMllrDiagGmmSeqReader;
-
-}  // namespace kaldi
-
-#endif  // KALDI_TRANSFORM_REGTREE_MLLR_DIAG_GMM_H_
diff --git a/src/tree/build-tree.h b/src/tree/build-tree.h
index 498ac5a8e19..9196c6bb204 100644
--- a/src/tree/build-tree.h
+++ b/src/tree/build-tree.h
@@ -51,7 +51,7 @@ namespace kaldi {
  *                 roots are shared together (prior to decision-tree splitting).
  * @param phone2num_pdf_classes [in] A map from phones to the number of
  *                 \ref pdf_class "pdf-classes"
- *                 in the phone (this info is derived from the HmmTopology object)
+ *                 in the phone (this info is derived from the Topology object)
  * @param share_roots [in] A vector the same size as phone_sets; says for each
  *                phone set whether the root should be shared among all the
  *                pdf-classes or not.
@@ -122,7 +122,7 @@ EventMap *BuildTree(Questions &qopts,
  *                 roots are shared together (prior to decision-tree splitting).
  * @param phone2num_pdf_classes [in] A map from phones to the number of
  *                 \ref pdf_class "pdf-classes"
- *                 in the phone (this info is derived from the HmmTopology object)
+ *                 in the phone (this info is derived from the Topology object)
  * @param share_roots [in] A vector the same size as phone_sets; says for each
  *                phone set whether the root should be shared among all the
  *                pdf-classes or not.
diff --git a/src/tree/context-dep.h b/src/tree/context-dep.h
index e69c26f8638..743dc964198 100644
--- a/src/tree/context-dep.h
+++ b/src/tree/context-dep.h
@@ -39,7 +39,7 @@ namespace kaldi {
 static const EventKeyType kPdfClass = -1;  // The "name" to which we assign the
 // pdf-class (generally corresponds ot position in the HMM, zero-based);
 // must not be used for any other event.  I.e. the value corresponding to
-// this key is the pdf-class (see hmm-topology.h for explanation of what this is).
+// this key is the pdf-class (see topology.h for explanation of what this is).
 
 
 /* ContextDependency is quite a generic decision tree.
@@ -99,9 +99,9 @@ class ContextDependency: public ContextDependencyInterface {
 
   /// GetPdfInfo returns a vector indexed by pdf-id, saying for each pdf which
   /// pairs of (phone, pdf-class) it can correspond to.  (Usually just one).
-  /// c.f. hmm/hmm-topology.h for meaning of pdf-class.
+  /// c.f. hmm/topology.h for meaning of pdf-class.
   /// This is the old, simpler interface of GetPdfInfo(), and that this one can
-  /// only be called if the HmmTopology object's IsHmm() function call returns
+  /// only be called if the Topology object's IsHmm() function call returns
   /// true.
   virtual void GetPdfInfo(
       const std::vector<int32> &phones,  // list of phones

From cc1d251a6ae8be81747236b9c587790fd78dd740 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Wed, 13 Mar 2019 21:45:02 -0400
Subject: [PATCH 002/163] Merge master into kaldi10 (#3105)

* [src] Change warp-synchronous to cub::BlockReduce (safer but slower) (#3080)

* [src] Fix && and || uses where & and | intended, and other weird errors (#3087)

* [build] Some fixes to Makefiles (#3088)

clang is unhappy with '-rdynamic' in compile-only step, and the
switch is really unnecessary.

Also, the default location for MKL 64-bit libraries is intel64/.
The em64t/ directory was already described as obsolete by an Intel rep in 2010:
https://software.intel.com/en-us/forums/intel-math-kernel-library/topic/285973

* [src] Fixed -Wreordered warnings in feat (#3090)

* [egs] Replace bc with perl -e (#3093)

* [scripts] Fix python3 compatibility issue in data-perturbing script (#3084)

* [doc] fix some typos in doc. (#3097)

* [build] Make sure expf() speed probe times sensibly (#3089)

* [scripts] Make sure merge_targets.py works in python3 (#3094)

* [src] ifdef to fix compilation failure on CUDA 8 and earlier (#3103)

* [doc] fix typos and broken links in doc. (#3102)

* [scripts] Fix frame_shift bug in egs/swbd/s5c/local/score_sclite_conf.sh (#3104)
---
 .../nnet3/xvector/extract_xvectors.sh         |   2 +-
 egs/callhome_diarization/v1/run.sh            |   2 +-
 egs/callhome_diarization/v2/run.sh            |   2 +-
 egs/dihard_2018/v1/run.sh                     |   2 +-
 egs/dihard_2018/v2/run.sh                     |   2 +-
 egs/rm/README.txt                             |   2 +-
 egs/sre08/v1/local/score_sre08.sh             |   4 +-
 egs/swbd/s5c/local/score_sclite_conf.sh       |   8 +-
 egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh  |   2 +-
 .../segmentation/internal/merge_targets.py    |   6 +-
 .../s5/utils/data/perturb_data_dir_volume.sh  |   6 +-
 src/bin/compute-wer-bootci.cc                 |   6 +-
 src/cudamatrix/cu-device.cc                   |   2 +
 src/cudamatrix/cu-kernels.cu                  | 171 ++++--------------
 src/doc/data_prep.dox                         |   2 +-
 src/doc/dependencies.dox                      |   2 +-
 src/doc/dnn.dox                               |   2 +-
 src/doc/io.dox                                |   4 +-
 src/doc/kaldi_for_dummies.dox                 |  32 ++--
 src/doc/tutorial_looking.dox                  |   4 +-
 src/doc/tutorial_prereqs.dox                  |   2 +-
 src/doc/tutorial_running.dox                  |  10 +-
 src/doc/versions.dox                          |   4 +-
 src/fstext/determinize-lattice-inl.h          |   4 +-
 src/fstext/lattice-weight.h                   |   3 +-
 src/gmm/mle-diag-gmm.h                        |  12 +-
 src/gmm/mle-full-gmm.h                        |   8 +-
 src/makefiles/cuda_64bit.mk                   |   4 +-
 src/makefiles/default_rules.mk                |   2 +-
 src/makefiles/linux_x86_64_mkl.mk             |   2 +-
 src/nnet3/nnet-analyze.cc                     |   2 +-
 src/nnet3/nnet-chain-training.cc              |   4 +-
 src/nnet3/nnet-simple-component.cc            |   8 +-
 src/nnet3/nnet-training.cc                    |   4 +-
 src/probe/README.slow_expf                    |  11 +-
 src/probe/exp-test.cc                         |  51 ++++--
 src/rnnlm/rnnlm-core-training.cc              |   4 +-
 src/rnnlm/rnnlm-embedding-training.cc         |  12 +-
 src/tree/build-tree-questions.h               |   6 +-
 src/util/kaldi-pipebuf.h                      |   3 +-
 40 files changed, 175 insertions(+), 244 deletions(-)

diff --git a/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh b/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh
index d7591a6a3a8..8d579138c73 100755
--- a/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh
+++ b/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh
@@ -102,7 +102,7 @@ if [ $stage -le 0 ]; then
   fi
   utils/data/get_uniform_subsegments.py \
       --max-segment-duration=$window \
-      --overlap-duration=$(echo "$window-$period" | bc) \
+      --overlap-duration=$(perl -e "print ($window-$period);") \
       --max-remaining-duration=$min_segment \
       --constant-duration=True \
       $segments > $dir/subsegments
diff --git a/egs/callhome_diarization/v1/run.sh b/egs/callhome_diarization/v1/run.sh
index acc48bd24f9..f4652c0c0ef 100755
--- a/egs/callhome_diarization/v1/run.sh
+++ b/egs/callhome_diarization/v1/run.sh
@@ -188,7 +188,7 @@ if [ $stage -le 6 ]; then
 
       der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \
         exp/tuning/${dataset}_t${threshold})
-      if [ $(echo $der'<'$best_der | bc -l) -eq 1 ]; then
+      if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then
         best_der=$der
         best_threshold=$threshold
       fi
diff --git a/egs/callhome_diarization/v2/run.sh b/egs/callhome_diarization/v2/run.sh
index ae05dd9da1c..b79717e2348 100755
--- a/egs/callhome_diarization/v2/run.sh
+++ b/egs/callhome_diarization/v2/run.sh
@@ -297,7 +297,7 @@ if [ $stage -le 10 ]; then
 
       der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \
         $nnet_dir/tuning/${dataset}_t${threshold})
-      if [ $(echo $der'<'$best_der | bc -l) -eq 1 ]; then
+      if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then
         best_der=$der
         best_threshold=$threshold
       fi
diff --git a/egs/dihard_2018/v1/run.sh b/egs/dihard_2018/v1/run.sh
index 429a1231975..44af9f48c3f 100755
--- a/egs/dihard_2018/v1/run.sh
+++ b/egs/dihard_2018/v1/run.sh
@@ -186,7 +186,7 @@ if [ $stage -le 7 ]; then
 
     der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \
       $ivec_dir/tuning/dihard_2018_dev_t${threshold})
-    if [ $(echo $der'<'$best_der | bc -l) -eq 1 ]; then
+    if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then
       best_der=$der
       best_threshold=$threshold
     fi
diff --git a/egs/dihard_2018/v2/run.sh b/egs/dihard_2018/v2/run.sh
index 1c018dfcc55..0da1f330ea7 100755
--- a/egs/dihard_2018/v2/run.sh
+++ b/egs/dihard_2018/v2/run.sh
@@ -260,7 +260,7 @@ if [ $stage -le 12 ]; then
 
     der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \
       $nnet_dir/tuning/dihard_2018_dev_t${threshold})
-    if [ $(echo $der'<'$best_der | bc -l) -eq 1 ]; then
+    if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then
       best_der=$der
       best_threshold=$threshold
     fi
diff --git a/egs/rm/README.txt b/egs/rm/README.txt
index ed588e481c6..4fa3d7c87e8 100644
--- a/egs/rm/README.txt
+++ b/egs/rm/README.txt
@@ -9,7 +9,7 @@ About the Resource Management corpus:
 
 Each subdirectory of this directory contains the
 scripts for a sequence of experiments. 
-s5 is the currently recommmended setup.
+s5 is the currently recommended setup.
 
   s5: This is the "new-new-style" recipe.  It is now finished.
       All further work will be on top of this style of recipe.  Note: 
diff --git a/egs/sre08/v1/local/score_sre08.sh b/egs/sre08/v1/local/score_sre08.sh
index 92831502f45..c1584946735 100755
--- a/egs/sre08/v1/local/score_sre08.sh
+++ b/egs/sre08/v1/local/score_sre08.sh
@@ -35,11 +35,11 @@ tot_eer=0.0
 printf '% 12s' 'EER:'
 for condition in $(seq 8); do
   eer=$(awk '{print $3}' $scores | paste - $trials | awk -v c=$condition '{n=4+c; if ($n == "Y") print $1, $4}' | compute-eer - 2>/dev/null)
-  tot_eer=$(echo "$tot_eer+$eer" | bc)
+  tot_eer=$(perl -e "print ($tot_eer+$eer);")
   eers[$condition]=$eer
 done
 
-eers[0]=$(echo "$tot_eer/8" | bc -l)
+eers[0]=$(perl -e "print ($tot_eer/8.0);")
 
 for i in $(seq 0 8); do
   printf '% 7.2f' ${eers[$i]}
diff --git a/egs/swbd/s5c/local/score_sclite_conf.sh b/egs/swbd/s5c/local/score_sclite_conf.sh
index 9a1fa5083bf..21da4520a4d 100755
--- a/egs/swbd/s5c/local/score_sclite_conf.sh
+++ b/egs/swbd/s5c/local/score_sclite_conf.sh
@@ -39,6 +39,12 @@ for f in $data/stm $data/glm $lang/words.txt $lang/phones/word_boundary.int \
   [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
 done
 
+if [ -f $dir/../frame_subsampling_factor ]; then
+  factor=$(cat $dir/../frame_subsampling_factor) || exit 1
+  frame_shift_opt="--frame-shift=0.0$factor"
+  echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt"
+fi
+
 name=`basename $data`; # e.g. eval2000
 
 mkdir -p $dir/scoring/log
@@ -51,7 +57,7 @@ if [ $stage -le 0 ]; then
       ACWT=\`perl -e \"print 1.0/LMWT\;\"\` '&&' \
       lattice-add-penalty --word-ins-penalty=$wip "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
       lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
-      lattice-to-ctm-conf --decode-mbr=$decode_mbr --acoustic-scale=\$ACWT  ark:- - \| \
+      lattice-to-ctm-conf $frame_shift_opt --decode-mbr=$decode_mbr --acoustic-scale=\$ACWT  ark:- - \| \
       utils/int2sym.pl -f 5 $lang/words.txt  \| \
       utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \
       '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1;
diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh
index 526059b7b90..8f566ccfe6d 100755
--- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh
+++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh
@@ -160,7 +160,7 @@ if [ $stage -le 15 ]; then
   echo "$0: creating neural net configs using the xconfig parser";
 
   num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
-  learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+  learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python)
   tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true"
   tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66"
   linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0"
diff --git a/egs/wsj/s5/steps/segmentation/internal/merge_targets.py b/egs/wsj/s5/steps/segmentation/internal/merge_targets.py
index a14aef151c2..84b0c884f45 100755
--- a/egs/wsj/s5/steps/segmentation/internal/merge_targets.py
+++ b/egs/wsj/s5/steps/segmentation/internal/merge_targets.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 # Copyright 2017  Vimal Manohar
 # Apache 2.0
@@ -16,8 +16,6 @@
 option.
 """
 
-from __future__ import print_function
-from __future__ import division
 import argparse
 import logging
 import numpy as np
@@ -111,7 +109,7 @@ def should_remove_frame(row, dim):
                                      # source[2] = [ 0 0 0 ]
     """
     assert len(row) % dim == 0
-    num_sources = len(row) / dim
+    num_sources = len(row) // dim
 
     max_idx = np.argmax(row)
     max_val = row[max_idx]
diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh b/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh
index dae440b03a3..e357ba8cbfb 100755
--- a/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh
+++ b/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh
@@ -52,15 +52,15 @@ for line in sys.stdin.readlines():
   parts = line.strip().split()
   if line.strip()[-1] == '|':
     if re.search('sox --vol', ' '.join(parts[-11:])):
-      print 'true'
+      print('true')
       sys.exit(0)
   elif re.search(':[0-9]+$', line.strip()) is not None:
     continue
   else:
     if ' '.join(parts[1:3]) == 'sox --vol':
-      print 'true'
+      print('true')
       sys.exit(0)
-print 'false'
+print('false')
 "` || exit 1
 
 if $volume_perturb_done; then
diff --git a/src/bin/compute-wer-bootci.cc b/src/bin/compute-wer-bootci.cc
index b8b0697af75..ba2a4ce739c 100644
--- a/src/bin/compute-wer-bootci.cc
+++ b/src/bin/compute-wer-bootci.cc
@@ -162,7 +162,7 @@ int main(int argc, char *argv[]) {
 
   try {
     const char *usage =
-      "Compute a bootstrapping of WER to extract the 95\% confidence interval.\n"
+      "Compute a bootstrapping of WER to extract the 95% confidence interval.\n"
       "Take a reference and a transcription file, in integer or text format,\n"
       "and outputs overall WER statistics to standard output along with its\n"
       "confidence interval using the bootstrap method of Bisani and Ney.\n"
@@ -234,12 +234,12 @@ int main(int argc, char *argv[]) {
     std::cout.precision(2);
     std::cerr.precision(2);
     std::cout << "Set1: %WER " << std::fixed << 100*mean_wer <<
-              " 95\% Conf Interval [ " << 100*mean_wer-100*interval <<
+              " 95% Conf Interval [ " << 100*mean_wer-100*interval <<
               ", " << 100*mean_wer+100*interval << " ]" << '\n';
 
     if(!hyp2_rspecifier.empty()) {
         std::cout << "Set2: %WER " << std::fixed << 100*mean_wer2 <<
-            " 95\% Conf Interval [ " << 100*mean_wer2-100*interval2 <<
+            " 95% Conf Interval [ " << 100*mean_wer2-100*interval2 <<
             ", " << 100*mean_wer2+100*interval2 << " ]" << '\n';
 
         std::cout << "Probability of Set2 improving Set1: " << std::fixed <<
diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc
index 140275d3b6e..85c2492c074 100644
--- a/src/cudamatrix/cu-device.cc
+++ b/src/cudamatrix/cu-device.cc
@@ -111,12 +111,14 @@ void CuDevice::Initialize() {
     CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_));
     CUBLAS_SAFE_CALL(cublasSetStream(cublas_handle_, cudaStreamPerThread));
     
+    #if CUDA_VERSION >= 9000 
     if (device_options_.use_tensor_cores) {
       // Enable tensor cores in CUBLAS
       // Note if the device does not support tensor cores this will fall back to normal math mode
       CUBLAS_SAFE_CALL(cublasSetMathMode(cublas_handle_, 
             CUBLAS_TENSOR_OP_MATH));
     }
+    #endif
 
     // Initialize the cuSPARSE library
     CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_));
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 5a5307b9f87..17d56a05772 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -28,7 +28,7 @@
 #include <limits>
 #include <math_constants.h>
 #include "cudamatrix/cu-kernels-ansi.h"
-
+#include <cub/block/block_reduce.cuh>
 
 
 /***********************************************************************
@@ -958,6 +958,7 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA,
     Real trans[TileDim][TileDim + 1];
     Real sum[CU1DBLOCK];
   } smem;
+
   // linear thread id;
   const int32_cuda tid = threadIdx.y * blockDim.x + threadIdx.x;
   const int32_cuda grid_height = gridDim.y * TileDim;
@@ -1021,6 +1022,7 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA,
   if (tid == 0) {
     value[blockIdx.y * gridDim.x + blockIdx.x] = smem.sum[0];
   }
+
 }
 
 // _trace_mat_mat_trans reduce the partial sum to
@@ -1030,6 +1032,7 @@ __global__
 static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA,
                                  int B_stride, Real* value) {
   __shared__ Real ssum[CU1DBLOCK];
+
   // linear thread id;
   const int32_cuda tid = threadIdx.y * blockDim.x + threadIdx.x;
   const int32_cuda j = blockIdx.x * blockDim.x + threadIdx.x;
@@ -1046,7 +1049,7 @@ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA,
   }
   ssum[tid] = tsum;
   __syncthreads();
-
+  
   // Block reduce
 # pragma unroll
   for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
@@ -2485,6 +2488,8 @@ template<typename Real>
 __global__
 static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) {
   __shared__ Real smem[CU1DBLOCK];
+  typedef cub::BlockReduce<Real, CU1DBLOCK> BlockReduceT;
+  __shared__ typename BlockReduceT::TempStorage temp_storage;
   const int i = blockIdx.x;
   const int x_start = i * src_stride;
   const int y_start = i * d.stride;
@@ -2496,24 +2501,9 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) {
   for (int j = tid; j < d.cols; j += CU1DBLOCK) {
     tmax = fmax(tmax, x[x_start + j]);
   }
-  smem[tid] = tmax;
-  __syncthreads();
-
-  // reduce to 2x warpSize elements per row
-# pragma unroll
-  for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
-    if (tid < shift) {
-      smem[tid] = fmax(smem[tid], smem[tid + shift]);
-    }
-    __syncthreads();
-  }
-
-  // reduce to 1 element per row
-  if (tid < warpSize) {
-#   pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
-      smem[tid] = fmax(smem[tid], smem[tid + shift]);
-    }
+  tmax = BlockReduceT(temp_storage).Reduce(tmax, cub::Max());
+  if (tid == 0) {
+    smem[0] = tmax;
   }
 
   // broadcast max to all threads
@@ -2526,24 +2516,9 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) {
   for (int j = tid; j < d.cols; j += CU1DBLOCK) {
     tsum += exp(x[x_start + j] - max);
   }
-  smem[tid] = tsum;
-  __syncthreads();
-
-  // reduce to 2x warpSize elements per row
-# pragma unroll
-  for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
-    if (tid < shift) {
-      smem[tid] += smem[tid + shift];
-    }
-    __syncthreads();
-  }
-
-  // reduce to 1 element per row
-  if (tid < warpSize) {
-#   pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
-      smem[tid] += smem[tid + shift];
-    }
+  tsum = BlockReduceT(temp_storage).Sum(tsum);
+  if (tid == 0) {
+    smem[0] = tsum;
   }
 
   // broadcast sum to all threads
@@ -2577,6 +2552,8 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x,
   const int i = blockIdx.x;
   const int tid = threadIdx.x;
   const Real* x_row = x + i * x_d.stride;
+  typedef cub::BlockReduce<Real, CU1DBLOCK> BlockReduceT;
+  __shared__ typename BlockReduceT::TempStorage temp_storage;
   __shared__ Real ssum[CU1DBLOCK];
 
   // Reduce x_j^2 to CU1DBLOCK elements per row
@@ -2584,34 +2561,14 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x,
   for (int j = tid; j < x_d.cols; j += CU1DBLOCK) {
     tsum += x_row[j] * x_row[j];
   }
-  ssum[tid] = tsum;
+  tsum = BlockReduceT(temp_storage).Sum(tsum);
   __syncthreads();
-
-  // Tree reduce to 2x warpSize elements per row
-# pragma unroll
-  for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
-    if (tid < shift)
-      ssum[tid] += ssum[tid + shift];
-    __syncthreads();
-  }
-
-  // Reduce last warp to 1 element per row.
-  // Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-#   pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
-      ssum[tid] += ssum[tid + shift];
-    }
-  }
 
   const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
-  if (tid == 0) {
-    ssum[0] = sqrt(
-        fmax(ssum[0] / (target_rms * target_rms * x_d.cols), kSquaredNormFloor));
-  }
+  ssum[tid] = sqrt(
+    fmax(tsum / (target_rms * target_rms * x_d.cols), kSquaredNormFloor));
 
-  // Broadcast floored stddev to all threads.
   __syncthreads();
   const Real stddev_div_target_rms = ssum[0];
   const Real scale = Real(1) / stddev_div_target_rms;
 
@@ -2626,7 +2583,6 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x,
   }
 }
 
-
 template<typename Real>
 __global__
 static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv,
@@ -2722,6 +2678,8 @@ __global__
 static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim,
                                 int x_stride) {
   __shared__ Real smem[CU1DBLOCK];
+  typedef cub::BlockReduce<Real, CU1DBLOCK> BlockReduceT;
+  __shared__ typename BlockReduceT::TempStorage temp_storage;
   const int i = blockIdx.x;
   const int x_start = i * x_stride;
   const int y_start = i * y_dim.stride;
@@ -2733,23 +2691,9 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim,
   for (int j = tid; j < y_dim.cols; j += CU1DBLOCK) {
     tmax = fmax(tmax, x[x_start + j]);
   }
-  smem[tid] = tmax;
-  __syncthreads();
-
-  // reduce to 2x warpSize elements per row
-# pragma unroll
-  for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
-    if (tid < shift) {
-      smem[tid] = fmax(smem[tid], smem[tid + shift]);
-    }
-    __syncthreads();
-  }
-
-  // reduce to 1 element per row
-  if (tid < warpSize) {
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
-      smem[tid] = fmax(smem[tid], smem[tid + shift]);
-    }
+  tmax = BlockReduceT(temp_storage).Reduce(tmax, cub::Max());
+  if (tid == 0) {
+    smem[0] = tmax;
   }
 
   // broadcast max to all threads
@@ -2762,23 +2706,9 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim,
   for (int j = tid; j < y_dim.cols; j += CU1DBLOCK) {
     tsum += exp(x[x_start + j] - max);
   }
-  smem[tid] = tsum;
-  __syncthreads();
-
-  // reduce to 2x warpSize elements per row
-# pragma unroll
-  for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
-    if (tid < shift) {
-      smem[tid] += smem[tid + shift];
-    }
-    __syncthreads();
-  }
-
-  // reduce to 1 element per row
-  if (tid < warpSize) {
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
-      smem[tid] += smem[tid + shift];
-    }
+  tsum = BlockReduceT(temp_storage).Sum(tsum);
+  if (tid == 0) {
+    smem[0] = tsum;
   }
 
   // broadcast sum to all threads
@@ -3024,6 +2954,9 @@ static void _diff_softmax(Real* x, const MatrixDim dim, const Real* value,
                           const int value_stride, const Real* diff,
                           const int diff_stride) {
   __shared__ Real ssum[CU1DBLOCK];
+  typedef cub::BlockReduce<Real, CU1DBLOCK> BlockReduceT;
+  __shared__ typename BlockReduceT::TempStorage temp_storage;
+
   const int tid = threadIdx.x;
   const int i = blockIdx.x;
   const int value_start = i * value_stride;
@@ -3035,24 +2968,9 @@ static void _diff_softmax(Real* x, const MatrixDim dim, const Real* value,
   for (int j = tid; j < dim.cols; j += CU1DBLOCK) {
     tsum += value[value_start + j] * diff[diff_start + j];
   }
-  ssum[tid] = tsum;
-  __syncthreads();
-
-  // Tree reduce to 2x warpSize elements.
-# pragma unroll
-  for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
-    if (tid < shift) {
-      ssum[tid] += ssum[tid + shift];
-    }
-    __syncthreads();
-  }
-
-  // Warp reduce to 1 element. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-#   pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
-      ssum[tid] += ssum[tid + shift];
-    }
+  tsum = BlockReduceT(temp_storage).Sum(tsum);
+  if (tid == 0) {
+    ssum[0] = tsum;
   }
 
   // Broadcast result to all threads
@@ -3078,6 +2996,8 @@ static void _diff_log_softmax(const MatrixDim in_deriv_dim,
                               Real* in_deriv) {
 
   __shared__ Real ssum[CU1DBLOCK];
+  typedef cub::BlockReduce<Real, CU1DBLOCK> BlockReduceT;
+  __shared__ typename BlockReduceT::TempStorage temp_storage;
   const int tid = threadIdx.x;
   const int i = blockIdx.x;
   const int out_value_start = i * out_value_stride;
@@ -3089,24 +3009,9 @@ static void _diff_log_softmax(const MatrixDim in_deriv_dim,
   for (int j = tid; j < in_deriv_dim.cols; j += CU1DBLOCK) {
     tsum += out_deriv[out_deriv_start + j];
   }
-  ssum[tid] = tsum;
-  __syncthreads();
-
-  // Tree reduce to 2x warpSize elements.
-# pragma unroll
-  for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
-    if (tid < shift) {
-      ssum[tid] += ssum[tid + shift];
-    }
-    __syncthreads();
-  }
-
-  // Warp reduce to 1 element. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-#   pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
-      ssum[tid] += ssum[tid + shift];
-    }
+  tsum = BlockReduceT(temp_storage).Sum(tsum);
+  if (tid == 0) {
+    ssum[0] = tsum;
   }
 
   // Broadcast result to all threads
diff --git a/src/doc/data_prep.dox b/src/doc/data_prep.dox
index d8fe1746df1..e81032537cc 100644
--- a/src/doc/data_prep.dox
+++ b/src/doc/data_prep.dox
@@ -191,7 +191,7 @@ the speaker identities, you can just make the speaker-ids the same as the uttera
 so the format of the file would be just <DFN>\<utterance-id\> \<utterance-id\></DFN>.
 We have made the previous sentence bold because we have encountered people creating
 a "global" speaker-id.  This is a bad idea because it makes cepstral mean normalization
-ineffective in traning (since it's applied globally), and because it will create problems
+ineffective in training (since it's applied globally), and because it will create problems
 when you use utils/split_data_dir.sh to split your data into pieces.
 
 There is another file that exists in some setups; it is used only occasionally and
diff --git a/src/doc/dependencies.dox b/src/doc/dependencies.dox
index 63d2658b726..d8a5591955f 100644
--- a/src/doc/dependencies.dox
+++ b/src/doc/dependencies.dox
@@ -113,7 +113,7 @@
     - CLAPACK, the linear algebra library (we download the headers).
       This is useful only on systems where you don't have ATLAS and are
       instead compiling with CLAPACK.
-    - OpenBLAS: this is an alernative to ATLAS or CLAPACK.  The scripts don't
+    - OpenBLAS: this is an alternative to ATLAS or CLAPACK.  The scripts don't
       use it by default but we provide installation scripts so you can install
       it if you want to compare it against ATLAS (it's more actively
       maintained than ATLAS).
diff --git a/src/doc/dnn.dox b/src/doc/dnn.dox
index 5b3d2b98261..bab4658e552 100644
--- a/src/doc/dnn.dox
+++ b/src/doc/dnn.dox
@@ -37,7 +37,7 @@ namespace kaldi {
   We currently have three separate codebases for deep neural nets in Kaldi.  All
   are still active in the sense that the up-to-date recipes refer to all of
   them.  The first one ("nnet1"( is located in code subdirectories nnet/ and
-  nnetbin/, and is primiarly maintained by Karel Vesely.  The second is located
+  nnetbin/, and is primarily maintained by Karel Vesely.  The second is located
   in code subdirectories nnet2/ and nnet2bin/, and is primarily maintained by
   Daniel Povey (this code was originally based on an earlier version of Karel's
   code, but it has been extensively rewritten).  The third is located
diff --git a/src/doc/io.dox b/src/doc/io.dox
index dc958f57a6f..8f3a3cc05b6 100644
--- a/src/doc/io.dox
+++ b/src/doc/io.dox
@@ -383,7 +383,7 @@ namespace kaldi {
   std::string rspecifier2 = "ark:-"; // archive read from stdin.
   // write to a gzipped text archive.
   std::string wspecifier1 = "ark,t:| gzip -c > /some/dir/foo.ark.gz";
-  std::string wspecifier2 = "ark,scp:data/my.ark,data/my.ark";
+  std::string wspecifier2 = "ark,scp:data/my.ark,data/my.scp";
  \endcode
 
  Usually, an rspecifier or wspecifier consists of a comma-separated, unordered
@@ -401,7 +401,7 @@ namespace kaldi {
  \endverbatim
  This will write an archive, and a
  script file with lines like "utt_id /somedir/foo.ark:1234" that specify offsets into the
- archive for more efficient random access.  You can then do what you like which
+ archive for more efficient random access.  You can then do whatever you like with
  the script file, including breaking it up into segments, and it will behave like
  any other script file.  Note that although the order of options before the colon
  doesn't generally matter, in this particular case the "ark" must come before
diff --git a/src/doc/kaldi_for_dummies.dox b/src/doc/kaldi_for_dummies.dox
index d712ab87af9..b48d6dd8dac 100644
--- a/src/doc/kaldi_for_dummies.dox
+++ b/src/doc/kaldi_for_dummies.dox
@@ -71,7 +71,7 @@ and installation,
  - \c awk – programming language, used for searching and processing patterns
 in files and data streams,
  - \c bash – Unix shell and script programming language,
- - \c grep – command-line utility for searching plain-text data sets for lines
+ - \c grep – command-line utility for searching plain-text datasets for lines
 matching a regular expression,
  - \c make – automatically builds executable programs and libraries from
 source code,
@@ -96,7 +96,7 @@ be nice if you read any \c README files you find.
 
 \c kaldi - main Kaldi directory which contains:
  - \c egs – example scripts allowing you to quickly build ASR
-systems for over 30 popular speech corporas (documentation is attached for each
+systems for over 30 popular speech corpora (documentation is attached for each
 project),
  - \c misc – additional tools and supplies, not needed for proper
 Kaldi functionality,
@@ -136,34 +136,34 @@ the stuff related to your project.
 
 I assume that you want to set up an ASR system, basing on your own audio data.
 For example - let it be a set of 100 files. File format is WAV. Each file
-contains 3 spoken digits recorded in english language, one by one. Each of
+contains 3 spoken digits recorded in English, one by one. Each of
 these audio files is named in a recognizable way (e.g. \c 1_5_6.wav,
 which in my pattern means that the spoken sentence is 'one, five, six') and
 placed in the recognizable folder representing particular speaker during a
 particular recording session (there may be a situation that you have recordings
 of the same person but in two different quality/noise environments - put these
-in separate folders). So to sum up, my exemplary data set looks like this:
+in separate folders). So to sum up, my exemplary dataset looks like this:
  - 10 different speakers (ASR systems must be trained and tested on different
 speakers, the more speakers you have the better),
  - each speaker says 10 sentences,
- - 100 senteces/utterances (in 100 *.wav files placed in 10 folders related to
+ - 100 sentences/utterances (in 100 *.wav files placed in 10 folders related to
 particular speakers - 10 *.wav files in each folder),
  - 300 words (digits from zero to nine),
  - each sentence/utterance consist of 3 words.
 
-Whatever your first data set is, adjust my example to your particular case. Be
-careful with big data sets and complex grammars - start with something simple.
+Whatever your first dataset is, adjust my example to your particular case. Be
+careful with big datasets and complex grammars - start with something simple.
 Sentences that contain only digits are perfect in this case.
 
 <h2>Task</h2>
 Go to \c kaldi/egs/digits directory and create
 \c digits_audio folder. In \c kaldi/egs/digits/digits_audio
 create two folders: \c train and \c test. Select one speaker
-of your choice to represent testing data set. Use this speaker's 'speakerID' as
+of your choice to represent the testing dataset. Use this speaker's 'speakerID' as
 a name for an another new folder in \c kaldi/egs/digits/digits_audio/test
 directory. Then put there all the audio files related to that person. Put the
 rest (9 speakers) into \c train folder - this will be your training
-data set. Also create subfolders for each speaker.
+dataset. Also create subfolders for each speaker.
 
 \subsection kaldi_for_dummies_acoustic Acoustic data
 
@@ -174,14 +174,14 @@ section as well) can be considered as a text file with some number of strings
 (each string in a new line). These strings need to be sorted. If you will
 encounter any sorting issues you can use Kaldi scripts for checking
 (\c utils/validate_data_dir.sh) and fixing (\c utils/fix_data_dir.sh) data order.
-And for you information - \c utils directory will be attached to your project in
+And for your information - \c utils directory will be attached to your project in
 \ref kaldi_for_dummies_tools "Tools attachment" section.
 
 <h2>Task</h2>
 In \c kaldi/egs/digits directory, create a folder \c data. Then create
 \c test and \c train subfolders inside. Create in each subfolder following files
 (so you have files named in <b>the same way in \c test and \c train subfolders
-but they relate to two different data sets</b> that you created before):
+but they relate to two different datasets</b> that you created before):
 
 a.) \c spk2gender <br>
 This file informs about speakers gender. As we assumed, 'speakerID' is a unique
@@ -252,7 +252,7 @@ four four two
 
 \subsection kaldi_for_dummies_language Language data
 
-This section relates to language modelling files that also need to be considered
+This section relates to language modeling files that also need to be considered
 as 'must be done'. Look for the syntax details here: \ref data_prep (each file
 is precisely described). Also feel free to read some examples in other \c egs
 scripts. Now is the perfect time.
@@ -395,7 +395,7 @@ decided to use two different training methods:
 - TRI1 - simple triphone training (first triphone pass).
 
 These two methods are enough to show noticable differences in decoding results
-using only digits lexicon and small training data set.
+using only digits lexicon and small training dataset.
 
 <h2>Task</h2>
 In \c kaldi/egs/digits directory create 3 scripts:
@@ -432,7 +432,7 @@ c.) \c run.sh
 . ./path.sh || exit 1
 . ./cmd.sh || exit 1
 
-nj=1       # number of parallel jobs - 1 is perfect for such a small data set
+nj=1       # number of parallel jobs - 1 is perfect for such a small dataset
 lm_order=1 # language model order (n-gram quantity) - 1 is enough for digits grammar
 
 # Safety mechanism (possible running this script with modified arguments)
@@ -575,7 +575,7 @@ folder (same directory).
 This is just an example. The point of this short tutorial is to show you how to
 create 'anything' in Kaldi and to get a better understanding of how to think
 while using this toolkit. Personally I started with looking for tutorials made
-by the Kaldi authors/developers. After succesful Kaldi installation I launched
+by the Kaldi authors/developers. After successful Kaldi installation I launched
 some example scripts (Yesno, Voxforge, LibriSpeech - they are relatively easy
 and have free acoustic/language data to download - I used these three as a base
 for my own scripts).
@@ -586,7 +586,7 @@ There are two very useful sections for beginners inside: <br>
 a.) \ref tutorial - almost 'step by step' tutorial on how to set up an ASR
 system; up to some point this can be done without RM dataset. It is good to
 read it, <br>
-b.) \ref data_prep - very detailed explaination of how to use your own data
+b.) \ref data_prep - very detailed explanation of how to use your own data
 in Kaldi.
 
 More useful links about Kaldi I found: <br>
diff --git a/src/doc/tutorial_looking.dox b/src/doc/tutorial_looking.dox
index 420abfc9bce..831d721c7eb 100644
--- a/src/doc/tutorial_looking.dox
+++ b/src/doc/tutorial_looking.dox
@@ -171,7 +171,7 @@ making sure have their normal values, begin with KALDI_.  This is a precaution
 to avoid future conflicts with other codebases (since \#defines don't limit themselves
 to the kaldi namespace).  Notice the style of the function names: LikeThis().
 Our style is generally based on
-<a href=http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml> this one </a>,
+<a href=https://google.github.io/styleguide/cppguide.html> this one </a>,
 to conform with OpenFst, but there are some differences.
 
 To see other elements of the style, which will help you to understand Kaldi
@@ -190,7 +190,7 @@ It prints out the usage, which should give you a generic idea of how Kaldi progr
 are called.  Note that while there is a --config option that can be used to
 pass a configuration file, in general Kaldi is not as config-driven as HTK and these
 files are not widely used.  You will see a --binary option.  In general, Kaldi file
-formats come in both binary and test forms, and the --binary option controls how
+formats come in both binary and text forms, and the --binary option controls how
 they are written.  However, this only controls how single objects (e.g. acoustic models)
 are written.  For whole collections of objects (e.g. collections of feature files),
 there is a different mechanism that we will come to later.
diff --git a/src/doc/tutorial_prereqs.dox b/src/doc/tutorial_prereqs.dox
index 82079a281b9..72b1fcf8ad8 100644
--- a/src/doc/tutorial_prereqs.dox
+++ b/src/doc/tutorial_prereqs.dox
@@ -51,7 +51,7 @@
   The most difficult part of the installation process relates to the math library
   ATLAS; if this is not already installed as a library on your system you will
   have to compile it, and this requires that CPU throttling be turned off, which
-  may require root priveleges.  We provide scripts and detailed instructions for
+  may require root privileges.  We provide scripts and detailed instructions for
   all installation steps.  When scripts fail, read the output carefully because
   it tries to provide guidance as to how to fix problems.  Please inform us if there
   are problems at any point, however minor; see \ref other.
diff --git a/src/doc/tutorial_running.dox b/src/doc/tutorial_running.dox
index f977348a3cb..d639cd4e664 100644
--- a/src/doc/tutorial_running.dox
+++ b/src/doc/tutorial_running.dox
@@ -115,14 +115,14 @@ Now go back to the data directory and change directory to /train. Then execute t
 
 \verbatim
 head text
-head spk2gender.map
+head spk2gender
 head spk2utt
 head utt2spk
 head wav.scp
 \endverbatim
 
 - text - This file contains mappings between utterances and utterance ids which will be used by Kaldi. This file will be turned into an integer format-- still a text file, but with the words replaced with integers.
-- spk2gender.map - This file contains mappings between speakers and their gender. This also acts as a list of unique users involved in training. 
+- spk2gender - This file contains mappings between speakers and their gender. This also acts as a list of unique users involved in training. 
 - spk2utt - This is a mapping between the speaker identifiers and all the utterance identifiers associated with the speaker. 
 - utt2spk - This is a one-to-one mapping between utterance ids and the corresponding speaker identifiers. 
 - wav.scp - This file is actually read directly by Kaldi programs when doing feature extraction. Look at the file again. It is parsed as a set of key-value pairs, where the key is the first string on each line. The value is a kind of "extended filename", and you can guess how it works. Since it is for reading we will refer to this type of string as an "rxfilename" (for writing we use the term wxfilename). See \ref io_sec_xfilename if you are curious. Note that although we use the extension .scp, this is not a script file in the HTK sense (i.e. it is not viewed as an extension to the command-line arguments).
@@ -383,7 +383,7 @@ do
 copy-tree --binary=false exp/mono/tree - | less
 \endverbatim
 Note that this is a monophone "tree" so it is very trivial-- it
-does not have any "splits".  Although this tree format was not indended to be
+does not have any "splits".  Although this tree format was not intended to be
 very human-readable, we have received a number of queries about the tree format so we
 will explain it.  The rest of this paragraph can be skipped over by the casual reader.
 After "ToPdf", the tree file contains an object of the
@@ -442,7 +442,7 @@ Type
 \verbatim
 grep Overall exp/mono/log/acc.{?,??}.{?,??}.log
 \endverbatim
-You can see the acoustic likelihods on each iteration.  Next look at one of the files
+You can see the acoustic likelihoods on each iteration.  Next look at one of the files
 exp/mono/log/update.*.log to see what kind of information is in the update log.
 
 When the monophone training is finished, we can test the monophone decoding. Before decoding, we have to create the decode graph. Type:
@@ -505,7 +505,7 @@ gmm-decode-faster
 \endverbatim
 to see the usage message, and match up the arguments with what you see in the log file.
 Recall that "rspecifier" is one of those strings that specifies how to read a table,
-and "wspecifier" specifies how to write one.  Look carefuly at these arguments and try
+and "wspecifier" specifies how to write one.  Look carefully at these arguments and try
 to figure out what they mean.  Look at the rspecifier that corresponds to the features, and
 try to understand it (this one has spaces inside, so Kaldi prints it out with single quotes
 around it so that you could paste it into the shell and the program would run as intended).
diff --git a/src/doc/versions.dox b/src/doc/versions.dox
index b26978b6e4d..08e2c2bbda7 100644
--- a/src/doc/versions.dox
+++ b/src/doc/versions.dox
@@ -28,7 +28,7 @@
 
    \section versions_scheme Versioning scheme
 
-     During its lifetime, Kaldi has has three different versioning methods.
+     During its lifetime, Kaldi has had three different versioning methods.
      Originally Kaldi was a subversion (svn)-based project, and was hosted
      on Sourceforge.  Then Kaldi was moved to github, and for some time the
      only version-number available was the git hash of the commit.
@@ -121,7 +121,7 @@
       - Create a nnet3-based setup for RNN language models (i.e. recurrent and neural net based
         language models)
       - Some extentions to the core of the nnet3 framework to support constant values and
-        scalar multiplication without dedicated compoennts.
+        scalar multiplication without dedicated components.
 
    Below are commits corresponding to minor version numbers 5.3.x.
 
diff --git a/src/fstext/determinize-lattice-inl.h b/src/fstext/determinize-lattice-inl.h
index 43ad809f70e..775228bfd21 100644
--- a/src/fstext/determinize-lattice-inl.h
+++ b/src/fstext/determinize-lattice-inl.h
@@ -510,7 +510,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
         if (!CheckMemoryUsage()) return false;
       }
       return (determinized_ = true);
-    } catch (std::bad_alloc) {
+    } catch (const std::bad_alloc &) {
       int32 repo_size = repository_.MemSize(),
           arcs_size = num_arcs_ * sizeof(TempArc),
           elems_size = num_elems_ * sizeof(Element),
@@ -520,7 +520,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
           << " (repo,arcs,elems) = ("
           << repo_size << "," << arcs_size << "," << elems_size << ")";
       return (determinized_ = false);
-    } catch (std::runtime_error) {
+    } catch (const std::runtime_error &) {
       KALDI_WARN << "Caught exception doing lattice determinization";
       return (determinized_ = false);
     }
diff --git a/src/fstext/lattice-weight.h b/src/fstext/lattice-weight.h
index af4826f7bed..86bec97d4e8 100644
--- a/src/fstext/lattice-weight.h
+++ b/src/fstext/lattice-weight.h
@@ -179,8 +179,7 @@ class LatticeWeightTpl {
     } else if (s == "-Infinity") {
       f = -numeric_limits<T>::infinity();
     } else if (s == "BadNumber") {
-      f = numeric_limits<T>::infinity();
-      f -= f; // get NaN
+      f = numeric_limits<T>::quiet_NaN();
     } else {
       char *p;
       f = strtod(s.c_str(), &p);
diff --git a/src/gmm/mle-diag-gmm.h b/src/gmm/mle-diag-gmm.h
index 24194ef886a..d41d36489bf 100644
--- a/src/gmm/mle-diag-gmm.h
+++ b/src/gmm/mle-diag-gmm.h
@@ -85,7 +85,7 @@ struct MapDiagGmmOptions {
   /// Tau value for the weights-- this tau value is applied
   /// per state, not per Gaussian.
   BaseFloat weight_tau;
-  
+
   MapDiagGmmOptions(): mean_tau(10.0),
                              variance_tau(50.0),
                              weight_tau(10.0) { }
@@ -150,8 +150,8 @@ class AccumDiagGmm {
       const MatrixBase<BaseFloat> &data,
       const VectorBase<BaseFloat> &frame_weights,
       int32 num_threads);
-  
-  
+
+
   /// Increment the stats for this component by the specified amount
   /// (not all parts may be taken, depending on flags).
   /// Note: x_stats and x2_stats are assumed to already be multiplied by "occ"
@@ -162,7 +162,7 @@ class AccumDiagGmm {
 
   /// Increment with stats from this other accumulator (times scale)
   void Add(double scale, const AccumDiagGmm &acc);
-  
+
   /// Smooths the accumulated counts by adding 'tau' extra frames. An example
   /// use for this is I-smoothing for MMIE.   Calls SmoothWithAccum.
   void SmoothStats(BaseFloat tau);
@@ -179,13 +179,13 @@ class AccumDiagGmm {
   void SmoothWithModel(BaseFloat tau, const DiagGmm &src_gmm);
 
   // Const accessors
-  const GmmFlagsType Flags() const { return flags_; }
+  GmmFlagsType Flags() const { return flags_; }
   const VectorBase<double> &occupancy() const { return occupancy_; }
   const MatrixBase<double> &mean_accumulator() const { return mean_accumulator_; }
   const MatrixBase<double> &variance_accumulator() const { return variance_accumulator_; }
 
   // used in testing.
-  void AssertEqual(const AccumDiagGmm &other); 
+  void AssertEqual(const AccumDiagGmm &other);
  private:
   int32 dim_;
   int32 num_comp_;
diff --git a/src/gmm/mle-full-gmm.h b/src/gmm/mle-full-gmm.h
index 6e770764e1e..618714b0e9b 100644
--- a/src/gmm/mle-full-gmm.h
+++ b/src/gmm/mle-full-gmm.h
@@ -1,7 +1,7 @@
 // gmm/mle-full-gmm.h
 
 // Copyright 2009-2011  Jan Silovsky;  Saarland University;
-//                      Microsoft Corporation; 
+//                      Microsoft Corporation;
 //                      Univ. Erlangen Nuremberg, Korbinian Riedhammer
 
 // See ../../COPYING for clarification regarding multiple authors
@@ -91,7 +91,7 @@ class AccumFullGmm {
   void Resize(int32 num_components, int32 dim, GmmFlagsType flags);
   /// Calls Resize with arguments based on gmm_ptr_
   void Resize(const FullGmm &gmm, GmmFlagsType flags);
-  
+
   void ResizeVarAccumulator(int32 num_comp, int32 dim);
   /// Returns the number of mixture components
   int32 NumGauss() const { return num_comp_; }
@@ -122,8 +122,8 @@ class AccumFullGmm {
                                const VectorBase<BaseFloat> &data,
                                BaseFloat frame_posterior);
 
-  /// Accessors  
-  const GmmFlagsType Flags() const { return flags_; }
+  /// Accessors
+  GmmFlagsType Flags() const { return flags_; }
   const Vector<double> &occupancy() const { return occupancy_; }
   const Matrix<double> &mean_accumulator() const { return mean_accumulator_; }
   const std::vector<SpMatrix<double> > &covariance_accumulator() const { return covariance_accumulator_; }
diff --git a/src/makefiles/cuda_64bit.mk b/src/makefiles/cuda_64bit.mk
index d66ae03602f..eb8cf743ab3 100644
--- a/src/makefiles/cuda_64bit.mk
+++ b/src/makefiles/cuda_64bit.mk
@@ -5,7 +5,7 @@ ifndef CUDATKDIR
 $(error CUDATKDIR not defined.)
 endif
 
-CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include -fPIC -pthread -isystem $(OPENFSTINC) -rdynamic
+CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include -fPIC -pthread -isystem $(OPENFSTINC)
 
 CUDA_INCLUDE= -I$(CUDATKDIR)/include -I$(CUBROOT)
 CUDA_FLAGS = --machine 64 -DHAVE_CUDA \
@@ -14,4 +14,4 @@ CUDA_FLAGS = --machine 64 -DHAVE_CUDA \
              --verbose -Xcompiler "$(CXXFLAGS)"
 
 CUDA_LDFLAGS += -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64
-CUDA_LDLIBS += -lcublas -lcusparse -lcudart -lcurand -lnvToolsExt #LDLIBS : The libs are loaded later than static libs in implicit rule
+CUDA_LDLIBS += -lcublas -lcusparse -lcudart -lcurand -lnvToolsExt #LDLIBS : The .so libs are loaded later than static libs in implicit rule
diff --git a/src/makefiles/default_rules.mk b/src/makefiles/default_rules.mk
index 25dafae2f3a..fcce90f5c21 100644
--- a/src/makefiles/default_rules.mk
+++ b/src/makefiles/default_rules.mk
@@ -125,7 +125,7 @@ valgrind: .valgrind
 #buid up dependency commands
 CC_SRCS=$(wildcard *.cc)
 #check if files exist to run dependency commands on
-ifneq ($(CC_SRCS),)										
+ifneq ($(CC_SRCS),)
 CC_DEP_COMMAND=$(CXX) -M $(CXXFLAGS) $(CC_SRCS)
 endif
 
diff --git a/src/makefiles/linux_x86_64_mkl.mk b/src/makefiles/linux_x86_64_mkl.mk
index 7a70fa51a65..d1c399d9796 100644
--- a/src/makefiles/linux_x86_64_mkl.mk
+++ b/src/makefiles/linux_x86_64_mkl.mk
@@ -22,7 +22,7 @@ ifndef MKLROOT
 $(error MKLROOT not defined.)
 endif
 
-MKLLIB ?= $(MKLROOT)/lib/em64t
+MKLLIB ?= $(MKLROOT)/lib/intel64
 
 CXXFLAGS = -std=c++11 -I.. -isystem $(OPENFSTINC) -O1 $(EXTRA_CXXFLAGS) \
            -Wall -Wno-sign-compare -Wno-unused-local-typedefs \
diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc
index 584a7c19ab8..a3696403eba 100644
--- a/src/nnet3/nnet-analyze.cc
+++ b/src/nnet3/nnet-analyze.cc
@@ -880,7 +880,7 @@ void ComputationChecker::CheckComputationIndexes() const {
           KALDI_ERR << "Backprop input needed but not supplied.";
         if ((properties & kBackpropNeedsOutput) && c.arg4 == 0)
           KALDI_ERR << "Backprop output needed but not supplied.";
-        if (c.arg6 == 0 && !(properties && kUpdatableComponent)) {
+        if (c.arg6 == 0 && !(properties & kUpdatableComponent)) {
           // note: we could perhaps make this just a warning,
           // or optimize it away somehow.
           KALDI_ERR << "Backprop is done but has no effect.";
diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc
index a798cb597f5..cccb1110d3c 100644
--- a/src/nnet3/nnet-chain-training.cc
+++ b/src/nnet3/nnet-chain-training.cc
@@ -298,7 +298,7 @@ void NnetChainTrainer::PrintMaxChangeStats() const {
                      (num_minibatches_processed_ *
                      (nnet_config.backstitch_training_scale == 0.0 ? 1.0 :
                      1.0 + 1.0 / nnet_config.backstitch_training_interval))
-                  << " \% of the time.";
+                  << " % of the time.";
       i++;
     }
   }
@@ -308,7 +308,7 @@ void NnetChainTrainer::PrintMaxChangeStats() const {
                  (num_minibatches_processed_ *
                  (nnet_config.backstitch_training_scale == 0.0 ? 1.0 :
                  1.0 + 1.0 / nnet_config.backstitch_training_interval))
-              << " \% of the time.";
+              << " % of the time.";
 }
 
 NnetChainTrainer::~NnetChainTrainer() {
diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index e8c99494b06..32f49745c0c 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -4068,13 +4068,13 @@ bool CompositeComponent::IsUpdatable() const {
 int32 CompositeComponent::InputDim() const {
   KALDI_ASSERT(!components_.empty());
   return components_.front()->InputDim();
-};
+}
 
 // virtual
 int32 CompositeComponent::OutputDim() const {
   KALDI_ASSERT(!components_.empty());
   return components_.back()->OutputDim();
-};
+}
 
 // virtual
 int32 CompositeComponent::Properties() const {
@@ -4096,7 +4096,7 @@ int32 CompositeComponent::Properties() const {
   if (last_component_properties & kStoresStats)
     ans |= kBackpropNeedsOutput;
   return ans;
-};
+}
 
 
 MatrixStrideType CompositeComponent::GetStrideType(int32 i) const {
@@ -4319,7 +4319,7 @@ void CompositeComponent::Backprop(const std::string &debug_info,
       // optimization; other propagates might also be skippable.
       int32 properties = components_[num_components - 2]->Properties(),
           next_properties = components_[num_components - 1]->Properties();
-      if (!(properties & (kBackpropNeedsOutput || kUsesMemo)) &&
+      if (!(properties & (kBackpropNeedsOutput | kUsesMemo)) &&
           !(next_properties & kBackpropNeedsInput)) {
         num_components_to_propagate--;
       }
diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc
index 0acaa5c2008..820644470c7 100644
--- a/src/nnet3/nnet-training.cc
+++ b/src/nnet3/nnet-training.cc
@@ -257,7 +257,7 @@ void NnetTrainer::PrintMaxChangeStats() const {
                      (num_minibatches_processed_ *
                      (config_.backstitch_training_scale == 0.0 ? 1.0 :
                      1.0 + 1.0 / config_.backstitch_training_interval))
-                  << " \% of the time.";
+                  << " % of the time.";
       i++;
     }
   }
@@ -267,7 +267,7 @@ void NnetTrainer::PrintMaxChangeStats() const {
                  (num_minibatches_processed_ *
                  (config_.backstitch_training_scale == 0.0 ? 1.0 :
                  1.0 + 1.0 / config_.backstitch_training_interval))
-              << " \% of the time.";
+              << " % of the time.";
 }
 
 void ObjectiveFunctionInfo::UpdateStats(
diff --git a/src/probe/README.slow_expf b/src/probe/README.slow_expf
index 00c9ce5be09..c20386b8137 100644
--- a/src/probe/README.slow_expf
+++ b/src/probe/README.slow_expf
@@ -1,5 +1,6 @@
-On some machines, expf() turns out to be very slow: much slower than its double precision counterpart exp().
-Probably this is concerned with the version of glibc.
+On some machines, expf() turns out to be very slow: much slower than its double
+precision counterpart exp().  This is probably related to the version of
+glibc.
 
 Here are a couple of examples:
 
@@ -21,5 +22,7 @@ configuration$ ./exp-test
 exp() time: 0.0028439
 expf() time: 0.00713329
 
-If slow behaviour is detected, then KALDI_NO_EXPF macro will be used, and the Exp() wrapper in base/kaldi-math.h will use exp() even for single precision floats.
-The behaviour of expf() is considered to be slow if it is slower than exp() by at least 10%.
\ No newline at end of file
+If slow behaviour is detected, then KALDI_NO_EXPF macro will be used, and the
+Exp() wrapper in base/kaldi-math.h will use exp() even for single precision
+floats.  The behaviour of expf() is considered to be slow if it is slower than
+exp() by at least 10%.
diff --git a/src/probe/exp-test.cc b/src/probe/exp-test.cc
index 1fd8a64c6a6..d6cc76d4ce2 100644
--- a/src/probe/exp-test.cc
+++ b/src/probe/exp-test.cc
@@ -17,35 +17,52 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
+// Read Makefile.slow_expf. This test must be compiled with -O0.
+
 #include <iostream>
 #include <cmath>
 #include "base/timer.h"
 
-#define SAMPLE 100000
+int main() {
+  int test_iter = 300000;
+
+  // Make sure that the CPU bumps its clock to full speed: run the first loop
+  // without timing. Then increase the sample iteration count exponentially
+  // until the loop takes at least 10ms. We run this loop 1/4 of the number of
+  // actual test iterations and call both exp() and expf(), so that the overall
+  // test run will take 20 to 60 ms, to ensure a sensibly measurable result.
+  for (bool first = true; ; first=false) {
+    kaldi::Timer timer;
+    for(int i = 0; i < test_iter; i += 4) {
+      (void)exp((double)(i & 0x0F));
+      (void)expf((double)(i & 0x0F));
+    }
+    double time = timer.Elapsed();
+    if (first) continue;
+    if (time > 0.01) break;
+    test_iter *= 3;
+  }
 
-int main() { 
-  float dummy = 0.0;
   kaldi::Timer exp_timer;
-  for(int i = 0; i < SAMPLE; ++i) {
-    dummy += exp((double)(i % 10));
+  for(int i = 0; i < test_iter; ++i) {
+    (void)exp((double)(i & 0x0F));
   }
   double exp_time = exp_timer.Elapsed();
 
   kaldi::Timer expf_timer;
-  for(int i = 0; i < SAMPLE; ++i) {
-    dummy += expf((double)(i % 10));
+  for(int i = 0; i < test_iter; ++i) {
+    (void)expf((double)(i & 0x0F));
   }
   double expf_time = expf_timer.Elapsed();
-  
-  // Often exp() and expf() perform very similarly, 
-  // so we will replace expf() by exp() only if there is at least 10% difference 
-  if (expf_time < exp_time * 1.1) { 
+
+  double ratio = expf_time / exp_time;
+  if (ratio < 1.1) {
+    // Often exp() and expf() perform very similarly, so we will replace expf()
+    // by exp() only if there is at least 10% difference.
     return 0;
-  } else {
-    std::cerr << "exp() time: " << exp_time << std::endl;
-    std::cerr << "expf() time: " << expf_time << std::endl;
-    return 1;
   }
-  
-  std::cerr << dummy << std::endl; // No complaint about the unused variable
+
+  std::cerr << ("WARNING: slow expf() detected. expf() is slower than exp() "
+                "by the factor of ") << ratio << "\n";
+  return 1;
 }
diff --git a/src/rnnlm/rnnlm-core-training.cc b/src/rnnlm/rnnlm-core-training.cc
index 5a1ae97895f..d1a01f7ef66 100644
--- a/src/rnnlm/rnnlm-core-training.cc
+++ b/src/rnnlm/rnnlm-core-training.cc
@@ -302,7 +302,7 @@ void RnnlmCoreTrainer::PrintMaxChangeStats() const {
                   << ", per-component max-change was enforced "
                   << ((100.0 * num_max_change_per_component_applied_[i]) /
                       num_minibatches_processed_)
-                  << "\% of the time.";
+                  << "% of the time.";
       i++;
     }
   }
@@ -312,7 +312,7 @@ void RnnlmCoreTrainer::PrintMaxChangeStats() const {
                  (num_minibatches_processed_ *
                  (config_.backstitch_training_scale == 0.0 ? 1.0 :
                  1.0 + 1.0 / config_.backstitch_training_interval))
-              << "\% of the time.";
+              << "% of the time.";
 }
 
 void RnnlmCoreTrainer::ProcessOutput(
diff --git a/src/rnnlm/rnnlm-embedding-training.cc b/src/rnnlm/rnnlm-embedding-training.cc
index c4238c7356a..0b5916b6bba 100644
--- a/src/rnnlm/rnnlm-embedding-training.cc
+++ b/src/rnnlm/rnnlm-embedding-training.cc
@@ -117,9 +117,9 @@ void RnnlmEmbeddingTrainer::TrainBackstitch(
     bool is_backstitch_step1,
     CuMatrixBase<BaseFloat> *embedding_deriv) {
 
-  // backstitch training is incompatible with momentum > 0  
+  // backstitch training is incompatible with momentum > 0
   KALDI_ASSERT(config_.momentum == 0.0);
-  
+
   // If relevant, do the following:
   // "embedding_deriv += - 2 * l2_regularize * embedding_mat_"
   // This is an approximate to the regular l2 regularization (add l2 regularization
@@ -130,7 +130,7 @@ void RnnlmEmbeddingTrainer::TrainBackstitch(
       embedding_deriv->AddMat(1.0 / (1.0 + config_.backstitch_training_scale) *
           l2_term, *embedding_mat_);
     }
-  } 
+  }
 
   BaseFloat scale = 1.0;
   if (config_.use_natural_gradient) {
@@ -213,7 +213,7 @@ void RnnlmEmbeddingTrainer::Train(
 }
 
 void RnnlmEmbeddingTrainer::TrainBackstitch(
-    bool is_backstitch_step1, 
+    bool is_backstitch_step1,
     const CuArrayBase<int32> &active_words,
     CuMatrixBase<BaseFloat> *embedding_deriv) {
 
@@ -232,7 +232,7 @@ void RnnlmEmbeddingTrainer::TrainBackstitch(
       embedding_deriv->AddRows(l2_term / (1.0 + config_.backstitch_training_scale),
                                *embedding_mat_, active_words);
     }
-  } 
+  }
   BaseFloat scale = 1.0;
   if (config_.use_natural_gradient) {
     if (is_backstitch_step1) preconditioner_.Freeze(true);
@@ -273,7 +273,7 @@ void RnnlmEmbeddingTrainer::PrintStats() {
                (num_minibatches_ *
                (config_.backstitch_training_scale == 0.0 ? 1.0 :
                1.0 + 1.0 / config_.backstitch_training_interval))
-            << " \% of the time.";
+            << " % of the time.";
 
   Matrix<BaseFloat> delta_embedding_mat(*embedding_mat_);
   delta_embedding_mat.AddMat(-1.0, initial_embedding_mat_);
diff --git a/src/tree/build-tree-questions.h b/src/tree/build-tree-questions.h
index a6bcfdd500b..22f12d62912 100644
--- a/src/tree/build-tree-questions.h
+++ b/src/tree/build-tree-questions.h
@@ -52,7 +52,7 @@ struct QuestionsForKey {  // Configuration class associated with a particular ke
   std::vector<std::vector<EventValueType> > initial_questions;
   RefineClustersOptions refine_opts;  // if refine_opts.max_iter == 0,
   // we just pick from the initial questions.
-  
+
   QuestionsForKey(int32 num_iters = 5): refine_opts(num_iters, 2) {
     // refine_cfg with 5 iters and top-n = 2 (this is no restriction because
     // RefineClusters called with 2 clusters; would get set to that anyway as
@@ -102,7 +102,9 @@ class Questions {  // careful, this is a class.
     KALDI_ASSERT(keys_out != NULL);
     CopyMapKeysToVector(key_idx_, keys_out);
   }
-  const bool HasQuestionsForKey(EventKeyType key) const { return (key_idx_.count(key) != 0); }
+  bool HasQuestionsForKey(EventKeyType key) const {
+    return (key_idx_.count(key) != 0);
+  }
   ~Questions() { kaldi::DeletePointers(&key_options_); }
 
 
diff --git a/src/util/kaldi-pipebuf.h b/src/util/kaldi-pipebuf.h
index 9b83cdccc3d..61034ac2757 100644
--- a/src/util/kaldi-pipebuf.h
+++ b/src/util/kaldi-pipebuf.h
@@ -82,7 +82,6 @@ class basic_pipebuf : public std::basic_filebuf<CharType, Traits> {
 };  // class basic_pipebuf
 #endif  // _MSC_VER
 
-};  // namespace kaldi
+}  // namespace kaldi
 
 #endif  // KALDI_UTIL_KALDI_PIPEBUF_H_
-

From f28516a56c618021d817eda23eeb30bbe82abd01 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Fri, 15 Mar 2019 15:55:26 -0400
Subject: [PATCH 003/163] [src] Add Vector strides, beginning draft of tensor
 stuff (#3120)

---
 src/matrix/kaldi-matrix.cc    |  56 ++++----
 src/matrix/kaldi-matrix.h     |  26 ++--
 src/matrix/kaldi-vector.cc    | 245 ++++++++++++++++++----------------
 src/matrix/kaldi-vector.h     |  90 ++++++++-----
 src/matrix/matrix-lib-test.cc |   4 +-
 src/matrix/sp-matrix.cc       |   7 +-
 src/tensor/tensor.h           | 187 ++++++++++++++++++++++++++
 7 files changed, 427 insertions(+), 188 deletions(-)
 create mode 100644 src/tensor/tensor.h

diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc
index fcfe0616b64..d70ac5cefc8 100644
--- a/src/matrix/kaldi-matrix.cc
+++ b/src/matrix/kaldi-matrix.cc
@@ -28,7 +28,7 @@
 #include "matrix/compressed-matrix.h"
 #include "matrix/sparse-matrix.h"
 
-static_assert(int(kaldi::kNoTrans) == int(CblasNoTrans) && int(kaldi::kTrans) == int(CblasTrans), 
+static_assert(int(kaldi::kNoTrans) == int(CblasNoTrans) && int(kaldi::kTrans) == int(CblasTrans),
     "kaldi::kNoTrans and kaldi::kTrans must be equal to the appropriate CBLAS library constants!");
 
 namespace kaldi {
@@ -117,10 +117,10 @@ template<>
 template<>
 void MatrixBase<float>::AddVecVec(const float alpha,
                                   const VectorBase<float> &a,
-                                  const VectorBase<float> &rb) {
-  KALDI_ASSERT(a.Dim() == num_rows_ && rb.Dim() == num_cols_);
-  cblas_Xger(a.Dim(), rb.Dim(), alpha, a.Data(), 1, rb.Data(),
-             1, data_, stride_);
+                                  const VectorBase<float> &b) {
+  KALDI_ASSERT(a.Dim() == num_rows_ && b.Dim() == num_cols_);
+  cblas_Xger(a.Dim(), b.Dim(), alpha, a.Data(), a.Stride(),
+             b.Data(), b.Stride(), data_, stride_);
 }
 
 template<typename Real>
@@ -132,15 +132,18 @@ void MatrixBase<Real>::AddVecVec(const Real alpha,
   if (num_rows_ * num_cols_ > 100) { // It's probably worth it to allocate
     // temporary vectors of the right type and use BLAS.
     Vector<Real> temp_a(a), temp_b(b);
-    cblas_Xger(num_rows_, num_cols_, alpha, temp_a.Data(), 1,
-               temp_b.Data(), 1, data_, stride_);
+    cblas_Xger(num_rows_, num_cols_, alpha,
+               temp_a.Data(), temp_a.Stride(),
+               temp_b.Data(), temp_b.Stride(),
+               data_, stride_);
   } else {
     const OtherReal *a_data = a.Data(), *b_data = b.Data();
+    MatrixIndexT a_stride = a.Stride(), b_stride = b.Stride();
     Real *row_data = data_;
     for (MatrixIndexT i = 0; i < num_rows_; i++, row_data += stride_) {
-      BaseFloat alpha_ai = alpha * a_data[i];
+      BaseFloat alpha_ai = alpha * a_data[i * a_stride];
       for (MatrixIndexT j = 0; j < num_cols_; j++)
-        row_data[j] += alpha_ai * b_data[j];
+        row_data[j] += alpha_ai * b_data[j * b_stride];
     }
   }
 }
@@ -159,11 +162,11 @@ template<>
 template<>
 void MatrixBase<double>::AddVecVec(const double alpha,
                                    const VectorBase<double> &a,
-                                   const VectorBase<double> &rb) {
-  KALDI_ASSERT(a.Dim() == num_rows_ && rb.Dim() == num_cols_);
+                                   const VectorBase<double> &b) {
+  KALDI_ASSERT(a.Dim() == num_rows_ && b.Dim() == num_cols_);
   if (num_rows_ == 0) return;
-  cblas_Xger(a.Dim(), rb.Dim(), alpha, a.Data(), 1, rb.Data(),
-             1, data_, stride_);
+  cblas_Xger(a.Dim(), b.Dim(), alpha, a.Data(), a.Stride(),
+             b.Data(), b.Stride(), data_, stride_);
 }
 
 template<typename Real>
@@ -538,7 +541,7 @@ void MatrixBase<Real>::AddMatSmat(Real alpha, const MatrixBase<Real> &A,
         // pass stride to write a column as matrices are stored in row major order.
         cblas_Xaxpy(this_num_rows, alpha_B_jk, a_col_k, A.stride_,
                     this_col_j, this->stride_);
-        //for (MatrixIndexT i = 0; i < this_num_rows; ++i) 
+        //for (MatrixIndexT i = 0; i < this_num_rows; ++i)
         // this_col_j[i*this->stride_] +=  alpha_B_jk * a_col_k[i*A.stride_];
       }
     }
@@ -591,8 +594,10 @@ void MatrixBase<Real>::AddDiagVecMat(
   if (transM == kTrans) std::swap(M_row_stride, M_col_stride);
   Real *data = data_;
   const Real *Mdata = M.Data(), *vdata = v.Data();
+  MatrixIndexT v_stride = v.Stride();
   if (num_rows_ == 0) return;
-  for (MatrixIndexT i = 0; i < num_rows; i++, data += stride, Mdata += M_row_stride, vdata++)
+  for (MatrixIndexT i = 0; i < num_rows;
+       i++, data += stride, Mdata += M_row_stride, vdata += v_stride)
     cblas_Xaxpy(num_cols, alpha * *vdata, Mdata, M_col_stride, data, 1);
 }
 
@@ -623,10 +628,11 @@ void MatrixBase<Real>::AddMatDiagVec(
 
   Real *data = data_;
   const Real *Mdata = M.Data(), *vdata = v.Data();
+  MatrixIndexT v_stride = v.Stride();
   if (num_rows_ == 0) return;
   for (MatrixIndexT i = 0; i < num_rows; i++){
       for(MatrixIndexT j = 0; j < num_cols; j ++ ){
-          data[i*stride + j] += alpha * vdata[j] * Mdata[i*M_row_stride + j*M_col_stride];
+          data[i*stride + j] += alpha * vdata[j * v_stride] * Mdata[i*M_row_stride + j*M_col_stride];
       }
   }
 }
@@ -658,7 +664,8 @@ void MatrixBase<Real>::AddMatMatElements(const Real alpha,
 template<typename Real>
 void MatrixBase<Real>::LapackGesvd(VectorBase<Real> *s, MatrixBase<Real> *U_in,
                                    MatrixBase<Real> *V_in) {
-  KALDI_ASSERT(s != NULL && U_in != this && V_in != this);
+  KALDI_ASSERT(s != NULL && U_in != this && V_in != this &&
+               s->Stride() == 1);
 
   Matrix<Real> tmpU, tmpV;
   if (U_in == NULL) tmpU.Resize(this->num_rows_, 1);  // work-space if U_in empty.
@@ -1786,7 +1793,7 @@ void MatrixBase<Real>::DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
   // Throws exception on error.
 
   KALDI_ASSERT(num_rows_>=num_cols_ && "Svd requires that #rows by >= #cols.");  // For compatibility with JAMA code.
-  KALDI_ASSERT(s->Dim() == num_cols_);  // s should be the smaller dim.
+  KALDI_ASSERT(s->Dim() == num_cols_ && s->Stride() == 1);  // s should be the smaller dim.
   KALDI_ASSERT(U == NULL || (U->num_rows_ == num_rows_&&U->num_cols_ == num_cols_));
   KALDI_ASSERT(Vt == NULL || (Vt->num_rows_ == num_cols_&&Vt->num_cols_ == num_cols_));
 
@@ -1992,27 +1999,28 @@ void MatrixBase<Real>::OrthogonalizeRows() {
 // symmetric positive definite).
 
 template<typename Real>
-void MatrixBase<Real>::SymPosSemiDefEig(VectorBase<Real> *rs, MatrixBase<Real> *rU, Real check_thresh) // e.g. check_thresh = 0.001
+void MatrixBase<Real>::SymPosSemiDefEig(VectorBase<Real> *s, MatrixBase<Real> *U, Real check_thresh) // e.g. check_thresh = 0.001
 {
   const MatrixIndexT D = num_rows_;
 
   KALDI_ASSERT(num_rows_ == num_cols_);
   KALDI_ASSERT(IsSymmetric() && "SymPosSemiDefEig: expecting input to be symmetrical.");
-  KALDI_ASSERT(rU->num_rows_ == D && rU->num_cols_ == D && rs->Dim() == D);
+  KALDI_ASSERT(U->num_rows_ == D && U->num_cols_ == D && s->Dim() == D &&
+               s->Stride() == 1);
 
   Matrix<Real>  Vt(D, D);
-  Svd(rs, rU, &Vt);
+  Svd(s, U, &Vt);
 
   // First just zero any singular values if the column of U and V do not have +ve dot product--
   // this may mean we have small negative eigenvalues, and if we zero them the result will be closer to correct.
   for (MatrixIndexT i = 0;i < D;i++) {
     Real sum = 0.0;
-    for (MatrixIndexT j = 0;j < D;j++) sum += (*rU)(j, i) * Vt(i, j);
-    if (sum < 0.0) (*rs)(i) = 0.0;
+    for (MatrixIndexT j = 0;j < D;j++) sum += (*U)(j, i) * Vt(i, j);
+    if (sum < 0.0) (*s)(i) = 0.0;
   }
 
   {
-    Matrix<Real> tmpU(*rU); Vector<Real> tmps(*rs); tmps.ApplyPow(0.5);
+    Matrix<Real> tmpU(*U); Vector<Real> tmps(*s); tmps.ApplyPow(0.5);
     tmpU.MulColsVec(tmps);
     SpMatrix<Real> tmpThis(D);
     tmpThis.AddMat2(1.0, tmpU, kNoTrans, 0.0);
diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h
index 11a5e08b15d..4b06a22ece9 100644
--- a/src/matrix/kaldi-matrix.h
+++ b/src/matrix/kaldi-matrix.h
@@ -59,13 +59,13 @@ class MatrixBase {
   friend class SparseMatrix<float>;
   friend class SparseMatrix<double>;
 
-  /// Returns number of rows (or zero for emtpy matrix).
+  /// Returns number of rows (or zero for empty matrix).
   inline MatrixIndexT  NumRows() const { return num_rows_; }
 
   /// Returns number of columns (or zero for emtpy matrix).
   inline MatrixIndexT NumCols() const { return num_cols_; }
 
-  /// Stride (distance in memory between each row).  Will be >= NumCols.
+  /// Stride (distance in memory between each row).  Must be >= NumCols().
   inline MatrixIndexT Stride() const {  return stride_; }
 
   /// Returns size in bytes of the data held by the matrix.
@@ -183,18 +183,20 @@ class MatrixBase {
 
   /* Accessing of sub-parts of the matrix. */
 
-  /// Return specific row of matrix [const].
-  inline const SubVector<Real> Row(MatrixIndexT i) const {
+  /// Return specific row of matrix.  Warning: this can get
+  /// around const constraints.
+  inline SubVector<Real> Row(MatrixIndexT i) const {
     KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(num_rows_));
-    return SubVector<Real>(data_ + (i * stride_), NumCols());
+    return SubVector<Real>(data_ + (i * stride_), num_cols_);
   }
 
-  /// Return specific row of matrix.
-  inline SubVector<Real> Row(MatrixIndexT i) {
+  /// Return specific column of matrix.  Warning: this can get
+  /// around const constraints.
+  inline const SubVector<Real> Col(MatrixIndexT i) const {
     KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
-                 static_cast<UnsignedMatrixIndexT>(num_rows_));
-    return SubVector<Real>(data_ + (i * stride_), NumCols());
+                 static_cast<UnsignedMatrixIndexT>(num_cols_));
+    return SubVector<Real>(data_ + i, num_rows_, stride_);
   }
 
   /// Return a sub-part of matrix.
@@ -406,7 +408,9 @@ class MatrixBase {
      Null pointers for U and/or Vt at input mean we do not want that output.  We
      expect that S.Dim() == m, U is either NULL or m by n,
      and v is either NULL or n by n.
-     The singular values are not sorted (use SortSvd for that).  */
+     The singular values are not sorted (use SortSvd for that).
+     Requires that s->Stride() == 1.
+  */
   void DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
                       MatrixBase<Real> *Vt);  // Destroys calling matrix.
 
@@ -414,6 +418,7 @@ class MatrixBase {
   /// transposed; the normal formulation is U diag(s) V^T.
   /// Null pointers for U or V mean we don't want that output (this saves
   /// compute).  The singular values are not sorted (use SortSvd for that).
+  /// Requires that s->Stride() == 1.
   void Svd(VectorBase<Real> *s, MatrixBase<Real> *U,
            MatrixBase<Real> *Vt) const;
   /// Compute SVD but only retain the singular values.
@@ -531,6 +536,7 @@ class MatrixBase {
    * positive semi-definite (check_thresh controls how stringent the check is;
    * set it to 2 to ensure it won't ever complain, but it will zero out negative
    * dimensions in your matrix.
+   * Requires s->Stride() == 1.
   */
   void SymPosSemiDefEig(VectorBase<Real> *s, MatrixBase<Real> *P,
                         Real check_thresh = 0.001);
diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc
index c8ea35112ea..84ff7a9d768 100644
--- a/src/matrix/kaldi-vector.cc
+++ b/src/matrix/kaldi-vector.cc
@@ -33,12 +33,23 @@
 
 namespace kaldi {
 
+
+template<typename Real> inline const Real* Get64Ones() {
+  // The C++ standard doesn't seem to provide a compact way to do this.
+  static const Real ones[64] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+  return ones;
+}
+
+
 template<typename Real>
 Real VecVec(const VectorBase<Real> &a,
             const VectorBase<Real> &b) {
   MatrixIndexT adim = a.Dim();
   KALDI_ASSERT(adim == b.Dim());
-  return cblas_Xdot(adim, a.Data(), 1, b.Data(), 1);
+  return cblas_Xdot(adim, a.Data(), a.Stride(), b.Data(), b.Stride());
 }
 
 template
@@ -49,25 +60,26 @@ double VecVec<>(const VectorBase<double> &a,
                 const VectorBase<double> &b);
 
 template<typename Real, typename OtherReal>
-Real VecVec(const VectorBase<Real> &ra,
-            const VectorBase<OtherReal> &rb) {
-  MatrixIndexT adim = ra.Dim();
-  KALDI_ASSERT(adim == rb.Dim());
-  const Real *a_data = ra.Data();
-  const OtherReal *b_data = rb.Data();
+Real VecVec(const VectorBase<Real> &a,
+            const VectorBase<OtherReal> &b) {
+  MatrixIndexT adim = a.Dim();
+  KALDI_ASSERT(adim == b.Dim());
+  const Real *a_data = a.Data();
+  const OtherReal *b_data = b.Data();
+  MatrixIndexT a_stride = a.Stride(), b_stride = b.Stride();
   Real sum = 0.0;
   for (MatrixIndexT i = 0; i < adim; i++)
-    sum += a_data[i]*b_data[i];
+    sum += a_data[i * a_stride] * b_data[i * b_stride];
   return sum;
 }
 
 // instantiate the template above.
 template
-float VecVec<>(const VectorBase<float> &ra,
-               const VectorBase<double> &rb);
+float VecVec<>(const VectorBase<float> &a,
+               const VectorBase<double> &b);
 template
-double VecVec<>(const VectorBase<double> &ra,
-                const VectorBase<float> &rb);
+double VecVec<>(const VectorBase<double> &a,
+                const VectorBase<float> &b);
 
 
 template<>
@@ -76,7 +88,7 @@ void VectorBase<float>::AddVec(const float alpha,
                                const VectorBase<float> &v) {
   KALDI_ASSERT(dim_ == v.dim_);
   KALDI_ASSERT(&v != this);
-  cblas_Xaxpy(dim_, alpha, v.Data(), 1, data_, 1);
+  cblas_Xaxpy(dim_, alpha, v.Data(), v.stride_, data_, stride_);
 }
 
 template<>
@@ -85,7 +97,7 @@ void VectorBase<double>::AddVec(const double alpha,
                                 const VectorBase<double> &v) {
   KALDI_ASSERT(dim_ == v.dim_);
   KALDI_ASSERT(&v != this);
-  cblas_Xaxpy(dim_, alpha, v.Data(), 1, data_, 1);
+  cblas_Xaxpy(dim_, alpha, v.Data(), v.stride_, data_, stride_);
 }
 
 template<typename Real>
@@ -98,7 +110,7 @@ void VectorBase<Real>::AddMatVec(const Real alpha,
                || (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_));
   KALDI_ASSERT(&v != this);
   cblas_Xgemv(trans, M.NumRows(), M.NumCols(), alpha, M.Data(), M.Stride(),
-              v.Data(), 1, beta, data_, 1);
+              v.Data(), v.stride_, beta, data_, stride_);
 }
 
 template<typename Real>
@@ -111,40 +123,19 @@ void VectorBase<Real>::AddMatSvec(const Real alpha,
                || (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_));
   KALDI_ASSERT(&v != this);
   Xgemv_sparsevec(trans, M.NumRows(), M.NumCols(), alpha, M.Data(), M.Stride(),
-                  v.Data(), 1, beta, data_, 1);
+                  v.Data(), v.stride_, beta, data_, stride_);
   return;
-  /*
-  MatrixIndexT this_dim = this->dim_, v_dim = v.dim_,
-      M_stride = M.Stride();
-  Real *this_data = this->data_;
-  const Real *M_data = M.Data(), *v_data = v.data_;
-  if (beta != 1.0) this->Scale(beta);
-  if (trans == kNoTrans) {
-    for (MatrixIndexT i = 0; i < v_dim; i++) {
-      Real v_i = v_data[i];
-      if (v_i == 0.0) continue;
-      // Add to *this, the i'th column of the Matrix, times v_i.
-      cblas_Xaxpy(this_dim, v_i * alpha, M_data + i, M_stride, this_data, 1);
-    }
-  } else { // The transposed case is slightly more efficient, I guess.
-    for (MatrixIndexT i = 0; i < v_dim; i++) {
-      Real v_i = v.data_[i];
-      if (v_i == 0.0) continue;
-      // Add to *this, the i'th row of the Matrix, times v_i.
-      cblas_Xaxpy(this_dim, v_i * alpha,
-                  M_data + (i * M_stride), 1, this_data, 1);
-    }
-    }*/
 }
 
 template<typename Real>
 void VectorBase<Real>::AddSpVec(const Real alpha,
-                                 const SpMatrix<Real> &M,
-                                 const VectorBase<Real> &v,
-                                 const Real beta) {
+                                const SpMatrix<Real> &M,
+                                const VectorBase<Real> &v,
+                                const Real beta) {
   KALDI_ASSERT(M.NumRows() == v.dim_ && dim_ == v.dim_);
   KALDI_ASSERT(&v != this);
-  cblas_Xspmv(alpha, M.NumRows(), M.Data(), v.Data(), 1, beta, data_, 1);
+  cblas_Xspmv(alpha, M.NumRows(), M.Data(), v.Data(), v.stride_,
+              beta, data_, stride_);
 }
 
 
@@ -152,19 +143,20 @@ template<typename Real>
 void VectorBase<Real>::MulTp(const TpMatrix<Real> &M,
                               const MatrixTransposeType trans) {
   KALDI_ASSERT(M.NumRows() == dim_);
-  cblas_Xtpmv(trans,M.Data(),M.NumRows(),data_,1);
+  cblas_Xtpmv(trans, M.Data(), M.NumRows(), data_, stride_);
 }
 
 template<typename Real>
 void VectorBase<Real>::Solve(const TpMatrix<Real> &M,
                         const MatrixTransposeType trans) {
   KALDI_ASSERT(M.NumRows() == dim_);
-  cblas_Xtpsv(trans, M.Data(), M.NumRows(), data_, 1);
+  cblas_Xtpsv(trans, M.Data(), M.NumRows(), data_, stride_);
 }
 
 
 template<typename Real>
 inline void Vector<Real>::Init(const MatrixIndexT dim) {
+  this->stride_ = 1;
   KALDI_ASSERT(dim >= 0);
   if (dim == 0) {
     this->dim_ = 0;
@@ -188,7 +180,6 @@ inline void Vector<Real>::Init(const MatrixIndexT dim) {
 
 template<typename Real>
 void Vector<Real>::Resize(const MatrixIndexT dim, MatrixResizeType resize_type) {
-
   // the next block uses recursion to handle what we have to do if
   // resize_type == kCopyData.
   if (resize_type == kCopyData) {
@@ -244,12 +235,6 @@ template void VectorBase<float>::CopyFromPacked(const PackedMatrix<float> &other
 template void VectorBase<double>::CopyFromPacked(const PackedMatrix<double> &other);
 template void VectorBase<double>::CopyFromPacked(const PackedMatrix<float> &other);
 
-/// Load data into the vector
-template<typename Real>
-void VectorBase<Real>::CopyFromPtr(const Real *data, MatrixIndexT sz) {
-  KALDI_ASSERT(dim_ == sz);
-  std::memcpy(this->data_, data, Dim() * sizeof(Real));
-}
 
 template<typename Real>
 template<typename OtherReal>
@@ -264,7 +249,7 @@ void VectorBase<Real>::CopyFromVec(const VectorBase<OtherReal> &other) {
 template void VectorBase<float>::CopyFromVec(const VectorBase<double> &other);
 template void VectorBase<double>::CopyFromVec(const VectorBase<float> &other);
 
-// Remove element from the vector. The vector is non reallocated
+// Remove element from the vector. The vector is not reallocated
 template<typename Real>
 void Vector<Real>::RemoveElement(MatrixIndexT i) {
   KALDI_ASSERT(i <  this->dim_ && "Access out of vector");
@@ -678,14 +663,15 @@ void VectorBase<double>::CopyColFromMat(const MatrixBase<double> &mat, MatrixInd
 template<typename Real>
 void VectorBase<Real>::CopyDiagFromMat(const MatrixBase<Real> &M) {
   KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols()));
-  cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, 1);
+  cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, stride_);
 }
 
 template<typename Real>
 void VectorBase<Real>::CopyDiagFromPacked(const PackedMatrix<Real> &M) {
   KALDI_ASSERT(dim_ == M.NumCols());
-  for (MatrixIndexT i = 0; i < dim_; i++)
-    data_[i] = M(i, i);
+  MatrixIndexT stride = stride_, dim = dim_;
+  for (MatrixIndexT i = 0; i < dim; i++)
+    data_[i * stride] = M(i, i);
   // could make this more efficient.
 }
 
@@ -695,15 +681,16 @@ Real VectorBase<Real>::Sum() const {
   // implement sum. This allows us to access SIMD operations in a
   // cross-platform way via your BLAS library.
   Real one(1);
-  return cblas_Xdot(dim_, data_, 1, &one, 0);
+  return cblas_Xdot(dim_, data_, stride_, &one, 0);
 }
 
 template<typename Real>
 Real VectorBase<Real>::SumLog() const {
   double sum_log = 0.0;
   double prod = 1.0;
-  for (MatrixIndexT i = 0; i < dim_; i++) {
-    prod *= data_[i];
+  MatrixIndexT dim = dim_, stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    prod *= data_[i * stride];
     // Possible future work (arnab): change these magic values to pre-defined
     // constants
     if (prod < 1.0e-10 || prod > 1.0e+10) {
@@ -716,43 +703,45 @@ Real VectorBase<Real>::SumLog() const {
 }
 
 template<typename Real>
-void VectorBase<Real>::AddRowSumMat(Real alpha, const MatrixBase<Real> &M, Real beta) {
+void VectorBase<Real>::AddRowSumMat(Real alpha, const MatrixBase<Real> &M,
+                                    Real beta) {
   KALDI_ASSERT(dim_ == M.NumCols());
-  MatrixIndexT num_rows = M.NumRows(), stride = M.Stride(), dim = dim_;
-  Real *data = data_;
-
-  // implement the function according to a dimension cutoff for computation efficiency
-  if (num_rows <= 64) {
-    cblas_Xscal(dim, beta, data, 1);
-    const Real *m_data = M.Data();
-    for (MatrixIndexT i = 0; i < num_rows; i++, m_data += stride)
-      cblas_Xaxpy(dim, alpha, m_data, 1, data, 1);
-
-  } else {
-    Vector<Real> ones(M.NumRows());
-    ones.Set(1.0);
-    this->AddMatVec(alpha, M, kTrans, ones, beta);
+  // The BLAS standard does not support vectors with stride zero, even though
+  // some implementations (such as Apple's Accelerate framework and, I believe,
+  // cuBLAS) seem to allow it.  We compile a fixed-size (64) vector of ones
+  // into the program and process M in blocks of at most 64 rows.
+  const Real *ones = Get64Ones<Real>();
+
+  MatrixIndexT num_rows = M.NumRows();
+  for (MatrixIndexT row_offset = 0; row_offset < num_rows; row_offset += 64) {
+    MatrixIndexT this_num_rows =
+        std::min<MatrixIndexT>(64, num_rows - row_offset);
+    cblas_Xgemv(kTrans, this_num_rows, M.NumCols(), alpha,
+                M.RowData(row_offset), M.Stride(), ones, 1,
+                beta, data_, stride_);
+    beta = 1.0;
   }
 }
 
+
 template<typename Real>
-void VectorBase<Real>::AddColSumMat(Real alpha, const MatrixBase<Real> &M, Real beta) {
+void VectorBase<Real>::AddColSumMat(Real alpha, const MatrixBase<Real> &M,
+                                    Real beta) {
   KALDI_ASSERT(dim_ == M.NumRows());
-  MatrixIndexT num_cols = M.NumCols();
+  // The BLAS standard does not support vectors with stride zero, even though
+  // some implementations (such as Apple's Accelerate framework and, I believe,
+  // cuBLAS) seem to allow it.  We compile a fixed-size (64) vector of ones
+  // into the program and process M in blocks of at most 64 columns.
+  const Real *ones = Get64Ones<Real>();
 
-  // implement the function according to a dimension cutoff for computation efficiency
-  if (num_cols <= 64) {
-    for (MatrixIndexT i = 0; i < dim_; i++) {
-      double sum = 0.0;
-      const Real *src = M.RowData(i);
-      for (MatrixIndexT j = 0; j < num_cols; j++)
-        sum += src[j];
-      data_[i] = alpha * sum + beta * data_[i];
-    }
-  } else {
-    Vector<Real> ones(M.NumCols());
-    ones.Set(1.0);
-    this->AddMatVec(alpha, M, kNoTrans, ones, beta);
+  MatrixIndexT num_cols = M.NumCols();
+  for (MatrixIndexT col_offset = 0; col_offset < num_cols; col_offset += 64) {
+    MatrixIndexT this_num_cols =
+        std::min<MatrixIndexT>(64, num_cols - col_offset);
+    cblas_Xgemv(kNoTrans, M.NumRows(), this_num_cols, alpha,
+                M.Data() + col_offset, M.Stride(), ones, 1,
+                beta, data_, stride_);
+    beta = 1.0;
   }
 }
 
@@ -769,8 +758,11 @@ Real VectorBase<Real>::LogSumExp(Real prune) const {
 
   double sum_relto_max_elem = 0.0;
 
-  for (MatrixIndexT i = 0; i < dim_; i++) {
-    BaseFloat f = data_[i];
+  const Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_;
+
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    BaseFloat f = data[i * stride];
     if (f >= cutoff)
       sum_relto_max_elem += Exp(f - max_elem);
   }
@@ -779,38 +771,50 @@ Real VectorBase<Real>::LogSumExp(Real prune) const {
 
 template<typename Real>
 void VectorBase<Real>::InvertElements() {
-  for (MatrixIndexT i = 0; i < dim_; i++) {
-    data_[i] = static_cast<Real>(1 / data_[i]);
+  MatrixIndexT dim = dim_, stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    data_[i * stride] = static_cast<Real>(1) / data_[i * stride];
   }
 }
 
 template<typename Real>
 void VectorBase<Real>::ApplyLog() {
-  for (MatrixIndexT i = 0; i < dim_; i++) {
-    if (data_[i] < 0.0)
+  MatrixIndexT dim = dim_, stride = stride_;
+  Real *data = data_;
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    if (data[i * stride] < 0.0)
       KALDI_ERR << "Trying to take log of a negative number.";
-    data_[i] = Log(data_[i]);
+    data[i * stride] = Log(data[i * stride]);
   }
 }
 
 template<typename Real>
-void VectorBase<Real>::ApplyLogAndCopy(const VectorBase<Real> &v) {
+void VectorBase<Real>::ApplyLog(const VectorBase<Real> &v) {
   KALDI_ASSERT(dim_ == v.Dim());
-  for (MatrixIndexT i = 0; i < dim_; i++) {
-    data_[i] = Log(v(i));
+  MatrixIndexT dim = dim_, stride = stride_, v_stride = v.stride_;
+  Real *data = data_;
+  const Real *v_data = v.data_;
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    data[i * stride] = Log(v_data[i * v_stride]);
   }
 }
 
 template<typename Real>
 void VectorBase<Real>::ApplyExp() {
-  for (MatrixIndexT i = 0; i < dim_; i++) {
-    data_[i] = Exp(data_[i]);
+  Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    data[i * stride] = Exp(data[i * stride]);
   }
 }
 
 template<typename Real>
 void VectorBase<Real>::ApplyAbs() {
-  for (MatrixIndexT i = 0; i < dim_; i++) { data_[i] = std::abs(data_[i]); }
+  Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    data[i * stride] = std::abs(data[i * stride]);
+  }
 }
 
 template<typename Real>
@@ -823,8 +827,8 @@ void VectorBase<Real>::ApplyFloor(Real floor_val, MatrixIndexT *floored_count) {
     MatrixIndexT num_floored = 0;
     for (MatrixIndexT i = 0; i < dim_; i++) {
       if (data_[i] < floor_val) {
-	data_[i] = floor_val;
-	num_floored++;
+        data_[i] = floor_val;
+        num_floored++;
       }
     }
     *floored_count = num_floored;
@@ -833,16 +837,18 @@ void VectorBase<Real>::ApplyFloor(Real floor_val, MatrixIndexT *floored_count) {
 
 template<typename Real>
 void VectorBase<Real>::ApplyCeiling(Real ceil_val, MatrixIndexT *ceiled_count) {
+  MatrixIndexT dim = dim_, stride = stride_;
+  Real *data = data_;
   if (ceiled_count == nullptr) {
-    for (MatrixIndexT i = 0; i < dim_; i++) {
-      data_[i] = std::min(data_[i], ceil_val);
+    for (MatrixIndexT i = 0; i < dim; i++) {
+      data[i * stride] = std::min(data[i * stride], ceil_val);
     }
   } else {
     MatrixIndexT num_changed = 0;
-    for (MatrixIndexT i = 0; i < dim_; i++) {
-      if (data_[i] > ceil_val) {
-	data_[i] = ceil_val;
-	num_changed++;
+    for (MatrixIndexT i = 0; i < dim; i++) {
+      if (data[i * stride] > ceil_val) {
+        data_[i * stride] = ceil_val;
+        num_changed++;
       }
     }
     *ceiled_count = num_changed;
@@ -851,11 +857,15 @@ void VectorBase<Real>::ApplyCeiling(Real ceil_val, MatrixIndexT *ceiled_count) {
 
 template<typename Real>
 MatrixIndexT VectorBase<Real>::ApplyFloor(const VectorBase<Real> &floor_vec) {
-  KALDI_ASSERT(floor_vec.Dim() == dim_);
+  MatrixIndexT dim = dim_, stride = stride_,
+      floor_stride = floor_vec.stride_;
+  Real *data = data_;
+  const Real *floor_data = floor_vec.data_;
+  KALDI_ASSERT(floor_vec.dim_ == dim);
   MatrixIndexT num_floored = 0;
-  for (MatrixIndexT i = 0; i < dim_; i++) {
-    if (data_[i] < floor_vec(i)) {
-      data_[i] = floor_vec(i);
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    if (data[i * stride] < floor_data[i * floor_stride]) {
+      data_[i * stride] = floor_data[i * floor_stride];
       num_floored++;
     }
   }
@@ -960,7 +970,7 @@ void VectorBase<Real>::Add(Real c) {
 
 template<typename Real>
 void VectorBase<Real>::Scale(Real alpha) {
-  cblas_Xscal(dim_, alpha, data_, 1);
+  cblas_Xscal(dim_, alpha, data_, stride_);
 }
 
 template<typename Real>
@@ -1001,8 +1011,8 @@ void VectorBase<Real>::AddVecVec(Real alpha, const VectorBase<Real> &v,
   KALDI_ASSERT(v.data_ != this->data_ && r.data_ != this->data_);
   // We pretend that v is a band-diagonal matrix.
   KALDI_ASSERT(dim_ == v.dim_ && dim_ == r.dim_);
-  cblas_Xgbmv(kNoTrans, dim_, dim_, 0, 0, alpha, v.data_, 1,
-              r.data_, 1, beta, this->data_, 1);
+  cblas_Xgbmv(kNoTrans, dim_, dim_, 0, 0, alpha, v.data_, v.stride_,
+              r.data_, r.stride_, beta, this->data_, stride_);
 }
 
 
@@ -1310,7 +1320,8 @@ void VectorBase<Real>::AddDiagMat2(
     Real *data = this->data_;
     const Real *mat_data = M.Data();
     for (MatrixIndexT i = 0; i < rows; i++, mat_data += mat_stride, data++)
-      *data = beta * *data + alpha * cblas_Xdot(cols,mat_data,1,mat_data,1);
+      *data = beta * *data + alpha * cblas_Xdot(cols, mat_data, 1,
+                                                mat_data, 1);
   } else {
     KALDI_ASSERT(this->dim_ == M.NumCols());
     MatrixIndexT rows = M.NumRows(), cols = this->dim_,
diff --git a/src/matrix/kaldi-vector.h b/src/matrix/kaldi-vector.h
index 383d8ca2862..9da4c18af1d 100644
--- a/src/matrix/kaldi-vector.h
+++ b/src/matrix/kaldi-vector.h
@@ -62,8 +62,19 @@ class VectorBase {
   /// Returns the  dimension of the vector.
   inline MatrixIndexT Dim() const { return dim_; }
 
-  /// Returns the size in memory of the vector, in bytes.
-  inline MatrixIndexT SizeInBytes() const { return (dim_*sizeof(Real)); }
+  /// Returns the stride between elements of the vector; will normally be 1, and
+  /// must be nonzero.  CAUTION: we are in the process of updating this library
+  /// to support vector strides, so stride != 1 may not be supported everywhere,
+  /// and may sometimes lead to unexpected behavior or crashes.
+  inline MatrixIndexT Stride() const { return stride_; }
+
+  /// Returns the size in memory of the vector, in bytes, assuming
+  /// stride is 1 (if not, this doesn't make sense in the contexts
+  /// in which this is called).  TODO: get rid of this.
+  inline MatrixIndexT SizeInBytes() const {
+    KALDI_ASSERT(stride_ == 1);
+    return (dim_*sizeof(Real));
+  }
 
   /// Returns a pointer to the start of the vector's data.
   inline Real* Data() { return data_; }
@@ -75,14 +86,14 @@ class VectorBase {
   inline Real operator() (MatrixIndexT i) const {
     KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(dim_));
-    return *(data_ + i);
+    return *(data_ + i * stride_);
   }
 
   /// Indexing operator (non-const).
   inline Real & operator() (MatrixIndexT i) {
     KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(dim_));
-    return *(data_ + i);
+    return *(data_ + i * stride_);
   }
 
   /** @brief Returns a sub-vector of a vector (a range of elements).
@@ -126,7 +137,7 @@ class VectorBase {
   void ApplyLog();
 
   /// Apply natural log to another vector and put result in *this.
-  void ApplyLogAndCopy(const VectorBase<Real> &v);
+  void ApplyLog(const VectorBase<Real> &v);
 
   /// Apply exponential to each value in vector.
   void ApplyExp();
@@ -360,25 +371,18 @@ class VectorBase {
   ~VectorBase() {}
 
   /// Empty initializer, corresponds to vector of zero size.
-  explicit VectorBase(): data_(NULL), dim_(0) {
+  explicit VectorBase(): data_(NULL), dim_(0), stride_(1) {
     KALDI_ASSERT_IS_FLOATING_TYPE(Real);
   }
 
-// Took this out since it is not currently used, and it is possible to create
-// objects where the allocated memory is not the same size as dim_ : Arnab
-//  /// Initializer from a pointer and a size; keeps the pointer internally
-//  /// (ownership or non-ownership depends on the child class).
-//  explicit VectorBase(Real* data, MatrixIndexT dim)
-//      : data_(data), dim_(dim) {}
-
-  // Arnab : made this protected since it is unsafe too.
-  /// Load data into the vector: sz must match own size.
-  void CopyFromPtr(const Real* Data, MatrixIndexT sz);
 
   /// data memory area
   Real* data_;
   /// dimension of vector
   MatrixIndexT dim_;
+  /// stride between elements of the vector.  Would normally be 1.  Must be
+  /// > 0  (if the vector is nonempty).
+  MatrixIndexT stride_;
   KALDI_DISALLOW_COPY_AND_ASSIGN(VectorBase);
 }; // class VectorBase
 
@@ -484,18 +488,32 @@ class Vector: public VectorBase<Real> {
 template<typename Real>
 class SubVector : public VectorBase<Real> {
  public:
-  /// Constructor from a Vector or SubVector.
-  /// SubVectors are not const-safe and it's very hard to make them
-  /// so for now we just give up.  This function contains const_cast.
-  SubVector(const VectorBase<Real> &t, const MatrixIndexT origin,
-            const MatrixIndexT length) : VectorBase<Real>() {
-    // following assert equiv to origin>=0 && length>=0 &&
-    // origin+length <= rt.dim_
-    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(origin)+
-                 static_cast<UnsignedMatrixIndexT>(length) <=
-                 static_cast<UnsignedMatrixIndexT>(t.Dim()));
-    VectorBase<Real>::data_ = const_cast<Real*> (t.Data()+origin);
-    VectorBase<Real>::dim_   = length;
+  /**
+     Constructor from a Vector or SubVector.
+     SubVectors are not const-safe and it's very hard to make them
+     so for now we just give up.  This function contains const_cast.
+        @param [in] src     The vector we are taking a sub-vector of
+        @param [in] begin   The first element in 'src'
+        @param [in] num_elements  The number of elements we are taking
+        @param [in] step   The step between elements from 'src'; must be
+                           >0.
+  */
+  SubVector(const VectorBase<Real> &src,
+            const MatrixIndexT begin,
+            const MatrixIndexT num_elements,
+            const MatrixIndexT step = 1) : VectorBase<Real>() {
+    // Casting to the unsigned type U tests 0 <= x and x < dim in one go.  An
+    // empty range (num_elements == 0) is valid for any 0 <= begin <= dim.
+    typedef UnsignedMatrixIndexT U;
+    const U dim = static_cast<U>(src.Dim());
+    KALDI_ASSERT(step != 0 && num_elements >= 0 &&
+                 (num_elements == 0 ? static_cast<U>(begin) <= dim :
+                  (static_cast<U>(begin) < dim &&
+                   static_cast<U>(begin + step * (num_elements - 1)) < dim)));
+    VectorBase<Real>::data_ = const_cast<Real*> (src.Data() +
+                                                 begin * src.Stride());
+    VectorBase<Real>::dim_   = num_elements;
+    VectorBase<Real>::stride_ = step * src.Stride();
   }
 
   /// This constructor initializes the vector to point at the contents
@@ -503,6 +521,7 @@ class SubVector : public VectorBase<Real> {
   SubVector(const PackedMatrix<Real> &M) {
     VectorBase<Real>::data_ = const_cast<Real*> (M.Data());
     VectorBase<Real>::dim_   = (M.NumRows()*(M.NumRows()+1))/2;
+    VectorBase<Real>::stride_ = 1;
   }
 
   /// Copy constructor
@@ -510,21 +529,28 @@ class SubVector : public VectorBase<Real> {
     // this copy constructor needed for Range() to work in base class.
     VectorBase<Real>::data_ = other.data_;
     VectorBase<Real>::dim_ = other.dim_;
+    VectorBase<Real>::stride_ = other.stride_;
   }
 
-  /// Constructor from a pointer to memory and a length.  Keeps a pointer
-  /// to the data but does not take ownership (will never delete).
-  /// Caution: this constructor enables you to evade const constraints.
-  SubVector(const Real *data, MatrixIndexT length) : VectorBase<Real> () {
+  /// Constructor from a pointer to memory and a length, and an optional stride.
+  /// Keeps a pointer to the data but does not take ownership (will never
+  /// delete).  Caution: this constructor enables you to evade const
+  /// constraints.
+  SubVector(const Real *data, MatrixIndexT length, MatrixIndexT stride = 1):
+      VectorBase<Real> () {
     VectorBase<Real>::data_ = const_cast<Real*>(data);
     VectorBase<Real>::dim_   = length;
+    VectorBase<Real>::stride_ = stride;
   }
 
 
   /// This operation does not preserve const-ness, so be careful.
+  /// This function is somewhat deprecated, for being ambiguous;
+  /// MatrixBase::Row() is probably preferred.
   SubVector(const MatrixBase<Real> &matrix, MatrixIndexT row) {
     VectorBase<Real>::data_ = const_cast<Real*>(matrix.RowData(row));
     VectorBase<Real>::dim_   = matrix.NumCols();
+    VectorBase<Real>::stride_ = 1;
   }
 
   ~SubVector() {}  ///< Destructor (does nothing; no pointers are owned here).
diff --git a/src/matrix/matrix-lib-test.cc b/src/matrix/matrix-lib-test.cc
index 7db0d8d822c..8097ab119b5 100644
--- a/src/matrix/matrix-lib-test.cc
+++ b/src/matrix/matrix-lib-test.cc
@@ -27,7 +27,7 @@
 #include "util/stl-utils.h"
 #include <numeric>
 #include <time.h> // This is only needed for UnitTestSvdSpeed, you can
-// comment it (and that function) out if it causes problems.  
+// comment it (and that function) out if it causes problems.
 #include <matrix/cblas-wrappers.h>
 
 namespace kaldi {
@@ -2448,7 +2448,7 @@ template<typename Real> static void  UnitTestSimple() {
     Vector<Real> V2(V), V3(dimM*dimN);
     V2.ApplyExp();
     AssertEqual(V.Sum(), V2.SumLog());
-    V3.ApplyLogAndCopy(V2);
+    V3.ApplyLog(V2);
     V2.ApplyLog();
     AssertEqual(V, V2);
     AssertEqual(V3, V2);
diff --git a/src/matrix/sp-matrix.cc b/src/matrix/sp-matrix.cc
index 224ef39fb6e..40511f537ef 100644
--- a/src/matrix/sp-matrix.cc
+++ b/src/matrix/sp-matrix.cc
@@ -180,16 +180,17 @@ Real SpMatrix<Real>::Trace() const {
 // diagonal update, this <-- this + diag(v)
 template<typename Real>
 template<typename OtherReal>
-void  SpMatrix<Real>::AddDiagVec(const Real alpha, const VectorBase<OtherReal> &v) {
+void SpMatrix<Real>::AddDiagVec(const Real alpha, const VectorBase<OtherReal> &v) {
   int32 num_rows = this->num_rows_;
   KALDI_ASSERT(num_rows == v.Dim() && num_rows > 0);
   const OtherReal *src = v.Data();
   Real *dst = this->data_;
+  MatrixIndexT src_stride = v.Stride();
   if (alpha == 1.0)
-    for (int32 i = 1; i <= num_rows; i++, src++, dst += i)
+    for (int32 i = 1; i <= num_rows; i++, src += src_stride, dst += i)
       *dst += *src;
   else
-    for (int32 i = 1; i <= num_rows; i++, src++, dst += i)
+    for (int32 i = 1; i <= num_rows; i++, src += src_stride, dst += i)
       *dst += alpha * *src;
 }
 
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
new file mode 100644
index 00000000000..e94b6978a6b
--- /dev/null
+++ b/src/tensor/tensor.h
@@ -0,0 +1,187 @@
+/**
+   This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
+*/
+
+namespace kaldi {
+namespace tensor {
+
+
+enum DeviceType {  // a named type, so that 'DeviceType device_type' compiles
+  kCpuDevice = 0,   // data is on the CPU
+  kCudaDevice = 1   // data is on a CUDA GPU
+};
+
+// We may later add a device number (like which GPU we are using),
+// once we support multiple GPUs.
+struct Device {
+  DeviceType device_type;
+  // operator ==, probably, maybe constructors.
+};
+
+
+// 'Storage' contains a single allocated region (on CPU or GPU, according
+// to 'device').
+struct Storage {
+  // The allocated region: 'num_bytes' bytes starting at 'data', on 'device'.
+  void *data;
+  size_t num_bytes;
+  Device device;
+
+  // Note: will throw if allocation fails (for now).
+  Storage(Device device, size_t num_bytes);
+
+  // Destructor deallocates 'data'.  For now there is no concept of a custom
+  // allocator or an allocator object; we just use our CuDevice stuff for CUDA
+  // allocation and posix_memalign for CPU allocation (obviously we need to
+  // make sure 'data' is aligned in the most specific way we might need).  In
+  // future we might choose to add a custom allocator.
+  ~Storage();
+};
+
+
+enum DataType {  // a named type, so that 'DataType dtype_' compiles
+  kFloatDtype = 0,
+  kDoubleDtype = 1
+};
+
+#define KALDI_TENSOR_MAX_DIM 5
+
+
+
+/*
+  This struct stores the dimension and strides of a Tensor.  The following
+  describes the properties that a Tensor will always have (note: we
+  also use TensorDim inside implementation code in ways such that these
+  properties do not all hold).
+
+  These properties are stricter than some other frameworks, such as PyTorch,
+  which allow the users to manually add dimensions with stride 0 (and dim>1) so
+  that a lower-dimensional quantity can masquerade as one with a higher
+  dimension.  We require that it never be possible to access the same
+  memory location using two different tuples of indexes.  We also
+  don't allow zero dims (i.e. a tensor must not be empty); if you want an
+  empty Tensor, just use a null pointer.
+
+    0 <= num_axes <= 5
+    for 0 <= axis < num_axes:
+       dims[axis] > 0
+
+  The strides may take any value, including zero or negative, as long as the
+  uniqueness property is satisfied (i.e. must not be possible to access the
+  same memory location using two different tuples of indices).
+
+*/
+
+struct TensorDim {
+
+  int64_t num_axes;
+  int64_t dims[KALDI_TENSOR_MAX_DIM];
+  int64_t strides[KALDI_TENSOR_MAX_DIM];
+  // We may later add methods to this.
+
+  // Checks that the TensorDim is valid, assuming it is part of a Tensor.
+  // I.e. that it satisfies the properties mentioned above.
+  bool Check();
+};
+
+struct TensorDimProperties {
+  // Below are cached properties that depend on a TensorDim.
+
+  // The number of elements in the Tensor, which equals the product
+  // of dims[0] .. dims[num_axes - 1].  Must always be >0.
+  int64_t num_elements;
+
+  // is_contiguous means that the data form a contiguous block in memory; it is
+  // not the same as PyTorch's is_contiguous which is a stronger condition; our
+  // has_expected_strides is equivalent to that.
+  bool is_contiguous;
+
+  // has_expected_strides means that the strides are as if this was a "C"-style
+  // multidimensional array, meaning that (using Python wrap-around indexing
+  // conventions as if strides was an array of dimension 'num_axes'),
+  // strides[-1] == 1, strides[-2] == dims[-1], strides[-3] == dims[-1] *
+  // dims[-2], and so on.  This is the same as PyTorch's is_contiguous.
+  bool has_expected_strides;
+
+  void UpdateProperties(const TensorDim &dim);
+};
+
+
+
+class Tensor {
+ public:
+  //  ...
+
+ private:
+  // The tensor dim and strides.
+  TensorDim dim_;
+  // Cached properties that depend on dim_.
+  TensorDimProperties derived_;
+  // The data-type of this tensor.
+  DataType dtype_;
+
+  // The raw data pointer
+  void *data_;
+
+  // The storage region where the data resides.  data_ does not necessarily
+  // equal storage_->data; it may be more than that, e.g. if this is a view
+  // to part of another Tensor.
+  std::shared_ptr<Storage> storage_;
+
+
+};
+
+/*
+  This is the 'gradient information' that class Variable stores for a Tensor
+  when it is initialized with requires_grad = true (or is a result of
+  an operation on Variables one of which had requires_grad = true).
+  This does not give you access to the underlying Variables; doing it
+  like this makes reference counting easier (no loops).  The GradFunc
+  will store any pointers to the original Variable that it may have
+  needed.
+
+  Users will rarely need to interact directly with this struct.
+ */
+struct TensorGrad {
+  // The gradients corresponding to the input variables, which
+  // we may need to update.  Some subset of these may be nullptr,
+  // corresponding to input Variables for which no gradient
+  // was required.
+  std::vector<std::shared_ptr<TensorGrad> > inputs;
+
+  // is_view is
+  bool is_view{false};
+
+  // The device we
+  Device device;
+
+  // The dimension of the Tensor for which this is the gradient.  Used
+  // to set up 'grad' when needed.
+  TensorDim dim;
+
+  // 'offset' is only inspected if this is a view; it is the offset
+  // (in elements) from the
+  // 'inputs' will just contain one member, which is the gradient for the source
+  // Variable, and we use 'dim' and 'offset' to construct the sub-tensor).
+  int64_t offset;
+
+  // This stores the gradient (if we already have one), or nullptr if not.
+  std::unique_ptr<Tensor> grad{nullptr};
+
+
+};
+
+
+class Variable {
+    using GradFunc = std::function<
+      void(std::vector<Variable>& inputs, const Variable& grad_output)>;
+
+
+};
+
+// TODO: the typedef below was left unfinished; its name is not yet decided:
+//   typedef std::unique_ptr<Storage> ...;
+
+
+}  // namespace tensor
+}  // namespace kaldi

From b9efc548679b52e502b7b9b2d3a304a5fb42594d Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Fri, 15 Mar 2019 16:27:07 -0400
Subject: [PATCH 004/163] Merge with master branch

* [src] Change warp-synchronous to cub::BlockReduce (safer but slower) (#3080)

* [src] Fix && and || uses where & and | intended, and other weird errors (#3087)

* [build] Some fixes to Makefiles (#3088)

clang is unhappy with '-rdynamic' in compile-only step, and the
switch is really unnecessary.

Also, the default location for MKL 64-bit libraries is intel64/.
The em64t/ was explained already obsolete by an Intel rep in 2010:
https://software.intel.com/en-us/forums/intel-math-kernel-library/topic/285973

* [src] Fixed -Wreordered warnings in feat (#3090)

* [egs] Replace bc with perl -e (#3093)

* [scripts] Fix python3 compatibility issue in data-perturbing script (#3084)

* [doc] fix some typos in doc. (#3097)

* [build] Make sure expf() speed probe times sensibly (#3089)

* [scripts] Make sure merge_targets.py works in python3 (#3094)

* [src] ifdef to fix compilation failure on CUDA 8 and earlier (#3103)

* [doc] fix typos and broken links in doc. (#3102)

* [scripts] Fix frame_shift bug in egs/swbd/s5c/local/score_sclite_conf.sh (#3104)

* [src] Fix wrong assertion failure in nnet3-am-compute (#3106)

* [src] Cosmetic changes to natural-gradient code (#3108)

* [src,scripts] Python2 compatibility fixes and code cleanup for nnet1 (#3113)

* [doc] Small documentation fixes; update on Kaldi history (#3031)

* [src] Various mostly-cosmetic changes (copying from another branch) (#3109)
---
 .../libs/nnet3/train/dropout_schedule.py      |  20 +-
 egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py |   1 +
 egs/wsj/s5/steps/nnet/train.sh                |  12 -
 egs/wsj/s5/steps/nnet3/xconfig_to_config.py   | 106 +++++
 egs/wsj/s5/utils/nnet/gen_dct_mat.py          |  11 +-
 egs/wsj/s5/utils/nnet/gen_hamm_mat.py         |   5 +-
 egs/wsj/s5/utils/nnet/gen_splice.py           |   5 +-
 egs/wsj/s5/utils/nnet/make_cnn2d_proto.py     | 259 ------------
 src/base/io-funcs.h                           |   2 +-
 src/bin/cuda-gpu-available.cc                 |  11 +
 src/doc/dnn1.dox                              |  16 +-
 src/doc/history.dox                           |  10 +-
 src/nnet3/natural-gradient-online.cc          |  24 +-
 src/nnet3/natural-gradient-online.h           |   8 +-
 src/nnet3/nnet-chain-training.cc              |  56 +--
 src/nnet3/nnet-chain-training.h               |  14 +-
 src/nnet3/nnet-parse-test.cc                  | 189 ---------
 src/nnet3/nnet-parse.cc                       | 373 ------------------
 src/nnet3/nnet-parse.h                        | 123 ------
 src/nnet3/nnet-training.cc                    |  47 +--
 src/nnet3/nnet-training.h                     |   8 +-
 src/nnet3/nnet-utils.cc                       |  43 +-
 src/nnet3/nnet-utils.h                        |  29 ++
 src/util/text-utils-test.cc                   | 190 +++++++++
 src/util/text-utils.cc                        | 248 ++++++++++++
 src/util/text-utils.h                         |  92 +++++
 26 files changed, 802 insertions(+), 1100 deletions(-)
 create mode 100755 egs/wsj/s5/steps/nnet3/xconfig_to_config.py
 delete mode 100755 egs/wsj/s5/utils/nnet/make_cnn2d_proto.py

diff --git a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py
index 0ad93e5977d..0de9074517f 100644
--- a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py
+++ b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py
@@ -186,9 +186,22 @@ def _get_component_dropout(dropout_schedule, data_fraction):
 
 def _get_dropout_proportions(dropout_schedule, data_fraction):
     """Returns dropout proportions based on the dropout_schedule for the
-    fraction of data seen at this stage of training.
+    fraction of data seen at this stage of training.  Returns a list of
+    pairs (pattern, dropout_proportion); for instance, it might return
+    the list [('*', 0.625)] meaning a dropout proportion of 0.625 is to
+    be applied to all dropout components.
+
     Returns None if dropout_schedule is None.
 
+    dropout_schedule might be (in the sample case using the default pattern of
+    '*'): '0.1,0.5@0.5,0.1', meaning a piecewise linear function that starts at
+    0.1 when data_fraction=0.0, rises to 0.5 when data_fraction=0.5, and falls
+    again to 0.1 when data_fraction=1.0.   It can also contain space-separated
+    items of the form 'pattern=schedule', for instance:
+       '*=0.0,0.5,0.0 lstm.*=0.0,0.3@0.75,0.0'
+    The more specific patterns should go later, otherwise they will be overridden
+    by the less specific patterns' commands.
+
     Calls _get_component_dropout() for the different component name patterns
     in dropout_schedule.
 
@@ -198,6 +211,7 @@ def _get_dropout_proportions(dropout_schedule, data_fraction):
             See _self_test() for examples.
         data_fraction: The fraction of data seen until this stage of
             training.
+
     """
     if dropout_schedule is None:
         return None
@@ -213,6 +227,10 @@ def _get_dropout_proportions(dropout_schedule, data_fraction):
 def get_dropout_edit_string(dropout_schedule, data_fraction, iter_):
     """Return an nnet3-copy --edits line to modify raw_model_string to
     set dropout proportions according to dropout_proportions.
+    E.g. if _get_dropout_proportions(dropout_schedule, data_fraction)
+    returns [('*', 0.625)],  this will return the string:
+     "nnet3-copy --edits='set-dropout-proportion name=* proportion=0.625'"
+
 
     Arguments:
         dropout_schedule: Value for the --trainer.dropout-schedule option.
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
index 5ac2ed59003..b540423e3cd 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
@@ -27,6 +27,7 @@
         'relu-batchnorm-layer' : xlayers.XconfigBasicLayer,
         'relu-batchnorm-so-layer' : xlayers.XconfigBasicLayer,
         'batchnorm-so-relu-layer' : xlayers.XconfigBasicLayer,
+        'batchnorm-layer' : xlayers.XconfigBasicLayer,
         'sigmoid-layer' : xlayers.XconfigBasicLayer,
         'tanh-layer' : xlayers.XconfigBasicLayer,
         'fixed-affine-layer' : xlayers.XconfigFixedAffineLayer,
diff --git a/egs/wsj/s5/steps/nnet/train.sh b/egs/wsj/s5/steps/nnet/train.sh
index c23a15362c7..50a62837b67 100755
--- a/egs/wsj/s5/steps/nnet/train.sh
+++ b/egs/wsj/s5/steps/nnet/train.sh
@@ -433,18 +433,6 @@ else
         ${bn_dim:+ --bottleneck-dim=$bn_dim} \
         "$cnn_fea" $num_tgt $hid_layers $hid_dim >>$nnet_proto
       ;;
-    cnn2d)
-      delta_order=$([ -z $delta_opts ] && echo "0" || { echo $delta_opts | tr ' ' '\n' | grep "delta[-_]order" | sed 's:^.*=::'; })
-      echo "Debug : $delta_opts, delta_order $delta_order"
-      utils/nnet/make_cnn2d_proto.py $cnn_proto_opts \
-        --splice=$splice --delta-order=$delta_order --dir=$dir \
-        $num_fea >$nnet_proto
-      cnn_fea=$(cat $nnet_proto | grep -v '^$' | tail -n1 | awk '{ print $5; }')
-      utils/nnet/make_nnet_proto.py $proto_opts \
-        --no-smaller-input-weights \
-        ${bn_dim:+ --bottleneck-dim=$bn_dim} \
-        "$cnn_fea" $num_tgt $hid_layers $hid_dim >>$nnet_proto
-      ;;
     lstm)
       utils/nnet/make_lstm_proto.py $proto_opts \
         $num_fea $num_tgt >$nnet_proto
diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_config.py b/egs/wsj/s5/steps/nnet3/xconfig_to_config.py
new file mode 100755
index 00000000000..952745cea9f
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/xconfig_to_config.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+
+# Copyright 2016-2018    Johns Hopkins University (Dan Povey)
+#           2016    Vijayaditya Peddinti
+#           2017    Google Inc. (vpeddinti@google.com)
+# Apache 2.0.
+
+# This is like xconfig_to_configs.py but with a simpler interface; it writes
+# to a single named file.
+
+
+import argparse
+import os
+import sys
+from collections import defaultdict
+
+sys.path.insert(0, 'steps/')
+# the following is in case we weren't running this from the normal directory.
+sys.path.insert(0, os.path.realpath(os.path.dirname(sys.argv[0])) + '/')
+
+import libs.nnet3.xconfig.parser as xparser
+import libs.common as common_lib
+
+
+def get_args():
+    """Parse the command line; the full command is echoed to stderr."""
+    # we add compulsory arguments as named arguments for readability
+    parser = argparse.ArgumentParser(
+        description="Reads an xconfig file and creates config files "
+                    "for neural net creation and training",
+        epilog='Search egs/*/*/local/{nnet3,chain}/*sh for examples')
+    parser.add_argument('--xconfig-file', required=True,
+                        help='Filename of input xconfig file')
+    parser.add_argument('--existing-model',
+                        help='Filename of previously trained neural net '
+                             '(e.g. final.mdl) which is useful in case of '
+                             'using nodes from list of component-nodes in '
+                             'already trained model '
+                             'to generate new config file for new model. '
+                             'The context info is also generated using '
+                             'a model generated by adding final.config '
+                             'to the existing model. '
+                             'e.g. In Transfer learning: generate new model using '
+                             'component nodes in existing model.')
+    parser.add_argument('--config-file-out', required=True,
+                        help='Filename to write nnet config file.')
+    parser.add_argument('--nnet-edits', type=str, default=None,
+                        action=common_lib.NullstrToNoneAction,
+                        help="""This option is useful in case the network you
+                        are creating does not have an output node called
+                        'output' (e.g. for multilingual setups).  You can set
+                        this to an edit-string like: 'rename-node old-name=xxx
+                        new-name=output' if node xxx plays the role of the
+                        output node in this network.  This is only used for
+                        computing the left/right context.""")
+
+    # Echo the command line to stderr.
+    print(' '.join(sys.argv), file=sys.stderr)
+
+    return parser.parse_args()
+
+
+
+def write_config_file(config_file_out, all_layers):
+    """Write the 'final' config lines of all layers to config_file_out."""
+    # config_basename_to_lines maps the basename of a config (e.g. 'ref', 'all',
+    # 'init', 'final') to its list of lines; only 'final' is written below.
+    config_basename_to_lines = defaultdict(list)
+
+    for layer in all_layers:
+        try:
+            pairs = layer.get_full_config()
+            for config_basename, line in pairs:
+                config_basename_to_lines[config_basename].append(line)
+        except Exception as e:
+            print("{0}: error producing config lines from xconfig "
+                  "line '{1}': error was: {2}".format(sys.argv[0],
+                                                      str(layer), repr(e)),
+                  file=sys.stderr)
+            # we use raise rather than raise(e) as using a blank raise
+            # preserves the backtrace
+            raise
+
+    with open(config_file_out, 'w') as f:
+        print('# This file was created by the command:\n'
+              '# {0} '.format(sys.argv), file=f)
+        lines = config_basename_to_lines['final']
+        for line in lines:
+            print(line, file=f)
+
+
+def main():
+    args = get_args()
+    existing_layers = []
+    if args.existing_model is not None:
+        existing_layers = xparser.get_model_component_info(args.existing_model)
+    all_layers = xparser.read_xconfig_file(args.xconfig_file, existing_layers)
+    write_config_file(args.config_file_out, all_layers)
+
+
+if __name__ == '__main__':
+    main()
+
+
+# test:
+# (echo 'input dim=40 name=input'; echo 'output name=output input=Append(-1,0,1)')  >xconfig; steps/nnet3/xconfig_to_config.py --xconfig-file=xconfig --config-file-out=foo
diff --git a/egs/wsj/s5/utils/nnet/gen_dct_mat.py b/egs/wsj/s5/utils/nnet/gen_dct_mat.py
index 24139f1c9f8..77461112d0b 100755
--- a/egs/wsj/s5/utils/nnet/gen_dct_mat.py
+++ b/egs/wsj/s5/utils/nnet/gen_dct_mat.py
@@ -16,8 +16,8 @@
 # limitations under the License.
 
 # ./gen_dct_mat.py
-# script generates matrix with DCT transform, which is sparse 
-# and takes into account that data-layout is along frequency axis, 
+# script generates matrix with DCT transform, which is sparse
+# and takes into account that data-layout is along frequency axis,
 # while DCT is done along temporal axis.
 
 from __future__ import division
@@ -29,10 +29,7 @@
 from optparse import OptionParser
 
 def print_on_same_line(text):
-    if (sys.version_info > (3,0)):
-        print(text, end=' ')
-    else:
-        print text,
+    print(text, end=' ')
 
 parser = OptionParser()
 parser.add_option('--fea-dim', dest='dim', help='feature dimension')
@@ -69,7 +66,7 @@ def print_on_same_line(text):
           if(n==timeContext-1):
               print_on_same_line((dim-m-1)*'0 ')
         print()
-    print() 
+    print()
 
 print(']')
 
diff --git a/egs/wsj/s5/utils/nnet/gen_hamm_mat.py b/egs/wsj/s5/utils/nnet/gen_hamm_mat.py
index d7e9d9b7493..110178c6702 100755
--- a/egs/wsj/s5/utils/nnet/gen_hamm_mat.py
+++ b/egs/wsj/s5/utils/nnet/gen_hamm_mat.py
@@ -27,10 +27,7 @@
 from optparse import OptionParser
 
 def print_on_same_line(text):
-    if (sys.version_info > (3,0)):
-        print(text, end=' ')
-    else:
-        print text,
+    print(text, end=' ')
 
 parser = OptionParser()
 parser.add_option('--fea-dim', dest='dim', help='feature dimension')
diff --git a/egs/wsj/s5/utils/nnet/gen_splice.py b/egs/wsj/s5/utils/nnet/gen_splice.py
index 3fe76513df6..f3a2c8b39ac 100755
--- a/egs/wsj/s5/utils/nnet/gen_splice.py
+++ b/egs/wsj/s5/utils/nnet/gen_splice.py
@@ -26,10 +26,7 @@
 from optparse import OptionParser
 
 def print_on_same_line(text):
-    if (sys.version_info > (3,0)):
-        print(text, end=' ')
-    else:
-        print text,
+    print(text, end=' ')
 
 parser = OptionParser()
 parser.add_option('--fea-dim', dest='dim_in', help='feature dimension')
diff --git a/egs/wsj/s5/utils/nnet/make_cnn2d_proto.py b/egs/wsj/s5/utils/nnet/make_cnn2d_proto.py
deleted file mode 100755
index 172660da825..00000000000
--- a/egs/wsj/s5/utils/nnet/make_cnn2d_proto.py
+++ /dev/null
@@ -1,259 +0,0 @@
-#!/usr/bin/python
-
-# Copyright 2014  Brno University of Technology (author: Karel Vesely)
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-# Generated Nnet prototype, to be initialized by 'nnet-initialize'.
-
-from __future__ import division
-from __future__ import print_function
-import math, random, sys, warnings
-from optparse import OptionParser
-
-###
-### Parse options
-###
-usage="%prog [options] <feat-dim> <num-leaves> <num-hidden-layers> <num-hidden-neurons>  >nnet-proto-file"
-parser = OptionParser(usage)
-
-parser.add_option('--activation-type', dest='activation_type', 
-                   help='Select type of activation function : (<Sigmoid>|<Tanh>) [default: %default]', 
-                   default='<Sigmoid>', type='string');
-
-parser.add_option('--cnn1-num-filters', dest='cnn1_num_filters',
-		   help='Number of filters in first convolutional layer [default: %default]',
-		   default=128, type='int')
-# this is given by splice
-# parser.add_option('--cnn1-fmap-x-len', dest='cnn1_fmap_x_len',
-# 	  	   help='Size of cnn1-fmap-x-len [default: %default]',
-# 		   default=11, type='int')
-
-# this should be equal to feat_raw_dim
-# parser.add_option('--cnn1-fmap-y-len', dest='cnn1_fmap_y_len',
-# 	  	   help='Size of cnn1-fmap-y-len [default: %default]',
-# 		   default=32, type='int')
-
-parser.add_option('--cnn1-filt-x-len', dest='cnn1_filt_x_len',
-	  	   help='Size of cnn1-filt-x-len [default: %default]',
-		   default=9, type='int')
-parser.add_option('--cnn1-filt-y-len', dest='cnn1_filt_y_len',
-	  	   help='Size of cnn1-filt-y-len [default: %default]',
-		   default=9, type='int')
-
-parser.add_option('--cnn1-filt-x-step', dest='cnn1_filt_x_step',
-	  	   help='Size of cnn1-filt-x-step [default: %default]',
-		   default=1, type='int')
-parser.add_option('--cnn1-filt-y-step', dest='cnn1_filt_y_step',
-	  	   help='Size of cnn1-filt-y-step [default: %default]',
-		   default=1, type='int')
-parser.add_option('--cnn1-connect-fmap', dest='cnn1_connect_fmap',
-	  	   help='Size of cnn1-connect-fmap [default: %default]',
-		   default=0, type='int')
-
-parser.add_option('--pool1-x-len', dest='pool1_x_len',
-	  	   help='Size of pool1-filt-x-len [default: %default]',
-		   default=1, type='int')
-parser.add_option('--pool1-x-step', dest='pool1_x_step',
-	  	   help='Size of pool1-x-step [default: %default]',
-		   default=1, type='int')
-
-
-# 
-parser.add_option('--pool1-y-len', dest='pool1_y_len',
-	  	   help='Size of pool1-y-len [default: %default]',
-		   default=3, type='int')
-parser.add_option('--pool1-y-step', dest='pool1_y_step',
-	  	   help='Size of pool1-y-step [default: %default]',
-		   default=3, type='int')
-
-parser.add_option('--pool1-type', dest='pool1_type',
-		  help='Type of pooling (Max || Average) [default: %default]',
-		  default='Max', type='string')
-
-parser.add_option('--cnn2-num-filters', dest='cnn2_num_filters',
-		   help='Number of filters in first convolutional layer [default: %default]',
-		   default=256, type='int')
-parser.add_option('--cnn2-filt-x-len', dest='cnn2_filt_x_len',
-	  	   help='Size of cnn2-filt-x-len [default: %default]',
-		   default=3, type='int')
-parser.add_option('--cnn2-filt-y-len', dest='cnn2_filt_y_len',
-	  	   help='Size of cnn2-filt-y-len [default: %default]',
-		   default=4, type='int')
-parser.add_option('--cnn2-filt-x-step', dest='cnn2_filt_x_step',
-	  	   help='Size of cnn2-filt-x-step [default: %default]',
-		   default=1, type='int')
-parser.add_option('--cnn2-filt-y-step', dest='cnn2_filt_y_step',
-	  	   help='Size of cnn2-filt-y-step [default: %default]',
-		   default=1, type='int')
-parser.add_option('--cnn2-connect-fmap', dest='cnn2_connect_fmap',
-	  	   help='Size of cnn2-connect-fmap [default: %default]',
-		   default=1, type='int')
-
-parser.add_option('--pitch-dim', dest='pitch_dim',
-		  help='Number of features representing pitch [default: %default]',
-		  default=0, type='int')
-parser.add_option('--delta-order', dest='delta_order',
-		  help='Order of delta features [default: %default]',
-		  default=2, type='int')
-parser.add_option('--splice', dest='splice',
-		  help='Length of splice [default: %default]',
-		  default=5,type='int')
-parser.add_option('--dir', dest='dirct',
-		  help='Directory, where network prototypes will be saved [default: %default]',
-		  default='.', type='string')
-parser.add_option('--num-pitch-neurons', dest='num_pitch_neurons',
-		  help='Number of neurons in layers processing pitch features [default: %default]',
-		  default='200', type='int')
-
-
-(o,args) = parser.parse_args()
-if len(args) != 1 : 
-  parser.print_help()
-  sys.exit(1)
-  
-feat_dim=int(args[0])
-### End parse options 
-
-feat_raw_dim = feat_dim / (o.delta_order+1) / (o.splice*2+1) - o.pitch_dim # we need number of feats without deltas and splice and pitch
-o.cnn1_fmap_y_len = feat_raw_dim
-o.cnn1_fmap_x_len = o.splice*2+1
-
-# Check
-assert(feat_dim > 0)
-assert(o.pool1_type == 'Max' or o.pool1_type == 'Average')
-
-## Extra checks if dimensions are matching, if not match them by 
-## producing a warning
-# cnn1
-assert( (o.cnn1_fmap_y_len - o.cnn1_filt_y_len) % o.cnn1_filt_y_step == 0 )
-assert( (o.cnn1_fmap_x_len - o.cnn1_filt_x_len) % o.cnn1_filt_x_step == 0 )
-
-# subsample1
-cnn1_out_fmap_y_len=(1 + (o.cnn1_fmap_y_len - o.cnn1_filt_y_len) / o.cnn1_filt_y_step)
-cnn1_out_fmap_x_len=(1 + (o.cnn1_fmap_x_len - o.cnn1_filt_x_len) / o.cnn1_filt_x_step)
-
-# fix filt_len and filt_step
-def fix_filt_step(inp_len, filt_len, filt_step):
-  
-  if ((inp_len - filt_len) % filt_step == 0):
-    return filt_step
-  else:
-    # filt_step <= filt_len
-    for filt_step in range(filt_len, 0, -1):
-      if ((inp_len - filt_len) % filt_step == 0):
-        return filt_step
-    
-o.pool1_y_step = fix_filt_step(cnn1_out_fmap_y_len, o.pool1_y_len, o.pool1_y_step)
-if o.pool1_y_step == 1 and o.pool1_y_len != 1:
-  warnings.warn('WARNING: Choose different pool1_y_len as subsampling is not happening');
-  
-o.pool1_x_step = fix_filt_step(cnn1_out_fmap_x_len, o.pool1_x_len, o.pool1_x_step)
-if o.pool1_x_step == 1 and o.pool1_x_len != 1:
-  warnings.warn('WARNING: Choose different pool1_x_len as subsampling is not happening');
-
-
-###
-### Print prototype of the network
-###
-
-# Begin the prototype
-print("<NnetProto>")
-
-# Convolutional part of network
-'''1st CNN layer'''
-cnn1_input_dim=feat_raw_dim * (o.delta_order+1) * (o.splice*2+1)
-cnn1_out_fmap_x_len=(1 + (o.cnn1_fmap_x_len - o.cnn1_filt_x_len) / o.cnn1_filt_x_step)
-cnn1_out_fmap_y_len=(1 + (o.cnn1_fmap_y_len - o.cnn1_filt_y_len) / o.cnn1_filt_y_step)
-cnn1_output_dim=o.cnn1_num_filters * cnn1_out_fmap_x_len * cnn1_out_fmap_y_len
-
-'''1st Pooling layer'''
-pool1_input_dim=cnn1_output_dim
-pool1_fmap_x_len=cnn1_out_fmap_x_len
-pool1_out_fmap_x_len=(1 + (pool1_fmap_x_len - o.pool1_x_len) / o.pool1_x_step)
-pool1_fmap_y_len=cnn1_out_fmap_y_len
-pool1_out_fmap_y_len=(1 + (pool1_fmap_y_len - o.pool1_y_len) / o.pool1_y_step)
-pool1_output_dim=o.cnn1_num_filters*pool1_out_fmap_x_len*pool1_out_fmap_y_len
-
-'''2nd CNN layer'''
-cnn2_input_dim=pool1_output_dim
-cnn2_fmap_x_len=pool1_out_fmap_x_len
-cnn2_out_fmap_x_len=(1 + (cnn2_fmap_x_len - o.cnn2_filt_x_len) / o.cnn2_filt_x_step)
-cnn2_fmap_y_len=pool1_out_fmap_y_len
-cnn2_out_fmap_y_len=(1 + (cnn2_fmap_y_len - o.cnn2_filt_y_len) / o.cnn2_filt_y_step)
-cnn2_output_dim=o.cnn2_num_filters * cnn2_out_fmap_x_len * cnn2_out_fmap_y_len
-
-
-convolution_proto = ''
-
-convolution_proto += "<Convolutional2DComponent> <InputDim> %d <OutputDim> %d <FmapXLen> %d <FmapYLen> %d <FiltXLen> %d <FiltYLen> %d <FiltXStep> %d <FiltYStep> %d <ConnectFmap> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f\n" % \
-    ( cnn1_input_dim, cnn1_output_dim, o.cnn1_fmap_x_len, o.cnn1_fmap_y_len, o.cnn1_filt_x_len, o.cnn1_filt_y_len, o.cnn1_filt_x_step, o.cnn1_filt_y_step, o.cnn1_connect_fmap, 0.0, 0.0, 0.01 )
-convolution_proto += "<%sPooling2DComponent> <InputDim> %d <OutputDim> %d <FmapXLen> %d <FmapYLen> %d <PoolXLen> %d <PoolYLen> %d <PoolXStep> %d <PoolYStep> %d\n" % \
-    ( o.pool1_type, pool1_input_dim, pool1_output_dim, pool1_fmap_x_len, pool1_fmap_y_len, o.pool1_x_len, o.pool1_y_len, o.pool1_x_step, o.pool1_y_step )
-convolution_proto += "<Rescale> <InputDim> %d <OutputDim> %d <InitParam> %f\n" % \
-    ( pool1_output_dim, pool1_output_dim, 1.0 )
-convolution_proto += "<AddShift> <InputDim> %d <OutputDim> %d <InitParam> %f\n" % \
-    ( pool1_output_dim, pool1_output_dim, 0.0 )
-convolution_proto += "%s <InputDim> %d <OutputDim> %d\n" % \
-    ( o.activation_type, pool1_output_dim, pool1_output_dim )
-convolution_proto += "<Convolutional2DComponent> <InputDim> %d <OutputDim> %d <FmapXLen> %d <FmapYLen> %d <FiltXLen> %d <FiltYLen> %d <FiltXStep> %d <FiltYStep> %d <ConnectFmap> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f\n" % \
-    ( cnn2_input_dim, cnn2_output_dim, cnn2_fmap_x_len, cnn2_fmap_y_len, o.cnn2_filt_x_len, o.cnn2_filt_y_len, o.cnn2_filt_x_step, o.cnn2_filt_y_step, o.cnn2_connect_fmap, -2.0, 4.0, 0.1 )
-convolution_proto += "<Rescale> <InputDim> %d <OutputDim> %d <InitParam> %f\n" % \
-    ( cnn2_output_dim, cnn2_output_dim, 1.0)
-convolution_proto += "<AddShift> <InputDim> %d <OutputDim> %d <InitParam> %f\n" % \
-    ( cnn2_output_dim, cnn2_output_dim, 0.0)
-convolution_proto += "%s <InputDim> %d <OutputDim> %d\n" % \
-    ( o.activation_type, cnn2_output_dim, cnn2_output_dim)
-
-if (o.pitch_dim > 0):
-  # convolutional part
-  f_conv = open('%s/nnet.proto.convolution' % o.dirct, 'w')
-  f_conv.write('<NnetProto>\n')
-  f_conv.write(convolution_proto)
-  f_conv.write('</NnetProto>\n')
-  f_conv.close()
-  
-  # pitch part
-  f_pitch = open('%s/nnet.proto.pitch' % o.dirct, 'w')
-  f_pitch.write('<NnetProto>\n')
-  f_pitch.write('<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f\n' % \
-		((o.pitch_dim * (o.delta_order+1) * (o.splice*2+1)), o.num_pitch_neurons, -2.0, 4.0, 0.109375))
-  f_pitch.write('%s <InputDim> %d <OutputDim> %d\n' % \
-		(o.activation_type, o.num_pitch_neurons, o.num_pitch_neurons))
-  f_pitch.write('<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f\n' % \
-		(o.num_pitch_neurons, o.num_pitch_neurons, -2.0, 4.0, 0.109375))
-  f_pitch.write('%s <InputDim> %d <OutputDim> %d\n' % \
-		(o.activation_type, o.num_pitch_neurons, o.num_pitch_neurons))
-  f_pitch.write('</NnetProto>\n')
-  f_pitch.close()
-
-  # paralell part
-  vector = ''
-  for i in range(1, (feat_raw_dim + o.pitch_dim) * (o.delta_order+1) * (o.splice*2+1), feat_raw_dim + o.pitch_dim):
-    vector += '%d:1:%d ' % (i, i + feat_raw_dim - 1)
-  for i in range(feat_raw_dim+1, (feat_raw_dim + o.pitch_dim) * (o.delta_order+1) * (o.splice*2+1), feat_raw_dim + o.pitch_dim):
-    vector += '%d:1:%d ' % (i, i + o.pitch_dim - 1)
-  print('<Copy> <InputDim> %d <OutputDim> %d <BuildVector>  %s </BuildVector> ' % \
-	((feat_raw_dim + o.pitch_dim) * (o.delta_order+1) * (o.splice*2+1), (feat_raw_dim + o.pitch_dim) * (o.delta_order+1) * (o.splice*2+1), vector))
-  print('<ParallelComponent> <InputDim> %d <OutputDim> %d <NestedNnetProto> %s %s </NestedNnetProto>' % \
-	((feat_raw_dim + o.pitch_dim) * (o.delta_order+1) * (o.splice*2+1), o.num_pitch_neurons + cnn2_output_dim, '%s/nnet.proto.convolution' % o.dirct, '%s/nnet.proto.pitch' % o.dirct))
-
-  num_convolution_output = o.num_pitch_neurons + cnn2_output_dim
-else: # no pitch
-  print(convolution_proto)
-
-# We are done!
-sys.exit(0)
-
-
diff --git a/src/base/io-funcs.h b/src/base/io-funcs.h
index b3015905785..6396967f56b 100644
--- a/src/base/io-funcs.h
+++ b/src/base/io-funcs.h
@@ -108,7 +108,7 @@ namespace kaldi {
   it doesn't throw.  It's useful if a class can have various forms based on
   typedefs and virtual classes, and wants to know which version to read.
 
-  ReadToken allow the caller to obtain the next token.  PeekToken works just
+  ReadToken allows the caller to obtain the next token.  PeekToken works just
   like ReadToken, but seeks back to the beginning of the token.  A subsequent
   call to ReadToken will read the same token again.  This is useful when
   different object types are written to the same file; using PeekToken one can
diff --git a/src/bin/cuda-gpu-available.cc b/src/bin/cuda-gpu-available.cc
index 41d0227ce08..69637d3601a 100644
--- a/src/bin/cuda-gpu-available.cc
+++ b/src/bin/cuda-gpu-available.cc
@@ -40,6 +40,17 @@ void TestGpuComputation() {
 #endif
 
 int main(int argc, char *argv[]) try {
+
+  /* only for Doxygen documentation, never shown in command line */
+  const char *usage =
+        "Test if there is a GPU available, and if the GPU setup is correct.\n"
+        "A GPU is acquired and a small computation is done\n"
+        "(generating a random matrix and computing softmax for its rows).\n"
+        "\n"
+        "exit-code: 0 = success, 1 = compiled without GPU support, -1 = error\n"
+        "\n"
+        "Usage:  cuda-gpu-available\n";
+
   char hostname[100] = "UNKNOWN-HOSTNAME";
 #if !defined(_MSC_VER) && !defined(__CYGWIN__)
   if (gethostname(hostname, 100)) {
diff --git a/src/doc/dnn1.dox b/src/doc/dnn1.dox
index 223b7665274..e8dcfd90d3f 100644
--- a/src/doc/dnn1.dox
+++ b/src/doc/dnn1.dox
@@ -35,13 +35,13 @@ show some \ref dnn1_advanced_features, and do a light introduction to the \ref d
 <hr><!-- #################################################################################################################### -->
 
 \section dnn1_toplevel_scripts Top-level script
-Let's have a look at the script <b><a href="http://sourceforge.net/p/kaldi/code/HEAD/tree/trunk/egs/wsj/s5/local/nnet/run_dnn.sh">egs/wsj/s5/local/nnet/run_dnn.sh</a></b>.
+Let's have a look at the script <b><a href="https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/local/nnet/run_dnn.sh">egs/wsj/s5/local/nnet/run_dnn.sh</a></b>.
 This script assumes to use a single CUDA GPU, and that kaldi was compiled with CUDA (check for 'CUDA = true' in src/kaldi.mk).
 Also we assume that 'cuda_cmd' is set properly in egs/wsj/s5/cmd.sh either to a GPU cluster node using 'queue.pl' or to a local machine using 'run.pl'.
 And finally the script assumes we already have a SAT GMM system exp/tri4b and corresponding fMLLR transforms, as generated by egs/wsj/s5/run.sh.
 Note that for other databases the run_dnn.sh is typically in the same location s5/local/nnet/run_dnn.sh.
 
-The script <a href="http://sourceforge.net/p/kaldi/code/HEAD/tree/trunk/egs/wsj/s5/local/nnet/run_dnn.sh">egs/wsj/s5/local/nnet/run_dnn.sh</a> is split into several stages:
+The script <a href="https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/local/nnet/run_dnn.sh">egs/wsj/s5/local/nnet/run_dnn.sh</a> is split into several stages:
 
 0. <b>storing 40-dimensional fMLLR features to disk, steps/nnet/make_fmllr_feats.sh,</b>
 this simplifies the training scripts, the 40-dimensional features are MFCC-LDA-MLLT-fMLLR with CMN
@@ -100,7 +100,7 @@ Besides the DNN recipe, there are also other example scripts which can be handy:
 <hr><!-- #################################################################################################################### -->
 
 \section dnn1_training_script_internals Training script internals
-The main neural network training script <a href="http://sourceforge.net/p/kaldi/code/HEAD/tree/trunk/egs/wsj/s5/steps/nnet/train.sh">steps/nnet/train.sh</a> is invoked as:
+The main neural network training script <a href="https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/nnet/train.sh">steps/nnet/train.sh</a> is invoked as:
 
 \verbatim
 steps/nnet/train.sh <data-train> <data-dev> <lang-dir> <ali-train> <ali-dev> <exp-dir>
@@ -111,11 +111,11 @@ The <lang-dir> is used only in the special case when using LDA feature-transform
 The output (i.e. the trained networks and logfiles) goes into <exp-dir>.
 
 Internally the script prepares the feature+target pipelines, generates a neural-network prototype and initialization, creates feature_transform and calls the scheduler script 
-<a href="http://sourceforge.net/p/kaldi/code/HEAD/tree/trunk/egs/wsj/s5/steps/nnet/train_scheduler.sh">steps/nnet/train_scheduler.sh</a>,
+<a href="https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/nnet/train_scheduler.sh">steps/nnet/train_scheduler.sh</a>,
 which runs the training epochs and controls the learning rate.
 
 
-<b>While looking inside <a href="http://sourceforge.net/p/kaldi/code/HEAD/tree/trunk/egs/wsj/s5/steps/nnet/train.sh">steps/nnet/train.sh</a> we see:</b>
+<b>While looking inside <a href="https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/nnet/train.sh">steps/nnet/train.sh</a> we see:</b>
 
 1. CUDA is required, the scripts exit if no GPU was detected or was CUDA not compiled in (one can still use '--skip-cuda-check true' to run on CPU, but it is 10-20x slower)
 
@@ -165,12 +165,12 @@ $ cat exp/dnn5b_pretrain-dbn_dnn/nnet.proto
 
 7. the network is initialized by : \ref nnet-initialize.cc , the DBN gets prepended in the next step using \ref nnet-concat.cc
 
-8. finally the training gets called by running scheduler script <a href="http://sourceforge.net/p/kaldi/code/HEAD/tree/trunk/egs/wsj/s5/steps/nnet/train_scheduler.sh">steps/nnet/train_scheduler.sh</a>
+8. finally the training gets called by running scheduler script <a href="https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/nnet/train_scheduler.sh">steps/nnet/train_scheduler.sh</a>
 
 Note : both neural networks and feature transforms can be viewed by \ref nnet-info.cc, or shown in ascii by \ref nnet-copy.cc
 
 
-<b>While looking inside <a href="http://sourceforge.net/p/kaldi/code/HEAD/tree/trunk/egs/wsj/s5/steps/nnet/train_scheduler.sh">steps/nnet/train_scheduler.sh</a> we see:</b>
+<b>While looking inside <a href="https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/nnet/train_scheduler.sh">steps/nnet/train_scheduler.sh</a> we see:</b>
 
 the initial cross-validation run and the main for-loop over $iter which runs the epochs and controls the learning rate. Typically, the train_scheduler.sh is called from train.sh.
 - the default learning-rate scheduling is based on the relative improvement of the objective function: 
@@ -310,7 +310,7 @@ AddMat	174.307s
 AddMatMat	1922.11s
 \endverbatim
 
-<b> Running <a href="http://sourceforge.net/p/kaldi/code/HEAD/tree/trunk/egs/wsj/s5/steps/nnet/train_scheduler.sh">steps/nnet/train_scheduler.sh</a> directly:</b>
+<b> Running <a href="https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/nnet/train_scheduler.sh">steps/nnet/train_scheduler.sh</a> directly:</b>
 - The script train_scheduler.sh can be called outside train.sh, it allows to override the default NN-input and NN-target streams, which can be handy.
 - However the script assumes everything is set-up correctly, and there are almost no sanity checks, which makes it suitable for more advanced users only.
 - It is highly recommended to have a look at how train_scheduler.sh is usually called before trying to call it directly.
diff --git a/src/doc/history.dox b/src/doc/history.dox
index 40d46c7e32f..0813f2331cc 100644
--- a/src/doc/history.dox
+++ b/src/doc/history.dox
@@ -54,7 +54,8 @@
  Sandeep Boda, Sandeep Reddy and Haihua Xu (who helped with coding, code cleanup
  and documentation); we were visited by Michael Riley (who helped us to understand
  OpenFst and gave some lectures on FSTs), and would like to acknowledge the help of
- Honza Cernocky (for allowing us to have the workshop and helping to organize it),
+ Honza Cernocky (for negotiating the venue and some support for the workshop from
+ the Faculty of Information Technology of BUT and helping to organize it),
  Renata Kohlova (administration), and Tomas Kasparek (system administration).
  It is possible that this list of contributors contains
  oversights; any important omissions are unlikely to be intentional.
@@ -62,13 +63,16 @@
  A lot of code was written during the summer of 2010 but we still did not have a
  complete working system.  Some of the participants of the 2010 workshop
  continued working to complete the toolkit and get a working set of training scripts.
- The code was released on May 14th, 2011.
+ The code was released on May 14th, 2011, and presented to the public at ICASSP 2011
+ in Prague,
+ <a href="https://www.superlectures.com/icassp2011/category.php?lang=en&id=131">
+ see the recordings</a>.
 
  Since the initial release, Kaldi has been maintained and developed to a large
  extent by Daniel Povey, working at Microsoft Research until early 2012 and
  since then at Johns Hopkins University; but also with major contributions by
  others: notably Karel Vesely, who developed the neural-net training framework,
- and Arnab Ghoshal, who co-ordinated the acoustic modeling work early on; but
+ and Arnab Ghoshal, who coordinated the acoustic modeling work early on; but
  also other major contributors whom we do not name here because it is too hard
  to determine where to cut off the list; and a long tail of minor contributors;
  the total number of people who have contributed code or scripts or patches is
diff --git a/src/nnet3/natural-gradient-online.cc b/src/nnet3/natural-gradient-online.cc
index 0677e1ca474..a205490ee3f 100644
--- a/src/nnet3/natural-gradient-online.cc
+++ b/src/nnet3/natural-gradient-online.cc
@@ -119,14 +119,14 @@ void OnlineNaturalGradient::InitDefault(int32 D) {
   t_ = 0;
 }
 
-void OnlineNaturalGradient::Init(const CuMatrixBase<BaseFloat> &R0) {
-  int32 D = R0.NumCols();
+void OnlineNaturalGradient::Init(const CuMatrixBase<BaseFloat> &X0) {
+  int32 D = X0.NumCols();
   // for locking reasons it's better to use a different object.
   OnlineNaturalGradient this_copy(*this);
   this_copy.InitDefault(D);
   this_copy.t_ = 1;  // Prevent recursion to Init() again.
 
-  CuMatrix<BaseFloat> R0_copy(R0.NumRows(), R0.NumCols(), kUndefined);
+  CuMatrix<BaseFloat> X0_copy(X0.NumRows(), X0.NumCols(), kUndefined);
   // 'num_iters' is number of iterations with the same data from a pseudorandom
   // start.  this is a faster way of starting than doing eigenvalue
   // decomposition.
@@ -134,11 +134,11 @@ void OnlineNaturalGradient::Init(const CuMatrixBase<BaseFloat> &R0) {
   // Note: we only do three iterations of initialization if we have enough data
   // that it's reasonably possible to estimate the subspace of dimension
   // this_copy.rank_.  If we don't have more than that many rows in our initial
-  // minibatch R0, we just do one iteration... this gives us almost exactly
-  // (barring small effects due to epsilon_ > 0) the row subspace of R0 after
+  // minibatch X0, we just do one iteration... this gives us almost exactly
+  // (barring small effects due to epsilon_ > 0) the row subspace of X0 after
   // one iteration anyway.
   int32 num_init_iters;
-  if (R0.NumRows() <= this_copy.rank_)
+  if (X0.NumRows() <= this_copy.rank_)
     num_init_iters = 1;
   else
     num_init_iters = 3;
@@ -147,8 +147,8 @@ void OnlineNaturalGradient::Init(const CuMatrixBase<BaseFloat> &R0) {
                                // initialize.
   for (int32 i = 0; i < num_init_iters; i++) {
     BaseFloat scale;
-    R0_copy.CopyFromMat(R0);
-    this_copy.PreconditionDirections(&R0_copy, &scale);
+    X0_copy.CopyFromMat(X0);
+    this_copy.PreconditionDirections(&X0_copy, &scale);
   }
   rank_ = this_copy.rank_;
   W_t_.Swap(&this_copy.W_t_);
@@ -197,7 +197,7 @@ void OnlineNaturalGradient::PreconditionDirections(
   t_ += 1;
 }
 
-void OnlineNaturalGradient::ReorthogonalizeXt1(
+void OnlineNaturalGradient::ReorthogonalizeRt1(
     const VectorBase<BaseFloat> &d_t1,
     BaseFloat rho_t1,
     CuMatrixBase<BaseFloat> *W_t1,
@@ -214,7 +214,7 @@ void OnlineNaturalGradient::ReorthogonalizeXt1(
   ComputeEt(d_t1, beta_t1, &e_t1, &sqrt_e_t1, &inv_sqrt_e_t1);
 
   temp_O->SymAddMat2(1.0, *W_t1, kNoTrans, 0.0);
-  // O_t =  E_t^{-0.5} W_t W_t^T E_t^{-0.5}
+  // O_{t+1} =  E_{t+1}^{-0.5} W_{t+1} W_{t+1}^T E_{t+1}^{-0.5}
   Matrix<BaseFloat> O_mat(*temp_O);
   SpMatrix<BaseFloat> O(O_mat, kTakeLower);
   for (int32 i = 0; i < R; i++) {
@@ -439,7 +439,7 @@ void OnlineNaturalGradient::PreconditionDirectionsInternal(
     if (self_debug_) {
       KALDI_WARN << "Reorthogonalizing.";
     }
-    ReorthogonalizeXt1(d_t1,
+    ReorthogonalizeRt1(d_t1,
                        rho_t1,
                        &W_t1,
                        &J_t,
@@ -510,7 +510,7 @@ void OnlineNaturalGradient::ComputeWt1(int32 N,
   // B_t = J_t + (1-\eta)/(\eta/N) (D_t + \rho_t I) W_t
   J_t->AddDiagVecMat(1.0, w_t_coeff_gpu, W_t, kNoTrans, 1.0);
 
-  // A_t = (\eta/N) E_{t+1}^{0.5} C_t^{-0.5} U_t^T E_t^{-0.5} B_t
+  // A_t = (\eta/N) E_{t+1}^{0.5} C_t^{-0.5} U_t^T E_t^{-0.5}
   Matrix<BaseFloat> A_t(U_t, kTrans);
   for (int32 i = 0; i < R; i++) {
     BaseFloat i_factor = (eta / N) * sqrt_e_t1(i) * inv_sqrt_c_t(i);
diff --git a/src/nnet3/natural-gradient-online.h b/src/nnet3/natural-gradient-online.h
index a68ad9bbb53..77be28a19d4 100644
--- a/src/nnet3/natural-gradient-online.h
+++ b/src/nnet3/natural-gradient-online.h
@@ -375,8 +375,8 @@ namespace nnet3 {
    * Initialization *
 
    Now, a note on what we do on time t = 0, i.e. for the first minibatch.  We
-   initialize X_0 to the top R eigenvectors of 1/N X_0 X_0^T, where N is the
-   minibatch size (num-rows of R0).  If L is the corresponding RxR diagonal
+   initialize R_0 to the top R eigenvectors of 1/N X_0 X_0^T, where N is the
+   minibatch size (num-rows of X0).  If L is the corresponding RxR diagonal
    matrix of eigenvalues, then we will set D_0 = L - \rho_0 I.  We set \rho_0
    to ensure that
                       tr(F_0) = 1/N tr(X_0 X_0^T),
@@ -457,7 +457,7 @@ class OnlineNaturalGradient {
             not.
 
   */
-  void PreconditionDirections(CuMatrixBase<BaseFloat> *R,
+  void PreconditionDirections(CuMatrixBase<BaseFloat> *X,
                               BaseFloat *scale);
 
 
@@ -515,7 +515,7 @@ class OnlineNaturalGradient {
   // This function is called if C_t has high condition number; it makes sure
   // that R_{t+1} is orthogonal.  See the section in the extended comment above
   // on "keeping R_t orthogonal".
-  void ReorthogonalizeXt1(const VectorBase<BaseFloat> &d_t1,
+  void ReorthogonalizeRt1(const VectorBase<BaseFloat> &d_t1,
                           BaseFloat rho_t1,
                           CuMatrixBase<BaseFloat> *W_t1,
                           CuMatrixBase<BaseFloat> *temp_W,
diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc
index cccb1110d3c..d9562887817 100644
--- a/src/nnet3/nnet-chain-training.cc
+++ b/src/nnet3/nnet-chain-training.cc
@@ -33,6 +33,7 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts,
     compiler_(*nnet, opts_.nnet_config.optimize_config,
               opts_.nnet_config.compiler_config),
     num_minibatches_processed_(0),
+    max_change_stats_(*nnet),
     srand_seed_(RandInt(0, 100000)) {
   if (opts.nnet_config.zero_component_stats)
     ZeroComponentStats(nnet);
@@ -41,9 +42,6 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts,
                opts.nnet_config.backstitch_training_interval > 0);
   delta_nnet_ = nnet_->Copy();
   ScaleNnet(0.0, delta_nnet_);
-  const int32 num_updatable = NumUpdatableComponents(*delta_nnet_);
-  num_max_change_per_component_applied_.resize(num_updatable, 0);
-  num_max_change_global_applied_ = 0;
 
   if (opts.nnet_config.read_cache != "") {
     bool binary;
@@ -111,17 +109,19 @@ void NnetChainTrainer::TrainInternal(const NnetChainExample &eg,
   this->ProcessOutputs(false, eg, &computer);
   computer.Run();
 
-  // If relevant, add in the part of the gradient that comes from L2
-  // regularization.
+  // If relevant, add in the part of the gradient that comes from
+  // parameter-level L2 regularization.
   ApplyL2Regularization(*nnet_,
                         GetNumNvalues(eg.inputs, false) *
                         nnet_config.l2_regularize_factor,
                         delta_nnet_);
 
   // Updates the parameters of nnet
-  bool success = UpdateNnetWithMaxChange(*delta_nnet_,
-      nnet_config.max_param_change, 1.0, 1.0 - nnet_config.momentum, nnet_,
-      &num_max_change_per_component_applied_, &num_max_change_global_applied_);
+  bool success = UpdateNnetWithMaxChange(
+      *delta_nnet_,
+      nnet_config.max_param_change,
+      1.0, 1.0 - nnet_config.momentum, nnet_,
+      &max_change_stats_);
 
   // Scale down the batchnorm stats (keeps them fresh... this affects what
   // happens when we use the model with batchnorm test-mode set).
@@ -176,9 +176,10 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg,
   }
 
   // Updates the parameters of nnet
-  UpdateNnetWithMaxChange(*delta_nnet_,
-      nnet_config.max_param_change, max_change_scale, scale_adding, nnet_,
-      &num_max_change_per_component_applied_, &num_max_change_global_applied_);
+  UpdateNnetWithMaxChange(
+      *delta_nnet_, nnet_config.max_param_change,
+      max_change_scale, scale_adding, nnet_,
+      &max_change_stats_);
 
   if (is_backstitch_step1) {
     // The following will only do something if we have a LinearComponent or
@@ -276,41 +277,10 @@ bool NnetChainTrainer::PrintTotalStats() const {
     const ObjectiveFunctionInfo &info = iter->second;
     ans = info.PrintTotalStats(name) || ans;
   }
-  PrintMaxChangeStats();
+  max_change_stats_.Print(*nnet_);
   return ans;
 }
 
-void NnetChainTrainer::PrintMaxChangeStats() const {
-  KALDI_ASSERT(delta_nnet_ != NULL);
-  const NnetTrainerOptions &nnet_config = opts_.nnet_config;
-  int32 i = 0;
-  for (int32 c = 0; c < delta_nnet_->NumComponents(); c++) {
-    Component *comp = delta_nnet_->GetComponent(c);
-    if (comp->Properties() & kUpdatableComponent) {
-      UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(comp);
-      if (uc == NULL)
-        KALDI_ERR << "Updatable component does not inherit from class "
-                  << "UpdatableComponent; change this code.";
-      if (num_max_change_per_component_applied_[i] > 0)
-        KALDI_LOG << "For " << delta_nnet_->GetComponentName(c)
-                  << ", per-component max-change was enforced "
-                  << (100.0 * num_max_change_per_component_applied_[i]) /
-                     (num_minibatches_processed_ *
-                     (nnet_config.backstitch_training_scale == 0.0 ? 1.0 :
-                     1.0 + 1.0 / nnet_config.backstitch_training_interval))
-                  << " % of the time.";
-      i++;
-    }
-  }
-  if (num_max_change_global_applied_ > 0)
-    KALDI_LOG << "The global max-change was enforced "
-              << (100.0 * num_max_change_global_applied_) /
-                 (num_minibatches_processed_ *
-                 (nnet_config.backstitch_training_scale == 0.0 ? 1.0 :
-                 1.0 + 1.0 / nnet_config.backstitch_training_interval))
-              << " % of the time.";
-}
-
 NnetChainTrainer::~NnetChainTrainer() {
   if (opts_.nnet_config.write_cache != "") {
     Output ko(opts_.nnet_config.write_cache, opts_.nnet_config.binary_write_cache);
diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h
index 5bf6a3f6fce..bc5143491ac 100644
--- a/src/nnet3/nnet-chain-training.h
+++ b/src/nnet3/nnet-chain-training.h
@@ -64,10 +64,6 @@ class NnetChainTrainer {
   // Prints out the final stats, and return true if there was a nonzero count.
   bool PrintTotalStats() const;
 
-  // Prints out the max-change stats (if nonzero): the percentage of time that
-  // per-component max-change and global max-change were enforced.
-  void PrintMaxChangeStats() const;
-
   ~NnetChainTrainer();
  private:
   // The internal function for doing one step of conventional SGD training.
@@ -88,11 +84,8 @@ class NnetChainTrainer {
 
   chain::DenominatorGraph den_graph_;
   Nnet *nnet_;
-  Nnet *delta_nnet_;  // Only used if momentum != 0.0 or max-param-change !=
-                      // 0.0.  nnet representing accumulated parameter-change
-                      // (we'd call this gradient_nnet_, but due to
-                      // natural-gradient update, it's better to consider it as
-                      // a delta-parameter nnet.
+  Nnet *delta_nnet_;  // stores the change to the parameters on each training
+                      // iteration.
   CachingOptimizingCompiler compiler_;
 
   // This code supports multiple output layers, even though in the
@@ -101,8 +94,7 @@ class NnetChainTrainer {
   int32 num_minibatches_processed_;
 
   // stats for max-change.
-  std::vector<int32> num_max_change_per_component_applied_;
-  int32 num_max_change_global_applied_;
+  MaxChangeStats max_change_stats_;
 
   unordered_map<std::string, ObjectiveFunctionInfo, StringHasher> objf_info_;
 
diff --git a/src/nnet3/nnet-parse-test.cc b/src/nnet3/nnet-parse-test.cc
index babdbbdcb0e..5ae4917dba6 100644
--- a/src/nnet3/nnet-parse-test.cc
+++ b/src/nnet3/nnet-parse-test.cc
@@ -23,193 +23,6 @@
 namespace kaldi {
 namespace nnet3 {
 
-void UnitTestConfigLineParse() {
-  std::string str;
-  {
-    ConfigLine cfl;
-    str = "a-b xx=yyy foo=bar  baz=123 ba=1:2";
-    bool status = cfl.ParseLine(str);
-    KALDI_ASSERT(status && cfl.FirstToken() == "a-b");
-
-    KALDI_ASSERT(cfl.HasUnusedValues());
-    std::string str_value;
-    KALDI_ASSERT(cfl.GetValue("xx", &str_value));
-    KALDI_ASSERT(str_value == "yyy");
-    KALDI_ASSERT(cfl.HasUnusedValues());
-    KALDI_ASSERT(cfl.GetValue("foo", &str_value));
-    KALDI_ASSERT(str_value == "bar");
-    KALDI_ASSERT(cfl.HasUnusedValues());
-    KALDI_ASSERT(!cfl.GetValue("xy", &str_value));
-    KALDI_ASSERT(cfl.GetValue("baz", &str_value));
-    KALDI_ASSERT(str_value == "123");
-
-    std::vector<int32> int_values;
-    KALDI_ASSERT(!cfl.GetValue("xx", &int_values));
-    KALDI_ASSERT(cfl.GetValue("baz", &int_values));
-    KALDI_ASSERT(cfl.HasUnusedValues());
-    KALDI_ASSERT(int_values.size() == 1 && int_values[0] == 123);
-    KALDI_ASSERT(cfl.GetValue("ba", &int_values));
-    KALDI_ASSERT(int_values.size() == 2 && int_values[0] == 1 && int_values[1] == 2);
-    KALDI_ASSERT(!cfl.HasUnusedValues());
-  }
-
-  {
-    ConfigLine cfl;
-    str = "a-b baz=x y z pp = qq ab =cd ac= bd";
-    KALDI_ASSERT(!cfl.ParseLine(str));
-  }
-  {
-    ConfigLine cfl;
-    str = "a-b baz=x y z pp = qq ab=cd ac=bd";
-    KALDI_ASSERT(!cfl.ParseLine(str));
-  }
-  {
-    ConfigLine cfl;
-    str = "foo-bar";
-    KALDI_ASSERT(cfl.ParseLine(str));
-  }
-  {
-    ConfigLine cfl;
-    str = "foo-bar a=b c d f=g";
-    std::string value;
-    KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "foo-bar" &&
-                 cfl.GetValue("a", &value)  && value == "b c d" &&
-                 cfl.GetValue("f", &value) && value == "g" &&
-                 !cfl.HasUnusedValues());
-  }
-  {
-    ConfigLine cfl;
-    str = "zzz a=b baz";
-    KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "zzz" &&
-                 cfl.UnusedValues() == "a=b baz");
-  }
-  {
-    ConfigLine cfl;
-    str = "xxx a=b baz ";
-    KALDI_ASSERT(cfl.ParseLine(str) && cfl.UnusedValues() == "a=b baz");
-  }
-  {
-    ConfigLine cfl;
-    str = "xxx a=b =c";
-    KALDI_ASSERT(!cfl.ParseLine(str));
-  }
-  {
-    ConfigLine cfl;
-    str = "xxx baz='x y z' pp=qq ab=cd ac=bd";
-    KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "xxx");
-    std::string str_value;
-    KALDI_ASSERT(cfl.GetValue("baz", &str_value));
-    KALDI_ASSERT(str_value == "x y z");
-    KALDI_ASSERT(cfl.GetValue("pp", &str_value));
-    KALDI_ASSERT(str_value == "qq");
-    KALDI_ASSERT(cfl.UnusedValues() == "ab=cd ac=bd");
-    KALDI_ASSERT(cfl.GetValue("ab", &str_value));
-    KALDI_ASSERT(str_value == "cd");
-    KALDI_ASSERT(cfl.UnusedValues() == "ac=bd");
-    KALDI_ASSERT(cfl.HasUnusedValues());
-    KALDI_ASSERT(cfl.GetValue("ac", &str_value));
-    KALDI_ASSERT(str_value == "bd");
-    KALDI_ASSERT(!cfl.HasUnusedValues());
-  }
-
-  {
-    ConfigLine cfl;
-    str = "x baz= pp = qq flag=t ";
-    KALDI_ASSERT(!cfl.ParseLine(str));
-  }
-  {
-    ConfigLine cfl;
-    str = " x baz= pp=qq flag=t  ";
-    KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "x");
-
-    std::string str_value;
-    KALDI_ASSERT(cfl.GetValue("baz", &str_value));
-    KALDI_ASSERT(str_value == "");
-    KALDI_ASSERT(cfl.GetValue("pp", &str_value));
-    KALDI_ASSERT(str_value == "qq");
-    KALDI_ASSERT(cfl.HasUnusedValues());
-    KALDI_ASSERT(cfl.GetValue("flag", &str_value));
-    KALDI_ASSERT(str_value == "t");
-    KALDI_ASSERT(!cfl.HasUnusedValues());
-
-    bool bool_value = false;
-    KALDI_ASSERT(cfl.GetValue("flag", &bool_value));
-    KALDI_ASSERT(bool_value);
-  }
-
-  {
-    ConfigLine cfl;
-    str = "xx _baz=a -pp=qq";
-    KALDI_ASSERT(!cfl.ParseLine(str));
-  }
-  {
-    ConfigLine cfl;
-    str = "xx 0baz=a pp=qq";
-    KALDI_ASSERT(!cfl.ParseLine(str));
-  }
-  {
-    ConfigLine cfl;
-    str = "xx -baz=a pp=qq";
-    KALDI_ASSERT(!cfl.ParseLine(str));
-  }
-  {
-    ConfigLine cfl;
-    str = "xx _baz'=a pp=qq";
-    KALDI_ASSERT(!cfl.ParseLine(str));
-  }
-  {
-    ConfigLine cfl;
-    str = " baz=g";
-    KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "");
-    bool flag;
-    KALDI_ASSERT(!cfl.GetValue("baz", &flag));
-  }
-  {
-    ConfigLine cfl;
-    str = "xx _baz1=a pp=qq";
-    KALDI_ASSERT(cfl.ParseLine(str));
-
-    std::string str_value;
-    KALDI_ASSERT(cfl.GetValue("_baz1", &str_value));
-  }
-}
-
-void UnitTestReadConfig() {
-  std::string str = "a-b alpha=aa beta=\"b b\"# String test\n"
-      "a-b beta2='b c' beta3=bd # \n"
-      "a-b gamma=1:2:3:4  # Int Vector test\n"
-      " a-b de1ta=f  # Bool + Integer in key Comment test delta=t  \n"
-      "a-b _epsilon=-1  # Int Vector test _epsilon=1 \n"
-      "a-b zet-_a=0.15   theta=1.1# Float, -, _ test\n"
-      "a-b quoted='a b c' # quoted string\n"
-      "a-b quoted2=\"d e 'a b=c' f\" # string quoted with double quotes";
-
-  std::istringstream is(str);
-  std::vector<std::string> lines;
-  ReadConfigLines(is, &lines);
-  KALDI_ASSERT(lines.size() == 8);
-
-  ConfigLine cfl;
-  for (size_t i = 0; i < lines.size(); i++) {
-    KALDI_ASSERT(cfl.ParseLine(lines[i]) && cfl.FirstToken() == "a-b");
-    if (i == 1) {
-        KALDI_ASSERT(cfl.GetValue("beta2", &str) && str == "b c");
-    }
-    if (i == 4) {
-      KALDI_ASSERT(cfl.GetValue("_epsilon", &str) && str == "-1");
-    }
-    if (i == 5) {
-      BaseFloat float_val = 0;
-      KALDI_ASSERT(cfl.GetValue("zet-_a", &float_val) && ApproxEqual(float_val, 0.15));
-    }
-    if (i == 6) {
-      KALDI_ASSERT(cfl.GetValue("quoted", &str) && str == "a b c");
-    }
-    if (i == 7) {
-      KALDI_ASSERT(cfl.GetValue("quoted2", &str) && str == "d e 'a b=c' f");
-    }
-  }
-}
 
 void UnitTestDescriptorTokenize() {
   std::vector<std::string> lines;
@@ -281,8 +94,6 @@ int main() {
   using namespace kaldi;
   using namespace kaldi::nnet3;
 
-  UnitTestConfigLineParse();
-  UnitTestReadConfig();
   UnitTestDescriptorTokenize();
   UnitTestSummarizeVector();
   UnitTestNameMatchesPattern();
diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc
index a51bba21484..17dec23e7c1 100644
--- a/src/nnet3/nnet-parse.cc
+++ b/src/nnet3/nnet-parse.cc
@@ -27,353 +27,6 @@
 namespace kaldi {
 namespace nnet3 {
 
-
-bool ConfigLine::ParseLine(const std::string &line) {
-  data_.clear();
-  whole_line_ = line;
-  if (line.size() == 0) return false;   // Empty line
-  size_t pos = 0, size = line.size();
-  while (isspace(line[pos]) && pos < size) pos++;
-  if (pos == size)
-    return false;  // whitespace-only line
-  size_t first_token_start_pos = pos;
-  // first get first_token_.
-  while (!isspace(line[pos]) && pos < size) {
-    if (line[pos] == '=') {
-      // If the first block of non-whitespace looks like "foo-bar=...",
-      // then we ignore it: there is no initial token, and FirstToken()
-      // is empty.
-      pos = first_token_start_pos;
-      break;
-    }
-    pos++;
-  }
-  first_token_ = std::string(line, first_token_start_pos, pos - first_token_start_pos);
-  // first_token_ is expected to be either empty or something like
-  // "component-node", which actually is a slightly more restrictive set of
-  // strings than IsValidName() checks for this is a convenient way to check it.
-  if (!first_token_.empty() && !IsValidName(first_token_))
-    return false;
-
-  while (pos < size) {
-    if (isspace(line[pos])) {
-      pos++;
-      continue;
-    }
-
-    // OK, at this point we know that we are pointing at nonspace.
-    size_t next_equals_sign = line.find_first_of("=", pos);
-    if (next_equals_sign == pos || next_equals_sign == std::string::npos) {
-      // we're looking for something like 'key=value'.  If there is no equals sign,
-      // or it's not preceded by something, it's a parsing failure.
-      return false;
-    }
-    std::string key(line, pos, next_equals_sign - pos);
-    if (!IsValidName(key)) return false;
-
-    // handle any quotes.  we support key='blah blah' or key="foo bar".
-    // no escaping is supported.
-    if (line[next_equals_sign+1] == '\'' || line[next_equals_sign+1] == '"') {
-      char my_quote = line[next_equals_sign+1];
-      size_t next_quote = line.find_first_of(my_quote, next_equals_sign + 2);
-      if (next_quote == std::string::npos) {  // no matching quote was found.
-        KALDI_WARN << "No matching quote for " << my_quote << " in config line '"
-                   << line << "'";
-        return false;
-      } else {
-        std::string value(line, next_equals_sign + 2,
-                          next_quote - next_equals_sign - 2);
-        data_.insert(std::make_pair(key, std::make_pair(value, false)));
-        pos = next_quote + 1;
-        continue;
-      }
-    } else {
-      // we want to be able to parse something like "... input=Offset(a, -1) foo=bar":
-      // in general, config values with spaces in them, even without quoting.
-
-      size_t next_next_equals_sign = line.find_first_of("=", next_equals_sign + 1),
-          terminating_space = size;
-
-      if (next_next_equals_sign != std::string::npos) {  // found a later equals sign.
-        size_t preceding_space = line.find_last_of(" \t", next_next_equals_sign);
-        if (preceding_space != std::string::npos &&
-            preceding_space > next_equals_sign)
-          terminating_space = preceding_space;
-      }
-      while (isspace(line[terminating_space - 1]) && terminating_space > 0)
-        terminating_space--;
-
-      std::string value(line, next_equals_sign + 1,
-                        terminating_space - (next_equals_sign + 1));
-      data_.insert(std::make_pair(key, std::make_pair(value, false)));
-      pos = terminating_space;
-    }
-  }
-  return true;
-}
-
-bool ConfigLine::GetValue(const std::string &key, std::string *value) {
-  KALDI_ASSERT(value != NULL);
-  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
-  for (; it != data_.end(); ++it) {
-    if (it->first == key) {
-      *value = (it->second).first;
-      (it->second).second = true;
-      return true;
-    }
-  }
-  return false;
-}
-
-bool ConfigLine::GetValue(const std::string &key, BaseFloat *value) {
-  KALDI_ASSERT(value != NULL);
-  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
-  for (; it != data_.end(); ++it) {
-    if (it->first == key) {
-      if (!ConvertStringToReal((it->second).first, value))
-        return false;
-      (it->second).second = true;
-      return true;
-    }
-  }
-  return false;
-}
-
-bool ConfigLine::GetValue(const std::string &key, int32 *value) {
-  KALDI_ASSERT(value != NULL);
-  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
-  for (; it != data_.end(); ++it) {
-    if (it->first == key) {
-      if (!ConvertStringToInteger((it->second).first, value))
-        return false;
-      (it->second).second = true;
-      return true;
-    }
-  }
-  return false;
-}
-
-bool ConfigLine::GetValue(const std::string &key, std::vector<int32> *value) {
-  KALDI_ASSERT(value != NULL);
-  value->clear();
-  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
-  for (; it != data_.end(); ++it) {
-    if (it->first == key) {
-      if (!SplitStringToIntegers((it->second).first, ":,", true, value)) {
-        // KALDI_WARN << "Bad option " << (it->second).first;
-        return false;
-      }
-      (it->second).second = true;
-      return true;
-    }
-  }
-  return false;
-}
-
-bool ConfigLine::GetValue(const std::string &key, bool *value) {
-  KALDI_ASSERT(value != NULL);
-  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
-  for (; it != data_.end(); ++it) {
-    if (it->first == key) {
-      if ((it->second).first.size() == 0) return false;
-      switch (((it->second).first)[0]) {
-        case 'F':
-        case 'f':
-          *value = false;
-          break;
-        case 'T':
-        case 't':
-          *value = true;
-          break;
-        default:
-          return false;
-      }
-      (it->second).second = true;
-      return true;
-    }
-  }
-  return false;
-}
-
-bool ConfigLine::HasUnusedValues() const {
-  std::map<std::string, std::pair<std::string, bool> >::const_iterator it = data_.begin();
-  for (; it != data_.end(); ++it) {
-    if (!(it->second).second) return true;
-  }
-  return false;
-}
-
-std::string ConfigLine::UnusedValues() const {
-  std::string unused_str;
-  std::map<std::string, std::pair<std::string, bool> >::const_iterator it = data_.begin();
-  for (; it != data_.end(); ++it) {
-    if (!(it->second).second) {
-      if (unused_str == "")
-        unused_str = it->first + "=" + (it->second).first;
-      else
-        unused_str += " " + it->first + "=" + (it->second).first;
-    }
-  }
-  return unused_str;
-}
-
-// This is like ExpectToken but for two tokens, and it
-// will either accept token1 and then token2, or just token2.
-// This is useful in Read functions where the first token
-// may already have been consumed.
-void ExpectOneOrTwoTokens(std::istream &is, bool binary,
-                          const std::string &token1,
-                          const std::string &token2) {
-  KALDI_ASSERT(token1 != token2);
-  std::string temp;
-  ReadToken(is, binary, &temp);
-  if (temp == token1) {
-    ExpectToken(is, binary, token2);
-  } else {
-    if (temp != token2) {
-      KALDI_ERR << "Expecting token " << token1 << " or " << token2
-                << " but got " << temp;
-    }
-  }
-}
-
-// static
-bool ParseFromString(const std::string &name, std::string *string,
-                     int32 *param) {
-  std::vector<std::string> split_string;
-  SplitStringToVector(*string, " \t", true,
-                      &split_string);
-  std::string name_equals = name + "="; // the name and then the equals sign.
-  size_t len = name_equals.length();
-
-  for (size_t i = 0; i < split_string.size(); i++) {
-    if (split_string[i].compare(0, len, name_equals) == 0) {
-      if (!ConvertStringToInteger(split_string[i].substr(len), param))
-        KALDI_ERR << "Bad option " << split_string[i];
-      *string = "";
-      // Set "string" to all the pieces but the one we used.
-      for (size_t j = 0; j < split_string.size(); j++) {
-        if (j != i) {
-          if (!string->empty()) *string += " ";
-          *string += split_string[j];
-        }
-      }
-      return true;
-    }
-  }
-  return false;
-}
-
-bool ParseFromString(const std::string &name, std::string *string,
-                     bool *param) {
-  std::vector<std::string> split_string;
-  SplitStringToVector(*string, " \t", true,
-                      &split_string);
-  std::string name_equals = name + "="; // the name and then the equals sign.
-  size_t len = name_equals.length();
-
-  for (size_t i = 0; i < split_string.size(); i++) {
-    if (split_string[i].compare(0, len, name_equals) == 0) {
-      std::string b = split_string[i].substr(len);
-      if (b.empty())
-        KALDI_ERR << "Bad option " << split_string[i];
-      if (b[0] == 'f' || b[0] == 'F') *param = false;
-      else if (b[0] == 't' || b[0] == 'T') *param = true;
-      else
-        KALDI_ERR << "Bad option " << split_string[i];
-      *string = "";
-      // Set "string" to all the pieces but the one we used.
-      for (size_t j = 0; j < split_string.size(); j++) {
-        if (j != i) {
-          if (!string->empty()) *string += " ";
-          *string += split_string[j];
-        }
-      }
-      return true;
-    }
-  }
-  return false;
-}
-
-bool ParseFromString(const std::string &name, std::string *string,
-                     BaseFloat *param) {
-  std::vector<std::string> split_string;
-  SplitStringToVector(*string, " \t", true,
-                      &split_string);
-  std::string name_equals = name + "="; // the name and then the equals sign.
-  size_t len = name_equals.length();
-
-  for (size_t i = 0; i < split_string.size(); i++) {
-    if (split_string[i].compare(0, len, name_equals) == 0) {
-      if (!ConvertStringToReal(split_string[i].substr(len), param))
-        KALDI_ERR << "Bad option " << split_string[i];
-      *string = "";
-      // Set "string" to all the pieces but the one we used.
-      for (size_t j = 0; j < split_string.size(); j++) {
-        if (j != i) {
-          if (!string->empty()) *string += " ";
-          *string += split_string[j];
-        }
-      }
-      return true;
-    }
-  }
-  return false;
-}
-
-bool ParseFromString(const std::string &name, std::string *string,
-                     std::string *param) {
-  std::vector<std::string> split_string;
-  SplitStringToVector(*string, " \t", true,
-                      &split_string);
-  std::string name_equals = name + "="; // the name and then the equals sign.
-  size_t len = name_equals.length();
-
-  for (size_t i = 0; i < split_string.size(); i++) {
-    if (split_string[i].compare(0, len, name_equals) == 0) {
-      *param = split_string[i].substr(len);
-
-      // Set "string" to all the pieces but the one we used.
-      *string = "";
-      for (size_t j = 0; j < split_string.size(); j++) {
-        if (j != i) {
-          if (!string->empty()) *string += " ";
-          *string += split_string[j];
-        }
-      }
-      return true;
-    }
-  }
-  return false;
-}
-
-bool ParseFromString(const std::string &name, std::string *string,
-                     std::vector<int32> *param) {
-  std::vector<std::string> split_string;
-  SplitStringToVector(*string, " \t", true,
-                      &split_string);
-  std::string name_equals = name + "="; // the name and then the equals sign.
-  size_t len = name_equals.length();
-
-  for (size_t i = 0; i < split_string.size(); i++) {
-    if (split_string[i].compare(0, len, name_equals) == 0) {
-      if (!SplitStringToIntegers(split_string[i].substr(len), ":,",
-                                 false, param))
-        KALDI_ERR << "Bad option " << split_string[i];
-      *string = "";
-      // Set "string" to all the pieces but the one we used.
-      for (size_t j = 0; j < split_string.size(); j++) {
-        if (j != i) {
-          if (!string->empty()) *string += " ";
-          *string += split_string[j];
-        }
-      }
-      return true;
-    }
-  }
-  return false;
-}
-
 bool DescriptorTokenize(const std::string &input,
                         std::vector<std::string> *tokens) {
   KALDI_ASSERT(tokens != NULL);
@@ -422,32 +75,6 @@ bool DescriptorTokenize(const std::string &input,
   return true;
 }
 
-bool IsValidName(const std::string &name) {
-  if (name.size() == 0) return false;
-  for (size_t i = 0; i < name.size(); i++) {
-    if (i == 0 && !isalpha(name[i]) && name[i] != '_')
-      return false;
-    if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.')
-      return false;
-  }
-  return true;
-}
-
-void ReadConfigLines(std::istream &is,
-                    std::vector<std::string> *lines) {
-  KALDI_ASSERT(lines != NULL);
-  std::string line;
-  while (std::getline(is, line)) {
-    if (line.size() == 0) continue;
-    size_t start = line.find_first_not_of(" \t");
-    size_t end = line.find_first_of('#');
-    if (start == std::string::npos || start == end) continue;
-    end = line.find_last_not_of(" \t", end - 1);
-    KALDI_ASSERT(end >= start);
-    lines->push_back(line.substr(start, end - start + 1));
-  }
-}
-
 std::string ErrorContext(std::istream &is) {
   if (!is.good()) return "end of line";
   char buf[21];
diff --git a/src/nnet3/nnet-parse.h b/src/nnet3/nnet-parse.h
index a073a54f7e0..0fc19d51f6c 100644
--- a/src/nnet3/nnet-parse.h
+++ b/src/nnet3/nnet-parse.h
@@ -26,103 +26,6 @@
 namespace kaldi {
 namespace nnet3 {
 
-/**
-   This class is responsible for parsing input like
-    hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' baz="a b c d='a b' e"
-   and giving you access to the fields, in this case
-
-   FirstToken() == "hi-there", and key->value pairs:
-
-   xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123",
-   bing->"a b c", baz->"a b c d='a b' e"
-
-   The first token is optional, if the line started with a key-value pair then
-   FirstValue() will be empty.
-
-   Note: it can parse value fields with space inside them only if they are free of the '='
-   character.  If values are going to contain the '=' character, you need to quote them
-   with either single or double quotes.
-
-   Key values may contain -_a-zA-Z0-9, but must begin with a-zA-Z_.
- */
-class ConfigLine {
- public:
-  // Tries to parse the line as a config-file line.  Returns false
-  // if it could not for some reason, e.g. parsing failure.  In most cases
-  // prints no warnings; the user should do this.  Does not expect comments.
-  bool ParseLine(const std::string &line);
-
-  // the GetValue functions are overloaded for various types.  They return true
-  // if the key exists with value that can be converted to that type, and false
-  // otherwise.  They also mark the key-value pair as having been read.  It is
-  // not an error to read values twice.
-  bool GetValue(const std::string &key, std::string *value);
-  bool GetValue(const std::string &key, BaseFloat *value);
-  bool GetValue(const std::string &key, int32 *value);
-  // Values may be separated by ":" or by ",".
-  bool GetValue(const std::string &key, std::vector<int32> *value);
-  bool GetValue(const std::string &key, bool *value);
-
-  bool HasUnusedValues() const;
-  /// returns e.g. foo=bar xxx=yyy if foo and xxx were not consumed by one
-  /// of the GetValue() functions.
-  std::string UnusedValues() const;
-
-  const std::string &FirstToken() const { return first_token_; }
-
-  const std::string WholeLine() { return whole_line_; }
-  // use default assignment operator and copy constructor.
- private:
-  std::string whole_line_;
-  // the first token of the line, e.g. if line is
-  // foo-bar baz=bing
-  // then first_token_ would be "foo-bar".
-  std::string first_token_;
-
-  // data_ maps from key to (value, is-this-value-consumed?).
-  std::map<std::string, std::pair<std::string, bool> > data_;
-
-};
-
-// Note: the ParseFromString functions are to be removed after we switch over to
-// using the ConfigLine mechanism.
-
-
-/// \file nnet-parse.h
-///   This header contains a few parsing-related functions that are used
-///    while reading parsing neural network files and config files.
-
-/// Function used in Init routines.  Suppose name=="foo", if "string" has a
-/// field like foo=12, this function will set "param" to 12 and remove that
-/// element from "string".  It returns true if the parameter was read.
-bool ParseFromString(const std::string &name, std::string *string,
-                     int32 *param);
-
-/// This version of ParseFromString is for parameters of type BaseFloat.
-bool ParseFromString(const std::string &name, std::string *string,
-                     BaseFloat *param);
-
-/// This version of ParseFromString is for parameters of type bool, which can
-/// appear as any string beginning with f, F, t or T.
-bool ParseFromString(const std::string &name, std::string *string,
-                     bool *param);
-
-/// This version of ParseFromString is for parsing strings.  (these
-/// should not contain space).
-bool ParseFromString(const std::string &name, std::string *string,
-                     std::string *param);
-
-/// This version of ParseFromString handles colon-separated or comma-separated
-/// lists of integers.
-bool ParseFromString(const std::string &name, std::string *string,
-                     std::vector<int32> *param);
-
-/// This function is like ExpectToken but for two tokens, and it will either
-/// accept token1 and then token2, or just token2.  This is useful in Read
-/// functions where the first token may already have been consumed.
-void ExpectOneOrTwoTokens(std::istream &is, bool binary,
-                          const std::string &token1,
-                          const std::string &token2);
 
 /**
    This function tokenizes input when parsing Descriptor configuration
@@ -142,32 +45,6 @@ void ExpectOneOrTwoTokens(std::istream &is, bool binary,
 bool DescriptorTokenize(const std::string &input,
                         std::vector<std::string> *tokens);
 
-/// Returns true if 'name' would be a valid name for a component or node in a
-/// Nnet.  This is a nonempty string beginning with A-Za-z_, and containing only
-/// '-', '_', '.', A-Z, a-z, or 0-9.
-bool IsValidName(const std::string &name);
-
-
-/**
-   This function reads in a config file and *appends* its contents to a vector of
-   lines; it is responsible for removing comments (anything after '#') and
-   stripping out any lines that contain only whitespace after comment removal.
- */
-void ReadConfigLines(std::istream &is,
-                     std::vector<std::string> *lines);
-
-
-/**
-   This function converts config-lines from a simple sequence of strings
-   as output by ReadConfigLines(), into a sequence of first-tokens and
-   name-value pairs.  The general format is:
-      "command-type bar=baz xx=yyy"
-   etc., although there are subtleties as to what exactly is allowed, see
-   documentation for class ConfigLine for details.
-   This function will die if there was a parsing failure.
- */
-void ParseConfigLines(const std::vector<std::string> &lines,
-                      std::vector<ConfigLine> *config_lines);
 
 /*
   Returns true if name 'name' matches pattern 'pattern'.  The pattern
diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc
index 820644470c7..b4563c7a2c3 100644
--- a/src/nnet3/nnet-training.cc
+++ b/src/nnet3/nnet-training.cc
@@ -30,6 +30,7 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config,
     nnet_(nnet),
     compiler_(*nnet, config_.optimize_config, config_.compiler_config),
     num_minibatches_processed_(0),
+    max_change_stats_(*nnet),
     srand_seed_(RandInt(0, 100000)) {
   if (config.zero_component_stats)
     ZeroComponentStats(nnet);
@@ -38,9 +39,6 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config,
                config.backstitch_training_interval > 0);
   delta_nnet_ = nnet_->Copy();
   ScaleNnet(0.0, delta_nnet_);
-  const int32 num_updatable = NumUpdatableComponents(*delta_nnet_);
-  num_max_change_per_component_applied_.resize(num_updatable, 0);
-  num_max_change_global_applied_ = 0;
 
   if (config_.read_cache != "") {
     bool binary;
@@ -111,9 +109,9 @@ void NnetTrainer::TrainInternal(const NnetExample &eg,
                         delta_nnet_);
 
   // Update the parameters of nnet
-  bool success = UpdateNnetWithMaxChange(*delta_nnet_, config_.max_param_change,
-      1.0, 1.0 - config_.momentum, nnet_,
-      &num_max_change_per_component_applied_, &num_max_change_global_applied_);
+  bool success = UpdateNnetWithMaxChange(
+      *delta_nnet_, config_.max_param_change,
+      1.0, 1.0 - config_.momentum, nnet_, &max_change_stats_);
 
   // Scale down the batchnorm stats (keeps them fresh... this affects what
   // happens when we use the model with batchnorm test-mode set).
@@ -167,9 +165,10 @@ void NnetTrainer::TrainInternalBackstitch(const NnetExample &eg,
   }
 
   // Updates the parameters of nnet
-  UpdateNnetWithMaxChange(*delta_nnet_, config_.max_param_change,
+  UpdateNnetWithMaxChange(
+      *delta_nnet_, config_.max_param_change,
       max_change_scale, scale_adding, nnet_,
-      &num_max_change_per_component_applied_, &num_max_change_global_applied_);
+      &max_change_stats_);
 
   if (is_backstitch_step1) {
     // The following will only do something if we have a LinearComponent or
@@ -236,40 +235,10 @@ bool NnetTrainer::PrintTotalStats() const {
     bool ok = info.PrintTotalStats(name);
     ans = ans || ok;
   }
-  PrintMaxChangeStats();
+  max_change_stats_.Print(*nnet_);
   return ans;
 }
 
-void NnetTrainer::PrintMaxChangeStats() const {
-  KALDI_ASSERT(delta_nnet_ != NULL);
-  int32 i = 0;
-  for (int32 c = 0; c < delta_nnet_->NumComponents(); c++) {
-    Component *comp = delta_nnet_->GetComponent(c);
-    if (comp->Properties() & kUpdatableComponent) {
-      UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(comp);
-      if (uc == NULL)
-        KALDI_ERR << "Updatable component does not inherit from class "
-                  << "UpdatableComponent; change this code.";
-      if (num_max_change_per_component_applied_[i] > 0)
-        KALDI_LOG << "For " << delta_nnet_->GetComponentName(c)
-                  << ", per-component max-change was enforced "
-                  << (100.0 * num_max_change_per_component_applied_[i]) /
-                     (num_minibatches_processed_ *
-                     (config_.backstitch_training_scale == 0.0 ? 1.0 :
-                     1.0 + 1.0 / config_.backstitch_training_interval))
-                  << " % of the time.";
-      i++;
-    }
-  }
-  if (num_max_change_global_applied_ > 0)
-    KALDI_LOG << "The global max-change was enforced "
-              << (100.0 * num_max_change_global_applied_) /
-                 (num_minibatches_processed_ *
-                 (config_.backstitch_training_scale == 0.0 ? 1.0 :
-                 1.0 + 1.0 / config_.backstitch_training_interval))
-              << " % of the time.";
-}
-
 void ObjectiveFunctionInfo::UpdateStats(
     const std::string &output_name,
     int32 minibatches_per_phase,
diff --git a/src/nnet3/nnet-training.h b/src/nnet3/nnet-training.h
index fffc621930a..64ec7abc58e 100644
--- a/src/nnet3/nnet-training.h
+++ b/src/nnet3/nnet-training.h
@@ -26,6 +26,7 @@
 #include "nnet3/nnet-compute.h"
 #include "nnet3/nnet-optimize.h"
 #include "nnet3/nnet-example-utils.h"
+#include "nnet3/nnet-utils.h"
 
 namespace kaldi {
 namespace nnet3 {
@@ -187,10 +188,6 @@ class NnetTrainer {
   // Prints out the final stats, and return true if there was a nonzero count.
   bool PrintTotalStats() const;
 
-  // Prints out the max-change stats (if nonzero): the percentage of time that
-  // per-component max-change and global max-change were enforced.
-  void PrintMaxChangeStats() const;
-
   ~NnetTrainer();
  private:
   // The internal function for doing one step of conventional SGD training.
@@ -220,8 +217,7 @@ class NnetTrainer {
   int32 num_minibatches_processed_;
 
   // stats for max-change.
-  std::vector<int32> num_max_change_per_component_applied_;
-  int32 num_max_change_global_applied_;
+  MaxChangeStats max_change_stats_;
 
   unordered_map<std::string, ObjectiveFunctionInfo, StringHasher> objf_info_;
 
diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc
index e020f8fc6a7..541d2735529 100644
--- a/src/nnet3/nnet-utils.cc
+++ b/src/nnet3/nnet-utils.cc
@@ -1655,7 +1655,6 @@ class ModelCollapser {
                                                   component_index2);
   }
 
-
   /**
      Tries to produce a component that's equivalent to running the component
      'component_index2' with input given by 'component_index1'.  This handles
@@ -2173,5 +2172,47 @@ void ApplyL2Regularization(const Nnet &nnet,
 }
 
 
+bool UpdateNnetWithMaxChange(const Nnet &delta_nnet,
+                             BaseFloat max_param_change,
+                             BaseFloat max_change_scale,
+                             BaseFloat scale, Nnet *nnet,
+                             MaxChangeStats *stats) {
+  bool ans = UpdateNnetWithMaxChange(
+      delta_nnet, max_param_change, max_change_scale,
+      scale, nnet,
+      &(stats->num_max_change_per_component_applied),
+      &(stats->num_max_change_global_applied));
+  stats->num_minibatches_processed++;
+  return ans;
+}
+
+
+void MaxChangeStats::Print(const Nnet &nnet) const {
+  int32 i = 0;  // index into num_max_change_per_component_applied.
+  for (int32 c = 0; c < nnet.NumComponents(); c++) {
+    const Component *comp = nnet.GetComponent(c);
+    if (comp->Properties() & kUpdatableComponent) {
+      const UpdatableComponent *uc = dynamic_cast<const UpdatableComponent*>(
+          comp);
+      if (uc == NULL)
+        KALDI_ERR << "Updatable component does not inherit from class "
+                  << "UpdatableComponent; change this code.";
+      if (num_max_change_per_component_applied[i] > 0)
+        KALDI_LOG << "For " << nnet.GetComponentName(c)
+                  << ", per-component max-change was enforced "
+                  << ((100.0 * num_max_change_per_component_applied[i]) /
+                      num_minibatches_processed)
+                  << " % of the time.";
+      i++;  // only updatable components have an entry in the stats vector.
+    }
+  }
+  if (num_max_change_global_applied > 0)
+    KALDI_LOG << "The global max-change was enforced "
+              << ((100.0 * num_max_change_global_applied) /
+                  num_minibatches_processed)
+              << " % of the time.";
+}
+
+
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h
index 787bd228a38..60a18f15d84 100644
--- a/src/nnet3/nnet-utils.h
+++ b/src/nnet3/nnet-utils.h
@@ -377,6 +377,17 @@ bool UpdateNnetWithMaxChange(const Nnet &delta_nnet,
                              num_max_change_per_component_applied,
                              int32 *num_max_change_global_applied);
 
+struct MaxChangeStats;
+
+// This overloaded version of UpdateNnetWithMaxChange() is a convenience
+// wrapper for when you have a MaxChangeStats object to keep track
+// of how many times the max-change was applied.  See documentation above.
+bool UpdateNnetWithMaxChange(const Nnet &delta_nnet,
+                             BaseFloat max_param_change,
+                             BaseFloat max_change_scale,
+                             BaseFloat scale, Nnet *nnet,
+                             MaxChangeStats *stats);
+
 
 /**
    This function is used as part of the regular training workflow, prior to
@@ -513,6 +524,24 @@ int32 GetNumNvalues(const std::vector<NnetIo> &io_vec,
                     bool exhaustive);
 
 
+struct MaxChangeStats {
+  int32 num_max_change_global_applied;
+  int32 num_minibatches_processed;
+  std::vector<int32> num_max_change_per_component_applied;
+
+  MaxChangeStats(const Nnet &nnet):
+      num_max_change_global_applied(0),
+      num_minibatches_processed(0),
+      num_max_change_per_component_applied(NumUpdatableComponents(nnet), 0) { }
+
+  // Prints the max-change stats.  Usually will be called at the end
+  // of the program.  The nnet is only needed for structural information,
+  // to work out the component names.
+  void Print(const Nnet &nnet) const;
+};
+
+
+
 } // namespace nnet3
 } // namespace kaldi
 
diff --git a/src/util/text-utils-test.cc b/src/util/text-utils-test.cc
index 5bfe4cb24d0..3b58f4f1dd1 100644
--- a/src/util/text-utils-test.cc
+++ b/src/util/text-utils-test.cc
@@ -2,6 +2,7 @@
 
 // Copyright 2009-2011     Microsoft Corporation
 //                2017     Johns Hopkins University (author: Daniel Povey)
+//                2015  Vimal Manohar (Johns Hopkins University)
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -324,6 +325,193 @@ void TestStringsApproxEqual() {
   KALDI_ASSERT(!StringsApproxEqual("x 1.0 y", "x 1.0001 y", 4));
 }
 
+void UnitTestConfigLineParse() {
+  std::string str;
+  {
+    ConfigLine cfl;
+    str = "a-b xx=yyy foo=bar  baz=123 ba=1:2";
+    bool status = cfl.ParseLine(str);
+    KALDI_ASSERT(status && cfl.FirstToken() == "a-b");
+
+    KALDI_ASSERT(cfl.HasUnusedValues());
+    std::string str_value;
+    KALDI_ASSERT(cfl.GetValue("xx", &str_value));
+    KALDI_ASSERT(str_value == "yyy");
+    KALDI_ASSERT(cfl.HasUnusedValues());
+    KALDI_ASSERT(cfl.GetValue("foo", &str_value));
+    KALDI_ASSERT(str_value == "bar");
+    KALDI_ASSERT(cfl.HasUnusedValues());
+    KALDI_ASSERT(!cfl.GetValue("xy", &str_value));
+    KALDI_ASSERT(cfl.GetValue("baz", &str_value));
+    KALDI_ASSERT(str_value == "123");
+
+    std::vector<int32> int_values;
+    KALDI_ASSERT(!cfl.GetValue("xx", &int_values));
+    KALDI_ASSERT(cfl.GetValue("baz", &int_values));
+    KALDI_ASSERT(cfl.HasUnusedValues());
+    KALDI_ASSERT(int_values.size() == 1 && int_values[0] == 123);
+    KALDI_ASSERT(cfl.GetValue("ba", &int_values));
+    KALDI_ASSERT(int_values.size() == 2 && int_values[0] == 1 && int_values[1] == 2);
+    KALDI_ASSERT(!cfl.HasUnusedValues());
+  }
+
+  {
+    ConfigLine cfl;
+    str = "a-b baz=x y z pp = qq ab =cd ac= bd";
+    KALDI_ASSERT(!cfl.ParseLine(str));
+  }
+  {
+    ConfigLine cfl;
+    str = "a-b baz=x y z pp = qq ab=cd ac=bd";
+    KALDI_ASSERT(!cfl.ParseLine(str));
+  }
+  {
+    ConfigLine cfl;
+    str = "foo-bar";
+    KALDI_ASSERT(cfl.ParseLine(str));
+  }
+  {
+    ConfigLine cfl;
+    str = "foo-bar a=b c d f=g";
+    std::string value;
+    KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "foo-bar" &&
+                 cfl.GetValue("a", &value)  && value == "b c d" &&
+                 cfl.GetValue("f", &value) && value == "g" &&
+                 !cfl.HasUnusedValues());
+  }
+  {
+    ConfigLine cfl;
+    str = "zzz a=b baz";
+    KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "zzz" &&
+                 cfl.UnusedValues() == "a=b baz");
+  }
+  {
+    ConfigLine cfl;
+    str = "xxx a=b baz ";
+    KALDI_ASSERT(cfl.ParseLine(str) && cfl.UnusedValues() == "a=b baz");
+  }
+  {
+    ConfigLine cfl;
+    str = "xxx a=b =c";
+    KALDI_ASSERT(!cfl.ParseLine(str));
+  }
+  {
+    ConfigLine cfl;
+    str = "xxx baz='x y z' pp=qq ab=cd ac=bd";
+    KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "xxx");
+    std::string str_value;
+    KALDI_ASSERT(cfl.GetValue("baz", &str_value));
+    KALDI_ASSERT(str_value == "x y z");
+    KALDI_ASSERT(cfl.GetValue("pp", &str_value));
+    KALDI_ASSERT(str_value == "qq");
+    KALDI_ASSERT(cfl.UnusedValues() == "ab=cd ac=bd");
+    KALDI_ASSERT(cfl.GetValue("ab", &str_value));
+    KALDI_ASSERT(str_value == "cd");
+    KALDI_ASSERT(cfl.UnusedValues() == "ac=bd");
+    KALDI_ASSERT(cfl.HasUnusedValues());
+    KALDI_ASSERT(cfl.GetValue("ac", &str_value));
+    KALDI_ASSERT(str_value == "bd");
+    KALDI_ASSERT(!cfl.HasUnusedValues());
+  }
+
+  {
+    ConfigLine cfl;
+    str = "x baz= pp = qq flag=t ";
+    KALDI_ASSERT(!cfl.ParseLine(str));
+  }
+  {
+    ConfigLine cfl;
+    str = " x baz= pp=qq flag=t  ";
+    KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "x");
+
+    std::string str_value;
+    KALDI_ASSERT(cfl.GetValue("baz", &str_value));
+    KALDI_ASSERT(str_value == "");
+    KALDI_ASSERT(cfl.GetValue("pp", &str_value));
+    KALDI_ASSERT(str_value == "qq");
+    KALDI_ASSERT(cfl.HasUnusedValues());
+    KALDI_ASSERT(cfl.GetValue("flag", &str_value));
+    KALDI_ASSERT(str_value == "t");
+    KALDI_ASSERT(!cfl.HasUnusedValues());
+
+    bool bool_value = false;
+    KALDI_ASSERT(cfl.GetValue("flag", &bool_value));
+    KALDI_ASSERT(bool_value);
+  }
+
+  {
+    ConfigLine cfl;
+    str = "xx _baz=a -pp=qq";
+    KALDI_ASSERT(!cfl.ParseLine(str));
+  }
+  {
+    ConfigLine cfl;
+    str = "xx 0baz=a pp=qq";
+    KALDI_ASSERT(!cfl.ParseLine(str));
+  }
+  {
+    ConfigLine cfl;
+    str = "xx -baz=a pp=qq";
+    KALDI_ASSERT(!cfl.ParseLine(str));
+  }
+  {
+    ConfigLine cfl;
+    str = "xx _baz'=a pp=qq";
+    KALDI_ASSERT(!cfl.ParseLine(str));
+  }
+  {
+    ConfigLine cfl;
+    str = " baz=g";
+    KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "");
+    bool flag;
+    KALDI_ASSERT(!cfl.GetValue("baz", &flag));
+  }
+  {
+    ConfigLine cfl;
+    str = "xx _baz1=a pp=qq";
+    KALDI_ASSERT(cfl.ParseLine(str));
+
+    std::string str_value;
+    KALDI_ASSERT(cfl.GetValue("_baz1", &str_value));
+  }
+}
+
+void UnitTestReadConfig() {
+  std::string str = "a-b alpha=aa beta=\"b b\"# String test\n"
+      "a-b beta2='b c' beta3=bd # \n"
+      "a-b gamma=1:2:3:4  # Int Vector test\n"
+      " a-b de1ta=f  # Bool + Integer in key Comment test delta=t  \n"
+      "a-b _epsilon=-1  # Int Vector test _epsilon=1 \n"
+      "a-b zet-_a=0.15   theta=1.1# Float, -, _ test\n"
+      "a-b quoted='a b c' # quoted string\n"
+      "a-b quoted2=\"d e 'a b=c' f\" # string quoted with double quotes";
+
+  std::istringstream is(str);
+  std::vector<std::string> lines;
+  ReadConfigLines(is, &lines);
+  KALDI_ASSERT(lines.size() == 8);
+
+  ConfigLine cfl;
+  for (size_t i = 0; i < lines.size(); i++) {
+    KALDI_ASSERT(cfl.ParseLine(lines[i]) && cfl.FirstToken() == "a-b");
+    if (i == 1) {
+        KALDI_ASSERT(cfl.GetValue("beta2", &str) && str == "b c");
+    }
+    if (i == 4) {
+      KALDI_ASSERT(cfl.GetValue("_epsilon", &str) && str == "-1");
+    }
+    if (i == 5) {
+      BaseFloat float_val = 0;
+      KALDI_ASSERT(cfl.GetValue("zet-_a", &float_val) && ApproxEqual(float_val, 0.15));
+    }
+    if (i == 6) {
+      KALDI_ASSERT(cfl.GetValue("quoted", &str) && str == "a b c");
+    }
+    if (i == 7) {
+      KALDI_ASSERT(cfl.GetValue("quoted2", &str) && str == "d e 'a b=c' f");
+    }
+  }
+}
 
 }  // end namespace kaldi
 
@@ -344,5 +532,7 @@ int main() {
   TestNan<double>();
   TestInf<float>();
   TestInf<double>();
+  UnitTestConfigLineParse();
+  UnitTestReadConfig();
   std::cout << "Test OK\n";
 }
diff --git a/src/util/text-utils.cc b/src/util/text-utils.cc
index 200e3ad9327..bbf38ecc5cc 100644
--- a/src/util/text-utils.cc
+++ b/src/util/text-utils.cc
@@ -340,4 +340,252 @@ bool StringsApproxEqual(const std::string &a,
 }
 
 
+bool ConfigLine::ParseLine(const std::string &line) {
+  data_.clear();
+  whole_line_ = line;
+  if (line.size() == 0) return false;   // Empty line
+  size_t pos = 0, size = line.size();
+  while (pos < size && isspace(line[pos])) pos++;  // bound first, then index.
+  if (pos == size)
+    return false;  // whitespace-only line
+  size_t first_token_start_pos = pos;
+  // first get first_token_.
+  while (pos < size && !isspace(line[pos])) {
+    if (line[pos] == '=') {
+      // If the first block of non-whitespace looks like "foo-bar=...",
+      // then we ignore it: there is no initial token, and FirstToken()
+      // is empty.
+      pos = first_token_start_pos;
+      break;
+    }
+    pos++;
+  }
+  first_token_ = std::string(line, first_token_start_pos, pos - first_token_start_pos);
+  // first_token_ is expected to be either empty or something like
+  // "component-node", which actually is a slightly more restrictive set of
+  // strings than IsValidName() checks for; this is a convenient way to check it.
+  if (!first_token_.empty() && !IsValidName(first_token_))
+    return false;
+
+  while (pos < size) {
+    if (isspace(line[pos])) {
+      pos++;
+      continue;
+    }
+
+    // OK, at this point we know that we are pointing at nonspace.
+    size_t next_equals_sign = line.find_first_of("=", pos);
+    if (next_equals_sign == pos || next_equals_sign == std::string::npos) {
+      // we're looking for something like 'key=value'.  If there is no equals sign,
+      // or it's not preceded by something, it's a parsing failure.
+      return false;
+    }
+    std::string key(line, pos, next_equals_sign - pos);
+    if (!IsValidName(key)) return false;
+
+    // handle any quotes.  we support key='blah blah' or key="foo bar".
+    // no escaping is supported.
+    if (line[next_equals_sign+1] == '\'' || line[next_equals_sign+1] == '"') {
+      char my_quote = line[next_equals_sign+1];
+      size_t next_quote = line.find_first_of(my_quote, next_equals_sign + 2);
+      if (next_quote == std::string::npos) {  // no matching quote was found.
+        KALDI_WARN << "No matching quote for " << my_quote << " in config line '"
+                   << line << "'";
+        return false;
+      } else {
+        std::string value(line, next_equals_sign + 2,
+                          next_quote - next_equals_sign - 2);
+        data_.insert(std::make_pair(key, std::make_pair(value, false)));
+        pos = next_quote + 1;
+        continue;
+      }
+    } else {
+      // we want to be able to parse something like "... input=Offset(a, -1) foo=bar":
+      // in general, config values with spaces in them, even without quoting.
+
+      size_t next_next_equals_sign = line.find_first_of("=", next_equals_sign + 1),
+          terminating_space = size;
+
+      if (next_next_equals_sign != std::string::npos) {  // found a later equals sign.
+        size_t preceding_space = line.find_last_of(" \t", next_next_equals_sign);
+        if (preceding_space != std::string::npos &&
+            preceding_space > next_equals_sign)
+          terminating_space = preceding_space;
+      }
+      while (terminating_space > 0 && isspace(line[terminating_space - 1]))
+        terminating_space--;
+
+      std::string value(line, next_equals_sign + 1,
+                        terminating_space - (next_equals_sign + 1));
+      data_.insert(std::make_pair(key, std::make_pair(value, false)));
+      pos = terminating_space;
+    }
+  }
+  return true;
+}
+
+bool ConfigLine::GetValue(const std::string &key, std::string *value) {
+  KALDI_ASSERT(value != NULL);
+  // data_ is a std::map keyed on the config key, so look the key up directly
+  // instead of scanning every entry.
+  std::map<std::string, std::pair<std::string, bool> >::iterator it =
+      data_.find(key);
+  if (it == data_.end())
+    return false;  // no such key on this config line.
+  *value = (it->second).first;
+  (it->second).second = true;  // mark the key-value pair as consumed.
+  return true;
+}
+
+bool ConfigLine::GetValue(const std::string &key, BaseFloat *value) {
+  KALDI_ASSERT(value != NULL);
+  // Direct std::map lookup; no need to scan every entry of data_.
+  std::map<std::string, std::pair<std::string, bool> >::iterator it =
+      data_.find(key);
+  if (it == data_.end())
+    return false;  // no such key on this config line.
+  // If the conversion fails, the pair is left marked as unused.
+  if (!ConvertStringToReal((it->second).first, value))
+    return false;
+  (it->second).second = true;  // mark the key-value pair as consumed.
+  return true;
+}
+
+bool ConfigLine::GetValue(const std::string &key, int32 *value) {
+  KALDI_ASSERT(value != NULL);
+  // Direct std::map lookup; no need to scan every entry of data_.
+  std::map<std::string, std::pair<std::string, bool> >::iterator it =
+      data_.find(key);
+  if (it == data_.end())
+    return false;  // no such key on this config line.
+  // If the conversion fails, the pair is left marked as unused.
+  if (!ConvertStringToInteger((it->second).first, value))
+    return false;
+  (it->second).second = true;  // mark the key-value pair as consumed.
+  return true;
+}
+
+bool ConfigLine::GetValue(const std::string &key, std::vector<int32> *value) {
+  KALDI_ASSERT(value != NULL);
+  value->clear();  // the output is emptied even if the key is missing.
+  // Direct std::map lookup; no need to scan every entry of data_.
+  std::map<std::string, std::pair<std::string, bool> >::iterator it =
+      data_.find(key);
+  if (it == data_.end())
+    return false;  // no such key on this config line.
+  // Elements may be separated by ':' or ','.
+  if (!SplitStringToIntegers((it->second).first, ":,", true, value)) {
+    // KALDI_WARN << "Bad option " << (it->second).first;
+    return false;
+  }
+  (it->second).second = true;  // mark the key-value pair as consumed.
+  return true;
+}
+
+bool ConfigLine::GetValue(const std::string &key, bool *value) {
+  KALDI_ASSERT(value != NULL);
+  // Direct std::map lookup; no need to scan every entry of data_.
+  std::map<std::string, std::pair<std::string, bool> >::iterator it =
+      data_.find(key);
+  if (it == data_.end())
+    return false;  // no such key on this config line.
+  if ((it->second).first.size() == 0) return false;
+  // Only the first character is inspected: t/T means true, f/F means false.
+  switch (((it->second).first)[0]) {
+    case 'F':
+    case 'f':
+      *value = false;
+      break;
+    case 'T':
+    case 't':
+      *value = true;
+      break;
+    default:
+      return false;  // not recognizable as a boolean; pair stays unused.
+  }
+  (it->second).second = true;  // mark the key-value pair as consumed.
+  return true;
+}
+
+bool ConfigLine::HasUnusedValues() const {
+  std::map<std::string, std::pair<std::string, bool> >::const_iterator it = data_.begin();
+  for (; it != data_.end(); ++it) {
+    if (!(it->second).second) return true;
+  }
+  return false;
+}
+
+std::string ConfigLine::UnusedValues() const {
+  std::string unused_str;
+  std::map<std::string, std::pair<std::string, bool> >::const_iterator it = data_.begin();
+  for (; it != data_.end(); ++it) {
+    if (!(it->second).second) {
+      if (unused_str == "")
+        unused_str = it->first + "=" + (it->second).first;
+      else
+        unused_str += " " + it->first + "=" + (it->second).first;
+    }
+  }
+  return unused_str;
+}
+
+// This is like ExpectToken but for two tokens, and it
+// will either accept token1 and then token2, or just token2.
+// This is useful in Read functions where the first token
+// may already have been consumed.
+void ExpectOneOrTwoTokens(std::istream &is, bool binary,
+                          const std::string &token1,
+                          const std::string &token2) {
+  KALDI_ASSERT(token1 != token2);
+  std::string temp;
+  ReadToken(is, binary, &temp);
+  if (temp == token1) {
+    ExpectToken(is, binary, token2);
+  } else {
+    if (temp != token2) {
+      KALDI_ERR << "Expecting token " << token1 << " or " << token2
+                << " but got " << temp;
+    }
+  }
+}
+
+
+bool IsValidName(const std::string &name) {
+  if (name.size() == 0) return false;
+  for (size_t i = 0; i < name.size(); i++) {
+    unsigned char c = name[i];  // <cctype> is undefined for negative chars.
+    if (i == 0 && !isalpha(c) && c != '_') return false;  // bad first char.
+    if (!isalnum(c) && c != '_' && c != '-' && c != '.')
+      return false;
+  }
+  return true;
+}
+
+void ReadConfigLines(std::istream &is,
+                    std::vector<std::string> *lines) {
+  KALDI_ASSERT(lines != NULL);
+  std::string line;
+  while (std::getline(is, line)) {
+    if (line.size() == 0) continue;
+    size_t start = line.find_first_not_of(" \t");
+    size_t end = line.find_first_of('#');
+    if (start == std::string::npos || start == end) continue;
+    end = line.find_last_not_of(" \t", end - 1);
+    KALDI_ASSERT(end >= start);
+    lines->push_back(line.substr(start, end - start + 1));
+  }
+}
+
+void ParseConfigLines(const std::vector<std::string> &lines,
+                      std::vector<ConfigLine> *config_lines) {
+  config_lines->resize(lines.size());
+  for (size_t i = 0; i < lines.size(); i++) {
+    bool ret = (*config_lines)[i].ParseLine(lines[i]);
+    if (!ret) {
+      KALDI_ERR << "Error parsing config line: " << lines[i];
+    }
+  }
+}
+
+
 }  // end namespace kaldi
diff --git a/src/util/text-utils.h b/src/util/text-utils.h
index 7bc20957672..02f4bf483fc 100644
--- a/src/util/text-utils.h
+++ b/src/util/text-utils.h
@@ -183,6 +183,98 @@ bool StringsApproxEqual(const std::string &a,
                         const std::string &b,
                         int32 decimal_places_check = 2);
 
+/**
+   This class is responsible for parsing input like
+    hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' baz="a b c d='a b' e"
+   and giving you access to the fields, in this case
+
+   FirstToken() == "hi-there", and key->value pairs:
+
+   xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123",
+   bing->"a b c", baz->"a b c d='a b' e"
+
+   The first token is optional; if the line started with a key-value pair then
+   FirstToken() will be empty.
+
+   Note: it can parse value fields with space inside them only if they are free of the '='
+   character.  If values are going to contain the '=' character, you need to quote them
+   with either single or double quotes.
+
+   Keys may contain -_.a-zA-Z0-9, but must begin with a-zA-Z_.
+ */
+class ConfigLine {
+ public:
+  // Tries to parse the line as a config-file line.  Returns false
+  // if it could not for some reason, e.g. parsing failure.  In most cases
+  // prints no warnings; the user should do this.  Does not expect comments.
+  bool ParseLine(const std::string &line);
+
+  // the GetValue functions are overloaded for various types.  They return true
+  // if the key exists with value that can be converted to that type, and false
+  // otherwise.  They also mark the key-value pair as having been read.  It is
+  // not an error to read values twice.
+  bool GetValue(const std::string &key, std::string *value);
+  bool GetValue(const std::string &key, BaseFloat *value);
+  bool GetValue(const std::string &key, int32 *value);
+  // Values may be separated by ":" or by ",".
+  bool GetValue(const std::string &key, std::vector<int32> *value);
+  bool GetValue(const std::string &key, bool *value);
+
+  bool HasUnusedValues() const;
+  /// returns e.g. foo=bar xxx=yyy if foo and xxx were not consumed by one
+  /// of the GetValue() functions.
+  std::string UnusedValues() const;
+
+  const std::string &FirstToken() const { return first_token_; }
+
+  const std::string &WholeLine() const { return whole_line_; }
+  // use default assignment operator and copy constructor.
+ private:
+  std::string whole_line_;
+  // the first token of the line, e.g. if line is
+  // foo-bar baz=bing
+  // then first_token_ would be "foo-bar".
+  std::string first_token_;
+
+  // data_ maps from key to (value, is-this-value-consumed?).
+  std::map<std::string, std::pair<std::string, bool> > data_;
+
+};
+
+/// This function is like ExpectToken but for two tokens, and it will either
+/// accept token1 and then token2, or just token2.  This is useful in Read
+/// functions where the first token may already have been consumed.
+void ExpectOneOrTwoTokens(std::istream &is, bool binary,
+                          const std::string &token1,
+                          const std::string &token2);
+
+
+/**
+   This function reads in a config file and *appends* its contents to a vector of
+   lines; it is responsible for removing comments (anything after '#') and
+   stripping out any lines that contain only whitespace after comment removal.
+ */
+void ReadConfigLines(std::istream &is,
+                     std::vector<std::string> *lines);
+
+
+/**
+   This function converts config-lines from a simple sequence of strings
+   as output by ReadConfigLines(), into a sequence of first-tokens and
+   name-value pairs.  The general format is:
+      "command-type bar=baz xx=yyy"
+   etc., although there are subtleties as to what exactly is allowed, see
+   documentation for class ConfigLine for details.
+   This function will die if there was a parsing failure.
+ */
+void ParseConfigLines(const std::vector<std::string> &lines,
+                      std::vector<ConfigLine> *config_lines);
+
+
+/// Returns true if 'name' would be a valid name for a component or node in a
+/// nnet3 Nnet.  This is a nonempty string beginning with A-Za-z_, and containing only
+/// '-', '_', '.', A-Z, a-z, or 0-9.
+bool IsValidName(const std::string &name);
 
 }  // namespace kaldi
 

From f93749a12b657ba0c8b0007fdf3061602c81eb1b Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sat, 16 Mar 2019 20:54:52 -0400
Subject: [PATCH 005/163] [src] More work on tensor library draft in kaldi10
 (#3124)

---
 src/tensor/tensor-functions.h |  32 +++
 src/tensor/tensor.h           | 362 +++++++++++++++++++++++++++++++---
 2 files changed, 365 insertions(+), 29 deletions(-)
 create mode 100644 src/tensor/tensor-functions.h

diff --git a/src/tensor/tensor-functions.h b/src/tensor/tensor-functions.h
new file mode 100644
index 00000000000..d68d6b545da
--- /dev/null
+++ b/src/tensor/tensor-functions.h
@@ -0,0 +1,32 @@
+#include "tensor/tensor.h"
+
+
+namespace kaldi {
+namespace tensor {
+
+// This file contains functions that operate on Tensors in various ways.  To
+// avoid class Tensor blowing up hugely, we implement these things outside
+// class Tensor.
+
+
+// Note: we use the distinction between references and pointers the same way as
+// you might expect from Google-style-guide code, to reflect which Tensors'
+// contents are changed (so a pointer argument might have its contents changed).
+// But these are in most cases pointers to const Tensors; they can be
+// const because the metadata is not changed, even if the data is.
+
+
+// Sets all elements of the tensor to zero.
+void SetZero(const Tensor *tensor);
+
+// Sets all elements of the tensor to value f (cast to whatever type this
+// Tensor has).  NOTE(review): 'SetZero' looks like a copy-paste name; confirm.
+void SetZero(float f, const Tensor *tensor);
+
+
+// Return a transposed version of this Tensor that shares the underlying memory.
+Tensor Transpose(const Tensor &tensor, int64_t axis1 = 0, int64_t axis2 = 1);
+
+
+}
+}
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index e94b6978a6b..92bf95c0f90 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -6,6 +6,28 @@ namespace kaldi {
 namespace tensor {
 
 
+// Similar to llvm/PyTorch's ArrayRef, this is a lightweight way to store an
+// array (zero or more elements of type T).  The array is not owned here; it
+// will generally be unsafe to use an ArrayRef as other than a local variable.
+template <typename T>
+struct ArrayRef {
+  // Note: the element type is currently hard-coded to int64_t; T is unused.
+  uint64_t size;
+  int64_t *data;
+
+  // Lots to do here.
+
+  inline int64_t operator[] (uint64_t i) const {
+    KALDI_ASSERT(i < size);
+    return data[i];
+  }
+
+  // cast to std::vector; for cases where you might need to
+  // change the contents or keep it more permanently.
+  operator std::vector<int64_t> () const;
+};
+
+
 enum {
   kCpuDevice = 0,
   kCudaDevice = 1
@@ -39,15 +61,145 @@ struct Storage {
 };
 
 
-enum {
+enum DataType {
+  // We will of course later extend this with many more types, including
+  // integer types and half-precision floats.
   kFloatDtype = 0,
   kDoubleDtype = 1
-} DataType;
+};
+
+enum StridePolicy {
+  kCstrides = 0,
+  kCopyStridesIfContiguous = 1
+};
 
 #define KALDI_TENSOR_MAX_DIM 5
 
 
 
+// This enum with one value is a trick to allow you to
+// emulate indexing schemes like, say, A[10:].
+// In C++ you'd do A(all,10).
+enum RangeEnum { all };
+
+/**
+   struct Range represents an integer or a range of integers (e.g. as used in
+   indexing).  It emulates Python's range().
+
+   There are various possibilities of what Range can contain, enumerated below.
+   Be careful: we use {a,b,c} to denote the actual class members, not the
+   arguments to constructors, which mimic the arguments of expressions with colons
+   in Python's indexing with ':'
+
+   For purposes of explanation I will assume we are indexing a 1-dimensional
+   array a, but this struct is also used for multi-dimensional indexing.
+
+   Examples are below (showing members {begin,end,step}, where inf means
+   std::numeric_limits<int64>::max()):
+
+
+   Literal contents     Python equivalent,     How obtained             Elements of array
+   of Range struct      indexing array a     using constructors           you would get
+
+    {0,inf,1}          a[:], a[0:]          Range(all), Range(0,all)    all of them
+
+    {0,10,2}           a[:10:2], a[0:10:2]   Range(0,10,2)             [0,2,4,6,8]
+
+    {0,-1,1}           a[:-1], a[0:-1]       Range(0,-1)                all but the last
+
+    {10,2,-1}          a[10:2:-1]           Range(10,2,-1)              [10,9,...3]
+
+    {inf,inf,-1}       a[::-1]             Range(all,all,-1)            all, reversed order
+
+    {-3,-2,1}          a[-3:-2]            Range(-3,-2)             third-from-last element only
+
+    {10,0,inf}         a[10]              10 (implicit; RangeExt constructor)    the 10th element, removing axis
+
+
+*/
+struct Range {
+  int64_t begin;
+  int64_t end;
+  int64_t step;
+
+  static inline int64_t inf() { return std::numeric_limits<int64_t>::max(); }
+
+  // The default constructor leaves the range undefined.
+  Range() { }
+
+  Range(RangeEnum): begin(0), end(inf()), step(1) { }
+
+  explicit Range(int64_t end): begin(0), end(end), step(1) { }
+
+  Range(int64_t begin, int64_t end, int64_t step = 1):
+      begin(begin), end(end), step(step) { }  // store the caller's step.
+
+  Range(int64_t begin, RangeEnum, int64_t step = 1):
+      begin(begin), end(inf()), step(step) { }
+
+  Range(RangeEnum, int64_t end, int64_t step = 1):
+      begin(inf()), end(end), step(step) { }  // inf() must be called.
+
+  Range(RangeEnum, RangeEnum, int64_t step = 1):
+      begin(inf()), end(inf()), step(step) { }
+};
+
+/**
+  struct RangeExt is used in situations, such as indexing, where what we have
+  might be a Range (like, in numpy, indexing with something that has a colon);
+  or it might simply be an integer.  There are no new members.  The reason we
+  don't just make this an additional constructor of Range is that we want it
+  so if someone does Range(10) it is interpreted as 0 through 9, but if
+  you do just 10 it means the index 10.  You can't have an explicit and
+  implicit constructor taking the same type: hence this child class.
+
+  Note that numpy's A[1] is not the same as A[1:2] because the former returns a
+  tensor with one fewer axes.
+*/
+struct RangeExt: public Range {
+  RangeExt(Range r): Range(r) { }
+
+  // implicit
+  RangeExt(int64_t index):
+      Range(index, 0, inf()) { }
+};
+
+
+enum IndexingType{
+  kIndexingTypeRange,
+  kIndexingTypeNumber,
+  kIndexingTypeTensor
+};
+
+// This struct is used when indexing with mixed types. For
+// example:
+// Tensor a(...), b(...);
+// Tensor c = a(1, b, Range(0,-1), Range(all));
+
+struct IndexingArg {
+  IndexingArg(const Tensor &tensor);
+  IndexingArg(int64_t index);
+  IndexingArg(Range range);
+
+  IndexingType itype;
+  int64_t index;
+  std::shared_ptr<Tensor> tensor {nullptr};
+  Range range;
+};
+
+/**
+   This function, used in indexing operations, takes a Range that may have, say,
+   negative 'end' or end equal to Range::inf(), and turns it into actual
+   numbers with begin and end both in the range [0,dim].  So, for instance, if
+   the range had `end = -1`, it would be turned into `dim - 1`; or if `end` was
+   Range::inf(), it would be interpreted as `dim`.
+
+   Raises an exception if the resulting range is empty.
+ */
+void MakeRangeExplicit(Range *range, int64_t dim);
+
+
+
 /*
   This struct stores the dimension and strides of a Tensor.  The following
   describes the properties that a Tensor will always have (note: we
@@ -55,62 +207,204 @@ enum {
   properties do not all hold).
 
   These properties are stricter than some other frameworks, such as PyTorch,
-  which allow the users to manually add dimensions with stride 0 (and dim>1) so
+  which allow the users to manually add dimensions with stride 0 and dim > 1 so
   that a lower-dimensional quantity can masquerade as one with a higher
-  dimension.  We require that it never be possible to access the same
-  memory location using two different tuples of indexes.  We also
-  don't allow zero dims (i.e. a tensor must not be empty); if you want an
-  empty Tensor, just use a null pointer.
+  dimension.  We require that it never be possible to access the same memory
+  location using two different tuples of indexes.  We also don't allow zero dims
+  (i.e. a tensor must not be empty); if you want an empty Tensor, just use a
+  null pointer.  In addition, require that the stride equal zero for any
+  axis that has dim = 1.
+
+  Our requirements on a TensorDim are:
 
     0 <= num_axes <= 5
     for 0 <= axis < num_axes:
        dims[i] > 0
-
-  The strides may take any value, including zero or negative, as long as the
-  uniqueness property is satisfied (i.e. must not be possible to access the
-  same memory location using two different tuples of indices.
-
+       if dims[i] = 1, then strides[i] = 0.
+       if dims[i] != 1, then strides[i] != 0
+    ... plus the uniqueness property.
+
+  The uniqueness property means that we must not be able to access
+  the same memory location via two different tuples of indexes).
+  Recause testing this property exactly would be difficult in general
+  without bringing in number theory, we test a slightly stronger version
+  of it that covers all cases we are likely to encounter.
 */
 
 struct TensorDim {
-
-  int64_t num_axes;
+  uint64_t num_axes;
   int64_t dims[KALDI_TENSOR_MAX_DIM];
   int64_t strides[KALDI_TENSOR_MAX_DIM];
   // We may later add methods to this.
 
   // Checks that the TensorDim is valid, assuming it is part of a Tensor.
-  // I.e. that it satifies the properties mentioned above.
+  // I.e. that it satifies all the properties mentioned above.
   bool Check();
 };
 
 struct TensorDimProperties {
-  // Below are cached properties that depend on a TensorDim.
+  // Below are cached properties that are derived from the underlying data in
+  // struct TensorDim.
 
   // The number of elements in the Tensor, which equals the product
-  // of dims[0] .. dims[num_axes - 1].  Must always be >0.
+  // of dims[0] .. dims[num_axes - 1].  Must be >0.
   int64_t num_elements;
 
   // is_contiguous means that the data form a contiguous block in memory; it is
-  // not the same as PyTorch's is_contiguous which is a stronger condition; our
-  // has_expected_strides is equivalent to that.
+  // not the same as PyTorch's is_contiguous which is a stronger condition,
+  // implying also that the strides are as for a `C-style` array.
   bool is_contiguous;
 
-  // has_expected_strides means that the strides are as if this was a "c"-style
+  // has_c_strides means that the strides are as if this was a "c"-style
   // multidimensional array, meaning that (using Python wrap-around indexing
   // conventions as if strides was an array of dimension 'num_axes'),
   // strides[-1] == 1, strides[-1] == dims[-1], strides[-2] = dims[-1] *
   // dims[-1], and so on.  This is the same as PyTorch's is_contiguous.
-  bool has_expected_strides;
+  bool has_c_strides;
 
   void UpdateProperties(const TensorDim &dim);
 };
 
 
-
+/**
+   A Tensor is a multi-dimensional array (up to 5 dimensions) of types such as
+   float or double (and eventually ints).  Multiple Tensors may point to data
+   allocated from the same Storage.  Class Tensor contains enough elements that
+   it makes sense most of the time to pass it around by reference (Tensor&) or
+   by pointer (e.g. Tensor* or std::shared_pointer<Tensor>).  This is unlike
+   in PyTorch where there is a separate TensorImpl class and Tensor really just
+   contains a pointer to it.
+
+   Most of the operations that you would do on a Tensor (like addition,
+   multiplication and so on) are declared out-of-line in tensor-functions.h.
+ */
 class Tensor {
  public:
-  //  ...
+  /// Return the number of axes (a number in {0,1,2,3,4}).  In mathematical
+  // contexts, this is sometimes known as the rank of the tensor, or sometimes
+  // even its dimension, but these terms are ambiguous so we avoid them, and use
+  // the terms 'number of axes' or 'axis' throughout.
+  inline int64_t NumAxes() const { return dim_.num_axes; }
+
+  // Return reference to the struct containing the dimension and
+  // stride info.
+  const TensorDim &DimAndStrides() const { return dim_; }
+
+  // Return an array containing dimensions of the tensor; equivalent to
+  // .shape in PyTorch.  Dims().size() will equal NumAxes().
+  inline ArrayRef<int64_t> Dims() const { return ArrayRef{dim_.num_axes, dim_.dims}; }
+
+  // Returns the dimension on this axis, a number >= 1.  Result is
+  // undefined if axis < 0 or axis >= NumAxes().
+  inline int64_t Dim(int64_t axis) const { return dim_.dims[axis]; }
+
+  // Returns an array containing the strides of the tensor.
+  // Strides().size() will equal NumAxes().
+  inline ArrayRef<int64_t> Strides() const { return ArrayRef{dim_.num_axes, dim_.strides}; }
+
+  // Returns the stride on this axis.  Will be zero if the corresponding
+  // dimension is 1, and otherwise nonzero (but not necessarily positive).
+  inline int64_t Stride(int64_t axis) const { return dim_.strides[axis]; }
+
+  // Returns the number of elements in the Tensor; must be > 0.
+  inline int64_t NumElements() const { return derived_.num_elements; }
+
+  // Returns true if the data forms a contiguous block in memory.
+  // (not the same as 'contiguous()' in PyTorch, which also requires
+  // that the strides be 'C'-style.
+  inline bool IsContiguous() const { return derived_.is_contiguous; }
+
+
+  // Returns true if the strides for this array are what you would
+  // expect if you were to construct a Tensor from this->Dims();
+  // this means "C"-style strides, except that any axis with dimension=1
+  // has its stride set to zero.  This is our equivalent of PyTorch's
+  // contiguous().
+  inline bool HasNormalStrides() const { return derived_.has_c_strides; }
+
+  // Return the data type.
+  DataType Dtype() const { return dtype_; }
+
+  // Indexing operators.  All of these return Tensors which reference the same
+  // underlying data as the original Tensor.  We could have done this with just
+  // a single indexing operator taking 5 args of type RangeExt defaulting to
+  // `all`, but we provide separate versions for each num-args for efficiency.
+  // You can provide an int64_t where RangeExt is expected; it will be
+  // converted to a special struct of type Range. See the documentation for type
+  // Range, and the table which it contains.  If a is a Tensor with 1 axis, a(0)
+  // will return a scalar Tensor (0 axes
+  //
+  // Any of these indexing operators can operate on Tensors with more axes;
+  // trailing axes will be left alone.
+
+  // this operator () taking int64_t is only provided in the one-arg case as a
+  // convenience; in any case, RangeExt can be constructed from int64_t with the
+  // same effect.
+  Tensor operator () (int64_t i0) const;
+  Tensor operator () (RangeExt s0) const;
+  Tensor operator () (RangeExt s0, RangeExt s1) const;
+  Tensor operator () (RangeExt s0, RangeExt s1, RangeExt s2) const;
+  Tensor operator () (RangeExt s0, RangeExt s1, RangeExt s2,
+                      RangeExt s3) const;
+  // A particularly complicated example showing what is possible:
+  // Tensor a(...);
+  // Tensor b = a(all,10,Range(0,5),Range(all,all,-1),all)
+  Tensor operator () (RangeExt s0, RangeExt s1, RangeExt s2,
+                      RangeExt s3, RangeExt s4) const;
+
+
+  // For a scalar Tensor (NumAxes() == 0) returns the item, cast to
+  // float (if it was not already float); throws if NumAxes() > 0.
+  explicit operator float() const;
+  // For a scalar Tensor (NumAxes() == 0) returns the item, cast to
+  // double (if it was not already double); throws if NumAxes() > 0.
+  explicit operator double() const;
+  // For a scalar Tensor (NumAxes() == 0) returns the item, cast to
+  // int64_t (if it was not already int64_t); throws if NumAxes() > 0.
+  explicit operator int64_t() const;
+
+
+  // For a Tensor storing floats, returns the data pointer cast to float;
+  // otherwise, throws.  (note: this is const only as it doesn't change the
+  // Tensor meta-info, but you could change the data using the pointer).
+  explicit operator float* () const;
+  // For a Tensor storing doubles, returns the data pointer cast to float;
+  // otherwise, throws.  (note: this is const only as it doesn't change the
+  // Tensor meta-info, but you could change the data using the pointer).
+  explicit operator double* () const;
+
+
+
+  // Assignment operation which sets all elements to a constant.  Valid
+  // for Tensors of any floating point type.
+  const Tensor & operator = (float f);
+
+  // Transpose the two axes by swapping their dims and strides without changing
+  // the underlying data in memory.  This modifies *this;
+  void Transpose(int64_t axis1 = 0, int64_t axis2 = 1);
+
+
+  // Copy constructor that copies the metadata while sharing the underlying
+  // data.
+  Tensor (const Tensor &other) = default;
+
+  // Move assignment.  Does not copy the data.
+  Tensor(Tensor &&other);
+
+  // Copy data from tensor 'other'.  Requires this Dims() and other.Dims()
+  // be compatible, meaning that they are the same, except it's OK for
+  // a dim of 'other' to be 1 and a dim of *this to be >1 (we will
+  // broadcast, i.e. copy).
+  void CopyData(const Tensor &other);
+
+  // Construct a Tensor with the supplied dimensions; and if set_zero is true,
+  // zero it.
+  Tensor(ArrayRef<int64_t> dims, bool set_zero = false);
+
+  // Construct a Tensor with
+  Tensor(TensorDim &dim, StridePolicy policy, bool set_zero = false);
+
+
 
  private:
   // The tensor dim and strides.
@@ -120,16 +414,18 @@ class Tensor {
   // The data-type of this tensor.
   DataType dtype_;
 
-  // The raw data pointer
+  // The raw data pointer.  Will be cast to a pointer of the appropriate
+  // type before indexing.
   void *data_;
 
   // The storage region where the data resides.  data_ does not necessarily
   // equal storage_->data; it may be more than that, e.g. if this is a view
   // to part of another Tensor.
   std::shared_ptr<Storage> storage_;
+};
+
 
 
-};
 
 /*
   This is the 'gradient information' that class Variable stores for a Tensor
@@ -166,15 +462,23 @@ struct TensorGrad {
   int64_t offset;
 
   // This stores the gradient (if we already have one), or nullptr if not.
-  std::unique_ptr<Tensor> grad{nullptr};
-
+  std::unique_ptr<Variable> grad{nullptr};
 
 };
 
 
+/**
+   class Variable is the same as class Tensor but augmented with autograd
+   machinery.  The overall design is quite similar to PyTorch, and the C++
+   is similar to flashlight.  If you are only familiar with PyTorch's
+   python frontend, class Variable is equivalent to af.tensor.
+ */
 class Variable {
-    using GradFunc = std::function<
-      void(std::vector<Variable>& inputs, const Variable& grad_output)>;
+  using GradFunc = std::function<
+    void(std::vector<Variable>& inputs, Variable &grad_output)>;
+  using GradHook = std::function<void(Variable *grad)>;
+
+
 
 
 };

From 21a3913fc7a9b96dd31d544fa1458fadfaf6d941 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sun, 17 Mar 2019 19:44:11 -0400
Subject: [PATCH 006/163] Kaldi10 (#3131)

* [src] More work on tensor draft

* [src] Add source file
---
 src/tensor/array-ref.h        |  36 ++++
 src/tensor/storage.h          |  36 ++++
 src/tensor/tensor-common.h    |  54 +++++
 src/tensor/tensor-functions.h |  31 +++
 src/tensor/tensor-pattern.cc  |  18 ++
 src/tensor/tensor-pattern.h   | 258 +++++++++++++++++++++++
 src/tensor/tensor.h           | 387 +++++++++-------------------------
 7 files changed, 529 insertions(+), 291 deletions(-)
 create mode 100644 src/tensor/array-ref.h
 create mode 100644 src/tensor/storage.h
 create mode 100644 src/tensor/tensor-common.h
 create mode 100644 src/tensor/tensor-pattern.cc
 create mode 100644 src/tensor/tensor-pattern.h

diff --git a/src/tensor/array-ref.h b/src/tensor/array-ref.h
new file mode 100644
index 00000000000..fee280e5b1a
--- /dev/null
+++ b/src/tensor/array-ref.h
@@ -0,0 +1,36 @@
+/**
+   This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
+*/
+
+namespace kaldi {
+namespace tensor {
+
+
+// Similar to llvm/PyTorch's ArrayRef, this is a lightweight way to store an
+// array (zero or more elements of type T).  The array is not owned here; it
+// will generally be unsafe to use an ArrayRef as other than a local variable.
+//
+// ArrayRef has only two members and it will probably make sense to pass it by
+// value most of the time.
+template <typename T>
+struct ArrayRef {
+  // Note: the order (size, then data) matters for brace-initialization.
+  uint64_t size;
+  const T *data;
+
+  // Lots to do here.
+
+  inline T operator [] (uint64_t i) const {
+    KALDI_ASSERT(i < size);
+    return data[i];
+  }
+
+  // cast to std::vector; for cases where you might need to
+  // change the contents or keep it more permanently.
+  operator std::vector<T> () const;
+};
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/storage.h b/src/tensor/storage.h
new file mode 100644
index 00000000000..cd11fac022c
--- /dev/null
+++ b/src/tensor/storage.h
@@ -0,0 +1,36 @@
+#include "tensor/tensor-common.h"
+
+/**
+   This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
+*/
+
+namespace kaldi {
+namespace tensor {
+
+
+
+
+// 'Storage' contains a single allocated region (on CPU or GPU, according
+// to 'device').
+struct Storage {
+  void *data;
+  size_t num_bytes;
+  Device device;
+
+  // Note: will throw if allocation fails (for now).
+  Storage(Device device, size_t num_bytes);
+
+  // Destructor deallocates 'data'.  For now there is no concept of a
+  // custom allocator or an allocator object (in future we might choose
+  // to add that); we just use our CuDevice stuff for cuda allocation
+  // and posix_memalign for CPU allocation (obviously we need to make
+  // sure 'data' is aligned in the most specific way we might
+  // need).
+  ~Storage();
+};
+
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/tensor-common.h b/src/tensor/tensor-common.h
new file mode 100644
index 00000000000..aa59367c518
--- /dev/null
+++ b/src/tensor/tensor-common.h
@@ -0,0 +1,54 @@
+/**
+   This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
+*/
+
+namespace kaldi {
+namespace tensor {
+
+
+
+enum DeviceType {
+  kCpuDevice = 0,
+  kCudaDevice = 1
+};
+
+
+// We may later add a device number (like which GPU we are using),
+// once we support multiple GPUs.
+struct Device {
+  DeviceType device_type;
+  // operator ==, probably, maybe constructors.
+};
+
+
+enum DataType {
+  // We will of course later extend this with many more types, including
+  // integer types and half-precision floats.
+  kFloatDtype = 0,
+  kDoubleDtype = 1
+};
+
+
+
+/// Enumeration that says what strides we should choose when allocating
+/// a Tensor.
+enum StridePolicy {
+  kCopyStrides,  // means: copy the strides from the source Tensor, preserving
+                 //  their signs and relative ordering (but filling in gaps if
+                 //  the source Tensor's data was not contiguous).
+  kCstrides      // means: strides for dimensions that are != 1 are ordered from
+                 // greatest to smallest as in a "C" array.  Per our policy,
+                 // any dimension that is 1 will have a zero stride.
+};
+
+/// Enumeration that says whether to zero a freshly initialized Tensor.
+enum InitializePolicy {
+  kZeroData,
+  kUninitialized
+};
+
+
+#define KALDI_TENSOR_MAX_DIM 5
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/tensor-functions.h b/src/tensor/tensor-functions.h
index d68d6b545da..bbab4c58a4b 100644
--- a/src/tensor/tensor-functions.h
+++ b/src/tensor/tensor-functions.h
@@ -27,6 +27,37 @@ void SetZero(float f, const Tensor *tensor);
 // Return a transposed version of this Tensor that shares the underlying memory.
 Tensor Transpose(const Tensor &tensor, int64_t axis1 = 0, int64_t axis2 = 1);
 
+/**
+   Copy the data from tensor 'src' to tensor 'dest'.  Does not change the tensor
+   metadata, but does change the data underlying the Tensor 'dest'.
+
+   Requires that src.Dims() and dest->Dims() be compatible, meaning that they are
+   the same, except it's OK for a dim of 'src' to be 1 and a dim of 'dest' to be
+   >1; in such cases, we will broadcast the element from 'src' across the larger
+   dimension of 'dest'.
+
+   Does not require that the Dtype() or Device() of src and dest be the
+   same.
+*/
+void CopyData(const Tensor &src, const Tensor *dest);
+
+/**
+   Construct, if possible, a Tensor that is a view into 'src'.  Read this
+   carefully, as the semantics may differ from the 'view' functions in some
+   other toolkits.  'View' does not care how the underlying data of 'src'
+   is organized.  Its semantics can be explained as follows.
+
+   Suppose `src` were a "C"-style array with dimension given by `src.Dims()`.
+   Then reinterpret that array as one with dimension `dims`, if possible, and
+   return a Tensor describing that array.
+
+   TODO: specify the result when no such view exists (presumably a null
+   pointer, given the shared_ptr return type; confirm once implemented).
+     @param [in] src   The Tensor whose data the view would reference.
+     @param [in] dims  The dimensions requested for the view.
+ */
+std::shared_ptr<Tensor> View(const Tensor &src, ArrayRef<int64_t> dims);
+
 
 }
 }
diff --git a/src/tensor/tensor-pattern.cc b/src/tensor/tensor-pattern.cc
new file mode 100644
index 00000000000..933fb5f3ccb
--- /dev/null
+++ b/src/tensor/tensor-pattern.cc
@@ -0,0 +1,18 @@
+#include "tensor/tensor-pattern.h"
+
+/**
+   This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
+*/
+
+namespace kaldi {
+namespace tensor {
+
+
+
+
+
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/tensor-pattern.h b/src/tensor/tensor-pattern.h
new file mode 100644
index 00000000000..838a6affd65
--- /dev/null
+++ b/src/tensor/tensor-pattern.h
@@ -0,0 +1,258 @@
+#include "tensor/tensor-common.h"
+
+/**
+   This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
+*/
+
+namespace kaldi {
+namespace tensor {
+
+
+// This enum with one value is a trick to allow you to
+// emulate indexing schemes like, say, A[10:].
+// In C++ you'd do A(Range(10,all)).
+enum RangeEnum { all };
+
+/**
+   struct Range represents an integer or a range of integers (e.g. as used in
+   indexing).  It emulates Python's range().
+
+   There are various possibilities of what Range can contain, enumerated below.
+   Be careful: we use {a,b,c} to denote the actual class members, not the
+   arguments to constructors, which mimic the arguments of expressions with colons
+   in Python's indexing with ':'
+
+   For purposes of explanation I will assume we are indexing a 1-dimensional
+   array a, but this struct is also used for multi-dimensional indexing.
+
+   Examples are below (showing members {begin,end,step}, where inf means
+   std::numeric_limits<int64_t>::max()):
+
+
+   Literal contents     Python equivalent,     How obtained             Elements of array
+   of Range struct      indexing array a     using constructors           you would get
+
+    {0,inf,1}          a[:], a[0:]          Range(all), Range(0,all)    all of them
+
+    {0,10,2}           a[:10:2], a[0:10:2]   Range(0,10,2)             [0,2,4,6,8]
+
+    {0,-1,1}           a[:-1], a[0:-1]       Range(0,-1)                all but the last
+
+    {10,2,-1}          a[10:2:-1]           Range(10,2,-1)              [10,9,...3]
+
+    {inf,inf,-1}        a[::-1]             Range(all,all,-1)            all, reversed order
+
+    {-3,-2,1}          a[-3:-2]            Range(-3,-2)             third-from-last element only
+
+    {10,0,inf}         a[10]              10 (implicit; RangeExt constructor)    the 10th element, removing axis
+
+
+*/
+struct Range {
+  int64_t begin;
+  int64_t end;
+  int64_t step;
+
+  static inline int64_t inf() { return std::numeric_limits<int64_t>::max(); }
+
+  // The default constructor leaves the range undefined.
+  Range() { }
+
+  Range(RangeEnum): begin(0), end(inf()), step(1) { }
+
+  explicit Range(int64_t end): begin(0), end(end), step(1) { }
+
+  Range(int64_t begin, int64_t end, int64_t step = 1):
+      begin(begin), end(end), step(step) { }
+
+  Range(int64_t begin, RangeEnum, int64_t step = 1):
+      begin(begin), end(inf()), step(step) { }
+
+  Range(RangeEnum, int64_t end, int64_t step = 1):
+      begin(inf()), end(end), step(step) { }
+
+  Range(RangeEnum, RangeEnum, int64_t step = 1):
+      begin(inf()), end(inf()), step(step) { }
+};
+
+/**
+  struct RangeExt is used in situations, such as indexing, where what we have
+  might be a Range (like, in numpy, indexing with something that has a colon);
+  or it might simply be an integer.  There are no new members.  The reason we
+  don't just make this an additional constructor of Range is that we want it
+  so if someone does Range(10) it is interpreted as 0 through 9, but if
+  you do just 10 it means the index 10.  You can't have an explicit and
+  implicit constructor taking the same type: hence this child class.
+
+  Note that numpy's A[1] is not the same as A[1:2] because the former returns a
+  tensor with one fewer axes.
+*/
+struct RangeExt: public Range {
+  RangeExt(Range r): Range(r) { }
+
+  // implicit on purpose: a bare integer means "this index", {index,0,inf}.
+  RangeExt(int64_t index):
+      Range(index, 0, inf()) { }
+};
+
+/**
+enum IndexingType{
+  kIndexingTypeRange,
+  kIndexingTypeNumber,
+  kIndexingTypeTensor
+};
+
+// This struct is used when indexing with mixed types. For
+// example:
+// Tensor a(...), b(...);
+// Tensor c = a(1, b, Range(0,-1), Range(all));
+
+struct IndexingArg {
+  IndexingArg(const Tensor &tensor);
+  IndexingArg(int64_t index);
+  IndexingArg(Range range);
+
+  IndexingType itype;
+  int64_t index;
+  std::shared_ptr<Tensor> tensor {nullptr};
+  Range range;
+  };*/
+
+/**
+   This function, used in indexing operations, takes a Range that may have, say,
+   negative 'end' or end equal to Range::inf(), and turns it into actual
+   numbers with begin and end both in the range [0,dim].  So, for instance, if
+   the range had `end = -1`, it would be turned into `dim - 1`; or if `end` was
+   Range::inf(), it would be interpreted as `dim`.
+
+   Raises an exception if the resulting range is empty.
+ */
+void MakeRangeExplicit(Range *range, int64_t dim);
+
+
+
+/*
+  This struct stores the dimension and strides of a Tensor.  The following
+  describes the properties that a TensorPattern will always have.
+
+  These properties are stricter than some other frameworks, such as PyTorch,
+  which allow the users to manually add dimensions with stride 0 and dim > 1 so
+  that a lower-dimensional quantity can masquerade as one with a higher
+  dimension.  We require that it never be possible to access the same memory
+  location using two different tuples of indexes.  We also don't allow zero dims
+  (i.e. a tensor must not be empty); if you want an empty Tensor, just use a
+  null pointer.  In addition, we require that the stride equal zero for any
+  axis that has dim = 1.
+
+  Our requirements on a TensorPattern are:
+
+    0 <= num_axes <= 5
+    for 0 <= i < num_axes:
+       dims[i] > 0
+       if dims[i] = 1, then strides[i] = 0.
+       if dims[i] != 1, then strides[i] != 0
+    ... plus the uniqueness property.
+
+  The uniqueness property means that we must not be able to access
+  the same memory location via two different tuples of indexes.
+  Because testing this property exactly would be difficult in general
+  without bringing in number theory, we test a slightly stronger version
+  of it that covers all cases we are likely to encounter.
+*/
+struct TensorPattern {
+  int64_t num_axes;
+  int64_t dims[KALDI_TENSOR_MAX_DIM];
+  int64_t strides[KALDI_TENSOR_MAX_DIM];
+  // We may later add methods to this.
+
+  // Checks that the TensorPattern is valid, assuming it is part of a Tensor.
+  // I.e. that it satisfies all the properties mentioned above.
+  bool Check();
+};
+
+struct TensorPatternProperties {
+  // Below are cached properties that are derived from the underlying data in
+  // struct TensorPattern.
+
+  // The number of elements in the Tensor, which equals the product
+  // of dims[0] .. dims[num_axes - 1].  Will always be >0.
+  int64_t num_elements;
+
+  // is_contiguous means that the data form a contiguous block in memory; it is
+  // not the same as PyTorch's is_contiguous which is a stronger condition,
+  // implying also that the strides are as for a `C-style` array.
+  bool is_contiguous;
+
+  // has_c_strides means that the strides are as if this was a "c"-style
+  // multidimensional array, meaning that (using Python wrap-around indexing
+  // conventions as if strides was an array of dimension 'num_axes'),
+  // strides[-1] == 1, strides[-2] == dims[-1], strides[-3] == dims[-1] *
+  // dims[-2], and so on.  This is the same as PyTorch's is_contiguous.
+  bool has_c_strides;
+
+  void UpdateProperties(const TensorPattern &pattern);
+};
+
+
+/**
+   Compresses a TensorPattern by removing or combining as many dimensions
+   as possible.  This version is suitable for 'flat' operations that do
+   not rely on any kind of structure, such as zeroing or nonlinearities.
+
+      @param [in]   src   The pattern to be compressed
+      @param [in]  src_properties  Properties of 'src'; required to
+                          be accurate (behavior is undefined otherwise,
+                          e.g. if you provide some other pattern's properties).
+      @param [out] dest   A simplified-as-much-as-possible pattern that
+                          covers the same set of elements as 'src' (when
+                          combined with the offset below).  'dest' will
+                          be free of negative strides.
+      @param [out] data_offset  A number that we would have to add to
+                          the data pointer of the source Tensor so
+                          that 'dest' would cover the same set of
+                          elements.  It will always be zero if 'src'
+                          was free of negative strides.
+
+   Examples are below, where we write a TensorPattern as
+      {{dim1,dim2,..}, {stride1,stride2,..}}
+
+   Input pattern             Output pattern            Output offset
+     {{10},{1}}               {{10},{1}}                  0
+    {{3,4},{4,1}}             {{12},{1}}                  0
+    {{9},{-1}}                {{9},{1}}                  -8
+  {{2,3,4},{100,4,1}}       {{2,12},{100,1}}              0
+
+
+ */
+void CompressPatternFlat(const TensorPattern &src,
+                         const TensorPatternProperties &src_properties,
+                         TensorPattern *dest,
+                         int64_t *data_offset);
+
+/**
+   TODO: document.  Note: unlike CompressPatternFlat(), outputs no data offset.
+ */
+void CompressPattern(const TensorPattern &src,
+                     const TensorPatternProperties &src_properties,
+                     TensorPattern *dest);
+
+
+
+
+/**
+   TODO: document.  Presumably the pattern-level counterpart of View():
+   returns false if `dims` is not a possible view of pattern_in -- confirm.
+ */
+bool CreateViewPattern(const TensorPattern &pattern_in,
+                       const TensorPatternProperties &properties_in,
+                       ArrayRef<int64_t> dims,
+                       TensorPattern *pattern_out,
+                       TensorPatternProperties *properties_out);
+
+
+
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index 92bf95c0f90..220af7bb045 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -5,267 +5,6 @@
 namespace kaldi {
 namespace tensor {
 
-
-// Similar to llvm/PyTorch's ArrayRef, this is a lightweight way to store an
-// array (zero or more elements of type T).  The array is not owned here; it
-// will generally be unsafe to use an ArrayRef as other than a local variable.
-template <typename T>
-struct ArrayRef {
-  // Note:
-  uint64_t size;
-  int64_t *data;
-
-  // Lots to do here.
-
-  inline int64_t operator (uint64_t i) const {
-    KALDI_ASSERT(i < size);
-    return data[i];
-  }
-
-  // cast to std::vector; for cases where you might need to
-  // change the contents or keep it more permanently.
-  operator std::vector<int64_t> () const;
-};
-
-
-enum {
-  kCpuDevice = 0,
-  kCudaDevice = 1
-} DeviceType;
-
-// We may later add a device number (like which GPU we are using),
-// once we support multiple GPUs.
-struct Device {
-  DeviceType device_type;
-  // operator ==, probably, maybe constructors.
-};
-
-
-// 'Storage' contains a single allocated region (on CPU or GPU, according
-// to 'device').
-struct Storage {
-  void *data;
-  size_t num_bytes;
-  Device device;
-
-  // Note: will throw if allocation fails (for now).
-  Storage(Device device, size_t num_bytes);
-
-  // Destructor deallocates 'data'.  For now there is no
-  // concept of a custom allocator or an allocator object, we just use our CuDevice stuff for cuda
-  // allocation and posix_memalign for CPU allocation (obviously we need
-  // to make sure 'data' is aligned in most specific way we might need).
-  // in future we might choose
-  // to add that.
-  ~Storage();
-};
-
-
-enum DataType {
-  // We will of course later extend this with many more types, including
-  // integer types and half-precision floats.
-  kFloatDtype = 0,
-  kDoubleDtype = 1
-};
-
-enum StridePolicy {
-  kCstrides = 0,
-  kCopyStridesIfContiguous = 1
-};
-
-#define KALDI_TENSOR_MAX_DIM 5
-
-
-
-// This enum with one value is a trick to allow you to
-// emulate indexing schemes like, say, A[10:].
-// In C++ you'd do A(all,10).
-enum RangeEnum { all };
-
-/**
-   struct Range represents an integer or a range of integers (e.g. as used in
-   indexing).  It emulates Python's range().
-
-   There are various possibilities of what Range can contain, enumerated below.
-   Be careful: we use {a,b,c} to denote the actual class members, not the
-   arguments to constructors, which mimic the arguments of expressions with colons
-   in Python's indexing with ':'
-
-   For purposes of explanation I will assume we are indexing a 1-dimensional
-   array a, but this struct is also used for multi-dimensional indexing.
-
-   Examples are below (showing members {begin,end,step}, where inf means
-   std::numeric_limits<int64>::max()):
-
-
-   Literal contents     Python equivalent,     How obtained             Elements of array
-   of Range struct      indexing array a     using constructors           you would get
-
-    {0,inf,1}          a[:], a[0:]          Range(all), Range(0,all)    all of them
-
-    {0,10,2}           a[:10:2], a[0:10:2]   Range(0,10,2)             [0,2,4,8]
-
-    {0,-1,1}           a[:-1], a[0:-1]       Range(0,-1)                all but the last
-
-    {10,2,-1}          a[10:2:-1]           Range(10,2,-1)              [10,9,...3]
-
-    {inf,inf,1}        a[::-1]             Range(all,all,-1)            all, reversed order
-
-    {-3,-2,1}          a[-3:-2]            Range(-3,-2)             third-from-last element only
-
-    {10,0,inf}         a[10]              10 (implicit; RangeExt constructor)    the 10th element, removing axis
-
-
-*/
-struct Range {
-  int64_t begin;
-  int64_t end;
-  int64_t step;
-
-  static inline int64_t inf() { return std::numeric_limits<int64_t>::max(); }
-
-  // The default constructor leaves the range undefined.
-  Range() { }
-
-  Range(RangeEnum): begin(0), end(inf()), step(1) { }
-
-  explicit Range(int64_t end): begin(0), end(end), step(1) { }
-
-  Range(int64_t begin, int64_t end, int64_t step = 1):
-      begin(begin), end(end), step(1) { }
-
-  Range(int64_t begin, RangeEnum, int64_t step = 1):
-      begin(begin), end(inf()), step(step) { }
-
-  Range(RangeEnum, int64_t end, int64_t step = 1):
-      begin(inf), end(end), step(step) { }
-
-  Range(RangeEnum, RangeEnum, int64_t step = 1):
-      begin(inf()), end(inf()), step(step) { }
-};
-
-/**
-  struct RangeExt is used in situations, such as indexing, where what we have
-  might be a Range (like, in numpy, indexing with something that has a colon);
-  or it might simply be an integer.  There are no new members.  The reason we
-  don't just make this an additional constructor of Range is that we want it
-  so if someone does Range(10) it is interpreted as 0 through 9, but if
-  you do just 10 it means the index 10.  You can't have an explicit and
-  implicit constructor taking the same type: hence this child class.
-
-  Note that numpy's A[1] is not the same as A[1:2] because the former returns a
-  tensor with one fewer axes.
-*/
-struct RangeExt: public Range {
-  RangeExt(Range r): Range(r) { }
-
-  // implicit
-  RangeExt(int64_t index):
-      Range(index, 0, inf());
-};
-
-
-enum IndexingType{
-  kIndexingTypeRange,
-  kIndexingTypeNumber,
-  kIndexingTypeTensor
-};
-
-// This struct is used when indexing with mixed types. Ror
-// example:
-// Tensor a(...), b(...);
-// Tensor c = a(1, b, Range(0,-1), Range(all));
-
-struct IndexingArg {
-  IndexingArg(const Tensor &tensor);
-  IndexingArg(int64_t index);
-  IndexingArg(Range range);
-
-  IndexingType itype;
-  int64_t index;
-  std::shared_ptr<Tensor> tensor {nullptr};
-  Range range;
-};
-
-/**
-   This function, used in indexing operations, takes a Range that may have, say,
-   negative 'end' or end equal to Range::inf(), and turns it into actual
-   numbers with begin and end both in the range [0,dim].  So, for instance, if
-   the range had `end = -1`, it would be turned into `dim - 1`; or if `end` was
-   Range::inf(), it would be interpreted as `dim`.
-
-   Raises an exception the resulting range is empty.
- */
-void MakeRangeExplicit(Range *range, int64_t dim);
-
-
-
-/*
-  This struct stores the dimension and strides of a Tensor.  The following
-  describes the properties that a Tensor will always have (note: we
-  also use TensorDim inside implementation code in ways such that these
-  properties do not all hold).
-
-  These properties are stricter than some other frameworks, such as PyTorch,
-  which allow the users to manually add dimensions with stride 0 and dim > 1 so
-  that a lower-dimensional quantity can masquerade as one with a higher
-  dimension.  We require that it never be possible to access the same memory
-  location using two different tuples of indexes.  We also don't allow zero dims
-  (i.e. a tensor must not be empty); if you want an empty Tensor, just use a
-  null pointer.  In addition, require that the stride equal zero for any
-  axis that has dim = 1.
-
-  Our requirements on a TensorDim are:
-
-    0 <= num_axes <= 5
-    for 0 <= axis < num_axes:
-       dims[i] > 0
-       if dims[i] = 1, then strides[i] = 0.
-       if dims[i] != 1, then strides[i] != 0
-    ... plus the uniqueness property.
-
-  The uniqueness property means that we must not be able to access
-  the same memory location via two different tuples of indexes).
-  Recause testing this property exactly would be difficult in general
-  without bringing in number theory, we test a slightly stronger version
-  of it that covers all cases we are likely to encounter.
-*/
-
-struct TensorDim {
-  uint64_t num_axes;
-  int64_t dims[KALDI_TENSOR_MAX_DIM];
-  int64_t strides[KALDI_TENSOR_MAX_DIM];
-  // We may later add methods to this.
-
-  // Checks that the TensorDim is valid, assuming it is part of a Tensor.
-  // I.e. that it satifies all the properties mentioned above.
-  bool Check();
-};
-
-struct TensorDimProperties {
-  // Below are cached properties that are derived from the underlying data in
-  // struct TensorDim.
-
-  // The number of elements in the Tensor, which equals the product
-  // of dims[0] .. dims[num_axes - 1].  Must be >0.
-  int64_t num_elements;
-
-  // is_contiguous means that the data form a contiguous block in memory; it is
-  // not the same as PyTorch's is_contiguous which is a stronger condition,
-  // implying also that the strides are as for a `C-style` array.
-  bool is_contiguous;
-
-  // has_c_strides means that the strides are as if this was a "c"-style
-  // multidimensional array, meaning that (using Python wrap-around indexing
-  // conventions as if strides was an array of dimension 'num_axes'),
-  // strides[-1] == 1, strides[-1] == dims[-1], strides[-2] = dims[-1] *
-  // dims[-1], and so on.  This is the same as PyTorch's is_contiguous.
-  bool has_c_strides;
-
-  void UpdateProperties(const TensorDim &dim);
-};
-
-
 /**
    A Tensor is a multi-dimensional array (up to 5 dimensions) of types such as
    float or double (and eventually ints).  Multiple Tensors may point to data
@@ -284,27 +23,27 @@ class Tensor {
   // contexts, this is sometimes known as the rank of the tensor, or sometimes
   // even its dimension, but these terms are ambiguous so we avoid them, and use
   // the terms 'number of axes' or 'axis' throughout.
-  inline int64_t NumAxes() const { return dim_.num_axes; }
+  inline int64_t NumAxes() const { return pattern_.num_axes; }
 
   // Return reference to the struct containing the dimension and
   // stride info.
-  const TensorDim &DimAndStrides() const { return dim_; }
+  const TensorPattern &DimAndStrides() const { return pattern_; }
 
   // Return an array containing dimensions of the tensor; equivalent to
   // .shape in PyTorch.  Dims().size() will equal NumAxes().
-  inline ArrayRef<int64_t> Dims() const { return ArrayRef{dim_.num_axes, dim_.dims}; }
+  inline ArrayRef<int64_t> Dims() const { return ArrayRef{pattern_.num_axes, pattern_.dims}; }
 
   // Returns the dimension on this axis, a number >= 1.  Result is
   // undefined if axis < 0 or axis >= NumAxes().
-  inline int64_t Dim(int64_t axis) const { return dim_.dims[axis]; }
+  inline int64_t Dim(int64_t axis) const { return pattern_.dims[axis]; }
 
   // Returns an array containing the strides of the tensor.
   // Strides().size() will equal NumAxes().
-  inline ArrayRef<int64_t> Strides() const { return ArrayRef{dim_.num_axes, dim_.strides}; }
+  inline ArrayRef<int64_t> Strides() const { return ArrayRef{pattern_.num_axes, pattern_.strides}; }
 
   // Returns the stride on this axis.  Will be zero if the corresponding
   // dimension is 1, and otherwise nonzero (but not necessarily positive).
-  inline int64_t Stride(int64_t axis) const { return dim_.strides[axis]; }
+  inline int64_t Stride(int64_t axis) const { return pattern_.strides[axis]; }
 
   // Returns the number of elements in the Tensor; must be > 0.
   inline int64_t NumElements() const { return derived_.num_elements; }
@@ -314,7 +53,6 @@ class Tensor {
   // that the strides be 'C'-style.
   inline bool IsContiguous() const { return derived_.is_contiguous; }
 
-
   // Returns true if the strides for this array are what you would
   // expect if you were to construct a Tensor from this->Dims();
   // this means "C"-style strides, except that any axis with dimension=1
@@ -391,26 +129,65 @@ class Tensor {
   // Move assignment.  Does not copy the data.
   Tensor(Tensor &&other);
 
-  // Copy data from tensor 'other'.  Requires this Dims() and other.Dims()
-  // be compatible, meaning that they are the same, except it's OK for
-  // a dim of 'other' to be 1 and a dim of *this to be >1 (we will
-  // broadcast, i.e. copy).
-  void CopyData(const Tensor &other);
-
-  // Construct a Tensor with the supplied dimensions; and if set_zero is true,
-  // zero it.
-  Tensor(ArrayRef<int64_t> dims, bool set_zero = false);
-
-  // Construct a Tensor with
-  Tensor(TensorDim &dim, StridePolicy policy, bool set_zero = false);
-
+  /**
+     Construct a new Tensor with freshly allocated underlying data with
+     the data type, device and dimension the same as `other`.
+
+       @param [in]  other  The tensor that we are taking metadata from (we
+                    are not sharing its underlying data).
+       @param [in]  sp   The stride policy; if kCopyStrides then we use
+                       strides with the same sign and size-order as
+                       `other`, while filling in any gaps if `other`
+                       was not contiguous, if kCstrides then we use
+                       "C" style strides for any dimensions != 1.
+       @param [in]  ip   The data initialization policy
+
+     The strides will not be the same as 'other' if other.IsContiguous() ==
+     false, but the ordering of the strides (smaller vs. larger) and their
+     signs will remain the same.
+  */
+  Tensor(const Tensor &other, StridePolicy sp, InitializePolicy ip);
+
+
+
+  /** Construct a Tensor with freshly allocated data.
+       @param [in] dims    The dimensions of the tensor (zero to 5
+                    positive integers).
+       @param [in] dtype   The data type to use
+       @param [in] device  The device to put the data on
+       @param [in] set_zero   If true, set the tensor to zero.  If false,
+                        the contents will be undefined.
+   */
+  Tensor(ArrayRef<int64_t> dims, DataType dtype, Device device,
+         bool set_zero = false);
+
+  /**
+     Construct a Tensor with the dimensions and strides provided.  This differs
+     from the constructor taking `ArrayRef<int64_t> dims` in that it will use
+     the strides in `pattern` (except that if the data in `pattern` is not
+     contiguous, it will make it contiguous by filling in any gaps).  This means
+     that, for example, if you use this constructor on a 2-dimensional Tensor
+     that has been transposed and thus has a column-major layout, the resulting
+     Tensor will also have a column-major layout.
+
+       @param [in] pattern  The dimension and stride information that
+                  this tensor should match (although we will fill gaps
+                  to make it contiguous)
+       @param [in] dtype   The data type to use
+       @param [in] device  The device to put the data on
+       @param [in] p       The data initialization policy (see
+                        InitializePolicy).
+
+  */
+  Tensor(TensorPattern &pattern, DataType dtype, Device device,
+         InitializePolicy p);
 
 
  private:
   // The tensor dim and strides.
-  TensorDim dim_;
-  // Cached properties that depend on dim_.
-  TensorDimProperties derived_;
+  TensorPattern pattern_;
+  // Cached properties that depend on pattern_.
+  TensorPatternProperties derived_;
   // The data-type of this tensor.
   DataType dtype_;
 
@@ -453,7 +230,7 @@ struct TensorGrad {
 
   // The dimension of the Tensor for which this is the gradient.  Used
   // to set up 'grad' when needed.
-  TensorDim dim;
+  TensorPattern dim;
 
   // 'offset' is only inspected if this is a view; it is the offset
   // (in elements) from the
@@ -468,16 +245,44 @@ struct TensorGrad {
 
 
 /**
-   class Variable is the same as class Tensor but augmented with autograd
-   machinery.  The overall design is quite similar to PyTorch, and the C++
-   is similar to flashlight.  If you are only familiar with PyTorch's
-   python frontend, class Variable is equivalent to af.tensor.
+   class Variable is somewhat like class Tensor but augmented with autograd
+   machinery.  Because autograd requires a rather 'functional' way of doing
+   things (i.e. is not super friendly to in-place operations), the functions
+   that operate on class Variable will tend to be ones that return something,
+   rather than in-place operations.
+
+   The overall design is quite similar to PyTorch, and the structure
+   of the C++ code is similar to flashlight.  If you are only familiar with
+   PyTorch's python frontend, class Variable is roughly equivalent to what they
+   expose as af.tensor.
  */
 class Variable {
   using GradFunc = std::function<
-    void(std::vector<Variable>& inputs, Variable &grad_output)>;
-  using GradHook = std::function<void(Variable *grad)>;
+    void(const std::vector<Variable>& inputs, TensorGrad *grad_output)>;
+  using GradHook = std::function<void(TensorGrad *grad)>;
+
+
+
+  /** Constructor from a Tensor.
+       @param [in] data  Pointer to the source Tensor
+       @param [in] requires_grad    If requires_grad argument is true,
+                the gradient w.r.t. this Variable will be computed if and when
+                you call Backward() on a Variable that depends on it.
+                The same as requires_grad in PyTorch.
+  */
+  Variable(const std::shared_ptr<Tensor> &data, bool requires_grad);
+
+
 
+  /**
+   * Creates a Variable which wraps the array and inputs specified
+   * @param[in] data array to be stored in the Variable
+   * @param[in] inputs a vector specifying inputs for this Variable
+   * @param[in] gradFunc function specifying how to calculate gradient of the
+   * input Variables
+   */
+  Variable(std::shared_ptr<Tensor> &data, std::vector<Variable> inputs,
+           GradFunc gradFunc);
 
 
 

From eca0e809a49e495b6c402823f840328247eaad26 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Mon, 18 Mar 2019 11:13:53 -0400
Subject: [PATCH 007/163] [src] More drafting of tensor related stuff (#3132)

* [src] More work on tensor patterns

* [src] More documentation for View().
---
 src/tensor/tensor-functions.h |  93 ++++++++++++++++++---
 src/tensor/tensor-pattern.h   | 147 +++++++++++++++++++++++++++++-----
 src/tensor/tensor.h           |   2 +-
 3 files changed, 213 insertions(+), 29 deletions(-)

diff --git a/src/tensor/tensor-functions.h b/src/tensor/tensor-functions.h
index bbab4c58a4b..a2d357a4e9f 100644
--- a/src/tensor/tensor-functions.h
+++ b/src/tensor/tensor-functions.h
@@ -42,21 +42,94 @@ Tensor Transpose(const Tensor &tensor, int64_t axis1 = 0, int64_t axis2 = 1);
 void CopyData(const Tensor &src, const Tensor *dest);
 
 /**
-   Construct, if possible, a Tensor that is a view into 'src'.  Read this
-   carefully, as the semantics may differ from the 'view' functions in some
-   other toolkits.  'View' does not care how the underlying data of 'src'
-   is organized.  Its semantics can be explained as follows.
-
-   Suppose `src` were a "C"-style array with dimension given by `src.Dims()`.
-   Then reinterpret that array as one with dimension `dims`, if possible, and
-   return a Tensor describing that array.  If
+  Construct, if possible, a Tensor that is a view into 'src' with the
+  requested dimensions.
+
+  The semantics are based on those of PyTorch's "view" or NumPy's
+  "reshape", except we try to be more agnostic about the striding
+  of the input.
+
+  Consider a Tensor 'a' that has "C"-style strides.  Then this function will
+  return a Tensor (say, 'b') that interprets the raw data of 'a' as an array with
+  "C"-style strides but with dimensions 'dims'.  (The product of 'dims' must
+  equal src.NumElements()).
+
+  Now consider a Tensor 'a2' that does not have "C"-style strides but
+  has the same elements as 'a' in the sense that a(i,j,k) == a2(i,j,k).
+  Then, *if possible*, this function will return a matrix b2 with
+  the same elements as b, e.g. b2(i,j,k) == b(i,j,k).
+
+  This function returns NULL if such a tensor could not be constructed.  In that
+  case, likely what you will want to do is to construct a temporary Tensor from
+  'src' with the same dimensions but "C"-style strides (see the constructor of
+  Tensor that accepts the 'dims' parameter).  You may then call View() on that
+  temporary Tensor, which is guaranteed to succeed.
+
+     @param   [in] src   The source Tensor, whose data is to be
+                         reinterpreted.
+     @param   [in] dims  The dimensions that we want for the returned
+                       Tensor; its product must equal src.NumElements().
+     @return   If the view could be constructed, this function returns
+               a shared_ptr to a new Tensor with the requested dims,
+               that shares underlying data with 'src'; otherwise returns
+               NULL.  (If src.HasCStrides(), then this function is
+               guaranteed not to return nullptr).
 
+ */
+std::shared_ptr<Tensor> View(const Tensor &src, ArrayRef<int64_t> dims);
 
 
-     @param
+/**
+   Returns a Tensor with a new view of the data in 'src', in which the axes
+   numbered axis1 and axis1 + 1 are merged.
+
+   For example, if 'src' is a Tensor with dims (3,4,5) and you call
+   MergeAxes(src, 1), this function will merge axes 1 and 2 and return a Tensor
+   with shape (3,20).  The order of the elements in the second axis of the
+   result is required to be what you would expect if the layout was as a
+   "C" array (so: 4 blocks of 5 elements, and not vice versa).  This
+   is a common special case of what the function 'View' can give you.
+
+   If the pattern of 'src' makes the requested merging impossible,
+   this function will return nullptr.  (This may be the case if, for
+   instance, the two axes in question were previously transposed).
+   In that case the caller will probably want to construct a temporary
+   Tensor 'temp' passing src.Dims() in the constructor, copy the data
+   in 'src' to 'temp', and then call MergeAxes on 'temp', which is
+   guaranteed to work.
+
+       @param [in]  src   The Tensor whose axes we will attempt to
+                          merge
+       @param [in] axis1  The index of the first of the two axes which
+                          this function will attempt to merge.  Must
+                          be less than src.NumAxes() - 1.
+       @return            Returns a pointer to a Tensor with the
+                          merged axes (if the pattern of 'src'
+                          allows it), or nullptr otherwise.
+ */
+std::shared_ptr<Tensor> MergeAxes(const Tensor &src, int64_t axis1);
 
+/**
+   Returns a Tensor in which the axis numbered 'axis' is split into
+   two axes, with dimensions respectively 'dim1' and 'dim2'.  The
+   interpretation will be as for a "C" array; so, for instance,
+   if the dimensions of 'src' were (10,12) and you called
+   `SplitAxis(src, 1, 3, 4)`, the indexes along axis 1 would
+   be interpreted as 3 blocks of size 4.  This is a common
+   special case of what the function 'View' can do.
+
+
+      @param [in] src  The Tensor whose axis is to be split.
+      @param [in] axis  The index of the axis to be split; must
+                       satisfy `0 <= axis < src.NumAxes()`.
+      @param [in] dim1, dim2   The two dimensions into which
+                       we will split the axis.  Must satisfy
+                       `dim1 * dim2 == src.Dim(axis)`.
+      @return     Returns a Tensor which shares the same
+                  underlying data as 'src'
  */
-std::shared_ptr<Tensor> View(const Tensor &src, ArrayRef<int64_t> dims);
+std::shared_ptr<Tensor> SplitAxis(const Tensor &src, int64_t axis,
+                                  int64_t dim1, int64_t dim2);
 
 
 }
diff --git a/src/tensor/tensor-pattern.h b/src/tensor/tensor-pattern.h
index 838a6affd65..d5e2b4ffbba 100644
--- a/src/tensor/tensor-pattern.h
+++ b/src/tensor/tensor-pattern.h
@@ -185,9 +185,10 @@ struct TensorPatternProperties {
 
   // has_c_strides means that the strides are as if this was a "c"-style
   // multidimensional array, meaning that (using Python wrap-around indexing
-  // conventions as if strides was an array of dimension 'num_axes'),
+  // conventions as if strides was an array with 'num_axes' axes),
   // strides[-1] == 1, strides[-1] == dims[-1], strides[-2] = dims[-1] *
   // dims[-1], and so on.  This is the same as PyTorch's is_contiguous.
+  // this->has_c_strides implies this->is_contiguous.
   bool has_c_strides;
 
   void UpdateProperties(const TensorPattern &pattern);
@@ -195,18 +196,22 @@ struct TensorPatternProperties {
 
 
 /**
-   Compresses a TensorPattern by removing or combining as many dimensions
-   as possible.  This version is suitable for 'flat' operations that do
-   not rely on any kind of structure, such as zeroing or nonlinearities.
+   Compresses a TensorPattern by removing or combining as many axes as possible.
+   This version is suitable for 'flat' operations that do not rely on any kind
+   of structure, such as zeroing or nonlinearities; the only equivalence
+   maintained is equivalence of the set of memory locations covered.
+   The order of the (dim,stride) pairs in the input does not affect the
+   output.  The output (dim,stride) pairs will be ordered from
+   greatest to least stride (note: all output strides will be positive).
 
       @param [in]   src   The pattern to be compressed
       @param [in]  src_properties  Properties of 'src'; required to
                           be accurate (behavior is undefined otherwise,
                           e.g. if you provide some other pattern's properties).
       @param [out] dest   A simplified-as-much-as-possible pattern that
-                          covers the same set of elements as 'src' (when
+                          covers the same set of memory locations as 'src' (when
                           combined with the offset below).  'dest' will
-                          be free of negative strides.
+                          contain only nonnegative strides.
       @param [out] data_offset  A number that we would have to add to
                           the data pointer of the source Tensor so
                           that 'dest' would cover the same set of
@@ -214,40 +219,146 @@ struct TensorPatternProperties {
                           was free of negative strides.
 
    Examples are below, where we write a TensorPattern as
-  `   {{dim1,dim2,..}, {stride1,stride2,..}}
+    `{{dim1,dim2,..}, {stride1,stride2,..}}`.
 
+\verbatim
    Input pattern             Output pattern            Output offset
      {{10},{1}}               {{10},{1}}                  0
     {{3,4},{4,1}}             {{12},{1}}                  0
+    {{4,3},{1,4}}             {{12},{1}}                  0
     {{9},{-1}}                {{9},{1}}                  -8
    {2,3,4},{100,4,1}        {{2,12},{100,1}}              0
-
-
+\endverbatim
  */
 void CompressPatternFlat(const TensorPattern &src,
                          const TensorPatternProperties &src_properties,
                          TensorPattern *dest,
-                         int64_t *data_offset)
+                         int64_t *data_offset);
+
+/**
+  Compress two TensorPatterns by combining axes (and possibly
+  flipping the sign of their strides and changing the data offset).
+  The type of compression involved is the same as for CompressPatternFlat
+  (meaning we are doing some kind of operation that doesn't care about
+  the structure, such as an element-by-element nonlinearity).
+
+  The difference from calling CompressPatternFlat() twice is that this function
+  is only allowed to do the same operation to src1 and src2, e.g. if combining
+  two axes of src1 we would also have to combine the same two axes of src2.
+
+    @param [in] src1  The first source pattern.
+    @param [in] src2  The second source pattern.  The assumption is that src1
+                     and src2 are participating in some kind of operation like
+                     copying, or elementwise multiplication.  The patterns
+                     must satisfy src1.NumAxes() == src2.NumAxes(), and
+                     for each axis, either src1.dims[axis] == src2.dims[axis],
+                     or one of those two dimensions equals 1 (so there would be
+                     some kind of broadcasting).
+    @param [out] dest1  Compressed pattern out corresponding to src1.  Will
+                     be free of negative strides (but dest2 might not be).
+    @param [out] data_offset1  Data offset that we'd need to add to src1's
+                     data pointer before using the pattern 'dest1'.
+    @param [out] dest2  Compressed pattern out corresponding to src2.
+                     Might not be free of negative strides if some dimensions
+                     of src1/src2 had strides of opposite sign.
+    @param [out] data_offset2  Data offset that we'd need to add to src2's
+                     data pointer before using the pattern 'dest2'.
+
+  TODO: examples
+ */
+void CompressPatternsFlat(const TensorPattern &src1,
+                          const TensorPattern &src2,
+                          TensorPattern *dest1,
+                          int64_t *data_offset1,
+                          TensorPattern *dest2,
+                          int64_t *data_offset2);
+
 
 /**
+   Compresses a TensorPattern by removing or combining as many axes as possible,
+   while respecting certain invariances that are relevant when constructing
+   'views' ('view' is PyTorch terminology; the NumPy equivalent is 'reshape').
+   The "C" in the function name refers to C-style arrays.
 
- */
-void CompressPattern(const TensorPattern &src,
-                     const TensorPatternProperties &src_properties,
-                     TensorPattern *dest);
+    This function removes axes with dim=1.
 
+    This function combines successive axes if the relationship of their
+    dims and strides is what you would expect in a "C"-style array.
+    Suppose that in 'src' we had two successive axes with dims and
+    strides (dim_a, dim_b) and (stride_a, stride_b), with dim_a > 1 and
+    dim_b > 1.  If stride_a == stride_b * dim_b, then this function
+    will merge them into a single axis with dimension (dim_a * dim_b)
+    and stride stride_b.
 
+   The output pattern 'dest' is what you get if you keep applying the
+   rules above until no further change is made.
 
+   Examples are below, where we write a TensorPattern as
+    `{{dim1,dim2,..}, {stride1,stride2,..}}`.
+\verbatim
+   Input pattern             Output pattern
+     {{10},{1}}               {{10},{1}}
+    {{5,1},{1,1}}             {{5},{1}}
+    {{9},{-1}}                {{9},{-1}}
+   {2,3,4},{100,4,1}        {{2,12},{100,1}}
+   {2,3,4},{100,-4,-1}        {{2,12},{100,-1}}
+\endverbatim
+ */
+void CompressPatternC(const TensorPattern &src,
+                      const TensorPatternProperties &src_properties,
+                      TensorPattern *dest);
 
-/**
 
 
+/**
+   Creates a TensorPattern corresponding to a requested 'view' of the matrix.
+   ('view' is PyTorch terminology; the NumPy equivalent is 'reshape').
+
+   The PyTorch/NumPy semantics are (I believe) as follows: Firstly, a view
+   can/should only be created for a tensor whose layout in memory is as for a
+   "C" array; suppose that the shape of array a is (9, 8), a "C" layout would
+   imply strides of (8, 1).  A 'view' of this array simply implies interpreting
+   the same block of memory as a "C" array with some other sequence of
+   dimensions, say (3, 3, 8) or (8, 9) or (1, 72); any sequence whose product
+   matches the number of elements in "a".
+
+   Our semantics of "view" is the same as that of PyTorch/NumPy except that we
+   impose fewer constraints on what strides the input Tensor may have.  Let the
+   'view' of the array 'a' be 'b'.  As long as it is possible to find a tensor
+   pattern for 'b' that would lead to the same relationship between the elements
+   of 'a' and 'b' as what you would get by asking for the same "view" in
+   PyTorch/NumPy assuming 'a' had had "C"-style strides (viewed in terms of
+   indexed elements of a and b, without regard to the physical memory layout), we
+   allow it.
+
+
+   Notes on implementation (glossing over ones in 'dims' which are easy to
+   handle as a special case): we would first call CompressPatternC on
+   'pattern_in'.  Then we would attempt to find a correspondence with
+   the dimensions of this compressed pattern and a partition of the
+   sequence 'dims'.  For example, suppose the compressed pattern
+   is (100, 9) and dims is (50, 2, 3, 3), then the partition would
+   be (50, 2), (3, 3).  If this is not possible (e.g. if dims
+   had been (30,10,3) instead), we return false.
+
+   @param [in]  pattern_in   The input pattern for which we are trying to
+                          find an alternative view
+   @param [in]  dims  The sequence of dimensions corresponding to the
+                      desired view.  Its product must be the same as the
+                      product of pattern_in.dims.
+   @param [out] pattern_out  The output pattern, if we were
+                      successful (otherwise undefined).  Its 'dims'
+                      will be the same as 'dims'.
+   @return           Returns true on success (i.e. such a view existed),
+                     and false otherwise.  This function will never return
+                     false if 'pattern_in' had strides as for a "C" array
+                     (i.e., if its properties' has_c_strides was true).
+
  */
 bool CreateViewPattern(const TensorPattern &pattern_in,
-                       const TensorPatternProperties &properties_in,
                        ArrayRef<int64_t> dims,
-                       TensorPattern *pattern_out,
-                       TensorPatternProperties *properties_out);
+                       TensorPattern *pattern_out);
+
 
 
 
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index 220af7bb045..31331d2c142 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -58,7 +58,7 @@ class Tensor {
   // this means "C"-style strides, except that any axis with dimension=1
   // has its stride set to zero.  This is our equivalent of PyTorch's
   // contiguous().
-  inline bool HasNormalStrides() const { return derived_.has_c_strides; }
+  inline bool HasCStrides() const { return derived_.has_c_strides; }
 
   // Return the data type.
   DataType Dtype() const { return dtype_; }

From 63e35b0a5986be8b9d072c3c8674114f4d0a50e1 Mon Sep 17 00:00:00 2001
From: Yiwen Shao <sywcs007wow@gmail.com>
Date: Wed, 20 Mar 2019 12:53:41 -0400
Subject: [PATCH 008/163] [src] completed stride support for kaldi-vector
 (#3146)

---
 src/matrix/kaldi-vector.cc | 296 ++++++++++++++++++++++---------------
 1 file changed, 176 insertions(+), 120 deletions(-)

diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc
index 84ff7a9d768..de6e626289e 100644
--- a/src/matrix/kaldi-vector.cc
+++ b/src/matrix/kaldi-vector.cc
@@ -241,9 +241,10 @@ template<typename OtherReal>
 void VectorBase<Real>::CopyFromVec(const VectorBase<OtherReal> &other) {
   KALDI_ASSERT(dim_ == other.Dim());
   Real * __restrict__  ptr = data_;
+  MatrixIndexT dim = dim_, stride = stride_, other_stride = other.Stride();
   const OtherReal * __restrict__ other_ptr = other.Data();
-  for (MatrixIndexT i = 0; i < dim_; i++)
-    ptr[i] = other_ptr[i];
+  for (MatrixIndexT i = 0; i < dim; i++)
+    ptr[i * stride] = other_ptr[i * other_stride];
 }
 
 template void VectorBase<float>::CopyFromVec(const VectorBase<double> &other);
@@ -253,9 +254,11 @@ template void VectorBase<double>::CopyFromVec(const VectorBase<float> &other);
 template<typename Real>
 void Vector<Real>::RemoveElement(MatrixIndexT i) {
   KALDI_ASSERT(i <  this->dim_ && "Access out of vector");
-  for (MatrixIndexT j = i + 1; j <  this->dim_; j++)
-    this->data_[j-1] =  this->data_[j];
-  this->dim_--;
+  Real *data = this->data_;  // 'this->' needed: dependent-base members.
+  MatrixIndexT dim = this->dim_, stride = this->stride_;
+  for (MatrixIndexT j = i + 1; j <  dim; j++)  // shift the tail down by one
+    data[(j-1) * stride] =  data[j * stride];
+  this->dim_--;
 }
 
 
@@ -277,8 +280,10 @@ void VectorBase<Real>::SetZero() {
 template<typename Real>
 bool VectorBase<Real>::IsZero(Real cutoff) const {
   Real abs_max = 0.0;
-  for (MatrixIndexT i = 0; i < Dim(); i++)
-    abs_max = std::max(std::abs(data_[i]), abs_max);
+  Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++)
+    abs_max = std::max(std::abs(data[i * stride]), abs_max);
   return (abs_max <= cutoff);
 }
 
@@ -286,17 +291,21 @@ template<typename Real>
 void VectorBase<Real>::SetRandn() {
   kaldi::RandomState rstate;
   MatrixIndexT last = (Dim() % 2 == 1) ? Dim() - 1 : Dim();
+  Real *data = data_;
+  MatrixIndexT stride = stride_;
   for (MatrixIndexT i = 0; i < last; i += 2) {
-    kaldi::RandGauss2(data_ + i, data_ + i +1, &rstate);
+    kaldi::RandGauss2(data + i * stride, data + (i + 1)*stride, &rstate);
   }
-  if (Dim() != last) data_[last] = static_cast<Real>(kaldi::RandGauss(&rstate));
+  if (Dim() != last) data[last * stride] = static_cast<Real>(kaldi::RandGauss(&rstate));
 }
 
 template<typename Real>
 void VectorBase<Real>::SetRandUniform() {
   kaldi::RandomState rstate;
-  for (MatrixIndexT i = 0; i < Dim(); i++) {
-    *(data_+i) = RandUniform(&rstate);
+  Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    data[i * stride] = RandUniform(&rstate);
   }
 }
 
@@ -306,11 +315,11 @@ MatrixIndexT VectorBase<Real>::RandCategorical() const {
   Real sum = this->Sum();
   KALDI_ASSERT(this->Min() >= 0.0 && sum > 0.0);
   Real r = RandUniform(&rstate) * sum;
-  Real *data = this->data_;
-  MatrixIndexT dim = this->dim_;
+  Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_;
   Real running_sum = 0.0;
   for (MatrixIndexT i = 0; i < dim; i++) {
-    running_sum += data[i];
+    running_sum += data[i * stride];
     if (r < running_sum) return i;
   }
   return dim_ - 1; // Should only happen if RandUniform()
@@ -320,7 +329,9 @@ MatrixIndexT VectorBase<Real>::RandCategorical() const {
 template<typename Real>
 void VectorBase<Real>::Set(Real f) {
   // Why not use memset here?
-  for (MatrixIndexT i = 0; i < dim_; i++) { data_[i] = f; }
+  Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++) { data[i * stride] = f; }
 }
 
 template<typename Real>
@@ -328,6 +339,7 @@ void VectorBase<Real>::CopyRowsFromMat(const MatrixBase<Real> &mat) {
   KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows());
 
   Real *inc_data = data_;
+  MatrixIndexT stride = stride_;
   const MatrixIndexT cols = mat.NumCols(), rows = mat.NumRows();
 
   if (mat.Stride() == mat.NumCols()) {
@@ -337,7 +349,7 @@ void VectorBase<Real>::CopyRowsFromMat(const MatrixBase<Real> &mat) {
       // copy the data to the propper position
       memcpy(inc_data, mat.RowData(i), cols * sizeof(Real));
       // set new copy position
-      inc_data += cols;
+      inc_data += cols * stride;
     }
   }
 }
@@ -347,15 +359,16 @@ template<typename OtherReal>
 void VectorBase<Real>::CopyRowsFromMat(const MatrixBase<OtherReal> &mat) {
   KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows());
   Real *vec_data = data_;
+  MatrixIndexT stride = stride_;
   const MatrixIndexT cols = mat.NumCols(),
       rows = mat.NumRows();
 
   for (MatrixIndexT i = 0; i < rows; i++) {
     const OtherReal *mat_row = mat.RowData(i);
     for (MatrixIndexT j = 0; j < cols; j++) {
-      vec_data[j] = static_cast<Real>(mat_row[j]);
+      vec_data[j * stride] = static_cast<Real>(mat_row[j]);
     }
-    vec_data += cols;
+    vec_data += cols * stride;
   }
 }
 
@@ -370,15 +383,16 @@ void VectorBase<Real>::CopyColsFromMat(const MatrixBase<Real> &mat) {
   KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows());
 
   Real*       inc_data = data_;
+  MatrixIndexT vec_stride = stride_;  // stride of this vector; 'stride' below is the matrix row stride.
   const MatrixIndexT  cols     = mat.NumCols(), rows = mat.NumRows(), stride = mat.Stride();
   const Real *mat_inc_data = mat.Data();
 
   for (MatrixIndexT i = 0; i < cols; i++) {
     for (MatrixIndexT j = 0; j < rows; j++) {
-      inc_data[j] = mat_inc_data[j*stride];
+      inc_data[j * vec_stride] = mat_inc_data[j*stride];
     }
     mat_inc_data++;
-    inc_data += rows;
+    inc_data += rows * vec_stride;  // advance past one column's worth of vector elements.
   }
 }
 
@@ -396,8 +410,10 @@ void VectorBase<Real>::CopyRowFromMat(const MatrixBase<OtherReal> &mat, MatrixIn
   KALDI_ASSERT(row < mat.NumRows());
   KALDI_ASSERT(dim_ == mat.NumCols());
   const OtherReal *mat_row = mat.RowData(row);
-  for (MatrixIndexT i = 0; i < dim_; i++)
-    data_[i] = static_cast<Real>(mat_row[i]);
+  Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++)
+    data[i * stride] = static_cast<Real>(mat_row[i]);
 }
 
 template
@@ -414,11 +430,13 @@ void VectorBase<Real>::CopyRowFromSp(const SpMatrix<OtherReal> &sp, MatrixIndexT
   const OtherReal *sp_data = sp.Data();
 
   sp_data += (row*(row+1)) / 2; // takes us to beginning of this row.
+  Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_;
   MatrixIndexT i;
   for (i = 0; i < row; i++) // copy consecutive elements.
-    data_[i] = static_cast<Real>(*(sp_data++));
-  for(; i < dim_; ++i, sp_data += i)
-    data_[i] = static_cast<Real>(*sp_data);
+    data[i * stride] = static_cast<Real>(*(sp_data++));
+  for(; i < dim; ++i, sp_data += i)
+    data[i * stride] = static_cast<Real>(*sp_data);
 }
 
 template
@@ -440,23 +458,25 @@ void VectorBase<double>::ApplyPow(double power) { vdPowx(dim_, data_, power, dat
 // takes elements to a power.  Throws exception if could not (but only for power != 1 and power != 2).
 template<typename Real>
 void VectorBase<Real>::ApplyPow(Real power) {
+  Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_;  // cache members in locals for the loops below.
   if (power == 1.0) return;
   if (power == 2.0) {
-    for (MatrixIndexT i = 0; i < dim_; i++)
-      data_[i] = data_[i] * data_[i];
+    for (MatrixIndexT i = 0; i < dim; i++)
+      data[i * stride] = data[i * stride] * data[i * stride];
   } else if (power == 0.5) {
-    for (MatrixIndexT i = 0; i < dim_; i++) {
-      if (!(data_[i] >= 0.0))
+    for (MatrixIndexT i = 0; i < dim; i++) {
+      if (!(data[i * stride] >= 0.0))
         KALDI_ERR << "Cannot take square root of negative value "
-                  << data_[i];
-      data_[i] = std::sqrt(data_[i]);
+                  << data[i * stride];
+      data[i * stride] = std::sqrt(data[i * stride]);
     }
   } else {
     for (MatrixIndexT i = 0; i < dim_; i++) {
-      data_[i] = pow(data_[i], power);
-      if (data_[i] == HUGE_VAL) {  // HUGE_VAL is what errno returns on error.
+      data[i * stride] = pow(data[i * stride], power);
+      if (data[i * stride] == HUGE_VAL) {  // HUGE_VAL is what errno returns on error.
         KALDI_ERR << "Could not raise element "  << i << " to power "
-                  << power << ": returned value = " << data_[i];
+                  << power << ": returned value = " << data[i * stride];
       }
     }
   }
@@ -467,31 +487,33 @@ void VectorBase<Real>::ApplyPow(Real power) {
 // Throws exception if could not (but only for power != 1 and power != 2).
 template<typename Real>
 void VectorBase<Real>::ApplyPowAbs(Real power, bool include_sign) {
+  Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_;
   if (power == 1.0)
-    for (MatrixIndexT i = 0; i < dim_; i++)
-      data_[i] = (include_sign && data_[i] < 0 ? -1 : 1) * std::abs(data_[i]);
+    for (MatrixIndexT i = 0; i < dim; i++)
+      data[i * stride] = (include_sign && data[i * stride] < 0 ? -1 : 1) * std::abs(data[i * stride]);
   if (power == 2.0) {
-    for (MatrixIndexT i = 0; i < dim_; i++)
-      data_[i] = (include_sign && data_[i] < 0 ? -1 : 1) * data_[i] * data_[i];
+    for (MatrixIndexT i = 0; i < dim; i++)
+      data[i * stride] = (include_sign && data[i * stride] < 0 ? -1 : 1) * data[i * stride] * data[i * stride];
   } else if (power == 0.5) {
-    for (MatrixIndexT i = 0; i < dim_; i++) {
-      data_[i] = (include_sign && data_[i] < 0 ? -1 : 1) * std::sqrt(std::abs(data_[i]));
+    for (MatrixIndexT i = 0; i < dim; i++) {
+      data[i * stride] = (include_sign && data[i * stride] < 0 ? -1 : 1) * std::sqrt(std::abs(data[i * stride]));
     }
   } else if (power < 0.0) {
-    for (MatrixIndexT i = 0; i < dim_; i++) {
-      data_[i] = (data_[i] == 0.0 ? 0.0 : pow(std::abs(data_[i]), power));
-      data_[i] *= (include_sign && data_[i] < 0 ? -1 : 1);
-      if (data_[i] == HUGE_VAL) {  // HUGE_VAL is what errno returns on error.
+    for (MatrixIndexT i = 0; i < dim; i++) {
+      data[i * stride] = (data[i * stride] == 0.0 ? 0.0 : pow(std::abs(data[i * stride]), power));
+      data[i * stride] *= (include_sign && data[i * stride] < 0 ? -1 : 1);
+      if (data[i * stride] == HUGE_VAL) {  // HUGE_VAL is what errno returns on error.
         KALDI_ERR << "Could not raise element "  << i << "to power "
-                  << power << ": returned value = " << data_[i];
+                  << power << ": returned value = " << data[i * stride];
       }
     }
   } else {
-    for (MatrixIndexT i = 0; i < dim_; i++) {
-      data_[i] = (include_sign && data_[i] < 0 ? -1 : 1) * pow(std::abs(data_[i]), power);
-      if (data_[i] == HUGE_VAL) {  // HUGE_VAL is what errno returns on error.
+    for (MatrixIndexT i = 0; i < dim; i++) {
+      data[i * stride] = (include_sign && data[i * stride] < 0 ? -1 : 1) * pow(std::abs(data[i * stride]), power);
+      if (data[i * stride] == HUGE_VAL) {  // HUGE_VAL is what errno returns on error.
         KALDI_ERR << "Could not raise element "  << i << "to power "
-                  << power << ": returned value = " << data_[i];
+                  << power << ": returned value = " << data[i * stride];
       }
     }
   }
@@ -501,28 +523,30 @@ void VectorBase<Real>::ApplyPowAbs(Real power, bool include_sign) {
 template<typename Real>
 Real VectorBase<Real>::Norm(Real p) const {
   KALDI_ASSERT(p >= 0.0);
+  Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_;
   Real sum = 0.0;
   if (p == 0.0) {
-    for (MatrixIndexT i = 0; i < dim_; i++)
-      if (data_[i] != 0.0) sum += 1.0;
+    for (MatrixIndexT i = 0; i < dim; i++)
+      if (data[i * stride] != 0.0) sum += 1.0;
     return sum;
   } else if (p == 1.0) {
-    for (MatrixIndexT i = 0; i < dim_; i++)
-      sum += std::abs(data_[i]);
+    for (MatrixIndexT i = 0; i < dim; i++)
+      sum += std::abs(data[i * stride]);
     return sum;
   } else if (p == 2.0) {
-    for (MatrixIndexT i = 0; i < dim_; i++)
-      sum += data_[i] * data_[i];
+    for (MatrixIndexT i = 0; i < dim; i++)
+      sum += data[i * stride] * data[i * stride];
     return std::sqrt(sum);
   } else if (p == std::numeric_limits<Real>::infinity()){
-    for (MatrixIndexT i = 0; i < dim_; i++)
-      sum = std::max(sum, std::abs(data_[i]));
+    for (MatrixIndexT i = 0; i < dim; i++)
+      sum = std::max(sum, std::abs(data[i * stride]));
     return sum;
   } else {
     Real tmp;
     bool ok = true;
-    for (MatrixIndexT i = 0; i < dim_; i++) {
-      tmp = pow(std::abs(data_[i]), p);
+    for (MatrixIndexT i = 0; i < dim; i++) {
+      tmp = pow(std::abs(data[i * stride]), p);
       if (tmp == HUGE_VAL) // HUGE_VAL is what pow returns on error.
         ok = false;
       sum += tmp;
@@ -554,8 +578,9 @@ bool VectorBase<Real>::ApproxEqual(const VectorBase<Real> &other, float tol) con
   } else { // Test for exact equality.
     const Real *data = data_;
     const Real *other_data = other.data_;
+    MatrixIndexT other_stride = other.stride_, stride = stride_;  // the two vectors may have different strides.
     for (MatrixIndexT dim = dim_, i = 0; i < dim; i++)
-      if (data[i] != other_data[i]) return false;
+      if (data[i * stride] != other_data[i * other_stride]) return false;
     return true;
   }
 }
@@ -564,9 +589,9 @@ template<typename Real>
 Real VectorBase<Real>::Max() const {
   Real ans = - std::numeric_limits<Real>::infinity();
   const Real *data = data_;
-  MatrixIndexT i, dim = dim_;
+  MatrixIndexT i, dim = dim_, stride = stride_;
   for (i = 0; i + 4 <= dim; i += 4) {
-    Real a1 = data[i], a2 = data[i+1], a3 = data[i+2], a4 = data[i+3];
+    Real a1 = data[i*stride], a2 = data[(i+1)*stride], a3 = data[(i+2)*stride], a4 = data[(i+3)*stride];
     if (a1 > ans || a2 > ans || a3 > ans || a4 > ans) {
       Real b1 = (a1 > a2 ? a1 : a2), b2 = (a3 > a4 ? a3 : a4);
       if (b1 > ans) ans = b1;
@@ -574,7 +599,7 @@ Real VectorBase<Real>::Max() const {
     }
   }
   for (; i < dim; i++)
-    if (data[i] > ans) ans = data[i];
+    if (data[i * stride] > ans) ans = data[i * stride];
   return ans;
 }
 
@@ -584,9 +609,9 @@ Real VectorBase<Real>::Max(MatrixIndexT *index_out) const {
   Real ans = - std::numeric_limits<Real>::infinity();
   MatrixIndexT index = 0;
   const Real *data = data_;
-  MatrixIndexT i, dim = dim_;
+  MatrixIndexT i, dim = dim_, stride = stride_;
   for (i = 0; i + 4 <= dim; i += 4) {
-    Real a1 = data[i], a2 = data[i+1], a3 = data[i+2], a4 = data[i+3];
+    Real a1 = data[i*stride], a2 = data[(i+1)*stride], a3 = data[(i+2)*stride], a4 = data[(i+3)*stride];
     if (a1 > ans || a2 > ans || a3 > ans || a4 > ans) {
       if (a1 > ans) { ans = a1; index = i; }
       if (a2 > ans) { ans = a2; index = i + 1; }
@@ -595,7 +620,7 @@ Real VectorBase<Real>::Max(MatrixIndexT *index_out) const {
     }
   }
   for (; i < dim; i++)
-    if (data[i] > ans) { ans = data[i]; index = i; }
+    if (data[i * stride] > ans) { ans = data[i * stride]; index = i; }
   *index_out = index;
   return ans;
 }
@@ -604,9 +629,9 @@ template<typename Real>
 Real VectorBase<Real>::Min() const {
   Real ans = std::numeric_limits<Real>::infinity();
   const Real *data = data_;
-  MatrixIndexT i, dim = dim_;
+  MatrixIndexT i, dim = dim_, stride = stride_;
   for (i = 0; i + 4 <= dim; i += 4) {
-    Real a1 = data[i], a2 = data[i+1], a3 = data[i+2], a4 = data[i+3];
+    Real a1 = data[i*stride], a2 = data[(i+1)*stride], a3 = data[(i+2)*stride], a4 = data[(i+3)*stride];
     if (a1 < ans || a2 < ans || a3 < ans || a4 < ans) {
       Real b1 = (a1 < a2 ? a1 : a2), b2 = (a3 < a4 ? a3 : a4);
       if (b1 < ans) ans = b1;
@@ -614,7 +639,7 @@ Real VectorBase<Real>::Min() const {
     }
   }
   for (; i < dim; i++)
-    if (data[i] < ans) ans = data[i];
+    if (data[i*stride] < ans) ans = data[i*stride];
   return ans;
 }
 
@@ -624,9 +649,9 @@ Real VectorBase<Real>::Min(MatrixIndexT *index_out) const {
   Real ans = std::numeric_limits<Real>::infinity();
   MatrixIndexT index = 0;
   const Real *data = data_;
-  MatrixIndexT i, dim = dim_;
+  MatrixIndexT i, dim = dim_, stride = stride_;
   for (i = 0; i + 4 <= dim; i += 4) {
-    Real a1 = data[i], a2 = data[i+1], a3 = data[i+2], a4 = data[i+3];
+    Real a1 = data[i*stride], a2 = data[(i+1)*stride], a3 = data[(i+2)*stride], a4 = data[(i+3)*stride];
     if (a1 < ans || a2 < ans || a3 < ans || a4 < ans) {
       if (a1 < ans) { ans = a1; index = i; }
       if (a2 < ans) { ans = a2; index = i + 1; }
@@ -635,7 +660,7 @@ Real VectorBase<Real>::Min(MatrixIndexT *index_out) const {
     }
   }
   for (; i < dim; i++)
-    if (data[i] < ans) { ans = data[i]; index = i; }
+    if (data[i*stride] < ans) { ans = data[i*stride]; index = i; }
   *index_out = index;
   return ans;
 }
@@ -646,8 +671,10 @@ template<typename OtherReal>
 void VectorBase<Real>::CopyColFromMat(const MatrixBase<OtherReal> &mat, MatrixIndexT col) {
   KALDI_ASSERT(col < mat.NumCols());
   KALDI_ASSERT(dim_ == mat.NumRows());
-  for (MatrixIndexT i = 0; i < dim_; i++)
-    data_[i] = mat(i, col);
+  Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++)
+    data[i * stride] = mat(i, col);
   // can't do this very efficiently so don't really bother. could improve this though.
 }
 // instantiate the template above.
@@ -669,9 +696,10 @@ void VectorBase<Real>::CopyDiagFromMat(const MatrixBase<Real> &M) {
 template<typename Real>
 void VectorBase<Real>::CopyDiagFromPacked(const PackedMatrix<Real> &M) {
   KALDI_ASSERT(dim_ == M.NumCols());
+  Real *data = data_;
   MatrixIndexT stride = stride_, dim = dim_;
   for (MatrixIndexT i = 0; i < dim; i++)
-    data_[i * stride] = M(i, i);
+    data[i * stride] = M(i, i);
   // could make this more efficient.
 }
 
@@ -688,9 +716,10 @@ template<typename Real>
 Real VectorBase<Real>::SumLog() const {
   double sum_log = 0.0;
   double prod = 1.0;
+  Real *data = data_;
   MatrixIndexT dim = dim_, stride = stride_;
   for (MatrixIndexT i = 0; i < dim; i++) {
-    prod *= data_[i * stride];
+    prod *= data[i * stride];
     // Possible future work (arnab): change these magic values to pre-defined
     // constants
     if (prod < 1.0e-10 || prod > 1.0e+10) {
@@ -819,15 +848,17 @@ void VectorBase<Real>::ApplyAbs() {
 
 template<typename Real>
 void VectorBase<Real>::ApplyFloor(Real floor_val, MatrixIndexT *floored_count) {
+  Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_;
   if (floored_count == nullptr) {
-    for (MatrixIndexT i = 0; i < dim_; i++) {
-      data_[i] = std::max(data_[i], floor_val);
+    for (MatrixIndexT i = 0; i < dim; i++) {
+      data[i * stride] = std::max(data[i * stride], floor_val);  // write back to the element that was read.
     }
   } else {
     MatrixIndexT num_floored = 0;
-    for (MatrixIndexT i = 0; i < dim_; i++) {
-      if (data_[i] < floor_val) {
-        data_[i] = floor_val;
+    for (MatrixIndexT i = 0; i < dim; i++) {
+      if (data[i * stride] < floor_val) {
+        data[i * stride] = floor_val;
         num_floored++;
       }
     }
@@ -875,8 +906,10 @@ MatrixIndexT VectorBase<Real>::ApplyFloor(const VectorBase<Real> &floor_vec) {
 template<typename Real>
 Real VectorBase<Real>::ApplySoftMax() {
   Real max = this->Max(), sum = 0.0;
-  for (MatrixIndexT i = 0; i < dim_; i++) {
-    sum += (data_[i] = Exp(data_[i] - max));
+  Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    sum += (data[i * stride] = Exp(data[i * stride] - max));
   }
   this->Scale(1.0 / sum);
   return max + Log(sum);
@@ -885,8 +918,10 @@ Real VectorBase<Real>::ApplySoftMax() {
 template<typename Real>
 Real VectorBase<Real>::ApplyLogSoftMax() {
   Real max = this->Max(), sum = 0.0;
-  for (MatrixIndexT i = 0; i < dim_; i++) {
-    sum += Exp((data_[i] -= max));
+  Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    sum += Exp((data[i * stride] -= max));
   }
   sum = Log(sum);
   this->Add(-1.0 * sum);
@@ -908,8 +943,10 @@ void VectorBase<double>::Tanh(const VectorBase<double> &src) {
 template<typename Real>
 void VectorBase<Real>::Tanh(const VectorBase<Real> &src) {
   KALDI_ASSERT(dim_ == src.dim_);
-  for (MatrixIndexT i = 0; i < dim_; i++) {
-    Real x = src.data_[i];
+  Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_, src_stride = src.stride_;
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    Real x = src.data_[i * src_stride];  // read the i'th logical element of src.
     if (x > 0.0) {
       Real inv_expx = Exp(-x);
       x = -1.0 + 2.0 / (1.0 + inv_expx * inv_expx);
@@ -917,7 +954,7 @@ void VectorBase<Real>::Tanh(const VectorBase<Real> &src) {
       Real expx = Exp(x);
       x = 1.0 - 2.0 / (1.0 + expx * expx);
     }
-    data_[i] = x;
+    data[i * stride] = x;
   }
 }
 #endif
@@ -946,8 +983,10 @@ void VectorBase<double>::Sigmoid(const VectorBase<double> &src) {
 template<typename Real>
 void VectorBase<Real>::Sigmoid(const VectorBase<Real> &src) {
   KALDI_ASSERT(dim_ == src.dim_);
-  for (MatrixIndexT i = 0; i < dim_; i++) {
-    Real x = src.data_[i];
+  Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_, src_stride = src.stride_;
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    Real x = src.data_[i * src_stride];  // read the i'th logical element of src.
     // We aim to avoid floating-point overflow here.
     if (x > 0.0) {
       x = 1.0 / (1.0 + Exp(-x));
@@ -955,7 +994,7 @@ void VectorBase<Real>::Sigmoid(const VectorBase<Real> &src) {
       Real ex = Exp(x);
       x = ex / (ex + 1.0);
     }
-    data_[i] = x;
+    data[i * stride] = x;
   }
 }
 #endif
@@ -963,8 +1002,10 @@ void VectorBase<Real>::Sigmoid(const VectorBase<Real> &src) {
 
 template<typename Real>
 void VectorBase<Real>::Add(Real c) {
-  for (MatrixIndexT i = 0; i < dim_; i++) {
-    data_[i] += c;
+  Real *data = data_;
+  MatrixIndexT dim = dim_, stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    data[i * stride] += c;
   }
 }
 
@@ -976,16 +1017,19 @@ void VectorBase<Real>::Scale(Real alpha) {
 template<typename Real>
 void VectorBase<Real>::MulElements(const VectorBase<Real> &v) {
   KALDI_ASSERT(dim_ == v.dim_);
-  for (MatrixIndexT i = 0; i < dim_; i++) {
-    data_[i] *= v.data_[i];
+  Real *data = data_, *v_data = v.data_;
+  MatrixIndexT dim = dim_, v_stride = v.stride_, stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    data[i * stride] *= v_data[i * v_stride];
   }
 }
 
 template<typename Real>  // Set each element to y = (x == orig ? changed : x).
 void VectorBase<Real>::ReplaceValue(Real orig, Real changed) {
   Real *data = data_;
-  for (MatrixIndexT i = 0; i < dim_; i++)
-    if (data[i] == orig) data[i] = changed;
+  MatrixIndexT dim = dim_, stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++)
+    if (data[i * stride] == orig) data[i * stride] = changed;
 }
 
 
@@ -994,8 +1038,10 @@ template<typename OtherReal>
 void VectorBase<Real>::MulElements(const VectorBase<OtherReal> &v) {
   KALDI_ASSERT(dim_ == v.Dim());
   const OtherReal *other_ptr = v.Data();
-  for (MatrixIndexT i = 0; i < dim_; i++) {
-    data_[i] *= other_ptr[i];
+  Real *data = data_;
+  MatrixIndexT dim = dim_, v_stride = v.Stride(), stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    data[i * stride] *= other_ptr[i * v_stride];  // use the cached pointer declared above.
   }
 }
 // instantiate template.
@@ -1019,8 +1065,10 @@ void VectorBase<Real>::AddVecVec(Real alpha, const VectorBase<Real> &v,
 template<typename Real>
 void VectorBase<Real>::DivElements(const VectorBase<Real> &v) {
   KALDI_ASSERT(dim_ == v.dim_);
-  for (MatrixIndexT i = 0; i < dim_; i++) {
-    data_[i] /= v.data_[i];
+  Real *data = data_;
+  MatrixIndexT dim = dim_, v_stride = v.stride_, stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    data[i * stride] /= v.data_[i * v_stride];  // use the cached pointer declared above.
   }
 }
 
@@ -1029,8 +1077,9 @@ template<typename OtherReal>
 void VectorBase<Real>::DivElements(const VectorBase<OtherReal> &v) {
   KALDI_ASSERT(dim_ == v.Dim());
   const OtherReal *other_ptr = v.Data();
-  for (MatrixIndexT i = 0; i < dim_; i++) {
-    data_[i] /= other_ptr[i];
+  MatrixIndexT dim = dim_, v_stride = v.Stride(), stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    data_[i * stride] /= other_ptr[i * v_stride];  // no local 'data' is declared in this function.
   }
 }
 // instantiate template.
@@ -1043,8 +1092,11 @@ template<typename Real>
 void VectorBase<Real>::AddVecDivVec(Real alpha, const VectorBase<Real> &v,
                                     const VectorBase<Real> &rr, Real beta) {
   KALDI_ASSERT((dim_ == v.dim_ && dim_ == rr.dim_));
-  for (MatrixIndexT i = 0; i < dim_; i++) {
-    data_[i] = alpha * v.data_[i]/rr.data_[i] + beta * data_[i] ;
+  Real *data = data_, *v_data = v.data_, *rr_data = rr.data_;
+  MatrixIndexT dim = dim_, v_stride = v.stride_, rr_stride = rr.stride_, stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    data[i * stride] = alpha * v_data[i * v_stride]/rr_data[i * rr_stride] +
+      beta * data[i * stride];
   }
 }
 
@@ -1055,13 +1107,13 @@ void VectorBase<Real>::AddVec(const Real alpha, const VectorBase<OtherReal> &v)
   // remove __restrict__ if it causes compilation problems.
   Real *__restrict__ data = data_;
   OtherReal *__restrict__ other_data = v.data_;
-  MatrixIndexT dim = dim_;
+  MatrixIndexT dim = dim_, v_stride = v.stride_, stride = stride_;
   if (alpha != 1.0)
     for (MatrixIndexT i = 0; i < dim; i++)
-      data[i] += alpha * other_data[i];
+      data[i * stride] += alpha * other_data[i * v_stride];
   else
     for (MatrixIndexT i = 0; i < dim; i++)
-      data[i] += other_data[i];
+      data[i * stride] += other_data[i * v_stride];
 }
 
 template
@@ -1076,13 +1128,13 @@ void VectorBase<Real>::AddVec2(const Real alpha, const VectorBase<OtherReal> &v)
   // remove __restrict__ if it causes compilation problems.
   Real *__restrict__ data = data_;
   OtherReal *__restrict__ other_data = v.data_;
-  MatrixIndexT dim = dim_;
+  MatrixIndexT dim = dim_, v_stride = v.stride_, stride = stride_;
   if (alpha != 1.0)
     for (MatrixIndexT i = 0; i < dim; i++)
-      data[i] += alpha * other_data[i] * other_data[i];
+      data[i * stride] += alpha * other_data[i * v_stride] * other_data[i * v_stride];
   else
     for (MatrixIndexT i = 0; i < dim; i++)
-      data[i] += other_data[i] * other_data[i];
+      data[i * stride] += other_data[i * v_stride] * other_data[i * v_stride];
 }
 
 template
@@ -1194,7 +1246,7 @@ void Vector<Real>::Read(std::istream & is,  bool binary, bool add) {
         is.get();  // eat the ']'
         this->Resize(data.size());
         for (size_t j = 0; j < data.size(); j++)
-          this->data_[j] = data[j];
+          this->data_[j * this->stride_] = data[j];  // 'data' is the local buffer of parsed values; it has no stride.
         i = is.peek();
         if (static_cast<char>(i) == '\r') {
           is.get();
@@ -1250,9 +1302,10 @@ void VectorBase<Real>::Write(std::ostream & os, bool binary) const {
     WriteBasicType(os, binary, size);
     os.write(reinterpret_cast<const char*>(Data()), sizeof(Real) * size);
   } else {
+    MatrixIndexT stride = this->stride_;
     os << " [ ";
     for (MatrixIndexT i = 0; i < Dim(); i++)
-      os << (*this)(i) << " ";
+      os << Data()[i * stride] << " ";  // index raw storage; operator() expects a logical index < Dim().
     os << "]\n";
   }
   if (!os.good())
@@ -1263,8 +1316,10 @@ void VectorBase<Real>::Write(std::ostream & os, bool binary) const {
 template<typename Real>
 void VectorBase<Real>::AddVec2(const Real alpha, const VectorBase<Real> &v) {
   KALDI_ASSERT(dim_ == v.dim_);
-  for (MatrixIndexT i = 0; i < dim_; i++)
-    data_[i] += alpha * v.data_[i] * v.data_[i];
+  Real *data = data_, *v_data = v.data_;  // both are pointers; 'data' comes from the member data_.
+  MatrixIndexT dim = dim_, v_stride = v.stride_, stride = stride_;
+  for (MatrixIndexT i = 0; i < dim; i++)
+    data[i * stride] += alpha * v_data[i * v_stride] * v_data[i * v_stride];
 }
 
 // this <-- beta*this + alpha*M*v.
@@ -1316,19 +1371,19 @@ void VectorBase<Real>::AddDiagMat2(
   if (trans == kNoTrans) {
     KALDI_ASSERT(this->dim_ == M.NumRows());
     MatrixIndexT rows = this->dim_, cols = M.NumCols(),
-           mat_stride = M.Stride();
+      mat_stride = M.Stride(), stride = this->stride_;
     Real *data = this->data_;
     const Real *mat_data = M.Data();
-    for (MatrixIndexT i = 0; i < rows; i++, mat_data += mat_stride, data++)
+    for (MatrixIndexT i = 0; i < rows; i++, mat_data += mat_stride, data += stride)
       *data = beta * *data + alpha * cblas_Xdot(cols, mat_data, 1,
                                                 mat_data, 1);
   } else {
     KALDI_ASSERT(this->dim_ == M.NumCols());
     MatrixIndexT rows = M.NumRows(), cols = this->dim_,
-           mat_stride = M.Stride();
+      mat_stride = M.Stride(), stride = stride_;
     Real *data = this->data_;
     const Real *mat_data = M.Data();
-    for (MatrixIndexT i = 0; i < cols; i++, mat_data++, data++)
+    for (MatrixIndexT i = 0; i < cols; i++, mat_data++, data += stride)
       *data = beta * *data + alpha * cblas_Xdot(rows, mat_data, mat_stride,
                                                  mat_data, mat_stride);
   }
@@ -1348,10 +1403,11 @@ void VectorBase<Real>::AddDiagMatMat(
   if (transM == kTrans) std::swap(M_row_stride, M_col_stride);
   MatrixIndexT N_row_stride = N.Stride(), N_col_stride = 1;
   if (transN == kTrans) std::swap(N_row_stride, N_col_stride);
+  MatrixIndexT stride = this->stride_;
 
   Real *data = this->data_;
   const Real *Mdata = M.Data(), *Ndata = N.Data();
-  for (MatrixIndexT i = 0; i < dim; i++, Mdata += M_row_stride, Ndata += N_col_stride, data++) {
+  for (MatrixIndexT i = 0; i < dim; i++, Mdata += M_row_stride, Ndata += N_col_stride, data += stride) {
     *data = beta * *data + alpha * cblas_Xdot(M_col_dim, Mdata, M_col_stride, Ndata, N_row_stride);
   }
 }

From c4a326e2b8677b84189d30066d4c11706f4e2624 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 20 Mar 2019 16:57:01 -0400
Subject: [PATCH 009/163] Some cleanups in matrix/; more work on tensor draft
 (#3150)

---
 src/featbin/Makefile                |   3 +-
 src/featbin/fmpe-acc-stats.cc       | 108 ----------
 src/featbin/fmpe-apply-transform.cc |  89 ---------
 src/featbin/fmpe-est.cc             |  67 -------
 src/featbin/fmpe-init.cc            |  63 ------
 src/featbin/fmpe-sum-accs.cc        |  63 ------
 src/hmm/transitions.h               |   2 +-
 src/matrix/cblas-wrappers.h         |  60 +++---
 src/matrix/kaldi-matrix.cc          |  17 +-
 src/matrix/kaldi-matrix.h           |  10 +-
 src/matrix/kaldi-vector.cc          | 298 +++++++++++-----------------
 src/matrix/matrix-common.h          |  10 +-
 src/matrix/optimization.cc          |  60 +++---
 src/matrix/optimization.h           |  37 ++--
 src/matrix/qr.cc                    |  26 +--
 src/matrix/sparse-matrix-test.cc    |   2 +-
 src/matrix/sparse-matrix.cc         |  64 +++---
 src/matrix/sparse-matrix.h          |  18 +-
 src/tensor/array-ref.h              |  63 +++++-
 src/tensor/storage.h                |  59 ++++--
 src/tensor/tensor-common.h          |  49 ++++-
 src/tensor/tensor-functions.h       |  26 ++-
 src/tensor/tensor-pattern.cc        |  99 ++++++++-
 src/tensor/tensor-pattern.h         | 270 +++++--------------------
 src/tensor/tensor.h                 | 152 +++++---------
 25 files changed, 621 insertions(+), 1094 deletions(-)
 delete mode 100644 src/featbin/fmpe-acc-stats.cc
 delete mode 100644 src/featbin/fmpe-apply-transform.cc
 delete mode 100644 src/featbin/fmpe-est.cc
 delete mode 100644 src/featbin/fmpe-init.cc
 delete mode 100644 src/featbin/fmpe-sum-accs.cc

diff --git a/src/featbin/Makefile b/src/featbin/Makefile
index 1067244b2db..09de6c2fb19 100644
--- a/src/featbin/Makefile
+++ b/src/featbin/Makefile
@@ -11,8 +11,7 @@ BINFILES = add-deltas add-deltas-sdc append-post-to-feats \
            concat-feats copy-feats \
            copy-feats-to-htk copy-feats-to-sphinx extend-transform-dim \
            extract-feature-segments extract-segments feat-to-dim \
-           feat-to-len fmpe-acc-stats fmpe-apply-transform fmpe-est \
-           fmpe-init fmpe-sum-accs get-full-lda-mat interpolate-pitch \
+           feat-to-len get-full-lda-mat interpolate-pitch \
            modify-cmvn-stats paste-feats post-to-feats \
            process-kaldi-pitch-feats process-pitch-feats \
            select-feats shift-feats splice-feats subsample-feats \
diff --git a/src/featbin/fmpe-acc-stats.cc b/src/featbin/fmpe-acc-stats.cc
deleted file mode 100644
index c69e95b6b59..00000000000
--- a/src/featbin/fmpe-acc-stats.cc
+++ /dev/null
@@ -1,108 +0,0 @@
-// featbin/fmpe-acc-stats.cc
-
-// Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "transform/fmpe.h"
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  using kaldi::int32;
-  try {
-    const char *usage =
-        "Compute statistics for fMPE training\n"
-        "Usage:  fmpe-acc-stats [options...] <fmpe-object> "
-        "<feat-rspecifier> <feat-diff-rspecifier> <gselect-rspecifier> <stats-out>\n"
-        "Note: gmm-fmpe-acc-stats avoids computing the features an extra time\n";
-
-    ParseOptions po(usage);
-    bool binary = true;
-    po.Register("binary", &binary, "If true, output stats in binary mode.");
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 5) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string fmpe_rxfilename = po.GetArg(1),
-        feat_rspecifier = po.GetArg(2),
-        feat_diff_rspecifier = po.GetArg(3),
-        gselect_rspecifier = po.GetArg(4),
-        stats_wxfilename = po.GetArg(5);
-    
-    Fmpe fmpe;
-    ReadKaldiObject(fmpe_rxfilename, &fmpe);
-
-    SequentialBaseFloatMatrixReader feat_reader(feat_rspecifier);
-    RandomAccessBaseFloatMatrixReader diff_reader(feat_diff_rspecifier);
-    RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
-
-    // fmpe stats...
-    FmpeStats fmpe_stats(fmpe);
-
-    int32 num_done = 0, num_err = 0;
-    
-    for (; !feat_reader.Done(); feat_reader.Next()) {
-      std::string key = feat_reader.Key();
-      const Matrix<BaseFloat> feat_in(feat_reader.Value());
-      if (!gselect_reader.HasKey(key)) {
-        KALDI_WARN << "No gselect information for key " << key;
-        num_err++;
-        continue;
-      }
-      const std::vector<std::vector<int32> > &gselect =
-          gselect_reader.Value(key);
-      if (static_cast<int32>(gselect.size()) != feat_in.NumRows()) {
-        KALDI_WARN << "gselect information has wrong size";
-        num_err++;
-        continue;
-      }
-      if (!diff_reader.HasKey(key)) {
-        KALDI_WARN << "No gradient information for key " << key;
-        num_err++;
-        continue;
-      }
-      const Matrix<BaseFloat> &feat_deriv = diff_reader.Value(key);
-
-      if (feat_deriv.NumCols() == feat_in.NumCols()) { // Only direct derivative.
-        fmpe.AccStats(feat_in, gselect, feat_deriv, NULL, &fmpe_stats);
-      } else if (feat_deriv.NumCols() == feat_in.NumCols() * 2) { // +indirect.
-        SubMatrix<BaseFloat> direct_deriv(feat_deriv, 0, feat_deriv.NumRows(),
-                                          0, feat_in.NumCols()),
-            indirect_deriv(feat_deriv, 0, feat_deriv.NumRows(),
-                           feat_in.NumCols(), feat_in.NumCols());
-        fmpe.AccStats(feat_in, gselect, direct_deriv, &indirect_deriv, &fmpe_stats);
-      } else {
-        KALDI_ERR << "Mismatch in dimension of feature derivative.";
-      }
-      num_done++;
-    }
-
-    KALDI_LOG << " Done " << num_done << " utterances, " << num_err
-              << " had errors.";
-
-    WriteKaldiObject(fmpe_stats, stats_wxfilename, binary);
-    
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/featbin/fmpe-apply-transform.cc b/src/featbin/fmpe-apply-transform.cc
deleted file mode 100644
index 9473e5f287b..00000000000
--- a/src/featbin/fmpe-apply-transform.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-// featbin/fmpe-apply-transform.cc
-
-// Copyright 2012  Johns Hopkins University (Author: Daniel Povey)  Yanmin Qian
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "transform/fmpe.h"
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  using kaldi::int32;
-  try {
-    const char *usage =
-        "Apply fMPE transform to features\n"
-        "Usage:  fmpe-apply-transform [options...] <fmpe-object> "
-        "<feat-rspecifier> <gselect-rspecifier> <feat-wspecifier>\n";
-
-    ParseOptions po(usage);
-    bool add_to_features = true;
-    po.Register("add-to-features", &add_to_features, "If true, add original "
-                "features to fMPE offsets (false useful for diagnostics)");
-    // no non-default options.
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 4) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string fmpe_rxfilename = po.GetArg(1),
-        feat_rspecifier = po.GetArg(2),
-        gselect_rspecifier = po.GetArg(3),
-        feat_wspecifier = po.GetArg(4);
-    
-    Fmpe fmpe;
-    ReadKaldiObject(fmpe_rxfilename, &fmpe);
-
-    SequentialBaseFloatMatrixReader feat_reader(feat_rspecifier);
-    RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
-    BaseFloatMatrixWriter feat_writer(feat_wspecifier);
-
-    int32 num_done = 0, num_err = 0;
-    
-    for (; !feat_reader.Done(); feat_reader.Next()) {
-      std::string key = feat_reader.Key();
-      const Matrix<BaseFloat> feat_in(feat_reader.Value());
-      if (!gselect_reader.HasKey(key)) {
-        KALDI_WARN << "No gselect information for key " << key;
-        num_err++;
-        continue;
-      }
-      const std::vector<std::vector<int32> > &gselect =
-          gselect_reader.Value(key);
-      if (static_cast<int32>(gselect.size()) != feat_in.NumRows()) {
-        KALDI_WARN << "gselect information has wrong size";
-        num_err++;
-        continue;
-      }
-      Matrix<BaseFloat> feat_out(feat_in.NumRows(), feat_in.NumCols());
-      fmpe.ComputeFeatures(feat_in, gselect, &feat_out);
-      if (add_to_features) // feat_out += feat_in.
-        feat_out.AddMat(1.0, feat_in, kNoTrans);
-
-      feat_writer.Write(key, feat_out);
-      num_done++;
-    }
-    KALDI_LOG << " Done " << num_done << " utterances, " << num_err
-              << " had errors.";
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/featbin/fmpe-est.cc b/src/featbin/fmpe-est.cc
deleted file mode 100644
index 76463c32782..00000000000
--- a/src/featbin/fmpe-est.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-// featbin/fmpe-est.cc
-
-// Copyright 2012  Johns Hopkins University (Author: Daniel Povey)  Yanmin Qian
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "transform/fmpe.h"
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  try {
-    const char *usage =
-        "Do one iteration of learning (modified gradient descent)\n"
-        "on fMPE transform\n"
-        "Usage: fmpe-est [options...] <fmpe-in> <stats-in> <fmpe-out>\n"
-        "E.g. fmpe-est 1.fmpe 1.accs 2.fmpe\n";
-
-    ParseOptions po(usage);
-    FmpeUpdateOptions opts;
-    bool binary = true;
-    po.Register("binary", &binary, "If true, output fMPE object in "
-                "binary mode.");
-    opts.Register(&po);
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string fmpe_rxfilename = po.GetArg(1),
-        stats_rxfilename = po.GetArg(2),
-        fmpe_wxfilename = po.GetArg(3);
-
-    Fmpe fmpe;
-    ReadKaldiObject(fmpe_rxfilename, &fmpe);
-    FmpeStats stats;
-    ReadKaldiObject(stats_rxfilename, &stats);
-
-    stats.DoChecks(); // checks certain checksums.
-    fmpe.Update(opts, stats);
-
-    WriteKaldiObject(fmpe, fmpe_wxfilename, binary);
-
-    KALDI_LOG << "Updated fMPE object and wrote to "
-              << fmpe_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/featbin/fmpe-init.cc b/src/featbin/fmpe-init.cc
deleted file mode 100644
index 5f4455f44fc..00000000000
--- a/src/featbin/fmpe-init.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-// featbin/fmpe-init.cc
-
-// Copyright 2012  Johns Hopkins University (Author: Daniel Povey)  Yanmin Qian
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "transform/fmpe.h"
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  try {
-    const char *usage =
-        "Initialize fMPE transform (to zero)\n"
-        "Usage: fmpe-init [options...] <diag-gmm-in> <fmpe-out>\n"
-        "E.g. fmpe-init 1.ubm 1.fmpe\n";
-
-    ParseOptions po(usage);
-    FmpeOptions opts;
-    bool binary = true;
-    po.Register("binary", &binary, "If true, output fMPE object in binary mode.");
-    opts.Register(&po);
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string dgmm_rxfilename = po.GetArg(1),
-        fmpe_wxfilename = po.GetArg(2);
-
-    DiagGmm dgmm;
-    ReadKaldiObject(dgmm_rxfilename, &dgmm);
-    
-    
-    Fmpe fmpe(dgmm, opts);
-
-    Output ko(fmpe_wxfilename, binary);
-    fmpe.Write(ko.Stream(), binary);
-
-    KALDI_LOG << "Initialized fMPE object and wrote to "
-              << fmpe_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/featbin/fmpe-sum-accs.cc b/src/featbin/fmpe-sum-accs.cc
deleted file mode 100644
index e2976abe5ff..00000000000
--- a/src/featbin/fmpe-sum-accs.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-// featbin/fmpe-sum-accs.cc
-
-// Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "transform/fmpe.h"
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  using kaldi::int32;
-  try {
-    const char *usage =
-        "Sum fMPE stats\n"
-        "Usage: fmpe-sum-accs [options...] <accs-out> <stats-in1> <stats-in2> ... \n"
-        "E.g. fmpe-sum-accs 1.accs 1.1.accs 1.2.accs 1.3.accs 1.4.accs\n";
-
-    ParseOptions po(usage);
-    bool binary = true;
-    po.Register("binary", &binary, "If true, output fMPE stats in "
-                "binary mode.");
-    po.Read(argc, argv);
-
-    if (po.NumArgs() < 2) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string stats_wxfilename = po.GetArg(1);
-
-    FmpeStats stats;
-    for (int32 arg = 2; arg <= po.NumArgs(); arg++) {
-      std::string stats_rxfilename = po.GetArg(arg);
-      bool binary;
-      Input ki(stats_rxfilename, &binary);
-      stats.Read(ki.Stream(), binary, true); // true == sum accs.
-    }
-
-    WriteKaldiObject(stats, stats_wxfilename, binary);
-    
-    KALDI_LOG << "Summed " << (po.NumArgs()-1) << " fMPE stats and wrote to "
-              << stats_wxfilename;
-    return 0;
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
diff --git a/src/hmm/transitions.h b/src/hmm/transitions.h
index b446e4cc6c4..72e3b62691c 100644
--- a/src/hmm/transitions.h
+++ b/src/hmm/transitions.h
@@ -208,7 +208,7 @@ class Transitions {
   /// Information about transition-ids, indexed by transition-id.
   /// the tuples are in sorted order which allows us to do the reverse mapping from
   /// tuple to transition state
-  std::vector<const TransitionIdInfo> info_;
+  std::vector<TransitionIdInfo> info_;
 
 
   /// Accessing pdf_ids_[i] allows us to look up info_[i].pdf_id in a way that
diff --git a/src/matrix/cblas-wrappers.h b/src/matrix/cblas-wrappers.h
index f869ab7e078..2a4e774a9ce 100644
--- a/src/matrix/cblas-wrappers.h
+++ b/src/matrix/cblas-wrappers.h
@@ -83,20 +83,10 @@ inline void cblas_Xscal(const int N, const float alpha, float *data,
                         const int inc) {
   cblas_sscal(N, alpha, data, inc);
 }
-inline void cblas_Xscal(const int N, const double alpha, double *data, 
+inline void cblas_Xscal(const int N, const double alpha, double *data,
                         const int inc) {
   cblas_dscal(N, alpha, data, inc);
 }
-inline void cblas_Xspmv(const float alpha, const int num_rows, const float *Mdata,
-                        const float *v, const int v_inc,
-                        const float beta, float *y, const int y_inc) {
-  cblas_sspmv(CblasRowMajor, CblasLower, num_rows, alpha, Mdata, v, v_inc, beta, y, y_inc);
-}
-inline void cblas_Xspmv(const double alpha, const int num_rows, const double *Mdata,
-                        const double *v, const int v_inc,
-                        const double beta, double *y, const int y_inc) {
-  cblas_dspmv(CblasRowMajor, CblasLower, num_rows, alpha, Mdata, v, v_inc, beta, y, y_inc);
-}
 inline void cblas_Xtpmv(MatrixTransposeType trans, const float *Mdata,
                         const int num_rows, float *y, const int y_inc) {
   cblas_stpmv(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
@@ -208,7 +198,7 @@ inline void Xgemv_sparsevec(MatrixTransposeType trans, MatrixIndexT num_rows,
       if (x_i == 0.0) continue;
       // Add to ydata, the i'th column of M, times alpha * x_i
       cblas_Xaxpy(num_rows, x_i * alpha, Mdata + i, stride, ydata, incY);
-    }    
+    }
   } else {
     if (beta != 1.0) cblas_Xscal(num_cols, beta, ydata, incY);
     for (MatrixIndexT i = 0; i < num_rows; i++) {
@@ -225,31 +215,31 @@ inline void cblas_Xgemm(const float alpha,
                         MatrixTransposeType transA,
                         const float *Adata,
                         MatrixIndexT a_num_rows, MatrixIndexT a_num_cols, MatrixIndexT a_stride,
-                        MatrixTransposeType transB, 
+                        MatrixTransposeType transB,
                         const float *Bdata, MatrixIndexT b_stride,
                         const float beta,
-                        float *Mdata, 
+                        float *Mdata,
                         MatrixIndexT num_rows, MatrixIndexT num_cols,MatrixIndexT stride) {
-  cblas_sgemm(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(transA), 
+  cblas_sgemm(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(transA),
               static_cast<CBLAS_TRANSPOSE>(transB),
               num_rows, num_cols, transA == kNoTrans ? a_num_cols : a_num_rows,
               alpha, Adata, a_stride, Bdata, b_stride,
-              beta, Mdata, stride); 
+              beta, Mdata, stride);
 }
 inline void cblas_Xgemm(const double alpha,
                         MatrixTransposeType transA,
                         const double *Adata,
                         MatrixIndexT a_num_rows, MatrixIndexT a_num_cols, MatrixIndexT a_stride,
-                        MatrixTransposeType transB, 
+                        MatrixTransposeType transB,
                         const double *Bdata, MatrixIndexT b_stride,
                         const double beta,
-                        double *Mdata, 
+                        double *Mdata,
                         MatrixIndexT num_rows, MatrixIndexT num_cols,MatrixIndexT stride) {
-  cblas_dgemm(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(transA), 
+  cblas_dgemm(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(transA),
               static_cast<CBLAS_TRANSPOSE>(transB),
               num_rows, num_cols, transA == kNoTrans ? a_num_cols : a_num_rows,
               alpha, Adata, a_stride, Bdata, b_stride,
-              beta, Mdata, stride); 
+              beta, Mdata, stride);
 }
 
 
@@ -388,26 +378,26 @@ inline void clapack_Xtptri(KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *r
 inline void clapack_Xtptri(KaldiBlasInt *num_rows, double *Mdata, KaldiBlasInt *result) {
   dtptri_(const_cast<char *>("U"), const_cast<char *>("N"), num_rows, Mdata, result);
 }
-// 
-inline void clapack_Xgetrf2(KaldiBlasInt *num_rows, KaldiBlasInt *num_cols, 
-                            float *Mdata, KaldiBlasInt *stride, KaldiBlasInt *pivot, 
+//
+inline void clapack_Xgetrf2(KaldiBlasInt *num_rows, KaldiBlasInt *num_cols,
+                            float *Mdata, KaldiBlasInt *stride, KaldiBlasInt *pivot,
                             KaldiBlasInt *result) {
   sgetrf_(num_rows, num_cols, Mdata, stride, pivot, result);
 }
-inline void clapack_Xgetrf2(KaldiBlasInt *num_rows, KaldiBlasInt *num_cols, 
-                            double *Mdata, KaldiBlasInt *stride, KaldiBlasInt *pivot, 
+inline void clapack_Xgetrf2(KaldiBlasInt *num_rows, KaldiBlasInt *num_cols,
+                            double *Mdata, KaldiBlasInt *stride, KaldiBlasInt *pivot,
                             KaldiBlasInt *result) {
   dgetrf_(num_rows, num_cols, Mdata, stride, pivot, result);
 }
 
-// 
+//
 inline void clapack_Xgetri2(KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *stride,
-                           KaldiBlasInt *pivot, float *p_work, 
+                           KaldiBlasInt *pivot, float *p_work,
                            KaldiBlasInt *l_work, KaldiBlasInt *result) {
   sgetri_(num_rows, Mdata, stride, pivot, p_work, l_work, result);
 }
 inline void clapack_Xgetri2(KaldiBlasInt *num_rows, double *Mdata, KaldiBlasInt *stride,
-                           KaldiBlasInt *pivot, double *p_work, 
+                           KaldiBlasInt *pivot, double *p_work,
                            KaldiBlasInt *l_work, KaldiBlasInt *result) {
   dgetri_(num_rows, Mdata, stride, pivot, p_work, l_work, result);
 }
@@ -419,8 +409,8 @@ inline void clapack_Xgesvd(char *v, char *u, KaldiBlasInt *num_cols,
                            KaldiBlasInt *l_work, KaldiBlasInt *result) {
   sgesvd_(v, u,
           num_cols, num_rows, Mdata, stride,
-          sv, Vdata, vstride, Udata, ustride, 
-          p_work, l_work, result); 
+          sv, Vdata, vstride, Udata, ustride,
+          p_work, l_work, result);
 }
 inline void clapack_Xgesvd(char *v, char *u, KaldiBlasInt *num_cols,
                            KaldiBlasInt *num_rows, double *Mdata, KaldiBlasInt *stride,
@@ -430,14 +420,14 @@ inline void clapack_Xgesvd(char *v, char *u, KaldiBlasInt *num_cols,
   dgesvd_(v, u,
           num_cols, num_rows, Mdata, stride,
           sv, Vdata, vstride, Udata, ustride,
-          p_work, l_work, result); 
+          p_work, l_work, result);
 }
 //
-void inline clapack_Xsptri(KaldiBlasInt *num_rows, float *Mdata, 
+void inline clapack_Xsptri(KaldiBlasInt *num_rows, float *Mdata,
                            KaldiBlasInt *ipiv, float *work, KaldiBlasInt *result) {
   ssptri_(const_cast<char *>("U"), num_rows, Mdata, ipiv, work, result);
 }
-void inline clapack_Xsptri(KaldiBlasInt *num_rows, double *Mdata, 
+void inline clapack_Xsptri(KaldiBlasInt *num_rows, double *Mdata,
                            KaldiBlasInt *ipiv, double *work, KaldiBlasInt *result) {
   dsptri_(const_cast<char *>("U"), num_rows, Mdata, ipiv, work, result);
 }
@@ -452,14 +442,14 @@ void inline clapack_Xsptrf(KaldiBlasInt *num_rows, double *Mdata,
 }
 #else
 inline void clapack_Xgetrf(MatrixIndexT num_rows, MatrixIndexT num_cols,
-                           float *Mdata, MatrixIndexT stride, 
+                           float *Mdata, MatrixIndexT stride,
                            int *pivot, int *result) {
   *result = clapack_sgetrf(CblasColMajor, num_rows, num_cols,
                               Mdata, stride, pivot);
 }
 
 inline void clapack_Xgetrf(MatrixIndexT num_rows, MatrixIndexT num_cols,
-                           double *Mdata, MatrixIndexT stride, 
+                           double *Mdata, MatrixIndexT stride,
                            int *pivot, int *result) {
   *result = clapack_dgetrf(CblasColMajor, num_rows, num_cols,
                               Mdata, stride, pivot);
diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc
index d70ac5cefc8..bb1f164441a 100644
--- a/src/matrix/kaldi-matrix.cc
+++ b/src/matrix/kaldi-matrix.cc
@@ -2786,7 +2786,7 @@ void MatrixBase<Real>::GroupMax(const MatrixBase<Real> &src) {
 
 template<typename Real>
 void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
-                                const MatrixIndexT *indices) {
+                                const int32 *indices) {
   KALDI_ASSERT(NumRows() == src.NumRows());
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
       this_stride = stride_, src_stride = src.stride_;
@@ -2801,7 +2801,7 @@ void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
   // For the sake of memory locality we do this row by row, rather
   // than doing it column-wise using cublas_Xcopy
   for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride, src_data += src_stride) {
-    const MatrixIndexT *index_ptr = &(indices[0]);
+    const int32 *index_ptr = &(indices[0]);
     for (MatrixIndexT c = 0; c < num_cols; c++, index_ptr++) {
       if (*index_ptr < 0) this_data[c] = 0;
       else this_data[c] = src_data[*index_ptr];
@@ -2812,7 +2812,7 @@ void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
 
 template<typename Real>
 void MatrixBase<Real>::AddCols(const MatrixBase<Real> &src,
-                               const MatrixIndexT *indices) {
+                               const int32 *indices) {
   KALDI_ASSERT(NumRows() == src.NumRows());
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
       this_stride = stride_, src_stride = src.stride_;
@@ -2826,8 +2826,9 @@ void MatrixBase<Real>::AddCols(const MatrixBase<Real> &src,
 
   // For the sake of memory locality we do this row by row, rather
   // than doing it column-wise using cublas_Xcopy
-  for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride, src_data += src_stride) {
-    const MatrixIndexT *index_ptr = &(indices[0]);
+  for (MatrixIndexT r = 0; r < num_rows;
+       r++, this_data += this_stride, src_data += src_stride) {
+    const int32 *index_ptr = indices;
     for (MatrixIndexT c = 0; c < num_cols; c++, index_ptr++) {
       if (*index_ptr >= 0)
         this_data[c] += src_data[*index_ptr];
@@ -2837,7 +2838,7 @@ void MatrixBase<Real>::AddCols(const MatrixBase<Real> &src,
 
 template<typename Real>
 void MatrixBase<Real>::CopyRows(const MatrixBase<Real> &src,
-                                const MatrixIndexT *indices) {
+                                const int32 *indices) {
   KALDI_ASSERT(NumCols() == src.NumCols());
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
       this_stride = stride_;
@@ -2879,7 +2880,7 @@ void MatrixBase<Real>::CopyToRows(Real *const *dst) const {
 template<typename Real>
 void MatrixBase<Real>::AddRows(Real alpha,
                                const MatrixBase<Real> &src,
-                               const MatrixIndexT *indexes) {
+                               const int32 *indexes) {
   KALDI_ASSERT(NumCols() == src.NumCols());
   MatrixIndexT num_rows = num_rows_,
       num_cols = num_cols_, this_stride = stride_;
@@ -2908,7 +2909,7 @@ void MatrixBase<Real>::AddRows(Real alpha, const Real *const *src) {
 
 template<typename Real>
 void MatrixBase<Real>::AddToRows(Real alpha,
-                                 const MatrixIndexT *indexes,
+                                 const int32 *indexes,
                                  MatrixBase<Real> *dst) const {
   KALDI_ASSERT(NumCols() == dst->NumCols());
   MatrixIndexT num_rows = num_rows_,
diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h
index 4b06a22ece9..e5b2a658054 100644
--- a/src/matrix/kaldi-matrix.h
+++ b/src/matrix/kaldi-matrix.h
@@ -283,14 +283,14 @@ class MatrixBase {
   /// all elements of "indices" must be in [-1, src.NumCols()-1],
   /// and src.NumRows() must equal this.NumRows()
   void CopyCols(const MatrixBase<Real> &src,
-                const MatrixIndexT *indices);
+                const int32 *indices);
 
   /// Copies row r from row indices[r] of src (does nothing
   /// As a special case, if indexes[i] == -1, sets row i to zero.
   /// all elements of "indices" must be in [-1, src.NumRows()-1],
   /// and src.NumCols() must equal this.NumCols()
   void CopyRows(const MatrixBase<Real> &src,
-                const MatrixIndexT *indices);
+                const int32 *indices);
 
   /// Add column indices[r] of src to column r.
   /// As a special case, if indexes[i] == -1, skip column i
@@ -298,7 +298,7 @@ class MatrixBase {
   /// all elements of "reorder" must be in [-1, src.NumCols()-1],
   /// and src.NumRows() must equal this.NumRows()
   void AddCols(const MatrixBase<Real> &src,
-               const MatrixIndexT *indices);
+               const int32 *indices);
 
   /// Copies row r of this matrix from an array of floats at the location given
   /// by src[r]. If any src[r] is NULL then this.Row(r) will be set to zero.
@@ -317,7 +317,7 @@ class MatrixBase {
   /// be in [-1, src.NumRows()-1], and src.NumCols() must equal this.NumCols().
   void AddRows(Real alpha,
                const MatrixBase<Real> &src,
-               const MatrixIndexT *indexes);
+               const int32 *indexes);
 
   /// Does for each row r, this.Row(r) += alpha * src[r], treating src[r] as the
   /// beginning of a region of memory representing a vector of floats, of the
@@ -336,7 +336,7 @@ class MatrixBase {
   /// Requires that all the indexes[i] that are >= 0
   /// be distinct, otherwise the behavior is undefined.
   void AddToRows(Real alpha,
-                 const MatrixIndexT *indexes,
+                 const int32 *indexes,
                  MatrixBase<Real> *dst) const;
 
   /// Applies floor to all matrix elements
diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc
index de6e626289e..6caa49376c5 100644
--- a/src/matrix/kaldi-vector.cc
+++ b/src/matrix/kaldi-vector.cc
@@ -134,7 +134,7 @@ void VectorBase<Real>::AddSpVec(const Real alpha,
                                 const Real beta) {
   KALDI_ASSERT(M.NumRows() == v.dim_ && dim_ == v.dim_);
   KALDI_ASSERT(&v != this);
-  cblas_Xspmv(alpha, M.NumRows(), M.Data(), v.Data(), v.stride_,
+  cblas_Xspmv(M.NumRows(), alpha, M.Data(), v.Data(), v.stride_,
               beta, data_, stride_);
 }
 
@@ -241,10 +241,9 @@ template<typename OtherReal>
 void VectorBase<Real>::CopyFromVec(const VectorBase<OtherReal> &other) {
   KALDI_ASSERT(dim_ == other.Dim());
   Real * __restrict__  ptr = data_;
-  MatrixIndexT dim = dim_, stride = stride_, other_stride = other.Stride();
   const OtherReal * __restrict__ other_ptr = other.Data();
-  for (MatrixIndexT i = 0; i < dim; i++)
-    ptr[i * stride] = other_ptr[i * other_stride];
+  for (MatrixIndexT i = 0; i < dim_; i++)
+    ptr[i] = other_ptr[i];
 }
 
 template void VectorBase<float>::CopyFromVec(const VectorBase<double> &other);
@@ -254,11 +253,9 @@ template void VectorBase<double>::CopyFromVec(const VectorBase<float> &other);
 template<typename Real>
 void Vector<Real>::RemoveElement(MatrixIndexT i) {
   KALDI_ASSERT(i <  this->dim_ && "Access out of vector");
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
-  for (MatrixIndexT j = i + 1; j <  dim; j++)
-    data[(j-1) * stride] =  data[j * stride];
-  dim_--;
+  for (MatrixIndexT j = i + 1; j <  this->dim_; j++)
+    this->data_[j-1] =  this->data_[j];
+  this->dim_--;
 }
 
 
@@ -280,10 +277,8 @@ void VectorBase<Real>::SetZero() {
 template<typename Real>
 bool VectorBase<Real>::IsZero(Real cutoff) const {
   Real abs_max = 0.0;
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++)
-    abs_max = std::max(std::abs(data[i * stride]), abs_max);
+  for (MatrixIndexT i = 0; i < Dim(); i++)
+    abs_max = std::max(std::abs(data_[i]), abs_max);
   return (abs_max <= cutoff);
 }
 
@@ -291,21 +286,17 @@ template<typename Real>
 void VectorBase<Real>::SetRandn() {
   kaldi::RandomState rstate;
   MatrixIndexT last = (Dim() % 2 == 1) ? Dim() - 1 : Dim();
-  Real *data = data_;
-  MatrixIndexT stride = stride_;
   for (MatrixIndexT i = 0; i < last; i += 2) {
-    kaldi::RandGauss2(data + i * stride, data + (i + 1)*stride, &rstate);
+    kaldi::RandGauss2(data_ + i, data_ + i +1, &rstate);
   }
-  if (Dim() != last) data[last * stride] = static_cast<Real>(kaldi::RandGauss(&rstate));
+  if (Dim() != last) data_[last] = static_cast<Real>(kaldi::RandGauss(&rstate));
 }
 
 template<typename Real>
 void VectorBase<Real>::SetRandUniform() {
   kaldi::RandomState rstate;
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    data[i * stride] = RandUniform(&rstate);
+  for (MatrixIndexT i = 0; i < Dim(); i++) {
+    *(data_+i) = RandUniform(&rstate);
   }
 }
 
@@ -315,11 +306,11 @@ MatrixIndexT VectorBase<Real>::RandCategorical() const {
   Real sum = this->Sum();
   KALDI_ASSERT(this->Min() >= 0.0 && sum > 0.0);
   Real r = RandUniform(&rstate) * sum;
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
+  Real *data = this->data_;
+  MatrixIndexT dim = this->dim_;
   Real running_sum = 0.0;
   for (MatrixIndexT i = 0; i < dim; i++) {
-    running_sum += data[i * stride];
+    running_sum += data[i];
     if (r < running_sum) return i;
   }
   return dim_ - 1; // Should only happen if RandUniform()
@@ -329,9 +320,7 @@ MatrixIndexT VectorBase<Real>::RandCategorical() const {
 template<typename Real>
 void VectorBase<Real>::Set(Real f) {
   // Why not use memset here?
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++) { data[i * stride] = f; }
+  for (MatrixIndexT i = 0; i < dim_; i++) { data_[i] = f; }
 }
 
 template<typename Real>
@@ -339,7 +328,6 @@ void VectorBase<Real>::CopyRowsFromMat(const MatrixBase<Real> &mat) {
   KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows());
 
   Real *inc_data = data_;
-  MatrixIndexT stride = stride_;
   const MatrixIndexT cols = mat.NumCols(), rows = mat.NumRows();
 
   if (mat.Stride() == mat.NumCols()) {
@@ -349,7 +337,7 @@ void VectorBase<Real>::CopyRowsFromMat(const MatrixBase<Real> &mat) {
       // copy the data to the propper position
       memcpy(inc_data, mat.RowData(i), cols * sizeof(Real));
       // set new copy position
-      inc_data += cols * stride;
+      inc_data += cols;
     }
   }
 }
@@ -359,16 +347,15 @@ template<typename OtherReal>
 void VectorBase<Real>::CopyRowsFromMat(const MatrixBase<OtherReal> &mat) {
   KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows());
   Real *vec_data = data_;
-  MatrixIndexT stride = stride_;
   const MatrixIndexT cols = mat.NumCols(),
       rows = mat.NumRows();
 
   for (MatrixIndexT i = 0; i < rows; i++) {
     const OtherReal *mat_row = mat.RowData(i);
     for (MatrixIndexT j = 0; j < cols; j++) {
-      vec_data[j * stride] = static_cast<Real>(mat_row[j]);
+      vec_data[j] = static_cast<Real>(mat_row[j]);
     }
-    vec_data += cols * stride;
+    vec_data += cols;
   }
 }
 
@@ -383,16 +370,15 @@ void VectorBase<Real>::CopyColsFromMat(const MatrixBase<Real> &mat) {
   KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows());
 
   Real*       inc_data = data_;
-  MatrixIndexT stride = stride_;
   const MatrixIndexT  cols     = mat.NumCols(), rows = mat.NumRows(), stride = mat.Stride();
   const Real *mat_inc_data = mat.Data();
 
   for (MatrixIndexT i = 0; i < cols; i++) {
     for (MatrixIndexT j = 0; j < rows; j++) {
-      inc_data[j * stride] = mat_inc_data[j*stride];
+      inc_data[j] = mat_inc_data[j*stride];
     }
     mat_inc_data++;
-    inc_data += rows * stride;
+    inc_data += rows;
   }
 }
 
@@ -410,10 +396,8 @@ void VectorBase<Real>::CopyRowFromMat(const MatrixBase<OtherReal> &mat, MatrixIn
   KALDI_ASSERT(row < mat.NumRows());
   KALDI_ASSERT(dim_ == mat.NumCols());
   const OtherReal *mat_row = mat.RowData(row);
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++)
-    data[i * stride] = static_cast<Real>(mat_row[i]);
+  for (MatrixIndexT i = 0; i < dim_; i++)
+    data_[i] = static_cast<Real>(mat_row[i]);
 }
 
 template
@@ -430,13 +414,11 @@ void VectorBase<Real>::CopyRowFromSp(const SpMatrix<OtherReal> &sp, MatrixIndexT
   const OtherReal *sp_data = sp.Data();
 
   sp_data += (row*(row+1)) / 2; // takes us to beginning of this row.
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
   MatrixIndexT i;
   for (i = 0; i < row; i++) // copy consecutive elements.
-    data[i * stride] = static_cast<Real>(*(sp_data++));
-  for(; i < dim; ++i, sp_data += i)
-    data[i * stride] = static_cast<Real>(*sp_data);
+    data_[i] = static_cast<Real>(*(sp_data++));
+  for(; i < dim_; ++i, sp_data += i)
+    data_[i] = static_cast<Real>(*sp_data);
 }
 
 template
@@ -458,25 +440,23 @@ void VectorBase<double>::ApplyPow(double power) { vdPowx(dim_, data_, power, dat
 // takes elements to a power.  Throws exception if could not (but only for power != 1 and power != 2).
 template<typename Real>
 void VectorBase<Real>::ApplyPow(Real power) {
-  Real *data = data_;
-  MatrixIndex dim = dim_, stride = stride_;
   if (power == 1.0) return;
   if (power == 2.0) {
-    for (MatrixIndexT i = 0; i < dim; i++)
-      data[i * stride] = data[i * stride] * data[i * stride];
+    for (MatrixIndexT i = 0; i < dim_; i++)
+      data_[i] = data_[i] * data_[i];
   } else if (power == 0.5) {
-    for (MatrixIndexT i = 0; i < dim; i++) {
-      if (!(data[i * stride] >= 0.0))
+    for (MatrixIndexT i = 0; i < dim_; i++) {
+      if (!(data_[i] >= 0.0))
         KALDI_ERR << "Cannot take square root of negative value "
-                  << data[i * stride];
-      data[i * stride] = std::sqrt(data[i * stride]);
+                  << data_[i];
+      data_[i] = std::sqrt(data_[i]);
     }
   } else {
     for (MatrixIndexT i = 0; i < dim_; i++) {
-      data[i * stride] = pow(data[i * stride], power);
-      if (data[i * stride] == HUGE_VAL) {  // HUGE_VAL is what errno returns on error.
+      data_[i] = pow(data_[i], power);
+      if (data_[i] == HUGE_VAL) {  // HUGE_VAL is what pow returns on overflow.
         KALDI_ERR << "Could not raise element "  << i << " to power "
-                  << power << ": returned value = " << data[i * stride];
+                  << power << ": returned value = " << data_[i];
       }
     }
   }
@@ -487,33 +467,31 @@ void VectorBase<Real>::ApplyPow(Real power) {
 // Throws exception if could not (but only for power != 1 and power != 2).
 template<typename Real>
 void VectorBase<Real>::ApplyPowAbs(Real power, bool include_sign) {
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
   if (power == 1.0)
-    for (MatrixIndexT i = 0; i < dim; i++)
-      data[i * stride] = (include_sign && data[i * stride] < 0 ? -1 : 1) * std::abs(data[i * stride]);
+    for (MatrixIndexT i = 0; i < dim_; i++)
+      data_[i] = (include_sign && data_[i] < 0 ? -1 : 1) * std::abs(data_[i]);
-  if (power == 2.0) {
+  else if (power == 2.0) {  // 'else': power == 1.0 is fully handled above.
-    for (MatrixIndexT i = 0; i < dim; i++)
-      data[i * stride] = (include_sign && data[i * stride] < 0 ? -1 : 1) * data[i * stride] * data[i * stride];
+    for (MatrixIndexT i = 0; i < dim_; i++)
+      data_[i] = (include_sign && data_[i] < 0 ? -1 : 1) * data_[i] * data_[i];
   } else if (power == 0.5) {
-    for (MatrixIndexT i = 0; i < dim; i++) {
-      data[i * stride] = (include_sign && data[i * stride] < 0 ? -1 : 1) * std::sqrt(std::abs(data[i * stride]));
+    for (MatrixIndexT i = 0; i < dim_; i++) {
+      data_[i] = (include_sign && data_[i] < 0 ? -1 : 1) * std::sqrt(std::abs(data_[i]));
     }
   } else if (power < 0.0) {
-    for (MatrixIndexT i = 0; i < dim; i++) {
-      data[i * stride] = (data[i * stride] == 0.0 ? 0.0 : pow(std::abs(data[i * stride]), power));
-      data[i * stride] *= (include_sign && data[i * stride] < 0 ? -1 : 1);
-      if (data[i * stride] == HUGE_VAL) {  // HUGE_VAL is what errno returns on error.
+    for (MatrixIndexT i = 0; i < dim_; i++) {
+      Real sign = (include_sign && data_[i] < 0 ? -1 : 1);  // read the sign before overwriting.
+      data_[i] = sign * (data_[i] == 0.0 ? 0.0 : pow(std::abs(data_[i]), power));
+      if (std::abs(data_[i]) == HUGE_VAL) {  // pow returns HUGE_VAL on overflow.
         KALDI_ERR << "Could not raise element "  << i << "to power "
-                  << power << ": returned value = " << data[i * stride];
+                  << power << ": returned value = " << data_[i];
       }
     }
   } else {
-    for (MatrixIndexT i = 0; i < dim; i++) {
-      data[i * stride] = (include_sign && data[i * stride] < 0 ? -1 : 1) * pow(std::abs(data[i * stride]), power);
-      if (data[i * stride] == HUGE_VAL) {  // HUGE_VAL is what errno returns on error.
+    for (MatrixIndexT i = 0; i < dim_; i++) {
+      data_[i] = (include_sign && data_[i] < 0 ? -1 : 1) * pow(std::abs(data_[i]), power);
+      if (std::abs(data_[i]) == HUGE_VAL) {  // pow returns HUGE_VAL on overflow.
         KALDI_ERR << "Could not raise element "  << i << "to power "
-                  << power << ": returned value = " << data[i * stride];
+                  << power << ": returned value = " << data_[i];
       }
     }
   }
@@ -523,30 +501,28 @@ void VectorBase<Real>::ApplyPowAbs(Real power, bool include_sign) {
 template<typename Real>
 Real VectorBase<Real>::Norm(Real p) const {
   KALDI_ASSERT(p >= 0.0);
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
   Real sum = 0.0;
   if (p == 0.0) {
-    for (MatrixIndexT i = 0; i < dim; i++)
-      if (data[i * stride] != 0.0) sum += 1.0;
+    for (MatrixIndexT i = 0; i < dim_; i++)
+      if (data_[i] != 0.0) sum += 1.0;
     return sum;
   } else if (p == 1.0) {
-    for (MatrixIndexT i = 0; i < dim; i++)
-      sum += std::abs(data[i * stride]);
+    for (MatrixIndexT i = 0; i < dim_; i++)
+      sum += std::abs(data_[i]);
     return sum;
   } else if (p == 2.0) {
-    for (MatrixIndexT i = 0; i < dim; i++)
-      sum += data[i * stride] * data[i * stride];
+    for (MatrixIndexT i = 0; i < dim_; i++)
+      sum += data_[i] * data_[i];
     return std::sqrt(sum);
   } else if (p == std::numeric_limits<Real>::infinity()){
-    for (MatrixIndexT i = 0; i < dim; i++)
-      sum = std::max(sum, std::abs(data[i * stride]));
+    for (MatrixIndexT i = 0; i < dim_; i++)
+      sum = std::max(sum, std::abs(data_[i]));
     return sum;
   } else {
     Real tmp;
     bool ok = true;
-    for (MatrixIndexT i = 0; i < dim; i++) {
-      tmp = pow(std::abs(data[i * stride]), p);
+    for (MatrixIndexT i = 0; i < dim_; i++) {
+      tmp = pow(std::abs(data_[i]), p);
       if (tmp == HUGE_VAL) // HUGE_VAL is what pow returns on error.
         ok = false;
       sum += tmp;
@@ -578,9 +554,8 @@ bool VectorBase<Real>::ApproxEqual(const VectorBase<Real> &other, float tol) con
   } else { // Test for exact equality.
     const Real *data = data_;
     const Real *other_data = other.data_;
-    MatrixIndex other_stride = other.stride_, stride = stride_;
     for (MatrixIndexT dim = dim_, i = 0; i < dim; i++)
-      if (data[i * stride] != other_data[i * other_stride]) return false;
+      if (data[i] != other_data[i]) return false;
     return true;
   }
 }
@@ -589,9 +564,9 @@ template<typename Real>
 Real VectorBase<Real>::Max() const {
   Real ans = - std::numeric_limits<Real>::infinity();
   const Real *data = data_;
-  MatrixIndexT i, dim = dim_, stride = stride_;
+  MatrixIndexT i, dim = dim_;
   for (i = 0; i + 4 <= dim; i += 4) {
-    Real a1 = data[i*stride], a2 = data[(i+1)*stride], a3 = data[(i+2)*stride], a4 = data[(i+3)*stride];
+    Real a1 = data[i], a2 = data[i+1], a3 = data[i+2], a4 = data[i+3];
     if (a1 > ans || a2 > ans || a3 > ans || a4 > ans) {
       Real b1 = (a1 > a2 ? a1 : a2), b2 = (a3 > a4 ? a3 : a4);
       if (b1 > ans) ans = b1;
@@ -599,7 +574,7 @@ Real VectorBase<Real>::Max() const {
     }
   }
   for (; i < dim; i++)
-    if (data[i * stride] > ans) ans = data[i * stride];
+    if (data[i] > ans) ans = data[i];
   return ans;
 }
 
@@ -609,9 +584,9 @@ Real VectorBase<Real>::Max(MatrixIndexT *index_out) const {
   Real ans = - std::numeric_limits<Real>::infinity();
   MatrixIndexT index = 0;
   const Real *data = data_;
-  MatrixIndexT i, dim = dim_, stride = stride_;
+  MatrixIndexT i, dim = dim_;
   for (i = 0; i + 4 <= dim; i += 4) {
-    Real a1 = data[i*stride], a2 = data[(i+1)*stride], a3 = data[(i+2)*stride], a4 = data[(i+3)*stride];
+    Real a1 = data[i], a2 = data[i+1], a3 = data[i+2], a4 = data[i+3];
     if (a1 > ans || a2 > ans || a3 > ans || a4 > ans) {
       if (a1 > ans) { ans = a1; index = i; }
       if (a2 > ans) { ans = a2; index = i + 1; }
@@ -620,7 +595,7 @@ Real VectorBase<Real>::Max(MatrixIndexT *index_out) const {
     }
   }
   for (; i < dim; i++)
-    if (data[i * stride] > ans) { ans = data[i * stride]; index = i; }
+    if (data[i] > ans) { ans = data[i]; index = i; }
   *index_out = index;
   return ans;
 }
@@ -629,9 +604,9 @@ template<typename Real>
 Real VectorBase<Real>::Min() const {
   Real ans = std::numeric_limits<Real>::infinity();
   const Real *data = data_;
-  MatrixIndexT i, dim = dim_, stride = stride_;
+  MatrixIndexT i, dim = dim_;
   for (i = 0; i + 4 <= dim; i += 4) {
-    Real a1 = data[i*stride], a2 = data[(i+1)*stride], a3 = data[(i+2)*stride], a4 = data[(i+3)*stride];
+    Real a1 = data[i], a2 = data[i+1], a3 = data[i+2], a4 = data[i+3];
     if (a1 < ans || a2 < ans || a3 < ans || a4 < ans) {
       Real b1 = (a1 < a2 ? a1 : a2), b2 = (a3 < a4 ? a3 : a4);
       if (b1 < ans) ans = b1;
@@ -639,7 +614,7 @@ Real VectorBase<Real>::Min() const {
     }
   }
   for (; i < dim; i++)
-    if (data[i*stride] < ans) ans = data[i*stride];
+    if (data[i] < ans) ans = data[i];
   return ans;
 }
 
@@ -649,9 +624,9 @@ Real VectorBase<Real>::Min(MatrixIndexT *index_out) const {
   Real ans = std::numeric_limits<Real>::infinity();
   MatrixIndexT index = 0;
   const Real *data = data_;
-  MatrixIndexT i, dim = dim_, stride = stride_;
+  MatrixIndexT i, dim = dim_;
   for (i = 0; i + 4 <= dim; i += 4) {
-    Real a1 = data[i*stride], a2 = data[(i+1)*stride], a3 = data[(i+2)*stride], a4 = data[(i+3)*stride];
+    Real a1 = data[i], a2 = data[i+1], a3 = data[i+2], a4 = data[i+3];
     if (a1 < ans || a2 < ans || a3 < ans || a4 < ans) {
       if (a1 < ans) { ans = a1; index = i; }
       if (a2 < ans) { ans = a2; index = i + 1; }
@@ -660,7 +635,7 @@ Real VectorBase<Real>::Min(MatrixIndexT *index_out) const {
     }
   }
   for (; i < dim; i++)
-    if (data[i*stride] < ans) { ans = data[i*stride]; index = i; }
+    if (data[i] < ans) { ans = data[i]; index = i; }
   *index_out = index;
   return ans;
 }
@@ -671,10 +646,8 @@ template<typename OtherReal>
 void VectorBase<Real>::CopyColFromMat(const MatrixBase<OtherReal> &mat, MatrixIndexT col) {
   KALDI_ASSERT(col < mat.NumCols());
   KALDI_ASSERT(dim_ == mat.NumRows());
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++)
-    data[i * stride] = mat(i, col);
+  for (MatrixIndexT i = 0; i < dim_; i++)
+    data_[i] = mat(i, col);
   // can't do this very efficiently so don't really bother. could improve this though.
 }
 // instantiate the template above.
@@ -696,10 +669,9 @@ void VectorBase<Real>::CopyDiagFromMat(const MatrixBase<Real> &M) {
 template<typename Real>
 void VectorBase<Real>::CopyDiagFromPacked(const PackedMatrix<Real> &M) {
   KALDI_ASSERT(dim_ == M.NumCols());
-  Real *data = data_;
-  MatrixIndexT stride = stride_, dim = dim_;
+  MatrixIndexT dim = dim_;
   for (MatrixIndexT i = 0; i < dim; i++)
-    data[i * stride] = M(i, i);
+    data_[i] = M(i, i);
   // could make this more efficient.
 }
 
@@ -716,10 +688,9 @@ template<typename Real>
 Real VectorBase<Real>::SumLog() const {
   double sum_log = 0.0;
   double prod = 1.0;
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
+  MatrixIndexT dim = dim_;
   for (MatrixIndexT i = 0; i < dim; i++) {
-    prod *= data[i * stride];
+    prod *= data_[i];
     // Possible future work (arnab): change these magic values to pre-defined
     // constants
     if (prod < 1.0e-10 || prod > 1.0e+10) {
@@ -848,17 +819,15 @@ void VectorBase<Real>::ApplyAbs() {
 
 template<typename Real>
 void VectorBase<Real>::ApplyFloor(Real floor_val, MatrixIndexT *floored_count) {
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
   if (floored_count == nullptr) {
-    for (MatrixIndexT i = 0; i < dim; i++) {
-      data[i] = std::max(data[i * stride], floor_val);
+    for (MatrixIndexT i = 0; i < dim_; i++) {
+      data_[i] = std::max(data_[i], floor_val);
     }
   } else {
     MatrixIndexT num_floored = 0;
-    for (MatrixIndexT i = 0; i < dim; i++) {
-      if (data[i * stride] < floor_val) {
-        data[i * stride] = floor_val;
+    for (MatrixIndexT i = 0; i < dim_; i++) {
+      if (data_[i] < floor_val) {
+        data_[i] = floor_val;
         num_floored++;
       }
     }
@@ -906,10 +875,8 @@ MatrixIndexT VectorBase<Real>::ApplyFloor(const VectorBase<Real> &floor_vec) {
 template<typename Real>
 Real VectorBase<Real>::ApplySoftMax() {
   Real max = this->Max(), sum = 0.0;
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    sum += (data[i * stride] = Exp(data[i * stride] - max));
+  for (MatrixIndexT i = 0; i < dim_; i++) {
+    sum += (data_[i] = Exp(data_[i] - max));
   }
   this->Scale(1.0 / sum);
   return max + Log(sum);
@@ -918,10 +885,8 @@ Real VectorBase<Real>::ApplySoftMax() {
 template<typename Real>
 Real VectorBase<Real>::ApplyLogSoftMax() {
   Real max = this->Max(), sum = 0.0;
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    sum += Exp((data[i * stride] -= max));
+  for (MatrixIndexT i = 0; i < dim_; i++) {
+    sum += Exp((data_[i] -= max));
   }
   sum = Log(sum);
   this->Add(-1.0 * sum);
@@ -943,10 +908,8 @@ void VectorBase<double>::Tanh(const VectorBase<double> &src) {
 template<typename Real>
 void VectorBase<Real>::Tanh(const VectorBase<Real> &src) {
   KALDI_ASSERT(dim_ == src.dim_);
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_, src_stride = src.stride_;
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    Real x = src.data[i * src_stride];
+  for (MatrixIndexT i = 0; i < dim_; i++) {
+    Real x = src.data_[i];
     if (x > 0.0) {
       Real inv_expx = Exp(-x);
       x = -1.0 + 2.0 / (1.0 + inv_expx * inv_expx);
@@ -954,7 +917,7 @@ void VectorBase<Real>::Tanh(const VectorBase<Real> &src) {
       Real expx = Exp(x);
       x = 1.0 - 2.0 / (1.0 + expx * expx);
     }
-    data[i * stride] = x;
+    data_[i] = x;
   }
 }
 #endif
@@ -983,10 +946,8 @@ void VectorBase<double>::Sigmoid(const VectorBase<double> &src) {
 template<typename Real>
 void VectorBase<Real>::Sigmoid(const VectorBase<Real> &src) {
   KALDI_ASSERT(dim_ == src.dim_);
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_, src_stride = src.stride_;
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    Real x = src.data[i * src_stride];
+  for (MatrixIndexT i = 0; i < dim_; i++) {
+    Real x = src.data_[i];
     // We aim to avoid floating-point overflow here.
     if (x > 0.0) {
       x = 1.0 / (1.0 + Exp(-x));
@@ -994,7 +955,7 @@ void VectorBase<Real>::Sigmoid(const VectorBase<Real> &src) {
       Real ex = Exp(x);
       x = ex / (ex + 1.0);
     }
-    data[i * stride] = x;
+    data_[i] = x;
   }
 }
 #endif
@@ -1002,10 +963,8 @@ void VectorBase<Real>::Sigmoid(const VectorBase<Real> &src) {
 
 template<typename Real>
 void VectorBase<Real>::Add(Real c) {
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    data[i * stride] += c;
+  for (MatrixIndexT i = 0; i < dim_; i++) {
+    data_[i] += c;
   }
 }
 
@@ -1017,19 +976,16 @@ void VectorBase<Real>::Scale(Real alpha) {
 template<typename Real>
 void VectorBase<Real>::MulElements(const VectorBase<Real> &v) {
   KALDI_ASSERT(dim_ == v.dim_);
-  Real *data = data_, *v_data = v.data_;
-  MatrixIndexT dim = dim_, v_stride = v.stride_, stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    data[i * stride] *= v_data[i * v_stride];
+  for (MatrixIndexT i = 0; i < dim_; i++) {
+    data_[i] *= v.data_[i];
   }
 }
 
 template<typename Real>  // Set each element to y = (x == orig ? changed : x).
 void VectorBase<Real>::ReplaceValue(Real orig, Real changed) {
   Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++)
-    if (data[i * stride] == orig) data[i * stride] = changed;
+  for (MatrixIndexT i = 0; i < dim_; i++)
+    if (data[i] == orig) data[i] = changed;
 }
 
 
@@ -1038,10 +994,8 @@ template<typename OtherReal>
 void VectorBase<Real>::MulElements(const VectorBase<OtherReal> &v) {
   KALDI_ASSERT(dim_ == v.Dim());
   const OtherReal *other_ptr = v.Data();
-  Real *data = data_;
-  MatrixIndexT dim = dim_, v_stride = v.Stride(), stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    data_[i * stride] *= other_ptr[i * v_stride];
+  for (MatrixIndexT i = 0; i < dim_; i++) {
+    data_[i] *= other_ptr[i];
   }
 }
 // instantiate template.
@@ -1065,10 +1019,8 @@ void VectorBase<Real>::AddVecVec(Real alpha, const VectorBase<Real> &v,
 template<typename Real>
 void VectorBase<Real>::DivElements(const VectorBase<Real> &v) {
   KALDI_ASSERT(dim_ == v.dim_);
-  Real *data = data_;
-  MatrixIndexT dim = dim_, v_stride = v.stride_, stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    data_[i * stride] /= v.data_[i * v_stride];
+  for (MatrixIndexT i = 0; i < dim_; i++) {
+    data_[i] /= v.data_[i];
   }
 }
 
@@ -1077,9 +1029,8 @@ template<typename OtherReal>
 void VectorBase<Real>::DivElements(const VectorBase<OtherReal> &v) {
   KALDI_ASSERT(dim_ == v.Dim());
   const OtherReal *other_ptr = v.Data();
-  MatrixIndexT dim = dim_, v_stride = v.Stride(), stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    data[i * stride] /= other_ptr[i * v_stride];
+  for (MatrixIndexT i = 0; i < dim_; i++) {
+    data_[i] /= other_ptr[i];
   }
 }
 // instantiate template.
@@ -1092,11 +1043,8 @@ template<typename Real>
 void VectorBase<Real>::AddVecDivVec(Real alpha, const VectorBase<Real> &v,
                                     const VectorBase<Real> &rr, Real beta) {
   KALDI_ASSERT((dim_ == v.dim_ && dim_ == rr.dim_));
-  Real *data = data_, *v_data = v.data_, *rr_data = rr.data_;
-  MatrixIndexT dim = dim_, v_stride = v.stride_, rr_stride = rr.stride_, stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    data[i * stride] = alpha * v_data[i * v_stride]/rr_data[i * rr_stride] +
-      beta * data[i * stride];
+  for (MatrixIndexT i = 0; i < dim_; i++) {
+    data_[i] = alpha * v.data_[i]/rr.data_[i] + beta * data_[i] ;
   }
 }
 
@@ -1107,13 +1055,13 @@ void VectorBase<Real>::AddVec(const Real alpha, const VectorBase<OtherReal> &v)
   // remove __restrict__ if it causes compilation problems.
   Real *__restrict__ data = data_;
   OtherReal *__restrict__ other_data = v.data_;
-  MatrixIndexT dim = dim_, v_stride = v.stride_, stride = stride_;
+  MatrixIndexT dim = dim_;
   if (alpha != 1.0)
     for (MatrixIndexT i = 0; i < dim; i++)
-      data[i * stride] += alpha * other_data[i * v_stride];
+      data[i] += alpha * other_data[i];
   else
     for (MatrixIndexT i = 0; i < dim; i++)
-      data[i * stride] += other_data[i * v_stride];
+      data[i] += other_data[i];
 }
 
 template
@@ -1128,13 +1076,13 @@ void VectorBase<Real>::AddVec2(const Real alpha, const VectorBase<OtherReal> &v)
   // remove __restrict__ if it causes compilation problems.
   Real *__restrict__ data = data_;
   OtherReal *__restrict__ other_data = v.data_;
-  MatrixIndexT dim = dim_, v_stride = v.stride_, stride = stride_;
+  MatrixIndexT dim = dim_;
   if (alpha != 1.0)
     for (MatrixIndexT i = 0; i < dim; i++)
-      data[i * stride] += alpha * other_data[i * v_stride] * other_data[i * v_stride];
+      data[i] += alpha * other_data[i] * other_data[i];
   else
     for (MatrixIndexT i = 0; i < dim; i++)
-      data[i * stride] += other_data[i * v_stride] * other_data[i * v_stride];
+      data[i] += other_data[i] * other_data[i];
 }
 
 template
@@ -1246,7 +1194,7 @@ void Vector<Real>::Read(std::istream & is,  bool binary, bool add) {
         is.get();  // eat the ']'
         this->Resize(data.size());
         for (size_t j = 0; j < data.size(); j++)
-          this->data_[j * this->stride_] = data[j * data.stride_];
+          this->data_[j] = data[j];
         i = is.peek();
         if (static_cast<char>(i) == '\r') {
           is.get();
@@ -1302,10 +1250,9 @@ void VectorBase<Real>::Write(std::ostream & os, bool binary) const {
     WriteBasicType(os, binary, size);
     os.write(reinterpret_cast<const char*>(Data()), sizeof(Real) * size);
   } else {
-    MatrixIndexT stride = this->stride_;
     os << " [ ";
     for (MatrixIndexT i = 0; i < Dim(); i++)
-      os << (*this)(i * stride) << " ";
+      os << (*this)(i) << " ";
     os << "]\n";
   }
   if (!os.good())
@@ -1316,10 +1263,8 @@ void VectorBase<Real>::Write(std::ostream & os, bool binary) const {
 template<typename Real>
 void VectorBase<Real>::AddVec2(const Real alpha, const VectorBase<Real> &v) {
   KALDI_ASSERT(dim_ == v.dim_);
-  Real *data = data, v_data = v.data_;
-  MatrixIndexT dim = dim_, v_stride = v.stride_, stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++)
-    data[i * stride] += alpha * v_data[i * v_stride] * v_data[i * v_stride];
+  for (MatrixIndexT i = 0; i < dim_; i++)
+    data_[i] += alpha * v.data_[i] * v.data_[i];
 }
 
 // this <-- beta*this + alpha*M*v.
@@ -1371,19 +1316,19 @@ void VectorBase<Real>::AddDiagMat2(
   if (trans == kNoTrans) {
     KALDI_ASSERT(this->dim_ == M.NumRows());
     MatrixIndexT rows = this->dim_, cols = M.NumCols(),
-      mat_stride = M.Stride(), stride = this->stride_;
+           mat_stride = M.Stride();
     Real *data = this->data_;
     const Real *mat_data = M.Data();
-    for (MatrixIndexT i = 0; i < rows; i++, mat_data += mat_stride, data += stride)
+    for (MatrixIndexT i = 0; i < rows; i++, mat_data += mat_stride, data++)
       *data = beta * *data + alpha * cblas_Xdot(cols, mat_data, 1,
                                                 mat_data, 1);
   } else {
     KALDI_ASSERT(this->dim_ == M.NumCols());
     MatrixIndexT rows = M.NumRows(), cols = this->dim_,
-      mat_stride = M.Stride(), stride = stride_;
+           mat_stride = M.Stride();
     Real *data = this->data_;
     const Real *mat_data = M.Data();
-    for (MatrixIndexT i = 0; i < cols; i++, mat_data++, data += stride)
+    for (MatrixIndexT i = 0; i < cols; i++, mat_data++, data++)
       *data = beta * *data + alpha * cblas_Xdot(rows, mat_data, mat_stride,
                                                  mat_data, mat_stride);
   }
@@ -1403,11 +1348,10 @@ void VectorBase<Real>::AddDiagMatMat(
   if (transM == kTrans) std::swap(M_row_stride, M_col_stride);
   MatrixIndexT N_row_stride = N.Stride(), N_col_stride = 1;
   if (transN == kTrans) std::swap(N_row_stride, N_col_stride);
-  MatrixIndexT stride = this->stride_;
 
   Real *data = this->data_;
   const Real *Mdata = M.Data(), *Ndata = N.Data();
-  for (MatrixIndexT i = 0; i < dim; i++, Mdata += M_row_stride, Ndata += N_col_stride, data += stride) {
+  for (MatrixIndexT i = 0; i < dim; i++, Mdata += M_row_stride, Ndata += N_col_stride, data++) {
     *data = beta * *data + alpha * cblas_Xdot(M_col_dim, Mdata, M_col_stride, Ndata, N_row_stride);
   }
 }
diff --git a/src/matrix/matrix-common.h b/src/matrix/matrix-common.h
index f7047d71ca5..b0dad6a0cdb 100644
--- a/src/matrix/matrix-common.h
+++ b/src/matrix/matrix-common.h
@@ -28,7 +28,7 @@
 namespace kaldi {
 // this enums equal to CblasTrans and CblasNoTrans constants from CBLAS library
 // we are writing them as literals because we don't want to include here matrix/kaldi-blas.h,
-// which puts many symbols into global scope (like "real") via the header f2c.h 
+// which puts many symbols into global scope (like "real") via the header f2c.h
 typedef enum {
   kTrans    = 112, // = CblasTrans
   kNoTrans  = 111  // = CblasNoTrans
@@ -95,14 +95,12 @@ template<> class OtherReal<double> {
 };
 
 
+// BLAS's interface uses 'int', which even on many 64-bit systems is only
+// 32 bits wide, so using a 64-bit type for the matrix index would be like
+// making a promise we can't keep.
 typedef int32 MatrixIndexT;
-typedef int32 SignedMatrixIndexT;
 typedef uint32 UnsignedMatrixIndexT;
 
-// If you want to use size_t for the index type, do as follows instead:
-//typedef size_t MatrixIndexT;
-//typedef ssize_t SignedMatrixIndexT;
-//typedef size_t UnsignedMatrixIndexT;
 
 }
 
diff --git a/src/matrix/optimization.cc b/src/matrix/optimization.cc
index c17b5b94d8c..a70e31ae0ec 100644
--- a/src/matrix/optimization.cc
+++ b/src/matrix/optimization.cc
@@ -105,7 +105,7 @@ void OptimizeLbfgs<Real>::ComputeHifNeeded(const VectorBase<Real> &gradient) {
       H_.Set(gamma_k);
     }
   }
-}  
+}
 
 // This represents the first 2 lines of Algorithm 7.5 (N&W), which
 // in fact is mostly a call to Algorithm 7.4.
@@ -114,7 +114,7 @@ template<typename Real>
 void OptimizeLbfgs<Real>::ComputeNewDirection(Real function_value,
                                               const VectorBase<Real> &gradient) {
   KALDI_ASSERT(computation_state_ == kBeforeStep);
-  SignedMatrixIndexT m = M(), k = k_;
+  MatrixIndexT m = M(), k = k_;
   ComputeHifNeeded(gradient);
   // The rest of this is computing p_k <-- - H_k \nabla f_k using Algorithm
   // 7.4 of N&W.
@@ -127,16 +127,16 @@ void OptimizeLbfgs<Real>::ComputeNewDirection(Real function_value,
     q.CopyFromVec(gradient); // q <-- \nabla f_k.
   Vector<Real> alpha(m);
   // for i = k - 1, k - 2, ... k - m
-  for (SignedMatrixIndexT i = k - 1;
-       i >= std::max(k - m, static_cast<SignedMatrixIndexT>(0));
-       i--) { 
+  for (MatrixIndexT i = k - 1;
+       i >= std::max(k - m, static_cast<MatrixIndexT>(0));
+       i--) {
     alpha(i % m) = rho_(i % m) * VecVec(S(i), q); // \alpha_i <-- \rho_i s_i^T q.
     q.AddVec(-alpha(i % m), Y(i)); // q <-- q - \alpha_i y_i
   }
   r.SetZero();
   r.AddVecVec(1.0, H_, q, 0.0); // r <-- H_k^{(0)} q.
   // for k = k - m, k - m + 1, ... , k - 1
-  for (SignedMatrixIndexT i = std::max(k - m, static_cast<SignedMatrixIndexT>(0));
+  for (MatrixIndexT i = std::max(k - m, static_cast<MatrixIndexT>(0));
        i < k;
        i++) {
     Real beta = rho_(i % m) * VecVec(Y(i), r); // \beta <-- \rho_i y_i^T r
@@ -148,7 +148,7 @@ void OptimizeLbfgs<Real>::ComputeNewDirection(Real function_value,
     if ((opts_.minimize && dot < 0) || (!opts_.minimize && dot > 0))
       KALDI_WARN << "Step direction has the wrong sign!  Routine will fail.";
   }
-  
+
   // Now we're out of Alg. 7.4 and back into Alg. 7.5.
   // Alg. 7.4 returned r (using new_x_ as the location), and with \alpha_k = 1
   // as the initial guess, we're setting x_{k+1} = x_k + \alpha_k p_k, with
@@ -178,7 +178,7 @@ bool OptimizeLbfgs<Real>::AcceptStep(Real function_value,
   s.AddVec(-1.0, x_); // s = new_x_ - x_.
   y.CopyFromVec(gradient);
   y.AddVec(-1.0, deriv_); // y = gradient - deriv_.
-  
+
   // Warning: there is a division in the next line.  This could
   // generate inf or nan, but this wouldn't necessarily be an error
   // at this point because for zero step size or derivative we should
@@ -190,11 +190,11 @@ bool OptimizeLbfgs<Real>::AcceptStep(Real function_value,
   if ((opts_.minimize && prod <= 1.0e-20) || (!opts_.minimize && prod >= -1.0e-20)
       || len == 0.0)
     return false; // This will force restart.
-  
+
   KALDI_VLOG(3) << "Accepted step; length was " << len
                 << ", prod was " << prod;
   RecordStepLength(len);
-  
+
   // store x_{k+1} and the function value f_{k+1}.
   x_.CopyFromVec(new_x_);
   f_ = function_value;
@@ -239,12 +239,12 @@ void OptimizeLbfgs<Real>::StepSizeIteration(Real function_value,
                                             const VectorBase<Real> &gradient) {
   KALDI_VLOG(3) << "In step size iteration, function value changed "
                 << f_ << " to " << function_value;
-  
+
   // We're in some part of the backtracking, and the user is providing
   // the objective function value and gradient.
   // We're checking two conditions: Wolfe i) [the Armijo rule] and
   // Wolfe ii).
-  
+
   // The Armijo rule (when minimizing) is:
   // f(k_k + \alpha_k p_k) <= f(x_k) + c_1 \alpha_k p_k^T \nabla f(x_k), where
   //  \nabla means the derivative.
@@ -255,11 +255,11 @@ void OptimizeLbfgs<Real>::StepSizeIteration(Real function_value,
   // Below, pf is \alpha_k p_k^T \nabla f(x_k).
   Real pf = VecVec(new_x_, deriv_) - VecVec(x_, deriv_);
   Real temp = f_ + opts_.c1 * pf;
-  
+
   bool wolfe_i_ok;
   if (opts_.minimize) wolfe_i_ok = (function_value <= temp);
   else wolfe_i_ok = (function_value >= temp);
-  
+
   // Wolfe condition ii) can be written as:
   //  p_k^T \nabla f(x_k + \alpha_k p_k) >= c_2 p_k^T \nabla f(x_k)
   // p2f equals \alpha_k p_k^T \nabla f(x_k + \alpha_k p_k), where
@@ -285,7 +285,7 @@ void OptimizeLbfgs<Real>::StepSizeIteration(Real function_value,
   // code will quickly detect convergence.
 
   d_action = kNoChange; // the default.
-  
+
   if (wolfe_i_ok && wolfe_ii_ok) {
     iteration_action = kAccept;
     d_action = kNoChange; // actually doesn't matter, it'll get reset.
@@ -319,13 +319,13 @@ void OptimizeLbfgs<Real>::StepSizeIteration(Real function_value,
 
   if (d_action == kDecrease)
     d_ = std::sqrt(d_);
-  
+
   KALDI_VLOG(3) << "d = " << d_ << ", iter = " << k_ << ", action = "
                 << (iteration_action == kAccept ? "accept" :
                     (iteration_action == kDecreaseStep ? "decrease" :
                      (iteration_action == kIncreaseStep ? "increase" :
                       "reject")));
-  
+
   // Note: even if iteration_action != Restart at this point,
   // some code below may set it to Restart.
   if (iteration_action == kAccept) {
@@ -358,7 +358,7 @@ void OptimizeLbfgs<Real>::StepSizeIteration(Real function_value,
                     << "close to the old value; restarting.";
       iteration_action = kRestart;
     }
-        
+
     if (iteration_action == kDecreaseStep) {
       num_wolfe_i_failures_++;
       last_failure_type_ = kWolfeI;
@@ -433,7 +433,7 @@ OptimizeLbfgs<Real>::GetValue(Real *objf_value) const {
 //  p_k : A-conjugate direction
 //  \beta_k  : coefficient used in A-conjugate direction computation for next
 //  iteration
-//  
+//
 //  Algo.  LinearCG(A,b,x_0)
 //  ========================
 //  r_0 = Ax_0 - b
@@ -464,21 +464,21 @@ int32 LinearCgd(const LinearCgdOptions &opts,
   p.AddSpVec(-1.0, A, *x, 1.0);  // p_0 = b - A x_0
   r.AddVec(-1.0, p);  // r_0 = - p_0
   x_orig.CopyFromVec(*x);  // in case of failure.
-  
+
   Real r_cur_norm_sq = VecVec(r, r),
       r_initial_norm_sq = r_cur_norm_sq,
       r_recompute_norm_sq = r_cur_norm_sq;
 
   KALDI_VLOG(5) << "In linear CG: initial norm-square of residual = "
                 << r_initial_norm_sq;
-  
+
   KALDI_ASSERT(opts.recompute_residual_factor <= 1.0);
   Real max_error_sq = std::max<Real>(opts.max_error * opts.max_error,
                                      std::numeric_limits<Real>::min()),
       residual_factor = opts.recompute_residual_factor *
                         opts.recompute_residual_factor,
       inv_residual_factor = 1.0 / residual_factor;
-  
+
   // Note: although from a mathematical point of view the method should converge
   // after M iterations, in practice (due to roundoff) it does not always
   // converge to good precision after that many iterations so we let the maximum
@@ -492,7 +492,7 @@ int32 LinearCgd(const LinearCgdOptions &opts,
     // Below is how the code used to look.
     // // next line: \alpha_k = (r_k^T r_k) / (p_k^T A p_k)
     // Real alpha = r_cur_norm_sq / VecVec(p, Ap);
-    // 
+    //
     // We changed r_cur_norm_sq below to -VecVec(p, r).  Although this is
     // slightly less efficient, it seems to make the algorithm dramatically more
     // robust.  Note that -p^T r is the mathematically more natural quantity to
@@ -500,23 +500,23 @@ int32 LinearCgd(const LinearCgdOptions &opts,
     // recommended in Nocedal and Wright only as a kind of optimization as it is
     // supposed to be the same as -p^T r and we already have it computed.
     Real alpha = -VecVec(p, r) / VecVec(p, Ap);
-    
+
     // next line: x_{k+1} = x_k + \alpha_k p_k;
     x->AddVec(alpha, p);
     // next line: r_{k+1} = r_k + \alpha_k A p_k
     r.AddVec(alpha, Ap);
     Real r_next_norm_sq = VecVec(r, r);
-    
+
     if (r_next_norm_sq < residual_factor * r_recompute_norm_sq ||
         r_next_norm_sq > inv_residual_factor * r_recompute_norm_sq) {
-         
+
       // Recompute the residual from scratch if the residual norm has decreased
       // a lot; this costs an extra matrix-vector multiply, but helps keep the
       // residual accurate.
       // Also do the same if the residual norm has increased a lot since
       // the last time we recomputed... this shouldn't happen often, but
       // it can indicate bad stuff is happening.
-      
+
       // r_{k+1} = A x_{k+1} - b
       r.AddSpVec(1.0, A, *x, 0.0);
       r.AddVec(-1.0, b);
@@ -530,7 +530,7 @@ int32 LinearCgd(const LinearCgdOptions &opts,
     // Check if converged.
     if (r_next_norm_sq <= max_error_sq)
       break;
-    
+
     // next line: \beta_{k+1} = \frac{r_{k+1}^T r_{k+1}}{r_k^T r_K}
     Real beta_next = r_next_norm_sq / r_cur_norm_sq;
     // next lines: p_{k+1} = -r_{k+1} + \beta_{k+1} p_k
@@ -555,8 +555,8 @@ int32 LinearCgd(const LinearCgdOptions &opts,
     SolveQuadraticProblem(A, b, opts, x);
   }
   return k;
-} 
-    
+}
+
 // Instantiate the class for float and double.
 template
 class OptimizeLbfgs<float>;
diff --git a/src/matrix/optimization.h b/src/matrix/optimization.h
index 66309acaad5..e9c16142fdb 100644
--- a/src/matrix/optimization.h
+++ b/src/matrix/optimization.h
@@ -43,12 +43,12 @@ struct LinearCgdOptions {
   // scratch.  This helps to keep the computed residual accurate even in the
   // presence of roundoff.
   BaseFloat recompute_residual_factor;
-  
+
   LinearCgdOptions(): max_iters(-1),
                       max_error(0.0),
                       recompute_residual_factor(0.01) { }
 };
-  
+
 /*
   This function uses linear conjugate gradient descent to approximately solve
   the system A x = b.  The value of x at entry corresponds to the initial guess
@@ -103,7 +103,7 @@ struct LbfgsOptions {
   int max_line_search_iters; // after this many iters we restart L-BFGS.
   int avg_step_length; // number of iters to avg step length over, in
   // RecentStepLength().
-  
+
   LbfgsOptions (bool minimize = true):
       minimize(minimize),
       m(10),
@@ -123,16 +123,16 @@ class OptimizeLbfgs {
   /// Initializer takes the starting value of x.
   OptimizeLbfgs(const VectorBase<Real> &x,
                 const LbfgsOptions &opts);
-  
+
   /// This returns the value of the variable x that has the best objective
   /// function so far, and the corresponding objective function value if
   /// requested.  This would typically be called only at the end.
   const VectorBase<Real>& GetValue(Real *objf_value = NULL) const;
-  
+
   /// This returns the value at which the function wants us
   /// to compute the objective function and gradient.
   const VectorBase<Real>& GetProposedValue() const { return new_x_; }
-  
+
   /// Returns the average magnitude of the last n steps (but not
   /// more than the number we have stored).  Before we have taken
   /// any steps, returns +infinity.  Note: if the most recent
@@ -140,7 +140,7 @@ class OptimizeLbfgs {
   /// step lengths.  This makes it suitable as a convergence test
   /// (else we'd generate NaN's).
   Real RecentStepLength() const;
-  
+
   /// The user calls this function to provide the class with the
   /// function and gradient info at the point GetProposedValue().
   /// If this point is outside the constraints you can set function_value
@@ -149,7 +149,7 @@ class OptimizeLbfgs {
   /// the second overloaded version of this function) will be ignored.
   void DoStep(Real function_value,
               const VectorBase<Real> &gradient);
-  
+
   /// The user can call this version of DoStep() if it is desired to set some
   /// kind of approximate Hessian on this iteration.  Note: it is a prerequisite
   /// that diag_approx_2nd_deriv must be strictly positive (minimizing), or
@@ -157,7 +157,7 @@ class OptimizeLbfgs {
   void DoStep(Real function_value,
               const VectorBase<Real> &gradient,
               const VectorBase<Real> &diag_approx_2nd_deriv);
-  
+
  private:
   KALDI_DISALLOW_COPY_AND_ASSIGN(OptimizeLbfgs);
 
@@ -175,7 +175,7 @@ class OptimizeLbfgs {
     kWithinStep, // This means we're within the step-size computation, and
     // have not yet done the 1st function evaluation.
   };
-  
+
   inline MatrixIndexT Dim() { return x_.Dim(); }
   inline MatrixIndexT M() { return opts_.m; }
   SubVector<Real> Y(MatrixIndexT i) {
@@ -196,12 +196,12 @@ class OptimizeLbfgs {
   void StepSizeIteration(Real function_value,
                          const VectorBase<Real> &gradient);
   void RecordStepLength(Real s);
-  
-  
+
+
   LbfgsOptions opts_;
-  SignedMatrixIndexT k_; // Iteration number, starts from zero.  Gets set back to zero
+  MatrixIndexT k_; // Iteration number, starts from zero.  Gets set back to zero
   // when we restart.
-  
+
   ComputationState computation_state_;
   bool H_was_set_; // True if the user specified H_; if false,
   // we'll use a heuristic to estimate it.
@@ -222,7 +222,7 @@ class OptimizeLbfgs {
   int num_wolfe_ii_failures_; // the num times we increased step size.
   enum { kWolfeI, kWolfeII, kNone } last_failure_type_; // last type of step-search
   // failure on this iter.
-  
+
   Vector<Real> H_; // Current inverse-Hessian estimate.  May be computed by this class itself,
   // or provided by user using 2nd form of SetGradientInfo().
   Matrix<Real> data_; // dimension (m*2) x dim.  Even rows store
@@ -233,11 +233,11 @@ class OptimizeLbfgs {
   // (up to m) iterations; these are not stored in a rotating buffer but
   // are shifted by one each time (this is more convenient when we
   // restart, as we keep this info past restarting).
-  
+
 
 };
-  
-/// @} 
+
+/// @}
 
 
 } // end namespace kaldi
@@ -245,4 +245,3 @@ class OptimizeLbfgs {
 
 
 #endif
-
diff --git a/src/matrix/qr.cc b/src/matrix/qr.cc
index 861dead05ba..8912d2892ce 100644
--- a/src/matrix/qr.cc
+++ b/src/matrix/qr.cc
@@ -57,7 +57,7 @@ void House(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
     if (max_x == 0.0) max_x = 1.0;
     s = 1.0 / max_x;
   }
-  
+
   Real sigma = 0.0;
   v[0] = 1.0;
   for (MatrixIndexT i = 1; i < dim; i++) {
@@ -73,7 +73,7 @@ void House(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
       v[0] = x1 - mu;
     } else {
       v[0] = -sigma / (x1 + mu);
-      KALDI_ASSERT(KALDI_ISFINITE(v[dim-1]));      
+      KALDI_ASSERT(KALDI_ISFINITE(v[dim-1]));
     }
     Real v1 = v[0];
     Real v1sq = v1 * v1;
@@ -155,11 +155,11 @@ void HouseBackward(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
    with packed lower-triangular matrices to do it this way.  There's also
    a shift from one-based to zero-based indexing, so the index
    k is transformed k -> n - k, and a corresponding transpose...
-   
+
    Let the original *this be A.  This algorithms replaces *this with
    a tridiagonal matrix T such that T = Q A Q^T for an orthogonal Q.
    Caution: Q is transposed vs. Golub and Van Loan.
-   If Q != NULL it outputs Q. 
+   If Q != NULL it outputs Q.
 */
 template<typename Real>
 void SpMatrix<Real>::Tridiagonalize(MatrixBase<Real> *Q) {
@@ -195,7 +195,7 @@ void SpMatrix<Real>::Tridiagonalize(MatrixBase<Real> *Q) {
     if (Q != NULL) { // C.f. Golub, Q is H_1 .. H_n-2... in this
       // case we apply them in the opposite order so it's H_n-1 .. H_1,
       // but also Q is transposed so we really have Q = H_1 .. H_n-1.
-      // It's a double negative.    
+      // It's a double negative.
       // Anyway, we left-multiply Q by each one.  The H_n would each be
       // diag(I + beta v v', I) but we don't ever touch the last dims.
       // We do (in Matlab notation):
@@ -309,7 +309,7 @@ void QrStep(MatrixIndexT n,
     if (k < n-2) {
       // Next is the elements (k+2, k) and (k+2, k-1), to be rotated, again
       // backwards.
-      Real &elem_kp2_k = z, 
+      Real &elem_kp2_k = z,
           &elem_kp2_kp1 = off_diag[k+1];
       // Note: elem_kp2_k == z would start off as zero because it's
        // two off the diagonal, and not been touched yet.  Therefore
@@ -338,7 +338,7 @@ void QrInternal(MatrixIndexT n,
   MatrixIndexT counter = 0, max_iters = 500 + 4*n, // Should never take this many iters.
       large_iters = 100 + 2*n;
   Real epsilon = (pow(2.0, sizeof(Real) == 4 ? -23.0 : -52.0));
-  
+
   for (; counter < max_iters; counter++) { // this takes the place of "until
                                            // q=n"... we'll break out of the
                                            // loop when we converge.
@@ -356,7 +356,7 @@ void QrInternal(MatrixIndexT n,
         off_diag[i] = 0.0;
     }
     // The next code works out p, q, and npq which is n - p - q.
-    // For the definitions of q and p, see Golub and Van Loan; we 
+    // For the definitions of q and p, see Golub and Van Loan; we
     // partition the n dims into pieces of size (p, n-p-q, q) where
     // the part of size q is diagonal and the part of size n-p-p is
     // "unreduced", i.e. has no zero off-diagonal elements.
@@ -392,7 +392,7 @@ void QrInternal(MatrixIndexT n,
     } else {
       QrStep(npq, diag + p, off_diag + p,
              static_cast<MatrixBase<Real>*>(NULL));
-    }      
+    }
   }
   if (counter == max_iters) {
     KALDI_WARN << "Failure to converge in QR algorithm. "
@@ -497,7 +497,7 @@ void SpMatrix<Real>::TopEigs(VectorBase<Real> *s, MatrixBase<Real> *P,
       // We do "full orthogonalization" to preserve stability,
       // even though this is usually a waste of time.
       Real start_prod = VecVec(r, r);
-      for (SignedMatrixIndexT e = d; e >= 0; e--) { // e must be signed!
+      for (MatrixIndexT e = d; e >= 0; e--) { // e must be signed!
         SubVector<Real> q_e(Q, e);
         Real prod = VecVec(r, q_e);
         if (counter == 0 && static_cast<MatrixIndexT>(e) + 1 >= d) // Keep T tridiagonal, which
@@ -528,11 +528,11 @@ void SpMatrix<Real>::TopEigs(VectorBase<Real> *s, MatrixBase<Real> *P,
     }
   }
 
-  Matrix<Real> R(lanczos_dim, lanczos_dim);  
+  Matrix<Real> R(lanczos_dim, lanczos_dim);
   R.SetUnit();
   T.Qr(&R); // Diagonalizes T.
   Vector<Real> s_tmp(lanczos_dim);
-  s_tmp.CopyDiagFromSp(T);  
+  s_tmp.CopyDiagFromSp(T);
 
   // Now T = R * diag(s_tmp) * R^T.
   // The next call sorts the elements of s from greatest to least absolute value,
@@ -544,7 +544,7 @@ void SpMatrix<Real>::TopEigs(VectorBase<Real> *s, MatrixBase<Real> *P,
   SubMatrix<Real> Rsub(R, 0, eig_dim, 0, lanczos_dim);
   SubVector<Real> s_sub(s_tmp, 0, eig_dim);
   s->CopyFromVec(s_sub);
-      
+
   // For working out what to do now, just assume the other eigenvalues were
   // zero.  This is just for purposes of knowing how to get the result, and
   // not getting things wrongly transposed.
diff --git a/src/matrix/sparse-matrix-test.cc b/src/matrix/sparse-matrix-test.cc
index 26b2c227bba..9d7b59a7705 100644
--- a/src/matrix/sparse-matrix-test.cc
+++ b/src/matrix/sparse-matrix-test.cc
@@ -77,7 +77,7 @@ void UnitTestSparseVectorMax() {
     vec.SetRandn();
     svec.CopyElementsToVec(&vec);
 
-    int32 index1, index2;
+    MatrixIndexT index1, index2;
     Real max1, max2;
 
     max1 = svec.Max(&index1);
diff --git a/src/matrix/sparse-matrix.cc b/src/matrix/sparse-matrix.cc
index 55d8edeb4b3..896d70ae799 100644
--- a/src/matrix/sparse-matrix.cc
+++ b/src/matrix/sparse-matrix.cc
@@ -29,7 +29,7 @@
 namespace kaldi {
 
 template <typename Real>
-std::pair<MatrixIndexT, Real>* SparseVector<Real>::Data() {
+std::pair<int32, Real>* SparseVector<Real>::Data() {
   if (pairs_.empty())
     return NULL;
   else
@@ -37,7 +37,7 @@ std::pair<MatrixIndexT, Real>* SparseVector<Real>::Data() {
 }
 
 template <typename Real>
-const std::pair<MatrixIndexT, Real>* SparseVector<Real>::Data() const {
+const std::pair<int32, Real>* SparseVector<Real>::Data() const {
   if (pairs_.empty())
     return NULL;
   else
@@ -65,7 +65,7 @@ void SparseVector<Real>::CopyElementsToVec(VectorBase<OtherReal> *vec) const {
   KALDI_ASSERT(vec->Dim() == this->dim_);
   vec->SetZero();
   OtherReal *other_data = vec->Data();
-  typename std::vector<std::pair<MatrixIndexT, Real> >::const_iterator
+  typename std::vector<std::pair<int32, Real> >::const_iterator
       iter = pairs_.begin(), end = pairs_.end();
   for (; iter != end; ++iter)
     other_data[iter->first] = iter->second;
@@ -85,7 +85,7 @@ void SparseVector<Real>::AddToVec(Real alpha,
                                   VectorBase<OtherReal> *vec) const {
   KALDI_ASSERT(vec->Dim() == dim_);
   OtherReal *other_data = vec->Data();
-  typename std::vector<std::pair<MatrixIndexT, Real> >::const_iterator
+  typename std::vector<std::pair<int32, Real> >::const_iterator
       iter = pairs_.begin(), end = pairs_.end();
   if (alpha == 1.0) {  // treat alpha==1.0 case specially.
     for (; iter != end; ++iter)
@@ -147,10 +147,11 @@ template <typename Real>
 void SparseVector<Real>::Write(std::ostream &os, bool binary) const {
   if (binary) {
     WriteToken(os, binary, "SV");
-    WriteBasicType(os, binary, dim_);
-    MatrixIndexT num_elems = pairs_.size();
+    int32 dim = dim_;
+    WriteBasicType(os, binary, dim);
+    int32 num_elems = pairs_.size();
     WriteBasicType(os, binary, num_elems);
-    typename std::vector<std::pair<MatrixIndexT, Real> >::const_iterator
+    typename std::vector<std::pair<int32, Real> >::const_iterator
         iter = pairs_.begin(), end = pairs_.end();
     for (; iter != end; ++iter) {
       WriteBasicType(os, binary, iter->first);
@@ -160,7 +161,7 @@ void SparseVector<Real>::Write(std::ostream &os, bool binary) const {
     // In text-mode, use a human-friendly, script-friendly format;
     // format is "dim=5 [ 0 0.2 3 0.9 ] "
     os << "dim=" << dim_ << " [ ";
-    typename std::vector<std::pair<MatrixIndexT, Real> >::const_iterator
+    typename std::vector<std::pair<int32, Real> >::const_iterator
         iter = pairs_.begin(), end = pairs_.end();
     for (; iter != end; ++iter)
       os << iter->first << ' ' << iter->second << ' ';
@@ -173,13 +174,14 @@ template <typename Real>
 void SparseVector<Real>::Read(std::istream &is, bool binary) {
   if (binary) {
     ExpectToken(is, binary, "SV");
-    ReadBasicType(is, binary, &dim_);
-    KALDI_ASSERT(dim_ >= 0);
-    int32 num_elems;
+    int32 dim, num_elems;
+    ReadBasicType(is, binary, &dim);
+    KALDI_ASSERT(dim >= 0);
+    dim_ = dim;
     ReadBasicType(is, binary, &num_elems);
     KALDI_ASSERT(num_elems >= 0 && num_elems <= dim_);
     pairs_.resize(num_elems);
-    typename std::vector<std::pair<MatrixIndexT, Real> >::iterator
+    typename std::vector<std::pair<int32, Real> >::iterator
         iter = pairs_.begin(), end = pairs_.end();
     for (; iter != end; ++iter) {
       ReadBasicType(is, binary, &(iter->first));
@@ -217,7 +219,7 @@ void SparseVector<Real>::Read(std::istream &is, bool binary) {
         KALDI_ERR << "Error reading sparse vector, expecting numbers.";
       KALDI_ASSERT(i >= 0 && i < dim
                    && (pairs_.empty() || i > pairs_.back().first));
-      pairs_.push_back(std::pair<MatrixIndexT, BaseFloat>(i, p));
+      pairs_.push_back(std::pair<int32, BaseFloat>(i, p));
     }
   }
 }
@@ -226,8 +228,8 @@ void SparseVector<Real>::Read(std::istream &is, bool binary) {
 namespace sparse_vector_utils {
 template <typename Real>
 struct CompareFirst {
-  inline bool operator() (const std::pair<MatrixIndexT, Real> &p1,
-                           const std::pair<MatrixIndexT, Real> &p2) const {
+  inline bool operator() (const std::pair<int32, Real> &p1,
+                           const std::pair<int32, Real> &p2) const {
     return p1.first < p2.first;
   }
 };
@@ -235,12 +237,12 @@ struct CompareFirst {
 
 template <typename Real>
 SparseVector<Real>::SparseVector(
-    MatrixIndexT dim, const std::vector<std::pair<MatrixIndexT, Real> > &pairs):
+    MatrixIndexT dim, const std::vector<std::pair<int32, Real> > &pairs):
     dim_(dim),
     pairs_(pairs) {
   std::sort(pairs_.begin(), pairs_.end(),
             sparse_vector_utils::CompareFirst<Real>());
-  typename std::vector<std::pair<MatrixIndexT, Real> >::iterator
+  typename std::vector<std::pair<int32, Real> >::iterator
       out = pairs_.begin(), in = out,  end = pairs_.end();
   // special case: while there is nothing to be changed, skip over
   // initial input (avoids unnecessary copying).
@@ -273,7 +275,7 @@ void SparseVector<Real>::SetRandn(BaseFloat zero_prob) {
   KALDI_ASSERT(zero_prob >= 0 && zero_prob <= 1.0);
   for (MatrixIndexT i = 0; i < dim_; i++)
     if (WithProb(1.0 - zero_prob))
-      pairs_.push_back(std::pair<MatrixIndexT, Real>(i, RandGauss()));
+      pairs_.push_back(std::pair<int32, Real>(i, RandGauss()));
 }
 
 template <typename Real>
@@ -339,7 +341,7 @@ template<typename Real>
 Real SparseMatrix<Real>::FrobeniusNorm() const {
   Real squared_sum = 0;
   for (int32 i = 0; i < rows_.size(); ++i) {
-    const std::pair<MatrixIndexT, Real> *row_data = rows_[i].Data();
+    const std::pair<int32, Real> *row_data = rows_[i].Data();
     for (int32 j = 0; j < rows_[i].NumElements(); ++j) {
       squared_sum += row_data[j].second * row_data[j].second;
     }
@@ -367,7 +369,7 @@ void SparseMatrix<Real>::CopyToMat(MatrixBase<OtherReal> *other,
     for (MatrixIndexT row = 0; row < num_rows; row++, other_col_data++) {
       const SparseVector<Real> &svec = rows_[row];
       MatrixIndexT num_elems = svec.NumElements();
-      const std::pair<MatrixIndexT, Real> *sdata = svec.Data();
+      const std::pair<int32, Real> *sdata = svec.Data();
       for (MatrixIndexT e = 0; e < num_elems; e++)
         other_col_data[sdata[e].first * other_stride] = sdata[e].second;
     }
@@ -413,7 +415,7 @@ void SparseMatrix<Real>::CopyFromSmat(const SparseMatrix<OtherReal> &other,
       rows_[r].CopyFromSvec(other.Row(r));
     }
   } else {
-    std::vector<std::vector<std::pair<MatrixIndexT, Real> > > pairs(
+    std::vector<std::vector<std::pair<int32, Real> > > pairs(
         other.NumCols());
     for (MatrixIndexT i = 0; i < other.NumRows(); ++i) {
       for (int id = 0; id < other.Row(i).NumElements(); ++id) {
@@ -511,7 +513,7 @@ void SparseMatrix<Real>::AddToMat(BaseFloat alpha,
     for (MatrixIndexT row = 0; row < num_rows; row++, other_col_data++) {
       const SparseVector<Real> &svec = rows_[row];
       MatrixIndexT num_elems = svec.NumElements();
-      const std::pair<MatrixIndexT, Real> *sdata = svec.Data();
+      const std::pair<int32, Real> *sdata = svec.Data();
       for (MatrixIndexT e = 0; e < num_elems; e++)
         other_col_data[sdata[e].first * other_stride] +=
             alpha * sdata[e].second;
@@ -524,7 +526,7 @@ Real VecSvec(const VectorBase<Real> &vec,
              const SparseVector<Real> &svec) {
   KALDI_ASSERT(vec.Dim() == svec.Dim());
   MatrixIndexT n = svec.NumElements();
-  const std::pair<MatrixIndexT, Real> *sdata = svec.Data();
+  const std::pair<int32, Real> *sdata = svec.Data();
   const Real *data = vec.Data();
   Real ans = 0.0;
   for (MatrixIndexT i = 0; i < n; i++)
@@ -546,7 +548,7 @@ const SparseVector<Real> &SparseMatrix<Real>::Row(MatrixIndexT r) const {
 }
 
 template <typename Real>
-void SparseMatrix<Real>::SetRow(int32 r, const SparseVector<Real> &vec) {
+void SparseMatrix<Real>::SetRow(MatrixIndexT r, const SparseVector<Real> &vec) {
   KALDI_ASSERT(static_cast<size_t>(r) < rows_.size() &&
                vec.Dim() == rows_[0].Dim());
   rows_[r] = vec;
@@ -566,7 +568,7 @@ template<typename Real>
 SparseMatrix<Real>::SparseMatrix(const std::vector<int32> &indexes, int32 dim,
                                  MatrixTransposeType trans) {
   const std::vector<int32>& idx = indexes;
-  std::vector<std::vector<std::pair<MatrixIndexT, Real> > > pair(idx.size());
+  std::vector<std::vector<std::pair<int32, Real> > > pair(idx.size());
   for (int i = 0; i < idx.size(); ++i) {
     if (idx[i] >= 0) {
       pair[i].push_back( { idx[i], Real(1) });
@@ -587,7 +589,7 @@ SparseMatrix<Real>::SparseMatrix(const std::vector<int32> &indexes,
                                  MatrixTransposeType trans) {
   const std::vector<int32>& idx = indexes;
   const VectorBase<Real>& w = weights;
-  std::vector<std::vector<std::pair<MatrixIndexT, Real> > > pair(idx.size());
+  std::vector<std::vector<std::pair<int32, Real> > > pair(idx.size());
   for (int i = 0; i < idx.size(); ++i) {
     if (idx[i] >= 0) {
       pair[i].push_back( { idx[i], w(i) });
@@ -617,7 +619,7 @@ void SparseMatrix<Real>::Swap(SparseMatrix<Real> *other) {
 template<typename Real>
 SparseMatrix<Real>::SparseMatrix(
     MatrixIndexT dim,
-    const std::vector<std::vector<std::pair<MatrixIndexT, Real> > > &pairs):
+    const std::vector<std::vector<std::pair<int32, Real> > > &pairs):
     rows_(pairs.size()) {
   MatrixIndexT num_rows = pairs.size();
   for (MatrixIndexT row = 0; row < num_rows; row++) {
@@ -719,7 +721,7 @@ Real TraceMatSmat(const MatrixBase<Real> &A,
       Real col_sum = 0.0;
       const SparseVector<Real> &svec = B.Row(i);
       MatrixIndexT num_elems = svec.NumElements();
-      const std::pair<MatrixIndexT, Real> *sdata = svec.Data();
+      const std::pair<int32, Real> *sdata = svec.Data();
       for (MatrixIndexT e = 0; e < num_elems; e++)
         col_sum += A_col_data[Astride * sdata[e].first] * sdata[e].second;
       sum += col_sum;
@@ -1163,11 +1165,11 @@ void GeneralMatrix::AddToMat(BaseFloat alpha, MatrixBase<BaseFloat> *mat,
 }
 
 template <class Real>
-Real SparseVector<Real>::Max(int32 *index_out) const {
+Real SparseVector<Real>::Max(MatrixIndexT *index_out) const {
   KALDI_ASSERT(dim_ > 0 && pairs_.size() <= static_cast<size_t>(dim_));
   Real ans = -std::numeric_limits<Real>::infinity();
   int32 index = 0;
-  typename std::vector<std::pair<MatrixIndexT, Real> >::const_iterator
+  typename std::vector<std::pair<int32, Real> >::const_iterator
       iter = pairs_.begin(), end = pairs_.end();
   for (; iter != end; ++iter) {
     if (iter->second > ans) {
@@ -1218,7 +1220,7 @@ SparseVector<Real>::SparseVector(const VectorBase<Real> &vec) {
   for (MatrixIndexT i = 0; i < dim; i++) {
     Real val = ptr[i];
     if (val != 0.0)
-      pairs_.push_back(std::pair<MatrixIndexT,Real>(i,val));
+      pairs_.push_back(std::pair<int32,Real>(i,val));
   }
 }
 
diff --git a/src/matrix/sparse-matrix.h b/src/matrix/sparse-matrix.h
index 76f77f531d5..2b5723bd244 100644
--- a/src/matrix/sparse-matrix.h
+++ b/src/matrix/sparse-matrix.h
@@ -68,22 +68,22 @@ class SparseVector {
   // If all the elements stored were negative and there underlying vector had
   // zero indexes not listed in the elements, or if no elements are stored, it
   // will return the first un-listed index, whose value (implicitly) is zero.
-  Real Max(int32 *index) const;
+  Real Max(MatrixIndexT *index) const;
 
   /// Returns the number of nonzero elements.
   MatrixIndexT NumElements() const { return pairs_.size(); }
 
   /// get an indexed element (0 <= i < NumElements()).
-  const std::pair<MatrixIndexT, Real> &GetElement(MatrixIndexT i) const {
+  const std::pair<int32, Real> &GetElement(MatrixIndexT i) const {
     return pairs_[i];
   }
 
   // returns pointer to element data, or NULL if empty (use with NumElements()).
-  std::pair<MatrixIndexT, Real> *Data();
+  std::pair<int32, Real> *Data();
 
   // returns pointer to element data, or NULL if empty (use with NumElements());
   // const version
-  const std::pair<MatrixIndexT, Real> *Data() const;
+  const std::pair<int32, Real> *Data() const;
 
   /// Sets elements to zero with probability zero_prob, else normally
   /// distributed.  Useful in testing.
@@ -95,7 +95,7 @@ class SparseVector {
 
   // constructor from pairs; does not assume input pairs are sorted and uniq
   SparseVector(MatrixIndexT dim,
-               const std::vector<std::pair<MatrixIndexT, Real> > &pairs);
+               const std::vector<std::pair<int32, Real> > &pairs);
 
   // constructor from a VectorBase that keeps only the nonzero elements of 'vec'.
   explicit SparseVector(const VectorBase<Real> &vec);
@@ -115,7 +115,7 @@ class SparseVector {
   MatrixIndexT dim_;
   // pairs of (row-index, value).  Stored in sorted order with no duplicates.
   // For now we use std::vector, but we could change this.
-  std::vector<std::pair<MatrixIndexT, Real> > pairs_;
+  std::vector<std::pair<int32, Real> > pairs_;
 };
 
 
@@ -181,8 +181,8 @@ class SparseMatrix {
   // Posterior. indexed first by row-index; the pairs are (column-index, value),
   // and the constructor does not require them to be sorted and uniq.
   SparseMatrix(
-      int32 dim,
-      const std::vector<std::vector<std::pair<MatrixIndexT, Real> > > &pairs);
+      MatrixIndexT dim,
+      const std::vector<std::vector<std::pair<int32, Real> > > &pairs);
 
   /// Sets up to a pseudo-randomly initialized matrix, with each element zero
   /// with probability zero_prob and else normally distributed- mostly for
@@ -196,7 +196,7 @@ class SparseMatrix {
   const SparseVector<Real> &Row(MatrixIndexT r) const;
 
   /// Sets row r to "vec"; makes sure it has the correct dimension.
-  void SetRow(int32 r, const SparseVector<Real> &vec);
+  void SetRow(MatrixIndexT r, const SparseVector<Real> &vec);
 
   /// Select a subset of the rows of a SparseMatrix.
   /// Sets *this to only the rows of 'smat_other' that are listed
diff --git a/src/tensor/array-ref.h b/src/tensor/array-ref.h
index fee280e5b1a..47cc96a31b7 100644
--- a/src/tensor/array-ref.h
+++ b/src/tensor/array-ref.h
@@ -1,3 +1,26 @@
+// tensor/array-ref.h
+
+//  Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include <base/kaldi-error.h>
+#include <tensor/tensor-common.h>
+
+
 /**
    This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
 */
@@ -13,24 +36,42 @@ namespace tensor {
 // ArrayRef has only two members and it will probably make sense to pass it by
 // value most of the time.
 template <typename T>
-struct ArrayRef {
-  // Note:
-  uint64_t size;
-  const int64_t *data;
+struct ArrayRef final {
+  const T *data;
+  size_t size;
 
-  // Lots to do here.
-
-  inline int64_t operator (uint64_t i) const {
+  inline const T& operator [] (uint64_t i) const {  // asserts i < size
     KALDI_ASSERT(i < size);
     return data[i];
   }
 
-  // cast to std::vector; for cases where you might need to
-  // change the contents or keep it more permanently.
-  operator std::vector<int64_t> () const;
-};
+  constexpr ArrayRef() : data(nullptr), size(0) { }
+
+  // Construct from one element; only its address is stored (no copy is made).
+  // Caution: this constructor allows you to evade 'const'.
+  constexpr ArrayRef(const T &element) : data(&element), size(1) { }
+
+  // Construct from data and size
+  constexpr ArrayRef(const T* data, size_t size): data(data), size(size) { }
 
+  /// Construct from a range.  Caution: this constructor allows
+  /// you to evade 'const'.
+  constexpr ArrayRef(const T* begin, const T* end): data(begin), size(end - begin) { }
 
+  /// Construct from a std::vector.
+  ArrayRef(const std::vector<T> &vec): data(vec.data()), size(vec.size()) { }
 
+  /// Construct from a C array.
+  template <size_t N>
+      constexpr ArrayRef(const T (&data)[N]): data(data), size(N) { }
 
+  /// Construct from a std::initializer_list (refers to its array; no copy).
+  constexpr ArrayRef(const std::initializer_list<T> &vec):
+      data(vec.begin()), size(vec.size()) { }
+
+  // We will add iterators later if they are needed.
 };
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/storage.h b/src/tensor/storage.h
index cd11fac022c..434880f5192 100644
--- a/src/tensor/storage.h
+++ b/src/tensor/storage.h
@@ -1,31 +1,59 @@
-#include "tensor/tensor-common.h"
+// tensor/storage.h
 
-/**
-   This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
-*/
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
 
-namespace kaldi {
-namespace tensor {
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
 
+#ifndef KALDI_TENSOR_STORAGE_H_
+#define KALDI_TENSOR_STORAGE_H_ 1
 
+#include "tensor/tensor-common.h"
+
+namespace kaldi {
+namespace tensor {
 
 
 // 'Storage' contains a single allocated region (on CPU or GPU, according
 // to 'device').
 struct Storage {
+  using DeallocatorFunc = std::function<void()>;
+
   void *data;
   size_t num_bytes;
   Device device;
+  // 'deallocator' to be used with external toolkits, for example, to decrease
+  // the refcount
+  DeallocatorFunc deallocator;
+
+  // 'device' and 'deallocator' have default constructors.
+  Storage(): data(NULL), num_bytes(0) { }
 
-  // Note: will throw if allocation fails (for now).
+  // This constructor tries to allocate the requested data on the specified
+  // device.  It will throw if allocation fails (for now).
   Storage(Device device, size_t num_bytes);
 
-  // Destructor deallocates 'data'.  For now there is no
-  // concept of a custom allocator or an allocator object, we just use our CuDevice stuff for cuda
-  // allocation and posix_memalign for CPU allocation (obviously we need
-  // to make sure 'data' is aligned in most specific way we might need).
-  // in future we might choose
-  // to add that.
+
+  Storage(Device device, DeallocatorFunc deallocator):
+      data(NULL), num_bytes(0),
+      device(device),
+      deallocator(deallocator) { }
+
+  // If 'deallocator' is non-NULL (only true for external-to-Kaldi tensors such
+  // as NumPy), the destructor calls it; otherwise it deallocates 'data' (the
+  // method of deallocation depends on 'device').
   ~Storage();
 };
 
@@ -33,4 +61,7 @@ struct Storage {
 
 
 
-};
+}  // namespace tensor
+}  // namespace kaldi
+
+#endif  // KALDI_TENSOR_STORAGE_H_
diff --git a/src/tensor/tensor-common.h b/src/tensor/tensor-common.h
index aa59367c518..3d778cc4446 100644
--- a/src/tensor/tensor-common.h
+++ b/src/tensor/tensor-common.h
@@ -1,3 +1,28 @@
+// tensor/tensor-common.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_TENSOR_COMMON_H_
+#define KALDI_TENSOR_TENSOR_COMMON_H_ 1
+
+#include <cstdint>
+#include <vector>
+
 /**
    This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
 */
@@ -5,6 +30,11 @@
 namespace kaldi {
 namespace tensor {
 
+typedef int64_t int64;
+typedef uint64_t uint64;
+typedef int32_t int32;
+typedef uint32_t uint32;
+
 
 
 enum {
@@ -17,7 +47,12 @@ enum {
 // once we support multiple GPUs.
 struct Device {
   DeviceType device_type;
-  // operator ==, probably, maybe constructors.
+
+  Device(): device_type(kCpuDevice) { }
+  Device(DeviceType t): device_type(t) { }
+
+  // TODO: operator ==
+  // maybe in future we'll make a way to set the default device.
 };
 
 
@@ -48,7 +83,15 @@ enum InitializePolicy {
 };
 
 
-#define KALDI_TENSOR_MAX_DIM 5
+// In practice we don't expect user-owned tensors with dims greater than 5 to
+// exist, but there are certain manipulations we do when simplifying matrix
+// multiplications that temporarily add an extra dimension, and it's most
+// convenient to just increase the maximum.
+#define KALDI_TENSOR_MAX_DIM 6
 
 
-};
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_TENSOR_COMMON_H_
diff --git a/src/tensor/tensor-functions.h b/src/tensor/tensor-functions.h
index a2d357a4e9f..8f21fc61674 100644
--- a/src/tensor/tensor-functions.h
+++ b/src/tensor/tensor-functions.h
@@ -81,7 +81,8 @@ std::shared_ptr<Tensor> View(const Tensor &src, ArrayRef<int64_t> dims);
 
 /**
    Returns a Tensor with a new view of the data in 'src', in which the axes
-   numbered axis1 and axis1 + 1 are merged.
+   numbered axis1 and axis1 + 1 are merged.  This is just a special case
+   of View().
 
    For example, if 'src' is a Tensor with dims (3,4,5) and you call
    MergeAxes(src, 1), this funtion will merge axes 1 and 2 and return a Tensor
@@ -91,12 +92,12 @@ std::shared_ptr<Tensor> View(const Tensor &src, ArrayRef<int64_t> dims);
    is a common special case of what the function 'View' can give you.
 
    If the pattern of 'src' makes the requested merging impossible,
-   this function will return nullptr.  (This may be the case if, for
-   instance, the two axes in question were previously transposed).
-   In that case the caller will probably want to construct a temporary
-   Tensor 'temp' passing src.Dims() in the constructor, copy the data
-   in 'src' to 'temp', and then call MergeAxes on 'temp, which is
-   guaranteed to work.
+   this function will return NULL.  (This will happen if, in the
+   Tensor 'src', stride[axis1] != stride[axis1+1] * dim[axis1+1]).
+
+   If this function returns NULL then the caller will probably want to construct
+   a temporary Tensor 'temp' passing src.Dims() in the constructor, copy the
+   data in 'src' to 'temp', and then call MergeAxes on 'temp'.
 
        @param [in]  src   The Tensor which whose axes we will attempt to
                           merge
@@ -110,14 +111,17 @@ std::shared_ptr<Tensor> View(const Tensor &src, ArrayRef<int64_t> dims);
 std::shared_ptr<Tensor> MergeAxes(const Tensor &src, int64_t axis1);
 
 /**
+   Returns a Tensor with a new view of the data in 'src', in which the
+   specified axis is split into two axes.  This is just a special case
+   of View().
+
    Returns a Tensor in which the axis numbered 'axis' is split into
    two axes, with dimensions respectively 'dim1' and 'dim2'.  The
    interpretation will be as for a "C" array; so, for instance,
    if the dimensions of 'src' were (10,12) and you called
-   `SplitAxis(src, 1, 3, 4)`, the indexes along axis 1 would
-   be interpreted as 3 blocks of size 4.  This is a common
-   special case of what the function 'View' can do.
-
+   `SplitAxis(src, 1, 3, 4)` resulting in a Tensor of dimensions
+   (10,3,4), the indexes along the original axis of dimension 12 would be
+   interpreted as 3 blocks of size 4.
 
       @param [in] src  The Tensor whose axis is to be split.
       @param [in] axis  The index of the axis to be split; must
diff --git a/src/tensor/tensor-pattern.cc b/src/tensor/tensor-pattern.cc
index 933fb5f3ccb..7d76007eac7 100644
--- a/src/tensor/tensor-pattern.cc
+++ b/src/tensor/tensor-pattern.cc
@@ -1,18 +1,105 @@
+// tensor/tensor-pattern.cc
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
 #include "tensor/tensor-pattern.h"
 
-/**
-   This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
-*/
 
 namespace kaldi {
 namespace tensor {
 
+bool TensorPattern::Check() {  // returns true iff all TensorPattern requirements hold
+  if (num_axes < 0 || num_axes > KALDI_TENSOR_MAX_DIM)
+    return false;
+  for (int32 axis = 0; axis < num_axes; axis++) {
+    int32 dim = dims[axis], stride = strides[axis];
+    // All dims must be positive.  (We have no concept of
+    // an empty tensor, you would use NULL, or None, to represent
+    // that.)
+    if (dim <= 0)
+      return false;
+    // If dim == 1, stride must be zero.  Otherwise, stride must be nonzero.
+    if (dim == 1) {
+      if (stride != 0) return false;
+    } else {
+      if (stride == 0) return false;
+    }
+  }
 
+  {
+    // Now check for potential overlap.  We take all the axes with dim != 1 and
+    // sort them from greatest to least stride, and check that for each i,
+    // abs(strides[i]) >= dims[i+1] * abs(strides[i+1]).
+    std::pair<int32, int32> dims_abs_strides [KALDI_TENSOR_MAX_DIM];
+    int32 num_nontrivial_axes = 0;
+    for (int32 i = 0; i < num_axes; i++) {
+      if(dims[i] != 1) {
+        dims_abs_strides[num_nontrivial_axes].first = dims[i];
+        dims_abs_strides[num_nontrivial_axes].second = std::abs(strides[i]);
+        num_nontrivial_axes++;
+      }
+    }
+    // We want to sort on strides from greatest to least, so use '>' not
+    // '<' as the comparator.
+    std::sort(dims_abs_strides, dims_abs_strides + num_nontrivial_axes,
+              [](const std::pair<int32, int32> &a, const std::pair<int32, int32> &b) {
+                return a.second > b.second;
+              });
+    for (int32 i = 0; i + 1 < num_nontrivial_axes; i++) {
+      // Adjacent pairs only (i+1 must stay in range); multiply in 64 bits to avoid overflow.
+      if (dims_abs_strides[i].second <
+          static_cast<int64>(dims_abs_strides[i+1].first) * dims_abs_strides[i+1].second)
+        return false;
+    }
+  }
+  return true;
+}
 
 
+void TensorPatternProperties::UpdateProperties(const TensorPattern &pattern) {
+  KALDI_PARANOID_ASSERT(TensorPattern(pattern).Check());  // Check() is non-const: validate a copy
+  int32 num_axes = pattern.num_axes;
+  int64 dim_prod = 1;
+  bool c_strides = true;
+  // 'element_range' is the distance (in elements) between the
+  // first and last elements of the array.
+  int64 element_range = 0;
+  for (int32 i = num_axes - 1; i >= 0; i--) {
+    int32 dim = pattern.dims[i], stride = pattern.strides[i];
+    if (dim != 1) {
+      if (pattern.strides[i] != dim_prod)
+        c_strides = false;
+      element_range += std::abs(static_cast<int64>(stride) *
+                                static_cast<int64>(dim - 1));
+    }
+    dim_prod *= dim;
+  }
+  this->num_elements = dim_prod;
+  this->has_c_strides = c_strides;
+  if (has_c_strides) {
+    KALDI_PARANOID_ASSERT(element_range + 1 == num_elements);
+    this->is_contiguous = true;
+  } else {
+    KALDI_PARANOID_ASSERT(element_range + 1 >= num_elements);  // gaps can only widen the range
+    this->is_contiguous = (element_range + 1 == num_elements);
+  }
 }
 
 
-
-}
-}
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/tensor-pattern.h b/src/tensor/tensor-pattern.h
index d5e2b4ffbba..bbbc06d447f 100644
--- a/src/tensor/tensor-pattern.h
+++ b/src/tensor/tensor-pattern.h
@@ -1,3 +1,25 @@
+// tensor/tensor-pattern.h
+
+//  Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_TENSOR_PATTERN_H_
+#define KALDI_TENSOR_TENSOR_PATTERN_H_ 1
+
 #include "tensor/tensor-common.h"
 
 /**
@@ -49,29 +71,29 @@ enum RangeEnum { all };
 
 */
 struct Range {
-  int64_t begin;
-  int64_t end;
-  int64_t step;
+  int32 begin;
+  int32 end;
+  int32 step;
 
-  static inline int64_t inf() { return std::numeric_limits<int64_t>::max(); }
+  static inline int32 inf() { return std::numeric_limits<int32>::max(); }
 
   // The default constructor leaves the range undefined.
   Range() { }
 
   Range(RangeEnum): begin(0), end(inf()), step(1) { }
 
-  explicit Range(int64_t end): begin(0), end(end), step(1) { }
+  explicit Range(int32 end): begin(0), end(end), step(1) { }
 
-  Range(int64_t begin, int64_t end, int64_t step = 1):
-      begin(begin), end(end), step(1) { }
+  Range(int32 begin, int32 end, int32 step = 1):
+      begin(begin), end(end), step(step) { }  // honor the caller-supplied 'step'
 
-  Range(int64_t begin, RangeEnum, int64_t step = 1):
+  Range(int32 begin, RangeEnum, int32 step = 1):
       begin(begin), end(inf()), step(step) { }
 
-  Range(RangeEnum, int64_t end, int64_t step = 1):
-      begin(inf), end(end), step(step) { }
+  Range(RangeEnum, int32 end, int32 step = 1):
+      begin(inf()), end(end), step(step) { }
 
-  Range(RangeEnum, RangeEnum, int64_t step = 1):
+  Range(RangeEnum, RangeEnum, int32 step = 1):
       begin(inf()), end(inf()), step(step) { }
 };
 
@@ -91,32 +113,10 @@ struct RangeExt: public Range {
   RangeExt(Range r): Range(r) { }
 
   // implicit
-  RangeExt(int64_t index):
-      Range(index, 0, inf());
+  RangeExt(int32 index):
+      Range(index, 0, inf()) { }  // a constructor definition needs a body, not ';'
 };
 
-/**
-enum IndexingType{
-  kIndexingTypeRange,
-  kIndexingTypeNumber,
-  kIndexingTypeTensor
-};
-
-// This struct is used when indexing with mixed types. Ror
-// example:
-// Tensor a(...), b(...);
-// Tensor c = a(1, b, Range(0,-1), Range(all));
-
-struct IndexingArg {
-  IndexingArg(const Tensor &tensor);
-  IndexingArg(int64_t index);
-  IndexingArg(Range range);
-
-  IndexingType itype;
-  int64_t index;
-  std::shared_ptr<Tensor> tensor {nullptr};
-  Range range;
-  };*/
 
 /**
    This function, used in indexing operations, takes a Range that may have, say,
@@ -127,8 +127,7 @@ struct IndexingArg {
 
    Raises an exception the resulting range is empty.
  */
-void MakeRangeExplicit(Range *range, int64_t dim);
-
+void MakeRangeExplicit(int32 dim, Range *range);
 
 
 /*
@@ -146,7 +145,7 @@ void MakeRangeExplicit(Range *range, int64_t dim);
 
   Our requirements on a TensorPattern are:
 
-    0 <= num_axes <= 5
+    0 <= num_axes <= KALDI_TENSOR_MAX_DIM
     for 0 <= axis < num_axes:
        dims[i] > 0
        if dims[i] = 1, then strides[i] = 0.
@@ -157,16 +156,19 @@ void MakeRangeExplicit(Range *range, int64_t dim);
   the same memory location via two different tuples of indexes).
   Recause testing this property exactly would be difficult in general
   without bringing in number theory, we test a slightly stronger version
-  of it that covers all cases we are likely to encounter.
+  of it that covers all cases we are likely to encounter.  This is
+  that, if we take all the axes with dim != 1 and sort them from greatest
+  to least stride, for each i, abs(strides[i]) >= dims[i+1] * abs(strides[i+1]).
 */
 struct TensorPattern {
-  int64_t num_axes;
-  int64_t dims[KALDI_TENSOR_MAX_DIM];
-  int64_t strides[KALDI_TENSOR_MAX_DIM];
+  int32 num_axes;
+  int32 dims[KALDI_TENSOR_MAX_DIM];
+  int32 strides[KALDI_TENSOR_MAX_DIM];
   // We may later add methods to this.
 
   // Checks that the TensorPattern is valid, assuming it is part of a Tensor.
   // I.e. that it satifies all the properties mentioned above.
+  // Returns true if valid, false if not valid.
   bool Check();
 };
 
@@ -176,194 +178,30 @@ struct TensorPatternProperties {
 
   // The number of elements in the Tensor, which equals the product
   // of dims[0] .. dims[num_axes - 1].  Will always be >0.
-  int64_t num_elements;
+  int64 num_elements;
 
   // is_contiguous means that the data form a contiguous block in memory; it is
   // not the same as PyTorch's is_contiguous which is a stronger condition,
   // implying also that the strides are as for a `C-style` array.
   bool is_contiguous;
 
-  // has_c_strides means that the strides are as if this was a "c"-style
-  // multidimensional array, meaning that (using Python wrap-around indexing
-  // conventions as if strides was an array with 'num_axes' axes),
-  // strides[-1] == 1, strides[-1] == dims[-1], strides[-2] = dims[-1] *
-  // dims[-1], and so on.  This is the same as PyTorch's is_contiguous.
+  // has_c_strides means that the strides of all axes i with dim[i] != 1,
+  // equal the product of all later-numbered dims, i.e.
+  // \f$ strides[i] = \prod_{j>i} dim[j] \f$, or `strides[i] = 0` if
+  // dim[i] == 1 (since we use the convention that axes with dim=1 always
+  // have stride=0).
+  // has_c_strides is the equivalent of PyTorch's is_contiguous.
   // this->has_c_strides implies this->is_contiguous.
   bool has_c_strides;
 
+  // Sets the members of *this to be the properties of pattern 'pattern'.
+  // Ignores the previously existing values of *this.
   void UpdateProperties(const TensorPattern &pattern);
 };
 
 
-/**
-   Compresses a TensorPattern by removing or combining as many axes as possible.
-   This version is suitable for 'flat' operations that do not rely on any kind
-   of structure, such as zeroing or nonlinearities; the only equivalence
-   maintained is equivalence of the set of memory locations covered.
-   The order of the (dim,stride) pairs in the input does not affect the
-   output.  The output (dim,stride) pairs will be ordered from
-   greatest to least stride (note: all output strides will be positive).
-
-      @param [in]   src   The pattern to be compressed
-      @param [in]  src_properties  Properties of 'src'; required to
-                          be accurate (behavior is undefined otherwise,
-                          e.g. if you provide some other pattern's properties).
-      @param [out] dest   A simplified-as-much-as-possible pattern that
-                          covers the same set of memory locations as 'src' (when
-                          combined with the offset below).  'dest' will
-                          contain only nonnegative strides.
-      @param [out] data_offset  A number that we would have to add to
-                          the data pointer of the source Tensor so
-                          that 'dest' would cover the same set of
-                          elements.  It will always be zero if 'src'
-                          was free of negative strides.
-
-   Examples are below, where we write a TensorPattern as
-    `{{dim1,dim2,..}, {stride1,stride2,..}}`.
-
-\verbatim
-   Input pattern             Output pattern            Output offset
-     {{10},{1}}               {{10},{1}}                  0
-    {{3,4},{4,1}}             {{12},{1}}                  0
-    {{4,3},{1,4}}             {{12},{1}}                  0
-    {{9},{-1}}                {{9},{1}}                  -8
-   {2,3,4},{100,4,1}        {{2,12},{100,1}}              0
-\endverbatim
- */
-void CompressPatternFlat(const TensorPattern &src,
-                         const TensorPatternProperties &src_properties,
-                         TensorPattern *dest,
-                         int64_t *data_offset);
-
-/*
-  Compress two TensorPatterns by combining axes (and possibly
-  flipping the sign of their strides and changing the data offset)
-  The type of compression involved is the same as for CompressPatternFlat
-  (meaning we are doing some kind of operation that doesn't care about
-a  the structure, such as an element-by-element nonlinearity).
-
-  The difference from calling CompressPatternFlat() twice is that this function
-  is only allowed to do the same operation to src1 and src2, e.g. if combining
-  two axes of src1 we would also have to combine the same two axes of src2.
-
-    @param [in] src1  The first source pattern.
-    @param [in] src2  The second source pattern.  The assumption is that src1
-                     and src2 are participating in some kind of operation like
-                     copying, or elementwise multiplication.  The patterns
-                     must satisfy src1.NumAxes() == src2.NumAxes(), and
-                     for each axis, either src1.dims[axis] == src2.dims[axis],
-                     or one of those two dimensions equals 1 (so there would be
-                     some kind of broadcasting).  The
-    @param [out] dest1  Compressed pattern out corresponding to src1.  Will
-                     be free of negative strides (but dest2 might not be).
-    @param [out] dest_offset1  Data offset that we'd need to add to src1's
-                     data pointer before using the pattern 'dest1'
-    @param [out] dest1  Compressed pattern out corresponding to src2.
-                     Might not be free of negative strides if some dimensions
-                     of src1/src2 had strides of opposite sign.
-    @param [out] dest_offset1  Data offset that we'd need to add to src1's
-                     data pointer before using the pattern 'dest1'
-
-  TODO: examples
- */
-void CompressPatternsFlat(const TensorPattern &src1,
-                          const TensorPattern &src2,
-                          TensorPattern *dest1,
-                          int64_t *data_offset1
-                          TensorPattern *dest2,
-                          int64_t *data_offset2);
-
-
-/**
-   Compresses a TensorPattern by removing or combining as many axes as possible,
-   while respecting certain invariances that are relevant when constructing
-   'views' ('view' is PyTorch terminology; the NumPy equivalent is 'reshape').
-   The "C" in the function name refers to C-style arrays.
-
-    This function removes axes with dim=1.
-
-    This function combines successive axes if the relationship of their
-    dims and strides is what you would expect in a "C"-style array.
-    Suppose that in 'src' we had two successive axes with dims and
-    strides (dim_a, dim_b) and (stride_a, stride_b), with dim_a > 1 and
-    dim_b > 1.  If stride_a == stride_b * dim_b, then this function
-    will merge them into a single axis with dimension (dim_a * dim_b)
-    and stride stride_b.
-
-   The output pattern 'dest' is what you get if you keep applying the
-   rules above until no further change is made.
-
-   Examples are below, where we write a TensorPattern as
-  `   {{dim1,dim2,..}, {stride1,stride2,..}}`.
-\verbatim
-   Input pattern             Output pattern
-     {{10},{1}}               {{10},{1}}
-    {{5,1},{1,1}}             {{5},{1}}
-    {{9},{-1}}                {{9},{-1}}
-   {2,3,4},{100,4,1}        {{2,12},{100,1}}
-   {2,3,4},{100,-4,-1}        {{2,12},{100,-1}}
-\endverbatim
- */
-void CompressPatternC(const TensorPattern &src,
-                      const TensorPatternProperties &src_properties,
-                      TensorPattern *dest);
-
-
-
-/**
-   Creates a TensorPattern corresponding to a requested 'view' of the matrix.
-   ('view' is PyTorch terminology; the NumPy equivalent is 'reshape').
-
-   The PyTorch/NumPy semantics are (I believe) as follows: Firstly, a view
-   can/should only be created for a tensor whose layout in memory is as for a
-   "C" array; suppose that the shape of array a is (9, 8), a "C" layout would
-   imply strides of (8, 1).  A 'view' of this array simply implies interpreting
-   the same block of memory as a "C" array with some other sequence of
-   dimensions, say (3, 3, 8) or (8, 9) or (1, 72); any sequence whose product
-   matches the number of elements in "a".
-
-   Our semantics of "view" is the same as that of PyTorch/NumPy except that we
-   impose fewer constraints on what strides the input Tensor cmay have.  Let the
-   'view' of the array 'a' be 'b'.  As long as it is possible to find a tensor
-   pattern for 'b' that would lead to the same relationship between the elements
-   of 'a' and 'b' as what you would get by asking for the same "view" in
-   PyTorch/NumPy assuming 'a' had had "C"-style strides (viewed in terms of
-   indexed elements of and b, without regard to the physical memory layout), we
-   allow it.
-
-
-   Notes on implementation (glossing over ones in 'dims' which are easy to
-   handle as a special case): we would first call CompressPattern on
-   'pattern_in'.  Then we would attempt to find a correspondence with
-   the dimensions of this compressed pattern and a partition of the
-   sequence 'dims'.  For example, suppose the compressed pattern
-   is (100, 9) and dims is (50, 2, 3, 3), then the partition would
-   be (50, 2), (3, 3).  If this is not possible (e.g. if dims
-   had been (30,10,3) instead), we return false.
-
-   @param [in]  pattern_in   The input pattern for which we are trying to
-                          find an alternative view
-   @param [in]  dims  The sequence of dimensions corresponding to the
-                      desired view.  Its product must be the same as the
-                      product of pattern_in.dims.
-   @param [out] pattern_out  The output pattern, if we were
-                      successful (otherwise undefined).  Its 'dims'
-                      will be the same as 'dims'.
-   @return           Returns true on success (i.e. such a view existed),
-                     and false otherwise.  This function will never return
-                     false if 'pattern_in' had strides as for a "C" array
-                     (i.e., if its properties' has_c_strides was true).
-
- */
-bool CreateViewPattern(const TensorPattern &pattern_in,
-                       ArrayRef<int64_t> dims,
-                       TensorPattern *pattern_out);
-
-
-
-
-};
+}  // namespace tensor
+}  // namespace kaldi
 
 
-}
-}
+#endif  // KALDI_TENSOR_TENSOR_PATTERN_H_
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index 31331d2c142..f479eac6c91 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -1,6 +1,28 @@
-/**
-   This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
-*/
+// tensor/tensor.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_TENSOR_H_
+#define KALDI_TENSOR_TENSOR_H_ 1
+
+#include "tensor/tensor-common.h"
+#include "tensor/tensor-pattern.h"
+#include "tensor/storage.h"
 
 namespace kaldi {
 namespace tensor {
@@ -23,7 +45,7 @@ class Tensor {
   // contexts, this is sometimes known as the rank of the tensor, or sometimes
   // even its dimension, but these terms are ambiguous so we avoid them, and use
   // the terms 'number of axes' or 'axis' throughout.
-  inline int64_t NumAxes() const { return pattern_.num_axes; }
+  inline int32 NumAxes() const { return pattern_.num_axes; }
 
   // Return reference to the struct containing the dimension and
   // stride info.
@@ -31,22 +53,25 @@ class Tensor {
 
   // Return an array containing dimensions of the tensor; equivalent to
   // .shape in PyTorch.  Dims().size() will equal NumAxes().
-  inline ArrayRef<int64_t> Dims() const { return ArrayRef{pattern_.num_axes, pattern_.dims}; }
+  // We limit each dimension to int32, because BLAS's interface uses int,
+  // which on many common 64-bit platforms is configured with 32 bits.
+  // However the product of dimensions may still be 64 bits.
+  inline ArrayRef<int32> Dims() const { return ArrayRef<int32>(pattern_.dims, pattern_.num_axes); }
 
   // Returns the dimension on this axis, a number >= 1.  Result is
   // undefined if axis < 0 or axis >= NumAxes().
-  inline int64_t Dim(int64_t axis) const { return pattern_.dims[axis]; }
+  inline int32 Dim(int32 axis) const { return pattern_.dims[axis]; }
 
   // Returns an array containing the strides of the tensor.
   // Strides().size() will equal NumAxes().
-  inline ArrayRef<int64_t> Strides() const { return ArrayRef{pattern_.num_axes, pattern_.strides}; }
+  inline ArrayRef<int32> Strides() const { return ArrayRef<int32>(pattern_.strides, pattern_.num_axes); }
 
   // Returns the stride on this axis.  Will be zero if the corresponding
   // dimension is 1, and otherwise nonzero (but not necessarily positive).
-  inline int64_t Stride(int64_t axis) const { return pattern_.strides[axis]; }
+  inline int32 Stride(int32 axis) const { return pattern_.strides[axis]; }
 
   // Returns the number of elements in the Tensor; must be > 0.
-  inline int64_t NumElements() const { return derived_.num_elements; }
+  inline int64 NumElements() const { return derived_.num_elements; }
 
   // Returns true if the data forms a contiguous block in memory.
   // (not the same as 'contiguous()' in PyTorch, which also requires
@@ -67,7 +92,7 @@ class Tensor {
   // underlying data as the original Tensor.  We could have done this with just
   // a single indexing operator taking 5 args of type RangeExt defaulting to
   // `all`, but we provide separate versions for each num-args for efficiency.
-  // You can provide an int64_t where RangeExt is expected; it will be
+  // You can provide an int32 where RangeExt is expected; it will be
   // converted to a special struct of type Range. See the documentation for type
   // Range, and the table which it contains.  If a is a Tensor with 1 axis, a(0)
   // will return a scalar Tensor (0 axes
@@ -75,10 +100,10 @@ class Tensor {
   // Any of these indexing operators can operate on Tensors with more axes;
   // trailing axes will be left alone.
 
-  // this operator () taking int64_t is only provided in the one-arg case as a
-  // convenience; in any case, RangeExt can be constructed from int64_t with the
+  // this operator () taking int32 is only provided in the one-arg case as a
+  // convenience; in any case, RangeExt can be constructed from int32 with the
   // same effect.
-  Tensor operator () (int64_t i0) const;
+  Tensor operator () (int32 i0) const;
   Tensor operator () (RangeExt s0) const;
   Tensor operator () (RangeExt s0, RangeExt s1) const;
   Tensor operator () (RangeExt s0, RangeExt s1, RangeExt s2) const;
@@ -98,8 +123,8 @@ class Tensor {
   // double (if it was not already double); throws if NumAxes() > 0.
   explicit operator double() const;
   // For a scalar Tensor (NumAxes() == 0) returns the item, cast to
-  // int64_t (if it was not already int64_t); throws if NumAxes() > 0.
-  explicit operator int64_t() const;
+  // int32 (if it was not already int32); throws if NumAxes() > 0.
+  explicit operator int32() const;
 
 
   // For a Tensor storing floats, returns the data pointer cast to float;
@@ -119,7 +144,7 @@ class Tensor {
 
   // Transpose the two axes by swapping their dims and strides without changing
   // the underlying data in memory.  This modifies *this;
-  void Transpose(int64_t axis1 = 0, int64_t axis2 = 1);
+  void Transpose(int32 axis1 = 0, int32 axis2 = 1);
 
 
   // Copy constructor that copies the metadata while sharing the underlying
@@ -158,12 +183,12 @@ class Tensor {
        @param [in] set_zero   If true, set the tensor to zero.  If false,
                         the contents will be undefined.
    */
-  Tensor(ArrayRef<int64_t> dims, DataType dtype, Device device,
+  Tensor(ArrayRef<int32> dims, DataType dtype, Device device,
          bool set_zero = false);
 
   /**
      Construct a Tensor with the dimensions and strides provided.  This differs
-     from the constructor taking `ArrayRef<int64_t> dims` in that it will use
+     from the constructor taking `ArrayRef<int32> dims` in that it will use
      the strides in `pattern` (except that if the data in `pattern` is not
      contiguous, it will make it contiguous by filling in any gaps).  This means
      that, for example, if you use this constructor on a 2-dimensional Tensor
@@ -204,93 +229,8 @@ class Tensor {
 
 
 
-/*
-  This is the 'gradient information' that class Variable stores for a Tensor
-  when it is initialized with requires_grad = true (or is a result of
-  an operation on Variables one of which had requires_grad = true).
-  This does not give you access to the underlying Variables; doing it
-  like this makes reference counting easier (no loops).  The GradFunc
-  will store any pointers to the original Variable that it may have
-  needed.
-
-  Users will rarely need to interact directly with this struct directly.
- */
-struct TensorGrad {
-  // The gradients corresponding to the input variables, which
-  // we may need to update.  Some subset of these may be nullptr,
-  // corresponding to input Variables for which no gradient
-  // was required.
-  std::vector<std::shared_ptr<TensorGrad> > inputs;
-
-  // is_view is
-  bool is_view{false};
-
-  // The device we
-  Device device;
-
-  // The dimension of the Tensor for which this is the gradient.  Used
-  // to set up 'grad' when needed.
-  TensorPattern dim;
-
-  // 'offset' is only inspected if this is a view; it is the offset
-  // (in elements) from the
-  // 'inputs' will just contain one member, which is the gradient for the source
-  // Variable, and we use 'dim' and 'offset' to construct the sub-tensor).
-  int64_t offset;
-
-  // This stores the gradient (if we already have one), or nullptr if not.
-  std::unique_ptr<Variable> grad{nullptr};
-
-};
-
-
-/**
-   class Variable is somewhat like class Tensor but augmented with autograd
-   machinery.  Because autograd requires a rather 'functional' way of doing
-   things (i.e. is not super friendly to in-place operations), the functions
-   that operate on class Variable will tend to be ones that return something,
-   rather than in-place operations.
-
-   The overall design is quite similar to PyTorch, and the structure
-   of the the C++ code is similar to flashlight.  If you are only familiar with
-   PyTorch's python frontend, class Variable is rougtly equivalent to what they
-   expose as af.tensor.
- */
-class Variable {
-  using GradFunc = std::function<
-    void(const std::vector<Variable>& inputs, TensorGrad *grad_output)>;
-  using GradHook = std::function<void(TensorGrad *grad)>;
+}  // namespace tensor
+}  // namespace kaldi
 
 
-
-  /** Constructor from a Tensor.
-       @param [in] data  Pointer to the source Tensor
-       @param [in] requires_grad    If requires_grad argument is true,
-                the gradient w.r.t. this Variable will be computed if and when
-                you call Backward() on a Variable that depends on it.
-                The same as requires_grad in PyTorch.
-  */
-  Variable(const std::shared_ptr<Tensor> &data, bool requires_grad);
-
-
-
-  /**
-   * Creates a Variable which wraps the array and inputs specified
-   * @param[in] data array to the stored in the Variable
-   * @param[in] inputs a vector specifying inputs for this Variable
-   * @param[in] gradFunc function specifying how to calculate gradient of the
-   * input Variables
-   */
-  Variable(std::shared_ptr<Tensor> &data, std::vector<Variable> inputs,
-           GradFunc gradFunc);
-
-
-
-};
-
-typedef std::unique_ptr<Storage>
-
-
-
-
-};
+#endif  // KALDI_TENSOR_TENSOR_H_

From 4cab3dbbfcb6a45d90cc6c8c679f4786e23e4e92 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Thu, 21 Mar 2019 15:59:04 -0400
Subject: [PATCH 010/163] Merged with 'master' (#3156)

* [scripts] Add fix regarding num-jobs for segment_long_utterances*.sh(#3130)

* [src] Enable allow_{upsample,downsample} with online features (#3139)

* [src] Fix bad assert in fstmakecontextsyms (#3142)

* [src] Fix to "Fixes to grammar-fst & LM-disambig symbols" (#3000) (#3143)

* [build] Make sure PaUtils exported from portaudio (#3144)

* [src] cudamatrix: fixing a synchronization bug in 'normalize-per-row' (#3145)

was only apparent using large matrices

* [src] Fix typo in comment (#3147)

* [src] Add binary that functions as a TCP server (#2938)

* [scripts] Fix bug in comment (#3152)
---
 .../steps/cleanup/segment_long_utterances.sh  |   9 +-
 .../cleanup/segment_long_utterances_nnet3.sh  |   9 +-
 egs/wsj/s5/utils/parse_options.sh             |   2 +-
 src/cudamatrix/cu-kernels.cu                  |  19 +-
 src/cudamatrix/cu-math-test.cc                |  49 +-
 src/decoder/grammar-fst.cc                    |   2 +-
 src/doc/online_decoding.dox                   |  64 +++
 src/feat/online-feature.cc                    |  98 +++-
 src/feat/online-feature.h                     |   5 +-
 src/feat/resample.h                           |   4 +
 src/fstext/context-fst.cc                     |   2 +-
 src/nnet3/decodable-online-looped.cc          |  14 +-
 src/nnet3/decodable-online-looped.h           |  16 +
 src/online2/online-feature-pipeline.h         |   2 +-
 src/online2/online-nnet2-feature-pipeline.h   |  14 +
 src/online2/online-nnet3-decoding.cc          |   7 +-
 src/online2/online-nnet3-decoding.h           |   8 +-
 src/online2/online2-feature-pipeline.cc       |  15 +
 src/online2bin/Makefile                       |   3 +-
 .../online2-tcp-nnet3-decode-faster.cc        | 442 ++++++++++++++++++
 src/onlinebin/online-gmm-decode-faster.cc     |   8 +-
 src/util/edit-distance-inl.h                  |   4 +-
 tools/extras/install_portaudio.sh             |  18 +-
 tools/extras/portaudio.patch                  |  21 +
 24 files changed, 771 insertions(+), 64 deletions(-)
 create mode 100644 src/online2bin/online2-tcp-nnet3-decode-faster.cc
 create mode 100644 tools/extras/portaudio.patch

diff --git a/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh b/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh
index c7e50ea165e..7a16bdcdb12 100755
--- a/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh
+++ b/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh
@@ -174,10 +174,17 @@ if [ $stage -le 3 ]; then
   cp $srcdir/phones.txt $dir 2>/dev/null || true
 
   mkdir -p $graph_dir
+  
+  n_reco=$(cat $text | wc -l) || exit 1
+  nj_reco=$nj
+
+  if [ $nj -gt $n_reco ]; then
+    nj_reco=$n_reco
+  fi
 
   # Make graphs w.r.t. to the original text (usually recording-level)
   steps/cleanup/make_biased_lm_graphs.sh $graph_opts \
-    --nj $nj --cmd "$cmd" $text \
+    --nj $nj_reco --cmd "$cmd" $text \
     $lang $dir $dir/graphs
   if [ -z "$utt2text" ]; then
     # and then copy it to the sub-segments.
diff --git a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh
index 751200bdf83..f0df1e7730c 100755
--- a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh
+++ b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh
@@ -235,10 +235,17 @@ if [ $stage -le 3 ]; then
 
   mkdir -p $graph_dir
 
+  n_reco=$(cat $text | wc -l) || exit 1
+  nj_reco=$nj
+
+  if [ $nj -gt $n_reco ]; then
+    nj_reco=$n_reco
+  fi
+
   # Make graphs w.r.t. to the original text (usually recording-level)
   steps/cleanup/make_biased_lm_graphs.sh $graph_opts \
     --scale-opts "$scale_opts" \
-    --nj $nj --cmd "$cmd" $text \
+    --nj $nj_reco --cmd "$cmd" $text \
     $lang $dir $dir/graphs
   if [ -z "$utt2text" ]; then
     # and then copy it to the sub-segments.
diff --git a/egs/wsj/s5/utils/parse_options.sh b/egs/wsj/s5/utils/parse_options.sh
index 34476fdb37a..335e69e9ac7 100755
--- a/egs/wsj/s5/utils/parse_options.sh
+++ b/egs/wsj/s5/utils/parse_options.sh
@@ -42,7 +42,7 @@ done
 
 
 ###
-### No we process the command line options
+### Now we process the command line options
 ###
 while true; do
   [ -z "${1:-}" ] && break;  # break if there are no arguments
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 17d56a05772..515412ca398 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -2552,9 +2552,12 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x,
   const int i = blockIdx.x;
   const int tid = threadIdx.x;
   const Real* x_row = x + i * x_d.stride;
+
   typedef cub::BlockReduce<Real, CU1DBLOCK> BlockReduceT;
   __shared__ typename BlockReduceT::TempStorage temp_storage;
-  __shared__ Real ssum[CU1DBLOCK];
+
+  __shared__ Real stddev_div_target_rms;
+  __shared__ Real scale;
 
   // Reduce x_j^2 to CU1DBLOCK elements per row
   Real tsum = Real(0);
@@ -2563,14 +2566,14 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x,
   }
   tsum = BlockReduceT(temp_storage).Sum(tsum);
   __syncthreads();
-  
 
-  const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
-  ssum[tid] = sqrt(
-    fmax(tsum / (target_rms * target_rms * x_d.cols), kSquaredNormFloor));
-
-  const Real stddev_div_target_rms = ssum[0];
-  const Real scale = Real(1) / stddev_div_target_rms;
+  if (tid == 0) {
+    const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
+    stddev_div_target_rms = sqrt(
+      fmax(tsum / (target_rms * target_rms * x_d.cols), kSquaredNormFloor));
+    scale = Real(1) / stddev_div_target_rms;
+  }
+  __syncthreads();
 
   // Store normalized input to output
   Real* y_row = y + i * y_stride;
diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc
index 09255c9587b..022742ed29f 100644
--- a/src/cudamatrix/cu-math-test.cc
+++ b/src/cudamatrix/cu-math-test.cc
@@ -545,6 +545,50 @@ static void UnitTestCuMathNormalizePerRow() {
   }
 }
 
+
+template<typename Real>
+static void UnitTestCuMathNormalizePerRow_v2() {
+  // Compares cu::NormalizePerRow with a CPU reference on one 128 x 1024 matrix.
+  int row = 128;
+  int col = 1024;
+
+  Matrix<Real> Hi(row,col);
+  Matrix<Real> Ho(row,col);
+  Hi.SetRandn();
+  Hi.Scale(5.0);
+  Hi.ApplyFloor(0.0); // floor at zero, like the output of a ReLU.
+
+  CuMatrix<Real> Di(row, col);
+  CuMatrix<Real> Do(row, col);
+  Di.CopyFromMat(Hi);
+
+  Real target_rms = 0.3456;
+  bool add_log_stddev = false;
+  const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66
+
+  // Result under test: computed via the CuMatrix code path.
+  cu::NormalizePerRow(Di, target_rms, add_log_stddev, &Do);
+
+  // Reference result: the same normalization done with plain Matrix/Vector ops.
+  {
+    MatrixBase<Real>& in(Hi);
+    MatrixBase<Real>& out(Ho);
+    Real target_rms=0.3456;  // shadows the outer target_rms (same value).
+    Vector<Real> in_norm(in.NumRows());
+    Real d_scaled = in.NumCols() * target_rms * target_rms;
+    in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0);
+    in_norm.ApplyFloor(kSquaredNormFloor);
+    in_norm.ApplyPow(-0.5);  // floored values -> per-row multipliers.
+    out.CopyFromMat(in);
+    out.MulRowsVec(in_norm);
+  }
+
+  Matrix<Real> Ho2(Do);
+  // This comparison is where the bug showed up, once the matrix was big enough.
+  AssertEqual(Ho,Ho2,0.00001);
+}
+
+
 template<typename Real>
 static void UnitTestCuDiffNormalizePerRow() {
   for (int32 i = 0; i < 2; i++) {
@@ -660,6 +704,7 @@ template<typename Real> void CudaMathUnitTest() {
   UnitTestEnsureNonzero<Real>();
   UnitTestBackpropLstmNonlinearity<Real>();
   UnitTestCuMathNormalizePerRow<Real>();
+  UnitTestCuMathNormalizePerRow_v2<Real>();
   UnitTestCuDiffNormalizePerRow<Real>();
 }
 
@@ -673,9 +718,9 @@ int main() {
   for (; loop < 2; loop++) {
     CuDevice::Instantiate().SetDebugStrideMode(true);
     if (loop == 0)
-      CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
+      CuDevice::Instantiate().SelectGpuId("no"); // "no": do not use a GPU
     else
-      CuDevice::Instantiate().SelectGpuId("yes"); // -2 .. automatic selection
+      CuDevice::Instantiate().SelectGpuId("yes"); // "yes": automatic selection
 #endif
     srand(time(NULL));
     kaldi::CudaMathUnitTest<float>();
diff --git a/src/decoder/grammar-fst.cc b/src/decoder/grammar-fst.cc
index ab1a8142c1d..1b79e7b5521 100644
--- a/src/decoder/grammar-fst.cc
+++ b/src/decoder/grammar-fst.cc
@@ -706,7 +706,7 @@ bool GrammarFstPreparer::IsEntryState(StateId s) const {
     // we check that at least one has label with nonterminal equal to #nonterm_begin...
     // in fact they will all have this value if at least one does, and this was checked
     // in NeedEpsilons().
-    if (nonterminal == kNontermBegin)
+    if (nonterminal == GetPhoneSymbolFor(kNontermBegin))
       return true;
   }
   return false;
diff --git a/src/doc/online_decoding.dox b/src/doc/online_decoding.dox
index 799bfb5895f..dc04d9bef4e 100644
--- a/src/doc/online_decoding.dox
+++ b/src/doc/online_decoding.dox
@@ -438,6 +438,70 @@ and downloadable models that can be used with online nnet3 decoding, please
 see http://kaldi-asr.org/models.html (the first model there, the ASPIRE model,
 includes instructions in a README file).
 
+\subsection online_decoding_nnet3_tcp TCP server for nnet3 online decoding
+
+The program to run the TCP server is online2-tcp-nnet3-decode-faster located in the
+~/src/online2bin folder. The usage is as follows:
+
+\verbatim
+online2-tcp-nnet3-decode-faster <nnet3-in> <fst-in> <word-symbol-table> <listen-port>
+\endverbatim
+
+For example:
+
+\verbatim
+online2-tcp-nnet3-decode-faster model/final.mdl graph/HCLG.fst graph/words.txt 5050
+\endverbatim
+
+The word symbol table is mandatory (unlike other nnet3 online decoding programs) because
+the server outputs word strings. Endpointing is mandatory to make the operation of the
+program reasonable. Other, non-standard options include:
+    - samp-freq - sampling frequency of audio (usually 8000 for telephony and 16000 for other uses)
+    - chunk-length - length of signal being processed by decoder at each step
+    - output-period - how often we check for changes in the decoding (ie. output refresh rate, default 1s)
+    - num-threads-startup - number of threads used when initializing iVector extractor
+
+The TCP protocol simply takes RAW signal on input (16-bit signed integer
+encoding at chosen sampling frequency) and outputs simple text using the following
+logic:
+    - each refresh period (output-period option) the current state of decoding is output
+    - each line is terminated by '\r'
+    - once an utterance boundary is detected due to endpointing a '\n' char is output
+
+Each output string (delimited by '\r') should be treated as uncertain and can change
+entirely until the utterance delimiter ('\n') is sent. The delimiter chars are chosen
+specifically in order to make the output look neat in the terminal. It is possible to
+use it with other interfaces and a web demo (HTML/JS AudioAPI+WebSockets) exists.
+
+To run the program from the terminal you can use one of the following commands. First,
+make sure the server is running and accepting connections. Using the Aspire models, the
+command should look like this:
+\verbatim
+online2-tcp-nnet3-decode-faster --samp-freq=8000 --frames-per-chunk=20 --extra-left-context-initial=0
+    --frame-subsampling-factor=3 --config=model/conf/online.conf --min-active=200 --max-active=7000
+    --beam=15.0 --lattice-beam=6.0 --acoustic-scale=1.0 model/final.mdl graph/HCLG.fst graph/words.txt 5050
+\endverbatim
+
+To send a WAV file into the server, it first needs to be decoded into raw audio, then it can be
+sent to the socket:
+\verbatim
+sox audio.wav -t raw -c 1 -b 16 -r 8k -e signed-integer - | nc -N localhost 5050
+\endverbatim
+
+It is possible to play audio (almost) simultaneously as decoding. It may require installing the
+'pv' program (used to throttle the signal into Kaldi at the same speed as the playback):
+
+\verbatim
+sox audio.wav -t raw -c 1 -b 16 -r 8k -e signed-integer - | \
+    tee >(play -t raw -r 8k -e signed-integer -b 16 -c 1 -q -) | \
+    pv -L 16000 -q | nc -N localhost 5050
+\endverbatim
+
+Finally, it is possible to send audio from the microphone directly into the server:
+
+\verbatim
+rec -r 8k -e signed-integer -c 1 -b 16 -t raw -q - | nc -N localhost 5050
+\endverbatim
 
 
 */
diff --git a/src/feat/online-feature.cc b/src/feat/online-feature.cc
index 138dabe2236..3e8bf483694 100644
--- a/src/feat/online-feature.cc
+++ b/src/feat/online-feature.cc
@@ -24,7 +24,7 @@
 
 namespace kaldi {
 
-RecyclingVector::RecyclingVector(int items_to_hold) :
+RecyclingVector::RecyclingVector(int items_to_hold):
   items_to_hold_(items_to_hold == 0 ? -1 : items_to_hold),
   first_available_index_(0) {
 }
@@ -38,7 +38,8 @@ RecyclingVector::~RecyclingVector() {
 Vector<BaseFloat> *RecyclingVector::At(int index) const {
   if (index < first_available_index_) {
     KALDI_ERR << "Attempted to retrieve feature vector that was "
-                 "already removed by the RecyclingVector (index = " << index << "; "
+                 "already removed by the RecyclingVector (index = "
+              << index << "; "
               << "first_available_index = " << first_available_index_ << "; "
               << "size = " << Size() << ")";
   }
@@ -59,14 +60,13 @@ int RecyclingVector::Size() const {
   return first_available_index_ + items_.size();
 }
 
-
-template<class C>
+template <class C>
 void OnlineGenericBaseFeature<C>::GetFrame(int32 frame,
                                            VectorBase<BaseFloat> *feat) {
   feat->CopyFromVec(*(features_.At(frame)));
 };
 
-template<class C>
+template <class C>
 OnlineGenericBaseFeature<C>::OnlineGenericBaseFeature(
     const typename C::Options &opts):
     computer_(opts),
@@ -77,29 +77,80 @@ OnlineGenericBaseFeature<C>::OnlineGenericBaseFeature(
                             &window_function_);
 }
 
-template<class C>
-void OnlineGenericBaseFeature<C>::AcceptWaveform(BaseFloat sampling_rate,
-                                                 const VectorBase<BaseFloat> &waveform) {
+
+template <class C>
+void OnlineGenericBaseFeature<C>::MaybeCreateResampler(
+    BaseFloat sampling_rate) {
   BaseFloat expected_sampling_rate = computer_.GetFrameOptions().samp_freq;
-  if (sampling_rate != expected_sampling_rate)
+  // Check an existing resampler, or create one if resampling is permitted.
+  if (resampler_ != nullptr) {
+    KALDI_ASSERT(resampler_->GetInputSamplingRate() == sampling_rate);
+    KALDI_ASSERT(resampler_->GetOutputSamplingRate() == expected_sampling_rate);
+  } else if (((sampling_rate > expected_sampling_rate) &&  // input too fast
+              computer_.GetFrameOptions().allow_downsample) ||
+             ((sampling_rate < expected_sampling_rate) &&  // input too slow
+              computer_.GetFrameOptions().allow_upsample)) {
+    resampler_.reset(new LinearResample(
+        sampling_rate, expected_sampling_rate,
+        std::min(sampling_rate / 2, expected_sampling_rate / 2), 6));
+  } else if (sampling_rate != expected_sampling_rate) {
     KALDI_ERR << "Sampling frequency mismatch, expected "
-              << expected_sampling_rate << ", got " << sampling_rate;
-  if (waveform.Dim() == 0)
+              << expected_sampling_rate << ", got " << sampling_rate
+              << "\nPerhaps you want to use the options "
+                 "--allow_{upsample,downsample}";
+  }
+}
+
+template <class C>
+void OnlineGenericBaseFeature<C>::InputFinished() {
+  Vector<BaseFloat> appended_wave, resampled_wave;
+  if (resampler_ != nullptr)  // flush samples still buffered in the resampler
+    resampler_->Resample(appended_wave, true, &resampled_wave);
+  if (resampled_wave.Dim() != 0) {  // append them; size the buffer first
+    appended_wave.Resize(waveform_remainder_.Dim() + resampled_wave.Dim());
+    if (waveform_remainder_.Dim() != 0)
+      appended_wave.Range(0, waveform_remainder_.Dim())
+          .CopyFromVec(waveform_remainder_);
+    appended_wave.Range(waveform_remainder_.Dim(), resampled_wave.Dim())
+        .CopyFromVec(resampled_wave);
+    waveform_remainder_.Swap(&appended_wave);
+  }
+  input_finished_ = true;
+  ComputeFeatures();
+}
+
+template <class C>
+void OnlineGenericBaseFeature<C>::AcceptWaveform(
+    BaseFloat sampling_rate, const VectorBase<BaseFloat> &original_waveform) {
+  if (original_waveform.Dim() == 0)
     return;  // Nothing to do.
   if (input_finished_)
     KALDI_ERR << "AcceptWaveform called after InputFinished() was called.";
-  // append 'waveform' to 'waveform_remainder_.'
-  Vector<BaseFloat> appended_wave(waveform_remainder_.Dim() + waveform.Dim());
+
+  Vector<BaseFloat> appended_wave;
+  Vector<BaseFloat> resampled_wave;
+
+  const VectorBase<BaseFloat> *waveform;
+
+  MaybeCreateResampler(sampling_rate);
+  if (resampler_ == nullptr) {
+    waveform = &original_waveform;
+  } else {
+    resampler_->Resample(original_waveform, false, &resampled_wave);
+    waveform = &resampled_wave;
+  }
+
+  appended_wave.Resize(waveform_remainder_.Dim() + waveform->Dim());
   if (waveform_remainder_.Dim() != 0)
-    appended_wave.Range(0, waveform_remainder_.Dim()).CopyFromVec(
-        waveform_remainder_);
-  appended_wave.Range(waveform_remainder_.Dim(), waveform.Dim()).CopyFromVec(
-      waveform);
+    appended_wave.Range(0, waveform_remainder_.Dim())
+        .CopyFromVec(waveform_remainder_);
+  appended_wave.Range(waveform_remainder_.Dim(), waveform->Dim())
+      .CopyFromVec(*waveform);
   waveform_remainder_.Swap(&appended_wave);
   ComputeFeatures();
 }
 
-template<class C>
+template <class C>
 void OnlineGenericBaseFeature<C>::ComputeFeatures() {
   const FrameExtractionOptions &frame_opts = computer_.GetFrameOptions();
   int64 num_samples_total = waveform_offset_ + waveform_remainder_.Dim();
@@ -145,7 +196,6 @@ void OnlineGenericBaseFeature<C>::ComputeFeatures() {
 template class OnlineGenericBaseFeature<MfccComputer>;
 template class OnlineGenericBaseFeature<FbankComputer>;
 
-
 OnlineCmvnState::OnlineCmvnState(const OnlineCmvnState &other):
     speaker_cmvn_stats(other.speaker_cmvn_stats),
     global_cmvn_stats(other.global_cmvn_stats),
@@ -173,8 +223,6 @@ void OnlineCmvnState::Read(std::istream &is, bool binary) {
   ExpectToken(is, binary, "</OnlineCmvnState>");
 }
 
-
-
 OnlineCmvn::OnlineCmvn(const OnlineCmvnOptions &opts,
                        const OnlineCmvnState &cmvn_state,
                        OnlineFeatureInterface *src):
@@ -328,7 +376,8 @@ void OnlineCmvn::SmoothOnlineCmvnStats(const MatrixBase<double> &speaker_stats,
   // If count exceeded cmn_window it would be an error in how "window_stats"
   // was accumulated.
   KALDI_ASSERT(cur_count <= 1.001 * opts.cmn_window);
-  if (cur_count >= opts.cmn_window) return;
+  if (cur_count >= opts.cmn_window)
+    return;
   if (speaker_stats.NumRows() != 0) {  // if we have speaker stats..
     double count_from_speaker = opts.cmn_window - cur_count,
         speaker_count = speaker_stats(0, dim);
@@ -341,7 +390,8 @@ void OnlineCmvn::SmoothOnlineCmvnStats(const MatrixBase<double> &speaker_stats,
                              speaker_stats);
     cur_count = (*stats)(0, dim);
   }
-  if (cur_count >= opts.cmn_window) return;
+  if (cur_count >= opts.cmn_window)
+    return;
   if (global_stats.NumRows() != 0) {
     double count_from_global = opts.cmn_window - cur_count,
         global_count = global_stats(0, dim);
@@ -433,7 +483,7 @@ void OnlineCmvn::SetState(const OnlineCmvnState &cmvn_state) {
 
 int32 OnlineSpliceFrames::NumFramesReady() const {
   int32 num_frames = src_->NumFramesReady();
-  if (num_frames > 0 && src_->IsLastFrame(num_frames-1))
+  if (num_frames > 0 && src_->IsLastFrame(num_frames - 1))
     return num_frames;
   else
     return std::max<int32>(0, num_frames - right_context_);
diff --git a/src/feat/online-feature.h b/src/feat/online-feature.h
index 0ddc2601dec..2978d02f090 100644
--- a/src/feat/online-feature.h
+++ b/src/feat/online-feature.h
@@ -112,10 +112,7 @@ class OnlineGenericBaseFeature: public OnlineBaseFeature {
   // more waveform.  This will help flush out the last frame or two
   // of features, in the case where snip-edges == false; it also
   // affects the return value of IsLastFrame().
-  virtual void InputFinished() {
-    input_finished_ = true;
-    ComputeFeatures();
-  }
+  virtual void InputFinished();
 
  private:
   // This function computes any additional feature frames that it is possible to
diff --git a/src/feat/resample.h b/src/feat/resample.h
index ecac2ba7566..e0b4688c99b 100644
--- a/src/feat/resample.h
+++ b/src/feat/resample.h
@@ -185,6 +185,10 @@ class LinearResample {
   /// Resample(x, y, true) for the last piece.  Call it unnecessarily between
   /// signals will not do any harm.
   void Reset();
+
+  /// Return the input and output sampling rates (for checks, for example).
+  inline int32 GetInputSamplingRate() const { return samp_rate_in_; }
+  inline int32 GetOutputSamplingRate() const { return samp_rate_out_; }
  private:
   /// This function outputs the number of output samples we will output
   /// for a signal with "input_num_samp" input samples.  If flush == true,
diff --git a/src/fstext/context-fst.cc b/src/fstext/context-fst.cc
index 9936a398e37..1e41adc021f 100644
--- a/src/fstext/context-fst.cc
+++ b/src/fstext/context-fst.cc
@@ -345,7 +345,7 @@ SymbolTable *CreateILabelInfoSymbolTable(const vector<vector<int32> > &info,
                                          const SymbolTable &phones_symtab,
                                          std::string separator,
                                          std::string initial_disambig) {  // e.g. separator = "/", initial-disambig="#-1"
-  KALDI_ASSERT(!info.empty() && !info[0].empty());
+  KALDI_ASSERT(!info.empty() && info[0].empty());
   SymbolTable *ans = new SymbolTable("ilabel-info-symtab");
   int64 s = ans->AddSymbol(phones_symtab.Find(static_cast<int64>(0)));
   assert(s == 0);
diff --git a/src/nnet3/decodable-online-looped.cc b/src/nnet3/decodable-online-looped.cc
index 2159575df6c..751438606e8 100644
--- a/src/nnet3/decodable-online-looped.cc
+++ b/src/nnet3/decodable-online-looped.cc
@@ -30,6 +30,7 @@ DecodableNnetLoopedOnlineBase::DecodableNnetLoopedOnlineBase(
     num_chunks_computed_(0),
     current_log_post_subsampled_offset_(-1),
     info_(info),
+    frame_offset_(0),
     input_features_(input_features),
     ivector_features_(ivector_features),
     computer_(info_.opts.compute_config, info_.computation,
@@ -66,7 +67,7 @@ int32 DecodableNnetLoopedOnlineBase::NumFramesReady() const {
   if (input_finished) {
     // if the input has finished,... we'll pad with duplicates of the last frame
     // as needed to get the required right context.
-    return (features_ready + sf - 1) / sf;
+    return (features_ready + sf - 1) / sf - frame_offset_;
   } else {
     // note: info_.right_context_ includes both the model context and any
     // extra_right_context_ (but this
@@ -78,7 +79,7 @@ int32 DecodableNnetLoopedOnlineBase::NumFramesReady() const {
     // doesn't need any attention to rounding because info_.frames_per_chunk
     // is always a multiple of 'sf' (see 'frames_per_chunk = GetChunksize..."
     // in decodable-simple-looped.cc).
-    return num_chunks_ready * info_.frames_per_chunk / sf;
+    return num_chunks_ready * info_.frames_per_chunk / sf - frame_offset_;
   }
 }
 
@@ -105,9 +106,14 @@ bool DecodableNnetLoopedOnlineBase::IsLastFrame(
     return false;
   int32 sf = info_.opts.frame_subsampling_factor,
      num_subsampled_frames_ready = (features_ready + sf - 1) / sf;
-  return (subsampled_frame == num_subsampled_frames_ready - 1);
+  return (subsampled_frame + frame_offset_ == num_subsampled_frames_ready - 1);
 }
 
+void DecodableNnetLoopedOnlineBase::SetFrameOffset(int32 frame_offset) {
+  KALDI_ASSERT(0 <= frame_offset &&
+               frame_offset <= frame_offset_ + NumFramesReady());
+  frame_offset_ = frame_offset;  // later frame indices are relative to this
+}
 
 void DecodableNnetLoopedOnlineBase::AdvanceChunk() {
   // Prepare the input data for the next chunk of features.
@@ -231,6 +237,7 @@ void DecodableNnetLoopedOnlineBase::AdvanceChunk() {
 
 BaseFloat DecodableNnetLoopedOnline::LogLikelihood(int32 subsampled_frame,
                                                     int32 index) {
+  subsampled_frame += frame_offset_;
   EnsureFrameIsComputed(subsampled_frame);
   // note: we index by 'inde
   return current_log_post_(
@@ -241,6 +248,7 @@ BaseFloat DecodableNnetLoopedOnline::LogLikelihood(int32 subsampled_frame,
 
 BaseFloat DecodableAmNnetLoopedOnline::LogLikelihood(int32 subsampled_frame,
                                                     int32 index) {
+  subsampled_frame += frame_offset_;
   EnsureFrameIsComputed(subsampled_frame);
   return current_log_post_(
       subsampled_frame - current_log_post_subsampled_offset_,
diff --git a/src/nnet3/decodable-online-looped.h b/src/nnet3/decodable-online-looped.h
index 4867c5decb8..f040b62516a 100644
--- a/src/nnet3/decodable-online-looped.h
+++ b/src/nnet3/decodable-online-looped.h
@@ -81,6 +81,17 @@ class DecodableNnetLoopedOnlineBase: public DecodableInterface {
     return info_.opts.frame_subsampling_factor;
   }
 
+  /// Sets the frame offset value. Frame offset is initialized to 0 when the
+  /// decodable object is constructed and stays as 0 unless this method is
+  /// called. This method is useful when we want to reset the decoder state,
+  /// i.e. call decoder.InitDecoding(), but we want to keep using the same
+  /// decodable object, e.g. in case of an endpoint. The frame offset affects
+  /// the behavior of IsLastFrame(), NumFramesReady() and LogLikelihood()
+  /// methods.
+  void SetFrameOffset(int32 frame_offset);
+
+  /// Returns the frame offset value.
+  int32 GetFrameOffset() const { return frame_offset_; }
 
  protected:
 
@@ -111,6 +122,11 @@ class DecodableNnetLoopedOnlineBase: public DecodableInterface {
 
   const DecodableNnetSimpleLoopedInfo &info_;
 
+  // IsLastFrame(), NumFramesReady() and LogLikelihood() methods take into
+  // account this offset value. We initialize frame_offset_ as 0 and it stays as
+  // 0 unless SetFrameOffset() method is called.
+  int32 frame_offset_;
+
  private:
 
   // This function does the computation for the next chunk.  It will change
diff --git a/src/online2/online-feature-pipeline.h b/src/online2/online-feature-pipeline.h
index f89cbbbb898..fab1be3cb27 100644
--- a/src/online2/online-feature-pipeline.h
+++ b/src/online2/online-feature-pipeline.h
@@ -166,7 +166,7 @@ class OnlineFeaturePipeline: public OnlineFeatureInterface {
 
   // This is supplied for debug purposes.
   void GetAsMatrix(Matrix<BaseFloat> *feats);
-  
+
   void FreezeCmvn();  // stop it from moving further (do this when you start
                       // using fMLLR). This will crash if NumFramesReady() == 0.
 
diff --git a/src/online2/online-nnet2-feature-pipeline.h b/src/online2/online-nnet2-feature-pipeline.h
index e379f7263ec..2e3fbf7bd78 100644
--- a/src/online2/online-nnet2-feature-pipeline.h
+++ b/src/online2/online-nnet2-feature-pipeline.h
@@ -196,6 +196,20 @@ class OnlineNnet2FeaturePipeline: public OnlineFeatureInterface {
   virtual int32 NumFramesReady() const;
   virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
 
+  /// If you are downweighting silence, you can call
+  /// OnlineSilenceWeighting::GetDeltaWeights and supply the output to this
+  /// class using UpdateFrameWeights().  The reason why this call happens
+  /// outside this class, rather than this class pulling in the data weights,
+  /// relates to multi-threaded operation and also from not wanting this class
+  /// to have excessive dependencies.
+  ///
+  /// You must either always call this as soon as new data becomes available,
+  /// ideally just after calling AcceptWaveform(), or never call it for the
+  /// lifetime of this object.
+  void UpdateFrameWeights(
+      const std::vector<std::pair<int32, BaseFloat> > &delta_weights,
+      int32 frame_offset = 0);
+
   /// Set the adaptation state to a particular value, e.g. reflecting previous
   /// utterances of the same speaker; this will generally be called after
   /// Copy().
diff --git a/src/online2/online-nnet3-decoding.cc b/src/online2/online-nnet3-decoding.cc
index d40dcb411d1..08c8ba28060 100644
--- a/src/online2/online-nnet3-decoding.cc
+++ b/src/online2/online-nnet3-decoding.cc
@@ -41,6 +41,12 @@ SingleUtteranceNnet3DecoderTpl<FST>::SingleUtteranceNnet3DecoderTpl(
   decoder_.InitDecoding();
 }
 
+template <typename FST>
+void SingleUtteranceNnet3DecoderTpl<FST>::InitDecoding(int32 frame_offset) {
+  decoder_.InitDecoding();
+  decodable_.SetFrameOffset(frame_offset);
+}
+
 template <typename FST>
 void SingleUtteranceNnet3DecoderTpl<FST>::AdvanceDecoding() {
   decoder_.AdvanceDecoding(&decodable_);
@@ -56,7 +62,6 @@ int32 SingleUtteranceNnet3DecoderTpl<FST>::NumFramesDecoded() const {
   return decoder_.NumFramesDecoded();
 }
 
-
 template <typename FST>
 void SingleUtteranceNnet3DecoderTpl<FST>::GetLattice(bool end_of_utterance,
                                              CompactLattice *clat) const {
diff --git a/src/online2/online-nnet3-decoding.h b/src/online2/online-nnet3-decoding.h
index b30f035b4d2..b80baad893f 100644
--- a/src/online2/online-nnet3-decoding.h
+++ b/src/online2/online-nnet3-decoding.h
@@ -60,7 +60,13 @@ class SingleUtteranceNnet3DecoderTpl {
                                  const FST &fst,
                                  OnlineNnet2FeaturePipeline *features);
 
-  /// advance the decoding as far as we can.
+  /// Initializes the decoding and sets the frame offset of the underlying
+  /// decodable object. This method is called by the constructor. You can also
+  /// call this method when you want to reset the decoder state, but want to
+  /// keep using the same decodable object, e.g. in case of an endpoint.
+  void InitDecoding(int32 frame_offset = 0);
+
+  /// Advances the decoding as far as we can.
   void AdvanceDecoding();
 
   /// Finalizes the decoding. Cleans up and prunes remaining tokens, so the
diff --git a/src/online2/online2-feature-pipeline.cc b/src/online2/online2-feature-pipeline.cc
index 510c401fba2..c495c9fc8ef 100644
--- a/src/online2/online2-feature-pipeline.cc
+++ b/src/online2/online2-feature-pipeline.cc
@@ -128,6 +128,21 @@ void OnlineNnet2FeaturePipeline::GetFrame(int32 frame,
   return final_feature_->GetFrame(frame, feat);
 }
 
+void OnlineNnet2FeaturePipeline::UpdateFrameWeights(
+    const std::vector<std::pair<int32, BaseFloat> > &delta_weights,
+    int32 frame_offset) {
+  // Forwards the weights to the iVector feature, adding 'frame_offset' to
+  // every frame index.  Without an iVector feature this is a no-op.
+  if (IvectorFeature() == NULL) return;
+  if (frame_offset == 0) {
+    IvectorFeature()->UpdateFrameWeights(delta_weights);
+    return;
+  }
+  std::vector<std::pair<int32, BaseFloat> > shifted(delta_weights);
+  for (size_t i = 0; i < shifted.size(); i++) shifted[i].first += frame_offset;
+  IvectorFeature()->UpdateFrameWeights(shifted);
+}
+
 void OnlineNnet2FeaturePipeline::SetAdaptationState(
     const OnlineIvectorExtractorAdaptationState &adaptation_state) {
   if (info_.use_ivectors) {
diff --git a/src/online2bin/Makefile b/src/online2bin/Makefile
index 8792cc5b11a..28c135eb950 100644
--- a/src/online2bin/Makefile
+++ b/src/online2bin/Makefile
@@ -11,7 +11,8 @@ BINFILES = online2-wav-gmm-latgen-faster apply-cmvn-online \
      online2-wav-nnet2-latgen-faster ivector-extract-online2 \
      online2-wav-dump-features ivector-randomize \
      online2-wav-nnet2-am-compute  online2-wav-nnet2-latgen-threaded \
-     online2-wav-nnet3-latgen-faster online2-wav-nnet3-latgen-grammar
+     online2-wav-nnet3-latgen-faster online2-wav-nnet3-latgen-grammar \
+     online2-tcp-nnet3-decode-faster
 
 OBJFILES =
 
diff --git a/src/online2bin/online2-tcp-nnet3-decode-faster.cc b/src/online2bin/online2-tcp-nnet3-decode-faster.cc
new file mode 100644
index 00000000000..46e9cbc05be
--- /dev/null
+++ b/src/online2bin/online2-tcp-nnet3-decode-faster.cc
@@ -0,0 +1,442 @@
+// online2bin/online2-tcp-nnet3-decode-faster.cc
+
+// Copyright 2014  Johns Hopkins University (author: Daniel Povey)
+//           2016  Api.ai (Author: Ilya Platonov)
+//           2018  Polish-Japanese Academy of Information Technology (Author: Danijel Korzinek)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "feat/wave-reader.h"
+#include "online2/online-nnet3-decoding.h"
+#include "online2/online-nnet2-feature-pipeline.h"
+#include "online2/onlinebin-util.h"
+#include "online2/online-timing.h"
+#include "online2/online-endpoint.h"
+#include "fstext/fstext-lib.h"
+#include "lat/lattice-functions.h"
+#include "util/kaldi-thread.h"
+#include "nnet3/nnet-utils.h"
+
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <poll.h>
+#include <signal.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+#include <string>
+
+namespace kaldi {
+
+class TcpServer {
+ public:
+  explicit TcpServer(int read_timeout);
+  ~TcpServer();
+
+  bool Listen(int32 port);  // start listening on a given port
+  int32 Accept();  // accept a client and return its descriptor
+
+  bool ReadChunk(size_t len); // get more data and return false if end-of-stream
+
+  Vector<BaseFloat> GetChunk(); // get the data read by above method
+
+  bool Write(const std::string &msg); // write to accepted client
+  bool WriteLn(const std::string &msg, const std::string &eol = "\n"); // write line to accepted client
+
+  void Disconnect();
+
+ private:
+  struct ::sockaddr_in h_addr_;
+  int32 server_desc_, client_desc_;
+  int16 *samp_buf_;
+  size_t buf_len_, has_read_;
+  pollfd client_set_[1];
+  int read_timeout_;
+};
+
+// Renders the word sequence of 'lat' as text, each word followed by a space;
+// an id missing from 'word_syms' is written as "<#id>" and a warning is logged.
+std::string LatticeToString(const Lattice &lat, const fst::SymbolTable &word_syms) {
+  LatticeWeight weight;
+  std::vector<int32> alignment, words;
+  GetLinearSymbolSequence(lat, &alignment, &words, &weight);
+  std::ostringstream msg;
+  for (size_t i = 0; i < words.size(); i++) {
+    std::string s = word_syms.Find(words[i]);
+    if (s.empty()) {
+      KALDI_WARN << "Word-id " << words[i] << " not in symbol table.";
+      msg << "<#" << words[i] << "> ";  // the id itself, not its position i
+    } else
+      msg << s << " ";
+  }
+  return msg.str();
+}
+
+std::string LatticeToString(const CompactLattice &clat, const fst::SymbolTable &word_syms) {
+  if (clat.NumStates() == 0) {
+    KALDI_WARN << "Empty lattice.";
+    return "";
+  }
+  CompactLattice best_path_clat;
+  CompactLatticeShortestPath(clat, &best_path_clat);
+
+  Lattice best_path_lat;
+  ConvertLattice(best_path_clat, &best_path_lat);
+  return LatticeToString(best_path_lat, word_syms);
+}
+}
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace fst;
+
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Reads in audio from a network socket and performs online\n"
+        "decoding with neural nets (nnet3 setup), with iVector-based\n"
+        "speaker adaptation and endpointing.\n"
+        "Note: some configuration values and inputs are set via config\n"
+        "files whose filenames are passed as options\n"
+        "\n"
+        "Usage: online2-tcp-nnet3-decode-faster [options] <nnet3-in> "
+        "<fst-in> <word-symbol-table>\n";
+
+    ParseOptions po(usage);
+
+
+    // feature_opts includes configuration for the iVector adaptation,
+    // as well as the basic features.
+    OnlineNnet2FeaturePipelineConfig feature_opts;
+    nnet3::NnetSimpleLoopedComputationOptions decodable_opts;
+    LatticeFasterDecoderConfig decoder_opts;
+    OnlineEndpointConfig endpoint_opts;
+
+    BaseFloat chunk_length_secs = 0.18;
+    BaseFloat output_period = 1;
+    BaseFloat samp_freq = 16000.0;
+    int port_num = 5050;
+    int read_timeout = 3;
+
+    po.Register("samp-freq", &samp_freq,
+                "Sampling frequency of the input signal (coded as 16-bit slinear).");
+    po.Register("chunk-length", &chunk_length_secs,
+                "Length of chunk size in seconds, that we process.");
+    po.Register("output-period", &output_period,
+                "How often in seconds, do we check for changes in output.");
+    po.Register("num-threads-startup", &g_num_threads,
+                "Number of threads used when initializing iVector extractor.");
+    po.Register("read-timeout", &read_timeout,
+                "Number of seconds of timout for TCP audio data to appear on the stream. Use -1 for blocking.");
+    po.Register("port-num", &port_num,
+                "Port number the server will listen on.");
+
+    feature_opts.Register(&po);
+    decodable_opts.Register(&po);
+    decoder_opts.Register(&po);
+    endpoint_opts.Register(&po);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      return 1;
+    }
+
+    std::string nnet3_rxfilename = po.GetArg(1),
+        fst_rxfilename = po.GetArg(2),
+        word_syms_filename = po.GetArg(3);
+
+    OnlineNnet2FeaturePipelineInfo feature_info(feature_opts);
+
+    KALDI_VLOG(1) << "Loading AM...";
+
+    TransitionModel trans_model;
+    nnet3::AmNnetSimple am_nnet;
+    {
+      bool binary;
+      Input ki(nnet3_rxfilename, &binary);
+      trans_model.Read(ki.Stream(), binary);
+      am_nnet.Read(ki.Stream(), binary);
+      SetBatchnormTestMode(true, &(am_nnet.GetNnet()));
+      SetDropoutTestMode(true, &(am_nnet.GetNnet()));
+      nnet3::CollapseModel(nnet3::CollapseModelConfig(), &(am_nnet.GetNnet()));
+    }
+
+    // this object contains precomputed stuff that is used by all decodable
+    // objects.  It takes a pointer to am_nnet because if it has iVectors it has
+    // to modify the nnet to accept iVectors at intervals.
+    nnet3::DecodableNnetSimpleLoopedInfo decodable_info(decodable_opts,
+                                                        &am_nnet);
+
+    KALDI_VLOG(1) << "Loading FST...";
+
+    fst::Fst<fst::StdArc> *decode_fst = ReadFstKaldiGeneric(fst_rxfilename);
+
+    fst::SymbolTable *word_syms = NULL;
+    if (!word_syms_filename.empty())
+      if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename)))
+        KALDI_ERR << "Could not read symbol table from file "
+                  << word_syms_filename;
+
+    signal(SIGPIPE, SIG_IGN); // ignore SIGPIPE to avoid crashing when socket forcefully disconnected
+
+    TcpServer server(read_timeout);
+
+    server.Listen(port_num);
+
+    while (true) {
+
+      server.Accept();
+
+      int32 samp_count = 0;// this is used for output refresh rate
+      size_t chunk_len = static_cast<size_t>(chunk_length_secs * samp_freq);
+      int32 check_period = static_cast<int32>(samp_freq * output_period);
+      int32 check_count = check_period;
+
+      int32 frame_offset = 0;
+
+      bool eos = false;
+
+      OnlineNnet2FeaturePipeline feature_pipeline(feature_info);
+      SingleUtteranceNnet3Decoder decoder(decoder_opts, trans_model,
+                                          decodable_info,
+                                          *decode_fst, &feature_pipeline);
+
+      while (!eos) {
+
+        decoder.InitDecoding(frame_offset);
+        OnlineSilenceWeighting silence_weighting(
+            trans_model,
+            feature_info.silence_weighting_config,
+            decodable_opts.frame_subsampling_factor);
+        std::vector<std::pair<int32, BaseFloat>> delta_weights;
+
+        while (true) {
+          eos = !server.ReadChunk(chunk_len);
+
+          if (eos) {
+            feature_pipeline.InputFinished();
+            decoder.AdvanceDecoding();
+            decoder.FinalizeDecoding();
+            frame_offset += decoder.NumFramesDecoded();
+            if (decoder.NumFramesDecoded() > 0) {
+              CompactLattice lat;
+              decoder.GetLattice(true, &lat);
+              std::string msg = LatticeToString(lat, *word_syms);
+              server.WriteLn(msg);
+            } else
+              server.Write("\n");
+            server.Disconnect();
+            break;
+          }
+
+          Vector<BaseFloat> wave_part = server.GetChunk();
+          feature_pipeline.AcceptWaveform(samp_freq, wave_part);
+          samp_count += chunk_len;
+
+          if (silence_weighting.Active() &&
+              feature_pipeline.IvectorFeature() != NULL) {
+            silence_weighting.ComputeCurrentTraceback(decoder.Decoder());
+            silence_weighting.GetDeltaWeights(feature_pipeline.NumFramesReady(),
+                                              &delta_weights);
+            feature_pipeline.UpdateFrameWeights(delta_weights,
+                                                frame_offset * decodable_opts.frame_subsampling_factor);
+          }
+
+          decoder.AdvanceDecoding();
+
+          if (samp_count > check_count) {
+            if (decoder.NumFramesDecoded() > 0) {
+              Lattice lat;
+              decoder.GetBestPath(false, &lat);
+              std::string msg = LatticeToString(lat, *word_syms);
+              server.WriteLn(msg, "\r");
+            }
+            check_count += check_period;
+          }
+
+          if (decoder.EndpointDetected(endpoint_opts)) {
+            decoder.FinalizeDecoding();
+            frame_offset += decoder.NumFramesDecoded();
+            CompactLattice lat;
+            decoder.GetLattice(true, &lat);
+            std::string msg = LatticeToString(lat, *word_syms);
+            server.WriteLn(msg);
+            break;
+          }
+        }
+      }
+    }
+  } catch (const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+} // main()
+
+
+namespace kaldi {
+// Sets up an unconnected server; 'read_timeout' is given in seconds and is
+// stored in milliseconds, the unit in which ReadChunk() passes it to poll().
+TcpServer::TcpServer(int read_timeout)
+    : server_desc_(-1), client_desc_(-1), samp_buf_(NULL), buf_len_(0),
+      has_read_(0),  // GetChunk() reads this, possibly before any ReadChunk()
+      read_timeout_(1000 * read_timeout) {
+}
+
+bool TcpServer::Listen(int32 port) {
+  h_addr_.sin_addr.s_addr = INADDR_ANY;
+  h_addr_.sin_port = htons(port);
+  h_addr_.sin_family = AF_INET;
+
+  server_desc_ = socket(AF_INET, SOCK_STREAM, 0);
+
+  if (server_desc_ == -1) {
+    KALDI_ERR << "Cannot create TCP socket!";
+    return false;
+  }
+
+  int32 flag = 1;
+  int32 len = sizeof(int32);
+  if (setsockopt(server_desc_, SOL_SOCKET, SO_REUSEADDR, &flag, len) == -1) {
+    KALDI_ERR << "Cannot set socket options!";
+    return false;
+  }
+
+  if (bind(server_desc_, (struct sockaddr *) &h_addr_, sizeof(h_addr_)) == -1) {
+    KALDI_ERR << "Cannot bind to port: " << port << " (is it taken?)";
+    return false;
+  }
+
+  if (listen(server_desc_, 1) == -1) {
+    KALDI_ERR << "Cannot listen on port!";
+    return false;
+  }
+
+  KALDI_LOG << "TcpServer: Listening on port: " << port;
+
+  return true;
+
+}
+
+TcpServer::~TcpServer() {
+  Disconnect();
+  if (server_desc_ != -1)
+    close(server_desc_);
+  delete[] samp_buf_;
+}
+
+int32 TcpServer::Accept() {
+  KALDI_LOG << "Waiting for client...";
+
+  socklen_t len;
+
+  len = sizeof(struct sockaddr);
+  client_desc_ = accept(server_desc_, (struct sockaddr *) &h_addr_, &len);
+
+  struct sockaddr_storage addr;
+  char ipstr[20];
+
+  len = sizeof addr;
+  getpeername(client_desc_, (struct sockaddr *) &addr, &len);
+
+  struct sockaddr_in *s = (struct sockaddr_in *) &addr;
+  inet_ntop(AF_INET, &s->sin_addr, ipstr, sizeof ipstr);
+
+  client_set_[0].fd = client_desc_;
+  client_set_[0].events = POLLIN;
+
+  KALDI_LOG << "Accepted connection from: " << ipstr;
+
+  return client_desc_;
+}
+
+bool TcpServer::ReadChunk(size_t len) {
+  // Reads up to 'len' int16 samples into samp_buf_; false if none arrived.
+  if (buf_len_ != len) {
+    buf_len_ = len;
+    delete[] samp_buf_;
+    samp_buf_ = new int16[len];
+  }
+  // Progress is tracked in bytes: read() may return an odd byte count, and
+  // counting only whole samples would misalign the rest of the stream.
+  char *bytes = reinterpret_cast<char *>(samp_buf_);
+  size_t bytes_read = 0, bytes_wanted = len * sizeof(int16);
+  while (bytes_read < bytes_wanted) {
+    int poll_ret = poll(client_set_, 1, read_timeout_);
+    if (poll_ret == 0) {
+      KALDI_WARN << "Socket timeout! Disconnecting...";
+      break;
+    }
+    // Test the POLLIN bit: it can be set together with POLLHUP.
+    if (poll_ret < 0 || !(client_set_[0].revents & POLLIN)) {
+      KALDI_WARN << "Socket error! Disconnecting...";
+      break;
+    }
+    ssize_t ret = read(client_desc_, bytes + bytes_read, bytes_wanted - bytes_read);
+    if (ret <= 0) {
+      KALDI_WARN << "Stream over...";
+      break;
+    }
+    bytes_read += ret;
+  }
+  has_read_ = bytes_read / sizeof(int16);  // a trailing half sample is dropped
+  return has_read_ > 0;
+}
+
+Vector<BaseFloat> TcpServer::GetChunk() {
+  // Copy the samples gathered by the last ReadChunk() call into a float
+  // vector; the int16 values are converted without any rescaling.
+  const MatrixIndexT num_samples = static_cast<MatrixIndexT>(has_read_);
+  Vector<BaseFloat> chunk(num_samples);
+  for (MatrixIndexT n = 0; n < num_samples; n++)
+    chunk(n) = static_cast<BaseFloat>(samp_buf_[n]);
+
+  return chunk;
+}
+
+bool TcpServer::Write(const std::string &msg) {
+  // write() may accept fewer bytes than requested, so keep sending the
+  // unsent tail until the whole message is out or the socket reports failure.
+  const char *cursor = msg.data();
+  const char *const end = cursor + msg.size();
+  while (cursor != end) {
+    const ssize_t sent = write(client_desc_, cursor, end - cursor);
+    if (sent <= 0) {
+      return false;
+    }
+    cursor += sent;
+  }
+
+  // Also reached for an empty message, for which nothing is written.
+  return true;
+}
+
+bool TcpServer::WriteLn(const std::string &msg, const std::string &eol) {
+  // Send the message followed by the terminator; stop at the first failure
+  // so the terminator is never sent after a failed message write.
+  return Write(msg) && Write(eol);
+}
+
+void TcpServer::Disconnect() {
+  // -1 marks "no client", which makes a repeated call a harmless no-op.
+  if (client_desc_ == -1) return;
+  close(client_desc_);
+  client_desc_ = -1;
+}
+}  // namespace kaldi
\ No newline at end of file
diff --git a/src/onlinebin/online-gmm-decode-faster.cc b/src/onlinebin/online-gmm-decode-faster.cc
index 9aa8751cf50..dac7657ea57 100644
--- a/src/onlinebin/online-gmm-decode-faster.cc
+++ b/src/onlinebin/online-gmm-decode-faster.cc
@@ -70,7 +70,7 @@ int main(int argc, char *argv[]) {
     OnlineFeatureMatrixOptions feature_reading_opts;
     decoder_opts.Register(&po, true);
     feature_reading_opts.Register(&po);
-    
+
     po.Register("left-context", &left_context, "Number of frames of left context");
     po.Register("right-context", &right_context, "Number of frames of right context");
     po.Register("acoustic-scale", &acoustic_scale,
@@ -86,7 +86,7 @@ int main(int argc, char *argv[]) {
       po.PrintUsage();
       return 1;
     }
-    
+
     std::string model_rxfilename = po.GetArg(1),
         fst_rxfilename = po.GetArg(2),
         word_syms_filename = po.GetArg(3),
@@ -151,7 +151,7 @@ int main(int argc, char *argv[]) {
       opts.order = kDeltaOrder;
       feat_transform = new OnlineDeltaInput(opts, &cmn_input);
     }
-    
+
     // feature_reading_opts contains number of retries, batch size.
     OnlineFeatureMatrix feature_matrix(feature_reading_opts,
                                        feat_transform);
@@ -200,4 +200,4 @@ int main(int argc, char *argv[]) {
     return -1;
   }
 #endif
-} // main()
+}  // main()
diff --git a/src/util/edit-distance-inl.h b/src/util/edit-distance-inl.h
index c1d1682804c..3304b27d0bf 100644
--- a/src/util/edit-distance-inl.h
+++ b/src/util/edit-distance-inl.h
@@ -35,8 +35,8 @@ int32 LevenshteinEditDistance(const std::vector<T> &a,
   //  elements a_0 ... a_{M-1} and b_0 ... b_{N-1}.
   //  We are computing the recursion
   //     E(m, n) = min(  E(m-1, n-1) + (1-delta(a_{m-1}, b_{n-1})),
-  //                    E(m-1, n),
-  //                    E(m, n-1) ).
+  //                    E(m-1, n) + 1,
+  //                    E(m, n-1) + 1).
   //  where E(m, n) is defined for m = 0..M and n = 0..N and out-of-
   //  bounds quantities are considered to be infinity (i.e. the
   //  recursion does not visit them).
diff --git a/tools/extras/install_portaudio.sh b/tools/extras/install_portaudio.sh
index 58797f554e8..36c95047a7f 100755
--- a/tools/extras/install_portaudio.sh
+++ b/tools/extras/install_portaudio.sh
@@ -14,10 +14,10 @@
 #See the Apache 2 License for the specific language governing permissions and
 #limitations under the License.
 #
-#This script attempts to install port audio, which is needed for the run-on 
-#decoding stuff. Portaudio enables the decoder to grab a live audio stream 
-#from the soundcard. I tested portaudio on Linux (RedHat and Suse Linux) and 
-#on MacOS 10.7. On Linux, it compiles out of the box. For MacOS 10.7, 
+#This script attempts to install port audio, which is needed for the run-on
+#decoding stuff. Portaudio enables the decoder to grab a live audio stream
+#from the soundcard. I tested portaudio on Linux (RedHat and Suse Linux) and
+#on MacOS 10.7. On Linux, it compiles out of the box. For MacOS 10.7,
 #it is necessary to edit the Makefile (this script tries to do that).
 #The script will remove all occurances of
 #
@@ -29,8 +29,8 @@
 #also, it seems that one has to uncomment the inclusion of AudioToolbox in
 #include/pa_mac_core.h
 #
-#All this should make it compile fine for x86_64 under MacOS 10.7 
-#(always assuming that you installed XCode, wget and 
+#All this should make it compile fine for x86_64 under MacOS 10.7
+#(always assuming that you installed XCode, wget and
 #the Linux environment stuff on MacOS)
 
 echo "****() Installing portaudio"
@@ -38,7 +38,7 @@ echo "****() Installing portaudio"
 if [ ! -e pa_stable_v19_20111121.tgz ]; then
     echo "Could not find portaudio tarball pa_stable_v19_20111121.tgz"
     echo "Trying to download it via wget!"
-    
+
     if ! which wget >&/dev/null; then
         echo "This script requires you to first install wget"
         echo "You can also just download pa_stable_v19_20111121.tgz from"
@@ -81,6 +81,8 @@ if [ -z "$MACOS" ]; then
     echo "${pa_patch}" | patch -p0 Makefile.in
 fi
 
+patch -p0  Makefile.in < ../extras/portaudio.patch
+autoconf
 ./configure --prefix=`pwd`/install --with-pic
 perl -i -pe 's:src/common/pa_ringbuffer.o:: if /^OTHER_OBJS\s*=/' Makefile
 
@@ -93,7 +95,7 @@ if [ "$MACOS" != "" ]; then
     mv include/pa_mac_core.h include/pa_mac_core.h.bck
     cat include/pa_mac_core.h.bck \
       | sed 's/\/\/\#include \<AudioToolbox\/AudioToolbox.h\>/#include \<AudioToolbox\/AudioToolbox.h\>/g' \
-      > include/pa_mac_core.h 
+      > include/pa_mac_core.h
 fi
 
 make
diff --git a/tools/extras/portaudio.patch b/tools/extras/portaudio.patch
new file mode 100644
index 00000000000..9fc201f9278
--- /dev/null
+++ b/tools/extras/portaudio.patch
@@ -0,0 +1,21 @@
+diff --git a/Makefile.in b/Makefile.in
+index 24129a3..61a3952 100644
+--- a/Makefile.in
++++ b/Makefile.in
+@@ -44,7 +44,7 @@ PALIB = libportaudio.la
+ PAINC = include/portaudio.h
+ 
+ PA_LDFLAGS = $(LDFLAGS) $(SHARED_FLAGS) -rpath $(libdir) -no-undefined \
+-	     -export-symbols-regex "(Pa|PaMacCore|PaJack|PaAlsa|PaAsio|PaOSS)_.*" \
++	     -export-symbols-regex "(Pa|PaUtil|PaMacCore|PaJack|PaAlsa|PaAsio|PaOSS)_.*" \
+ 	     -version-info $(LT_CURRENT):$(LT_REVISION):$(LT_AGE)
+ 
+ COMMON_OBJS = \
+@@ -57,6 +57,7 @@ COMMON_OBJS = \
+ 	src/common/pa_process.o \
+ 	src/common/pa_stream.o \
+ 	src/common/pa_trace.o \
++	src/common/pa_ringbuffer.o \
+ 	src/hostapi/skeleton/pa_hostapi_skeleton.o
+ 
+ LOOPBACK_OBJS = \

From 910ec505268136d0e3f4595085ef6dc752898928 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Fri, 22 Mar 2019 16:49:25 -0400
Subject: [PATCH 011/163] [src] More tensor draft stuff; add simple test of
 vector stride

---
 src/matrix/kaldi-vector.cc    |   5 +-
 src/matrix/matrix-lib-test.cc |  14 +++++
 src/tensor/tensor-common.h    |  13 ++++
 src/tensor/tensor-functions.h | 109 ++++++++++++++++++++++++++--------
 src/tensor/tensor.h           |  21 +++++--
 5 files changed, 131 insertions(+), 31 deletions(-)

diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc
index 6caa49376c5..ef37ab0137a 100644
--- a/src/matrix/kaldi-vector.cc
+++ b/src/matrix/kaldi-vector.cc
@@ -219,7 +219,7 @@ template<typename Real>
 void VectorBase<Real>::CopyFromVec(const VectorBase<Real> &v) {
   KALDI_ASSERT(Dim() == v.Dim());
   if (data_ != v.data_) {
-    std::memcpy(this->data_, v.data_, dim_ * sizeof(Real));
+    cblas_Xcopy(dim_, v.data_, v.stride_, data_, stride_);
   }
 }
 
@@ -241,9 +241,10 @@ template<typename OtherReal>
 void VectorBase<Real>::CopyFromVec(const VectorBase<OtherReal> &other) {
   KALDI_ASSERT(dim_ == other.Dim());
   Real * __restrict__  ptr = data_;
+  MatrixIndexT stride = stride_, other_stride = other.stride_;
   const OtherReal * __restrict__ other_ptr = other.Data();
   for (MatrixIndexT i = 0; i < dim_; i++)
-    ptr[i] = other_ptr[i];
+    ptr[i * stride] = other_ptr[i * other_stride];
 }
 
 template void VectorBase<float>::CopyFromVec(const VectorBase<double> &other);
diff --git a/src/matrix/matrix-lib-test.cc b/src/matrix/matrix-lib-test.cc
index 8097ab119b5..7ea998a5cb4 100644
--- a/src/matrix/matrix-lib-test.cc
+++ b/src/matrix/matrix-lib-test.cc
@@ -3226,6 +3226,19 @@ template<typename Real> static void UnitTestLbfgs() {
   g_kaldi_verbose_level = temp;
 }
 
+template<typename Real>  static void UnitTestVecStrideSimple() {
+  Vector<Real> v(20);
+  v.SetRandn();
+
+  Vector<Real> v2(20);
+  SubVector<Real> v_a(v.Data(), 10, 2);
+  SubVector<Real> v2_a(v2.Data(), 10, 2);
+  v2_a.CopyFromVec(v_a);
+  SubVector<Real> v_b(v.Data() + 1, 10, 2);
+  SubVector<Real> v2_b(v2.Data() + 1, 10, 2);
+  v2_b.CopyFromVec(v_b);
+  AssertEqual(v, v2);
+}
 
 template<typename Real> static void UnitTestLinearCgd() {
   for (int i = 0; i < 20 ; i++) {
@@ -4590,6 +4603,7 @@ template<typename Real> static void UnitTestTriVecSolver() {
 
 
 template<typename Real> static void MatrixUnitTest(bool full_test) {
+  UnitTestVecStrideSimple<Real>();
   UnitTestLinearCgd<Real>();
   UnitTestGeneralMatrix<BaseFloat>();
   UnitTestTridiagonalize<Real>();
diff --git a/src/tensor/tensor-common.h b/src/tensor/tensor-common.h
index 3d778cc4446..56c02cec9a3 100644
--- a/src/tensor/tensor-common.h
+++ b/src/tensor/tensor-common.h
@@ -82,6 +82,19 @@ enum InitializePolicy {
   kUninitialized
 };
 
+/// This enumeration with one value is used in the constructor of Tensor,
+/// so if you do:
+///  `Tensor a;  Tensor b(a, kUntrackedStorage);`
+/// it will not copy the 'storage' pointer like it normally would.
+/// This is useful as an optimization that avoids atomics with
+/// std::shared_ptr, for temporary Tensors in situations where we
+/// know the Tensor we are copying from is not going out of scope
+/// for the lifetime of the temporary.
+enum TensorStorageEnum {
+  kUntrackedStorage
+};
+
+
 
 // In practice we don't expect user-owned tensors with dims greater than 5 to
 // exist, but there are certain manipulations we do when simplifying matrix
diff --git a/src/tensor/tensor-functions.h b/src/tensor/tensor-functions.h
index 8f21fc61674..77af4e8a602 100644
--- a/src/tensor/tensor-functions.h
+++ b/src/tensor/tensor-functions.h
@@ -69,14 +69,14 @@ void CopyData(const Tensor &src, const Tensor *dest);
                          reinterpreted.
      @param   [in] dims  The dimensions that we want for the returned
                        Tensor; its product must equal src.NumElements().
-     @return   If the view could be constructed, this function returns
-               a shared_ptr to a new Tensor with the requestd dims,
-               that shares underlying data with 'src'; otherwise returns
-               NULL.  (If src.HasCStrides(), then this function is
-               guaranteed not to return nullptr).
-
+     @param   [out] dest  If the view could be constructed, this function
+               makes 'dest' a view of the data in 'src' with the requested dims;
+               otherwise 'dest' will be unchanged.
+     @return   Returns true if this view could be constructed. If
+               src.HasCStrides() is true, this function will never return
+               false.
  */
-std::shared_ptr<Tensor> View(const Tensor &src, ArrayRef<int64_t> dims);
+bool View(const Tensor &src, ArrayRef<int64_t> dims, Tensor *dest);
 
 
 /**
@@ -99,41 +99,100 @@ std::shared_ptr<Tensor> View(const Tensor &src, ArrayRef<int64_t> dims);
    a temporary Tensor 'temp' passing src.Dims() in the constructor, copy the
    data in 'src' to 'temp', and then call MergeAxes on 'temp'.
 
-       @param [in]  src   The Tensor which whose axes we will attempt to
+       @param [in]  src   The Tensor whose axes we will attempt to
                           merge
        @param [in] axis1  The index of the first of the two axes which
                           this function will attempt to merge.  Must
                           be less than src.NumAxes() - 1.
-       @return            Returns a pointer to a Tensor with the
-                          merged axes (if the pattern of 'src'
-                          allows it), or nullptr otherwise.
+       @param [out] dest  The Tensor which is written to; on success this
+                          will be a Tensor with axes merged as requested,
+                          sharing the data of 'src'.  On failure, it will
+                          not be changed.
+       @return            Returns true on success, false if the axes could
+                          not be merged (e.g., because of the strides not
+                          having the required relationship).
  */
-std::shared_ptr<Tensor> MergeAxes(const Tensor &src, int64_t axis1);
+bool MergeAxes(const Tensor &src, int64_t axis1, Tensor *dest);
 
 /**
-   Returns a Tensor with a new view of the data in 'src', in which the
-   specified axis is split into two axes.  This is just a special case
-   of View().
-
-   Returns a Tensor in which the axis numbered 'axis' is split into
+   Creates a Tensor in which the axis numbered 'axis' is split into
    two axes, with dimensions respectively 'dim1' and 'dim2'.  The
    interpretation will be as for a "C" array; so, for instance,
    if the dimensions of 'src' were (10,12) and you called
    `SplitAxis(src, 1, 3, 4)` resulting in a Tensor of dimensions
    (10,3,4), the indexes along the original axis of dimension 12 would be
-   interpreted as 3 blocks of size 4.
+   interpreted as 3 blocks of size 4.  (This is the normal semantics
+   of things like NumPy's reshape or PyTorch's view.)
 
       @param [in] src  The Tensor whose axis is to be split.
       @param [in] axis  The index of the axis to be split; must
                        satisfy `0 <= axis < src.Dims().`
-      @param [in] dim1, dim2   The two dimensions into which
-                       we will split the axis.  Must satisfy
-                       `dim1 * dim2 == src.Dim(axis)`.
-      @return     Returns a Tensor which shares the same
-                  underlying data as 'src'
+      @param [in] dim1  First dimension with which to split the axis.
+      @param [in] dim2  Second dimension with which to split the axis.
+                        Must satisfy `dim1 * dim2 == src.Dim(axis)`.
+      @param [out] dest Tensor to be created, with one more axis than 'src',
+                        sharing the same underlying data.
+*/
+void SplitAxis(const Tensor &src, int64_t axis,
+               int64_t dim1, int64_t dim2,
+               Tensor *dest);
+
+
+
+
+/**
+   Does:
+
+    `c := alpha (a * b)  +  beta c`
+
+   where '*' is elementwise multiplication subject to broadcasting
+   rules.  This does not support reducing operations (see AddProductReducing).
+
+   @param [in] alpha  Value that scales a * b
+   @param [in] beta   Value that scales the initial value of c
+   @param [in] a      First input tensor
+   @param [in] b      Second input tensor; require BroadcastCompatible(a, b)
+   @param [out] c     Tensor to be added to (must already be correctly sized,
+                      and either its data must be initialized to a known
+                      value (if beta != 0) or known to not contain NaN (if
+                      beta == 0).   We require BroadcastCompatible(a, b, c, true).
  */
-std::shared_ptr<Tensor> SplitAxis(const Tensor &src, int64_t axis,
-                                  int64_t dim1, int64_t dim2);
+void AddProduct(float alpha, float beta,
+                const Tensor &a, const Tensor &b, Tensor *c);
+
+
+
+/**
+   Does:
+
+    `c := alpha (a * b)  +  beta c`
+
+   where '*' is elementwise multiplication subject to broadcasting
+   rules.  This version supports reducing operations (i.e. it allows
+   'c' to have dim=1 on axes where a and/or b has dim!=1).
+
+   This function actually supports a strict superset of AddProduct(); we
+   separate the functions to make the implementation for AddProduct() simpler,
+   for speed.
+
+   The Tensors do not all have to have the same NumAxes(); they will
+   (internally) be made the same size by padding on the left with trivial axes
+   (dim=1;stride=0) to make them the same size.
+
+   The Tensors need to have the same Dtype() and Device*().
+
+   @param [in] alpha  Value that scales a * b
+   @param [in] beta   Value that scales the initial value of c
+   @param [in] a      First input tensor
+   @param [in] b      Second input tensor; require BroadcastCompatible(a, b)
+   @param [out] c     Tensor to be added to (must already be correctly sized,
+                      and either its data must be initialized to a known
+                      value (if beta != 0) or known to not contain NaN (if
+                      beta == 0).   We require BroadcastCompatible(a, b, c).
+ */
+void AddProductReducing(float alpha, float beta,
+                        const Tensor &a, const Tensor &b, Tensor *c);
+
 
 
 }
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index f479eac6c91..b09649282fb 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -41,6 +41,9 @@ namespace tensor {
  */
 class Tensor {
  public:
+
+  inline bool Initialized() { return data_ != NULL; }
+
   /// Return the number of axes (a number in {0,1,2,3,4}).  In mathematical
   // contexts, this is sometimes known as the rank of the tensor, or sometimes
   // even its dimension, but these terms are ambiguous so we avoid them, and use
@@ -49,7 +52,7 @@ class Tensor {
 
   // Return reference to the struct containing the dimension and
   // stride info.
-  const TensorPattern &DimAndStrides() const { return pattern_; }
+  const TensorPattern &Pattern() const { return pattern_; }
 
   // Return an array containing dimensions of the tensor; equivalent to
   // .shape in PyTorch.  Dims().size() will equal NumAxes().
@@ -126,7 +129,6 @@ class Tensor {
   // int32 (if it was not already int32); throws if NumAxes() > 0.
   explicit operator int32() const;
 
-
   // For a Tensor storing floats, returns the data pointer cast to float;
   // otherwise, throws.  (note: this is const only as it doesn't change the
   // Tensor meta-info, but you could change the data using the pointer).
@@ -136,17 +138,20 @@ class Tensor {
   // Tensor meta-info, but you could change the data using the pointer).
   explicit operator double* () const;
 
-
-
   // Assignment operation which sets all elements to a constant.  Valid
   // for Tensors of any floating point type.
   const Tensor & operator = (float f);
 
   // Transpose the two axes by swapping their dims and strides without changing
   // the underlying data in memory.  This modifies *this;
+  // Negative axes are allowed, and interpreted as NumAxes() + axis.
   void Transpose(int32 axis1 = 0, int32 axis2 = 1);
 
 
+  // Constructor which does not really initialize the Tensor.  pattern_,
+  // derived_ and dtype_ may contain nonsense.
+  Tensor(): data_(NULL) { }
+
   // Copy constructor that copies the metadata while sharing the underlying
   // data.
   Tensor (const Tensor &other) = default;
@@ -207,6 +212,12 @@ class Tensor {
   Tensor(TensorPattern &pattern, DataType dtype, Device device,
          InitializePolicy p);
 
+  /**
+     This constructor, which is intended for use primarily in internal
+     code, constructs a Tensor around an existing data pointer.
+   */
+  Tensor(TensorPattern &pattern, DataType dtype, Device device,
+         void *data_);
 
  private:
   // The tensor dim and strides.
@@ -215,6 +226,8 @@ class Tensor {
   TensorPatternProperties derived_;
   // The data-type of this tensor.
   DataType dtype_;
+  // The device this Tensor lives on
+  Device device_;
 
   // The raw data pointer.  Will be cast to a pointer of the appropriate
   // type before indexing.

From 5fa86ad643d1ff3fdad1429bdd645306f292a196 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Mon, 25 Mar 2019 20:25:58 -0400
Subject: [PATCH 012/163] Kaldi10 (#3167)

* [src] progress on tensor code

* [src] Remove stride support for Vector (more trouble than it was worth)

* [src] More Tensor drafts

* [src] More tensor drafts
---
 src/matrix/kaldi-matrix.cc         |  32 +--
 src/matrix/kaldi-matrix.h          |   8 -
 src/matrix/kaldi-vector.cc         | 186 +++++++------
 src/matrix/kaldi-vector.h          |  88 +++---
 src/matrix/matrix-lib-test.cc      |  14 -
 src/matrix/sp-matrix.cc            |   5 +-
 src/tensor/cpu-impl-linear.cc      |  58 ++++
 src/tensor/cpu-impl-linear.h       |  66 +++++
 src/tensor/tensor-common.h         |   2 +
 src/tensor/tensor-functions.cc     |  66 +++++
 src/tensor/tensor-functions.h      |  33 ++-
 src/tensor/tensor-impl-linear.cc   | 121 +++++++++
 src/tensor/tensor-impl-linear.h    | 153 +++++++++++
 src/tensor/tensor-pattern-utils.cc | 251 +++++++++++++++++
 src/tensor/tensor-pattern-utils.h  | 418 +++++++++++++++++++++++++++++
 src/tensor/tensor-pattern.h        |  52 +++-
 src/tensor/tensor-utils.h          |  84 ++++++
 src/tensor/tensor.h                |  82 ++++--
 src/tensor/variable.h              | 157 +++++++++++
 19 files changed, 1644 insertions(+), 232 deletions(-)
 create mode 100644 src/tensor/cpu-impl-linear.cc
 create mode 100644 src/tensor/cpu-impl-linear.h
 create mode 100644 src/tensor/tensor-functions.cc
 create mode 100644 src/tensor/tensor-impl-linear.cc
 create mode 100644 src/tensor/tensor-impl-linear.h
 create mode 100644 src/tensor/tensor-pattern-utils.cc
 create mode 100644 src/tensor/tensor-pattern-utils.h
 create mode 100644 src/tensor/tensor-utils.h
 create mode 100644 src/tensor/variable.h

diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc
index bb1f164441a..16b75d08c1a 100644
--- a/src/matrix/kaldi-matrix.cc
+++ b/src/matrix/kaldi-matrix.cc
@@ -119,8 +119,8 @@ void MatrixBase<float>::AddVecVec(const float alpha,
                                   const VectorBase<float> &a,
                                   const VectorBase<float> &b) {
   KALDI_ASSERT(a.Dim() == num_rows_ && b.Dim() == num_cols_);
-  cblas_Xger(a.Dim(), b.Dim(), alpha, a.Data(), a.Stride(),
-             b.Data(), b.Stride(), data_, stride_);
+  cblas_Xger(a.Dim(), b.Dim(), alpha, a.Data(), 1,
+             b.Data(), 1, data_, stride_);
 }
 
 template<typename Real>
@@ -133,17 +133,16 @@ void MatrixBase<Real>::AddVecVec(const Real alpha,
     // temporary vectors of the right type and use BLAS.
     Vector<Real> temp_a(a), temp_b(b);
     cblas_Xger(num_rows_, num_cols_, alpha,
-               temp_a.Data(), temp_a.Stride(),
-               temp_b.Data(), temp_b.Stride(),
+               temp_a.Data(), 1,
+               temp_b.Data(), 1,
                data_, stride_);
   } else {
     const OtherReal *a_data = a.Data(), *b_data = b.Data();
-    MatrixIndexT a_stride = a.Stride(), b_stride = b.Stride();
     Real *row_data = data_;
     for (MatrixIndexT i = 0; i < num_rows_; i++, row_data += stride_) {
-      BaseFloat alpha_ai = alpha * a_data[i * a_stride];
+      BaseFloat alpha_ai = alpha * a_data[i];
       for (MatrixIndexT j = 0; j < num_cols_; j++)
-        row_data[j] += alpha_ai * b_data[j * b_stride];
+        row_data[j] += alpha_ai * b_data[j];
     }
   }
 }
@@ -165,8 +164,8 @@ void MatrixBase<double>::AddVecVec(const double alpha,
                                    const VectorBase<double> &b) {
   KALDI_ASSERT(a.Dim() == num_rows_ && b.Dim() == num_cols_);
   if (num_rows_ == 0) return;
-  cblas_Xger(a.Dim(), b.Dim(), alpha, a.Data(), a.Stride(),
-             b.Data(), b.Stride(), data_, stride_);
+  cblas_Xger(a.Dim(), b.Dim(), alpha, a.Data(), 1,
+             b.Data(), 1, data_, stride_);
 }
 
 template<typename Real>
@@ -594,10 +593,9 @@ void MatrixBase<Real>::AddDiagVecMat(
   if (transM == kTrans) std::swap(M_row_stride, M_col_stride);
   Real *data = data_;
   const Real *Mdata = M.Data(), *vdata = v.Data();
-  MatrixIndexT v_stride = v.Stride();
   if (num_rows_ == 0) return;
   for (MatrixIndexT i = 0; i < num_rows;
-       i++, data += stride, Mdata += M_row_stride, vdata += v_stride)
+       i++, data += stride, Mdata += M_row_stride, vdata++)
     cblas_Xaxpy(num_cols, alpha * *vdata, Mdata, M_col_stride, data, 1);
 }
 
@@ -628,12 +626,11 @@ void MatrixBase<Real>::AddMatDiagVec(
 
   Real *data = data_;
   const Real *Mdata = M.Data(), *vdata = v.Data();
-  MatrixIndexT v_stride = v.Stride();
   if (num_rows_ == 0) return;
   for (MatrixIndexT i = 0; i < num_rows; i++){
-      for(MatrixIndexT j = 0; j < num_cols; j ++ ){
-          data[i*stride + j] += alpha * vdata[j * v_stride] * Mdata[i*M_row_stride + j*M_col_stride];
-      }
+    for(MatrixIndexT j = 0; j < num_cols; j ++ ){
+      data[i*stride + j] += alpha * vdata[j] * Mdata[i*M_row_stride + j*M_col_stride];
+    }
   }
 }
 
@@ -1793,7 +1790,7 @@ void MatrixBase<Real>::DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
   // Throws exception on error.
 
   KALDI_ASSERT(num_rows_>=num_cols_ && "Svd requires that #rows by >= #cols.");  // For compatibility with JAMA code.
-  KALDI_ASSERT(s->Dim() == num_cols_ && s->Stride() == 1);  // s should be the smaller dim.
+  KALDI_ASSERT(s->Dim() == num_cols_);  // s should be the smaller dim.
   KALDI_ASSERT(U == NULL || (U->num_rows_ == num_rows_&&U->num_cols_ == num_cols_));
   KALDI_ASSERT(Vt == NULL || (Vt->num_rows_ == num_cols_&&Vt->num_cols_ == num_cols_));
 
@@ -2005,8 +2002,7 @@ void MatrixBase<Real>::SymPosSemiDefEig(VectorBase<Real> *s, MatrixBase<Real> *U
 
   KALDI_ASSERT(num_rows_ == num_cols_);
   KALDI_ASSERT(IsSymmetric() && "SymPosSemiDefEig: expecting input to be symmetrical.");
-  KALDI_ASSERT(U->num_rows_ == D && U->num_cols_ == D && s->Dim() == D &&
-               s->Stride() == 1);
+  KALDI_ASSERT(U->num_rows_ == D && U->num_cols_ == D && s->Dim() == D);
 
   Matrix<Real>  Vt(D, D);
   Svd(s, U, &Vt);
diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h
index e5b2a658054..33ce5108360 100644
--- a/src/matrix/kaldi-matrix.h
+++ b/src/matrix/kaldi-matrix.h
@@ -191,14 +191,6 @@ class MatrixBase {
     return SubVector<Real>(data_ + (i * stride_), num_cols_);
   }
 
-  /// Return specific column of matrix.  Warning: this can get
-  /// around const constraints.
-  inline const SubVector<Real> Col(MatrixIndexT i) const {
-    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
-                 static_cast<UnsignedMatrixIndexT>(num_cols_));
-    return SubVector<Real>(data_ + i, num_rows_, stride_);
-  }
-
   /// Return a sub-part of matrix.
   inline SubMatrix<Real> Range(const MatrixIndexT row_offset,
                                const MatrixIndexT num_rows,
diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc
index ef37ab0137a..d1d66b7bda4 100644
--- a/src/matrix/kaldi-vector.cc
+++ b/src/matrix/kaldi-vector.cc
@@ -33,7 +33,6 @@
 
 namespace kaldi {
 
-
 template<typename Real> inline const Real* Get64Ones() {
   // The C++ standard doesn't seem to provide a compact way to do this.
   static const Real ones[64] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -43,13 +42,12 @@ template<typename Real> inline const Real* Get64Ones() {
   return ones;
 }
 
-
 template<typename Real>
 Real VecVec(const VectorBase<Real> &a,
             const VectorBase<Real> &b) {
   MatrixIndexT adim = a.Dim();
   KALDI_ASSERT(adim == b.Dim());
-  return cblas_Xdot(adim, a.Data(), a.Stride(), b.Data(), b.Stride());
+  return cblas_Xdot(adim, a.Data(), 1, b.Data(), 1);
 }
 
 template
@@ -60,26 +58,25 @@ double VecVec<>(const VectorBase<double> &a,
                 const VectorBase<double> &b);
 
 template<typename Real, typename OtherReal>
-Real VecVec(const VectorBase<Real> &a,
-            const VectorBase<OtherReal> &b) {
-  MatrixIndexT adim = a.Dim();
-  KALDI_ASSERT(adim == b.Dim());
-  const Real *a_data = a.Data();
-  const OtherReal *b_data = b.Data();
-  MatrixIndexT a_stride = a.Stride(), b_stride = b.Stride();
+Real VecVec(const VectorBase<Real> &ra,
+            const VectorBase<OtherReal> &rb) {
+  MatrixIndexT adim = ra.Dim();
+  KALDI_ASSERT(adim == rb.Dim());
+  const Real *a_data = ra.Data();
+  const OtherReal *b_data = rb.Data();
   Real sum = 0.0;
   for (MatrixIndexT i = 0; i < adim; i++)
-    sum += a_data[i * a_stride] * b_data[i * b_stride];
+    sum += a_data[i]*b_data[i];
   return sum;
 }
 
 // instantiate the template above.
 template
-float VecVec<>(const VectorBase<float> &a,
-               const VectorBase<double> &b);
+float VecVec<>(const VectorBase<float> &ra,
+               const VectorBase<double> &rb);
 template
-double VecVec<>(const VectorBase<double> &a,
-                const VectorBase<float> &b);
+double VecVec<>(const VectorBase<double> &ra,
+                const VectorBase<float> &rb);
 
 
 template<>
@@ -88,7 +85,7 @@ void VectorBase<float>::AddVec(const float alpha,
                                const VectorBase<float> &v) {
   KALDI_ASSERT(dim_ == v.dim_);
   KALDI_ASSERT(&v != this);
-  cblas_Xaxpy(dim_, alpha, v.Data(), v.stride_, data_, stride_);
+  cblas_Xaxpy(dim_, alpha, v.Data(), 1, data_, 1);
 }
 
 template<>
@@ -97,7 +94,7 @@ void VectorBase<double>::AddVec(const double alpha,
                                 const VectorBase<double> &v) {
   KALDI_ASSERT(dim_ == v.dim_);
   KALDI_ASSERT(&v != this);
-  cblas_Xaxpy(dim_, alpha, v.Data(), v.stride_, data_, stride_);
+  cblas_Xaxpy(dim_, alpha, v.Data(), 1, data_, 1);
 }
 
 template<typename Real>
@@ -110,7 +107,7 @@ void VectorBase<Real>::AddMatVec(const Real alpha,
                || (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_));
   KALDI_ASSERT(&v != this);
   cblas_Xgemv(trans, M.NumRows(), M.NumCols(), alpha, M.Data(), M.Stride(),
-              v.Data(), v.stride_, beta, data_, stride_);
+              v.Data(), 1, beta, data_, 1);
 }
 
 template<typename Real>
@@ -123,8 +120,30 @@ void VectorBase<Real>::AddMatSvec(const Real alpha,
                || (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_));
   KALDI_ASSERT(&v != this);
   Xgemv_sparsevec(trans, M.NumRows(), M.NumCols(), alpha, M.Data(), M.Stride(),
-                  v.Data(), v.stride_, beta, data_, stride_);
+                  v.Data(), 1, beta, data_, 1);
   return;
+  /*
+  MatrixIndexT this_dim = this->dim_, v_dim = v.dim_,
+      M_stride = M.Stride();
+  Real *this_data = this->data_;
+  const Real *M_data = M.Data(), *v_data = v.data_;
+  if (beta != 1.0) this->Scale(beta);
+  if (trans == kNoTrans) {
+    for (MatrixIndexT i = 0; i < v_dim; i++) {
+      Real v_i = v_data[i];
+      if (v_i == 0.0) continue;
+      // Add to *this, the i'th column of the Matrix, times v_i.
+      cblas_Xaxpy(this_dim, v_i * alpha, M_data + i, M_stride, this_data, 1);
+    }
+  } else { // The transposed case is slightly more efficient, I guess.
+    for (MatrixIndexT i = 0; i < v_dim; i++) {
+      Real v_i = v.data_[i];
+      if (v_i == 0.0) continue;
+      // Add to *this, the i'th row of the Matrix, times v_i.
+      cblas_Xaxpy(this_dim, v_i * alpha,
+                  M_data + (i * M_stride), 1, this_data, 1);
+    }
+    }*/
 }
 
 template<typename Real>
@@ -134,8 +153,7 @@ void VectorBase<Real>::AddSpVec(const Real alpha,
                                 const Real beta) {
   KALDI_ASSERT(M.NumRows() == v.dim_ && dim_ == v.dim_);
   KALDI_ASSERT(&v != this);
-  cblas_Xspmv(M.NumRows(), alpha, M.Data(), v.Data(), v.stride_,
-              beta, data_, stride_);
+  cblas_Xspmv(M.NumRows(), alpha, M.Data(), v.Data(), 1, beta, data_, 1);
 }
 
 
@@ -143,20 +161,19 @@ template<typename Real>
 void VectorBase<Real>::MulTp(const TpMatrix<Real> &M,
                               const MatrixTransposeType trans) {
   KALDI_ASSERT(M.NumRows() == dim_);
-  cblas_Xtpmv(trans, M.Data(), M.NumRows(), data_, stride_);
+  cblas_Xtpmv(trans, M.Data(), M.NumRows(), data_, 1);
 }
 
 template<typename Real>
 void VectorBase<Real>::Solve(const TpMatrix<Real> &M,
                         const MatrixTransposeType trans) {
   KALDI_ASSERT(M.NumRows() == dim_);
-  cblas_Xtpsv(trans, M.Data(), M.NumRows(), data_, stride_);
+  cblas_Xtpsv(trans, M.Data(), M.NumRows(), data_, 1);
 }
 
 
 template<typename Real>
 inline void Vector<Real>::Init(const MatrixIndexT dim) {
-  this->stride_ = 1;
   KALDI_ASSERT(dim >= 0);
   if (dim == 0) {
     this->dim_ = 0;
@@ -180,6 +197,7 @@ inline void Vector<Real>::Init(const MatrixIndexT dim) {
 
 template<typename Real>
 void Vector<Real>::Resize(const MatrixIndexT dim, MatrixResizeType resize_type) {
+
   // the next block uses recursion to handle what we have to do if
   // resize_type == kCopyData.
   if (resize_type == kCopyData) {
@@ -219,7 +237,7 @@ template<typename Real>
 void VectorBase<Real>::CopyFromVec(const VectorBase<Real> &v) {
   KALDI_ASSERT(Dim() == v.Dim());
   if (data_ != v.data_) {
-    cblas_Xcopy(dim_, v.data_, v.stride_, data_, stride_);
+    std::memcpy(this->data_, v.data_, dim_ * sizeof(Real));
   }
 }
 
@@ -235,22 +253,27 @@ template void VectorBase<float>::CopyFromPacked(const PackedMatrix<float> &other
 template void VectorBase<double>::CopyFromPacked(const PackedMatrix<double> &other);
 template void VectorBase<double>::CopyFromPacked(const PackedMatrix<float> &other);
 
+/// Load data into the vector
+template<typename Real>
+void VectorBase<Real>::CopyFromPtr(const Real *data, MatrixIndexT sz) {
+  KALDI_ASSERT(dim_ == sz);
+  std::memcpy(this->data_, data, Dim() * sizeof(Real));
+}
 
 template<typename Real>
 template<typename OtherReal>
 void VectorBase<Real>::CopyFromVec(const VectorBase<OtherReal> &other) {
   KALDI_ASSERT(dim_ == other.Dim());
   Real * __restrict__  ptr = data_;
-  MatrixIndexT stride = stride_, other_stride = other.stride_;
   const OtherReal * __restrict__ other_ptr = other.Data();
   for (MatrixIndexT i = 0; i < dim_; i++)
-    ptr[i * stride] = other_ptr[i * other_stride];
+    ptr[i] = other_ptr[i];
 }
 
 template void VectorBase<float>::CopyFromVec(const VectorBase<double> &other);
 template void VectorBase<double>::CopyFromVec(const VectorBase<float> &other);
 
-// Remove element from the vector. The vector is not reallocated
+// Remove element from the vector. The vector is not reallocated
 template<typename Real>
 void Vector<Real>::RemoveElement(MatrixIndexT i) {
   KALDI_ASSERT(i <  this->dim_ && "Access out of vector");
@@ -664,15 +687,14 @@ void VectorBase<double>::CopyColFromMat(const MatrixBase<double> &mat, MatrixInd
 template<typename Real>
 void VectorBase<Real>::CopyDiagFromMat(const MatrixBase<Real> &M) {
   KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols()));
-  cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, stride_);
+  cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, 1);
 }
 
 template<typename Real>
 void VectorBase<Real>::CopyDiagFromPacked(const PackedMatrix<Real> &M) {
   KALDI_ASSERT(dim_ == M.NumCols());
-  MatrixIndexT stride = stride_, dim = dim_;
-  for (MatrixIndexT i = 0; i < dim; i++)
-    data_[i * stride] = M(i, i);
+  for (MatrixIndexT i = 0; i < dim_; i++)
+    data_[i] = M(i, i);
   // could make this more efficient.
 }
 
@@ -682,16 +704,15 @@ Real VectorBase<Real>::Sum() const {
   // implement sum. This allows us to access SIMD operations in a
   // cross-platform way via your BLAS library.
   Real one(1);
-  return cblas_Xdot(dim_, data_, stride_, &one, 0);
+  return cblas_Xdot(dim_, data_, 1, &one, 0);
 }
 
 template<typename Real>
 Real VectorBase<Real>::SumLog() const {
   double sum_log = 0.0;
   double prod = 1.0;
-  MatrixIndexT dim = dim_, stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    prod *= data_[i * stride];
+  for (MatrixIndexT i = 0; i < dim_; i++) {
+    prod *= data_[i];
     // Possible future work (arnab): change these magic values to pre-defined
     // constants
     if (prod < 1.0e-10 || prod > 1.0e+10) {
@@ -704,8 +725,7 @@ Real VectorBase<Real>::SumLog() const {
 }
 
 template<typename Real>
-void VectorBase<Real>::AddRowSumMat(Real alpha, const MatrixBase<Real> &M,
-                                    Real beta) {
+void VectorBase<Real>::AddRowSumMat(Real alpha, const MatrixBase<Real> &M, Real beta) {
   KALDI_ASSERT(dim_ == M.NumCols());
   // the BLAS standard does not support vectors with stride zero, even though
   // some implementations (such as Mac's accelerate framework and I believe
@@ -719,29 +739,27 @@ void VectorBase<Real>::AddRowSumMat(Real alpha, const MatrixBase<Real> &M,
         std::min<MatrixIndexT>(64, num_rows - row_offset);
     cblas_Xgemv(kTrans, this_num_rows, M.NumCols(), alpha,
                 M.RowData(row_offset), M.Stride(), ones, 1,
-                beta, data_, stride_);
+                beta, data_, 1);
     beta = 1.0;
   }
 }
 
-
 template<typename Real>
-void VectorBase<Real>::AddColSumMat(Real alpha, const MatrixBase<Real> &M,
-                                    Real beta) {
+void VectorBase<Real>::AddColSumMat(Real alpha, const MatrixBase<Real> &M, Real beta) {
   KALDI_ASSERT(dim_ == M.NumRows());
   // the BLAS standard does not support vectors with stride zero, even though
   // some implementations (such as Mac's accelerate framework and I believe
   // CUBLAS) seem to allow it.  We compile a fixed-size (64) vector of ones
   // into the program.
   const Real *ones = Get64Ones<Real>();
-
   MatrixIndexT num_cols = M.NumCols();
   for (MatrixIndexT col_offset = 0; col_offset < num_cols; col_offset += 64) {
     MatrixIndexT this_num_cols =
         std::min<MatrixIndexT>(64, num_cols - col_offset);
     cblas_Xgemv(kNoTrans, M.NumRows(), this_num_cols, alpha,
-                M.Data() + col_offset, M.Stride(), ones, 1,
-                beta, data_, stride_);
+                M.Data() + col_offset, M.Stride(),
+                ones, 1,
+                beta, data_, 1);
     beta = 1.0;
   }
 }
@@ -759,11 +777,8 @@ Real VectorBase<Real>::LogSumExp(Real prune) const {
 
   double sum_relto_max_elem = 0.0;
 
-  const Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
-
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    BaseFloat f = data[i * stride];
+  for (MatrixIndexT i = 0; i < dim_; i++) {
+    BaseFloat f = data_[i];
     if (f >= cutoff)
       sum_relto_max_elem += Exp(f - max_elem);
   }
@@ -772,50 +787,38 @@ Real VectorBase<Real>::LogSumExp(Real prune) const {
 
 template<typename Real>
 void VectorBase<Real>::InvertElements() {
-  MatrixIndexT dim = dim_, stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    data_[i * stride] = static_cast<Real>(1) / data_[i * stride];
+  for (MatrixIndexT i = 0; i < dim_; i++) {
+    data_[i] = static_cast<Real>(1 / data_[i]);
   }
 }
 
 template<typename Real>
 void VectorBase<Real>::ApplyLog() {
-  MatrixIndexT dim = dim_, stride = stride_;
-  Real *data = data_;
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    if (data[i * stride] < 0.0)
+  for (MatrixIndexT i = 0; i < dim_; i++) {
+    if (data_[i] < 0.0)
       KALDI_ERR << "Trying to take log of a negative number.";
-    data[i * stride] = Log(data[i * stride]);
+    data_[i] = Log(data_[i]);
   }
 }
 
 template<typename Real>
 void VectorBase<Real>::ApplyLog(const VectorBase<Real> &v) {
   KALDI_ASSERT(dim_ == v.Dim());
-  MatrixIndexT dim = dim_, stride = stride_, v_stride = v.stride_;
-  Real *data = data_;
-  const Real *v_data = v.data_;
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    data[i * stride] = Log(v_data[i * v_stride]);
+  for (MatrixIndexT i = 0; i < dim_; i++) {
+    data_[i] = Log(v(i));
   }
 }
 
 template<typename Real>
 void VectorBase<Real>::ApplyExp() {
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    data[i * stride] = Exp(data[i * stride]);
+  for (MatrixIndexT i = 0; i < dim_; i++) {
+    data_[i] = Exp(data_[i]);
   }
 }
 
 template<typename Real>
 void VectorBase<Real>::ApplyAbs() {
-  Real *data = data_;
-  MatrixIndexT dim = dim_, stride = stride_;
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    data[i * stride] = std::abs(data[i * stride]);
-  }
+  for (MatrixIndexT i = 0; i < dim_; i++) { data_[i] = std::abs(data_[i]); }
 }
 
 template<typename Real>
@@ -828,8 +831,8 @@ void VectorBase<Real>::ApplyFloor(Real floor_val, MatrixIndexT *floored_count) {
     MatrixIndexT num_floored = 0;
     for (MatrixIndexT i = 0; i < dim_; i++) {
       if (data_[i] < floor_val) {
-        data_[i] = floor_val;
-        num_floored++;
+	data_[i] = floor_val;
+	num_floored++;
       }
     }
     *floored_count = num_floored;
@@ -838,18 +841,16 @@ void VectorBase<Real>::ApplyFloor(Real floor_val, MatrixIndexT *floored_count) {
 
 template<typename Real>
 void VectorBase<Real>::ApplyCeiling(Real ceil_val, MatrixIndexT *ceiled_count) {
-  MatrixIndexT dim = dim_, stride = stride_;
-  Real *data = data_;
   if (ceiled_count == nullptr) {
-    for (MatrixIndexT i = 0; i < dim; i++) {
-      data[i * stride] = std::min(data[i * stride], ceil_val);
+    for (MatrixIndexT i = 0; i < dim_; i++) {
+      data_[i] = std::min(data_[i], ceil_val);
     }
   } else {
     MatrixIndexT num_changed = 0;
-    for (MatrixIndexT i = 0; i < dim; i++) {
-      if (data[i * stride] > ceil_val) {
-        data_[i * stride] = ceil_val;
-        num_changed++;
+    for (MatrixIndexT i = 0; i < dim_; i++) {
+      if (data_[i] > ceil_val) {
+	data_[i] = ceil_val;
+	num_changed++;
       }
     }
     *ceiled_count = num_changed;
@@ -858,15 +859,11 @@ void VectorBase<Real>::ApplyCeiling(Real ceil_val, MatrixIndexT *ceiled_count) {
 
 template<typename Real>
 MatrixIndexT VectorBase<Real>::ApplyFloor(const VectorBase<Real> &floor_vec) {
-  MatrixIndexT dim = dim_, stride = stride_,
-      floor_stride = floor_vec.stride_;
-  Real *data = data_;
-  const Real *floor_data = floor_vec.data_;
-  KALDI_ASSERT(floor_vec.dim_ == dim);
+  KALDI_ASSERT(floor_vec.Dim() == dim_);
   MatrixIndexT num_floored = 0;
-  for (MatrixIndexT i = 0; i < dim; i++) {
-    if (data[i * stride] < floor_data[i * floor_stride]) {
-      data_[i * stride] = floor_data[i * floor_stride];
+  for (MatrixIndexT i = 0; i < dim_; i++) {
+    if (data_[i] < floor_vec(i)) {
+      data_[i] = floor_vec(i);
       num_floored++;
     }
   }
@@ -971,7 +968,7 @@ void VectorBase<Real>::Add(Real c) {
 
 template<typename Real>
 void VectorBase<Real>::Scale(Real alpha) {
-  cblas_Xscal(dim_, alpha, data_, stride_);
+  cblas_Xscal(dim_, alpha, data_, 1);
 }
 
 template<typename Real>
@@ -1012,8 +1009,8 @@ void VectorBase<Real>::AddVecVec(Real alpha, const VectorBase<Real> &v,
   KALDI_ASSERT(v.data_ != this->data_ && r.data_ != this->data_);
   // We pretend that v is a band-diagonal matrix.
   KALDI_ASSERT(dim_ == v.dim_ && dim_ == r.dim_);
-  cblas_Xgbmv(kNoTrans, dim_, dim_, 0, 0, alpha, v.data_, v.stride_,
-              r.data_, r.stride_, beta, this->data_, stride_);
+  cblas_Xgbmv(kNoTrans, dim_, dim_, 0, 0, alpha, v.data_, 1,
+              r.data_, 1, beta, this->data_, 1);
 }
 
 
@@ -1321,8 +1318,7 @@ void VectorBase<Real>::AddDiagMat2(
     Real *data = this->data_;
     const Real *mat_data = M.Data();
     for (MatrixIndexT i = 0; i < rows; i++, mat_data += mat_stride, data++)
-      *data = beta * *data + alpha * cblas_Xdot(cols, mat_data, 1,
-                                                mat_data, 1);
+      *data = beta * *data + alpha * cblas_Xdot(cols,mat_data,1,mat_data,1);
   } else {
     KALDI_ASSERT(this->dim_ == M.NumCols());
     MatrixIndexT rows = M.NumRows(), cols = this->dim_,
diff --git a/src/matrix/kaldi-vector.h b/src/matrix/kaldi-vector.h
index 9da4c18af1d..c2d62c15f70 100644
--- a/src/matrix/kaldi-vector.h
+++ b/src/matrix/kaldi-vector.h
@@ -62,19 +62,8 @@ class VectorBase {
   /// Returns the  dimension of the vector.
   inline MatrixIndexT Dim() const { return dim_; }
 
-  /// Returns the stride betwen elements of the vector; will normally be 1, and
-  /// must be nonzero.  CAUTION: we are in the process of updating this library
-  /// to support vector strides, so stride != 1 may not be supported everywhere,
-  /// and may sometimes lead to unexpected behavior or crashes.
-  inline MatrixIndexT Stride() const { return stride_; }
-
-  /// Returns the size in memory of the vector, in bytes, assuming
-  /// stride is 1 (if not, this doesn't make sense in the contexts
-  /// in which this is called.  TODO: get rid of this
-  inline MatrixIndexT SizeInBytes() const {
-    KALDI_ASSERT(stride_ == 1);
-    return (dim_*sizeof(Real));
-  }
+  /// Returns the size in memory of the vector, in bytes.
+  inline MatrixIndexT SizeInBytes() const { return (dim_*sizeof(Real)); }
 
   /// Returns a pointer to the start of the vector's data.
   inline Real* Data() { return data_; }
@@ -86,14 +75,14 @@ class VectorBase {
   inline Real operator() (MatrixIndexT i) const {
     KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(dim_));
-    return *(data_ + i * stride_);
+    return *(data_ + i);
   }
 
   /// Indexing operator (non-const).
   inline Real & operator() (MatrixIndexT i) {
     KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(dim_));
-    return *(data_ + i * stride_);
+    return *(data_ + i);
   }
 
   /** @brief Returns a sub-vector of a vector (a range of elements).
@@ -371,18 +360,25 @@ class VectorBase {
   ~VectorBase() {}
 
   /// Empty initializer, corresponds to vector of zero size.
-  explicit VectorBase(): data_(NULL), dim_(0), stride_(1) {
+  explicit VectorBase(): data_(NULL), dim_(0) {
     KALDI_ASSERT_IS_FLOATING_TYPE(Real);
   }
 
+// Took this out since it is not currently used, and it is possible to create
+// objects where the allocated memory is not the same size as dim_ : Arnab
+//  /// Initializer from a pointer and a size; keeps the pointer internally
+//  /// (ownership or non-ownership depends on the child class).
+//  explicit VectorBase(Real* data, MatrixIndexT dim)
+//      : data_(data), dim_(dim) {}
+
+  // Arnab : made this protected since it is unsafe too.
+  /// Load data into the vector: sz must match own size.
+  void CopyFromPtr(const Real* Data, MatrixIndexT sz);
 
   /// data memory area
   Real* data_;
   /// dimension of vector
   MatrixIndexT dim_;
-  /// stride between elements of the vector.  Would normally be 1.  Must be
-  /// > 0  (if the vector is nonempty).
-  MatrixIndexT stride_;
   KALDI_DISALLOW_COPY_AND_ASSIGN(VectorBase);
 }; // class VectorBase
 
@@ -488,32 +484,18 @@ class Vector: public VectorBase<Real> {
 template<typename Real>
 class SubVector : public VectorBase<Real> {
  public:
-  /**
-     Constructor from a Vector or SubVector.
-     SubVectors are not const-safe and it's very hard to make them
-     so for now we just give up.  This function contains const_cast.
-        @param [in] src     The vector we are taking a sub-vector of
-        @param [in] begin   The first element in 'src'
-        @param [in] num_elements  The number of elements we are taking
-        @param [in] step   The step between elements from 'src'; must be
-                           >0.
-  */
-  SubVector(const VectorBase<Real> &src,
-            const MatrixIndexT begin,
-            const MatrixIndexT num_elements,
-            const MatrixIndexT step = 1) : VectorBase<Real>() {
-    // Casting to UnsignedMatrixIndexT is a mechanism to test something
-    // is >= 0 as well as < x (for positive x) in a single comparison.
-    typedef UnsignedMatrixIndexT U;
-    KALDI_ASSERT(
-        step != 0 &&
-        static_cast<U>(begin) < static_cast<U>(src.Dim()) &&
-        static_cast<U>(begin + step * (num_elements - 1)) <
-        static_cast<U>(src.Dim()));
-    VectorBase<Real>::data_ = const_cast<Real*> (src.Data() +
-                                                 begin * src.Stride());
-    VectorBase<Real>::dim_   = num_elements;
-    VectorBase<Real>::stride_ = step * src.Stride();
+  /// Constructor from a Vector or SubVector.
+  /// SubVectors are not const-safe and it's very hard to make them
+  /// so for now we just give up.  This function contains const_cast.
+  SubVector(const VectorBase<Real> &t, const MatrixIndexT origin,
+            const MatrixIndexT length) : VectorBase<Real>() {
+    // following assert is meant to check origin>=0 && length>=0 &&
+    // origin+length <= t.Dim()
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(origin)+
+                 static_cast<UnsignedMatrixIndexT>(length) <=
+                 static_cast<UnsignedMatrixIndexT>(t.Dim()));
+    VectorBase<Real>::data_ = const_cast<Real*> (t.Data()+origin);
+    VectorBase<Real>::dim_   = length;
   }
 
   /// This constructor initializes the vector to point at the contents
@@ -521,7 +503,6 @@ class SubVector : public VectorBase<Real> {
   SubVector(const PackedMatrix<Real> &M) {
     VectorBase<Real>::data_ = const_cast<Real*> (M.Data());
     VectorBase<Real>::dim_   = (M.NumRows()*(M.NumRows()+1))/2;
-    VectorBase<Real>::stride_ = 1;
   }
 
   /// Copy constructor
@@ -529,28 +510,21 @@ class SubVector : public VectorBase<Real> {
     // this copy constructor needed for Range() to work in base class.
     VectorBase<Real>::data_ = other.data_;
     VectorBase<Real>::dim_ = other.dim_;
-    VectorBase<Real>::stride_ = other.stride_;
   }
 
-  /// Constructor from a pointer to memory and a length, and an optional stride.
-  /// Keeps a pointer to the data but does not take ownership (will never
-  /// delete).  Caution: this constructor enables you to evade const
-  /// constraints.
-  SubVector(const Real *data, MatrixIndexT length, MatrixIndexT stride = 1):
-      VectorBase<Real> () {
+  /// Constructor from a pointer to memory and a length.  Keeps a pointer
+  /// to the data but does not take ownership (will never delete).
+  /// Caution: this constructor enables you to evade const constraints.
+  SubVector(const Real *data, MatrixIndexT length) : VectorBase<Real> () {
     VectorBase<Real>::data_ = const_cast<Real*>(data);
     VectorBase<Real>::dim_   = length;
-    VectorBase<Real>::stride_ = stride;
   }
 
 
   /// This operation does not preserve const-ness, so be careful.
-  /// This function is somewhat deprecated, for being ambiguous
-  /// MatrixBase:Row() is probably preferred.
   SubVector(const MatrixBase<Real> &matrix, MatrixIndexT row) {
     VectorBase<Real>::data_ = const_cast<Real*>(matrix.RowData(row));
     VectorBase<Real>::dim_   = matrix.NumCols();
-    VectorBase<Real>::stride_ = 1;
   }
 
   ~SubVector() {}  ///< Destructor (does nothing; no pointers are owned here).
diff --git a/src/matrix/matrix-lib-test.cc b/src/matrix/matrix-lib-test.cc
index 7ea998a5cb4..8097ab119b5 100644
--- a/src/matrix/matrix-lib-test.cc
+++ b/src/matrix/matrix-lib-test.cc
@@ -3226,19 +3226,6 @@ template<typename Real> static void UnitTestLbfgs() {
   g_kaldi_verbose_level = temp;
 }
 
-template<typename Real>  static void UnitTestVecStrideSimple() {
-  Vector<Real> v(20);
-  v.SetRandn();
-
-  Vector<Real> v2(20);
-  SubVector<Real> v_a(v.Data(), 10, 2);
-  SubVector<Real> v2_a(v2.Data(), 10, 2);
-  v2_a.CopyFromVec(v_a);
-  SubVector<Real> v_b(v.Data() + 1, 10, 2);
-  SubVector<Real> v2_b(v2.Data() + 1, 10, 2);
-  v2_b.CopyFromVec(v_b);
-  AssertEqual(v, v2);
-}
 
 template<typename Real> static void UnitTestLinearCgd() {
   for (int i = 0; i < 20 ; i++) {
@@ -4603,7 +4590,6 @@ template<typename Real> static void UnitTestTriVecSolver() {
 
 
 template<typename Real> static void MatrixUnitTest(bool full_test) {
-  UnitTestVecStrideSimple<Real>();
   UnitTestLinearCgd<Real>();
   UnitTestGeneralMatrix<BaseFloat>();
   UnitTestTridiagonalize<Real>();
diff --git a/src/matrix/sp-matrix.cc b/src/matrix/sp-matrix.cc
index 40511f537ef..d63e1b1aed1 100644
--- a/src/matrix/sp-matrix.cc
+++ b/src/matrix/sp-matrix.cc
@@ -185,12 +185,11 @@ void SpMatrix<Real>::AddDiagVec(const Real alpha, const VectorBase<OtherReal> &v
   KALDI_ASSERT(num_rows == v.Dim() && num_rows > 0);
   const OtherReal *src = v.Data();
   Real *dst = this->data_;
-  MatrixIndexT src_stride = v.Stride();
   if (alpha == 1.0)
-    for (int32 i = 1; i <= num_rows; i++, src += src_stride, dst += i)
+    for (int32 i = 1; i <= num_rows; i++, src++, dst += i)
       *dst += *src;
   else
-    for (int32 i = 1; i <= num_rows; i++, src += src_stride, dst += i)
+    for (int32 i = 1; i <= num_rows; i++, src++, dst += i)
       *dst += alpha * *src;
 }
 
diff --git a/src/tensor/cpu-impl-linear.cc b/src/tensor/cpu-impl-linear.cc
new file mode 100644
index 00000000000..7b659841395
--- /dev/null
+++ b/src/tensor/cpu-impl-linear.cc
@@ -0,0 +1,58 @@
+// tensor/cpu-impl-linear.cc
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tensor/cpu-impl.h"
+
+
+namespace kaldi {
+namespace tensor {
+
+// Scalar case of AddProduct: *c := alpha * (*a * *b) + beta * *c.
+template <typename Real>
+inline static void AddProductScalar3CPU(
+    float alpha, float beta,
+    const TensorImpl &a, const TensorImpl &b, const TensorImpl *c) {
+  Real *a_data = static_cast<Real*>(a.data),
+      *b_data = static_cast<Real*>(b.data),
+      *c_data = static_cast<Real*>(c->data);
+  if (beta != 0.0) {
+    *c_data = (beta * *c_data) + alpha * (*a_data * *b_data);
+  } else {  // don't propagate NaN
+    *c_data = alpha * (*a_data * *b_data);
+  }
+}
+
+// Dispatches on the dtype of 'c' to the templated scalar implementation.
+void AddProductScalar3CPU(
+    float alpha, float beta,
+    const TensorImpl &a, const TensorImpl &b, const TensorImpl *c) {
+  if (c->dtype == kFloatDtype) {
+    AddProductScalar3CPU<float>(alpha, beta, a, b, c);
+  } else if (c->dtype == kDoubleDtype) {
+    AddProductScalar3CPU<double>(alpha, beta, a, b, c);
+  } else {
+    KALDI_ERR << "Data type not supported for this operation";
+  }
+}
+
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/cpu-impl-linear.h b/src/tensor/cpu-impl-linear.h
new file mode 100644
index 00000000000..3ccc92c34c1
--- /dev/null
+++ b/src/tensor/cpu-impl-linear.h
@@ -0,0 +1,66 @@
+// tensor/cpu-impl-linear.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_CPU_IMPL_H_
+#define KALDI_TENSOR_CPU_IMPL_H_ 1
+
+#include "tensor/tensor.h"
+
+
+namespace kaldi {
+namespace tensor {
+
+// Scalar case of AddProduct: *c := alpha * (*a * *b) + beta * *c.
+template <typename Real>
+inline static void AddProductScalar3CPU(
+    float alpha, float beta,
+    const TensorImpl &a, const TensorImpl &b, const TensorImpl *c) {
+  Real *a_data = static_cast<Real*>(a.data),
+      *b_data = static_cast<Real*>(b.data),
+      *c_data = static_cast<Real*>(c->data);
+  if (beta != 0.0) {
+    *c_data = (beta * *c_data) + alpha * (*a_data * *b_data);
+  } else {  // don't propagate NaN
+    *c_data = alpha * (*a_data * *b_data);
+  }
+}
+
+
+// Scalar case of AddProduct on CPU; dispatches on the dtype of 'c' to the
+// template above.  It is 'inline' because it is defined in a header.
+inline void AddProductScalar3CPU(
+    float alpha, float beta,
+    const TensorImpl &a, const TensorImpl &b, const TensorImpl *c) {
+  if (c->dtype == kFloatDtype) {
+    AddProductScalar3CPU<float>(alpha, beta, a, b, c);
+  } else {
+    KALDI_ASSERT(c->dtype == kDoubleDtype &&
+                 "Data type not supported for this operation");
+    AddProductScalar3CPU<double>(alpha, beta, a, b, c);
+  }
+}
+
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_CPU_IMPL_H_
diff --git a/src/tensor/tensor-common.h b/src/tensor/tensor-common.h
index 56c02cec9a3..7a585d45fde 100644
--- a/src/tensor/tensor-common.h
+++ b/src/tensor/tensor-common.h
@@ -51,6 +51,8 @@ struct Device {
   Device(): device_type(kCpuDevice) { }
   Device(DeviceType t): device_type(t) { }
 
+  std::string ToString() const;
+
   // TODO: operator ==
   // maybe in future we'll make a way to set the default device.
 };
diff --git a/src/tensor/tensor-functions.cc b/src/tensor/tensor-functions.cc
new file mode 100644
index 00000000000..dcc89f022f4
--- /dev/null
+++ b/src/tensor/tensor-functions.cc
@@ -0,0 +1,66 @@
+// tensor/tensor-functions.cc
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tensor/tensor-pattern.h"
+
+
+namespace kaldi {
+namespace tensor {
+
+inline static void SclalarMultiply(
+    float alpha, float beta,
+    const Tensor &a, const Tensor &b, Tensor *c) {
+  // TODO: empty placeholder (name presumably meant 'ScalarMultiply').
+}
+
+
+// Work in progress: only the dispatch skeleton exists so far.
+void AddProductReducing(float alpha, float beta,
+                        const Tensor &a, const Tensor &b, Tensor *c){
+  CheckDeviceAndDtype(a, b, *c);
+
+  int32 a_pcode = a.PatternCode(), b_pcode = b.PatternCode(),
+      c_pcode = c->PatternCode();
+  // Parenthesized because '+' binds more tightly than '<<'.
+  int64 combined_pcode = (int64(a_pcode) << 24) + (int64(b_pcode) << 12) +
+      c_pcode;
+  // Each group of 3 hex numbers describes one of the argument Tensors,
+  // so it's 0xAAABBBCCC.
+  //
+  switch (combined_pcode) {
+
+    case 0x000000000:
+      // scalar multiplication
+      // TODO: not implemented; control continues with the generic code below.
+      break;
+
+  }
+
+
+  SubTensor a_temp(a), b_temp(b), c_temp(*c);
+  // NOTE(review): probably meant to pad the temporaries above -- confirm.
+  PadAxes(&(a.pattern), &(b.pattern), &(c->pattern));
+
+  CompressPatterns({&a_temp, &b_temp, &c_temp});
+}
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/tensor-functions.h b/src/tensor/tensor-functions.h
index 77af4e8a602..9f0c0c8a5e6 100644
--- a/src/tensor/tensor-functions.h
+++ b/src/tensor/tensor-functions.h
@@ -1,5 +1,26 @@
-#include "tensor/tensor.h"
+// tensor/tensor-functions.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_FUNCTIONS_H_
+#define KALDI_TENSOR_FUNCTIONS_H_ 1
 
+#include "tensor/tensor.h"
 
 namespace kaldi {
 namespace tensor {
@@ -191,9 +212,13 @@ void AddProduct(float alpha, float beta,
                       beta == 0).   We require BroadcastCompatible(a, b, c).
  */
 void AddProductReducing(float alpha, float beta,
-                        const Tensor &a, const Tensor &b, Tensor *c);
+                        const SubTensor &a, const SubTensor &b,
+                        SubTensor *c);
+
+
 
+}  // namespace tensor
+}  // namespace kaldi
 
 
-}
-}
+#endif  // KALDI_TENSOR_FUNCTIONS_H_
diff --git a/src/tensor/tensor-impl-linear.cc b/src/tensor/tensor-impl-linear.cc
new file mode 100644
index 00000000000..9f122270756
--- /dev/null
+++ b/src/tensor/tensor-impl-linear.cc
@@ -0,0 +1,121 @@
+// tensor/tensor-impl-linear.cc
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tensor/tensor-impl-linear.h"
+#include "tensor/tensor-impl-wrappers.h"
+
+namespace kaldi {
+namespace tensor {
+
+// Scalar case of AddProduct; dispatches on the device type of 'a'.
+inline static void AddProductScalar3(
+    float alpha, float beta,
+    const TensorImpl &a, const TensorImpl &b, const TensorImpl *c) {
+  switch (a.device.device_type) {
+    case kCpuDevice:
+      AddProductScalar3Cpu(alpha, beta, a, b, c);
+      return;
+#ifdef HAVE_CUDA
+    case kGpuDevice:
+      AddProductScalar3Gpu(alpha, beta, a, b, c);
+      return;
+#endif
+    default:
+      KALDI_ERR << "Unsupported device type " << a.device.ToString();
+  }
+}
+
+// Does c := alpha (a * b) + beta c; documented in tensor-impl-linear.h.
+void AddProduct(float alpha, float beta,
+                const TensorImpl &a, const TensorImpl &b, const TensorImpl *c){
+
+  if (a.pattern.code < b.pattern.code) {
+    // Ensure, via a recursion that a.pattern.code >= b.pattern.code.
+    // This avoids us having to test for the swapped versions of the patterns.
+    AddProduct(alpha, beta, b, a, c);
+    return;
+  }
+
+  CheckDeviceAndDtype(a, b, *c);
+
+
+  int64 combined_code = CombineCodes(a.pattern.code, b.pattern.code,
+                                     c->pattern.code);
+
+  /*
+    The case-statement values in the switch statement below may be
+    interpreted in groups of 3 hex characters, are 0xAAABBBCCC,
+    pertaining to Tensors a, b and c respectively.  See
+    GetPatternCode() in tensor-pattern-utils.h for documentation on
+    the meanings of the values and our notation with X,x,1.
+   */
+  switch(combined_code) {
+    case 0x000000000:
+      // () * () -> ()
+      // scalar * scalar -> scalar
+      AddProductScalar3(alpha, beta, a, b, c);
+      return;
+    case 0x101000101:
+      //  (X) * ()-> (X)
+      // vector * scalar -> vector
+      AddProductVecScalarVec(a, b, c);
+      return;
+    case 0x101101101:
+      // (X) * (X) -> (X)
+      // vector .* vector -> vector
+      AddProductVec3(a, b, c);
+      return;
+    case 0x103101202:
+      // (x,X) * (X)  -> (X,1)
+      // vector * matrix -> vector.unsqueeze(-1)
+      AddProductMatVecVec(a, b, c);
+      return;
+    case 0x203101202:
+      // (X,x) * (X) -> (X,1)
+      // transposed-matrix * vector -> vector.unsqueeze(-1)
+      AddProductTmatVecVec(a, b, c);
+      return;
+    case 0x202101103:
+      // (X,1) * (X) -> (x,X)
+      // vector * vector -> matrix (outer product)
+      AddProductVec2Mat(a, b, c);
+      return;
+
+
+    default:
+      break;
+
+  }
+
+  // If we reached this point, it means we could
+  // not handle this request with any of the basic operations above.
+  // TODO: the generic fallback below is unfinished.
+
+
+  SubTensor a_temp(a), b_temp(b), c_temp(*c);
+
+  PadAxes(&(a.pattern), &(b.pattern), &(c->pattern));
+
+  CompressPatterns({&a_temp, &b_temp, &c_temp});
+}
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/tensor-impl-linear.h b/src/tensor/tensor-impl-linear.h
new file mode 100644
index 00000000000..e1d7ec12145
--- /dev/null
+++ b/src/tensor/tensor-impl-linear.h
@@ -0,0 +1,153 @@
+// tensor/tensor-impl-linear.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_IMPL_LINEAR_H_
+#define KALDI_TENSOR_IMPL_LINEAR_H_ 1
+
+#include "tensor/tensor.h"
+
+
+/**
+   This header contains basic linear-algebra and copying types of operations
+   on TensorImpl objects.  See also tensor-impl-nonlinearly
+ */
+
+namespace kaldi {
+namespace tensor {
+
+/**
+   Modifies 't' in-place by inserting an axis with (dim=1,stride=0) at the
+   specified position.
+
+   A negative axis-index i is interpreted (like PyTorch) as (num_axes + 1 + i).
+
+   Showing just the dims in the tensor for some examples:
+
+\verbatim
+    Unsqueeze({3,4}, 0)  -> {1,3,4}
+    Unsqueeze({3,4}, 1)  -> {3,1,4}
+    Unsqueeze({3,4}, 2)  -> {3,4,1}
+    Unsqueeze({3,4}, -1)  -> {3,4,1}
+    Unsqueeze({3,4}, -2)  -> {3,1,4}
+\endverbatim
+ */
+void Unsqueeze(TensorImpl *t, int32 axis);
+
+
+/**
+   Modifies 't' in-place by removing an axis with (dim=1,stride=0) from the
+   specified position.  It is an error if 't' did not initially contain
+   such an axis.
+
+   Showing just the dims in the tensor for an example:
+
+\verbatim
+    Squeeze({1,3,4}, 0)  -> {3,4}
+    Squeeze({3,1,4}, 1)  -> {3,4}
+    Squeeze({3,1,4}, 2)  -> [error]
+\endverbatim
+ */
+void Squeeze(TensorImpl *t, int32 axis);
+
+
+
+/**
+   Does:
+
+    `c := alpha (a * b)  +  beta c`
+
+   where '*' is elementwise multiplication subject to broadcasting rules.  This
+   version supports reducing and broadcasting operations, and is where
+   matrix multiplication actually gets implemented; see Matmul().
+
+   The Tensors do not all have to have the same NumAxes(); they will
+   (conceptually) be made the same size by padding on the left with trivial axes
+   (dim=1;stride=0) to make them the same size.
+
+   The Tensors need to have the same Dtype() and Device().
+
+   @param [in] alpha  Value that scales a * b
+   @param [in] beta   Value that scales the initial value of c
+   @param [in] a      First input tensor
+   @param [in] b      Second input tensor.
+   @param [out] c     Tensor to be added to; we require Broadcastable(a, b, c),
+                      and either c's data must be initialized to a known
+                      value (if beta != 0) or known to not contain NaN (if
+                      beta == 0); but we have to figure out whether we can drop
+                      the NaN requirements as some BLAS's may treat beta=0
+                      specially.
+ */
+void AddProduct(float alpha, float beta,
+                const TensorImpl &a, const TensorImpl &b,
+                const TensorImpl *c);
+
+
+/**
+   Copy elements from Tensor a to Tensor b, possibly broadcasting
+      @param [in]  a    The source Tensor.
+      @param [out] b   The destination Tensor.  We require
+                       Broadcastable(a, b, true).
+
+   See also Add(), which is more general than Copy.
+ */
+void Copy(const TensorImpl &a, const TensorImpl *b);
+
+
+/**
+   Add elements from Tensor a to Tensor b, broadcasting or summing
+   as dictated by the dimensions involved; does
+      \f$  b := \alpha a + \beta b.  \f$
+
+      @param [in]  a    The source Tensor.
+      @param [out] b   The destination Tensor.  We require
+                       Broadcastable(a, b).
+ */
+void Add(float alpha, float beta,
+         const TensorImpl &a, const TensorImpl *b);
+
+
+/**
+   Matrix multiplication; does
+
+     \f$  c := \alpha a b   +  \beta c  \f$
+
+   where `a b` is interpreted as matrix multiplication.  This generalizes in the
+   same way as PyTorch's matmul does if there are extra dimensions in the args.
+   In fact it generalizes more than that, encompassing cases where the matrix
+   product may be summed over certain dimensions.
+
+   The implementation is just:
+
+     Tensor a_tmp(a), c_tmp(c);
+     a_tmp.Unsqueeze(-1);
+     c_tmp.Unsqueeze(-2);
+     AddProduct(alpha, beta, a_tmp, b, c_tmp);
+
+ */
+void Matmul(float alpha, float beta,
+            const TensorImpl &a, const TensorImpl &b,
+            const TensorImpl *c);
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_IMPL_LINEAR_H_
diff --git a/src/tensor/tensor-pattern-utils.cc b/src/tensor/tensor-pattern-utils.cc
new file mode 100644
index 00000000000..0440d40bbba
--- /dev/null
+++ b/src/tensor/tensor-pattern-utils.cc
@@ -0,0 +1,251 @@
+#include "tensor/tensor-pattern-utils.h"
+
+/**
+   This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
+*/
+
+namespace kaldi {
+namespace tensor {
+
+/**
+   This function returns true if any of the tensor patterns in 'patterns'
+   contains a negative stride.  All patterns are assumed to have the same
+   num-axes, and 'patterns' must be nonempty since patterns[0] is read.
+ */
+static inline bool NegativeStrideExists(ArrayRef<TensorPattern> patterns) {
+  int32 num_axes = patterns[0].num_axes;  // required
+  for (size_t p = 0; p < patterns.size; p++) {
+    const TensorPattern &pattern = patterns[p];
+    for (int32 i = 0; i < num_axes; i++) {
+      // A single negative stride anywhere is enough.
+      if (pattern.strides[i] < 0)
+        return true;
+    }
+  }
+  return false;
+}
+
+
+/**
+   This utility function used in CompressPatterns() normalizes the signs of the
+   strides in all the dimensions, prior to any merging of axes, and sets the
+   'data_offsets' variables.
+
+   Consider an axis-index 0 <= i < num_axes.  We say that the strides for axis i
+   are normalized if the lowest-numbered pattern which has nonzero stride on
+   axis i (if such a pattern exists) is positive.  If, on the other hand, all
+   the strides are zero, we also say that it is normalized (since flipping the
+   sign would make no difference).
+
+   This type of normalization is done to increase the chance that we can combine
+   axes, because the rule we use for combining axes only applies if any nonzero
+   strides present have the same sign between the two axes.  In terms of being
+   able to combine axes this rule is optimal, because any two axes where the
+   pattern-index of the first pattern with a nonzero stride for those axes is
+   different, would *not* be combinable.  So for any pair of axes that are
+   potentially combinable according to that criterion and which have any nonzero
+   strides, our normalization rule ensures that at least one pair of nonzero
+   strides has the same sign.  If there were another pattern for which the sign
+   was opposite after applying our rule, those two axes would not be combinable
+   whatever the sign normalization.
+
+     @param [in,out] patterns  The patterns to have their strides normalized
+     @param [in,out] data_offsets  Data offsets, an array of dimension
+                          patterns.size, which will be *added to* by this
+                          function, by the amount required to ensure that
+                          the memory locations visited by the set of possible
+                          indexes into these patterns is the same before
+                          and after any change of sign.
+ */
+static inline void NormalizeSigns(ArrayRef<TensorPattern> patterns,
+                                  int64 *data_offsets) {
+  size_t num_patterns = patterns.size;
+  int32 num_axes = patterns[0].num_axes;
+  for (int32 a = 0; a < num_axes; a++) {
+    for (size_t p = 0; p < num_patterns; p++) {
+      if (patterns[p].strides[a] != 0) {
+        // We have identified the first pattern-index with nonzero
+        // stride for this axis
+        if (patterns[p].strides[a] < 0) {
+          // The stride is negative, so we have to flip it
+          // for this axis.  (Note: we flip it for all patterns,
+          // for this dim, but we can ignore q < p because
+          // we know all those strides are zero.)
+          for (size_t q = p; q < num_patterns; q++) {
+            // cast to int64 before multiplication to avoid potential
+            // overflow
+            int64 this_offset =
+                static_cast<int64>(patterns[q].dims[a] - 1) *
+                static_cast<int64>(patterns[q].strides[a]);
+            data_offsets[q] += this_offset;
+            patterns[q].strides[a] *= -1;
+          }
+        }
+        // break from loop over patterns; we identified the first pattern-index
+        // with nonzero stride for this axis, which is the only thing that
+        // determines whether we change the sign of this axis.
+        break;
+      }
+    }
+  }
+}
+
+
+/**
+   Returns true if axis 'axis1' can be combined with axis 'axis2' in this one
+   pattern, i.e. the stride of axis1 equals the stride of axis2 times the dim
+   of axis2.  The relation is not symmetric: Combinable(pattern, i, j) might
+   not equal Combinable(pattern, j, i).
+
+   CompressPatterns() only ever combines a pair of axes that is combinable
+   for *all* of the patterns passed to it.
+   When axes i and j are combined we set dims[j] := dims[i] * dims[j] and
+   turn axis i into a no-op with dims[i] = 1, strides[i] = 0.
+ */
+static inline bool Combinable(const TensorPattern &pattern,
+                              int32 axis1, int32 axis2) {
+  return pattern.strides[axis2] * pattern.dims[axis2] == pattern.strides[axis1];
+}
+
+
+// Returns true iff every pattern in 'patterns' has stride zero on axis
+// 'axis' (which implies dim=1 there).  Such an axis can be dropped without
+// changing the result.
+static inline bool AxisIsTrivial(ArrayRef<TensorPattern> patterns,
+                                 int32 axis) {
+  size_t num_patterns = patterns.size, p = 0;
+  while (p < num_patterns && patterns[p].strides[axis] == 0)
+    p++;
+  return p == num_patterns;
+}
+
+// Combine the two axes axis1 and axis2 in all the patterns (which
+// the user asserts is possible); at exit, the lower numbered of the
+// two axes is guaranteed to have dim=1, stride=0 in all patterns
+// (we will later get rid of that trivial axis).
+// axis2 is the one with the smaller stride (for patterns where the
+// stride is nonzero), and is the one whose stride we keep in the
+// combined axis; that is the asymmetry.
+static inline void CombineAxes(ArrayRef<TensorPattern> patterns,
+                               int32 axis1, int32 axis2) {
+  size_t num_patterns = patterns.size;
+#ifdef KALDI_PARANOID
+  for (size_t p = 0; p < num_patterns; p++)
+    KALDI_PARANOID_ASSERT(Combinable(patterns[p], axis1, axis2));
+#endif
+  if (axis1 < axis2) {
+    // axis1 is the lower-numbered axis, so it is the one made trivial
+    // (dim=1, stride=0 in all patterns).  Keeping the nontrivial result in the
+    // higher-numbered axis suits the caller, which may go on to combine that
+    // axis again when its outer loop reaches it.
+    for (size_t p = 0; p < num_patterns; p++) {
+      TensorPattern &pattern = patterns[p];
+      pattern.dims[axis2] *= pattern.dims[axis1];
+      pattern.dims[axis1] = 1;
+      pattern.strides[axis1] = 0;
+    }
+  } else {
+    // axis2 is the lower-numbered axis: store the combined axis in axis1,
+    // with axis2's (smaller) stride, and make axis2 the trivial one.
+    for (size_t p = 0; p < num_patterns; p++) {
+      TensorPattern &pattern = patterns[p];
+      pattern.dims[axis1] *= pattern.dims[axis2];
+      pattern.strides[axis1] = pattern.strides[axis2];
+      pattern.dims[axis2] = 1;
+      pattern.strides[axis2] = 0;
+    }
+  }
+}
+
+/**
+   Removes trivial axes, defined as axes for which, for all patterns,
+   dim=1 and stride=0.  Assumes the caller has already worked out which axes
+   are trivial and passes that in as the array 'trivial_axis'.
+ */
+inline static void RemoveTrivialAxes(int32 num_axes,
+                                     bool trivial_axis[],
+                                     ArrayRef<TensorPattern> patterns) {
+  for (size_t p = 0; p < patterns.size; p++) {
+    TensorPattern &pattern = patterns[p];
+    // we do the loop over axes inside the loop over p for memory locality.
+    int32 axis_out = 0;
+    for (int32 axis_in = 0; axis_in < num_axes; axis_in++) {
+      if (axis_out != axis_in && !trivial_axis[axis_in]) {
+        pattern.dims[axis_out] = pattern.dims[axis_in];
+        pattern.strides[axis_out] = pattern.strides[axis_in];
+      }
+      if (!trivial_axis[axis_in])
+        axis_out++;
+    }
+    pattern.num_axes = axis_out;  // will be the same for all p.
+  }
+}
+// Zeroes data_offsets[p], normalizes stride signs, merges axes that are
+// combinable in every pattern and then removes the resulting trivial axes.
+void CompressPatterns(ArrayRef<TensorPattern> patterns,
+                      int64_t *data_offsets) {
+  size_t num_patterns = patterns.size;
+  for (size_t p = 0; p < num_patterns; p++)
+    data_offsets[p] = 0;
+#ifdef KALDI_PARANOID
+  // check the input
+  KALDI_ASSERT(num_patterns > 0 && num_patterns < 6);
+  for (size_t p = 0; p < num_patterns; p++) {
+    for (size_t q = p + 1; q < num_patterns; q++) {
+      KALDI_ASSERT(Broadcastable(patterns[p], patterns[q]));
+    }
+  }
+#endif
+  if (NegativeStrideExists(patterns))
+    NormalizeSigns(patterns, data_offsets);
+  bool is_trivial_axis[6] = { false, false, false, false, false, false };
+  bool exists_trivial_axis = false;
+  int32 num_axes = patterns[0].num_axes;
+  for (int32 i = 0; i < num_axes; i++) {
+    if (AxisIsTrivial(patterns, i)) {
+      is_trivial_axis[i] = true;
+      exists_trivial_axis = true;
+      continue;
+    }
+    // see if axis i can be combined (in either direction) with a later axis.
+    for (int32 j = i + 1; j < num_axes; j++) {
+      bool combinable_ij = true, combinable_ji = true;
+      for (size_t p = 0; p < num_patterns; p++) {
+        if (!Combinable(patterns[p], i, j))
+          combinable_ij = false;
+        if (!Combinable(patterns[p], j, i))
+          combinable_ji = false;
+      }
+      if (combinable_ij) {
+        CombineAxes(patterns, i, j);
+        is_trivial_axis[i] = true;
+        exists_trivial_axis = true;
+        // Break from the loop on j and continue with the loop on i: axis i is
+        // now trivial, so there is nothing further to combine with it.
+        break;
+      } else if (combinable_ji) {
+        CombineAxes(patterns, j, i);
+        is_trivial_axis[i] = true;   // not a typo.  Lower-numbered axis gets
+        // dim=1,stride=0.
+        exists_trivial_axis = true;
+        break;
+      }
+    }
+  }
+  if (exists_trivial_axis)
+    RemoveTrivialAxes(num_axes, is_trivial_axis, patterns);
+}
+
+
+void CompressOnePattern(TensorPattern *pattern,
+                        int64 *data_offset) {
+}  // TODO: empty placeholder; neither argument is touched yet.
+
+
+  int32 GetDimsCode(const TensorPattern &pattern) {
+  }  // TODO: unimplemented; no value is returned despite the int32 type.
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
diff --git a/src/tensor/tensor-pattern-utils.h b/src/tensor/tensor-pattern-utils.h
new file mode 100644
index 00000000000..7b502529793
--- /dev/null
+++ b/src/tensor/tensor-pattern-utils.h
@@ -0,0 +1,418 @@
+// tensor/tensor-pattern-utils.h
+
+//  Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "tensor/tensor-common.h"
+
+/**
+   This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
+*/
+
+namespace kaldi {
+namespace tensor {
+
+
+/**
+   This function returns a code that compactly says whether each axis
+   has dim = 1 or dim != 1.  For purposes of the code generated, the number
+   of axes does not matter; imagine we left-padded with enough `dim=1`
+   axes to give KALDI_TENSOR_MAX_DIM axes.
+
+   The rightmost (least significant) bit is the last axis (numbered
+   KALDI_TENSOR_MAX_DIM - 1 after padding).
+
+   Note that the `dims` vectors below are displayed after removing
+   any leading `dim=1` axes.
+
+   The examples below will use c++14 binary literals, although
+   the code doesn't use them.  In the notation below, in dims vectors,
+   x is a stand-in for 'any number greater than 1'.
+
+    0b00000000  0x00  dims=(), a scalar
+    0b00000001  0x01  dims=(x)
+    0b00000010  0x02  dims=(x,1)
+    0b00000011  0x03  dims=(x,x)
+
+    etc.
+
+  See also GetPatternCode(), which includes the same information but
+  also stride-related information.
+ */
+int32 GetDimsCode(const TensorPattern &pattern);
+
+
+/**
+   This function returns a code that compactly represents the same information
+   as GetDimsCode() [i.e. which axes, counting from the last axis,
+   had dim != 1], and also which axis, if any,
+   had stride=1.  (No two axes can have stride=1, due to the uniqueness
+   rule; search in tensor-pattern.h).
+
+   Let
+      n = 0 if no axis had stride=1, otherwise:
+      n = num_axes - (the axis that had stride=1).
+
+   For example if the strides were (10,3,1) we would have
+   n = 1; if the strides were (10,1,3) we would have n = 2.
+
+   The value 'n' occupies the bits starting from 8 in the returned code,
+   i.e. bits 8,9,10 (counting from the right, i.e. from the least to
+   most significant).
+
+   Bit 11 is 1 if any of the strides were negative, and zero otherwise.
+   None of the example bit-patterns below have this bit set.  The
+   underlying BLAS in most cases does not support negative strides so
+   we deal with it by copying the data to a temporary with positive
+   strides.
+
+   The low-order KALDI_TENSOR_MAX_DIM bits are as returned by GetDimsCode().
+
+   The explanation below will use c++14 binary literals (like 0b010101), although the code
+   doesn't use them as we compile as c++11; we show the corresponding hex codes which
+   are used in the code (and anyway easier to parse).
+
+   In the notation below, in dims vectors, x or X is a stand-in for 'any number
+   not equal to 1', and upper-case X indicates that the axis has stride=1.  In
+   the example `dims` vectors below, we don't put any leading `dim=1` axes,
+   because they would not affect the code generated.  The list of numbers
+   in parentheses below may be interpreted as the sequence of dims for the
+   Tensor.
+
+   The ' at the 8th bit is to make the bit-string easier to parse.
+
+
+    0b000'00000000  0x000  dims=(), a scalar
+    0b000'00000001  0x001  dims=(x), a vector with a stride
+    0b001'00000001  0x101  dims=(X), a vector
+    0b000'00000010  0x002  dims=(x,1),  a vector.unsqueeze(-1) with a stride
+    0b010'00000010  0x202  dims=(X,1),  a vector.unsqueeze(-1)
+    0b000'00000011  0x003  dims=(x,x), a matrix with a stride
+    0b001'00000011  0x103  dims=(x,X), a matrix
+    0b010'00000011  0x203  dims=(X,x), a transposed matrix
+    0b000'00000100  0x004  dims=(x,1,1)
+    0b011'00000100  0x304  dims=(X,1,1)
+    0b010'00000110  0x206  dims=(x,X,1), a matrix.unsqueeze(-1)
+    0b011'00000110  0x306  dims=(X,x,1), a transposed matrix.unsqueeze(-1)
+    0b000'00000110  0x006  dims=(x,x,1), a matrix.unsqueeze(-1) with column stride
+    0b001'00000101  0x105  dims=(x,1,X), a matrix.unsqueeze(-2)
+    0b011'00000101  0x305  dims=(X,1,x), a transposed matrix.unsqueeze(-2)
+    0b000'00000101  0x005  dims=(x,1,x), a matrix.unsqueeze(-2) with column stride
+
+
+    ...
+ */
+int32 GetPatternCode(const TensorPattern &pattern);
+
+
+
+
+inline int32 CombineCodes(int32 code1, int32 code2) {
+  return code2 | (code1 << 12);  // code1 sits above the 12 bits of code2.
+}
+
+inline int64 CombineCodes(int32 code1, int32 code2, int32 code3) {
+  return (static_cast<int64>(code1) << 24) |
+      (static_cast<int64>(code2) << 12) |  // widen before shifting, as for code1.
+      static_cast<int64>(code3);
+}
+
+
+/**  This function returns true if the dimensions of tensor patterns
+     a and b are broadcastable in the PyTorch sense.  What this means
+     for tensors with the same num-axes is that dims for axis i
+     must either be the same or one of them must be 1.  For tensors
+     with different num-axes we pad with leading (dim=1)'s; for
+     instance, dims (2,8,3) and (8,1) would be broadcastable because
+     the (8,1) becomes (1,8,1).
+
+     If 'b_non_reducing' is true, then we do not allow any dim of
+     b to be 1 where the corresponding dim of a was not 1.
+
+     This check is simple to implement due to the way we store
+     the dims 'right-justified' so that the last-numbered dim
+     is always at dims[KALDI_TENSOR_MAX_DIM - 1].
+ */
+bool Broadcastable(const TensorPattern &a, const TensorPattern &b,
+                   bool b_non_reducing = false);
+
+
+/**  This function returns true if the dimensions of tensor patterns
+     a, b and c are broadcastable in the PyTorch sense (meaning;
+     after padding their dims on the left with ones to make them
+     have the same num-axes, corresponding dimensions are either
+     identical or 1).  See the version of Broadcastable() above
+     for more information.
+
+       @param [in] a  The dimensions of the first Tensor
+       @param [in] b  The dimensions of the second Tensor
+       @param [in] c  The dimensions of the third Tensor
+       @param [in] c_non_reducing   If true, then we do not allow a dim of
+                      c to be 1 while corresponding dims of a or b
+                      are > 1.
+ */
+bool Broadcastable(const TensorPattern &a, const TensorPattern &b,
+                   const TensorPattern &c, bool c_non_reducing = false);
+
+
+
+/**
+   Compresses a TensorPattern by removing or combining as many axes as possible.
+   This version is suitable for operations that do not rely on any kind
+   of structure, such as zeroing or nonlinearities; the only equivalence
+   maintained is equivalence of the set of memory locations covered.
+   The order of the (dim,stride) pairs in the input does not affect the
+   output.  The output (dim,stride) pairs will be ordered from
+   greatest to least stride (note: all output strides will be positive).
+
+      @param [in,out] pattern  The pattern to be compressed.  On output it
+                          is replaced by a simplified-as-much-as-possible
+                          pattern that covers the same set of memory
+                          locations as the input pattern (when combined
+                          with the offset below).  The output pattern will
+                          contain only nonnegative strides.
+                          (Note: the declaration below takes no separate
+                          'properties' argument for the input pattern.)
+      @param [out] data_offset  A number that we would have to add to
+                          the data pointer of the source Tensor so
+                          that the output pattern would cover the same set
+                          of elements.  It will always be zero if the input
+                          pattern was free of negative strides.
+   Examples are below, where we write a TensorPattern as
+    `{{dim1,dim2,..}, {stride1,stride2,..}}`.
+
+\verbatim
+   Input pattern             Output pattern            Output offset
+     {{10},{1}}               {{10},{1}}                  0
+    {{3,4},{4,1}}             {{12},{1}}                  0
+    {{4,3},{1,4}}             {{12},{1}}                  0
+    {{9},{-1}}                {{9},{1}}                  -8
+  {{2,3,4},{100,4,1}}       {{2,12},{100,1}}              0
+\endverbatim
+ */
+void CompressOnePattern(TensorPattern *pattern,
+                        int64 *data_offset);
+
+/**
+  Compress two TensorPatterns by combining axes (and possibly
+  flipping the sign of their strides and changing the data offset)
+  The type of compression involved is the same as for CompressOnePattern
+  (meaning we are doing some kind of operation that doesn't care about
+  the structure, such as an element-by-element nonlinearity).
+
+  The difference from calling CompressOnePattern() twice is that this function
+  needs to preserve the relationship between the tensors whose pattern is src1
+  and src2.  Suppose that a tensor with pattern src3 was the result of this
+  elementwise operation satisfying Broadcastable(src1, src2, src3); there is
+  only one such pattern.  Let x be a tuple which would be a valid index for the
+  tensor with pattern src3.  Let us use an extended indexing convention
+  whereby if an axis of src1 or src2 has dimension 1, we allow that axis to be
+  indexed by any value, which would not affect the memory location because the
+  stride is zero.  Then each such tuple x leads to a different pair of memory
+  locations (p1, p2) in the tensors corresponding to patterns src1, src2.  The
+  invariance that this function must preserve is that the set of memory-location
+  pairs (p1, p2) must be the same in the output tensors (with their
+  appropriately moved data pointers), as in the input tensors.
+
+  What this means in practice is that we need to do the same operations on src1
+  and src2.  For example, if flipping the sign of an axis of src1 we would have
+  to flip that of src2, and if merging two axes of src1 we would have to merge
+  the same two axes of src2.
+
+    @param [in] src1  The first source pattern.
+    @param [in] src2  The second source pattern.
+                      We require Broadcastable(src1,src2) == true.
+    @param [out] dest1  Compressed pattern out corresponding to src1.  Will
+                     be free of negative strides (but dest2 might not be).
+    @param [out] data_offset1  Data offset that we'd need to add to src1's
+                     data pointer before using the pattern 'dest1'
+    @param [out] dest2  Compressed pattern out corresponding to src2.
+                     Might not be free of negative strides if some dimensions
+                     of src1/src2 had strides of opposite sign.
+    @param [out] data_offset2  Data offset that we'd need to add to src2's
+                     data pointer before using the pattern 'dest2'
+
+
+ */
+void CompressTwoPatterns(const TensorPattern &src1,
+                         const TensorPattern &src2,
+                         TensorPattern *dest1,
+                         int64 *data_offset1,
+                         TensorPattern *dest2,
+                         int64 *data_offset2);
+
+
+/**
+   Compresses one or more TensorPattern by removing or combining as many axes as
+   possible.  See the documentation for CompressOnePattern() to understand the
+   basic concept of compressing a single TensorPattern to a pattern with possibly
+   fewer axes (and maybe with negative strides converted to positive),
+   which covers the same set of memory locations as the original Tensor.
+
+   The difference with just calling CompressOnePattern() several times is
+   that this preserves the relationships between the tensors.
+
+   Firstly, we require that all pairs of TensorPattern in 'patterns' be
+   broadcastable: that is, Broadcastable(p1, p2) would hold for any
+   p1, p2 in 'patterns'.  In the explanation below we will use a
+   'permissive indexing' convention whereby if a Tensor has an axis
+   with (dim,stride) = (1, 0), we allow it to be indexed by any value
+   (not just zero), so that all the tensors represented can accept the
+   same set of index tuples.  Suppose for example that there are three
+   patterns, p1, p2, p3, in 'patterns', with 4 axes.  Let max_dim
+   be the 'combined' dimension, which contains the max of the dims
+   of the corresponding axes of p1,p2,p3, and let
+   x = (i, j, k, l) be an index tuple that would be valid for a tensor
+   of dim max_dim.  Each such x, when used as an index into p1, p2
+   and p3 with 'permissive indexing' as mentioned above, will
+   give us a tuple of memory-offsets (o1, o2, o3) (indexes
+   into the respective data pointers).  Ranging over the set of such
+   x, we get a set of memory-offset tuples; call this set S_in,
+   and call the set that we would get if doing the same procedure
+   on the output tensors (with their possibly changed num-axes), be
+   S_out.  Let us represent the 'data_offset' output of this function
+   as (in this case) a 3-tuple o.  Then the invariant that this
+   function needs to satisfy is that:
+        `S_in = S_out + o`
+   where we interpret the '+ o' as adding to each element of the set.
+   Interpret the above as: one set of 3-tuples == another set of 3-tuples.
+
+   Of course, the 3 tensors and 4 axes mentioned here is just an example.
+
+      @param [in,out] patterns   An array of 1 <= size <= 4 of the patterns
+                         to be jointly compressed.
+      @param [out]  data_offsets  Pointer to an array of the same size
+                        as patterns, which on output will contain
+                        offsets to be added to the data pointers.
+
+
+      @return  Returns true if it made any change to the patterns,
+               false if they were unchanged.
+
+   Examples are below, where we write a TensorPattern as
+    `{{dim1,dim2,..}, {stride1,stride2,..}}`.  The examples show the case
+   of two patterns: src1 and src2 are the input patterns, and
+   (dest1,offset1), (dest2,offset2) are the corresponding output patterns
+   and data offsets.
+
+\verbatim
+    src1                src2              dest1,offset1       dest2,offset2
+  {{10},{1}}           {{10},{1}}        {{10},{1}},0        {{10},{1}},0  # no-op
+  {{8},{1}}            {{1},{0}}         {{8},{1}},0         {{1},{0}},0   # no-op
+  {{7},{-1}}           {{7},{1}}         {{7},{1}},-6         {{7},{-1}},6 # flip sign
+ {{3,4},{4,1}}        {{3,4},{4,1}}      {{12},{1}},0         {{12},{1}},0 # combine dims
+ {{3,4},{4,1}}        {{3,1},{4,0}}      {{3,4},{4,1}}        {{3,1},{4,0}} # can't combine, would be incompatible
+ {{3,4},{4,1}}        {{1,1},{0,0}}      {{12},{1}}           {{1},{0}}    # combine
+\endverbatim
+ */
+bool CompressPatterns(ArrayRef<TensorPattern> patterns,
+                      int64_t *data_offsets);
+
+/**
+   Compresses a TensorPattern by removing or combining as many axes as possible,
+   while respecting certain invariances that are relevant when constructing
+   'views' ('view' is PyTorch terminology; the NumPy equivalent is 'reshape').
+   The "C" in the function name refers to C-style arrays.
+
+    This function removes axes with dim=1.
+
+    This function combines successive axes if the relationship of their
+    dims and strides is what you would expect in a "C"-style array.
+    Suppose that in 'src' we had two successive axes with dims and
+    strides (dim_a, dim_b) and (stride_a, stride_b), with dim_a > 1 and
+    dim_b > 1.  If stride_a == stride_b * dim_b, then this function
+    will merge them into a single axis with dimension (dim_a * dim_b)
+    and stride stride_b.   (However, they won't be merged if it would
+    result in a dimension exceeding the range of int32).
+
+   The output pattern 'dest' is what you get if you keep applying the
+   rules above until no further change is made.
+
+   Examples are below, where we write a TensorPattern as
+    `{{dim1,dim2,..}, {stride1,stride2,..}}`.
+\verbatim
+   Input pattern             Output pattern
+     {{10},{1}}               {{10},{1}}
+    {{5,1},{1,1}}             {{5},{1}}
+    {{9},{-1}}                {{9},{-1}}
+  {{2,3,4},{100,4,1}}       {{2,12},{100,1}}
+  {{2,3,4},{100,-4,-1}}     {{2,12},{100,-1}}
+\endverbatim
+ */
+void CompressPatternC(const TensorPattern &src,
+                      const TensorPatternProperties &src_properties,
+                      TensorPattern *dest);
+
+
+/**
+   Creates a TensorPattern corresponding to a requested 'view' of the matrix.
+   ('view' is PyTorch terminology; the NumPy equivalent is 'reshape').
+
+   The PyTorch/NumPy semantics are (I believe) as follows: Firstly, a view
+   can/should only be created for a tensor whose layout in memory is as for a
+   "C" array; suppose that the shape of array a is (9, 8), a "C" layout would
+   imply strides of (8, 1).  A 'view' of this array simply implies interpreting
+   the same block of memory as a "C" array with some other sequence of
+   dimensions, say (3, 3, 8) or (8, 9) or (1, 72); any sequence whose product
+   matches the number of elements in "a".
+
+   Our semantics of "view" is the same as that of PyTorch/NumPy except that we
+   impose fewer constraints on what strides the input Tensor may have.  Let the
+   'view' of the array 'a' be 'b'.  As long as it is possible to find a tensor
+   pattern for 'b' that would lead to the same relationship between the elements
+   of 'a' and 'b' as what you would get by asking for the same "view" in
+   PyTorch/NumPy assuming 'a' had had "C"-style strides (viewed in terms of
+   indexed elements of a and b, without regard to the physical memory layout), we
+   allow it.
+
+
+   Notes on implementation (glossing over ones in 'dims' which are easy to
+   handle as a special case): we would first call CompressPatternC() on
+   'pattern_in'.  Then we would attempt to find a correspondence with
+   the dimensions of this compressed pattern and a partition of the
+   sequence 'dims'.  For example, suppose the compressed pattern
+   is (100, 9) and dims is (50, 2, 3, 3), then the partition would
+   be (50, 2), (3, 3).  If this is not possible (e.g. if dims
+   had been (30,10,3) instead), we return false.
+
+   @param [in]  pattern_in   The input pattern for which we are trying to
+                          find an alternative view
+   @param [in]  dims  The sequence of dimensions corresponding to the
+                      desired view.  Its product must be the same as the
+                      product of pattern_in.dims.
+   @param [out] pattern_out  The output pattern, if we were
+                      successful (otherwise undefined).  Its 'dims'
+                      will be the same as 'dims'.
+   @return           Returns true on success (i.e. such a view existed),
+                     and false otherwise.  This function will never return
+                     false if 'pattern_in' had strides as for a "C" array
+                     (i.e., if its properties' has_c_strides was true).
+
+ */
+bool CreateViewPattern(const TensorPattern &pattern_in,
+                       ArrayRef<int32> dims,
+                       TensorPattern *pattern_out);
+
+
+
+
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/tensor-pattern.h b/src/tensor/tensor-pattern.h
index bbbc06d447f..90c9f686c51 100644
--- a/src/tensor/tensor-pattern.h
+++ b/src/tensor/tensor-pattern.h
@@ -131,39 +131,58 @@ void MakeRangeExplicit(int32 dim, Range *range);
 
 
 /*
-  This struct stores the dimension and strides of a Tensor.  The following
-  describes the properties that a TensorPattern will always have.
+  This struct stores the dimension and strides of a Tensor.
+
+  The main thing to watch out for is that the elements of 'dims' and 'strides'
+  to look at are not 0 ... num_axes - 1, but KALDI_TENSOR_MAX_DIM - num_axes
+  ... KALDI_TENSOR_MAX_DIM - 1.  The last dimension is always located at
+  KALDI_TENSOR_MAX_DIM - 1, i.e. the dims and strides are always
+  right-justified.  In addition, for unused axes, we always maintain dim=1 and
+  stride=0. This happens to be quite convenient due to the standard broadcasting
+  rules in things like PyTorch.
+
+  Below we describe the properties that a TensorPattern is required to have.
 
   These properties are stricter than some other frameworks, such as PyTorch,
   which allow the users to manually add dimensions with stride 0 and dim > 1 so
   that a lower-dimensional quantity can masquerade as one with a higher
   dimension.  We require that it never be possible to access the same memory
   location using two different tuples of indexes.  We also don't allow zero dims
-  (i.e. a tensor must not be empty); if you want an empty Tensor, just use a
-  null pointer.  In addition, require that the stride equal zero for any
-  axis that has dim = 1.
+  (i.e. a Tensor that is initialized must not have num_elements==0).  If you want
+  an empty Tensor, just use a null pointer.  In addition, we require that the
+  stride equal zero for any axis that has dim = 1.
 
   Our requirements on a TensorPattern are:
 
-    0 <= num_axes <= KALDI_TENSOR_MAX_DIM
-    for 0 <= axis < num_axes:
+    0 <= num_axes <= KALDI_TENSOR_MAX_DIM.
+
+    for 0 <= i < KALDI_TENSOR_MAX_DIM
        dims[i] > 0
+       if i < KALDI_TENSOR_MAX_DIM - num_axes, then dims[i] = 1.
        if dims[i] = 1, then strides[i] = 0.
        if dims[i] != 1, then strides[i] != 0
+
     ... plus the uniqueness property.
 
-  The uniqueness property means that we must not be able to access
-  the same memory location via two different tuples of indexes).
-  Recause testing this property exactly would be difficult in general
-  without bringing in number theory, we test a slightly stronger version
-  of it that covers all cases we are likely to encounter.  This is
-  that, if we take all the axes with dim != 1 and sort them from greatest
-  to least stride, for each i, abs(strides[i]) >= dims[i+1] * abs(strides[i+1]).
+  Note: in the public interface of class Tensor, if you ask for
+  dim(i) it will return pattern.dims[KALDI_TENSOR_MAX_DIM - num_axes + i].
+
+
+  The uniqueness property requires that we must not be able to access the same
+  memory location via two different tuples of indexes.  Because testing this
+  property exactly would be difficult in general without bringing in concepts
+  from number theory, we test a slightly stronger version of it that covers all
+  cases we are likely to encounter.  This is that, if we take all the axes with
+  dim != 1 and sort them from greatest to least stride, then for each i,
+  abs(strides[i]) >= dims[i+1] * abs(strides[i+1]).
 */
 struct TensorPattern {
   int32 num_axes;
   int32 dims[KALDI_TENSOR_MAX_DIM];
   int32 strides[KALDI_TENSOR_MAX_DIM];
+  int32 code;  // pattern code; see GetPatternCode() in tensor-pattern-utils.h
+               // for details.
+
   // We may later add methods to this.
 
   // Checks that the TensorPattern is valid, assuming it is part of a Tensor.
@@ -180,6 +199,11 @@ struct TensorPatternProperties {
   // of dims[0] .. dims[num_axes - 1].  Will always be >0.
   int64 num_elements;
 
+
+  // Binary code describing the pattern, see GetPatternCode() in
+  // tensor-pattern-utils.h.
+  int32 code;
+
   // is_contiguous means that the data form a contiguous block in memory; it is
   // not the same as PyTorch's is_contiguous which is a stronger condition,
   // implying also that the strides are as for a `C-style` array.
diff --git a/src/tensor/tensor-utils.h b/src/tensor/tensor-utils.h
new file mode 100644
index 00000000000..20abfabedb4
--- /dev/null
+++ b/src/tensor/tensor-utils.h
@@ -0,0 +1,84 @@
+// tensor/tensor-utils.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_TENSOR_UTILS_H_
+#define KALDI_TENSOR_TENSOR_UTILS_H_ 1
+
+#include "tensor/tensor.h"
+
+namespace kaldi {
+namespace tensor {
+
+
+
+
+// Used in checking function arguments, this function will
+// crash and print a stack trace if Tensor a and b have different
+// Dtype() or different Device().
+void CheckDeviceAndDtype(const TensorImpl &a, const TensorImpl &b);
+
+// Used in checking function arguments, this function will
+// crash and print a stack trace if Tensor a, b and c have different
+// Dtype() or different Device().
+void CheckDeviceAndDtype(const TensorImpl &a, const TensorImpl &b, const TensorImpl &c);
+
+
+/**
+   This function allocates the appropriate storage for the Tensor described
+   in 'impl', and sets its 'data' pointer to the allocated memory address.
+   It returns the address of a newly allocated Storage object which manages
+   the memory location; you will probably want to construct a
+   std::unique_ptr<Storage> from this so that when it goes out of scope,
+   the memory will be freed.
+
+      @param [in,out] impl   The TensorImpl object we are allocating for.
+                      Any previous value of impl->data is overwritten.
+                      It is required that the product of dims in
+                      impl->pattern be nonzero (i.e. that the pattern
+                      is initialized to a valid value), and that its
+                      dtype and device values be set.
+      @return         Returns a newly allocated Storage object that
+                      manages this memory block.  When it is freed,
+                      the memory block will be deallocated using a
+                      method appropriate for the device.
+
+   This function throws on error.  See also AllocateTensorShared().  This
+   function is used by class Tensor, but also by various implementation
+   functions (called with TensorImpl) where we need to allocate temporaries.
+   We don't construct a full-fledged Tensor because we don't want the
+   overhead of managing any shared_ptr's.
+ */
+Storage *AllocateTensor(TensorImpl *impl);
+
+
+/**
+   This function is as AllocateTensor(), except that the Storage
+   object returned is allocated via std::make_shared (which involves
+   just one heap allocation, as opposed to two if you constructed
+   the shared_ptr from the Storage* pointer).  See the documentation
+   for AllocateTensor() for more details.
+ */
+std::shared_ptr<Storage> AllocateTensorShared(TensorImpl *impl);
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_TENSOR_UTILS_H_
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index b09649282fb..bfafe022b95 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -27,6 +27,34 @@
 namespace kaldi {
 namespace tensor {
 
+
+/**
+   TensorImpl is basically a Tensor without the shared_ptr to Storage
+   (which is expensive to pass around, because of the cost of atomics).
+   The Tensor contains it as a member, rather than as a pointer.
+
+   Most of our internal functions use TensorImpl rather than Tensor because
+   it is easier to manipulate, but you need to know what you are doing.
+*/
+struct TensorImpl {
+  TensorPattern pattern;  // dims and strides of the tensor.
+  DataType dtype;         // the data-type of the tensor's elements.
+  Device device;          // the device the data lives on.
+  void *data{nullptr};    // raw data pointer; the memory is managed by a Storage object, not by this struct.
+};
+
+// Metadata for a Tensor.  It's occasionally convenient to have this
+// in a struct.
+struct TensorMeta {
+  TensorPattern pattern;
+  DataType dtype;
+  Device device;
+  // Note: the offset is only used in some situations
+  // (TODO: document which ones, and the unit it is currently measured in).
+  // We may turn this into an offset measured in elements.
+  int32 offset;
+};
+
 /**
    A Tensor is a multi-dimensional array (up to 5 dimensions) of types such as
    float or double (and eventually ints).  Multiple Tensors may point to data
@@ -48,30 +76,33 @@ class Tensor {
   // contexts, this is sometimes known as the rank of the tensor, or sometimes
   // even its dimension, but these terms are ambiguous so we avoid them, and use
   // the terms 'number of axes' or 'axis' throughout.
-  inline int32 NumAxes() const { return pattern_.num_axes; }
+  inline int32 NumAxes() const { return impl_.pattern.num_axes; }
+
+
+  const TensorImpl &Impl() const { return impl_; }  // read-only access to impl_.
 
   // Return reference to the struct containing the dimension and
   // stride info.
-  const TensorPattern &Pattern() const { return pattern_; }
+  const TensorPattern &Pattern() const { return impl_.pattern; }
 
   // Return an array containing dimensions of the tensor; equivalent to
   // .shape in PyTorch.  Dims().size() will equal NumAxes().
   // We limit each dimension to int32, because BLAS's interface uses int,
   // which on many common 64-bit platforms is configured with 32 bits.
   // However the product of dimensions may still be 64 bits.
-  inline ArrayRef<int32> Dims() const { return ArrayRef{pattern_.num_axes, pattern_.dims}; }
+  inline ArrayRef<int32> Dims() const { return ArrayRef{impl_.pattern.num_axes, impl_.pattern.dims}; }
 
   // Returns the dimension on this axis, a number >= 1.  Result is
   // undefined if axis < 0 or axis >= NumAxes().
-  inline int32 Dim(int32 axis) const { return pattern_.dims[axis]; }
+  inline int32 Dim(int32 axis) const { return impl_.pattern.dims[axis]; }
 
   // Returns an array containing the strides of the tensor.
   // Strides().size() will equal NumAxes().
-  inline ArrayRef<int32> Strides() const { return ArrayRef{pattern_.num_axes, pattern_.strides}; }
+  inline ArrayRef<int32> Strides() const { return ArrayRef{impl_.pattern.num_axes, impl_.pattern.strides}; }
 
   // Returns the stride on this axis.  Will be zero if the corresponding
   // dimension is 1, and otherwise nonzero (but not necessarily positive).
-  inline int32 Stride(int32 axis) const { return pattern_.strides[axis]; }
+  inline int32 Stride(int32 axis) const { return impl_.pattern.strides[axis]; }
 
   // Returns the number of elements in the Tensor; must be > 0.
   inline int64 NumElements() const { return derived_.num_elements; }
@@ -148,7 +179,7 @@ class Tensor {
   void Transpose(int32 axis1 = 0, int32 axis2 = 1);
 
 
-  // Constructor which does not really initialize the Tensor.  pattern_,
+  // Constructor which does not really initialize the Tensor.  impl_.pattern,
   // derived_ and dtype_ may contain nonsense.
   Tensor(): data_(NULL) { }
 
@@ -170,11 +201,7 @@ class Tensor {
                        `other`, while filling in any gaps if `other`
                        was not contiguous, if kCstrides then we use
                        "C" style strides for any dimensions != 1.
-       @param [in]  ip   The data initialize policy
-
-     The strides will not be the same as 'other' if other.IsContiguous() ==
-     false, but the ordering of the strides (smaller vs. larger) and their
-     signs will remain the same.
+       @param [in]  ip   The data initialization policy
   */
   Tensor(const Tensor &other, StridePolicy sp, InitializePolicy ip);
 
@@ -212,6 +239,29 @@ class Tensor {
   Tensor(TensorPattern &pattern, DataType dtype, Device device,
          InitializePolicy p);
 
+  /**
+     Construct a Tensor from the metadata in 'meta'.  Requires
+     that meta.pattern be contiguous (meaning: literally contiguous,
+     not the PyTorch meaning which is a stronger condition).
+     ??Possibly we could make it similar to the constructor above
+       and have it just make it contiguous if it was not.??
+
+
+       @param [in] meta  Struct containing the metadata specifying
+                     the Tensor's pattern, data-type and device:
+                     - pattern: the dimension and stride information that
+                       this tensor should match (TODO: decide whether gaps
+                       get filled to make it contiguous; see note above).
+                     - dtype: the data type to use.
+                     - device: the device to put the data on.
+       @param [in] p     The data initialization policy, i.e. whether the
+                     data is set to zero or left undefined
+                     (NOTE(review): confirm against InitializePolicy).
+
+  */
+  Tensor(TensorMeta &meta, InitializePolicy p);
+
+
   /**
      This constructor, which is intended for use primarily in internal
      code and
@@ -221,13 +271,7 @@ class Tensor {
 
  private:
   // The tensor dim and strides.
-  TensorPattern pattern_;
-  // Cached properties that depend on pattern_.
-  TensorPatternProperties derived_;
-  // The data-type of this tensor.
-  DataType dtype_;
-  // The device this Tensor lives on
-  Device device_;
+  TensorImpl impl_;
 
   // The raw data pointer.  Will be cast to a pointer of the appropriate
   // type before indexing.
diff --git a/src/tensor/variable.h b/src/tensor/variable.h
new file mode 100644
index 00000000000..7cfe9763ede
--- /dev/null
+++ b/src/tensor/variable.h
@@ -0,0 +1,157 @@
+// tensor/variable.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_VARIABLE_H_
+#define KALDI_TENSOR_VARIABLE_H_ 1
+
+#include "tensor/tensor.h"
+
+namespace kaldi {
+namespace tensor {
+
+class Variable;  // Defined below; TensorGrad::grad needs the declaration.
+/*
+  This is the 'gradient information' that class Variable stores for a Tensor
+  when it is initialized with requires_grad = true (or is a result of
+  an operation on Variables one of which had requires_grad = true).
+  This does not give you access to the underlying Variables; doing it
+  like this makes reference counting easier (no loops).  The GradFunc
+  will store any pointers to the original Variable that it may have
+  needed.
+
+  Users will rarely need to interact directly with this struct.
+ */
+struct TensorGrad {
+  // The version of the underlying Tensor.  (this number in the TensorGrad
+  // mirrors that in the Variable; it's needed because TensorGrad's
+  // 'inputs' variable refers back to the TensorGrad and does not have
+  // access to the Variable).
+  int32 version{0};
+
+
+  struct InputInfo {
+    int32 version;  // the version of the input that we used.  Used so we can
+                    // check in the backprop that grad->version == version;
+                    // if not, the user did something we don't allow.
+    std::shared_ptr<TensorGrad> grad;
+  };
+
+  // The gradients corresponding to the input variables, which
+  // we may need to update.  For some subset of these, 'grad' may be nullptr,
+  // corresponding to input Variables for which no gradient
+  // was required.
+  std::vector<InputInfo> inputs;
+
+  // is_view is true only if the Variable underlying this TensorGrad
+  // is the result of an expression like foo.transpose() that creates
+  // a view to another Tensor.  In that case 'offset' (below) is used.
+  bool is_view{false};
+
+  // The device.  NOTE(review): original comment was cut off ("The device we").
+  Device device;
+
+  // This contains the meta-information of the Tensor for which this is the
+  // gradient (its 'data' pointer will be NULL).  Used to set up 'grad' with the
+  // correct dimension and strides when it is needed.
+  TensorMeta meta;
+  // Only if is_view == true, the offset (in elements) of the start of
+  // the Tensor described in 'meta' from the start of the source Tensor.
+  // Used in constructing 'grad'.  Zero-initialized so it is never garbage.
+  int64 offset{0};
+
+  // This stores the gradient (if we already have one), or nullptr if not.
+  std::unique_ptr<Variable> grad{nullptr};
+
+  // The tail in a singly linked list of TensorGrads... used in case this
+  // Variable is a sum of several terms that were added using an
+  // in-place method such as '+='.  (Syntax etc. TBD at this point).
+  std::unique_ptr<TensorGrad> tail{nullptr};
+};
+
+
+/**
+   class Variable is somewhat like class Tensor but augmented with autograd
+   machinery.  Because autograd requires a rather 'functional' way of doing
+   things (i.e. is not super friendly to in-place operations), the functions
+   that operate on class Variable will tend to be ones that return something,
+   rather than in-place operations.
+
+   The overall design is quite similar to PyTorch, and the structure
+   of the C++ code is similar to flashlight.  If you are only familiar with
+   PyTorch's python frontend, class Variable is roughly equivalent to what they
+   expose as torch.Tensor.
+ */
+class Variable {
+ public:  // the constructors and GradFunc/GradHook must be usable by callers.
+  using GradFunc = std::function<
+    void(const std::vector<Variable>& inputs, TensorGrad *grad_output)>;
+  using GradHook = std::function<void(TensorGrad *grad)>;
+
+
+  /** Constructor from a Tensor.
+       @param [in] data  Pointer to the source Tensor
+       @param [in] requires_grad    If requires_grad argument is true,
+                the gradient w.r.t. this Variable will be computed if and when
+                you call Backward() on a Variable that depends on it.
+                The same as requires_grad in PyTorch.
+  */
+  Variable(const std::shared_ptr<Tensor> &data, bool requires_grad);
+
+
+
+  /**
+   * Creates a Variable which wraps the Tensor and inputs specified
+   * @param[in] data      the Tensor to be stored in the Variable
+   * @param[in] inputs    a vector specifying inputs for this Variable
+   * @param[in] gradFunc  function specifying how to calculate gradient of the
+   *                      input Variables
+   */
+  Variable(std::shared_ptr<Tensor> &data, std::vector<Variable> inputs,
+           GradFunc gradFunc);
+
+
+ private:
+
+  // The version of this Variable.  Generally will start at 0 when the Variable
+  // is assigned a size and will have 1 added to it for each operation that is
+  // done on it.  If grad_ != NULL, we mirror this value in grad_->version.  The
+  // version number is only used for checking purposes, to verify that people
+  // don't modify a Variable in ways that defeat the backprop.  If we wanted we
+  // could keep the old versions around and enable the backprop to work anyway,
+  // but that kind of magic is not in the spirit of how this library operates.
+  int32 version_;
+
+  std::shared_ptr<Tensor> data_;
+  std::shared_ptr<TensorGrad> grad_;
+
+};
+
+// TODO: unfinished declaration, left commented out so that the header parses.
+// It was:  typedef std::unique_ptr<Storage>   (no alias name was given),
+// followed by a stray closing "};" that matched no open scope.
+
+
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_VARIABLE_H_

From 7acef2ad551d99239086defbf4667f5ac72f5505 Mon Sep 17 00:00:00 2001
From: Hossein Hadian <hn.hadian@gmail.com>
Date: Tue, 26 Mar 2019 20:03:29 +0430
Subject: [PATCH 013/163] Kaldi10: Implement topology (#3169)

* Implement most of topology.cc

Still missing some parts of Check().

make hmm still does not compile, even though topology.cc does.

* Implement the rest of Topology::Check() + cosmetic fixes

* Fix bugs
---
 src/hmm/hmm-topology-test.cc |  56 ++---
 src/hmm/topology.cc          | 454 ++++++++++++++---------------------
 src/hmm/topology.h           |  15 +-
 3 files changed, 216 insertions(+), 309 deletions(-)

diff --git a/src/hmm/hmm-topology-test.cc b/src/hmm/hmm-topology-test.cc
index 9a3a65b61a4..2ed8ce38b4a 100644
--- a/src/hmm/hmm-topology-test.cc
+++ b/src/hmm/hmm-topology-test.cc
@@ -2,6 +2,7 @@
 
 // Copyright 2009-2011  Microsoft Corporation
 //                2015  Johns Hopkins University (author: Daniel Povey)
+//                2019  Hossein Hadian
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -25,47 +26,36 @@ namespace kaldi {
 
 
 void TestTopology() {
-  bool binary = (Rand()%2 == 0);
+  bool binary = (Rand() % 2 == 0);
 
   std::string input_str = "<Topology>\n"
       "<TopologyEntry>\n"
       "<ForPhones> 1 2 3 4 5 6 7 8 9 </ForPhones>\n"
-      "<State> 0 <PdfClass> 0\n"
-      "<Transition> 0 0.5\n"
-      "<Transition> 1 0.5\n"
-      "</State> \n"
-      "<State> 1 <PdfClass> 1 \n"
-      "<Transition> 1 0.5\n"
-      "<Transition> 2 0.5\n"
-      "</State>  \n"
-      " <State> 2 <PdfClass> 2\n"
-      " <Transition> 2 0.5\n"
-      " <Transition> 3 0.5\n"
-      " </State>   \n"
-      " <State> 3 </State>\n"
+      " 0  1  1  0.0\n"
+      " 1  1  1  0.693\n"
+      " 1  2  2  0.693\n"
+      " 2  2  2  0.693\n"
+      " 2  3  3  0.693\n"
+      " 3  3  3  0.693\n"
+      " 3  0.693\n\n"
       " </TopologyEntry>\n"
-      "  <TopologyEntry>\n"
-      "  <ForPhones> 10 11 13  </ForPhones>\n"
-      "  <State> 0 <PdfClass> 0\n"
-      "  <Transition> 0 0.5\n"
-      "  <Transition> 1 0.5\n"
-      "  </State> \n"
-      "  <State> 1 <PdfClass> 1 \n"
-      "  <Transition> 1 0.5\n"
-      "  <Transition> 2 0.5\n"
-      "  </State>  \n"
-      " <State> 2 </State>"
-      "  </TopologyEntry>\n"
-      "  </Topology>\n";
+
+      "<TopologyEntry>\n"
+      "<ForPhones> 10 11 13 </ForPhones>\n"
+      " 0  0  1  0.693\n"
+      " 0  1  1  0.693\n"
+      " 1  1  2  0.693\n"
+      " 1  2  2  0.693\n"
+      " 2 \n\n"
+      "</TopologyEntry>\n"
+      "</Topology>\n";
 
   std::string chain_input_str = "<Topology>\n"
       "<TopologyEntry>\n"
       "<ForPhones> 1 2 3 4 5 6 7 8 9 </ForPhones>\n"
-      " <State> 0 <ForwardPdfClass> 0 <SelfLoopPdfClass> 1\n"
-      "  <Transition> 0 0.5\n"
-      "  <Transition> 1 0.5\n"
-      " </State> \n"
-      " <State> 1 </State>\n"
+      " 0  1  1  0.0\n"
+      " 1  1  2  0.693\n"
+      " 1  0.693\n\n"
       "</TopologyEntry>\n"
       "</Topology>\n";
 
@@ -84,7 +74,6 @@ void TestTopology() {
   topo.Write(oss, binary);
 
   Topology topo2;
-  // std::cout << oss.str() << '\n' << std::flush;
   std::istringstream iss2(oss.str());
   topo2.Read(iss2, binary);
 
@@ -120,4 +109,3 @@ int main() {
   }
   std::cout << "Test OK.\n";
 }
-
diff --git a/src/hmm/topology.cc b/src/hmm/topology.cc
index a0563f90c0d..0d7880cd1d5 100644
--- a/src/hmm/topology.cc
+++ b/src/hmm/topology.cc
@@ -2,6 +2,8 @@
 
 // Copyright 2009-2011  Microsoft Corporation
 //           2014-2019  Johns Hopkins University (author: Daniel Povey)
+//           2019       Daniel Galvez
+//           2019       Hossein Hadian
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -20,212 +22,121 @@
 
 #include <vector>
 
+#include "fst/script/compile.h"
+
+#include "util/common-utils.h"
 #include "hmm/topology.h"
+#include "util/stl-utils.h"
 #include "util/text-utils.h"
-
-
-namespace kaldi {
+#include "fstext/kaldi-fst-io.h"
 
 
 
-void Topology::GetPhoneToNumPdfClasses(std::vector<int32> *phone2num_pdf_classes) const {
-  KALDI_ASSERT(!phones_.empty());
-  phone2num_pdf_classes->clear();
-  phone2num_pdf_classes->resize(phones_.back() + 1, -1);
-  for (size_t i = 0; i < phones_.size(); i++)
-    (*phone2num_pdf_classes)[phones_[i]] = NumPdfClasses(phones_[i]);
-}
+namespace kaldi {
 
 void Topology::Read(std::istream &is, bool binary) {
   ExpectToken(is, binary, "<Topology>");
-  if (!binary) {  // Text-mode read, different "human-readable" format.
+  if (!binary) {
     phones_.clear();
     phone2idx_.clear();
     entries_.clear();
     std::string token;
     while ( ! (is >> token).fail() ) {
-      if (token == "</Topology>") { break; } // finished parsing.
-      else  if (token != "<TopologyEntry>") {
-        KALDI_ERR << "Reading Topology object, expected </Topology> or <TopologyEntry>, got "<<token;
+      if (token == "</Topology>") {
+        break; // finished parsing.
+      } else if (token != "<TopologyEntry>") {
+        KALDI_ERR << "Reading Topology object, expected </Topology> or "
+            "<TopologyEntry>, got "<<token;
       } else {
         ExpectToken(is, binary, "<ForPhones>");
         std::vector<int32> phones;
         std::string s;
         while (1) {
           is >> s;
-          if (is.fail()) KALDI_ERR << "Reading Topology object, unexpected end of file while expecting phones.";
+          if (is.fail())
+            KALDI_ERR << "Reading Topology object, unexpected end of file "
+                "while expecting phones.";
           if (s == "</ForPhones>") break;
           else {
             int32 phone;
             if (!ConvertStringToInteger(s, &phone))
               KALDI_ERR << "Reading Topology object, expected "
                         << "integer, got instead " << s;
+            KALDI_ASSERT(phone > 0);
             phones.push_back(phone);
           }
         }
 
-        std::vector<HmmState> this_entry;
-        std::string token;
-        ReadToken(is, binary, &token);
-        while (token != "</TopologyEntry>") {
-          if (token != "<State>")
-            KALDI_ERR << "Expected </TopologyEntry> or <State>, got instead "<<token;
-          int32 state;
-          ReadBasicType(is, binary, &state);
-          if (state != static_cast<int32>(this_entry.size()))
-            KALDI_ERR << "States are expected to be in order from zero, expected "
-                      << this_entry.size() <<  ", got " << state;
-          ReadToken(is, binary, &token);
-          int32 forward_pdf_class = kNoPdf;  // -1 by default, means no pdf.
-          if (token == "<PdfClass>") {
-            ReadBasicType(is, binary, &forward_pdf_class);
-            this_entry.push_back(HmmState(forward_pdf_class));
-            ReadToken(is, binary, &token);
-            if (token == "<SelfLoopPdfClass>")
-              KALDI_ERR << "pdf classes should be defined using <PdfClass> "
-                        << "or <ForwardPdfClass>/<SelfLoopPdfClass> pair";
-          } else if (token == "<ForwardPdfClass>") {
-            int32 self_loop_pdf_class = kNoPdf;
-            ReadBasicType(is, binary, &forward_pdf_class);
-            ReadToken(is, binary, &token);
-            KALDI_ASSERT(token == "<SelfLoopPdfClass>");
-            ReadBasicType(is, binary, &self_loop_pdf_class);
-            this_entry.push_back(HmmState(forward_pdf_class, self_loop_pdf_class));
-            ReadToken(is, binary, &token);
-          } else
-            this_entry.push_back(HmmState(forward_pdf_class));
-          while (token == "<Transition>") {
-            int32 dst_state;
-            BaseFloat trans_prob;
-            ReadBasicType(is, binary, &dst_state);
-            ReadBasicType(is, binary, &trans_prob);
-            this_entry.back().transitions.push_back(std::make_pair(dst_state, trans_prob));
-            ReadToken(is, binary, &token);
-          }
-          if(token == "<Final>") // TODO: remove this clause after a while.
-            KALDI_ERR << "You are trying to read old-format topology with new Kaldi.";
-          if (token != "</State>")
-            KALDI_ERR << "Reading Topology,  unexpected token "<<token;
-          ReadToken(is, binary, &token);
-        }
-        int32 my_index = entries_.size();
-        entries_.push_back(this_entry);
+        int32 entry_index = entries_.size();
+        fst::StdVectorFst fst;
+        ReadFsaKaldi(is, binary, &fst);
+        entries_.push_back(fst);
 
-        for (size_t i = 0; i < phones.size(); i++) {
-          int32 phone = phones[i];
+        for (int32 phone : phones) {
           if (static_cast<int32>(phone2idx_.size()) <= phone)
-            phone2idx_.resize(phone+1, -1);  // -1 is invalid index.
-          KALDI_ASSERT(phone > 0);
-          if (phone2idx_[phone] != -1)
-            KALDI_ERR << "Phone with index "<<(i)<<" appears in multiple topology entries.";
-          phone2idx_[phone] = my_index;
+            phone2idx_.resize(phone + 1, -1);  // -1 is invalid index.
+          if (phone2idx_[phone] != -1) {
+            KALDI_ERR << "Phone "
+                      << phone << " appears in multiple topology entries.";
+          }
+          phone2idx_[phone] = entry_index;
           phones_.push_back(phone);
         }
+        ExpectToken(is, binary, "</TopologyEntry>");
       }
     }
     std::sort(phones_.begin(), phones_.end());
     KALDI_ASSERT(IsSortedAndUniq(phones_));
-  } else {  // binary I/O, just read member objects directly from disk.
+  } else {
     ReadIntegerVector(is, binary, &phones_);
     ReadIntegerVector(is, binary, &phone2idx_);
-    int32 sz;
-    ReadBasicType(is, binary, &sz);
-    bool is_hmm = true;
-    if (sz == -1) {
-      is_hmm = false;
-      ReadBasicType(is, binary, &sz);
-    }
-    entries_.resize(sz);
-    for (int32 i = 0; i < sz; i++) {
-      int32 thist_sz;
-      ReadBasicType(is, binary, &thist_sz);
-      entries_[i].resize(thist_sz);
-      for (int32 j = 0 ; j < thist_sz; j++) {
-        ReadBasicType(is, binary, &(entries_[i][j].forward_pdf_class));
-        if (is_hmm)
-          entries_[i][j].self_loop_pdf_class = entries_[i][j].forward_pdf_class;
-        else
-          ReadBasicType(is, binary, &(entries_[i][j].self_loop_pdf_class));
-        int32 thiss_sz;
-        ReadBasicType(is, binary, &thiss_sz);
-        entries_[i][j].transitions.resize(thiss_sz);
-        for (int32 k = 0; k < thiss_sz; k++) {
-          ReadBasicType(is, binary, &(entries_[i][j].transitions[k].first));
-          ReadBasicType(is, binary, &(entries_[i][j].transitions[k].second));
-        }
-      }
+    int32 number_topology_entries;
+    ReadBasicType(is, binary, &number_topology_entries);
+    entries_.clear();  // as in text mode: don't append to a previous Read().
+    for (int32 index = 0; index < number_topology_entries; ++index) {
+      entries_.emplace_back();  // signed index: a negative count reads nothing.
+      ReadFsaKaldi(is, binary, &entries_.back());
     }
-    ExpectToken(is, binary, "</Topology>");
   }
-  Check();  // Will throw if not ok.
+  Check();
 }
 
+template <class Arc>
+static void WriteFsa(std::ostream &os, const fst::VectorFst<Arc> &fst) {
+  os << '\n';
+  bool acceptor = true, write_one = false;
+  fst::FstPrinter<Arc> printer(fst, fst.InputSymbols(), fst.OutputSymbols(),
+                               NULL, acceptor, write_one, "\t");
+  printer.Print(&os, "<unknown>");
+  if (os.fail())
+    KALDI_ERR << "Stream failure detected writing FST to stream.";
+  os << '\n';
+  if (!os.good())
+    KALDI_ERR << "Error writing FST to stream.";
+}
 
 void Topology::Write(std::ostream &os, bool binary) const {
-  bool is_hmm = IsHmm();
   WriteToken(os, binary, "<Topology>");
-  if (!binary) {  // Text-mode write.
-    os << "\n";
-    for (int32 i = 0; i < static_cast<int32> (entries_.size()); i++) {
+  if (!binary) {
+    for (int index = 0; index < entries_.size(); ++index) {
       WriteToken(os, binary, "<TopologyEntry>");
-      os << "\n";
       WriteToken(os, binary, "<ForPhones>");
-      os << "\n";
-      for (size_t j = 0; j < phone2idx_.size(); j++) {
-        if (phone2idx_[j] == i)
-          os << j << " ";
-      }
-      os << "\n";
-      WriteToken(os, binary, "</ForPhones>");
-      os << "\n";
-      for (size_t j = 0; j < entries_[i].size(); j++) {
-        WriteToken(os, binary, "<State>");
-        WriteBasicType(os, binary, static_cast<int32>(j));
-        if (entries_[i][j].forward_pdf_class != kNoPdf) {
-          if (is_hmm) {
-            WriteToken(os, binary, "<PdfClass>");
-            WriteBasicType(os, binary, entries_[i][j].forward_pdf_class);
-          } else {
-            WriteToken(os, binary, "<ForwardPdfClass>");
-            WriteBasicType(os, binary, entries_[i][j].forward_pdf_class);
-            KALDI_ASSERT(entries_[i][j].self_loop_pdf_class != kNoPdf);
-            WriteToken(os, binary, "<SelfLoopPdfClass>");
-            WriteBasicType(os, binary, entries_[i][j].self_loop_pdf_class);
-          }
-        }
-        for (size_t k = 0; k < entries_[i][j].transitions.size(); k++) {
-          WriteToken(os, binary, "<Transition>");
-          WriteBasicType(os, binary, entries_[i][j].transitions[k].first);
-          WriteBasicType(os, binary, entries_[i][j].transitions[k].second);
-        }
-        WriteToken(os, binary, "</State>");
-        os << "\n";
-      }
-      WriteToken(os, binary, "</TopologyEntry>");
-      os << "\n";
+      for (auto phone: phones_)
+        if (phone2idx_[phone] == index)
+          os << phone << " ";
+      os << "</ForPhones>";
+      WriteFsa(os, entries_[index]);
+      os << "</TopologyEntry>\n";
     }
   } else {
     WriteIntegerVector(os, binary, phones_);
     WriteIntegerVector(os, binary, phone2idx_);
-    // -1 is put here as a signal that the object has the new,
-    // extended format with SelfLoopPdfClass
-    if (!is_hmm) WriteBasicType(os, binary, static_cast<int32>(-1));
-    WriteBasicType(os, binary, static_cast<int32>(entries_.size()));
-    for (size_t i = 0; i < entries_.size(); i++) {
-      WriteBasicType(os, binary, static_cast<int32>(entries_[i].size()));
-      for (size_t j = 0; j < entries_[i].size(); j++) {
-        WriteBasicType(os, binary, entries_[i][j].forward_pdf_class);
-        if (!is_hmm) WriteBasicType(os, binary, entries_[i][j].self_loop_pdf_class);
-        WriteBasicType(os, binary, static_cast<int32>(entries_[i][j].transitions.size()));
-        for (size_t k = 0; k < entries_[i][j].transitions.size(); k++) {
-          WriteBasicType(os, binary, entries_[i][j].transitions[k].first);
-          WriteBasicType(os, binary, entries_[i][j].transitions[k].second);
-        }
-      }
-    }
+    int32 number_topology_entries = entries_.size();
+    WriteBasicType(os, binary, number_topology_entries);
+    for (auto const& fst : entries_)
+      WriteFstKaldi(os, binary, fst);
   }
   WriteToken(os, binary, "</Topology>");
-  if (!binary) os << "\n";
 }
 
 void Topology::Check() {
@@ -239,148 +150,155 @@ void Topology::Check() {
       KALDI_ERR << "Topology::Check(), phone has no valid index.";
     is_seen[phone2idx_[phone]] = true;
   }
-  for (size_t i = 0; i < entries_.size(); i++) {
-    if (!is_seen[i])
-      KALDI_ERR << "HmmTopoloy::Check(), entry with no corresponding phones.";
-    int32 num_states = static_cast<int32>(entries_[i].size());
+  if (!std::accumulate(is_seen.begin(),
+                       is_seen.end(), true, std::logical_and<bool>()))
+    KALDI_ERR << "HmmTopoloy::Check(), entry with no corresponding phones.";
+
+  for (auto const& entry: entries_) {
+    int32 num_states = static_cast<int32>(entry.NumStates());
     if (num_states <= 1)
-      KALDI_ERR << "Topology::Check(), cannot only have one state (i.e., must "
-          "have at least one emitting state).";
-    if (!entries_[i][num_states-1].transitions.empty())
-      KALDI_ERR << "Topology::Check(), last state must have no transitions.";
-    // not sure how necessary this next stipulation is.
-    if (entries_[i][num_states-1].forward_pdf_class != kNoPdf)
-      KALDI_ERR << "Topology::Check(), last state must not be emitting.";
+      KALDI_ERR << "Topology::Check(), cannot only have one state (must have a "
+                << "final state and a start state).";
+  }
 
-    std::vector<bool> has_trans_in(num_states, false);
+  for (auto& entry: entries_) {
+    bool has_final_state = false;
     std::vector<int32> seen_pdf_classes;
+    for (fst::StateIterator<fst::StdVectorFst> state_iter(entry);
+         !state_iter.Done(); state_iter.Next()) {
+      StateId state = state_iter.Value();
+      if (entry.Final(state) != Weight::Zero())
+        has_final_state = true;
 
-    for (int32 j = 0; j < num_states; j++) {  // j is the state-id.
-      BaseFloat tot_prob = 0.0;
-      if (entries_[i][j].forward_pdf_class != kNoPdf) {
-        seen_pdf_classes.push_back(entries_[i][j].forward_pdf_class);
-        seen_pdf_classes.push_back(entries_[i][j].self_loop_pdf_class);
-      }
-      std::set<int32> seen_transition;
-      for (int32 k = 0;
-           static_cast<size_t>(k) < entries_[i][j].transitions.size();
-           k++) {
-        tot_prob += entries_[i][j].transitions[k].second;
-        if (entries_[i][j].transitions[k].second <= 0.0)
-          KALDI_ERR << "Topology::Check(), negative or zero transition prob.";
-        int32 dst_state = entries_[i][j].transitions[k].first;
-        // The commented code in the next few lines disallows a completely
-        // skippable phone, as this would cause to stop working some mechanisms
-        // that are being built, which enable the creation of phone-level lattices
-        // and rescoring these with a different lexicon and LM.
-        if (dst_state == num_states-1 // && j != 0
-            && entries_[i][j].forward_pdf_class == kNoPdf)
-          KALDI_ERR << "We do not allow any state to be "
-              "nonemitting and have a transition to the final-state (this would "
-              "stop the SplitToPhones function from identifying the last state "
-              "of a phone.";
-        if (dst_state < 0 || dst_state >= num_states)
-          KALDI_ERR << "Topology::Check(), invalid dest state " << (dst_state);
-        if (seen_transition.count(dst_state) != 0)
-          KALDI_ERR << "Topology::Check(), duplicate transition found.";
-        if (dst_state == k) {  // self_loop...
-          KALDI_ASSERT(entries_[i][j].self_loop_pdf_class != kNoPdf &&
-                       "Nonemitting states cannot have self-loops.");
-        }
-        seen_transition.insert(dst_state);
-        has_trans_in[dst_state] = true;
+      BaseFloat outward_prob_sum = exp(-entry.Final(state).Value());
+      for (fst::ArcIterator<fst::StdVectorFst> aiter(entry, state);
+           !aiter.Done(); aiter.Next()) {
+        const fst::StdArc &arc(aiter.Value());
+        if (arc.ilabel != arc.olabel)
+          KALDI_ERR << "ilabel != olabel. " << arc.ilabel << " " << arc.olabel;
+        if (arc.ilabel == 0)
+          KALDI_ERR << "Epsilon arcs (pdf-class 0) are not allowed.";
+        if (state != entry.Start() && arc.nextstate == entry.Start())
+          KALDI_ERR << "Start state cannot have any inward transitions.";
+        seen_pdf_classes.push_back(arc.ilabel);
+        outward_prob_sum += exp(-arc.weight.Value());
       }
-      if (j+1 < num_states) {
-        KALDI_ASSERT(tot_prob > 0.0 && "Non-final state must have transitions out."
-                     "(with nonzero probability)");
-        if (fabs(tot_prob - 1.0) > 0.01)
-          KALDI_WARN << "Total probability for state " << j <<
-              " in topology entry is " << tot_prob;
-      } else
-        KALDI_ASSERT(tot_prob == 0.0);
+      if (!ApproxEqual(outward_prob_sum, 1.0))
+        KALDI_WARN << "Outward transition probabilities should sum to 1.0 "
+            "for each state";
     }
-    // make sure all but start state have input transitions.
-    for (int32 j = 1; j < num_states; j++)
-      if (!has_trans_in[j])
-        KALDI_ERR << "Topology::Check, state "<<(j)<<" has no input transitions.";
-    SortAndUniq(&seen_pdf_classes);
-    if (seen_pdf_classes.front() != 0 ||
-        seen_pdf_classes.back() != static_cast<int32>(seen_pdf_classes.size()) - 1) {
-      KALDI_ERR << "Topology::Check(), pdf_classes are expected to be "
-          "contiguous and start from zero.";
+    if (!has_final_state) {
+      KALDI_ERR << "Topology::Check(), must have a final state.";
     }
-  }
-}
 
-bool Topology::IsHmm() const {
-  const std::vector<int32> &phones = GetPhones();
-  KALDI_ASSERT(!phones.empty());
-  for (size_t i = 0; i < phones.size(); i++) {
-    int32 phone = phones[i];
-    const TopologyEntry &entry = TopologyForPhone(phone);
-    for (int32 j = 0; j < static_cast<int32>(entry.size()); j++) {  // for each state...
-      int32 forward_pdf_class = entry[j].forward_pdf_class,
-            self_loop_pdf_class = entry[j].self_loop_pdf_class;
-      if (forward_pdf_class != self_loop_pdf_class)
-        return false;
+    if (entry.Final(entry.Start()) != Weight::Zero())
+      KALDI_ERR << "Topology::Check(), start state must not be a final state.";
+
+    if (entry.Start() != 0) {
+      KALDI_ERR << "Topology::Check(), start state must be 0.";
     }
+
+    SortAndUniq(&seen_pdf_classes);
+    if (seen_pdf_classes.empty() || seen_pdf_classes.front() != 1 ||
+        seen_pdf_classes.back() != static_cast<int32>(seen_pdf_classes.size())) {
+      KALDI_ERR << "Topology::Check(), pdf_classes are expected to be "
+          "contiguous and start from 1.";
+    }
+    if (entry.Properties(fst::kAccessible | fst::kCoAccessible, true) !=
+        (fst::kAccessible | fst::kCoAccessible))  // checks without trimming.
+      KALDI_ERR << "Some of the states in the topolgy are not reachable.";
   }
-  return true;
 }
 
-const Topology::TopologyEntry& HmmTopology::TopologyForPhone(int32 phone) const {  // Will throw if phone not covered.
-  if (static_cast<size_t>(phone) >= phone2idx_.size() || phone2idx_[phone] == -1) {
-    KALDI_ERR << "TopologyForPhone(), phone "<<(phone)<<" not covered.";
+// Will throw if phone not covered.
+const fst::StdVectorFst& Topology::TopologyForPhone(int32 phone) const {
+  if (static_cast<size_t>(phone) >= phone2idx_.size()
+      || phone2idx_[phone] == -1) {
+    KALDI_ERR << "TopologyForPhone(), phone "<< phone <<" not covered.";
   }
   return entries_[phone2idx_[phone]];
 }
 
 int32 Topology::NumPdfClasses(int32 phone) const {
   // will throw if phone not covered.
-  const TopologyEntry &entry = TopologyForPhone(phone);
-  int32 max_pdf_class = 0;
-  for (size_t i = 0; i < entry.size(); i++) {
-    max_pdf_class = std::max(max_pdf_class, entry[i].forward_pdf_class);
-    max_pdf_class = std::max(max_pdf_class, entry[i].self_loop_pdf_class);
+  const fst::StdVectorFst &entry = TopologyForPhone(phone);
+
+  std::set<int32> pdfs;
+  for (fst::StateIterator<fst::StdVectorFst> siter(entry);
+       !siter.Done(); siter.Next()) {
+    StateId state_id = siter.Value();
+    for (fst::ArcIterator<fst::StdVectorFst> aiter(entry, state_id);
+         !aiter.Done(); aiter.Next()) {
+      pdfs.insert(aiter.Value().ilabel);
+    }
   }
-  return max_pdf_class+1;
+  return pdfs.size();
+}
+
+void Topology::GetPhoneToNumPdfClasses(
+    std::vector<int32> *phone2num_pdf_classes) const {
+  KALDI_ASSERT(!phones_.empty());
+  phone2num_pdf_classes->clear();
+  phone2num_pdf_classes->resize(phones_.back() + 1, -1);
+  for (auto phone: phones_)
+    (*phone2num_pdf_classes)[phone] = NumPdfClasses(phone);
 }
 
 int32 Topology::MinLength(int32 phone) const {
-  const TopologyEntry &entry = TopologyForPhone(phone);
-  // min_length[state] gives the minimum length for sequences up to and
-  // including that state.
-  std::vector<int32> min_length(entry.size(),
-                                std::numeric_limits<int32>::max());
-  KALDI_ASSERT(!entry.empty());
+  using Weight = typename fst::StdFst::Weight;
+  using StateId = typename fst::StdFst::StateId;
+  const fst::StdVectorFst& this_topo = TopologyForPhone(phone);
+  // 1) Prepare a copy of the FST in which every arc has weight 1.f and every
+  // final state has weight 0.f (note that 0.f == Weight::One() in the
+  // tropical semiring), so that the cost of a path is its number of arcs.
+  // We need our own VectorFst copy in order to mutate its arcs and finals.
+  std::unique_ptr<fst::StdVectorFst> topo_copy(this_topo.Copy());
 
-  min_length[0] = (entry[0].forward_pdf_class == -1 ? 0 : 1);
-  int32 num_states = min_length.size();
-  bool changed = true;
-  while (changed) {
-    changed = false;
-    for (int32 s = 0; s < num_states; s++) {
-      const HmmState &this_state = entry[s];
-      std::vector<std::pair<int32, BaseFloat> >::const_iterator
-          iter = this_state.transitions.begin(),
-          end = this_state.transitions.end();
-      for (; iter != end; ++iter) {
-        int32 next_state = iter->first;
-        KALDI_ASSERT(next_state < num_states);
-        int32 next_state_min_length = min_length[s] +
-            (entry[next_state].forward_pdf_class == -1 ? 0 : 1);
-        if (next_state_min_length < min_length[next_state]) {
-          min_length[next_state] = next_state_min_length;
-          if (next_state < s)
-            changed = true;
-          // the test of 'next_state < s' is an optimization for speed.
-        }
+  std::vector<StateId> final_states;
+  for (fst::StateIterator<fst::StdVectorFst> siter(*topo_copy);
+       !siter.Done(); siter.Next()) {
+    StateId state_id = siter.Value();
+
+    if (topo_copy->Final(state_id) != Weight::Zero()) {
+      final_states.push_back(state_id);
+      topo_copy->SetFinal(state_id, Weight::One());
+    }
+
+    for (fst::MutableArcIterator<fst::StdVectorFst> aiter(topo_copy.get(), state_id);
+         !aiter.Done(); aiter.Next()) {
+      Arc original_arc = aiter.Value();
+      Arc distance_one_arc(original_arc.ilabel, original_arc.olabel,
+                           Weight(1.0f), original_arc.nextstate);
+      aiter.SetValue(distance_one_arc);
+    }
+  }
+  KALDI_ASSERT(!final_states.empty());
+  // Now run single-source shortest-distance from the start state.
+  std::vector<Weight> distances;
+  fst::ShortestDistance(*topo_copy, &distances);
+  fst::NaturalLess<Weight> less;
+  auto min_final_state_iter =
+    std::min_element(final_states.begin(), final_states.end(),
+                     [&distances, &less](StateId state1, StateId state2) {
+                       return less(distances[state1], distances[state2]);
+                     });
+  Weight distance = distances[*min_final_state_iter];
+  return static_cast<int32>(distance.Value());
+}
+
+bool Topology::operator==(const Topology &other) const {
+  if (phones_ != other.phones_ || phone2idx_ != other.phone2idx_ ||
+      entries_.size() != other.entries_.size()) {
+    return false;
+  } else {
+    for(size_t i = 0; i < entries_.size(); ++i) {
+      if (!fst::Equal(entries_[i], other.entries_[i], /*delta=*/0,
+                      fst::kEqualFsts)) {
+        return false;
       }
     }
+    return true;
   }
-  KALDI_ASSERT(min_length.back() != std::numeric_limits<int32>::max());
-  // the last state is the final-state.
-  return min_length.back();
 }
 
 } // End namespace kaldi
diff --git a/src/hmm/topology.h b/src/hmm/topology.h
index eae0640af08..e99b0db7c18 100644
--- a/src/hmm/topology.h
+++ b/src/hmm/topology.h
@@ -2,6 +2,7 @@
 
 // Copyright 2009-2011  Microsoft Corporation
 //                2019  Johns Hopkins University (author: Daniel Povey)
+//                2019  Daniel Galvez
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -58,6 +59,7 @@ namespace kaldi {
        a self-loop on state 0 is not advised for decoding-graph-size
        reasons)
      - The start state must not be final.
+     - No phone (in the <ForPhones>...</ForPhones> block) may have the value 0.
 
 
  <Topology>
@@ -91,7 +93,7 @@ class Topology {
 
   /// Returns the topology entry for this phone;
   /// will throw exception if phone not covered by the topology.
-  const fst::StdFst &TopologyForPhone(int32 phone) const;
+  const fst::StdVectorFst &TopologyForPhone(int32 phone) const;
 
   /// Returns the number of \ref pdf_class "pdf-classes" for this phone;
   /// throws exception if phone not covered by this topology.
@@ -116,16 +118,15 @@ class Topology {
 
   bool operator == (const Topology &other) const;
 
-  // was:
-  //return phones_ == other.phones_ && phone2idx_ == other.phone2idx_
-  //&& entries_ == other.entries_;
-  // TODO: implement this; we probably need Equal() on fsts.
-
   // Allow default assignment operator and copy constructor.
  private:
+  using Arc     = typename fst::StdVectorFst::Arc;
+  using StateId = typename fst::StdVectorFst::StateId;
+  using Weight  = typename fst::StdVectorFst::Weight;
+
   std::vector<int32> phones_;  // list of all phones we have topology for.  Sorted, uniq.  no epsilon (zero) phone.
   std::vector<int32> phone2idx_;  // map from phones to indexes into the entries vector (or -1 for not present).
-  std::vector<fst::StdFst> entries_;
+  std::vector<fst::StdVectorFst> entries_;
 };
 
 

From 26edaf6d42b6992aa798de28eb9b6a0475af738c Mon Sep 17 00:00:00 2001
From: Hossein Hadian <hn.hadian@gmail.com>
Date: Tue, 26 Mar 2019 21:40:40 +0430
Subject: [PATCH 014/163] Kaldi10: Add missing file (from PR #3169) + minor
 fixes  (#3170)

* Implement most of topology.cc

Still missing some parts of Check().

make hmm still does not compile, even though topology.cc does.

* Add ReadFsaKaldi to read FSAs.

* Minor fixes

* Call Connect on a copy
---
 src/fstext/kaldi-fst-io-inl.h                 | 77 +++++++++++++++++++
 src/fstext/kaldi-fst-io.h                     |  6 ++
 ...{hmm-topology-test.cc => topology-test.cc} |  0
 src/hmm/topology.cc                           | 42 +++++-----
 4 files changed, 104 insertions(+), 21 deletions(-)
 rename src/hmm/{hmm-topology-test.cc => topology-test.cc} (100%)

diff --git a/src/fstext/kaldi-fst-io-inl.h b/src/fstext/kaldi-fst-io-inl.h
index b6bae4b9dc9..4870acdd0cc 100644
--- a/src/fstext/kaldi-fst-io-inl.h
+++ b/src/fstext/kaldi-fst-io-inl.h
@@ -163,7 +163,84 @@ void ReadFstKaldi(std::istream &is, bool binary,
   }
 }
 
+template <class Arc>
+void ReadFsaKaldi(std::istream &is, VectorFst<Arc> *fst) {
+  typedef typename Arc::Weight Weight;
+  typedef typename Arc::StateId StateId;
+  // Consume the \r on Windows, the \n that the text-form FSA format starts
+  // with, and any extra spaces that might have got in there somehow.
+  while (std::isspace(is.peek()) && is.peek() != '\n') is.get();
+  if (is.peek() == '\n') is.get(); // consume the newline.
+  else { // saw spaces but no newline.. this is not expected.
+    KALDI_ERR << "Reading FSA: unexpected sequence of spaces "
+              << " at file position " << is.tellg();
+  }
+  using std::string;
+  using std::vector;
+  using kaldi::SplitStringToIntegers;
+  using kaldi::ConvertStringToInteger;
+  fst->DeleteStates();
+  string line;
+  size_t nline = 0;
+  string separator = FLAGS_fst_field_separator + "\r\n";
+  while (std::getline(is, line)) {
+    nline++;
+    vector<string> col;
+    // on Windows we'll write in text and read in binary mode.
+    kaldi::SplitStringToVector(line, separator.c_str(), true, &col);
+    if (col.size() == 0) break; // Empty line is a signal to stop, in our
+    // archive format.
+    if (col.size() > 4) {
+      KALDI_ERR << "Bad line in FSA: " << line;
+    }
+    StateId s;
+    if (!ConvertStringToInteger(col[0], &s) || s < 0) {  // reject negative ids.
+      KALDI_ERR << "Bad line in FSA: " << line;
+    }
+    while (s >= fst->NumStates())
+      fst->AddState();
+    if (nline == 1) fst->SetStart(s);  // the first line's state is the start.
 
+    bool ok = true;
+    Arc arc;
+    Weight w;
+    StateId d = s;
+    switch (col.size()) {
+      case 1:
+        fst->SetFinal(s, Weight::One());
+        break;
+      case 2:
+        if (!StrToWeight(col[1], true, &w)) ok = false;
+        else fst->SetFinal(s, w);
+        break;
+      case 3:
+        ok = ConvertStringToInteger(col[1], &arc.nextstate) &&
+            arc.nextstate >= 0 && ConvertStringToInteger(col[2], &arc.ilabel);
+        arc.olabel = arc.ilabel;
+        if (ok) {
+          d = arc.nextstate;
+          arc.weight = Weight::One();
+          fst->AddArc(s, arc);
+        }
+        break;
+      case 4:
+        ok = ConvertStringToInteger(col[1], &arc.nextstate) &&
+            arc.nextstate >= 0 && ConvertStringToInteger(col[2], &arc.ilabel) &&
+            StrToWeight(col[3], false, &arc.weight);
+        arc.olabel = arc.ilabel;
+        if (ok) {
+          d = arc.nextstate;
+          fst->AddArc(s, arc);
+        }
+        break;
+      default:
+        ok = false;
+    }
+    while (d >= fst->NumStates()) fst->AddState();
+    if (!ok)
+      KALDI_ERR << "Bad line in FSA: " << line;
+  }
+}
 
 
 template<class Arc> // static
diff --git a/src/fstext/kaldi-fst-io.h b/src/fstext/kaldi-fst-io.h
index 9715d81941e..f998873a019 100644
--- a/src/fstext/kaldi-fst-io.h
+++ b/src/fstext/kaldi-fst-io.h
@@ -81,6 +81,12 @@ template <class Arc>
 void ReadFstKaldi(std::istream &is, bool binary,
                   VectorFst<Arc> *fst);
 
+// A generic Kaldi-type-IO mechanism of reading FSAs from streams,
+// supporting text-mode reading.
+// Note that this assumes the input is an acceptor.
+template <class Arc>
+void ReadFsaKaldi(std::istream &is, VectorFst<Arc> *fst);
+
 // Read an FST file for LM (G.fst) and make it an acceptor,
 // and make sure it is sorted on labels
 fst::VectorFst<fst::StdArc> *ReadAndPrepareLmFst(std::string rxfilename);
diff --git a/src/hmm/hmm-topology-test.cc b/src/hmm/topology-test.cc
similarity index 100%
rename from src/hmm/hmm-topology-test.cc
rename to src/hmm/topology-test.cc
diff --git a/src/hmm/topology.cc b/src/hmm/topology.cc
index 0d7880cd1d5..6d2576edc52 100644
--- a/src/hmm/topology.cc
+++ b/src/hmm/topology.cc
@@ -22,14 +22,12 @@
 
 #include <vector>
 
-#include "fst/script/compile.h"
-
 #include "util/common-utils.h"
 #include "hmm/topology.h"
 #include "util/stl-utils.h"
 #include "util/text-utils.h"
 #include "fstext/kaldi-fst-io.h"
-
+#include "fstext/fstext-utils.h"
 
 
 namespace kaldi {
@@ -69,7 +67,7 @@ void Topology::Read(std::istream &is, bool binary) {
 
         int32 entry_index = entries_.size();
         fst::StdVectorFst fst;
-        ReadFsaKaldi(is, binary, &fst);
+        ReadFsaKaldi(is, &fst);
         entries_.push_back(fst);
 
         for (int32 phone : phones) {
@@ -94,13 +92,14 @@ void Topology::Read(std::istream &is, bool binary) {
     ReadBasicType(is, binary, &number_topology_entries);
     for (size_t index = 0; index < number_topology_entries; ++index) {
       fst::StdVectorFst fst;
-      ReadFsaKaldi(is, binary, &fst);
+      ReadFstKaldi(is, binary, &fst);
       entries_.push_back(fst);
     }
   }
   Check();
 }
 
+// This function writes an FSA in text mode to an output stream.
 template <class Arc>
 static void WriteFsa(std::ostream &os, const fst::VectorFst<Arc> &fst) {
   os << '\n';
@@ -141,27 +140,23 @@ void Topology::Write(std::ostream &os, bool binary) const {
 
 void Topology::Check() {
   if (entries_.empty() || phones_.empty() || phone2idx_.empty())
-    KALDI_ERR << "Topology::Check(), empty object.";
+    KALDI_ERR << "Empty object.";
   std::vector<bool> is_seen(entries_.size(), false);
   for (size_t i = 0; i < phones_.size(); i++) {
     int32 phone = phones_[i];
     if (static_cast<size_t>(phone) >= phone2idx_.size() ||
         static_cast<size_t>(phone2idx_[phone]) >= entries_.size())
-      KALDI_ERR << "Topology::Check(), phone has no valid index.";
+      KALDI_ERR << "Phone " << phone << " has no valid index.";
     is_seen[phone2idx_[phone]] = true;
   }
   if (!std::accumulate(is_seen.begin(),
                        is_seen.end(), true, std::logical_and<bool>()))
-    KALDI_ERR << "HmmTopoloy::Check(), entry with no corresponding phones.";
+    KALDI_ERR << "Entry with no corresponding phones.";
 
   for (auto const& entry: entries_) {
-    int32 num_states = static_cast<int32>(entry.NumStates());
-    if (num_states <= 1)
-      KALDI_ERR << "Topology::Check(), cannot only have one state (must have a "
+    if (entry.NumStates() <= 1)
+      KALDI_ERR << "Cannot only have one state (must have a "
                 << "final state and a start state).";
-  }
-
-  for (auto& entry: entries_) {
     bool has_final_state = false;
     std::vector<int32> seen_pdf_classes;
     for (fst::StateIterator<fst::StdVectorFst> state_iter(entry);
@@ -175,7 +170,7 @@ void Topology::Check() {
            !aiter.Done(); aiter.Next()) {
         const fst::StdArc &arc(aiter.Value());
         if (arc.ilabel != arc.olabel)
-          KALDI_ERR << "ilabel != olabel. " << arc.ilabel << " " << arc.olabel;
+          KALDI_ERR << "The topology must be an acceptor but ilabel != olabel.";
         if (arc.ilabel == 0)
           KALDI_ERR << "Epsilon arcs (pdf-class 0) are not allowed.";
         if (state != entry.Start() && arc.nextstate == entry.Start())
@@ -188,11 +183,11 @@ void Topology::Check() {
             "for each state";
     }
     if (!has_final_state) {
-      KALDI_ERR << "Topology::Check(), must have a final state.";
+      KALDI_ERR << "Must have a final state.";
     }
 
     if (entry.Final(entry.Start()) != Weight::Zero())
-      KALDI_ERR << "Topology::Check(), start state must not be a final state.";
+      KALDI_ERR << "Start state must not be a final state.";
 
     if (entry.Start() != 0) {
       KALDI_ERR << "Topology::Check(), start state must be 0.";
@@ -200,13 +195,18 @@ void Topology::Check() {
 
     SortAndUniq(&seen_pdf_classes);
     if (seen_pdf_classes.front() != 1 ||
-        seen_pdf_classes.back() != static_cast<int32>(seen_pdf_classes.size())) {
-      KALDI_ERR << "Topology::Check(), pdf_classes are expected to be "
+        seen_pdf_classes.back() != static_cast<int32>(seen_pdf_classes.size()))
+      KALDI_ERR << "pdf_classes are expected to be "
           "contiguous and start from 1.";
-    }
-    fst::Connect(&entry);
+
+    int num_states = entry.NumStates();
+    int num_arcs = NumArcs(entry);
+    fst::StdVectorFst fst(entry);  // Connect() trims in place; use a copy.
+    fst::Connect(&fst);
+    if (fst.NumStates() == 0)
+      KALDI_ERR << "Some of the states in the topolgy are not reachable.";
+    if (fst.NumStates() != num_states || NumArcs(fst) != num_arcs)
+      KALDI_ERR << "Topology changed after calling Connect().";
   }
 }
 

From d6634f7113eadfc0c352825b21cd5945a75fd4e9 Mon Sep 17 00:00:00 2001
From: Hossein Hadian <hn.hadian@gmail.com>
Date: Sat, 30 Mar 2019 01:58:01 +0430
Subject: [PATCH 015/163] Implement most of transitins.cc (#3184)

---
 src/hmm/transitions.cc | 903 +++++------------------------------------
 src/hmm/transitions.h  |  24 +-
 2 files changed, 113 insertions(+), 814 deletions(-)

diff --git a/src/hmm/transitions.cc b/src/hmm/transitions.cc
index 4198ea9cd45..c2fd5210f17 100644
--- a/src/hmm/transitions.cc
+++ b/src/hmm/transitions.cc
@@ -3,6 +3,7 @@
 // Copyright 2009-2012  Microsoft Corporation
 //        Johns Hopkins University (author: Guoguo Chen)
 //        2012-2019 Johns Hopkins University (Author: Daniel Povey)
+//        2019      Hossein Hadian
 
 
 // See ../../COPYING for clarification regarding multiple authors
@@ -23,6 +24,8 @@
 #include <vector>
 #include "hmm/transitions.h"
 #include "tree/context-dep.h"
+#include "util/common-utils.h"
+#include "fstext/fstext-utils.h"
 
 namespace kaldi {
 
@@ -31,62 +34,8 @@ bool Transitions::operator == (const Transitions &other) {
       num_pdfs_ == other.num_pdfs_;
 }
 
-void Transitions::ComputeTuples(const ContextDependencyInterface &ctx_dep) {
-  if (IsHmm())
-    ComputeTuplesIsHmm(ctx_dep);
-  else
-    ComputeTuplesNotHmm(ctx_dep);
-
-  // now tuples_ is populated with all possible tuples of (phone, hmm_state, pdf, self_loop_pdf).
-  std::sort(tuples_.begin(), tuples_.end());  // sort to enable reverse lookup.
-  // this sorting defines the transition-ids.
-}
-
-void Transitions::ComputeTuplesIsHmm(const ContextDependencyInterface &ctx_dep) {
-  const std::vector<int32> &phones = topo_.GetPhones();
-  KALDI_ASSERT(!phones.empty());
-
-  // this is the case for normal models. but not fot chain models
-  std::vector<std::vector<std::pair<int32, int32> > > pdf_info;
-  std::vector<int32> num_pdf_classes( 1 + *std::max_element(phones.begin(), phones.end()), -1);
-  for (size_t i = 0; i < phones.size(); i++)
-    num_pdf_classes[phones[i]] = topo_.NumPdfClasses(phones[i]);
-  ctx_dep.GetPdfInfo(phones, num_pdf_classes, &pdf_info);
-  // pdf_info is list indexed by pdf of which (phone, pdf_class) it
-  // can correspond to.
-
-  std::map<std::pair<int32, int32>, std::vector<int32> > to_hmm_state_list;
-  // to_hmm_state_list is a map from (phone, pdf_class) to the list
-  // of hmm-states in the HMM for that phone that that (phone, pdf-class)
-  // can correspond to.
-  for (size_t i = 0; i < phones.size(); i++) {  // setting up to_hmm_state_list.
-    int32 phone = phones[i];
-    const Topology::TopologyEntry &entry = topo_.TopologyForPhone(phone);
-    for (int32 j = 0; j < static_cast<int32>(entry.size()); j++) {  // for each state...
-      int32 pdf_class = entry[j].forward_pdf_class;
-      if (pdf_class != kNoPdf) {
-        to_hmm_state_list[std::make_pair(phone, pdf_class)].push_back(j);
-      }
-    }
-  }
-
-  for (int32 pdf = 0; pdf < static_cast<int32>(pdf_info.size()); pdf++) {
-    for (size_t j = 0; j < pdf_info[pdf].size(); j++) {
-      int32 phone = pdf_info[pdf][j].first,
-            pdf_class = pdf_info[pdf][j].second;
-      const std::vector<int32> &state_vec = to_hmm_state_list[std::make_pair(phone, pdf_class)];
-      KALDI_ASSERT(!state_vec.empty());
-      // state_vec is a list of the possible HMM-states that emit this
-      // pdf_class.
-      for (size_t k = 0; k < state_vec.size(); k++) {
-        int32 hmm_state = state_vec[k];
-        tuples_.push_back(Tuple(phone, hmm_state, pdf, pdf));
-      }
-    }
-  }
-}
-
-void Transitions::ComputeTuplesNotHmm(const ContextDependencyInterface &ctx_dep) {
+void Transitions::ComputeInfo(const ContextDependencyInterface &ctx_dep) {
+  using StateId = typename fst::StdFst::StateId;
   const std::vector<int32> &phones = topo_.GetPhones();
   KALDI_ASSERT(!phones.empty());
 
@@ -99,833 +48,179 @@ void Transitions::ComputeTuplesNotHmm(const ContextDependencyInterface &ctx_dep)
   // (pdf-class, self-loop pdf-class) of each state of that phone.
   std::vector<std::vector<std::pair<int32, int32> > > pdf_class_pairs;
   pdf_class_pairs.resize(1 + *std::max_element(phones.begin(), phones.end()));
+
+  std::vector<std::map<std::pair<int32, int32>, std::vector<std::pair<int32, int32> > > > to_topo_state_list;
+  to_topo_state_list.resize(1 + *std::max_element(phones.begin(), phones.end()));
+
   for (size_t i = 0; i < phones.size(); i++) {
     int32 phone = phones[i];
-    const Topology::TopologyEntry &entry = topo_.TopologyForPhone(phone);
-    for (int32 j = 0; j < static_cast<int32>(entry.size()); j++) {  // for each state...
-      int32 forward_pdf_class = entry[j].forward_pdf_class, self_loop_pdf_class = entry[j].self_loop_pdf_class;
-      if (forward_pdf_class != kNoPdf)
-        pdf_class_pairs[phone].push_back(std::make_pair(forward_pdf_class, self_loop_pdf_class));
-    }
-  }
-  ctx_dep.GetPdfInfo(phones, pdf_class_pairs, &pdf_info);
+    auto const &entry = topo_.TopologyForPhone(phone);  // an FST
+    int num_states = entry.NumStates();
+
+    std::vector<StateId> state_to_self_loop_pdf_class(num_states, -1);  // TODO(hhadian): Define and use kNoPdf
+    for (StateId state = 0; state < num_states; ++state)
+      for (fst::ArcIterator<fst::StdVectorFst> aiter(entry, state); !aiter.Done(); aiter.Next()) {
+        const fst::StdArc &arc(aiter.Value());
+        if (arc.nextstate == state) {
+          KALDI_ASSERT(state_to_self_loop_pdf_class[state] == -1);  //kNoPdf Only 1 self-loop allowed.
+          state_to_self_loop_pdf_class[state] = arc.ilabel;
+        }
+      }
 
-  std::vector<std::map<std::pair<int32, int32>, std::vector<int32> > > to_hmm_state_list;
-  to_hmm_state_list.resize(1 + *std::max_element(phones.begin(), phones.end()));
-  // to_hmm_state_list is a phone-indexed set of maps from (pdf-class, self-loop pdf_class) to the list
-  // of hmm-states in the HMM for that phone that that (pdf-class, self-loop pdf-class)
-  // can correspond to.
-  for (size_t i = 0; i < phones.size(); i++) {  // setting up to_hmm_state_list.
-    int32 phone = phones[i];
-    const Topology::TopologyEntry &entry = topo_.TopologyForPhone(phone);
-    std::map<std::pair<int32, int32>, std::vector<int32> > phone_to_hmm_state_list;
-    for (int32 j = 0; j < static_cast<int32>(entry.size()); j++) {  // for each state...
-      int32 forward_pdf_class = entry[j].forward_pdf_class, self_loop_pdf_class = entry[j].self_loop_pdf_class;
-      if (forward_pdf_class != kNoPdf) {
-        phone_to_hmm_state_list[std::make_pair(forward_pdf_class, self_loop_pdf_class)].push_back(j);
+    std::map<std::pair<int32, int32>, std::vector<std::pair<int32, int32> > > phone_to_topo_state_list;
+    for (StateId state = 0; state < num_states; ++state) {
+      for (fst::ArcIterator<fst::StdVectorFst> aiter(entry, state);
+           !aiter.Done(); aiter.Next()) {
+        const fst::StdArc &arc(aiter.Value());
+        int32 forward_pdf_class = arc.ilabel,
+            self_loop_pdf_class = state_to_self_loop_pdf_class[arc.nextstate];
+        pdf_class_pairs[phone].push_back(std::make_pair(forward_pdf_class,
+                                                        self_loop_pdf_class));
+        auto state_arc_pair = std::make_pair(state, aiter.Position());
+        auto pdf_class_pair = std::make_pair(forward_pdf_class, self_loop_pdf_class);
+        phone_to_topo_state_list[pdf_class_pair].push_back(state_arc_pair);
       }
     }
-    to_hmm_state_list[phone] = phone_to_hmm_state_list;
+    to_topo_state_list[phone] = phone_to_topo_state_list;
   }
+  ctx_dep.GetPdfInfo(phones, pdf_class_pairs, &pdf_info);
+
+  info_.push_back(TransitionIdInfo());  // transition-id is 1-based.
 
   for (int32 i = 0; i < phones.size(); i++) {
     int32 phone = phones[i];
     for (int32 j = 0; j < static_cast<int32>(pdf_info[phone].size()); j++) {
       int32 pdf_class = pdf_class_pairs[phone][j].first,
             self_loop_pdf_class = pdf_class_pairs[phone][j].second;
-      const std::vector<int32> &state_vec =
-              to_hmm_state_list[phone][std::make_pair(pdf_class, self_loop_pdf_class)];
-      KALDI_ASSERT(!state_vec.empty());
-      for (size_t k = 0; k < state_vec.size(); k++) {
-        int32 hmm_state = state_vec[k];
+      auto const &state_arc_vec =
+              to_topo_state_list[phone][std::make_pair(pdf_class, self_loop_pdf_class)];
+      KALDI_ASSERT(!state_arc_vec.empty());
+      for (auto const& state_arc_pair: state_arc_vec) {
+        int32 topo_state = state_arc_pair.first,
+            arc_index = state_arc_pair.second;
         for (size_t m = 0; m < pdf_info[phone][j].size(); m++) {
           int32 pdf = pdf_info[phone][j][m].first,
             self_loop_pdf = pdf_info[phone][j][m].second;
-          tuples_.push_back(Tuple(phone, hmm_state, pdf, self_loop_pdf));
+          TransitionIdInfo tuple{.phone = phone, .topo_state = topo_state,
+                .arc_index = arc_index, .pdf_id = pdf, .self_loop_pdf_id = self_loop_pdf};
+          info_.push_back(tuple);
         }
       }
     }
   }
-}
-
-void Transitions::ComputeDerived() {
-  state2id_.resize(tuples_.size()+2);  // indexed by transition-state, which
-  // is one based, but also an entry for one past end of list.
-
-  int32 cur_transition_id = 1;
-  num_pdfs_ = 0;
-  for (int32 tstate = 1;
-      tstate <= static_cast<int32>(tuples_.size()+1);  // not a typo.
-      tstate++) {
-    state2id_[tstate] = cur_transition_id;
-    if (static_cast<size_t>(tstate) <= tuples_.size()) {
-      int32 phone = tuples_[tstate-1].phone,
-          hmm_state = tuples_[tstate-1].hmm_state,
-          forward_pdf = tuples_[tstate-1].forward_pdf,
-          self_loop_pdf = tuples_[tstate-1].self_loop_pdf;
-      num_pdfs_ = std::max(num_pdfs_, 1 + forward_pdf);
-      num_pdfs_ = std::max(num_pdfs_, 1 + self_loop_pdf);
-      const Topology::HmmState &state = topo_.TopologyForPhone(phone)[hmm_state];
-      int32 my_num_ids = static_cast<int32>(state.transitions.size());
-      cur_transition_id += my_num_ids;  // # trans out of this state.
-    }
-  }
 
-  id2state_.resize(cur_transition_id);   // cur_transition_id is #transition-ids+1.
-  id2pdf_id_.resize(cur_transition_id);
-  for (int32 tstate = 1; tstate <= static_cast<int32>(tuples_.size()); tstate++) {
-    for (int32 tid = state2id_[tstate]; tid < state2id_[tstate+1]; tid++) {
-      id2state_[tid] = tstate;
-      if (IsSelfLoop(tid))
-        id2pdf_id_[tid] = tuples_[tstate-1].self_loop_pdf;
-      else
-        id2pdf_id_[tid] = tuples_[tstate-1].forward_pdf;
-    }
-  }
-
-  // The following statements put copies a large number in the region of memory
-  // past the end of the id2pdf_id_ array, while leaving the aray as it was
-  // before.  The goal of this is to speed up decoding by disabling a check
-  // inside TransitionIdToPdf() that the transition-id was within the correct
-  // range.
-  int32 num_big_numbers = std::min<int32>(2000, cur_transition_id);
-  id2pdf_id_.resize(cur_transition_id + num_big_numbers,
-                    std::numeric_limits<int32>::max());
-  id2pdf_id_.resize(cur_transition_id);
-}
-
-void Transitions::InitializeProbs() {
-  log_probs_.Resize(NumTransitionIds()+1);  // one-based array, zeroth element empty.
-  for (int32 trans_id = 1; trans_id <= NumTransitionIds(); trans_id++) {
-    int32 trans_state = id2state_[trans_id];
-    int32 trans_index = trans_id - state2id_[trans_state];
-    const Tuple &tuple = tuples_[trans_state-1];
-    const Topology::TopologyEntry &entry = topo_.TopologyForPhone(tuple.phone);
-    KALDI_ASSERT(static_cast<size_t>(tuple.hmm_state) < entry.size());
-    BaseFloat prob = entry[tuple.hmm_state].transitions[trans_index].second;
-    if (prob <= 0.0)
-      KALDI_ERR << "Transitions::InitializeProbs, zero "
-          "probability [should remove that entry in the topology]";
-    if (prob > 1.0)
-      KALDI_WARN << "Transitions::InitializeProbs, prob greater than one.";
-    log_probs_(trans_id) = Log(prob);
-  }
-  ComputeDerivedOfProbs();
+  std::sort(info_.begin(), info_.end());  // sort to enable reverse lookup.
 }
 
-void Transitions::Check() const {
-  KALDI_ASSERT(NumTransitionIds() != 0 && NumTransitionStates() != 0);
-  {
-    int32 sum = 0;
-    for (int32 ts = 1; ts <= NumTransitionStates(); ts++) sum += NumTransitionIndices(ts);
-    KALDI_ASSERT(sum == NumTransitionIds());
-  }
-  for (int32 tid = 1; tid <= NumTransitionIds(); tid++) {
-    int32 tstate = TransitionIdToTransitionState(tid),
-        index = TransitionIdToTransitionIndex(tid);
-    KALDI_ASSERT(tstate > 0 && tstate <=NumTransitionStates() && index >= 0);
-    KALDI_ASSERT(tid == PairToTransitionId(tstate, index));
-    int32 phone = TransitionStateToPhone(tstate),
-        hmm_state = TransitionStateToHmmState(tstate),
-        forward_pdf = TransitionStateToForwardPdf(tstate),
-        self_loop_pdf = TransitionStateToSelfLoopPdf(tstate);
-    KALDI_ASSERT(tstate == TupleToTransitionState(phone, hmm_state, forward_pdf, self_loop_pdf));
-    KALDI_ASSERT(log_probs_(tid) <= 0.0 && log_probs_(tid) - log_probs_(tid) == 0.0);
-    // checking finite and non-positive (and not out-of-bounds).
-  }
-}
+void Transitions::ComputeDerived() {
+  pdf_ids_.resize(info_.size());
+  for (int32 tid = 1; tid <= NumTransitionIds(); ++tid) {
+    auto &transition = info_[tid];  // reference: fields set below live in info_.
+    auto const &entry = topo_.TopologyForPhone(transition.phone);  // an FST
+    fst::ArcIterator<fst::StdVectorFst> aiter(entry, transition.topo_state);
+    aiter.Seek(transition.arc_index);
+    auto const &arc(aiter.Value());
+
+    transition.is_self_loop = (arc.nextstate == transition.topo_state);
+    transition.is_initial = (transition.topo_state == 0);
+    transition.is_final = (entry.Final(arc.nextstate) != fst::StdFst::Weight::Zero());
+    transition.transition_cost = arc.weight.Value();
+    if (transition.self_loop_pdf_id == -1)
+      transition.self_loop_transition_id = -1;
+    else
+      transition.self_loop_transition_id =
+          TupleToTransitionId(transition.phone, transition.topo_state,
+                              transition.arc_index, transition.self_loop_pdf_id,
+                              transition.self_loop_pdf_id);
 
-bool Transitions::IsHmm() const {
-  const std::vector<int32> &phones = topo_.GetPhones();
-  KALDI_ASSERT(!phones.empty());
-  for (size_t i = 0; i < phones.size(); i++) {
-    int32 phone = phones[i];
-    const Topology::TopologyEntry &entry = topo_.TopologyForPhone(phone);
-    for (int32 j = 0; j < static_cast<int32>(entry.size()); j++) {  // for each state...
-      if (entry[j].forward_pdf_class != entry[j].self_loop_pdf_class)
-        return false;
-    }
+    pdf_ids_[tid] = transition.pdf_id;
   }
-  return true;
 }
 
-Transitions::TransitionModel(const ContextDependencyInterface &ctx_dep,
-                                 const Topology &hmm_topo): topo_(hmm_topo) {
+Transitions::Transitions(const ContextDependencyInterface &ctx_dep,
+                             const Topology &topo): topo_(topo) {
   // First thing is to get all possible tuples.
-  ComputeTuples(ctx_dep);
+  ComputeInfo(ctx_dep);
   ComputeDerived();
-  InitializeProbs();
   Check();
 }
 
-int32 Transitions::TupleToTransitionState(int32 phone, int32 hmm_state, int32 pdf, int32 self_loop_pdf) const {
-  Tuple tuple(phone, hmm_state, pdf, self_loop_pdf);
+int32 Transitions::TupleToTransitionId(int32 phone, int32 topo_state,
+                                       int32 arc_index, int32 pdf_id,
+                                       int32 self_loop_pdf_id) const {
+  TransitionIdInfo tuple{.phone = phone, .topo_state = topo_state,
+        .arc_index = arc_index, .pdf_id = pdf_id, .self_loop_pdf_id = self_loop_pdf_id};
   // Note: if this ever gets too expensive, which is unlikely, we can refactor
   // this code to sort first on pdf, and then index on pdf, so those
   // that have the same pdf are in a contiguous range.
-  std::vector<Tuple>::const_iterator iter =
-      std::lower_bound(tuples_.begin(), tuples_.end(), tuple);
-  if (iter == tuples_.end() || !(*iter == tuple)) {
-    KALDI_ERR << "Transitions::TupleToTransitionState, tuple not found."
-              << " (incompatible tree and model?)";
-  }
-  // tuples_ is indexed by transition_state-1, so add one.
-  return static_cast<int32>((iter - tuples_.begin())) + 1;
-}
-
-
-int32 Transitions::NumTransitionIndices(int32 trans_state) const {
-  KALDI_ASSERT(static_cast<size_t>(trans_state) <= tuples_.size());
-  return static_cast<int32>(state2id_[trans_state+1]-state2id_[trans_state]);
-}
-
-int32 Transitions::TransitionIdToTransitionState(int32 trans_id) const {
-  KALDI_ASSERT(trans_id != 0 &&  static_cast<size_t>(trans_id) < id2state_.size());
-  return id2state_[trans_id];
-}
-
-int32 Transitions::TransitionIdToTransitionIndex(int32 trans_id) const {
-  KALDI_ASSERT(trans_id != 0 && static_cast<size_t>(trans_id) < id2state_.size());
-  return trans_id - state2id_[id2state_[trans_id]];
-}
-
-int32 Transitions::TransitionStateToPhone(int32 trans_state) const {
-  KALDI_ASSERT(static_cast<size_t>(trans_state) <= tuples_.size());
-  return tuples_[trans_state-1].phone;
-}
-
-int32 Transitions::TransitionStateToForwardPdf(int32 trans_state) const {
-  KALDI_ASSERT(static_cast<size_t>(trans_state) <= tuples_.size());
-  return tuples_[trans_state-1].forward_pdf;
-}
-
-int32 Transitions::TransitionStateToForwardPdfClass(
-    int32 trans_state) const {
-  KALDI_ASSERT(static_cast<size_t>(trans_state) <= tuples_.size());
-  const Tuple &t = tuples_[trans_state-1];
-  const Topology::TopologyEntry &entry = topo_.TopologyForPhone(t.phone);
-  KALDI_ASSERT(static_cast<size_t>(t.hmm_state) < entry.size());
-  return entry[t.hmm_state].forward_pdf_class;
-}
+  auto lowerbound = std::lower_bound(info_.begin(), info_.end(), tuple);
+  if (lowerbound == info_.end() || !(*lowerbound == tuple))
+    KALDI_ERR << "Tuple not found. (incompatible tree and model?)";
 
-
-int32 Transitions::TransitionStateToSelfLoopPdfClass(
-    int32 trans_state) const {
-  KALDI_ASSERT(static_cast<size_t>(trans_state) <= tuples_.size());
-  const Tuple &t = tuples_[trans_state-1];
-  const Topology::TopologyEntry &entry = topo_.TopologyForPhone(t.phone);
-  KALDI_ASSERT(static_cast<size_t>(t.hmm_state) < entry.size());
-  return entry[t.hmm_state].self_loop_pdf_class;
-}
-
-
-int32 Transitions::TransitionStateToSelfLoopPdf(int32 trans_state) const {
-  KALDI_ASSERT(static_cast<size_t>(trans_state) <= tuples_.size());
-  return tuples_[trans_state-1].self_loop_pdf;
-}
-
-int32 Transitions::TransitionStateToHmmState(int32 trans_state) const {
-  KALDI_ASSERT(static_cast<size_t>(trans_state) <= tuples_.size());
-  return tuples_[trans_state-1].hmm_state;
-}
-
-int32 Transitions::PairToTransitionId(int32 trans_state, int32 trans_index) const {
-  KALDI_ASSERT(static_cast<size_t>(trans_state) <= tuples_.size());
-  KALDI_ASSERT(trans_index < state2id_[trans_state+1] - state2id_[trans_state]);
-  return state2id_[trans_state] + trans_index;
-}
-
-int32 Transitions::NumPhones() const {
-  int32 num_trans_state = tuples_.size();
-  int32 max_phone_id = 0;
-  for (int32 i = 0; i < num_trans_state; ++i) {
-    if (tuples_[i].phone > max_phone_id)
-      max_phone_id = tuples_[i].phone;
-  }
-  return max_phone_id;
-}
-
-
-bool Transitions::IsFinal(int32 trans_id) const {
-  KALDI_ASSERT(static_cast<size_t>(trans_id) < id2state_.size());
-  int32 trans_state = id2state_[trans_id];
-  int32 trans_index = trans_id - state2id_[trans_state];
-  const Tuple &tuple = tuples_[trans_state-1];
-  const Topology::TopologyEntry &entry = topo_.TopologyForPhone(tuple.phone);
-  KALDI_ASSERT(static_cast<size_t>(tuple.hmm_state) < entry.size());
-  KALDI_ASSERT(static_cast<size_t>(tuple.hmm_state) < entry.size());
-  KALDI_ASSERT(static_cast<size_t>(trans_index) <
-               entry[tuple.hmm_state].transitions.size());
-  // return true if the transition goes to the final state of the
-  // topology entry.
-  return (entry[tuple.hmm_state].transitions[trans_index].first + 1 ==
-          static_cast<int32>(entry.size()));
-}
-
-
-
-int32 Transitions::SelfLoopOf(int32 trans_state) const {  // returns the self-loop transition-id,
-  KALDI_ASSERT(static_cast<size_t>(trans_state-1) < tuples_.size());
-  const Tuple &tuple = tuples_[trans_state-1];
-  // or zero if does not exist.
-  int32 phone = tuple.phone, hmm_state = tuple.hmm_state;
-  const Topology::TopologyEntry &entry = topo_.TopologyForPhone(phone);
-  KALDI_ASSERT(static_cast<size_t>(hmm_state) < entry.size());
-  for (int32 trans_index = 0;
-      trans_index < static_cast<int32>(entry[hmm_state].transitions.size());
-      trans_index++)
-    if (entry[hmm_state].transitions[trans_index].first == hmm_state)
-      return PairToTransitionId(trans_state, trans_index);
-  return 0;  // invalid transition id.
-}
-
-void Transitions::ComputeDerivedOfProbs() {
-  non_self_loop_log_probs_.Resize(NumTransitionStates()+1);  // this array indexed
-  //  by transition-state with nothing in zeroth element.
-  for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) {
-    int32 tid = SelfLoopOf(tstate);
-    if (tid == 0) {  // no self-loop
-      non_self_loop_log_probs_(tstate) = 0.0;  // log(1.0)
-    } else {
-      BaseFloat self_loop_prob = Exp(GetTransitionLogProb(tid)),
-          non_self_loop_prob = 1.0 - self_loop_prob;
-      if (non_self_loop_prob <= 0.0) {
-        KALDI_WARN << "ComputeDerivedOfProbs(): non-self-loop prob is " << non_self_loop_prob;
-        non_self_loop_prob = 1.0e-10;  // just so we can continue...
-      }
-      non_self_loop_log_probs_(tstate) = Log(non_self_loop_prob);  // will be negative.
-    }
-  }
+  return static_cast<int32>((lowerbound - info_.begin()));
 }
 
 void Transitions::Read(std::istream &is, bool binary) {
   ExpectToken(is, binary, "<Transitions>");
   topo_.Read(is, binary);
-  std::string token;
-  ReadToken(is, binary, &token);
+  ExpectToken(is, binary, "<Info>");
   int32 size;
   ReadBasicType(is, binary, &size);
-  tuples_.resize(size);
+  info_.resize(size);
   for (int32 i = 0; i < size; i++) {
-    ReadBasicType(is, binary, &(tuples_[i].phone));
-    ReadBasicType(is, binary, &(tuples_[i].hmm_state));
-    ReadBasicType(is, binary, &(tuples_[i].forward_pdf));
-    if (token == "<Tuples>")
-      ReadBasicType(is, binary, &(tuples_[i].self_loop_pdf));
-    else if (token == "<Triples>")
-      tuples_[i].self_loop_pdf = tuples_[i].forward_pdf;
+    ReadBasicType(is, binary, &(info_[i].phone));
+    ReadBasicType(is, binary, &(info_[i].topo_state));
+    ReadBasicType(is, binary, &(info_[i].arc_index));
+    ReadBasicType(is, binary, &(info_[i].pdf_id));
+    ReadBasicType(is, binary, &(info_[i].self_loop_pdf_id));
   }
-  ReadToken(is, binary, &token);
-  KALDI_ASSERT(token == "</Triples>" || token == "</Tuples>");
-  ComputeDerived();
-  ExpectToken(is, binary, "<LogProbs>");
-  log_probs_.Read(is, binary);
-  ExpectToken(is, binary, "</LogProbs>");
+  ExpectToken(is, binary, "</Info>");
   ExpectToken(is, binary, "</Transitions>");
-  ComputeDerivedOfProbs();
+  ComputeDerived();
   Check();
 }
 
 void Transitions::Write(std::ostream &os, bool binary) const {
-  bool is_hmm = IsHmm();
   WriteToken(os, binary, "<Transitions>");
   if (!binary) os << "\n";
   topo_.Write(os, binary);
-  if (is_hmm)
-    WriteToken(os, binary, "<Triples>");
-  else
-    WriteToken(os, binary, "<Tuples>");
-  WriteBasicType(os, binary, static_cast<int32>(tuples_.size()));
+  WriteToken(os, binary, "<Info>");
+  WriteBasicType(os, binary, static_cast<int32>(info_.size()));
   if (!binary) os << "\n";
-  for (int32 i = 0; i < static_cast<int32> (tuples_.size()); i++) {
-    WriteBasicType(os, binary, tuples_[i].phone);
-    WriteBasicType(os, binary, tuples_[i].hmm_state);
-    WriteBasicType(os, binary, tuples_[i].forward_pdf);
-    if (!is_hmm)
-      WriteBasicType(os, binary, tuples_[i].self_loop_pdf);
+  for (int32 i = 0; i < static_cast<int32> (info_.size()); i++) {
+    WriteBasicType(os, binary, info_[i].phone);
+    WriteBasicType(os, binary, info_[i].topo_state);
+    WriteBasicType(os, binary, info_[i].arc_index);
+    WriteBasicType(os, binary, info_[i].pdf_id);
+    WriteBasicType(os, binary, info_[i].self_loop_pdf_id);
     if (!binary) os << "\n";
   }
-  if (is_hmm)
-    WriteToken(os, binary, "</Triples>");
-  else
-    WriteToken(os, binary, "</Tuples>");
-  if (!binary) os << "\n";
-  WriteToken(os, binary, "<LogProbs>");
-  if (!binary) os << "\n";
-  log_probs_.Write(os, binary);
-  WriteToken(os, binary, "</LogProbs>");
+  WriteToken(os, binary, "</Info>");
   if (!binary) os << "\n";
   WriteToken(os, binary, "</Transitions>");
   if (!binary) os << "\n";
 }
 
-BaseFloat Transitions::GetTransitionProb(int32 trans_id) const {
-  return Exp(log_probs_(trans_id));
-}
-
-BaseFloat Transitions::GetTransitionLogProb(int32 trans_id) const {
-  return log_probs_(trans_id);
-}
-
-BaseFloat Transitions::GetNonSelfLoopLogProb(int32 trans_state) const {
-  KALDI_ASSERT(trans_state != 0);
-  return non_self_loop_log_probs_(trans_state);
-}
-
-BaseFloat Transitions::GetTransitionLogProbIgnoringSelfLoops(int32 trans_id) const {
-  KALDI_ASSERT(trans_id != 0);
-  KALDI_PARANOID_ASSERT(!IsSelfLoop(trans_id));
-  return log_probs_(trans_id) - GetNonSelfLoopLogProb(TransitionIdToTransitionState(trans_id));
-}
-
-// stats are counts/weights, indexed by transition-id.
-void Transitions::MleUpdate(const Vector<double> &stats,
-                                const MleTransitionUpdateConfig &cfg,
-                                BaseFloat *objf_impr_out,
-                                BaseFloat *count_out) {
-  if (cfg.share_for_pdfs) {
-    MleUpdateShared(stats, cfg, objf_impr_out, count_out);
-    return;
-  }
-  BaseFloat count_sum = 0.0, objf_impr_sum = 0.0;
-  int32 num_skipped = 0, num_floored = 0;
-  KALDI_ASSERT(stats.Dim() == NumTransitionIds()+1);
-  for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) {
-    int32 n = NumTransitionIndices(tstate);
-    KALDI_ASSERT(n>=1);
-    if (n > 1) {  // no point updating if only one transition...
-      Vector<double> counts(n);
-      for (int32 tidx = 0; tidx < n; tidx++) {
-        int32 tid = PairToTransitionId(tstate, tidx);
-        counts(tidx) = stats(tid);
-      }
-      double tstate_tot = counts.Sum();
-      count_sum += tstate_tot;
-      if (tstate_tot < cfg.mincount) { num_skipped++; }
-      else {
-        Vector<BaseFloat> old_probs(n), new_probs(n);
-        for (int32 tidx = 0; tidx < n; tidx++) {
-          int32 tid = PairToTransitionId(tstate, tidx);
-          old_probs(tidx) = new_probs(tidx) = GetTransitionProb(tid);
-        }
-        for (int32 tidx = 0; tidx < n; tidx++)
-          new_probs(tidx) = counts(tidx) / tstate_tot;
-        for (int32 i = 0; i < 3; i++) {  // keep flooring+renormalizing for 3 times..
-          new_probs.Scale(1.0 / new_probs.Sum());
-          for (int32 tidx = 0; tidx < n; tidx++)
-            new_probs(tidx) = std::max(new_probs(tidx), cfg.floor);
-        }
-        // Compute objf change
-        for (int32 tidx = 0; tidx < n; tidx++) {
-          if (new_probs(tidx) == cfg.floor) num_floored++;
-          double objf_change = counts(tidx) * (Log(new_probs(tidx))
-                                               - Log(old_probs(tidx)));
-          objf_impr_sum += objf_change;
-        }
-        // Commit updated values.
-        for (int32 tidx = 0; tidx < n; tidx++) {
-          int32 tid = PairToTransitionId(tstate, tidx);
-          log_probs_(tid) = Log(new_probs(tidx));
-          if (log_probs_(tid) - log_probs_(tid) != 0.0)
-            KALDI_ERR << "Log probs is inf or NaN: error in update or bad stats?";
-        }
-      }
-    }
-  }
-  KALDI_LOG << "Transitions::Update, objf change is "
-            << (objf_impr_sum / count_sum) << " per frame over " << count_sum
-            << " frames. ";
-  KALDI_LOG <<  num_floored << " probabilities floored, " << num_skipped
-            << " out of " << NumTransitionStates() << " transition-states "
-      "skipped due to insuffient data (it is normal to have some skipped.)";
-  if (objf_impr_out) *objf_impr_out = objf_impr_sum;
-  if (count_out) *count_out = count_sum;
-  ComputeDerivedOfProbs();
-}
-
-
-// stats are counts/weights, indexed by transition-id.
-void Transitions::MapUpdate(const Vector<double> &stats,
-                                const MapTransitionUpdateConfig &cfg,
-                                BaseFloat *objf_impr_out,
-                                BaseFloat *count_out) {
-  KALDI_ASSERT(cfg.tau > 0.0);
-  if (cfg.share_for_pdfs) {
-    MapUpdateShared(stats, cfg, objf_impr_out, count_out);
-    return;
-  }
-  BaseFloat count_sum = 0.0, objf_impr_sum = 0.0;
-  KALDI_ASSERT(stats.Dim() == NumTransitionIds()+1);
-  for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) {
-    int32 n = NumTransitionIndices(tstate);
-    KALDI_ASSERT(n>=1);
-    if (n > 1) {  // no point updating if only one transition...
-      Vector<double> counts(n);
-      for (int32 tidx = 0; tidx < n; tidx++) {
-        int32 tid = PairToTransitionId(tstate, tidx);
-        counts(tidx) = stats(tid);
-      }
-      double tstate_tot = counts.Sum();
-      count_sum += tstate_tot;
-      Vector<BaseFloat> old_probs(n), new_probs(n);
-      for (int32 tidx = 0; tidx < n; tidx++) {
-        int32 tid = PairToTransitionId(tstate, tidx);
-        old_probs(tidx) = new_probs(tidx) = GetTransitionProb(tid);
-      }
-      for (int32 tidx = 0; tidx < n; tidx++)
-        new_probs(tidx) = (counts(tidx) + cfg.tau * old_probs(tidx)) /
-            (cfg.tau + tstate_tot);
-      // Compute objf change
-      for (int32 tidx = 0; tidx < n; tidx++) {
-        double objf_change = counts(tidx) * (Log(new_probs(tidx))
-                                             - Log(old_probs(tidx)));
-        objf_impr_sum += objf_change;
-      }
-      // Commit updated values.
-      for (int32 tidx = 0; tidx < n; tidx++) {
-        int32 tid = PairToTransitionId(tstate, tidx);
-        log_probs_(tid) = Log(new_probs(tidx));
-        if (log_probs_(tid) - log_probs_(tid) != 0.0)
-          KALDI_ERR << "Log probs is inf or NaN: error in update or bad stats?";
-      }
-    }
-  }
-  KALDI_LOG << "Objf change is " << (objf_impr_sum / count_sum)
-            << " per frame over " << count_sum
-            << " frames.";
-  if (objf_impr_out) *objf_impr_out = objf_impr_sum;
-  if (count_out) *count_out = count_sum;
-  ComputeDerivedOfProbs();
-}
-
-
-
-/// This version of the Update() function is for if the user specifies
-/// --share-for-pdfs=true.  We share the transitions for all states that
-/// share the same pdf.
-void Transitions::MleUpdateShared(const Vector<double> &stats,
-                                      const MleTransitionUpdateConfig &cfg,
-                                      BaseFloat *objf_impr_out,
-                                      BaseFloat *count_out) {
-  KALDI_ASSERT(cfg.share_for_pdfs);
-
-  BaseFloat count_sum = 0.0, objf_impr_sum = 0.0;
-  int32 num_skipped = 0, num_floored = 0;
-  KALDI_ASSERT(stats.Dim() == NumTransitionIds()+1);
-  std::map<int32, std::set<int32> > pdf_to_tstate;
-
-  for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) {
-    int32 pdf = TransitionStateToForwardPdf(tstate);
-    pdf_to_tstate[pdf].insert(tstate);
-    if (!IsHmm()) {
-      pdf = TransitionStateToSelfLoopPdf(tstate);
-      pdf_to_tstate[pdf].insert(tstate);
-    }
-  }
-  std::map<int32, std::set<int32> >::iterator map_iter;
-  for (map_iter = pdf_to_tstate.begin();
-       map_iter != pdf_to_tstate.end();
-       ++map_iter) {
-    // map_iter->first is pdf-id... not needed.
-    const std::set<int32> &tstates = map_iter->second;
-    KALDI_ASSERT(!tstates.empty());
-    int32 one_tstate = *(tstates.begin());
-    int32 n = NumTransitionIndices(one_tstate);
-    KALDI_ASSERT(n >= 1);
-    if (n > 1) { // Only update if >1 transition...
-      Vector<double> counts(n);
-      for (std::set<int32>::const_iterator iter = tstates.begin();
-           iter != tstates.end();
-           ++iter) {
-        int32 tstate = *iter;
-        if (NumTransitionIndices(tstate) != n)
-          KALDI_ERR << "Mismatch in #transition indices: you cannot "
-              "use the --share-for-pdfs option with this topology "
-              "and sharing scheme.";
-        for (int32 tidx = 0; tidx < n; tidx++) {
-          int32 tid = PairToTransitionId(tstate, tidx);
-          counts(tidx) += stats(tid);
-        }
-      }
-      double pdf_tot = counts.Sum();
-      count_sum += pdf_tot;
-      if (pdf_tot < cfg.mincount) { num_skipped++; }
-      else {
-        // Note: when calculating objf improvement, we
-        // assume we previously had the same tying scheme so
-        // we can get the params from one_tstate and they're valid
-        // for all.
-        Vector<BaseFloat> old_probs(n), new_probs(n);
-        for (int32 tidx = 0; tidx < n; tidx++) {
-          int32 tid = PairToTransitionId(one_tstate, tidx);
-          old_probs(tidx) = new_probs(tidx) = GetTransitionProb(tid);
-        }
-        for (int32 tidx = 0; tidx < n; tidx++)
-          new_probs(tidx) = counts(tidx) / pdf_tot;
-        for (int32 i = 0; i < 3; i++) {  // keep flooring+renormalizing for 3 times..
-          new_probs.Scale(1.0 / new_probs.Sum());
-          for (int32 tidx = 0; tidx < n; tidx++)
-            new_probs(tidx) = std::max(new_probs(tidx), cfg.floor);
-        }
-        // Compute objf change
-        for (int32 tidx = 0; tidx < n; tidx++) {
-          if (new_probs(tidx) == cfg.floor) num_floored++;
-          double objf_change = counts(tidx) * (Log(new_probs(tidx))
-                                               - Log(old_probs(tidx)));
-          objf_impr_sum += objf_change;
-        }
-        // Commit updated values.
-        for (std::set<int32>::const_iterator iter = tstates.begin();
-             iter != tstates.end();
-             ++iter) {
-          int32 tstate = *iter;
-          for (int32 tidx = 0; tidx < n; tidx++) {
-            int32 tid = PairToTransitionId(tstate, tidx);
-            log_probs_(tid) = Log(new_probs(tidx));
-            if (log_probs_(tid) - log_probs_(tid) != 0.0)
-              KALDI_ERR << "Log probs is inf or NaN: error in update or bad stats?";
-          }
-        }
-      }
-    }
-  }
-  KALDI_LOG << "Objf change is " << (objf_impr_sum / count_sum)
-            << " per frame over " << count_sum << " frames; "
-            << num_floored << " probabilities floored, "
-            << num_skipped << " pdf-ids skipped due to insuffient data.";
-  if (objf_impr_out) *objf_impr_out = objf_impr_sum;
-  if (count_out) *count_out = count_sum;
-  ComputeDerivedOfProbs();
-}
-
-
-/// This version of the MapUpdate() function is for if the user specifies
-/// --share-for-pdfs=true.  We share the transitions for all states that
-/// share the same pdf.
-void Transitions::MapUpdateShared(const Vector<double> &stats,
-                                      const MapTransitionUpdateConfig &cfg,
-                                      BaseFloat *objf_impr_out,
-                                      BaseFloat *count_out) {
-  KALDI_ASSERT(cfg.share_for_pdfs);
-
-  BaseFloat count_sum = 0.0, objf_impr_sum = 0.0;
-  KALDI_ASSERT(stats.Dim() == NumTransitionIds()+1);
-  std::map<int32, std::set<int32> > pdf_to_tstate;
-
-  for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) {
-    int32 pdf = TransitionStateToForwardPdf(tstate);
-    pdf_to_tstate[pdf].insert(tstate);
-    if (!IsHmm()) {
-      pdf = TransitionStateToSelfLoopPdf(tstate);
-      pdf_to_tstate[pdf].insert(tstate);
-    }
-  }
-  std::map<int32, std::set<int32> >::iterator map_iter;
-  for (map_iter = pdf_to_tstate.begin();
-       map_iter != pdf_to_tstate.end();
-       ++map_iter) {
-    // map_iter->first is pdf-id... not needed.
-    const std::set<int32> &tstates = map_iter->second;
-    KALDI_ASSERT(!tstates.empty());
-    int32 one_tstate = *(tstates.begin());
-    int32 n = NumTransitionIndices(one_tstate);
-    KALDI_ASSERT(n >= 1);
-    if (n > 1) { // Only update if >1 transition...
-      Vector<double> counts(n);
-      for (std::set<int32>::const_iterator iter = tstates.begin();
-           iter != tstates.end();
-           ++iter) {
-        int32 tstate = *iter;
-        if (NumTransitionIndices(tstate) != n)
-          KALDI_ERR << "Mismatch in #transition indices: you cannot "
-              "use the --share-for-pdfs option with this topology "
-              "and sharing scheme.";
-        for (int32 tidx = 0; tidx < n; tidx++) {
-          int32 tid = PairToTransitionId(tstate, tidx);
-          counts(tidx) += stats(tid);
-        }
-      }
-      double pdf_tot = counts.Sum();
-      count_sum += pdf_tot;
-
-      // Note: when calculating objf improvement, we
-      // assume we previously had the same tying scheme so
-      // we can get the params from one_tstate and they're valid
-      // for all.
-      Vector<BaseFloat> old_probs(n), new_probs(n);
-      for (int32 tidx = 0; tidx < n; tidx++) {
-        int32 tid = PairToTransitionId(one_tstate, tidx);
-        old_probs(tidx) = new_probs(tidx) = GetTransitionProb(tid);
-      }
-      for (int32 tidx = 0; tidx < n; tidx++)
-        new_probs(tidx) = (counts(tidx) + old_probs(tidx) * cfg.tau) /
-            (pdf_tot + cfg.tau);
-      // Compute objf change
-      for (int32 tidx = 0; tidx < n; tidx++) {
-        double objf_change = counts(tidx) * (Log(new_probs(tidx))
-                                             - Log(old_probs(tidx)));
-        objf_impr_sum += objf_change;
-      }
-      // Commit updated values.
-      for (std::set<int32>::const_iterator iter = tstates.begin();
-           iter != tstates.end();
-           ++iter) {
-        int32 tstate = *iter;
-        for (int32 tidx = 0; tidx < n; tidx++) {
-          int32 tid = PairToTransitionId(tstate, tidx);
-          log_probs_(tid) = Log(new_probs(tidx));
-          if (log_probs_(tid) - log_probs_(tid) != 0.0)
-            KALDI_ERR << "Log probs is inf or NaN: error in update or bad stats?";
-        }
-      }
-    }
-  }
-  KALDI_LOG << "Objf change is " << (objf_impr_sum / count_sum)
-            << " per frame over " << count_sum
-            << " frames.";
-  if (objf_impr_out) *objf_impr_out = objf_impr_sum;
-  if (count_out) *count_out = count_sum;
-  ComputeDerivedOfProbs();
-}
-
-
-int32 Transitions::TransitionIdToPhone(int32 trans_id) const {
-  KALDI_ASSERT(trans_id != 0 && static_cast<size_t>(trans_id) < id2state_.size());
-  int32 trans_state = id2state_[trans_id];
-  return tuples_[trans_state-1].phone;
-}
-
-int32 Transitions::TransitionIdToPdfClass(int32 trans_id) const {
-  KALDI_ASSERT(trans_id != 0 && static_cast<size_t>(trans_id) < id2state_.size());
-  int32 trans_state = id2state_[trans_id];
-
-  const Tuple &t = tuples_[trans_state-1];
-  const Topology::TopologyEntry &entry = topo_.TopologyForPhone(t.phone);
-  KALDI_ASSERT(static_cast<size_t>(t.hmm_state) < entry.size());
-  if (IsSelfLoop(trans_id))
-    return entry[t.hmm_state].self_loop_pdf_class;
-  else
-    return entry[t.hmm_state].forward_pdf_class;
-}
-
+void Transitions::Check() const {
 
-int32 Transitions::TransitionIdToHmmState(int32 trans_id) const {
-  KALDI_ASSERT(trans_id != 0 && static_cast<size_t>(trans_id) < id2state_.size());
-  int32 trans_state = id2state_[trans_id];
-  const Tuple &t = tuples_[trans_state-1];
-  return t.hmm_state;
 }
 
 void Transitions::Print(std::ostream &os,
                             const std::vector<std::string> &phone_names,
                             const Vector<double> *occs) {
-  if (occs != NULL)
-    KALDI_ASSERT(occs->Dim() == NumPdfs());
-  bool is_hmm = IsHmm();
-  for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) {
-    const Tuple &tuple = tuples_[tstate-1];
-    KALDI_ASSERT(static_cast<size_t>(tuple.phone) < phone_names.size());
-    std::string phone_name = phone_names[tuple.phone];
-
-    os << "Transition-state " << tstate << ": phone = " << phone_name
-       << " hmm-state = " << tuple.hmm_state;
-    if (is_hmm)
-      os << " pdf = " << tuple.forward_pdf << '\n';
-    else
-      os << " forward-pdf = " << tuple.forward_pdf << " self-loop-pdf = "
-         << tuple.self_loop_pdf << '\n';
-    for (int32 tidx = 0; tidx < NumTransitionIndices(tstate); tidx++) {
-      int32 tid = PairToTransitionId(tstate, tidx);
-      BaseFloat p = GetTransitionProb(tid);
-      os << " Transition-id = " << tid << " p = " << p;
-      if (occs != NULL) {
-        if (IsSelfLoop(tid))
-          os << " count of pdf = " << (*occs)(tuple.self_loop_pdf);
-        else
-          os << " count of pdf = " << (*occs)(tuple.forward_pdf);
-      }
-      // now describe what it's a transition to.
-      if (IsSelfLoop(tid)) os << " [self-loop]\n";
-      else {
-        int32 hmm_state = tuple.hmm_state;
-        const Topology::TopologyEntry &entry = topo_.TopologyForPhone(tuple.phone);
-        KALDI_ASSERT(static_cast<size_t>(hmm_state) < entry.size());
-        int32 next_hmm_state = entry[hmm_state].transitions[tidx].first;
-        KALDI_ASSERT(next_hmm_state != hmm_state);
-        os << " [" << hmm_state << " -> " << next_hmm_state << "]\n";
-      }
-    }
-  }
 }
 
 bool GetPdfsForPhones(const Transitions &trans_model,
                       const std::vector<int32> &phones,
                       std::vector<int32> *pdfs) {
-  KALDI_ASSERT(IsSortedAndUniq(phones));
-  KALDI_ASSERT(pdfs != NULL);
-  pdfs->clear();
-  for (int32 tstate = 1; tstate <= trans_model.NumTransitionStates(); tstate++) {
-    if (std::binary_search(phones.begin(), phones.end(),
-             trans_model.TransitionStateToPhone(tstate))) {
-      pdfs->push_back(trans_model.TransitionStateToForwardPdf(tstate));
-      pdfs->push_back(trans_model.TransitionStateToSelfLoopPdf(tstate));
-    }
-  }
-  SortAndUniq(pdfs);
-
-  for (int32 tstate = 1; tstate <= trans_model.NumTransitionStates(); tstate++)
-    if ((std::binary_search(pdfs->begin(), pdfs->end(),
-                          trans_model.TransitionStateToForwardPdf(tstate)) ||
-         std::binary_search(pdfs->begin(), pdfs->end(),
-                          trans_model.TransitionStateToSelfLoopPdf(tstate)))
-       && !std::binary_search(phones.begin(), phones.end(),
-                              trans_model.TransitionStateToPhone(tstate)))
-      return false;
   return true;
 }
 
 bool GetPhonesForPdfs(const Transitions &trans_model,
                      const std::vector<int32> &pdfs,
                      std::vector<int32> *phones) {
-  KALDI_ASSERT(IsSortedAndUniq(pdfs));
-  KALDI_ASSERT(phones != NULL);
-  phones->clear();
-  for (int32 tstate = 1; tstate <= trans_model.NumTransitionStates(); tstate++) {
-    if (std::binary_search(pdfs.begin(), pdfs.end(),
-                           trans_model.TransitionStateToForwardPdf(tstate)) ||
-        std::binary_search(pdfs.begin(), pdfs.end(),
-                           trans_model.TransitionStateToSelfLoopPdf(tstate)))
-      phones->push_back(trans_model.TransitionStateToPhone(tstate));
-  }
-  SortAndUniq(phones);
-
-  for (int32 tstate = 1; tstate <= trans_model.NumTransitionStates(); tstate++)
-    if (std::binary_search(phones->begin(), phones->end(),
-                           trans_model.TransitionStateToPhone(tstate))
-        && !(std::binary_search(pdfs.begin(), pdfs.end(),
-                               trans_model.TransitionStateToForwardPdf(tstate)) &&
-             std::binary_search(pdfs.begin(), pdfs.end(),
-                               trans_model.TransitionStateToSelfLoopPdf(tstate))) )
-      return false;
   return true;
 }
 
-bool Transitions::Compatible(const TransitionModel &other) const {
-  return (topo_ == other.topo_ && tuples_ == other.tuples_ &&
-          state2id_ == other.state2id_ && id2state_ == other.id2state_
-          && num_pdfs_ == other.num_pdfs_);
-}
-
-bool Transitions::IsSelfLoop(int32 trans_id) const {
-  KALDI_ASSERT(static_cast<size_t>(trans_id) < id2state_.size());
-  int32 trans_state = id2state_[trans_id];
-  int32 trans_index = trans_id - state2id_[trans_state];
-  const Tuple &tuple = tuples_[trans_state-1];
-  int32 phone = tuple.phone, hmm_state = tuple.hmm_state;
-  const Topology::TopologyEntry &entry = topo_.TopologyForPhone(phone);
-  KALDI_ASSERT(static_cast<size_t>(hmm_state) < entry.size());
-  return (static_cast<size_t>(trans_index) < entry[hmm_state].transitions.size()
-          && entry[hmm_state].transitions[trans_index].first == hmm_state);
-}
 
 } // End namespace kaldi
diff --git a/src/hmm/transitions.h b/src/hmm/transitions.h
index 72e3b62691c..0162d3f9eba 100644
--- a/src/hmm/transitions.h
+++ b/src/hmm/transitions.h
@@ -19,8 +19,8 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef KALDI_HMM_TRANSITION_MODEL_H_
-#define KALDI_HMM_TRANSITION_MODEL_H_
+#ifndef KALDI_HMM_TRANSITIONS_H_
+#define KALDI_HMM_TRANSITIONS_H_
 
 #include "base/kaldi-common.h"
 #include "util/const-integer-set.h"
@@ -92,7 +92,7 @@ class Transitions {
   /// The class keeps a copy of the Topology object, but not
   /// the ContextDependency object.
   Transitions(const ContextDependencyInterface &ctx_dep,
-                  const Topology &topo);
+              const Topology &topo);
 
 
   /// Constructor that takes no arguments: typically used prior to calling Read.
@@ -111,8 +111,8 @@ class Transitions {
                       // tree and phonetic-context information, etc.)
 
     int32 self_loop_pdf_id;  // The pdf-id associated with the self-loop
-                             // transition (if any) leaving the *destiation*
-                             // state of this arc, or zero if that state has no
+                             // transition (if any) leaving the *destination*
+                             // state of this arc, or -1 if that state has no
                              // self-loop.  Search for (*) above for
                              // explanation.
 
@@ -145,6 +145,7 @@ class Transitions {
     // this arc, if there is one, or 0 if there is no such self-loop.
     int32 self_loop_transition_id;
 
+
     bool operator < (const TransitionIdInfo &other) const {
       if (phone < other.phone) return true;
       else if (phone > other.phone) return false;
@@ -154,8 +155,11 @@ class Transitions {
       else if (pdf_id > other.pdf_id) return false;
       else return (self_loop_pdf_id < other.self_loop_pdf_id);
     }
-    // TODO.  operator == can compare all members.
-    bool operator == (const TransitionIdInfo &other) const;
+    // Compares the serialized (non-derived) members.  TODO: compare derived ones too?
+    bool operator == (const TransitionIdInfo &other) const {
+      return phone == other.phone && topo_state == other.topo_state && arc_index == other.arc_index &&
+          pdf_id == other.pdf_id && self_loop_pdf_id == other.self_loop_pdf_id;
+    }
   };
 
 
@@ -173,7 +177,7 @@ class Transitions {
 
 
   /// Returns the total number of transition-ids (note, these are one-based).
-  inline int32 NumTransitionIds() const { return info_.size()-1; }
+  inline int32 NumTransitionIds() const { return info_.size() - 1; }
 
   // NumPdfs() returns the number of pdfs (pdf-ids) in the tree,
   // as returned by ctx_dep.NumPdfs() for the tree passed to the constructor.
@@ -198,7 +202,7 @@ class Transitions {
   // fields); you then have to call ComputeDerived() to initalize teh rest.
   void ComputeInfo(const ContextDependencyInterface &ctx_dep);
 
-  void ComputeDerived();  // called from constructor and Read function: computes state2id_ and id2state_.
+  void ComputeDerived();  // Called from constructor and Read function.
 
   void Check() const;
 
@@ -207,7 +211,7 @@ class Transitions {
 
   /// Information about transition-ids, indexed by transition-id.
   /// the tuples are in sorted order which allows us to do the reverse mapping from
-  /// tuple to transition state
+  /// tuple to transition id.
   std::vector<TransitionIdInfo> info_;
 
 

From 20c73b5aacefe3b31f178129178508c83f1c6cc3 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sun, 31 Mar 2019 13:31:25 -0400
Subject: [PATCH 016/163] [src] Kaldi10 changes: remove vector strides, more
 tensor progress. (#3188)

* [src] progress on tensor code

* [src] Remove stride support for Vector (more trouble than it was worth)

* [src] More Tensor drafts

* [src] More tensor drafts

From 885249af577337db3fc590064a95ba25942c205a Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sun, 31 Mar 2019 13:42:45 -0400
Subject: [PATCH 017/163] [src] Kaldi10, more tensor progress (#3189)

* Further tensor progress

* [src] More Tensor progress

* [src] More tensor progress
---
 src/hmm/transitions.h                   |   2 +-
 src/tensor/cpu-impl-linear.cc           |   4 +-
 src/tensor/cpu-impl-linear.h            |  13 +-
 src/tensor/gpu-impl-linear.h            |  61 ++++
 src/tensor/storage.cc                   |  29 ++
 src/tensor/storage.h                    |   2 +
 src/tensor/tensor-common.h              | 101 +++++-
 src/tensor/tensor-functions.cc          |   2 +-
 src/tensor/tensor-functions.h           | 286 ++++++++++-------
 src/tensor/tensor-impl-linear.h         |  48 +--
 src/tensor/tensor-impl-utils.h          | 133 ++++++++
 src/tensor/tensor-impl.h                |  72 +++++
 src/tensor/tensor-pattern-utils-test.cc |  85 ++++++
 src/tensor/tensor-pattern-utils.cc      | 390 ++++++++++++++++--------
 src/tensor/tensor-pattern-utils.h       | 388 ++++++++++++++++-------
 src/tensor/tensor-pattern.cc            |  59 ++--
 src/tensor/tensor-pattern.h             | 151 ++-------
 src/tensor/tensor-utils.h               | 108 ++++---
 src/tensor/tensor.h                     | 200 ++++++------
 19 files changed, 1460 insertions(+), 674 deletions(-)
 create mode 100644 src/tensor/gpu-impl-linear.h
 create mode 100644 src/tensor/storage.cc
 create mode 100644 src/tensor/tensor-impl-utils.h
 create mode 100644 src/tensor/tensor-impl.h
 create mode 100644 src/tensor/tensor-pattern-utils-test.cc

diff --git a/src/hmm/transitions.h b/src/hmm/transitions.h
index 0162d3f9eba..0909189b88b 100644
--- a/src/hmm/transitions.h
+++ b/src/hmm/transitions.h
@@ -70,7 +70,7 @@ namespace kaldi {
 //                   the neural net output.
 // (*)self-loop-pdf-id:  The pdf-id associated with the self-loop of this state,
 //                   if there is one (we do not allow >1), or -1 if there is no
-//                   self-loop.  This will be the same as pdf-id' if this transition
+//                   self-loop.  This will be the same as 'pdf-id' if this transition
 //                   *is* the self-loop.  It might seem odd that we require this
 //                   to get the transition-id for a non-self-loop arc; the reason
 //                   why it's necessary is that we initially create the graph
diff --git a/src/tensor/cpu-impl-linear.cc b/src/tensor/cpu-impl-linear.cc
index 7b659841395..e8522664f0d 100644
--- a/src/tensor/cpu-impl-linear.cc
+++ b/src/tensor/cpu-impl-linear.cc
@@ -1,4 +1,4 @@
-// tensor/cpu-impl.cc
+// tensor/cpu-impl-linear.cc
 
 // Copyright      2019  Johns Hopkins University (author: Daniel Povey)
 
@@ -17,7 +17,7 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#include "tensor/cpu-impl.h"
+#include "tensor/cpu-impl-linear.h"
 
 
 namespace kaldi {
diff --git a/src/tensor/cpu-impl-linear.h b/src/tensor/cpu-impl-linear.h
index 3ccc92c34c1..90ea3ad48da 100644
--- a/src/tensor/cpu-impl-linear.h
+++ b/src/tensor/cpu-impl-linear.h
@@ -1,4 +1,4 @@
-// tensor/cpu-impl.h
+// tensor/cpu-impl-linear.h
 
 // Copyright      2019  Johns Hopkins University (author: Daniel Povey)
 
@@ -17,12 +17,17 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef KALDI_TENSOR_CPU_IMPL_H_
-#define KALDI_TENSOR_CPU_IMPL_H_ 1
+#ifndef KALDI_TENSOR_CPU_IMPL_LINEAR_H_
+#define KALDI_TENSOR_CPU_IMPL_LINEAR_H_ 1
 
 #include "tensor/tensor.h"
 
 
+// This header actually contains implementations of functions that are required
+// by tensor-impl-linear.cc.  It should not be included by users of this
+// library.
+
+
 namespace kaldi {
 namespace tensor {
 
@@ -63,4 +68,4 @@ AddProductScalar3Cpu(alpha, beta, a, b, c);
 }  // namespace kaldi
 
 
-#endif  // KALDI_TENSOR_CPU_IMPL_H_
+#endif  // KALDI_TENSOR_CPU_IMPL_LINEAR_H_
diff --git a/src/tensor/gpu-impl-linear.h b/src/tensor/gpu-impl-linear.h
new file mode 100644
index 00000000000..3f6cd479db8
--- /dev/null
+++ b/src/tensor/gpu-impl-linear.h
@@ -0,0 +1,61 @@
+// tensor/gpu-impl-linear.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_GPU_IMPL_LINEAR_H_
+#define KALDI_TENSOR_GPU_IMPL_LINEAR_H_ 1
+
+#include "tensor/tensor.h"
+
+
+// This header actually contains implementations of functions that are required
+// by tensor-impl-linear.cc.  It should not be included by users of this
+// library.
+
+
+namespace kaldi {
+namespace tensor {
+
+
+// Scalar case of  c := alpha * (a * b) + beta * c,  using one element of type
+// Real from each of a, b and c.  'c' points to a const TensorImpl because only
+// the data it refers to is written, not the TensorImpl itself.
+template <typename Real>
+inline static void AddProductScalar3GPU(
+    float alpha, float beta,
+    const TensorImpl &a, const TensorImpl &b, const TensorImpl *c) {
+  // TODO: make this actually work on GPU, probably by calling the 1-d vector version.
+  // 'a' and 'b' are references, so their members are accessed with '.'.
+  const Real *a_data = static_cast<const Real*>(a.data),
+      *b_data = static_cast<const Real*>(b.data);
+  Real *c_data = static_cast<Real*>(c->data);
+  const Real product = *a_data * *b_data;
+  if (beta != 0.0) {
+    *c_data = (beta * *c_data) + alpha * product;
+  } else {  // beta == 0: *c_data is never read, so NaN/inf is not propagated
+    *c_data = alpha * product;
+  }
+}
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_GPU_IMPL_LINEAR_H_
diff --git a/src/tensor/storage.cc b/src/tensor/storage.cc
new file mode 100644
index 00000000000..494a3347382
--- /dev/null
+++ b/src/tensor/storage.cc
@@ -0,0 +1,29 @@
+// tensor/storage.cc
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tensor/storage.h"
+
+
+namespace kaldi {
+namespace tensor {
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/storage.h b/src/tensor/storage.h
index 434880f5192..936abded13d 100644
--- a/src/tensor/storage.h
+++ b/src/tensor/storage.h
@@ -20,8 +20,10 @@
 #ifndef KALDI_TENSOR_STORAGE_H_
 #define KALDI_TENSOR_STORAGE_H_ 1
 
+#include <functional>
 #include "tensor/tensor-common.h"
 
+
 namespace kaldi {
 namespace tensor {
 
diff --git a/src/tensor/tensor-common.h b/src/tensor/tensor-common.h
index 7a585d45fde..268fd1721f9 100644
--- a/src/tensor/tensor-common.h
+++ b/src/tensor/tensor-common.h
@@ -22,6 +22,7 @@
 
 #include <cstdint>
 #include <vector>
+#include <string>
 
 /**
    This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
@@ -37,10 +38,10 @@ typedef uint32_t uint32;
 
 
 
-enum {
+enum DeviceType {
   kCpuDevice = 0,
   kCudaDevice = 1
-} DeviceType;
+};
 
 
 // We may later add a device number (like which GPU we are using),
@@ -58,6 +59,28 @@ struct Device {
 };
 
 
+Device GetDefaultDevice();
+void SetDefaultDevice(Device device);
+
+class WithDeviceAs {
+  // Scoped guard: the constructor installs 'device' as the default device and
+  // the destructor restores the previous default.  The guard must be a *named*
+  // object; an unnamed temporary is destroyed at the end of its own statement.
+  // Example:
+  //   { WithDeviceAs guard(device);  /* this block uses 'device' by default */ }
+ public:
+  inline WithDeviceAs(Device device):
+      prev_default_(GetDefaultDevice()) {
+    SetDefaultDevice(device);
+  }
+  ~WithDeviceAs() { SetDefaultDevice(prev_default_); }
+
+ private:
+  Device prev_default_;
+};
+
+
+
 enum DataType {
   // We will of course later extend this with many more types, including
   // integer types and half-precision floats.
@@ -66,16 +89,41 @@ enum DataType {
 };
 
 
+DataType GetDefaultDtype();  // getter paired with SetDefaultDtype(); see WithDtypeAs
+void SetDefaultDtype(DataType dtype);
+
+class WithDtypeAs {
+  // Scoped guard: the constructor installs 'dtype' as the default data type and
+  // the destructor restores the previous default.  The guard must be a *named*
+  // object; an unnamed temporary is destroyed at the end of its own statement.
+  // Example:
+  //   { WithDtypeAs guard(kDoubleDtype);  /* this block defaults to double */ }
+ public:
+  inline WithDtypeAs(DataType dtype):
+      prev_default_(GetDefaultDtype()) {
+    SetDefaultDtype(dtype);
+  }
+  ~WithDtypeAs() { SetDefaultDtype(prev_default_); }
+
+ private:
+  DataType prev_default_;
+};
+
+
+
 
 /// Enumeration that says what strides we should choose when allocating
 /// A Tensor.
 enum StridePolicy {
-  kCopyStrides,  // means: copy the strides from the source Tensor, preserving
-                 //  their signs and relative ordering (but filling in gaps if
-                 //  the source Tensor's data was not contiguous.
+  kCopyStrideOrder,  // means: copy the size-ordering of the strides from the
+                     // source Tensor (they will all be positive even if some of
+                     // the source Tensor's strides were negative).
   kCstrides      // means: strides for dimensions that are != 1 are ordered from
                  // greatest to smallest as in a "C" array.  Per our policy,
                  // any dimension that is 1 will have a zero stride.
+
+  // We may later add options for Fortran-style striding and for the sign of the
+  // source Tensor's strides, as well as their order, to be copied.
 };
 
 /// Enumeration that says whether to zero a freshly initialized Tensor.
@@ -84,16 +132,39 @@ enum InitializePolicy {
   kUninitialized
 };
 
-/// This enumeration with one value is used in the constructor of Tensor,
-/// so if you do:
-///  `Tensor a;  Tensor b(a, kUntrackedStorage);`
-/// it will not copy the 'storage' pointer like it normallly would.
-/// This is useful as an optimization that avoids atomics with
-/// std::shared_ptr, for temporary Tensors in situations where we
-/// know the Tensor we are copying from is not going out of scope
-/// for the lifetime of the temporary.
-enum TensorStorageEnum {
-  kUntrackedStorage
+
+
+/// This enumeration value lists the unary functions that we might
+/// want to apply to Tensors; it exists so that much of the glue
+/// code can be templated.
+enum UnaryFunctionEnum {
+  kUnaryFunctionExp,
+  kUnaryFunctionLog,
+  kUnaryFunctionRelu,
+  kUnaryFunctionInvert,
+  kUnaryFunctionSquare
+  // TODO: add more.
+};
+
+
+/// This enumeration value lists the unary function taking a single scalar arg
+/// that we might want to apply to Tensors; it exists so that much of the glue
+/// code can be templated.
+enum UnaryFunction1ScalarArgEnum {
+  kUnaryFunctionFloor,
+  kUnaryFunctionCeiling
+};
+
+
+/// This enumeration value lists the binary functions that we might
+/// want to apply to Tensors; it exists so that much of the glue
+/// code can be templated.  (Note: multiplication is not counted
+/// here; that is a special case as it will generally go to BLAS).
+enum BinaryFunctionEnum {
+  kBinaryFunctionAdd,
+  kBinaryFunctionDivide,
+  kBinaryFunctionMax,
+  kBinaryFunctionMin
 };
 
 
diff --git a/src/tensor/tensor-functions.cc b/src/tensor/tensor-functions.cc
index dcc89f022f4..5611becbd49 100644
--- a/src/tensor/tensor-functions.cc
+++ b/src/tensor/tensor-functions.cc
@@ -17,7 +17,7 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#include "tensor/tensor-pattern.h"
+#include "tensor/tensor.h"
 
 
 namespace kaldi {
diff --git a/src/tensor/tensor-functions.h b/src/tensor/tensor-functions.h
index 9f0c0c8a5e6..0584651847f 100644
--- a/src/tensor/tensor-functions.h
+++ b/src/tensor/tensor-functions.h
@@ -40,35 +40,124 @@ namespace tensor {
 // Sets all elements of the tensor to zero.
 void SetZero(const Tensor *tensor);
 
-// Sets all elements of the tensor to value f (cast to whatever type
-// this Tensor has).
-void SetZero(float f, const Tensor *tensor);
+// Sets all elements of the tensor to value f (cast to whatever type this Tensor
+// has).
+void Set(float f, const Tensor *tensor);
 
 
 // Return a transposed version of this Tensor that shares the underlying memory.
 Tensor Transpose(const Tensor &tensor, int64_t axis1 = 0, int64_t axis2 = 1);
 
 /**
-   Copy the data from tensor 'src' to tensor 'dest'.  Does not change the tensor
-   metadata, but does change the data underlying the Tensor 'dest'.
+   Copy the data from tensor 'src' to tensor 'dest', allowing broadcasting
+   (so a dim of src can be 1 while the corresponding dim of 'dest' is >1).
+   Requires Broadcastable(src, *dest, true).
+
+   Does not require that the Dtype() or Device() of src and dest be the same
+   (i.e. does not require Compatible(src, *dest)).  This is the only way in
+   which Copy() is more general than Add(); otherwise, what Copy() does is a
+   strict subset of what Add(1.0, 0.0, ...)  can do.
+*/
+void Copy(const Tensor &src, const Tensor *dest);
+
+
+
+/**
+   Template used to implement unary functions such as Log, Relu, and
+   so on (this avoids boilerplate).
+
+   Implements dest = F(src), where the F is applied elementwise.
+
+     @param [in] src  Source Tensor
+     @param [out] dest  Destination Tensor.  We require
+                       SameDim(src, *dest).  May be the same
+                       Tensor as 'src' (but must not partially
+                       overlap in memory with 'src').
+
+ */
+template <UnaryFunctionEnum F>
+void UnaryFunctionTpl(const Tensor &src, const Tensor *dest);
+
+
+/*
+   Implements *dest = exp(src), applied elementwise.
+
+     @param [in] src  Source Tensor
+     @param [out] dest  Destination Tensor.  We require
+                       SameDim(src, *dest).  May be the same
+                       Tensor as 'src' (but must not partially
+                       overlap in memory with 'src').
+ */
+inline void Exp(const Tensor &src, const Tensor *dest) {
+  UnaryFunctionTpl<kUnaryFunctionExp>(src, dest);
+}
+
+// TODO: other unary function wrappers.
+
+
+
+/**
+   Template used to implement binary functions such as division,
+   taking to a power, max.
+
+   Implements c = F(a, b), where F is some function of two scalars
+   that returns a scalar.
+
+     @param [in] a  First source Tensor
+     @param [in] b  Second source Tensor
+     @param [out] c   Destination Tensor.  We require SameDim(a, b, c).
+                     'c' does not have to be initialized on entry and
+                     is allowed to be the same Tensor as one of a or b.
+ */
+template <BinaryFunctionEnum F>
+void BinaryFunctionTpl(const Tensor &a, const Tensor &b, const Tensor *c);
+
+
+
+
+/**
+   Implements c = a / b, applied elementwise.
+
+     @param [in] a  First source Tensor
+     @param [in] b  Second source Tensor (read-only, hence a const reference)
+     @param [out] c   Destination Tensor.  We require SameDim(a, b, c).
+                    'c' does not have to be initialized on entry and
+                    is allowed to be the same Tensor as one of a or b.
+ */
+inline void Div(const Tensor &a, const Tensor &b, const Tensor *c) {
+  BinaryFunctionTpl<kBinaryFunctionDivide>(a, b, c);
+}
 
-   Requires that src.Dims() an dest->Dims() be compatible, meaning that they are
-   the same, except it's OK for a dim of 'dest' to be 1 and a dim of 'src' to be
-   >1; in such cases, we will broadcast the element from 'src' across the larger
-   dimension of 'dest'.
+// TODO: more binary functions.
 
-   Does not require that the Dtype() or Device() of src and dest be the
-   same.
+
+
+
+
+
+/**
+   Does
+
+      dest := alpha * src  +  beta * dest
+
+   while supporting broadcasting and summation, as dictated by the shapes
+   of src and dest.  If beta == 0, guarantees that NaN's or inf's will
+   not be propagated from the original data in 'dest' (so it works with
+   uninitialized 'dest' if beta == 0).
+
+   Requires Broadcastable(src, *dest) and Compatible(src, *dest).
+   If src and dest have an integer Dtype, alpha and beta will
+   be cast to integers before the operation.
 */
-void CopyData(const Tensor &src, const Tensor *dest);
+void Add(float alpha, float beta, const Tensor &src, const Tensor *dest);
 
 /**
-  Construct, if possible, a Tensor that is a view into 'src' with the
-  requested dimensions.
+  If possible, modifies the Tensor metadata to have the requested
+  dimensions.
 
   The semantics are based on those of PyTorch's "view" or NumPy's
-  "reshape", except we try to be more agnostic about the striding
-  of the input.
+  "reshape", except we try to be more accepting regarding the
+  acceptable striding of the input (see below).
 
   Consider a Tensor 'a' has "C"-style strides.  Then this function will return
   Tensor (say, 'b') that interprets the raw data of 'a' as an array with
@@ -78,85 +167,91 @@ void CopyData(const Tensor &src, const Tensor *dest);
   Now consider a Tensor 'a2' that does not have "C"-style strides but
   has the same elements as 'a' in the sense that a(i,j,k) == a2(i,j,k).
   Then, *if possible*, this function will return a matrix b2 with
-  the same elements as b, e.g. b2(i,j,k) == b(i,j,k).
+  the same elements as b, e.g. b2(i,j,k) == b(i,j,k).  Of course, whether
+  this is possible depends on the details of the strides involved.
 
   This function returns NULL if such a tensor could not be constructed.  In that
-  case, likely what you will want to do is to construct a temporary Tensor from
-  'src' with the same dimensions but "C"-style strides (see the constructor of
-  Tensor that accepts the 'dims' parameter).  You may then call View() on that
-  temporary Tensor, which is guaranteed to succeed.
-
-     @param   [in] src   The source Tensor, whose data is to be
-                         reinterpreted.
-     @param   [in] dims  The dimensions that we want for the returned
-                       Tensor; its product must equal src.NumElements().
-     @param   [out] dest  If the view could be constructed, this function
-               make 'dest' a view of the data in 'src' with the requested dims;
-               otherwise 'dest' will be unchanged.
-     @return   Returns true if this view could be constructed. If
-               src.HasCStrides() is true, this function will never return
-               false.
+  case (signalled by a 'false' return value), see the @return notes below.
+
+     @param   [in] dims  The dimensions that we want the Tensor to have at
+                       exit; their product must equal t->NumElements().
+     @param   [in,out] t   The Tensor whose metadata is to be changed
+
+     @return  Returns true if it was possible to construct such a view, and
+              false otherwise.  If t->HasCStrides() is true at entry,
+              this function will never return false.  If this function returns
+              false, you will likely want to construct a temporary Tensor from t
+              with the same dimensions but "C"-style strides (see the
+              constructor of Tensor that accepts the 'dims' parameter), and copy
+              the data from t to that new Tensor.  You may then call View() on
+              the temporary Tensor, which is guaranteed to succeed.
+
+     Example:
+<code>
+    Tensor a({90}, kFloatDtype, kCpuDevice);
+    Tensor b(a);
+    bool ans = View({9,5,2}, &b);
+    KALDI_ASSERT(ans);
+</code>
  */
-bool View(const Tensor &src, ArrayRef<int64_t> dims, Tensor *dest);
+bool View(ArrayRef<int32> dims, Tensor *t);
 
 
 /**
-   Returns a Tensor with a new view of the data in 'src', in which the axes
-   numbered axis1 and axis1 + 1 are merged.  This is just a special case
-   of View().
-
-   For example, if 'src' is a Tensor with dims (3,4,5) and you call
-   MergeAxes(src, 1), this funtion will merge axes 1 and 2 and return a Tensor
-   with shape (3,20).  The order of the elements in the second axis of the
-   result is required to be what you would expect if the layout as as a
-   "C" array (so: 4 blocks of 5 elements, and not vice versa).  This
-   is a common special case of what the function 'View' can give you.
-
-   If the pattern of 'src' makes the requested merging impossible,
-   this function will return NULL.  (This will happen if, in the
-   Tensor 'src', stride[axis1+1] != stride[axis1] * dim[axis1]).
-
-   If this function returns NULL then the caller will probably want to construct
-   a temporary Tensor 'temp' passing src.Dims() in the constructor, copy the
-   data in 'src' to 'temp', and then call MergeAxes on 'temp'.
-
-       @param [in]  src   The Tensor whose axes we will attempt to
-                          merge
+   Attempts to modify a Tensor to contain a new view of its data, in which the
+   axes numbered axis1 and axis1 + 1 are merged.  This is just a special case of
+   View().
+
+   For example, if 't' is a Tensor with dims (3,4,5) and you call
+   MergeAxes(1, &t), this function will merge axes 1 and 2 and t will, at
+   exit, have shape (3,20), with elements arranged in 4 blocks of 5
+   elements each (i.e. axis 1 having the higher stride).
+
        @param [in] axis1  The index of the first of the two axes which
                           this function will attempt to merge.  Must
-                          be less than src.NumAxes() - 1.
-       @param [out] dest  The Tensor which is written to; on success this
+                          be less than t->NumAxes() - 1.
+       @param [out] t     The Tensor to be modified; on success this
                           will be a Tensor with axes merged as requested,
                           sharing the data of 'src'.  On failure, it will
                           not be changed.
        @return            Returns true on success, false if the axes could
-                          not be merged (e.g., because of the strides not
-                          having the required relationship).
+                          not be merged.  It returns true if and only if
+                        `t->Stride(axis1)==t->Stride(axis1 + 1)*t->Dim(axis1 + 1)`
+
+     Example:
+<code>
+    Tensor a({3,4,5}, kFloatDtype, kCpuDevice);
+    MergeAxes(0, &a);  // a now has dims {12,5}.
+</code>
  */
-bool MergeAxes(const Tensor &src, int64_t axis1, Tensor *dest);
+bool MergeAxes(int32 axis1, Tensor *t);
 
 /**
-   Creates a Tensor in which the axis numbered 'axis' is split into
-   two axes, with dimensions respectively 'dim1' and 'dim2'.  The
-   interpretation will be as for a "C" array; so, for instance,
+   Modifies a Tensor by splitting the axis numbered 'axis' into
+   multiple axes as supplied in the 'dims' array.
+   The interpretation will be as for a "C" array; so, for instance,
    if the dimensions of 'src' were (10,12) and you called
    `SplitAxis(src, 1, 3, 4)` resulting in a Tensor of dimensions
    (10,3,4), the indexes along the original axis of dimension 12 would be
    interpreted as 3 blocks of size 4.  (This is the normal semantics
    of things like NumPy's reshape or PyTorch's view.)
 
-      @param [in] src  The Tensor whose axis is to be split.
       @param [in] axis  The index of the axis to be split; must
                        satisfy `0 <= axis < src.Dims().`
-      @param [in] dim1  First dimension with which to split the axis.
-      @param [in] dim2  Second dimension with which to split the axis.
-                        Must satisfy `dim1 * dim2 == src.Dim(axis)`.
-      @param [out] dest Tensor to be created, with one more axis than 'src',
-                        sharing the same underlying data.
+      @param [in] dims  The dimensions desired in the axes to
+                        replace axis 'axis'.  Their product must
+                        equal the value of `t->Dim(axis)` at
+                        entry.
+      @param [in,out] t   Tensor whose metadata is to be modified
+   Example:
+<code>
+  Tensor a({10,3}, kFloatDtype, kCpuDevice);
+  SplitAxis(0, {2,5}, &a);  // a now has dims {2,5,3}.
+</code>
 */
-void SplitAxis(const Tensor &src, int64_t axis,
-               int64_t dim1, int64_t dim2,
-               Tensor *dest);
+void SplitAxis(int32 axis, ArrayRef<int32> dims, Tensor *t);
+
+
 
 
 
@@ -166,54 +261,25 @@ void SplitAxis(const Tensor &src, int64_t axis,
 
     `c := alpha (a * b)  +  beta c`
 
-   where '*' is elementwise multiplication subject to broadcasting
-   rules.  This does not support reducing operations (see AddProductReducing).
+   where '*' is elementwise multiplication subject to broadcasting rules.  This
+   supports reducing operations, and is the underlying implementation used in
+   things like matrix-matrix or matrix-vector product.
 
    @param [in] alpha  Value that scales a * b
    @param [in] beta   Value that scales the initial value of c
    @param [in] a      First input tensor
-   @param [in] b      Second input tensor; require BroadcastCompatible(a, b)
-   @param [out] c     Tensor to be added to (must already be correctly sized,
-                      and either its data must be initialized to a known
-                      value (if beta != 0) or known to not contain NaN (if
+   @param [in] b      Second input tensor
+   @param [out] c     Tensor to be added to.  We require Broadcastable(a, b, c).
+                      Either its data must be initialized to a known
+                      value (if beta != 0) or it must be known to not contain NaN (if
                       beta == 0).   We require BroadcastCompatible(a, b, c, true).
+                      'c' is const because its metadata is not changed; it is
+                      a pointer as a hint to the user that its data is changed.
  */
 void AddProduct(float alpha, float beta,
-                const Tensor &a, const Tensor &b, Tensor *c);
-
+                const Tensor &a, const Tensor &b, const Tensor *c);
 
 
-/**
-   Does:
-
-    `c := alpha (a * b)  +  beta c`
-
-   where '*' is elementwise multiplication subject to broadcasting
-   rules.  This version supports reducing operations (i.e. it allows
-   'c' to have dim=1 on axes where a and/or b has dim!=1).
-
-   This function actually supports a strict superset of AddProduct(); we
-   separate the functions to make the implementation for AddProduct() simpler,
-   for speed.
-
-   The Tensors do not all have to have the same NumAxes(); they will
-   (internally) be made the same size by padding on the left with trivial axes
-   (dim=1;stride=0) to make them the same size.
-
-   The Tensors need to have the same Dtype() and Device*().
-
-   @param [in] alpha  Value that scales a * b
-   @param [in] beta   Value that scales the initial value of c
-   @param [in] a      First input tensor
-   @param [in] b      Second input tensor; require BroadcastCompatible(a, b)
-   @param [out] c     Tensor to be added to (must already be correctly sized,
-                      and either its data must be initialized to a known
-                      value (if beta != 0) or known to not contain NaN (if
-                      beta == 0).   We require BroadcastCompatible(a, b, c).
- */
-void AddProductReducing(float alpha, float beta,
-                        const SubTensor &a, const SubTensor &b,
-                        SubTensor *c);
 
 
 
diff --git a/src/tensor/tensor-impl-linear.h b/src/tensor/tensor-impl-linear.h
index e1d7ec12145..322225a7e00 100644
--- a/src/tensor/tensor-impl-linear.h
+++ b/src/tensor/tensor-impl-linear.h
@@ -25,46 +25,12 @@
 
 /**
    This header contains basic linear-algebra and copying types of operations
-   on TensorImpl objects.  See also tensor-impl-nonlinearly
- */
+   on TensorImpl objects.  See also tensor-impl-nonlinear.h
+*/
 
 namespace kaldi {
 namespace tensor {
 
-/**
-   Modifies 't' in-place by inserting an axis with (dim=1,stride=0) at the
-a   specified position.
-
-   A negative axis-index i is interpreted (like PyTorch) as (num_axes + 1 - i).
-
-   Showing just the dims in the tensor for some examples:
-
-\verbatim
-    Unsqueeze({3,4}, 0)  -> {1,3,4}
-    Unsqueeze({3,4}, 1)  -> {3,1,4}
-    Unsqueeze({3,4}, 2)  -> {3,4,1}
-    Unsqueeze({3,4}, -1)  -> {3,4,1}
-    Unsqueeze({3,4}, -2)  -> {3,1,4}
-\endverbatim
- */
-void Unsqueeze(TensorImpl *t, int32 axis)
-
-
-/**
-   Modifies 't' in-place by removing an axis with (dim=1,stride=0) from the
-   specified position.  It is an error if 't' did not initially contain
-   such an axis.
-
-   Showing just the dims in the tensor for an example:
-
-\verbatim
-    Unsqueeze({1,3,4}, 0)  -> {3,4}
-    Unsqueeze({3,1,4}, 1)  -> {3,4}
-    Unsqueeze({3,1,4}, 2)  -> [error]
-\endverbatim
- */
-void Squeeze(TensorImpl *t, int32 axis);
-
 
 
 /**
@@ -78,7 +44,8 @@ void Squeeze(TensorImpl *t, int32 axis);
 
    The Tensors do not all have to have the same NumAxes(); they will
    (conceptually) be made the same size by padding on the left with trivial axes
-   (dim=1;stride=0) to make them the same size.
+   (dim=1;stride=0) to make them the same size.  (Physically, we'd pad
+   on the right, since the axes are stored in reversed order).
 
    The Tensors need to have the same Dtype() and Device().
 
@@ -135,8 +102,11 @@ void Add(float alpha, float beta,
    The implementation is just:
 
      Tensor a_tmp(a), c_tmp(c);
-     a_tmp.Unsqueeze(-1);
-     c_tmp.Unsqueeze(-2);
+     Unsqueeze(0, &a_tmp);  // the 0 is in reversed numbering;
+                            // means introduce final dim=1 axis
+     Unsqueeze(1, &c_tmp);   // the 1 is in reversed numbering;
+                            // means introduce penultimate dim=1 axis.
+     Unsqueeze(2, &b_tmp);
      AddProduct(alpha, beta, a_tmp, b, c_tmp);
 
  */
diff --git a/src/tensor/tensor-impl-utils.h b/src/tensor/tensor-impl-utils.h
new file mode 100644
index 00000000000..dbae890a0df
--- /dev/null
+++ b/src/tensor/tensor-impl-utils.h
@@ -0,0 +1,133 @@
+// tensor/tensor-impl-utils.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_IMPL_UTILS_H_
+#define KALDI_TENSOR_IMPL_UTILS_H_ 1
+
+#include "tensor/tensor-impl.h"
+#include "tensor/tensor-patterns-utils.h"
+
+
+/**
+   This header contains utility functions that operate on TensorImpl objects:
+   compatibility checks, storage allocation, and Unsqueeze()/Squeeze().
+ */
+
+namespace kaldi {
+namespace tensor {
+
+
+// This function returns true if a and b have the same dtype
+// and device.  See also Broadcastable().
+inline bool Compatible(const TensorImpl &a, const TensorImpl &b);
+
+
+// This function returns true if a, b and c all have the same dtype
+// and device; equivalent to Compatible(a, b) && Compatible(b, c).
+inline bool Compatible(const TensorImpl &a, const TensorImpl &b,
+                       const TensorImpl &c);
+
+
+/**
+   This function allocates the appropriate storage for the Tensor described
+   in 'impl', and sets its 'data' pointer to the allocated memory address.
+   It returns the address of a newly allocated Storage object which manages
+   the memory location; you will probably want to construct a
+   std::unique_ptr<Storage> from this so that when it goes out of scope,
+   the memory will be freed.
+
+      @param [in,out] impl   The TensorImpl object we are allocating for.
+                      Any previous value of impl->data is ignored and
+                      overwritten.
+                      It is required that the product of dims in
+                      impl->pattern be nonzero (i.e. that the pattern
+                      is initialized to a valid value), and that its
+                      dtype and device values be set.
+      @return         Returns a newly allocated Storage object that
+                      manages this memory block.  When this object is deleted,
+                      the memory block will be deallocated using a
+                      method appropriate for the device.
+
+   This function throws on error.
+
+   See also AllocateTensorDataShared().
+ */
+Storage *AllocateTensorData(TensorImpl *impl);
+
+
+/**
+   This function is as AllocateTensorData(), except that the Storage
+   object returned is allocated via std::make_shared (which involves
+   just one heap allocation, as opposed to two if you constructed
+   the shared_ptr from the Storage* pointer).  See the documentation
+   for AllocateTensorData() for more details.
+ */
+std::shared_ptr<Storage> AllocateTensorDataShared(TensorImpl *impl);
+
+
+
+/**
+   Modifies 't' in-place by inserting an axis with (dim=1,stride=0) at the
+   specified position.  Updates the code.
+
+   A negative axis-index i is interpreted (like PyTorch) as (num_axes + 1 + i).
+
+   Showing just the dims in the tensor for some examples:
+
+\verbatim
+    Unsqueeze({3,4}, 0)  -> {1,3,4}
+    Unsqueeze({3,4}, 1)  -> {3,1,4}
+    Unsqueeze({3,4}, 2)  -> {3,4,1}
+    Unsqueeze({3,4}, -1)  -> {3,4,1}
+    Unsqueeze({3,4}, -2)  -> {3,1,4}
+\endverbatim
+ */
+inline void Unsqueeze(TensorImpl *t, int32 axis) {
+  Unsqueeze(&(t->pattern), axis);  // forwards to the overload taking the pattern
+}
+
+
+/**
+   Modifies 't' in-place by removing an axis with (dim=1,stride=0) from the
+   specified position.  It is an error if 't' did not initially contain
+   such an axis.  This function updates the code.  See also the same-named
+   function that operates on TensorPattern, to which this one forwards.
+
+   Showing just the dims in the tensor for an example:
+
+\verbatim
+    Squeeze({1,3,4}, 0)  -> {3,4}
+    Squeeze({3,1,4}, 1)  -> {3,4}
+    Squeeze({3,1,4}, 2)  -> [error]
+\endverbatim
+ */
+inline void Squeeze(TensorImpl *t, int32 axis) {
+  Squeeze(&(t->pattern), axis);
+}
+
+
+
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_IMPL_UTILS_H_
diff --git a/src/tensor/tensor-impl.h b/src/tensor/tensor-impl.h
new file mode 100644
index 00000000000..ef3ddadceb9
--- /dev/null
+++ b/src/tensor/tensor-impl.h
@@ -0,0 +1,72 @@
+// tensor/tensor-impl.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_TENSOR_IMPL_H_
+#define KALDI_TENSOR_TENSOR_IMPL_H_ 1
+
+#include "tensor/tensor-common.h"
+#include "tensor/tensor-pattern.h"
+
+namespace kaldi {
+namespace tensor {
+
+
+/**
+   TensorImpl is the core part of a Tensor, without the wrapping code and
+   storage management in Tensor.h.  Most of the core implementation deals
+   directly with TensorImpl to avoid the overhead of shared_ptr management
+   and the need to deal with accessors and the like, but TensorImpl
+   is intended for use in the tensor/ directory, to implement Tensor
+   internals, and not for users of this library.
+*/
+struct TensorImpl {
+  TensorPattern pattern;
+  DataType dtype;
+  Device device;
+  void *data{nullptr};  // null by default; IsValid() requires it to be non-null.
+
+  // Returns true if this TensorImpl is valid, false otherwise.  It is an
+  // implied requirement of functions operating on TensorImpl's, that all
+  // TensorImpl's that are provided as input or input+output arguments to
+  // functions must have IsValid() == true.
+  bool IsValid() { return pattern.IsValid(true) && data != nullptr; }
+};
+
+// Metadata for a Tensor.  It's occasionally convenient to have this
+// in a struct (it's the same as a TensorImpl without the 'data' pointer).
+// The members must stay in sync with the corresponding members of
+// TensorImpl, as we have code that does reinterpret_cast on
+// these types.  (We don't use base-classing as it would make the code
+// harder to read).
+struct TensorMeta {
+  TensorPattern pattern;
+  DataType dtype;
+  Device device;
+};
+
+
+// TODO(review): this declaration was left unterminated; it is closed here only
+// so that the header parses.  Its intended contract still needs documenting.
+void Compatible(const TensorImpl &a, TensorImpl &b);
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_TENSOR_IMPL_H_
diff --git a/src/tensor/tensor-pattern-utils-test.cc b/src/tensor/tensor-pattern-utils-test.cc
new file mode 100644
index 00000000000..2c003174a2b
--- /dev/null
+++ b/src/tensor/tensor-pattern-utils-test.cc
@@ -0,0 +1,85 @@
+// tensor/tensor-pattern-utils-test.cc
+
+// Copyright 2009-2011  Microsoft Corporation
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//  http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tensor/tensor-pattern.h"
+#include "tensor/tensor-pattern-utils.h"
+#include "base/kaldi-math.h"
+
+
+namespace kaldi {
+namespace tensor {
+
+// Sets *pattern to a random valid pattern.  We may later move this elsewhere.
+void GenerateRandomPattern(TensorPattern *pattern) {
+  int32 num_axes = RandInt(0, KALDI_TENSOR_MAX_DIM);
+  pattern->num_axes = num_axes;  // read by ComputePatternCode() below.
+
+  // the 'cur_stride' stuff is a mechanism for generating strides that
+  // will satisfy the 'uniqueness' rule; we'll later randomize the
+  // order of axes.
+  int32 cur_stride = 1;
+  for (int32 raxis = 0; raxis < num_axes; raxis++) {
+    int32 dim = RandInt(1, 10);
+    pattern->dims[raxis] = dim;
+    if (dim > 1) {
+      cur_stride *= RandInt(1, 3);
+      pattern->strides[raxis] = cur_stride;
+      cur_stride *= dim;
+    } else {
+      pattern->strides[raxis] = 0;
+    }
+  }
+
+  // Randomly permute the axes.  The 'num_axes > 0' test is needed because
+  // RandInt(0, num_axes - 1) would be given an empty range for a scalar.
+  for (int32 i = 0; num_axes > 0 && i <= num_axes; i++) {
+    int32 raxis1 = RandInt(0, num_axes - 1),
+        raxis2 = RandInt(0, num_axes - 1);
+    std::swap(pattern->dims[raxis1], pattern->dims[raxis2]);
+    std::swap(pattern->strides[raxis1], pattern->strides[raxis2]);
+  }
+  for (int32 raxis = num_axes; raxis < KALDI_TENSOR_MAX_DIM; raxis++) {
+    pattern->dims[raxis] = 1;
+    pattern->strides[raxis] = 0;
+  }
+  pattern->code = ComputePatternCode(*pattern);
+  if (RandInt(0, 1) == 0) {
+    KALDI_ASSERT(pattern->Check());
+  } else {
+    KALDI_ASSERT(pattern->Check(true));
+  }
+}
+
+
+void UnitTestGenRandomPattern() {
+  // The generator asserts on each pattern it makes, so calling it is the test.
+  TensorPattern pattern;
+  for (int32 trial = 0; trial != 100; ++trial)
+    GenerateRandomPattern(&pattern);
+}
+
+}  // namespace tensor
+}  // namespace kaldi
+
+int main(int argc, const char** argv) {
+  // argc/argv are unused; this binary just runs the unit tests in this file.
+  // The call is fully qualified, so no using-directives are needed.
+  kaldi::tensor::UnitTestGenRandomPattern();
+  return 0;
+}
diff --git a/src/tensor/tensor-pattern-utils.cc b/src/tensor/tensor-pattern-utils.cc
index 0440d40bbba..69d9e6d0e82 100644
--- a/src/tensor/tensor-pattern-utils.cc
+++ b/src/tensor/tensor-pattern-utils.cc
@@ -7,78 +7,105 @@
 namespace kaldi {
 namespace tensor {
 
-/**
-   This function returns true if any of the tensor patterns in 'patterns'
-   contains a negative stride.  All patterns are assumed to have the same
-   num-axes.
- */
-static inline bool NegativeStrideExists(ArrayRef<TensorPattern> patterns) {
-  bool ans = false;
-  int32 num_axes = patterns[0]->num_axes;  // required
-  for (size_t p = 0; p < patterns.size; p++) {
-    const TensorPattern &pattern = patterns[p];
-    for (int32 i = 0; i < num_axes; i++) {
-      if (pattern->strides[i] < 0)
-        return true;
+// Returns the code for 'pattern' (the bit layout is documented in the header).
+int32 ComputePatternCode(const TensorPattern &pattern) {
+  int32 ans = 0;
+  int32 n = 0;
+  // n is going to be:
+  // n = 0 if no axis had stride=1, otherwise:
+  // n = 1 + the raxis index that had stride=1.
+
+  bool found_negative_stride = false;
+
+  // caution: this axis index is a shifted-to-the-right index,
+  // not the one that the public interface of Tensor exposes.
+  for (int32 raxis = 0; raxis < pattern.num_axes; raxis++) {
+    int32 dim = pattern.dims[raxis],
+        stride = pattern.strides[raxis];
+    if (dim != 1) {
+      ans |= (1 << raxis);  // bit 'raxis' says this axis has dim != 1.
+      if (stride < 0)
+        found_negative_stride = true;
+      if (stride == 1)
+        n = raxis + 1;  // Can happen only once, if pattern.Check() == true,
+                        // i.e. if pattern is valid.
     }
+    // (an axis with dim == 1 leaves its bit in 'ans' as zero.)
   }
-  return false;
+  // add in the value 'n' shifted 8 bits to the left, and set the 11th bit
+  // (kPatternContainsNegativeStride) if we found a negative stride.
+  ans |= (n << 8) | (found_negative_stride ? 1 << 11 : 0);
+  return ans;
 }
 
 
+
 /**
    This utility function used in CompressPatterns() normalizes the signs of the
    strides in all the dimensions, prior to any merging of axes, and sets the
    'data_offsets' variables.
 
-   Consider an axis-index 0 <= i < num_axes.  We say that the strides for axis i
-   are normalized if the the lowest-numbered pattern which has nonzero stride on
-   axis i (if such a pattern exists) is positive.  If, on the other hand, all
-   the strides are zero, we also say that it is normalized (since flipping the
-   sign would make no difference).
+   Consider an axis-index i (i.e. an index into the patterns' dims or strides
+   vector).  We say that the strides for axis i
+   are normalized if either all patterns have zero stride for that axis
+   or the lowest-numbered pattern which has nonzero stride for that axis
+   has positive stride for that axis.
 
    This type of normalization is done to increase the chance that we can combine
    axes, because the rule we use for combining axes only applies if any nonzero
    strides present have the same sign between the two axes.  In terms of being
-   able to combine axes this rule is optimal, because any two axes where the
-   pattern-index of the first pattern with a nonzero stride for those axes is
-   different, would *not* be combinable.  So for any pair of axes that are
-   potentially combinable according to that criterion and which have any nonzero
-   strides, our normalization rule ensures that at least one pair of nonzero
-   strides has the same sign.  If there were another pattern for which the sign
-   was opposite after applying our rule, those two axes would not be combinable
-   whatever the sign normalization.
+   able to combine the maximum number of axes this rule is optimal, because any
+   two axes where the pattern-index of the first pattern with a nonzero stride
+   for those axes is different, would *not* be combinable.  So for any pair of
+   axes that are potentially combinable according to that criterion and which
+   have any nonzero strides, our normalization rule ensures that at least one
+   pair of nonzero strides has the same sign.  If there were another pattern for
+   which the sign was opposite after applying our rule, those two axes would not
+   be combinable whatever the sign normalization.
 
      @param [in,out] patterns  The patterns to have their strides normalized
+     @param [in]    max_num_axes  The maximum of any of the patterns'
+                          num_axes (provided so we don't have to work it
+                          out from 'patterns').
      @param [in,out] data_offsets  Data offsets, an array of dimension
-                          patterns.size, which will be *added to* by this
-                          function, by the amount required to ensure that
+                          patterns.size, which will be *added to* as needed by
+                          this function, by the amount required to ensure that
                           the memory locations visited by the set of possible
-                          indexes into these patterns is the same before
-                          and after any change of sign.
+                          indexes into these patterns is the same before and
+                          after any change of sign.
+     @return   Returns true if it made a change, else false.
+
+   CAUTION!  Does not update the pattern code (the code that would do that is
+   commented out).  If this were moved to a header we would have to make it
+   update the pattern code.
  */
-static inline void NormalizeSigns(ArrayRef<TensorPattern> patterns,
+static inline bool NormalizeSigns(ArrayRef<TensorPattern*> patterns,
+                                  int32 max_num_axes,
                                   int64 *data_offsets) {
+  bool changed = false;
   size_t num_patterns = patterns.size;
-  int32 num_axes = patterns[0].num_axes;
-  for (int32 a = 0; a < num_axes; a++) {
+
+  for (int32 a = 0; a < max_num_axes; a++) {
     for (size_t p = 0; p < size; p++) {
-      if (patterns[p].strides[a] != 0) {
+      if (patterns[p]->strides[a] != 0) {
         // We have identified the first pattern-index with nonzero
         // stride for this axis
-        if (patterns[p].strides[a] < 0) {
-          // The stride is negative, so we have to flip it
-          // for this axis.  (Note: we flip it for all patterns,
-          // for this dim, but we can ignore q < p because
-          // we know all those strides are zero.
+        if (patterns[p]->strides[a] < 0) {
+          changed = true;
+          // The stride is negative, so we have to flip it for this axis.
+          // (Note: we flip it for all patterns, but we can ignore
+          // pattern-indexes q < p because we know all those strides are zero.
           for (size_t q = p; q < size; q++) {
             // cast to int64 before muiltiplication to avoid potential
             // overflow
-            int64 this_offset =
-                static_cast<int64>(patterns[q].dims[a] - 1) *
-                static_cast<int64>(patterns[q].strides[a]);
-            data_offsets[q] += this_offset;
-            patterns[q].strides[a] *= -1;
+            if (patterns[q]->strides[a] != 0) {
+              int64 this_offset =
+                  static_cast<int64>(patterns[q]->dims[a] - 1) *
+                  static_cast<int64>(patterns[q]->strides[a]);
+              data_offsets[q] += this_offset;
+              patterns[q]->strides[a] *= -1;
+              // patterns[q]->code = -1;  // A signal to recompute the code.
+            }
           }
         }
         // break from loop over patterns; we identified the first pattern-index
@@ -88,6 +115,11 @@ static inline void NormalizeSigns(ArrayRef<TensorPattern> patterns,
       }
     }
   }
+  //if(changed)
+  //  for (size_t p = 0; p < size; p++)
+  //    if (patterns[p]->code == -1)
+  //      patterns[p]->code == GetDimsCode(*(patterns[p]));
+  return changed;
 }
 
 
@@ -99,12 +131,17 @@ static inline void NormalizeSigns(ArrayRef<TensorPattern> patterns,
    We can only ever combine pairs of axes that were combinable for *all* patterns
    passed to CompressPatterns().
 
-   When we combine axes we'll set dims[j] := dims[i] * dims[j], and make axis i
-   a no-op by setting dims[i] = 1, strides[i] = 0.
+   Two axes are combinable if stride2 == stride1 * dim1.  Here, raxis1 is
+   required to be the axis with the smaller stride, which is the asymmetry
+   between them.
+
+   (We also require that the new dimension must not overflow an int32.)
  */
-static inline bool Combinable(const TensorPattern &pattern,
-                              int32 axis1, int32 axis2) {
-  return pattern.strides[axis1] == pattern.strides[axis2] * pattern.dims[axis2];
+static inline bool Combinable(const TensorPattern &p,
+                              int32 raxis1, int32 raxis2) {  // raxis1: smaller stride.
+  return p.strides[raxis2] == p.strides[raxis1] * p.dims[raxis1] &&
+      static_cast<int64>(p.dims[raxis1])*static_cast<int64>(p.dims[raxis2]) <
+    std::numeric_limits<int32>::max();  // merged dim must fit in an int32.
 }
 
 
@@ -112,138 +149,237 @@ static inline bool Combinable(const TensorPattern &pattern,
 // for all the supplied patterns.  An axis like this can be removed without
 // affecting the result.
 static inline bool AxisIsTrivial(ArrayRef<TensorPattern> patterns,
-                                 int32 axis) {
+                                 int32 raxis) {
   for (size_t p = 0; p < patterns.size; p++)
-    if (patterns[p].strides[axis] != 0)
+    if (patterns[p].strides[raxis] != 0)
       return false;
   return true;
 }
 
-// Combine the two axes axis1 and axis2 in all the patterns (which
-// the user asserts is possible); at exit, the lower numbered of the
-// two axes is guaranteed to have dim=1, stride=0 in all patterns.
-// (we will later get rid of that trivial axis).
-// axis2 is the one with the smaller stride (for patterns where the
-// stride is nonzero), and is the one whose stride we keep in the
-// combined axis; that is the asymmetry.
-static inline void CombineAxes(ArrayRef<TensorPattern> patterns,
-                               int32 axis1, int32 axis2) {
+// Combine the two axes raxis1 and raxis2 in all the patterns (which the user
+// asserts is possible); at exit, the higher numbered of the two raxes is
+// guaranteed to have dim=1, stride=0 in all patterns.  (we will later get rid
+// of that trivial axis).  raxis1 is the one with the smaller stride, and is the
+// one whose stride we keep in the combined axis; that is the asymmetry
+// between raxis1 and raxis2.
+static inline void CombineAxes(ArrayRef<TensorPattern*> patterns,
+                               int32 raxis1, int32 raxis2) {
   size_t num_patterns = patterns.size;
 #ifdef KALDI_PARANOID
   for (size_t p = 0; p < num_patterns; p++) {
-    KALDI_PARANOID_ASSERT(Combinable(patterns[p], axis1, axis2));
+    KALDI_ASSERT(Combinable(*(patterns[p]), raxis1, raxis2));
   }
 #endif
-  if (axis1 < axis2) {
-    // the if-statement is because we want the 'trivial' axis (the one with
-    // dim=1, stride=0 for all patterns) to be the lower-numbered axis; this is
-    // more convenient for our algorithm because we might later want to do
-    // further combination on the nontrivial axis (if the lower-numbered one
-    // were changed, we might repeat the search for an axis to combine with it.
+  if (raxis1 > raxis2) {
+    // keep raxis2, remove raxis1.
+    // We want the 'trivial' axis (the one with dim=1, stride=0 for all
+    // patterns) to be the higher-numbered axis (this helps reduce
+    // the chance of having to move dims/strides around when removing
+    // trivial axes later on).
     for (size_t p = 0; p < num_patterns; p++) {
-      TensorPattern &pattern = patterns[p];
-      pattern.dims[axis2] *= pattern.dims[axis1];
-      pattern.dims[axis1] = 1;
-      pattern.strides[axis1] = 0;
+      TensorPattern *pattern = patterns[p];
+      pattern->dims[raxis2] *= pattern->dims[raxis1];
+      pattern->strides[raxis2] = pattern->strides[raxis1];  // keep smaller stride.
+      pattern->dims[raxis1] = 1;
+      pattern->strides[raxis1] = 0;
     }
   } else {
+    // keep raxis1 (it already has the stride we want), remove raxis2.
     for (size_t p = 0; p < num_patterns; p++) {
-      TensorPattern &pattern = patterns[p];
-      pattern.dims[axis2] *= pattern.dims[axis1];
-      pattern.strides[axis2] = pattern.strides[axis1];
-      pattern.dims[axis1] = 1;
-      pattern.strides[axis1] = 0;
+      TensorPattern *pattern = patterns[p];
+      pattern->dims[raxis1] *= pattern->dims[raxis2];
+      pattern->dims[raxis2] = 1;
+      pattern->strides[raxis2] = 0;
     }
   }
 }
 
 /**
-   Removes trivial axes, defined as axes for which, for all patterns,
-   dim=1 and stride=0.  Assumes the user has already which axes
-   are trivial and passes in as the array 'trivial_axis'.
+   Removes trivial axes, defined as axes for which, for all patterns, dim=1 and
+   stride=0.  Assumes the user has already found out which axes are trivial and
+   is passing in this information as the array 'trivial_raxis' (we include the r
+   to emphasize that we use the same reversed numbering as in
+   pattern.{dims,strides}).
+
+
+   This function removes those axes, shifts the dims and strides arrays to
+   the left as needed, and decreases the 'num_axes' of the patterns
+   appropriately (note: this is not as simple as just subtracting the number
+   of axes removed, because removing an raxis that was >= the num_axes
+   of a given pattern needs to be a no-op).
+
+   @param [in]  trivial_raxis    An array which identifies the axes to
+                       be removed.  At least one element must be true.
+                       Indexed by 'raxis'.
+   @param [in,out]  patterns    The patterns to be modified.
+
+   CAUTION: this function does not update the codes of 'patterns'.
  */
-inline static bool RemoveTrivialAxes(int32 num_axes,
-                                     bool trivial_axis[],
-                                     ArrayRef<TensorPattern> patterns) {
+static void RemoveTrivialAxes(bool is_trivial_raxis[KALDI_TENSOR_MAX_DIM],
+                              ArrayRef<TensorPattern*> patterns) {
+  int32 first_trivial_raxis = -1;
+  for (int32 raxis = 0; raxis < KALDI_TENSOR_MAX_DIM; raxis++) {
+    if (is_trivial_raxis[raxis]) {
+      first_trivial_raxis = raxis;
+      break;
+    }
+  }
+  KALDI_PARANOID_ASSERT(first_trivial_raxis >= 0);
+
   for (size_t p = 0; p < patterns.size; p++) {
-    const TensorPattern &pattern = patterns[p];
-    // we do the loop over axes inside the loop over p for memory locality.
-    int32 axis_out = 0;
-    for (int32 axis_in = 0; axis_in < num_axes; axis_in++) {
-      if (axis_out != axis_in && !trivial_axis[axis_in]) {
-        pattern.dims[axis_out] = pattern.dims[axis_in];
-        pattern.dims[axis_out] = pattern.dims[axis_in];
+    TensorPattern *pattern = patterns[p];
+    // Raxes below first_trivial_raxis are untouched; the kept raxes above it
+    // are copied down over the removed ones, in increasing raxis order.
+    //
+    // We do the loop over axes inside the loop over p for memory locality.
+    // raxis_out is the next position to write; raxis_in the one being read.
+    int32 raxis_out = first_trivial_raxis,
+        num_axes = pattern->num_axes;
+    for (int32 raxis_in = raxis_out; raxis_in < num_axes; raxis_in++) {
+      if (is_trivial_raxis[raxis_in]) {
+        KALDI_PARANOID_ASSERT(pattern->dims[raxis_in] == 1);
+      } else {
+        if (raxis_out != raxis_in) {
+          pattern->dims[raxis_out] = pattern->dims[raxis_in];
+          pattern->strides[raxis_out] = pattern->strides[raxis_in];
+        }
+        raxis_out++;
       }
-      if (!trivial_axis[axis_in])
-        axis_out++;
     }
-    pattern.num_axes = axis_out;  // will be the same for all p.
+    if (raxis_out < num_axes) pattern->num_axes = raxis_out;  // else a no-op.
+    // Make sure the axes we removed are set to dim=1, stride=0.
+    for (; raxis_out < num_axes; raxis_out++) {
+      pattern->dims[raxis_out] = 1;
+      pattern->strides[raxis_out] = 0;
+    }
+    KALDI_PARANOID_ASSERT(pattern->Check(false));
   }
 }
 
-void CompressPatterns(ArrayRef<TensorPattern> patterns,
+// See the header for documentation.  Pattern codes are recomputed on change.
+void CompressPatterns(ArrayRef<TensorPattern*> patterns,
                       int64_t *data_offsets) {
   size_t num_patterns = patterns.size;
-  for (size_t p = 0; p < num_patterns; p++)
-    data_offsets[p] = 0;
 #ifdef KALDI_PARANOID
-  // check the input
-  KALDI_ASSERT(num_patterns > 0 && num_patterns < 6);
+  KALDI_ASSERT(num_patterns > 0);
   for (size_t p = 0; p < num_patterns; p++) {
+    KALDI_ASSERT(patterns[p]->Check());
     for (size_t q = p + 1; q < num_patterns; q++) {
-      KALDI_ASSERT(Broadcastable(patterns[p], patterns[q]));
+      KALDI_ASSERT(Broadcastable(*(patterns[p]), *(patterns[q])));
     }
   }
 #endif
-  if (NegativeStrideExists(patterns))
-    NormalizeSigns(patterns, data_offsets);
-  bool is_trivial_axis[6] = { false, false, false, false, false, false }
+  for (size_t p = 0; p < num_patterns; p++)
+    data_offsets[p] = 0;
+
+  int32 max_num_axes = patterns[0]->num_axes,
+      combined_code = patterns[0]->code;
+  // combined_code is the '|' of the patterns' codes; it's
+  // not the same as what CombineCodes() would return.
+
+  for (size_t p = 1; p < num_patterns; p++) {
+    max_num_axes = std::max<int32>(max_num_axes, patterns[p]->num_axes);
+    combined_code |= patterns[p]->code;
+  }
+  bool changed = false;
+  if (ContainsNegativeStride(combined_code))
+    changed = NormalizeSigns(patterns, max_num_axes, data_offsets);
+
+  // note: the codes won't be fully up to date at this point.
   bool exists_trivial_axis = false;
-  int32 num_axes = patterns[0].num_axes;
-  for (int32 i = 0; i < num_axes; i++) {
-    if (AxisIsTrivial(patterns, i)) {
-      is_trivial_axis[i] = true;
+  // The = {} ensures (I believe) that they are all set to 0, meaning false.
+  bool is_trivial_raxis[KALDI_TENSOR_MAX_DIM] = {};
+  for (int32 raxis = 0, mask = 1; raxis < max_num_axes; raxis++, mask <<= 1) {
+    if ((combined_code & mask) == 0) {  // no pattern has dim != 1 on this raxis.
+      is_trivial_raxis[raxis] = true;
       exists_trivial_axis = true;
-      continue;
     }
-    // see if axis i can be combined (in either direction with any later-numbered axis.
-    for (int32 j = i + 1; j < num_axes; j++) {
-      bool combinable_ij = true, combinable_ji = true;
+  }
+
+  // The reason we go in reverse order is a small optimization; it
+  // means it's more straightforward, when combining, to 'make trivial'
+  // the higher-numbered raxis, which reduces the chances of having to
+  // copy axes to different positions later on to remove trivial axes.
+  // (If we went forward and did this, we'd have to repeat processing
+  // the current axis each time we combined, which would be a hassle).
+  for (int32 raxis1 = max_num_axes - 1; raxis1 >= 0; raxis1--) {
+    if (is_trivial_raxis[raxis1])
+      continue;
+
+    // see if raxis1 can be combined (in either direction) with any
+    // earlier-numbered axis.
+    for (int32 raxis2 = raxis1 - 1; raxis2 >= 0; raxis2--) {
+      if (is_trivial_raxis[raxis2])
+        continue;
+      bool combinable_12 = true;
       for (size_t p = 0; p < num_patterns; p++) {
-        if (!Combinable(patterns[p], i, j))
-          combinable_ij = false;
-        if (!Combinable(patterns[p], j, i))
-          combinable_ji = false;
+        if (!Combinable(*(patterns[p]), raxis1, raxis2)) {
+          combinable_12 = false;
+          break;
+        }
       }
-      if (combinable_ij) {
-        CombineAxes(patterns, i, j);
-        is_trivial_axis[i] = true;
+      if (combinable_12) {
+        CombineAxes(patterns, raxis1, raxis2);
+        is_trivial_raxis[raxis1] = true;  // higher numbered raxis is removed.
         exists_trivial_axis = true;
-        // Break from the loop on j and continue over the loop on i, meaning
-        // we are done combining with the i'th axis.  At this point all the
-        // (strides,dims) for axis i are just
+        // Break from the loop over raxis2 and continue over the loop over
+        // raxis1, meaning we are done combining with axis 'raxis1' (it's
+        // trivial now).
         break;
-      } else if (combinable_ji) {
-        CombineAxes(patterns, j, i);
-        is_trivial_axis[i] = true;   // not a typo.  Lower-numbered axis gets
-        // dim=1,stride=0.
+      }
+      bool combinable_21 = true;
+      for (size_t p = 0; p < num_patterns; p++) {
+        if (!Combinable(*(patterns[p]), raxis2, raxis1)) {
+          combinable_21 = false;
+          break;
+        }
+      }
+      if (combinable_21) {
+        CombineAxes(patterns, raxis2, raxis1);
+        is_trivial_raxis[raxis1] = true;  // higher numbered raxis is removed.
         exists_trivial_axis = true;
         break;
       }
     }
   }
-  if (exists_trivial_axis)
-    RemoveTrivialAxes(num_axes, trivial_axis, patterns);
+  if (exists_trivial_axis) {
+    RemoveTrivialAxes(is_trivial_raxis, patterns);
+    changed = true;
+  }
+  if (changed)
+    for (size_t p = 0; p < num_patterns; p++)
+      patterns[p]->code = ComputePatternCode(*(patterns[p]));
+  // NOTE(review): 'changed' is not returned, as the signature here is void.
 }
 
 
 void CompressOnePattern(TensorPattern *pattern,
                         int64 *data_offset) {
+  // For now this just calls the general version; a special-case version may come later.
+  CompressPatterns({pattern}, data_offset);
 }
 
 
-  int32 GetDimsCode(const TensorPattern &pattern) {
+void SortAxes(TensorPattern *pattern) {  // NOTE: does not touch pattern->code.
+  int32 num_axes = pattern->num_axes;
+  std::pair<int32,int32> strides_dims[KALDI_TENSOR_MAX_DIM];  // (stride, dim).
+  for (int32 raxis = 0; raxis < num_axes; raxis++) {
+    int32 stride = pattern->strides[raxis],
+        dim = pattern->dims[raxis];
+    KALDI_ASSERT(stride > 0);  // see documentation in header for reasons.
+    strides_dims[raxis].first = stride;
+    strides_dims[raxis].second = dim;
   }
+  std::sort(strides_dims, strides_dims + num_axes);  // increasing stride.
+  for (int32 raxis = 0; raxis < num_axes; raxis++) {
+    pattern->strides[raxis] = strides_dims[raxis].first;
+    pattern->dims[raxis] = strides_dims[raxis].second;
+  }
+}
+
+
+int32 GetDimsCode(const TensorPattern &pattern) {
+  // TODO(review): stub with no return statement (must not be called as-is); we may not need this after all.
+}
 
 
 }  // namespace kaldi
diff --git a/src/tensor/tensor-pattern-utils.h b/src/tensor/tensor-pattern-utils.h
index 7b502529793..fe4e204d5e9 100644
--- a/src/tensor/tensor-pattern-utils.h
+++ b/src/tensor/tensor-pattern-utils.h
@@ -19,6 +19,7 @@
 
 
 #include "tensor/tensor-common.h"
+#include "tensor/array-ref.h"
 
 /**
    This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
@@ -31,14 +32,14 @@ namespace tensor {
 /**
    This function returns a code that compactly says whether each axis
    has dim = 1 or dim != 1.  For purposes of the code generated, the number
-   of axes does not matter; imagine we left-padded with enough `dim=1`
-   axes to give KALDI_TENSOR_MAX_DIM axes.
+   of axes does not matter.  The lower-order KALDI_TENSOR_MAX_DIM bits
+   of the code might potentially be set; the rest will be zero.
 
-   The rightmost (least significant) bit is the last axis (numbered
-   KALDI_TENSOR_MAX_DIM - 1 after padding).
+   The rightmost (least significant) bit corresponds to the last-numbered axis,
+   equivalent to raxis (reversed axis-index) == 0.
 
-   Note that the `dims` vectors below are displayed after removing
-   any leading `dim=1` axes.
+   Note that none of the example `dims` vectors below have any leading
+   (dim=1) axes, because they wouldn't affect the code.
 
    The examples below will use c++14 binary literals, although
    the code doesn't use them.  In the notation below, in dims vectors,
@@ -57,19 +58,47 @@ namespace tensor {
 int32 GetDimsCode(const TensorPattern &pattern);
 
 
+enum PatternEnum {
+  kPatternContainsNegativeStride = 2048
+  // e.g.:
+  // bool contains_negative_stride =
+  //     (pattern.code & kPatternContainsNegativeStride) != 0;
+};
+
+// Returns true if the pattern code indicates that the pattern contains a
+// negative stride (i.e. the kPatternContainsNegativeStride bit is set).
+inline bool ContainsNegativeStride(int32 pattern_code) {
+  return (pattern_code & kPatternContainsNegativeStride) != 0;
+}
+
+// Returns true if the pattern code indicates that the raxis
+// numbered 'raxis' (the r refers to the backwards numbering used
+// in 'pattern') is 'trivial' (meaning: dim=1, stride=0).
+inline bool AxisIsTrivial(int32 pattern_code, int32 raxis) {
+  return (pattern_code & (1 << raxis)) == 0;  // bit 'raxis' clear => dim == 1.
+}
+
+
 /**
    This function returns a code that compactly represents the same information
-   as GetDimsCode() [i.e. which axes, counting from the last axis,
-   had dim != 1], and also which axis, if any,
-   had stride=1.  (No two axes can have stride=1, due to the uniqueness
-   rule; search in tensor-pattern.h).
+   as GetDimsCode() [i.e. which axes had dim != 1], but also encodes which axis,
+   if any, had stride=1, and has a bit that says whether any axis had negative
+   stride.  (No two axes can have stride=1, due to the uniqueness rule; search
+   in tensor-pattern.h).
 
    Let
       n = 0 if no axis had stride=1, otherwise:
-      n = num_axes - (the axis that had stride=1).
+      n = 1 + the raxis index which had stride=1.
+
+    (raxis is the axis index when accessing the axes in reversed order, as
+     stored in pattern.dims and pattern.strides).
+
+   For example if the strides were [10,3,1] we would have
+   n = 1; if the strides were [10,1,3] we would have n = 2.
 
-   For example if the strides where (10,3,1) we would have
-   n = 1; i if the strides were (10,1,3) we would have n = 2.
+   IMPORTANT NOTE ON ORDERING: lists of dims or strides in square
+   brackets, like [1,2], are in the non-reversed ordering as exposed
+   by the Tensor API.
 
    The value 'n' occupies the bits starting from 8 in the returned code,
    i.e. bits 8,9,10 (counting from the right, i.e. from the least to
@@ -91,35 +120,31 @@ int32 GetDimsCode(const TensorPattern &pattern);
    not equal to 1', and upper-case X indicates that the axis has stride=1.  In
    the example `dims` vectors below, we don't put any leading `dim=1` axes,
    because they would not affect the code generated.  The list of numbers
-   in parentheses below may be interpreted as the sequence of dims for the
-   Tensor.
+   in square brackets [] below may be interpreted as the sequence of dims for the
+   Tensor, in the non-reversed ordering that the Tensor API exposes.
 
    The ' at the 8th bit is to make the bit-string easier to parse.
 
-
-    0b000'00000000  0x000  dims=(), a scalar
-    0b000'00000001  0x001  dims=(x), a vector with a stride
-    0b001'00000001  0x101  dims=(X), a vector
-    0b000'00000010  0x002  dims=(x,1),  a vector.unsqueeze(-1) with a stride
-    0b010'00000010  0x202  dims=(X,1),  a vector.unsqueeze(-1)
-    0b000'00000011  0x003  dims=(x,x), a matrix with a stride
-    0b001'00000011  0x103  dims=(x,X), a matrix
-    0b010'00000011  0x203  dims=(X,x), a transposed matrix
-    0b000'00000100  0x008  dims=(x,1,1)
-    0b011'00000100  0x308  dims=(X,1,1)
-    0b010'00000110  0x20B  dims=(x,X,1), a matrix.unsqueeze(-1)
-    0b011'00000110  0x30B  dims=(X,x,1), a transposed matrix.unsqueeze(-1)
-    0b000'00000110  0x10B  dims=(x,x,1), a matrix.unsqueeze(-1) with column stride
-    0b001'00000101  0x109  dims=(x,1,X), a matrix.unsqueeze(-2)
-    0b011'00000101  0x309  dims=(X,1,x), a transposed matrix.unsqueeze(-2)
-    0b000'00000101  0x009  dims=(x,1,x), a matrix.unsqueeze(-2) with column stride
-
+    0b000'00000000  0x000  dims=[], a scalar
+    0b000'00000001  0x001  dims=[x], a vector with a stride
+    0b001'00000001  0x101  dims=[X], a vector
+    0b000'00000010  0x002  dims=[x,1], a vector with a stride
+    0b010'00000010  0x202  dims=[X,1], a vector
+    0b000'00000011  0x003  dims=[x,x], a matrix with a stride
+    0b001'00000011  0x103  dims=[x,X], a matrix
+    0b010'00000011  0x203  dims=[X,x], a transposed matrix
+    0b000'00000100  0x004  dims=[x,1,1], a vector with a stride
+    0b011'00000100  0x304  dims=[X,1,1], a vector
+    0b010'00000110  0x206  dims=[x,X,1], a matrix
+    0b011'00000110  0x306  dims=[X,x,1], a transposed matrix
+    0b000'00000110  0x006  dims=[x,x,1], a matrix with column stride
+    0b001'00000101  0x105  dims=[x,1,X], a matrix
+    0b011'00000101  0x305  dims=[X,1,x], a transposed matrix
+    0b000'00000101  0x005  dims=[x,1,x], a matrix with column stride
 
     ...
  */
-int32 GetPatternCode(const TensorPattern &pattern);
-
-
+int32 ComputePatternCode(const TensorPattern &pattern);
 
 
 inline int32 CombineCodes(int32 code1, int32 code2) {
@@ -133,22 +158,128 @@ inline int64 CombineCodes(int32 code1, int32 code2, int32 code3) {
 }
 
 
-/**  This function returns true if the dimensions of tensor patterns
-     a and b are broadcastable in the PyTorch sense.  What this means
-     for tensors with the same num-axes is that dims for axis i
-     must either be the same or one of them must be 1.  For tensors
-     with different num-axes we pad with leading (dim=1)'s; for
-     instance, dims (2,8,3) and (8,1) would be broadcastable because
-     the (8,1) becomes (1,8,1).
-
-     If 'b_non_reducing' is true, then we do not allow any dim of
-     b to be 1 where the corresponding dim of a was not 1.
-
-     This check is simple to implement due to the way we store
-     the dims 'right-justified' so that the last-numbered dim
-     is always at dims[KALDI_TENSOR_MAX_DIM - 1].
+/**
+   Modifies 'p' in-place by inserting an axis with (dim=1,stride=0) at the
+   position 'raxis', specified in the reversed numbering physically used
+   in the pattern.  Updates p->code.
+
+   Showing just the dims in the pattern (in the order physically present in the
+   dims array), for some examples:
+
+\verbatim
+    UnsqueezeR({3,4}, 0)  -> {1,3,4}
+    UnsqueezeR({3,4}, 1)  -> {3,1,4}
+    UnsqueezeR({3,4}, 2)  -> {3,4,1}
+\endverbatim
+
+     @param [in]    raxis   The index at which the extra axis is to appear.
+                            We require 0 <= raxis <= p->num_axes.
+     @param [in,out] p      The pattern to which we are adding an axis.
+                            Will have its num_axes increased by 1
+                            at exit, possibly its dims and strides
+                            arrays changed, and its code updated.
  */
-bool Broadcastable(const TensorPattern &a, const TensorPattern &b,
+void UnsqueezeR(int32 raxis, TensorPattern *p);
+
+
+/**
+   Modifies 'p' in-place by inserting an axis with (dim=1,stride=0) at the
+   specified axis-index (numbered in the public numbering).
+   Equivalent to PyTorch's unsqueeze(), including its behavior with
+   negative axis indexes (axis < 0 is interpreted as num_axes + 1 + axis).
+
+   Showing just the dims in the pattern, in the non-reversed order as
+   exported by the API, some examples are:
+
+\verbatim
+    Unsqueeze(0, [6,5]) -> [1,6,5]
+    Unsqueeze(1, [3,4]) -> [3,1,4]
+    Unsqueeze(2, [9,10]) -> [9,10,1]
+    Unsqueeze(-1, [9,10]) -> [9,10,1]
+\endverbatim
+
+     @param [in]    axis   The index at which the extra axis is to appear.
+                           We require -p->num_axes - 1 <= axis <= p->num_axes
+                           The large allowable range is because negative
+                           axes are permitted, e.g. -1 means insert a new
+                           axis after the last existing axis.
+     @param [in,out] p      The pattern to which we are adding an axis.
+                            Will have its num_axes increased by 1
+                            at exit, possibly its dims and strides
+                            arrays changed, and its code updated.
+ */
+inline void Unsqueeze(int32 axis, TensorPattern *p) {
+  if (axis < 0) UnsqueezeR(-1 - axis, p);  // axis=-1 -> raxis=0 (new last axis)
+  else UnsqueezeR(p->num_axes - axis, p);  // axis=0 -> raxis=num_axes (new first)
+}
+
+/**
+   Modifies 'p' in-place by removing an axis with dim=1 from the specified
+   position (in the reversed numbering physically used in the pattern).  Updates
+   p->code.  It is an error if 'p' did not, on entry, contain an axis with dim=1
+   at position 'raxis' in the array.
+
+
+   Modifies 'p' in-place by removing an axis with dim=1 from the
+   specified position specified in the reversed numbering physically used in the
+   pattern.  Updates p->code.  It is an error if 'p' did not initially contain
+   an axis with dim=1 at position 'raxis' in the array.
+
+   This function updates p->code.
+
+   In the example below we show the dims in the order they appear in the
+   physical array:
+\verbatim
+   SqueezeR(0, {1,3,4})  -> {3,4}
+   SqueezeR(1, {5,1,7})  -> {5,7}
+   SqueezeR(2, {8,1,9})  -> [error]
+\endverbatim
+     @param [in]    raxis   The reversed-order axis to be squeezed.
+                            We require 0 <= raxis < p->num_axes and
+                            p->dims[raxis] == 1.
+     @param [in,out] p      The pattern from which we are removing an
+                            axis.  Will have its num_axes reduced by 1
+                            at exit, possibly its dims and strides
+                            arrays changed, and its 'code' updated.
+*/
+void SqueezeR(int32 raxis, TensorPattern *p);
+
+
+/**
+   Modifies 'p' in-place by removing an axis with dim=1 (hence stride=0)
+   located at the specified axis (as numbered in the public numbering).
+   Equivalent to PyTorch's squeeze(), including its behavior with
+   negative axis indexes; axis < 0 is interpreted as num_axes + axis,
+   so -1 means the last axis.  It is an error if 'p' did not, on entry,
+   contain an axis with dim=1 at position 'axis' (in the public numbering).
+
+   Showing just the dims in the pattern, in the non-reversed order as
+   exported by the API, some examples are:
+\verbatim
+    Squeeze(0, [1,6,5]) -> [6,5]
+    Squeeze(1, [3,1,4]) -> [3,4]
+    Squeeze(2, [9,1,10]) -> error
+    Squeeze(-1, [7,1]) -> [7]
+\endverbatim
+
+     @param [in]    axis    The index of the axis that is to be removed.
+                            We require -p->num_axes <= axis < p->num_axes
+                            (negative axes are permitted, interpreted
+                            as an offset from p->num_axes).
+                            We require that the specified axis have
+                            dim=1.
+     @param [in,out] p      The pattern from which we are removing an
+                            axis.  Will have its num_axes reduced by 1
+                            at exit, possibly its dims and strides
+                            arrays changed, and its 'code' updated.
+ */
+inline void Squeeze(int32 axis, TensorPattern *p) {
+  if (axis < 0) SqueezeR(-1 - axis, p);  // axis=-1 -> raxis=0 (the last axis)
+  else SqueezeR(p->num_axes - 1 - axis, p);  // axis=0 -> raxis=num_axes-1
+}
+
+
+bool Broadcastable(const TensorPattern &a, const TensorPattern &b,
                    bool b_non_reducing = false);
 
 
@@ -171,6 +302,31 @@ bool Broadcastable(const TensorPattern &a, const TensorPattern &b,
 
 
 
+/**
+   Returns true if the 'dims' vectors of a and b are the same.
+   Does not require the number of axes to be the same, so effectively
+   it's testing that the dims are the same after padding on the left
+   with dim=1 (here referring to the public, non-reversed numbering
+   of the dims).
+
+   This is a stronger condition than Broadcastable(a, b).
+ */
+bool SameDim(const TensorPattern &a, const TensorPattern &b);
+
+
+/**
+   Returns true if the 'dims' vectors of a, b and c are all the same.
+   Does not require the number of axes to be the same, so effectively
+   it's testing that the dims are the same after padding on the left
+   with dim=1 (here referring to the public, non-reversed numbering
+   of the dims).
+
+   This is a stronger condition than Broadcastable(a, b, c).
+ */
+bool SameDim(const TensorPattern &a, const TensorPattern &b,
+             const TensorPattern &c);
+
+
 /**
    Compresses a TensorPattern by removing or combining as many axes as possible.
    This version is suitable for operations that do not rely on any kind
@@ -180,21 +336,20 @@ bool Broadcastable(const TensorPattern &a, const TensorPattern &b,
    output.  The output (dim,stride) pairs will be ordered from
    greatest to least stride (note: all output strides will be positive).
 
-      @param [in]  src   The pattern to be compressed
-      @param [in]  src_properties  Properties of 'src'; required to
-                          be accurate (behavior is undefined otherwise,
-                          e.g. if you provide some other pattern's properties).
-      @param [out] dest   A simplified-as-much-as-possible pattern that
-                          covers the same set of memory locations as 'src' (when
-                          combined with the offset below).  'dest' will
-                          contain only nonnegative strides.
+      @param [in,out]  pattern   The pattern to be compressed
+
       @param [out] data_offset  A number that we would have to add to
                           the data pointer of the source Tensor so
                           that 'dest' would cover the same set of
                           elements.  It will always be zero if 'src'
                           was free of negative strides.
    Examples are below, where we write a TensorPattern as
-    `{{dim1,dim2,..}, {stride1,stride2,..}}`.
+
+   `{{dim1,dim2,..}, {stride1,stride2,..}}`.
+
+   (The curly braces in our notation imply that we are referring to the reversed
+   ordering physically used in 'pattern', but actually this doesn't affect
+   anything, as the order of axes does not matter here as long as it is consistent.)
 
 \verbatim
    Input pattern             Output pattern            Output offset
@@ -208,6 +363,28 @@ bool Broadcastable(const TensorPattern &a, const TensorPattern &b,
 void CompressOnePattern(TensorPattern *pattern,
                         int64 *data_offset);
 
+/**
+   Sorts the axes in 'pattern' from smallest to largest stride
+   (in the reversed numbering physically present in 'pattern'; would
+   be largest to smallest in the public API).  Useful in testing
+   equivalence of patterns, as CompressOnePattern() followed
+   by SortAxes() leads to a normalized form.
+
+   This function requires that for 0 <= i < pattern->num_axes,
+   pattern->strides[i] > 0.  This condition is satisfied by
+   a pattern that has previously been compressed by CompressOnePattern().
+   If in future we need to relax this constraint, we will do so.
+   (The assumption of positive strides simplifies implementation
+   because to normalize the form we'd have to make all strides
+   positive, which would require outputting an offset).
+
+     @param [in,out] pattern  The pattern whose axes are to be sorted
+                   from least to greatest stride (in the physical
+                   ordering).
+ */
+void SortAxes(TensorPattern *pattern);
+
+
 /*
   Compress two TensorPatterns by combining axes (and possibly
   flipping the sign of their strides and changing the data offset)
@@ -249,12 +426,10 @@ void CompressOnePattern(TensorPattern *pattern,
 
 
  */
-void CompressTwoPatterns(const TensorPattern &src1,
-                         const TensorPattern &src2,
-                         TensorPattern *dest1,
-                         int64 *data_offset1,
-                         TensorPattern *dest2,
-                         int64 *data_offset2);
+void CompressTwoPatterns(TensorPattern *a,
+                         TensorPattern *b,
+                         int64 *data_offset_a,
+                         int64 *data_offset_b);
 
 
 /**
@@ -265,7 +440,7 @@ void CompressTwoPatterns(const TensorPattern &src1,
    which covers the same set of memory locations as the original Tensor.
 
    The difference with just calling CompressOnePattern() several times is
-   that this preserves the relationships between the tensors.
+   that CompressPatterns() preserves the relationships between the tensors.
 
    Firstly, we require that all pairs of TensorPattern in 'patterns' be
    broadcastable: that is, Broadcastable(p1, p2) would hold for any
@@ -274,38 +449,37 @@ void CompressTwoPatterns(const TensorPattern &src1,
    with dim,stride (0, 1), we allow it to be indexed by any value
    (not just zero), so that all the tensors represented can accept the
    same set of index tuples.  Suppose for example that there are three
-   patterns, p1, p2, p3, in 'patterns', with 4 axes.  Let max_dim
-   be the 'combined' dimension, which contains the max of the dims
-   of the corresponding axes of p1,p2,p3, and let
+   patterns, p1, p2, p3, in 'patterns', with 4 axes.  Let max_axes be the
+   largest of the num-axes of p1, p2 and p3, and let
    x = (i, j, k, l) be an index tuple that would be valid for a tensor
-   of dim max_dim.  Each such x, when used as an index into p1, p2
+   with that many axes.  Each such x, when used as an index into p1, p2
    and p3 with 'permissive indexing' as mentioned above, will
-   will give us a tuple of memory-offsets (o1, o2, o3) (indexes
-   into the respective data pointers).  Ranging over the set of such
+   give us a tuple of memory-offsets (o1, o2, o3); o1, o2 and o3 are indexes
+   into the respective data pointers.  Ranging over the set of index-tuples
    x, we get a set of memory-offset tuples; call this set S_in,
    and call the set that we would get if doing the same procedure
    on the output tensors (with their possibly changed num-axes), be
    S_out.  Let us represent the 'data_offset' output of this function
    as (in this case) a 3-tuple o.  Then the invariant that this
    function needs to satisfy is that:
+
         `S_in = S_out + o`
-   where we interpret the '+ o' as adding to each element of the set.
-   Interpret the above as: one set of 3-tuples == another set of 3-tuples.
 
-   Of course, the 3 tensors and 4 axes mentioned here is just an example.
+   (this equates two sets of 3-tuples, in our example) where we interpret the '+
+   o' as adding to each element of the set.  The '+ o' above would only be
+   necessary if any strides were negated; it is a tuple containing offsets, in
+   elements, to be added to the data pointers of the respective output tensors.
+
 
-      @param [in,out] patterns   An array of 1 <= size <= 4 of the patterns
+      @param [in,out] patterns   A nonempty array of the patterns
                          to be jointly compressed.
       @param [out]  data_offsets  Pointer to an array of the same size
-                        as patterns, which on output will contain
+                        as 'patterns', which on output will contain
                         offsets to be added to the data pointers.
 
-
       @return  Returns true if it made any change to the patterns,
-               false if they were unchanged.
-
-   Examples are below, where we write a TensorPattern as
-    `{{dim1,dim2,..}, {stride1,stride2,..}}`.
+               false if they were unchanged.  If false, the
+               data_offsets will be set to zero.
 
  Examples are below, where we write a TensorPattern as
  `{{dim1,dim2,..}, {stride1,stride2,..}}`.
@@ -320,8 +494,8 @@ void CompressTwoPatterns(const TensorPattern &src1,
  {{3,4},{4,1}}        {{1,1},{0,0}}      {{12},{1}}           {{1},{0}}    # combine
 \endverbatim
  */
-bool CompressPatterns(ArrayRef<TensorPattern> patterns,
-                      int64_t *data_offsets);
+bool CompressPatterns(ArrayRef<TensorPattern*> patterns,
+                      int64 *data_offsets);
 
 /**
    Compresses a TensorPattern by removing or combining as many axes as possible,
@@ -331,14 +505,29 @@ bool CompressPatterns(ArrayRef<TensorPattern> patterns,
 
     This function removes axes with dim=1.
 
-    This function combines successive axes if the relationship of their
-    dims and strides is what you would expect in a "C"-style array.
-    Suppose that in 'src' we had two successive axes with dims and
-    strides (dim_a, dim_b) and (stride_a, stride_b), with dim_a > 1 and
-    dim_b > 1.  If stride_a == stride_b * dim_b, then this function
-    will merge them into a single axis with dimension (dim_a * dim_b)
-    and stride stride_b.   (However, they won't be merged if it would
-    result in a dimension exceeding the range of int32).
+   This function combines successive axes if the relationship of their
+   dims and strides is what you would expect in a "C"-style array
+   when the axes are listed in their non-reversed ordering (i.e.
+   as exposed by class Tensor).
+
+
+   Suppose that in pattern 'p' we had two successive axes physically numbered
+   raxis, raxis+1, with p->dims[raxis] > 1 and p->dims[raxis+1] > 1
+   and p->strides[raxis + 1] == p->strides[raxis] * p->dims[raxis],
+   then this function will merge them into a single axis with dimension
+   the product of the two dimensions..
+
+    TODO...
+
+   finish this if it turns out to be needed for something.
+
+
+   with dims and
+   strides (dim_a, dim_b) and (stride_a, stride_b), with dim_a > 1 and
+   dim_b > 1.  If stride_a == stride_b * dim_b, then this function
+   will merge them into a single axis with dimension (dim_a * dim_b)
+   and stride stride_b.   (However, they won't be merged if it would
+   result in a dimension exceeding the range of int32).
 
    The output pattern 'dest' is what you get if you keep applying the
    rules above until no further change is made.
@@ -354,9 +543,8 @@ bool CompressPatterns(ArrayRef<TensorPattern> patterns,
    {2,3,4},{100,-4,-1}        {{2,12},{100,-1}}
 \endverbatim
  */
-void CompressPatternC(const TensorPattern &src,
-                      const TensorPatternProperties &src_properties,
-                      TensorPattern *dest);
+void CompressPatternC(TensorPattern *p);
+
 
 
 /**
@@ -410,9 +598,5 @@ bool CreateViewPattern(const TensorPattern &pattern_in,
 
 
 
-
-};
-
-
-}
-}
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/tensor-pattern.cc b/src/tensor/tensor-pattern.cc
index 7d76007eac7..c9a02c3bff1 100644
--- a/src/tensor/tensor-pattern.cc
+++ b/src/tensor/tensor-pattern.cc
@@ -17,19 +17,22 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
+#include <algorithm>
 #include "tensor/tensor-pattern.h"
 
 
 namespace kaldi {
 namespace tensor {
 
-bool TensorPattern::Check() {
-  if (num_axes < 0 || num_axes >= KALDI_TENSOR_MAX_DIM)
+bool TensorPattern::Check(bool check_code) {
+  if (num_axes < 0 || num_axes > KALDI_TENSOR_MAX_DIM)
     return false;
-  for (int32 axis = 0; axis < num_axes; axis++) {
-    int32 dim = dims[axis], stride = strides[axis];
+
+  int32 raxis;
+  for (raxis = 0; raxis < num_axes; raxis++) {
+    int32 dim = dims[raxis], stride = strides[raxis];
     // All dims must be positive.  (We have no concept of
-    // an empty tensor, you would use NULL, or None, to represent
+    // an empty tensor; you would use NULL, or None, to represent
     // that.
     if (dim <= 0)
       return false;
@@ -39,38 +42,50 @@ bool TensorPattern::Check() {
     } else {
       if (stride == 0) return false;
     }
+
+  }
+  for (; raxis < KALDI_TENSOR_MAX_DIM; raxis++) {
+    // Check that all unused axes have dim=1, stride=0.
+    // Keeping them this way makes checks for broadcastability easier.
+    // We may later remove this requirement.
+    if (dims[raxis] != 1 || strides[raxis] != 0)
+      return false;
   }
 
   {
     // Now check for potential overlap.  We take all the axes with dim != 1 and
-    // sort them from greatest to least stride, and check that for each i,
-    // abs(strides[i]) >= dims[i+1] * abs(strides[i+1]).
-    std::pair<int32, int32> dims_abs_strides [KALDI_TENSOR_MAX_DIM];
+    // sort them from least to greatest stride, and check that for each i>0,
+    // abs(strides[i]) >= dims[i-1] * abs(strides[i-1]).
+    std::pair<int32, int32> abs_strides_and_dims[KALDI_TENSOR_MAX_DIM];
     int32 num_nontrivial_axes = 0;
+    // Each pair is (abs(stride), dim), in that order: std::sort() below uses
+    // the default ordering of std::pair, which compares 'first' before
+    // 'second', so abs(stride) must be stored in 'first'.
     for (int32 i = 0; i < num_axes; i++) {
-      if(dims[i] != 1) {
-        dims_abs_strides[num_nontrivial_axes].first = dims[i];
-        dims_abs_strides[num_nontrivial_axes].second = std::abs(strides[i]);
+      if (dims[i] != 1) {
+        abs_strides_and_dims[num_nontrivial_axes].first = std::abs(strides[i]);
+        abs_strides_and_dims[num_nontrivial_axes].second = dims[i];
         num_nontrivial_axes++;
       }
     }
-    // We want to sort on strides from greatest to least, so use '>' not
-    // '<' as the comparator.
-    std::sort(dims_abs_strides, dims_abs_strides + num_nontrivial_axes,
-              [](const std::pair<int32, int32> &a, const std::pair<int32, int32> &b) {
-                return a.second > b.second
-                    });
-    for (int32 i = 0; i < num_nontrivial_axes; i++) {
-      // if (abs(strides[i]) < dims[i+1] * abs(strides[i+1])) return false;
-      if (dims_abs_strides[i].second <
-          dims_abs_strides[i+1).first * dims_abs_strides[i+1).second)
+    // Sort on strides from least to greatest.
+    std::sort(abs_strides_and_dims, abs_strides_and_dims + num_nontrivial_axes);
+    for (int32 i = 1; i < num_nontrivial_axes; i++) {
+      // if (abs(strides[i]) < dims[i-1] * abs(strides[i-1])) return false;
+      if (abs_strides_and_dims[i].first <
+          abs_strides_and_dims[i-1].second * abs_strides_and_dims[i-1].first)
         return false;
     }
   }
-  return true;
+
+  if (check_code)
+    return code == ComputePatternCode(*this);
+  else
+    return true;
 }
 
 
+// MAY DELETE THIS.  It's not up to date anyway.
 void TensorPatternProperties::UpdateProperties(const TensorPattern &pattern) {
   KALDI_PARANOID_ASSERT(pattern.Check());
   int32 num_axes = pattern.num_axes;
diff --git a/src/tensor/tensor-pattern.h b/src/tensor/tensor-pattern.h
index 90c9f686c51..42d46203188 100644
--- a/src/tensor/tensor-pattern.h
+++ b/src/tensor/tensor-pattern.h
@@ -21,6 +21,7 @@
 #define KALDI_TENSOR_TENSOR_PATTERN_H_ 1
 
 #include "tensor/tensor-common.h"
+#include <limits>
 
 /**
    This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
@@ -30,116 +31,20 @@ namespace kaldi {
 namespace tensor {
 
 
-// This enum with one value is a trick to allow you to
-// emulate indexing schemes like, say, A[10:].
-// In C++ you'd do A(all,10).
-enum RangeEnum { all };
-
-/**
-   struct Range represents an integer or a range of integers (e.g. as used in
-   indexing).  It emulates Python's range().
-
-   There are various possibilities of what Range can contain, enumerated below.
-   Be careful: we use {a,b,c} to denote the actual class members, not the
-   arguments to constructors, which mimic the arguments of expressions with colons
-   in Python's indexing with ':'
-
-   For purposes of explanation I will assume we are indexing a 1-dimensional
-   array a, but this struct is also used for multi-dimensional indexing.
-
-   Examples are below (showing members {begin,end,step}, where inf means
-   std::numeric_limits<int64>::max()):
-
-
-   Literal contents     Python equivalent,     How obtained             Elements of array
-   of Range struct      indexing array a     using constructors           you would get
-
-    {0,inf,1}          a[:], a[0:]          Range(all), Range(0,all)    all of them
-
-    {0,10,2}           a[:10:2], a[0:10:2]   Range(0,10,2)             [0,2,4,8]
-
-    {0,-1,1}           a[:-1], a[0:-1]       Range(0,-1)                all but the last
-
-    {10,2,-1}          a[10:2:-1]           Range(10,2,-1)              [10,9,...3]
-
-    {inf,inf,-1}        a[::-1]             Range(all,all,-1)            all, reversed order
-
-    {-3,-2,1}          a[-3:-2]            Range(-3,-2)             third-from-last element only
-
-    {10,0,inf}         a[10]              10 (implicit; RangeExt constructor)    the 10th element, removing axis
-
-
-*/
-struct Range {
-  int32 begin;
-  int32 end;
-  int32 step;
-
-  static inline int32 inf() { return std::numeric_limits<int32>::max(); }
-
-  // The default constructor leaves the range undefined.
-  Range() { }
-
-  Range(RangeEnum): begin(0), end(inf()), step(1) { }
-
-  explicit Range(int32 end): begin(0), end(end), step(1) { }
-
-  Range(int32 begin, int32 end, int32 step = 1):
-      begin(begin), end(end), step(1) { }
-
-  Range(int32 begin, RangeEnum, int32 step = 1):
-      begin(begin), end(inf()), step(step) { }
-
-  Range(RangeEnum, int32 end, int32 step = 1):
-      begin(inf()), end(end), step(step) { }
-
-  Range(RangeEnum, RangeEnum, int32 step = 1):
-      begin(inf()), end(inf()), step(step) { }
-};
-
-/**
-  struct RangeExt is used in situations, such as indexing, where what we have
-  might be a Range (like, in numpy, indexing with something that has a colon);
-  or it might simply be an integer.  There are no new members.  The reason we
-  don't just make this an additional constructor of Range is that we want it
-  so if someone does Range(10) it is interpreted as 0 through 9, but if
-  you do just 10 it means the index 10.  You can't have an explicit and
-  implicit constructor taking the same type: hence this child class.
-
-  Note that numpy's A[1] is not the same as A[1:2] because the former returns a
-  tensor with one fewer axes.
-*/
-struct RangeExt: public Range {
-  RangeExt(Range r): Range(r) { }
-
-  // implicit
-  RangeExt(int32 index):
-      Range(index, 0, inf());
-};
-
-
-/**
-   This function, used in indexing operations, takes a Range that may have, say,
-   negative 'end' or end equal to Range::inf(), and turns it into actual
-   numbers with begin and end both in the range [0,dim].  So, for instance, if
-   the range had `end = -1`, it would be turned into `dim - 1`; or if `end` was
-   Range::inf(), it would be interpreted as `dim`.
-
-   Raises an exception the resulting range is empty.
- */
-void MakeRangeExplicit(int32 dim, Range *range);
-
 
 /*
   This struct stores the dimension and strides of a Tensor.
 
+   *SHIFTING TO THE RIGHT*
+
   The main thing to watch out for is that the dimensions of 'dims' and 'strides'
   to look at is not 0 ... num_axes, but KALDI_TENSOR_MAX_DIM - num_axes
   ... KALDI_TENSOR_MAX_DIM - 1.  The last dimension is always located at
   KALDI_TENSOR_MAX_DIM - 1, i.e. the dims and strides are always
   right-justified.  In addition, for unused axes, we always maintain dim=1 and
-  stride=0. This happens to be quite convenient due to the standard broadcasting
-  rules in things like PyTorch.
+  stride=0. This happens to be quite convenient for implementation if we adopt
+  the standard broadcasting rules in things like PyTorch, whereby the
+  highest-numbered axes always line up.
 
   Below we describe the the properties that a TensorPattern is required to have.
 
@@ -156,17 +61,19 @@ void MakeRangeExplicit(int32 dim, Range *range);
 
     0 <= num_axes <= KALDI_TENSOR_MAX_DIM.
 
-    for 0 <= i < KALDI_TENSOR_MAX_DIM
+    for 0 <= i < num_axes:
        dims[i] > 0
-       if i < KALDI_TENSOR_MAX_DIM - num_axes, then dims[i] = 1.
-       if dims[i] = 1, then strides[i] = 0.
+       if dims[i] == 1, then strides[i] = 0.
        if dims[i] != 1, then strides[i] != 0
 
+    for num_axes <= i < KALDI_TENSOR_MAX_DIM:
+       dims[i] == 1
+       strides[i] == 0
+
     ... plus the uniqueness property.
 
   Note: in the public interface of class Tensor, if you ask for
-  dim(i) it will return pattern.dims[KALDI_TENSOR_MAX_DIM - num_axes + i].
-
+  dim(i) it will return pattern.dims[num_axes - 1 - i].
 
   The uniqueness property requires that we must not be able to access the same
   memory location via two different tuples of indexes).  Recause testing this
@@ -178,19 +85,27 @@ void MakeRangeExplicit(int32 dim, Range *range);
 */
 struct TensorPattern {
   int32 num_axes;
-  int32 dims[KALDI_TENSOR_MAX_DIM];
-  int32 strides[KALDI_TENSOR_MAX_DIM];
-  int32 code;  // pattern code; see GetPatternCode() in tensor-pattern-utils.h
-               // for details.
-
-  // We may later add methods to this.
-
-  // Checks that the TensorPattern is valid, assuming it is part of a Tensor.
-  // I.e. that it satifies all the properties mentioned above.
-  // Returns true if valid, false if not valid.
-  bool Check();
+  int32 dims[KALDI_TENSOR_MAX_DIM];     // the dims in reversed order, indexed
+                                        // by 'raxis' (reversed axis)
+  int32 strides[KALDI_TENSOR_MAX_DIM];  // the strides in reversed order,
+                                        // indexed by 'raxis' (reversed axis)
+  int32 code;  // pattern code; see ComputePatternCode() in tensor-pattern-utils.h
+               // for details.  It is the responsibility of the user to keep
+               // this updated (i.e. don't change dims or strides without updating
+               // 'code').
+
+  // Returns true if the TensorPattern is valid, i.e. if it satisfies all the
+  // properties mentioned above.
+  //
+  //  @param [in] check_code   If true, the check includes verifying that the
+  //                        'code' has the value it should (c.f. ComputePatternCode()).
+  //  @return     Returns true if valid, false if not valid.
+  bool IsValid(bool check_code = true);
 };
 
+
+// We may later get rid of this struct and just have functions to get
+// these properties.
 struct TensorPatternProperties {
   // Below are cached properties that are derived from the underlying data in
   // struct TensorPattern.
@@ -228,4 +143,4 @@ struct TensorPatternProperties {
 }  // namespace kaldi
 
 
-#endif  // KALDI_TENSOR_TENSOR_COMMON_H_
+#endif  // KALDI_TENSOR_TENSOR_PATTERN_H_
diff --git a/src/tensor/tensor-utils.h b/src/tensor/tensor-utils.h
index 20abfabedb4..9e8547b7158 100644
--- a/src/tensor/tensor-utils.h
+++ b/src/tensor/tensor-utils.h
@@ -1,4 +1,4 @@
-// tensor/tensor.h
+// tensor/tensor-utils.h
 
 // Copyright      2019  Johns Hopkins University (author: Daniel Povey)
 
@@ -20,61 +20,81 @@
 #ifndef KALDI_TENSOR_TENSOR_UTILS_H_
 #define KALDI_TENSOR_TENSOR_UTILS_H_ 1
 
+
+#include "tensor/tensor-impl.h"
+#include "tensor/tensor-pattern-utils.h"
 #include "tensor/tensor.h"
 
 namespace kaldi {
 namespace tensor {
 
+/**  This function returns true if the dimensions of tensor patterns
+     a and b are broadcastable in the PyTorch sense.  What this means
+     for tensors with the same num-axes is that dims for axis i
+     must either be the same or one of them must be 1.  For tensors
+     with different num-axes we (conceptually) check this after
+     padding with leading (dim=1)'s; for
+     instance, dims=[2,8,3] and dims=[8,1] would be broadcastable because
+     the [8,1] would be interpreted as [1,8,1].  (The examples above
+     are in the public ordering, not the reversed ordering.)
+
+     If 'b_non_reducing' is true, then we do not allow any dim of
+     b to be 1 where the corresponding dim of a was not 1.
+ */
+inline bool Broadcastable(const Tensor &a, const Tensor &b,
+                          bool b_non_reducing = false) {
+  // Use the public const accessor: impl_ is a private member of Tensor.
+  return Broadcastable(a.Pattern(), b.Pattern(), b_non_reducing);
+}
+
+/**  This function returns true if the dimensions of Tensors
+     a, b and c are broadcastable in the PyTorch sense (meaning;
+     after padding their dims on the left with ones to make them
+     have the same num-axes, corresponding dimensions are either
+     identical or 1).  See the version of Broadcastable() above
+     for more information.
+
+       @param [in] a  The first Tensor
+       @param [in] b  The second Tensor
+       @param [in] c  The third Tensor
+       @param [in] c_non_reducing   If true, then we do not allow a dim of
+                      c to be 1 while corresponding dims of a or b
+                      are > 1.
+ */
+inline bool Broadcastable(const Tensor &a, const Tensor &b,
+                          const Tensor &c, bool c_non_reducing = false) {
+  // Use the public const accessor: impl_ is a private member of Tensor.
+  return Broadcastable(a.Pattern(), b.Pattern(), c.Pattern(), c_non_reducing);
+}
 
+/**
+   Returns true if the 'dims' vectors of a and b are the same.
+   Does not require the number of axes to be the same, so effectively
+   it's testing that the dims are the same after padding on the left
+   with dim=1 (here referring to the public, non-reversed numbering
+   of the dims).
 
-
-// Used in checking function arguments, this function will
-// crash and print a statck trace if Tensor a and b have different
-// Dtype() or different Device().
-void CheckDeviceAndDtype(const TensorImpl &a, const TensorImpl &b);
-
-// Used in checking function arguments, this function will
-// crash and print a statck trace if Tensor a, b and c have different
-// Dtype() or different Device().
-void CheckDeviceAndDtype(const TensorImpl &a, const TensorImpl &b, const TensorImpl &c);
-
+   This is a stronger condition than Broadcastable(a, b).
+ */
+inline bool SameDim(const Tensor &a, const Tensor &b) {
+  return SameDim(a.Pattern(), b.Pattern());  // impl_ is private to Tensor.
+}
 
 /**
-   This function allocates the appropriate storage for the Tensor described
-   in 'impl', and sets is 'data' pointer to the allocated memory address.
-   It returns the address a newly allocated Storage object which manages
-   the memory location; you will probably want to construct a
-   std::unique_ptr<Storage> from this so that when it goes out of scope,
-   the memory will be freed.
-
-      @param [in,out] impl   The TensorImpl object we are allocating for.
-                      Any previous value of impl->data is overwritten.
-                      It is required that that the product of dims in
-                      impl->pattern be nonzero (i.e. that the pattern
-                      is initialized to a valid value), and that its
-                      dtype and device values be set.
-      @return         Returns a newly allocated Storage object that
-                      manages this memory block.  When it is freed,
-                      the memory block will be deallocated using a
-                      method appropriate for the device.
-
-   This function throws on error.  See also AllocateTensorShared().  This
-   function is used by class Tensor, but also by various implementation
-   functions (called with TensorImpl) where we need to allocate temporaries.
-   We don't construct a full-fledged Tensor because we don't want the
-   overhead of managing any shared_ptr's.
+   Returns true if the 'dims' vectors of a, b and c are all the same.
+   Does not require the number of axes to be the same, so effectively
+   it's testing that the dims are the same after padding on the left
+   with dim=1 (here referring to the public, non-reversed numbering
+   of the dims).
+
+   This is a stronger condition than Broadcastable(a, b, c).
  */
-Storage *AllocateTensor(TensorImpl *impl);
+inline bool SameDim(const Tensor &a, const Tensor &b, const Tensor &c) {
+  // All three must agree; a == b and b == c implies a == c.
+  return SameDim(a.Pattern(), b.Pattern()) && SameDim(b.Pattern(), c.Pattern());
+}
 
 
-/**
-   This function is as AllocateTensor(), except that the Storage
-   object returned is allocated via std::make_shared (which involves
-   just one heap allocation, as opposed to two if you constructed
-   the shared_ptr from the Storage* pointer).  See the documentation
-   for AllocateTensor() for more details.
- */
-std::shared_ptr<Storage> AllocateTensorShared(TensorImpl *impl);
 
 
 }  // namespace tensor
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index bfafe022b95..f992964e82b 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -22,39 +22,12 @@
 
 #include "tensor/tensor-common.h"
 #include "tensor/tensor-pattern.h"
+#include "tensor/tensor-impl.h"
 #include "tensor/storage.h"
 
 namespace kaldi {
 namespace tensor {
 
-
-/**
-   TensorImpl is basically a Tensor without the shared_ptr to Storage
-   (which is expensive to pass around, because of the cost of atomics).
-   The Tensor contains it as a member, rather than as a pointer.
-
-   Most of our internal functions use TensorImpl rather than Tensor because
-   it is easier to manipulate, but you need to know what you are doing.
-*/
-struct TensorImpl {
-  TensorPattern pattern;
-  DataType dtype;
-  Device device;
-  void *data{nullptr};
-};
-
-// Metadata for a Tensor.  It's occasionally convenient to have this
-// in a struct.
-struct TensorMeta {
-  TensorPattern pattern;
-  DataType dtype;
-  Device device;
-  // Note: the offset is only used in some situations,
-  // it's
-  // We may turn this into an offset measured in elements.
-  int32 offset;
-};
-
 /**
    A Tensor is a multi-dimensional array (up to 5 dimensions) of types such as
    float or double (and eventually ints).  Multiple Tensors may point to data
@@ -76,68 +49,83 @@ class Tensor {
   // contexts, this is sometimes known as the rank of the tensor, or sometimes
   // even its dimension, but these terms are ambiguous so we avoid them, and use
   // the terms 'number of axes' or 'axis' throughout.
+  // Caution: the numbering of axes in the Tensor interface is different
+  // than in TensorImpl::pattern.  Here they are numbered from zero in the
+  // public order; in TensorImpl::pattern the dims and strides are stored in
+  // reversed order, so public axis i is at index NumAxes() - 1 - i.
   inline int32 NumAxes() const { return impl_.pattern.num_axes; }
 
-
   const TensorImpl &Impl() { return impl_; }
 
+  const TensorMeta &Meta() { return reinterpret_cast<TensorMeta&>(impl_); }
+
   // Return reference to the struct containing the dimension and
   // stride info.
   const TensorPattern &Pattern() const { return impl_.pattern; }
 
   // Return an array containing dimensions of the tensor; equivalent to
   // .shape in PyTorch.  Dims().size() will equal NumAxes().
-  // We limit each dimension to int32, because BLAS's interface uses int,
-  // which on many common 64-bit platforms is configured with 32 bits.
-  // However the product of dimensions may still be 64 bits.
-  inline ArrayRef<int32> Dims() const { return ArrayRef{impl_.pattern.num_axes, impl_.pattern_.dims}; }
-
-  // Returns the dimension on this axis, a number >= 1.  Result is
-  // undefined if axis < 0 or axis >= NumAxes().
-  inline int32 Dim(int32 axis) const { return impl_.pattern.dims[axis]; }
+  // This cannot return some kind of const reference because the
+  // dims are stored internally in reversed order.
+  std::vector<int32> Dims() const;
 
-  // Returns an array containing the strides of the tensor.
-  // Strides().size() will equal NumAxes().
-  inline ArrayRef<int32> Strides() const { return ArrayRef{impl_.pattern.num_axes, impl_.pattern.strides}; }
-
-  // Returns the stride on this axis.  Will be zero if the corresponding
-  // dimension is 1, and otherwise nonzero (but not necessarily positive).
-  inline int32 Stride(int32 axis) const { return impl_.pattern.strides[axis]; }
-
-  // Returns the number of elements in the Tensor; must be > 0.
-  inline int64 NumElements() const { return derived_.num_elements; }
+  // Return an array containing the strides of the tensor, in the same
+  // (public) axis order as Dims().  Strides().size() will equal NumAxes().
+  std::vector<int32> Strides() const;
+
+
+  // Returns the dimension on this axis (which will be >= 1).
+  // Requires 0 <= axis < NumAxes().  Dims are stored in reversed order.
+  inline int32 Dim(int32 axis) const {
+    KALDI_ASSERT(static_cast<uint32>(axis) <
+                 static_cast<uint32>(impl_.pattern.num_axes));
+    return impl_.pattern.dims[impl_.pattern.num_axes - 1 - axis];
+  }
+
+  // Returns the stride on this axis (zero if Dim(axis) == 1; otherwise
+  // nonzero, possibly negative).  Requires 0 <= axis < NumAxes().
+  inline int32 Stride(int32 axis) const {
+    KALDI_ASSERT(static_cast<uint32>(axis) <
+                 static_cast<uint32>(impl_.pattern.num_axes));
+    return impl_.pattern.strides[impl_.pattern.num_axes - 1 - axis];
+  }
+
+  // Returns the number of elements in the Tensor; will be > 0,
+  // and will equal the product of Dims().
+  int64 NumElements() const;
 
   // Returns true if the data forms a contiguous block in memory.
   // (not the same as 'contiguous()' in PyTorch, which also requires
-  // that the strides be 'C'-style.
-  inline bool IsContiguous() const { return derived_.is_contiguous; }
+  // that the strides be 'C'-style; for that, see HasCStrides().
+  bool IsContiguous() const;
 
   // Returns true if the strides for this array are what you would
   // expect if you were to construct a Tensor from this->Dims();
   // this means "C"-style strides, except that any axis with dimension=1
   // has its stride set to zero.  This is our equivalent of PyTorch's
   // contiguous().
-  inline bool HasCStrides() const { return derived_.has_c_strides; }
+  bool HasCStrides() const;
 
   // Return the data type.
   DataType Dtype() const { return dtype_; }
 
-  // Indexing operators.  All of these return Tensors which reference the same
-  // underlying data as the original Tensor.  We could have done this with just
-  // a single indexing operator taking 5 args of type RangeExt defaulting to
-  // `all`, but we provide separate versions for each num-args for efficiency.
-  // You can provide an int32 where RangeExt is expected; it will be
-  // converted to a special struct of type Range. See the documentation for type
-  // Range, and the table which it contains.  If a is a Tensor with 1 axis, a(0)
-  // will return a scalar Tensor (0 axes
-  //
-  // Any of these indexing operators can operate on Tensors with more axes;
-  // trailing axes will be left alone.
+  /**
+     Indexing operator taking one arg.  Returns a Tensor referencing
+     the same underlying data as this Tensor.
+
+
+     You can provide an int32 where RangeExt is expected; it will be
+     converted to a special struct of type Range. See the documentation for type
+     Range, and the table which it contains.  If a is a Tensor with 1 axis,
+     a(0) will return a scalar Tensor (0 axes).
+
+     Any of these indexing operators can operate on Tensors with more axes;
+     trailing axes will be left alone.  */
 
   // this operator () taking int32 is only provided in the one-arg case as a
   // convenience; in any case, RangeExt can be constructed from int32 with the
   // same effect.
-  Tensor operator () (int32 i0) const;
+
   Tensor operator () (RangeExt s0) const;
   Tensor operator () (RangeExt s0, RangeExt s1) const;
   Tensor operator () (RangeExt s0, RangeExt s1, RangeExt s2) const;
@@ -150,14 +138,11 @@ class Tensor {
                       RangeExt s3, RangeExt s4) const;
 
 
-  // For a scalar Tensor (NumAxes() == 0) returns the item, cast to
-  // float (if it was not already float); throws if NumAxes() > 0.
+  // For a Tensor with NumElements() == 1, returns the element, cast to float
   explicit operator float() const;
-  // For a scalar Tensor (NumAxes() == 0) returns the item, cast to
-  // double (if it was not already double); throws if NumAxes() > 0.
+  // For a Tensor with NumElements() == 1, returns the element, cast to double
   explicit operator double() const;
-  // For a scalar Tensor (NumAxes() == 0) returns the item, cast to
-  // int32 (if it was not already int32); throws if NumAxes() > 0.
+  // For a Tensor with NumElements() == 1, returns the element, cast to int32
   explicit operator int32() const;
 
   // For a Tensor storing floats, returns the data pointer cast to float;
@@ -187,24 +172,28 @@ class Tensor {
   // data.
   Tensor (const Tensor &other) = default;
 
-  // Move assignment.  Does not copy the data.
-  Tensor(Tensor &&other);
+  // Move constructor.  Takes over other's storage; does not copy the data.
+  Tensor(Tensor &&other): impl_(other.impl_) { storage_.swap(other.storage_); }
 
   /**
      Construct a new Tensor with freshly allocated underlying data with
-     the data type, device and dimension the same as `other`.
+     the data type, device and dimensions the same as `meta`.  The strides
+     will be in the same order as in `meta` if sp == kCopyStrideOrder.
 
-       @param [in]  other  The tensor that we are taking metadata from (we
-                    are not sharing its underlying data).
-       @param [in]  sp   The stride policy; if kCopyStrides then we use
+       @param [in]  meta  The metadata we are copying the dims, device,
+                       dtype and possibly strides from
+       @param [in]  sp   The stride policy; if kCopyStrideOrder then we use
                        strides with the same sign and size-order as
                        `other`, while filling in any gaps if `other`
                        was not contiguous, if kCstrides then we use
-                       "C" style strides for any dimensions != 1.
+                       "C" style strides, i.e. we ignore the stride
+                       order of the source.  (Of course, we set strides
+                       to zero for any axes with `dim=1`, as required by our
+                       framework).
        @param [in]  ip   The data initialization policy
   */
-  Tensor(const Tensor &other, StridePolicy sp, InitializePolicy ip);
-
+  Tensor(const TensorMeta &meta,
+         StridePolicy sp);
 
 
   /** Construct a Tensor with freshly allocated data.
@@ -212,11 +201,46 @@ class Tensor {
                     positive integers).
        @param [in] dtype   The data type to use
        @param [in] device  The device to put the data on
-       @param [in] set_zero   If true, set the tensor to zero.  If false,
-                        the contents will be undefined.
+
+       Example:  `Tensor a({3,4,5}, kDoubleDtype, kCpuDevice);`
    */
-  Tensor(ArrayRef<int32> dims, DataType dtype, Device device,
-         bool set_zero = false);
+  Tensor(ArrayRef<int32> dims, DataType dtype, Device device);
+
+  /** Construct a Tensor with freshly allocated data, and device ==
+      `GetDefaultDevice()`.
+
+       @param [in] dims    The dimensions of the tensor (zero to 5
+                    positive integers).
+       @param [in] dtype   The data type to use
+
+       Example:  `Tensor a({3,4,5}, kDoubleDtype);`
+   */
+  Tensor(ArrayRef<int32> dims, DataType dtype);
+
+  /** Construct a Tensor with freshly allocated data, and data type ==
+      `GetDefaultDtype()`.
+
+       @param [in] dims    The dimensions of the tensor (zero to 5
+                    positive integers).
+       @param [in] device  The device to put the data on
+
+       Example:  `Tensor a({3,4,5}, kCpuDevice);`
+   */
+  Tensor(ArrayRef<int32> dims, Device device);
+
+
+  /** Construct a Tensor with freshly allocated data, data type ==
+      `GetDefaultDtype()`, and device == `GetDefaultDevice()`.
+
+       @param [in] dims    The dimensions of the tensor (zero to 5
+                    positive integers).
+       (There are no dtype or device arguments; the defaults are used.)
+
+       Example:  `Tensor a({3,4,5});`
+   */
+  Tensor(ArrayRef<int32> dims);
+
+
 
   /**
      Construct a Tensor with the dimensions and strides provided.  This differs
@@ -263,19 +287,17 @@ class Tensor {
 
 
   /**
-     This constructor, which is intended for use primarily in internal
-     code and
+     This constructor takes the 'impl' and 'storage' provided and returns
+     a Tensor containing them.  Intended for special-purpose code such
+     as when we wrap arrays from external frameworks.
    */
-  Tensor(TensorPattern &pattern, DataType dtype, Device device,
-         void *data_);
+  Tensor(const TensorImpl &impl, std::shared_ptr<Storage> storage);
 
  private:
-  // The tensor dim and strides.
+  // This object contains the num-axes, dims, strides and data pointer, plus
+  // cached properties.
   TensorImpl impl_;
 
-  // The raw data pointer.  Will be cast to a pointer of the appropriate
-  // type before indexing.
-  void *data_;
 
   // The storage region where the data resides.  data_ does not necessarily
   // equal storage_->data; it may be more than that, e.g. if this is a view

From e36034e4fc848455a304c8019afb6f4bea918710 Mon Sep 17 00:00:00 2001
From: Hossein Hadian <hn.hadian@gmail.com>
Date: Wed, 3 Apr 2019 01:53:29 +0430
Subject: [PATCH 018/163] Implement the rest of transitions.cc + tests (#3198)

* Implement the rest of transitions.cc [except Check]

* Fix bug in topology.cc

* Fix bug in context-dep.cc which can affect when N_=2 and P_=0

* Fix some issues in transitions.cc

* Update hmm-test-utils.cc and transitions-test.cc
---
 src/hmm/hmm-test-utils.cc   | 102 +++++++++++++--------------
 src/hmm/topology.cc         |   1 +
 src/hmm/transitions-test.cc |   5 +-
 src/hmm/transitions.cc      | 136 ++++++++++++++++++++++++++++++------
 src/tree/context-dep.cc     |   2 +-
 5 files changed, 167 insertions(+), 79 deletions(-)

diff --git a/src/hmm/hmm-test-utils.cc b/src/hmm/hmm-test-utils.cc
index 5f00474219b..4323482cde4 100644
--- a/src/hmm/hmm-test-utils.cc
+++ b/src/hmm/hmm-test-utils.cc
@@ -40,7 +40,7 @@ Transitions *GenRandTransitionModel(ContextDependency **ctx_dep_out) {
 
   Topology topo = GenRandTopology(phones, num_pdf_classes);
 
-  Transitions *trans_model = new TransitionModel(*ctx_dep, topo);
+  Transitions *trans_model = new Transitions(*ctx_dep, topo);
 
   if (ctx_dep_out == NULL) delete ctx_dep;
   else *ctx_dep_out = ctx_dep;
@@ -59,22 +59,17 @@ Topology GetDefaultTopology(const std::vector<int32> &phones_in) {
   for (size_t i = 0; i < phones.size(); i++)
     topo_string << phones[i] << " ";
 
-  topo_string << "</ForPhones>\n"
-      "<State> 0 <PdfClass> 0\n"
-      "<Transition> 0 0.5\n"
-      "<Transition> 1 0.5\n"
-      "</State> \n"
-      "<State> 1 <PdfClass> 1 \n"
-      "<Transition> 1 0.5\n"
-      "<Transition> 2 0.5\n"
-      "</State>  \n"
-      " <State> 2 <PdfClass> 2\n"
-      " <Transition> 2 0.5\n"
-      " <Transition> 3 0.5\n"
-      " </State>   \n"
-      " <State> 3 </State>\n"
-      " </TopologyEntry>\n"
-      " </Topology>\n";
+  topo_string <<
+      "</ForPhones>\n"
+      "0  1  1  0.0\n"
+      "1  1  1  0.693\n"
+      "1  2  2  0.693\n"
+      "2  2  2  0.693\n"
+      "2  3  3  0.693\n"
+      "3  3  3  0.693\n"
+      "3  0.693\n\n"
+      "</TopologyEntry>\n"
+      "</Topology>\n";
 
   Topology topo;
   std::istringstream iss(topo_string.str());
@@ -112,7 +107,7 @@ Topology GenRandTopology(const std::vector<int32> &phones_in,
     const std::vector<int32> &phones = iter->second;
     for (size_t i = 0; i < phones.size(); i++)
       topo_string << phones[i] << " ";
-    topo_string << "</ForPhones> ";
+    topo_string << "</ForPhones>\n";
     bool ergodic = (RandInt(0, 1) == 0);
     if (ergodic) {
       // Note, this type of topology is not something we ever use in practice- it
@@ -120,7 +115,7 @@ Topology GenRandTopology(const std::vector<int32> &phones_in,
       // supported so we're testing it.
       std::vector<int32> state_to_pdf_class;
       state_to_pdf_class.push_back(-1);  // state zero, nonemitting.
-      for (int32 i = 0; i < this_num_pdf_classes; i++) {
+      for (int32 i = 1; i <= this_num_pdf_classes; i++) {
         int32 num_states = RandInt(1, 2);
         for (int32 j = 0; j < num_states; j++)
           state_to_pdf_class.push_back(i);
@@ -128,38 +123,32 @@ Topology GenRandTopology(const std::vector<int32> &phones_in,
       state_to_pdf_class.push_back(-1);  // final non-emitting state.
       { // state zero is nonemitting.  This is not something used in any current
         // example script.
-        topo_string << "<State> 0\n";
         BaseFloat prob = 1.0 / (state_to_pdf_class.size() - 2);
-        for (size_t i = 1; i + 1 < state_to_pdf_class.size(); i++) {
-          topo_string << "<Transition> " << i << ' ' << prob << '\n';
-        }
-        topo_string << "</State>\n";
+        for (size_t i = 1; i + 1 < state_to_pdf_class.size(); i++)
+          topo_string << "0 " << i << ' ' << state_to_pdf_class[i]
+                      << ' ' << -Log(prob) << '\n';
       }
       // ergodic part.
       for (size_t i = 1; i + 1 < state_to_pdf_class.size(); i++) {
         BaseFloat prob = 1.0 / (state_to_pdf_class.size() - 1);
-        topo_string << "<State> " << i << " <PdfClass> "
-                    << state_to_pdf_class[i] << '\n';
         for (size_t j = 1; j < state_to_pdf_class.size(); j++)
-          topo_string << "<Transition> " << j << ' ' << prob << '\n';
-        topo_string << "</State>\n";
+          topo_string << i << ' ' << j << ' '
+                      << state_to_pdf_class[i] << ' ' << -Log(prob) << '\n';
       }
       // final, nonemitting state.  No pdf-class, no transitions.
-      topo_string << "<State> " << (state_to_pdf_class.size() - 1) << " </State>\n";
+      topo_string << (state_to_pdf_class.size() - 1) << "\n\n";
     } else {
       // feedforward topology.
       int32 cur_state = 0;
-      for (int32 pdf_class = 0; pdf_class < this_num_pdf_classes; pdf_class++) {
+      for (int32 pdf_class = 1; pdf_class <= this_num_pdf_classes; pdf_class++) {
         int32 this_num_states = RandInt(1, 2);
         for (int32 s = 0; s < this_num_states; s++) {
-          topo_string << "<State> " << cur_state << " <PdfClass> " << pdf_class
-                      << "\n<Transition> " << cur_state << " 0.5\n<Transition> "
-                      << (cur_state + 1) << " 0.5\n</State>\n";
+          topo_string << cur_state << " " << (cur_state + 1) << " " << pdf_class << "\n";
           cur_state++;
         }
       }
       // final, non-emitting state.
-      topo_string << "<State> " << cur_state << " </State>\n";
+      topo_string << cur_state << "\n\n";
     }
     topo_string << "</TopologyEntry>\n";
   }
@@ -192,24 +181,24 @@ void GeneratePathThroughHmm(const Topology &topology,
                             int32 phone,
                             std::vector<std::pair<int32, int32> > *path) {
   path->clear();
-  const Topology::TopologyEntry &this_entry =
-      topology.TopologyForPhone(phone);
+  auto const &this_entry = topology.TopologyForPhone(phone); // an FST
   int32 cur_state = 0;  // start-state is always state zero.
-  int32 num_states = this_entry.size(), final_state = num_states - 1;
+  int32 num_states = this_entry.NumStates(), final_state = num_states - 1;
   KALDI_ASSERT(num_states > 1);  // there has to be a final nonemitting state
   // that's different from the start state.
   std::vector<std::pair<int32, int32> > pending_self_loops;
   while (cur_state != final_state) {
-    const Topology::HmmState &cur_hmm_state = this_entry[cur_state];
-    int32 num_transitions = cur_hmm_state.transitions.size(),
-        transition_index = RandInt(0, num_transitions - 1);
-    if (cur_hmm_state.forward_pdf_class != -1) {
-      std::pair<int32, int32> pr(cur_state, transition_index);
+    int32 num_transitions = this_entry.NumArcs(cur_state),
+        arc_index = RandInt(0, num_transitions - 1);
+    fst::ArcIterator<fst::StdVectorFst> aiter(this_entry, cur_state);
+    aiter.Seek(arc_index);
+    auto const &arc(aiter.Value());
+    if (arc.ilabel != -1) {
+      std::pair<int32, int32> pr(cur_state, arc_index);
       if (!reorder) {
         path->push_back(pr);
       } else {
-        bool is_self_loop = (cur_state ==
-                             cur_hmm_state.transitions[transition_index].first);
+        bool is_self_loop = (cur_state == arc.nextstate);
         if (is_self_loop) { // save these up, we'll put them after the forward
                             // transition.
           pending_self_loops.push_back(pr);
@@ -223,7 +212,7 @@ void GeneratePathThroughHmm(const Topology &topology,
         }
       }
     }
-    cur_state = cur_hmm_state.transitions[transition_index].first;
+    cur_state = arc.nextstate;
   }
   KALDI_ASSERT(pending_self_loops.empty());
 }
@@ -253,21 +242,26 @@ void GenerateRandomAlignment(const ContextDependencyInterface &ctx_dep,
     int32 phone = phone_sequence[i];
     GeneratePathThroughHmm(trans_model.GetTopo(), reorder, phone, &path);
     for (size_t k = 0; k < path.size(); k++) {
-      const Topology::TopologyEntry &entry =
-          trans_model.GetTopo().TopologyForPhone(phone);
+      auto const &entry = trans_model.GetTopo().TopologyForPhone(phone);
       int32 hmm_state = path[k].first,
-          transition_index = path[k].second,
-          forward_pdf_class = entry[hmm_state].forward_pdf_class,
-          self_loop_pdf_class = entry[hmm_state].self_loop_pdf_class,
+          arc_index = path[k].second,
           forward_pdf_id, self_loop_pdf_id;
+      fst::ArcIterator<fst::StdVectorFst> aiter(entry, hmm_state);
+      aiter.Seek(arc_index);
+      auto const &arc(aiter.Value());
+      int32 forward_pdf_class = arc.ilabel,
+          self_loop_pdf_class = -1;
+      for (fst::ArcIterator<fst::StdVectorFst> aiter_next(entry, arc.nextstate);
+           !aiter_next.Done(); aiter_next.Next())
+        if (aiter_next.Value().nextstate == arc.nextstate)
+          self_loop_pdf_class = aiter_next.Value().ilabel;
+
       bool ans = ctx_dep.Compute(context_window, forward_pdf_class, &forward_pdf_id);
       KALDI_ASSERT(ans && "context-dependency computation failed.");
       ans = ctx_dep.Compute(context_window, self_loop_pdf_class, &self_loop_pdf_id);
       KALDI_ASSERT(ans && "context-dependency computation failed.");
-      int32 transition_state = trans_model.TupleToTransitionState(
-                               phone, hmm_state, forward_pdf_id, self_loop_pdf_id),
-          transition_id = trans_model.PairToTransitionId(transition_state,
-                                                         transition_index);
+      int32 transition_id = trans_model.TupleToTransitionId(phone, hmm_state, arc_index,
+                                                            forward_pdf_id, self_loop_pdf_id);
       alignment->push_back(transition_id);
     }
   }
diff --git a/src/hmm/topology.cc b/src/hmm/topology.cc
index 6d2576edc52..df11b098361 100644
--- a/src/hmm/topology.cc
+++ b/src/hmm/topology.cc
@@ -95,6 +95,7 @@ void Topology::Read(std::istream &is, bool binary) {
       ReadFstKaldi(is, binary, &fst);
       entries_.push_back(fst);
     }
+    ExpectToken(is, binary, "</Topology>");
   }
   Check();
 }
diff --git a/src/hmm/transitions-test.cc b/src/hmm/transitions-test.cc
index 9b9d7099801..a66c563c76a 100644
--- a/src/hmm/transitions-test.cc
+++ b/src/hmm/transitions-test.cc
@@ -24,9 +24,7 @@ namespace kaldi {
 
 
 void TestTransitions() {
-
   Transitions *trans_model = GenRandTransitionModel(NULL);
-
   bool binary = (rand() % 2 == 0);
 
   std::ostringstream os;
@@ -41,7 +39,7 @@ void TestTransitions() {
     trans_model->Write(os1, false);
     trans_model2.Write(os2, false);
     KALDI_ASSERT(os1.str() == os2.str());
-    KALDI_ASSERT(trans_model->Compatible(trans_model2));
+    KALDI_ASSERT(*trans_model == trans_model2);
   }
   delete trans_model;
 }
@@ -53,4 +51,3 @@ int main() {
     kaldi::TestTransitions();
   KALDI_LOG << "Test OK.\n";
 }
-
diff --git a/src/hmm/transitions.cc b/src/hmm/transitions.cc
index c2fd5210f17..e802194be52 100644
--- a/src/hmm/transitions.cc
+++ b/src/hmm/transitions.cc
@@ -40,17 +40,21 @@ void Transitions::ComputeInfo(const ContextDependencyInterface &ctx_dep) {
   KALDI_ASSERT(!phones.empty());
 
   // pdf_info is a set of lists indexed by phone. Each list is indexed by
-  // (pdf-class, self-loop pdf-class) of each state of that phone, and the element
+  // (pdf-class, self-loop pdf-class) of each arc of that phone, and the element
   // is a list of possible (pdf, self-loop pdf) pairs that that (pdf-class, self-loop pdf-class)
   // pair generates.
   std::vector<std::vector<std::vector<std::pair<int32, int32> > > > pdf_info;
   // pdf_class_pairs is a set of lists indexed by phone. Each list stores
-  // (pdf-class, self-loop pdf-class) of each state of that phone.
+  // all unique (pdf-class, self-loop pdf-class) pairs that that phone
+  // can have (on its arcs).
   std::vector<std::vector<std::pair<int32, int32> > > pdf_class_pairs;
   pdf_class_pairs.resize(1 + *std::max_element(phones.begin(), phones.end()));
-
-  std::vector<std::map<std::pair<int32, int32>, std::vector<std::pair<int32, int32> > > > to_topo_state_list;
-  to_topo_state_list.resize(1 + *std::max_element(phones.begin(), phones.end()));
+  // to_arc_list is a list indexed by phone. For each phone, it has a map which
+  // maps a possible pdf class pair (pdf-class, self-loop pdf-class) to all
+  // the arcs in that phone that match that pdf class pair. An arc is represented
+  // as a (topo-state, arc-index) pair.
+  std::vector<std::map<std::pair<int32, int32>, std::vector<std::pair<int32, int32> > > > to_arc_list;
+  to_arc_list.resize(1 + *std::max_element(phones.begin(), phones.end()));
 
   for (size_t i = 0; i < phones.size(); i++) {
     int32 phone = phones[i];
@@ -67,21 +71,23 @@ void Transitions::ComputeInfo(const ContextDependencyInterface &ctx_dep) {
         }
       }
 
-    std::map<std::pair<int32, int32>, std::vector<std::pair<int32, int32> > > phone_to_topo_state_list;
+    std::map<std::pair<int32, int32>, std::vector<std::pair<int32, int32> > > phone_to_arc_list;
     for (StateId state = 0; state < num_states; ++state) {
       for (fst::ArcIterator<fst::StdVectorFst> aiter(entry, state);
            !aiter.Done(); aiter.Next()) {
         const fst::StdArc &arc(aiter.Value());
-        int32 forward_pdf_class = arc.ilabel,
+        int32 forward_pdf_class = arc.ilabel - 1,  // context-dep assumes classes are zero-based.
             self_loop_pdf_class = state_to_self_loop_pdf_class[arc.nextstate];
-        pdf_class_pairs[phone].push_back(std::make_pair(forward_pdf_class,
-                                                        self_loop_pdf_class));
+        if (self_loop_pdf_class != -1)
+          self_loop_pdf_class--;
         auto state_arc_pair = std::make_pair(state, aiter.Position());
         auto pdf_class_pair = std::make_pair(forward_pdf_class, self_loop_pdf_class);
-        phone_to_topo_state_list[pdf_class_pair].push_back(state_arc_pair);
+        phone_to_arc_list[pdf_class_pair].push_back(state_arc_pair);
       }
     }
-    to_topo_state_list[phone] = phone_to_topo_state_list;
+    for (auto const &pdf_class_to_arc: phone_to_arc_list)
+      pdf_class_pairs[phone].push_back(pdf_class_to_arc.first);
+    to_arc_list[phone] = phone_to_arc_list;
   }
   ctx_dep.GetPdfInfo(phones, pdf_class_pairs, &pdf_info);
 
@@ -89,18 +95,20 @@ void Transitions::ComputeInfo(const ContextDependencyInterface &ctx_dep) {
 
   for (int32 i = 0; i < phones.size(); i++) {
     int32 phone = phones[i];
-    for (int32 j = 0; j < static_cast<int32>(pdf_info[phone].size()); j++) {
+    for (int32 j = 0; j < static_cast<int32>(pdf_info[phone].size()); j++) {  // loop on pdf-class pairs
       int32 pdf_class = pdf_class_pairs[phone][j].first,
             self_loop_pdf_class = pdf_class_pairs[phone][j].second;
       auto const &state_arc_vec =
-              to_topo_state_list[phone][std::make_pair(pdf_class, self_loop_pdf_class)];
+              to_arc_list[phone][std::make_pair(pdf_class, self_loop_pdf_class)];
       KALDI_ASSERT(!state_arc_vec.empty());
-      for (auto const& state_arc_pair: state_arc_vec) {
+      for (auto const& state_arc_pair: state_arc_vec) {  // loop on all arcs matching this pdf-class pair
         int32 topo_state = state_arc_pair.first,
             arc_index = state_arc_pair.second;
-        for (size_t m = 0; m < pdf_info[phone][j].size(); m++) {
+        for (size_t m = 0; m < pdf_info[phone][j].size(); m++) {  // loop on all pdf pairs for this pdf-class pair
           int32 pdf = pdf_info[phone][j][m].first,
             self_loop_pdf = pdf_info[phone][j][m].second;
+          if (self_loop_pdf_class == -1)
+            self_loop_pdf = -1;
           TransitionIdInfo tuple{.phone = phone, .topo_state = topo_state,
                 .arc_index = arc_index, .pdf_id = pdf, .self_loop_pdf_id = self_loop_pdf};
           info_.push_back(tuple);
@@ -127,18 +135,29 @@ void Transitions::ComputeDerived() {
     transition.transition_cost = arc.weight.Value();
     if (transition.self_loop_pdf_id == -1)
       transition.self_loop_transition_id = -1;
-    else
+    else {
+      // Find the self-loop of the destination state:
+      int32 arc_index = -1;
+      for (fst::ArcIterator<fst::StdVectorFst> aiter_next(entry, arc.nextstate);
+           !aiter_next.Done(); aiter_next.Next())
+        if (aiter_next.Value().nextstate == arc.nextstate) {  // Found the self-loop
+          arc_index = aiter_next.Position();
+          break;
+        }
+      KALDI_ASSERT(arc_index != -1);
       transition.self_loop_transition_id =
-          TupleToTransitionId(transition.phone, transition.topo_state,
-                              transition.arc_index, transition.self_loop_pdf_id,
+          TupleToTransitionId(transition.phone, arc.nextstate,
+                              arc_index, transition.self_loop_pdf_id,
                               transition.self_loop_pdf_id);
+    }
 
     pdf_ids_[tid] = transition.pdf_id;
   }
 }
 
 Transitions::Transitions(const ContextDependencyInterface &ctx_dep,
-                             const Topology &topo): topo_(topo) {
+                         const Topology &topo): topo_(topo),
+                                                num_pdfs_(ctx_dep.NumPdfs()) {
   // First thing is to get all possible tuples.
   ComputeInfo(ctx_dep);
   ComputeDerived();
@@ -175,6 +194,7 @@ void Transitions::Read(std::istream &is, bool binary) {
     ReadBasicType(is, binary, &(info_[i].self_loop_pdf_id));
   }
   ExpectToken(is, binary, "</Info>");
+  ReadBasicType(is, binary, &num_pdfs_);
   ExpectToken(is, binary, "</Transitions>");
   ComputeDerived();
   Check();
@@ -197,6 +217,7 @@ void Transitions::Write(std::ostream &os, bool binary) const {
   }
   WriteToken(os, binary, "</Info>");
   if (!binary) os << "\n";
+  WriteBasicType(os, binary, num_pdfs_);
   WriteToken(os, binary, "</Transitions>");
   if (!binary) os << "\n";
 }
@@ -204,21 +225,96 @@ void Transitions::Write(std::ostream &os, bool binary) const {
 void Transitions::Check() const {
 
 }
-
+const Transitions::TransitionIdInfo&
+Transitions::InfoForTransitionId(int32 transition_id) const {  // bounds-checked read access to info_
+  KALDI_ASSERT(transition_id > 0 && transition_id < info_.size());  // transition-id 0 is rejected
+  return info_[transition_id];
+}
 void Transitions::Print(std::ostream &os,
                             const std::vector<std::string> &phone_names,
                             const Vector<double> *occs) {
+  if (occs != NULL)  // 'occs' is optional; when given it must have one entry per pdf.
+    KALDI_ASSERT(occs->Dim() == NumPdfs());
+  for (int32 tid = 1; tid <= NumTransitionIds(); tid++) {  // writes one text line per transition-id to 'os'
+    auto const &transition = info_[tid];
+    KALDI_ASSERT(static_cast<size_t>(transition.phone) < phone_names.size());
+    std::string phone_name = phone_names[transition.phone];  // phone_names is indexed by integer phone-id
+
+    os << "Transition-id " << tid << ": phone = " << phone_name
+       << " topo-state = " << transition.topo_state
+       << " arc-index = " << transition.arc_index
+       << " forward-pdf = " << transition.pdf_id << " self-loop-pdf = "
+       << transition.self_loop_pdf_id
+       << " p = " << transition.transition_cost;  // NOTE(review): labelled "p" but prints transition_cost -- confirm intended
+    if (occs != NULL) {
+      if (transition.is_self_loop)  // self-loops report the count of the self-loop pdf
+        os << " count of pdf = " << (*occs)(transition.self_loop_pdf_id);
+      else  // all other transitions report the count of the forward pdf
+        os << " count of pdf = " << (*occs)(transition.pdf_id);
+    }
+    if (transition.is_self_loop) os << " [self-loop]\n";
+    else {
+      auto const &entry = topo_.TopologyForPhone(transition.phone);  // an FST
+      fst::ArcIterator<fst::StdVectorFst> aiter(entry, transition.topo_state);
+      aiter.Seek(transition.arc_index);  // position the iterator on this transition's arc
+      auto const &arc(aiter.Value());
+      os << " [" << transition.topo_state << " -> " << arc.nextstate << "]\n";  // source -> destination topo-state
+    }
+  }
 }
 
 bool GetPdfsForPhones(const Transitions &trans_model,
                       const std::vector<int32> &phones,
                       std::vector<int32> *pdfs) {
+  KALDI_ASSERT(pdfs != NULL && IsSortedAndUniq(phones));
+  pdfs->clear();
+  for (int32 tid = 1; tid <= trans_model.NumTransitionIds(); tid++) {
+    auto const &transition = trans_model.InfoForTransitionId(tid);
+    if (std::binary_search(phones.begin(), phones.end(), transition.phone)) {
+      pdfs->push_back(transition.pdf_id);
+      // self_loop_pdf_id may be -1 (ComputeInfo sets it so when the self-loop
+      // pdf-class is -1); that is not a real pdf-id, so keep it out of *pdfs.
+      if (transition.self_loop_pdf_id != -1)
+        pdfs->push_back(transition.self_loop_pdf_id);
+    }
+  }
+  SortAndUniq(pdfs);
+  // Return false if any collected pdf is also used by a phone not in 'phones'.
+  for (int32 tid = 1; tid <= trans_model.NumTransitionIds(); tid++) {
+    auto const &transition = trans_model.InfoForTransitionId(tid);
+    if (!std::binary_search(phones.begin(), phones.end(), transition.phone) &&
+        (std::binary_search(pdfs->begin(), pdfs->end(), transition.pdf_id) ||
+         std::binary_search(pdfs->begin(), pdfs->end(),
+                            transition.self_loop_pdf_id)))
+      return false;
+  }
   return true;
 }
 
 bool GetPhonesForPdfs(const Transitions &trans_model,
                      const std::vector<int32> &pdfs,
                      std::vector<int32> *phones) {
+  KALDI_ASSERT(phones != NULL && IsSortedAndUniq(pdfs));
+  phones->clear();
+  for (int32 tid = 1; tid <= trans_model.NumTransitionIds(); tid++) {
+    auto const &transition = trans_model.InfoForTransitionId(tid);
+    if (std::binary_search(pdfs.begin(), pdfs.end(), transition.pdf_id) ||
+        std::binary_search(pdfs.begin(), pdfs.end(), transition.self_loop_pdf_id))
+      phones->push_back(transition.phone);
+  }
+  SortAndUniq(phones);
+  // Return false if a selected phone also uses a pdf outside 'pdfs'.  A
+  // self_loop_pdf_id of -1 is not a real pdf-id, so it cannot violate this.
+  for (int32 tid = 1; tid <= trans_model.NumTransitionIds(); tid++) {
+    auto const &transition = trans_model.InfoForTransitionId(tid);
+    if (std::binary_search(phones->begin(), phones->end(), transition.phone)
+        && !(std::binary_search(pdfs.begin(), pdfs.end(),
+                                transition.pdf_id) &&
+             (transition.self_loop_pdf_id == -1 ||
+              std::binary_search(pdfs.begin(), pdfs.end(),
+                                 transition.self_loop_pdf_id))))
+      return false;
+  }
   return true;
 }
 
diff --git a/src/tree/context-dep.cc b/src/tree/context-dep.cc
index 5583717633c..7562abf29e2 100644
--- a/src/tree/context-dep.cc
+++ b/src/tree/context-dep.cc
@@ -218,7 +218,7 @@ void ContextDependency::EnumeratePairs(
     // Choose 'position' as a phone position in 'context' that's currently
     // -1, and that is as close as possible to the central position P.
     int32 position = 0;
-    int32 min_dist = N_ - 1;
+    int32 min_dist = N_;
     for (int32 i = 0; i < N_; i++) {
       int32 dist = (P_ - i > 0) ? (P_ - i) : (i - P_);
       if (phone_window[i] == -1 && dist < min_dist) {

From f8adced6ed0f68e95d5528ea8313468ddc9bdfa7 Mon Sep 17 00:00:00 2001
From: Daniel Galvez <galv@users.noreply.github.com>
Date: Wed, 3 Apr 2019 09:02:43 -0700
Subject: [PATCH 019/163] Remove lingering HTK support fully. (#3201)

It was causing various test failures.

Make wave-reading tests recognize that waves are now scaled to [-1, 1].
---
 src/feat/feature-fbank-test.cc          | 383 ------------------
 src/feat/feature-mfcc-test.cc           | 496 +-----------------------
 src/feat/feature-sdc-test.cc            |   2 -
 src/feat/feature-window.cc              |   6 +-
 src/feat/online-feature-test.cc         |  52 +--
 src/feat/online-feature.h               |   6 +
 src/feat/test_data/README               |  22 +-
 src/feat/test_data/fbank1.conf          |  26 --
 src/feat/test_data/fbank2.conf          |  26 --
 src/feat/test_data/fbank3.conf          |  26 --
 src/feat/test_data/fbank4.conf          |  26 --
 src/feat/test_data/hcopy1.conf          |  28 --
 src/feat/test_data/hcopy2.conf          |  24 --
 src/feat/test_data/hcopy3.conf          |  24 --
 src/feat/test_data/hcopy4.conf          |  24 --
 src/feat/test_data/hcopy5.conf          |  27 --
 src/feat/test_data/hcopy6.conf          |  27 --
 src/feat/test_data/plp1.conf            |  23 --
 src/feat/test_data/test.wav.fbank_htk.1 | Bin 13076 -> 0 bytes
 src/feat/test_data/test.wav.fbank_htk.2 | Bin 13076 -> 0 bytes
 src/feat/test_data/test.wav.fbank_htk.3 | Bin 13076 -> 0 bytes
 src/feat/test_data/test.wav.fbank_htk.4 | Bin 13076 -> 0 bytes
 src/feat/test_data/test.wav.fea_htk.1   | Bin 22164 -> 0 bytes
 src/feat/test_data/test.wav.fea_htk.2   | Bin 22164 -> 0 bytes
 src/feat/test_data/test.wav.fea_htk.3   | Bin 22164 -> 0 bytes
 src/feat/test_data/test.wav.fea_htk.4   | Bin 22164 -> 0 bytes
 src/feat/test_data/test.wav.fea_htk.5   | Bin 22164 -> 0 bytes
 src/feat/test_data/test.wav.fea_htk.6   | Bin 22164 -> 0 bytes
 src/feat/test_data/test.wav.plp_htk.1   | Bin 22164 -> 0 bytes
 src/feat/wave-reader-test.cc            |   7 +
 src/hmm/tree-accu.cc                    |   2 +-
 src/transform/Makefile                  |   2 +-
 32 files changed, 29 insertions(+), 1230 deletions(-)
 delete mode 100644 src/feat/test_data/fbank1.conf
 delete mode 100644 src/feat/test_data/fbank2.conf
 delete mode 100644 src/feat/test_data/fbank3.conf
 delete mode 100644 src/feat/test_data/fbank4.conf
 delete mode 100644 src/feat/test_data/hcopy1.conf
 delete mode 100644 src/feat/test_data/hcopy2.conf
 delete mode 100644 src/feat/test_data/hcopy3.conf
 delete mode 100644 src/feat/test_data/hcopy4.conf
 delete mode 100644 src/feat/test_data/hcopy5.conf
 delete mode 100644 src/feat/test_data/hcopy6.conf
 delete mode 100644 src/feat/test_data/plp1.conf
 delete mode 100644 src/feat/test_data/test.wav.fbank_htk.1
 delete mode 100644 src/feat/test_data/test.wav.fbank_htk.2
 delete mode 100644 src/feat/test_data/test.wav.fbank_htk.3
 delete mode 100644 src/feat/test_data/test.wav.fbank_htk.4
 delete mode 100644 src/feat/test_data/test.wav.fea_htk.1
 delete mode 100644 src/feat/test_data/test.wav.fea_htk.2
 delete mode 100644 src/feat/test_data/test.wav.fea_htk.3
 delete mode 100644 src/feat/test_data/test.wav.fea_htk.4
 delete mode 100644 src/feat/test_data/test.wav.fea_htk.5
 delete mode 100644 src/feat/test_data/test.wav.fea_htk.6
 delete mode 100644 src/feat/test_data/test.wav.plp_htk.1

diff --git a/src/feat/feature-fbank-test.cc b/src/feat/feature-fbank-test.cc
index 47b7b1c4244..39fc5a2906d 100644
--- a/src/feat/feature-fbank-test.cc
+++ b/src/feat/feature-fbank-test.cc
@@ -29,48 +29,6 @@ using namespace kaldi;
 
 
 
-static void UnitTestReadWave() {
-
-  std::cout << "=== UnitTestReadWave() ===\n";
-
-  Vector<BaseFloat> v, v2;
-
-  std::cout << "<<<=== Reading waveform\n";
-
-  {
-    std::ifstream is("test_data/test.wav", std::ios_base::binary);
-    WaveData wave;
-    wave.Read(is);
-    const Matrix<BaseFloat> data(wave.Data());
-    KALDI_ASSERT(data.NumRows() == 1);
-    v.Resize(data.NumCols());
-    v.CopyFromVec(data.Row(0));
-  }
-
-  std::cout << "<<<=== Reading Vector<BaseFloat> waveform, prepared by matlab\n";
-  std::ifstream input(
-    "test_data/test_matlab.ascii"
-  );
-  KALDI_ASSERT(input.good());
-  v2.Read(input, false);
-  input.close();
-
-  std::cout << "<<<=== Comparing freshly read waveform to 'libsndfile' waveform\n";
-  KALDI_ASSERT(v.Dim() == v2.Dim());
-  for (int32 i = 0; i < v.Dim(); i++) {
-    KALDI_ASSERT(v(i) == v2(i));
-  }
-  std::cout << "<<<=== Comparing done\n";
-
-  // std::cout << "== The Waveform Samples == \n";
-  // std::cout << v;
-
-  std::cout << "Test passed :)\n\n";
-
-}
-
-
-
 /**
  */
 static void UnitTestSimple() {
@@ -88,13 +46,10 @@ static void UnitTestSimple() {
   // the parametrization object
   FbankOptions op;
   // trying to have same opts as baseline.
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.preemph_coeff = 0.0;
   op.frame_opts.window_type = "rectangular";
   op.frame_opts.remove_dc_offset = false;
   op.frame_opts.round_to_power_of_two = true;
   op.mel_opts.low_freq = 0.0;
-  op.htk_compat = true;
   op.use_energy = true;
 
   Fbank fbank(op);
@@ -109,347 +64,9 @@ static void UnitTestSimple() {
 }
 
 
-static void UnitTestHTKCompare1() {
-  std::cout << "=== UnitTestHTKCompare1() ===\n";
-
-  std::ifstream is("test_data/test.wav", std::ios_base::binary);
-  WaveData wave;
-  wave.Read(is);
-  KALDI_ASSERT(wave.Data().NumRows() == 1);
-  SubVector<BaseFloat> waveform(wave.Data(), 0);
-
-  // read the HTK features
-  Matrix<BaseFloat> htk_features;
-  {
-    std::ifstream is("test_data/test.wav.fbank_htk.1",
-                     std::ios::in | std::ios_base::binary);
-    bool ans = ReadHtk(is, &htk_features, 0);
-    KALDI_ASSERT(ans);
-  }
-
-  // use fbank with default configuration...
-  FbankOptions op;
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.preemph_coeff = 0.0;
-  op.frame_opts.window_type = "hamming";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
-  op.mel_opts.low_freq = 0.0;
-  op.htk_compat = true;
-  op.mel_opts.htk_mode = true;
-  op.use_energy = false;  // C0 not energy.
-
-  Fbank fbank(op);
-
-  // calculate kaldi features
-  Matrix<BaseFloat> kaldi_features;
-  fbank.Compute(waveform, 1.0, &kaldi_features);
-
-
-  std::cout << "<<<=== Compare with HTK features...\n";
-  // compare the results
-  bool passed = true;
-  int32 i_old = -1;
-  KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows());
-  KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols());
-  // Ignore ends-- we make slightly different choices than
-  // HTK about how to treat the deltas at the ends.
-  for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) {
-    for (int32 j = 0; j < kaldi_features.NumCols(); j++) {
-      BaseFloat a = kaldi_features(i, j), b = htk_features(i, j);
-      /// THE FEATURES ARE ALMOST IDENTICAL WITH HTK!!! (SEE THE TOLERANCE!)
-      if ((std::abs(b - a)) > 0.001) {  //<< TOLERANCE TO DIFFERENCES!!!!!
-        // print the non-matching data only once per-line
-        if (i_old != i) {
-          std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n";
-          std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n";
-          i_old = i;
-        }
-        // print indices of non-matching cells
-        std::cout << "[" << i << ", " << j << "]";
-        passed = false;
-  }}}
-  if (!passed) KALDI_ERR << "Test failed";
-
-  // write the htk features for later inspection
-  HtkHeader header = {
-    kaldi_features.NumRows(),
-    100000,  // 10ms
-    static_cast<int16>(sizeof(float)*kaldi_features.NumCols()),
-    000007  // FBANK
-  };
-  {
-    std::ofstream os("tmp.test.wav.fbank_kaldi.1",
-                     std::ios::out|std::ios::binary);
-    WriteHtk(os, kaldi_features, header);
-  }
-
-  std::cout << "Test passed :)\n\n";
-  
-  unlink("tmp.test.wav.fbank_kaldi.1");
-}
-
-
-static void UnitTestHTKCompare2() {
-  std::cout << "=== UnitTestHTKCompare2() ===\n";
-
-  std::ifstream is("test_data/test.wav", std::ios_base::binary);
-  WaveData wave;
-  wave.Read(is);
-  KALDI_ASSERT(wave.Data().NumRows() == 1);
-  SubVector<BaseFloat> waveform(wave.Data(), 0);
-
-  // read the HTK features
-  Matrix<BaseFloat> htk_features;
-  {
-    std::ifstream is("test_data/test.wav.fbank_htk.2",
-                     std::ios::in | std::ios_base::binary);
-    bool ans = ReadHtk(is, &htk_features, 0);
-    KALDI_ASSERT(ans);
-  }
-
-  // use fbank with default configuration...
-  FbankOptions op;
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.preemph_coeff = 0.0;
-  op.frame_opts.window_type = "hamming";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
-  op.mel_opts.low_freq = 25.0;
-  op.htk_compat = true;
-  op.mel_opts.htk_mode = true;
-  op.use_energy = false;  // C0 not energy.
-
-  Fbank fbank(op);
-
-  // calculate kaldi features
-  Matrix<BaseFloat> kaldi_features;
-  fbank.Compute(waveform, 1.0, &kaldi_features);
-
-
-  std::cout << "<<<=== Compare with HTK features...\n";
-  // compare the results
-  bool passed = true;
-  int32 i_old = -1;
-  KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows());
-  KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols());
-  // Ignore ends-- we make slightly different choices than
-  // HTK about how to treat the deltas at the ends.
-  for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) {
-    for (int32 j = 0; j < kaldi_features.NumCols(); j++) {
-      BaseFloat a = kaldi_features(i, j), b = htk_features(i, j);
-      /// THE FEATURES ARE ALMOST IDENTICAL WITH HTK!!! (SEE THE TOLERANCE!)
-      if ((std::abs(b - a)) > 0.001) {  //<< TOLERANCE TO DIFFERENCES!!!!!
-        // print the non-matching data only once per-line
-        if (i_old != i) {
-          std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n";
-          std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n";
-          i_old = i;
-        }
-        // print indices of non-matching cells
-        std::cout << "[" << i << ", " << j << "]";
-        passed = false;
-  }}}
-  if (!passed) KALDI_ERR << "Test failed";
-
-  // write the htk features for later inspection
-  HtkHeader header = {
-    kaldi_features.NumRows(),
-    100000,  // 10ms
-    static_cast<int16>(sizeof(float)*kaldi_features.NumCols()),
-    000007  // FBANK
-  };
-  {
-    std::ofstream os("tmp.test.wav.fbank_kaldi.1",
-                     std::ios::out|std::ios::binary);
-    WriteHtk(os, kaldi_features, header);
-  }
-
-  std::cout << "Test passed :)\n\n";
-  
-  unlink("tmp.test.wav.fbank_kaldi.1");
-}
-
-static void UnitTestHTKCompare3() {
-  std::cout << "=== UnitTestHTKCompare3() ===\n";
-
-  std::ifstream is("test_data/test.wav", std::ios_base::binary);
-  WaveData wave;
-  wave.Read(is);
-  KALDI_ASSERT(wave.Data().NumRows() == 1);
-  SubVector<BaseFloat> waveform(wave.Data(), 0);
-
-  // read the HTK features
-  Matrix<BaseFloat> htk_features;
-  {
-    std::ifstream is("test_data/test.wav.fbank_htk.3",
-                     std::ios::in | std::ios_base::binary);
-    bool ans = ReadHtk(is, &htk_features, 0);
-    KALDI_ASSERT(ans);
-  }
-
-  // use fbank with default configuration...
-  FbankOptions op;
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.preemph_coeff = 0.0;
-  op.frame_opts.window_type = "hamming";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
-  op.mel_opts.low_freq = 25.0;
-  op.htk_compat = true;
-  op.mel_opts.htk_mode = true;
-  op.use_energy = false;  // C0 not energy.
-
-  op.mel_opts.vtln_low = 100.0;
-  op.mel_opts.vtln_high = 7500.0;
-  BaseFloat vtln_warp = 0.9;
-
-  Fbank fbank(op);
-
-  // calculate kaldi features
-  Matrix<BaseFloat> kaldi_features;
-  fbank.Compute(waveform, vtln_warp, &kaldi_features);
-
-
-  std::cout << "<<<=== Compare with HTK features...\n";
-  // compare the results
-  bool passed = true;
-  int32 i_old = -1;
-  KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows());
-  KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols());
-  // Ignore ends-- we make slightly different choices than
-  // HTK about how to treat the deltas at the ends.
-  for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) {
-    for (int32 j = 0; j < kaldi_features.NumCols(); j++) {
-      BaseFloat a = kaldi_features(i, j), b = htk_features(i, j);
-      /// THE FEATURES ARE ALMOST IDENTICAL WITH HTK!!! (SEE THE TOLERANCE!)
-      if ((std::abs(b - a)) > 0.001) {  //<< TOLERANCE TO DIFFERENCES!!!!!
-        // print the non-matching data only once per-line
-        if (i_old != i) {
-          std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n";
-          std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n";
-          i_old = i;
-        }
-        // print indices of non-matching cells
-        std::cout << "[" << i << ", " << j << "]";
-        if (j < 20) passed = false; // We know the last couple of filterbanks differ.  We let this slide.
-        else KALDI_WARN << "Ignoring difference in last fbanks, we know the algorithms differ.";
-  }}}
-  if (!passed) KALDI_ERR << "Test failed";
-
-  // write the htk features for later inspection
-  HtkHeader header = {
-    kaldi_features.NumRows(),
-    100000,  // 10ms
-    static_cast<int16>(sizeof(float)*kaldi_features.NumCols()),
-    000007  // FBANK
-  };
-  {
-    std::ofstream os("tmp.test.wav.fbank_kaldi.1",
-                     std::ios::out|std::ios::binary);
-    WriteHtk(os, kaldi_features, header);
-  }
-
-  std::cout << "Test passed :)\n\n";
-  
-  unlink("tmp.test.wav.fbank_kaldi.1");
-}
-
-
-static void UnitTestHTKCompare4() {
-  std::cout << "=== UnitTestHTKCompare4() ===\n";
-
-  std::ifstream is("test_data/test.wav", std::ios_base::binary);
-  WaveData wave;
-  wave.Read(is);
-  KALDI_ASSERT(wave.Data().NumRows() == 1);
-  SubVector<BaseFloat> waveform(wave.Data(), 0);
-
-  // read the HTK features
-  Matrix<BaseFloat> htk_features;
-  {
-    std::ifstream is("test_data/test.wav.fbank_htk.4",
-                     std::ios::in | std::ios_base::binary);
-    bool ans = ReadHtk(is, &htk_features, 0);
-    KALDI_ASSERT(ans);
-  }
-
-  // use fbank with default configuration...
-  FbankOptions op;
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.preemph_coeff = 0.0;
-  op.frame_opts.window_type = "hamming";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
-  op.mel_opts.low_freq = 25.0;
-  op.htk_compat = true;
-  op.mel_opts.htk_mode = true;
-  op.use_energy = false;  // C0 not energy.
-
-  op.mel_opts.vtln_low = 100.0;
-  op.mel_opts.vtln_high = 7500.0;
-  BaseFloat vtln_warp = 1.1;
-
-  Fbank fbank(op);
-
-  // calculate kaldi features
-  Matrix<BaseFloat> kaldi_features;
-  fbank.Compute(waveform, vtln_warp, &kaldi_features);
-
-
-  std::cout << "<<<=== Compare with HTK features...\n";
-  // compare the results
-  bool passed = true;
-  int32 i_old = -1;
-  KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows());
-  KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols());
-  // Ignore ends-- we make slightly different choices than
-  // HTK about how to treat the deltas at the ends.
-  for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) {
-    for (int32 j = 0; j < kaldi_features.NumCols(); j++) {
-      BaseFloat a = kaldi_features(i, j), b = htk_features(i, j);
-      /// THE FEATURES ARE ALMOST IDENTICAL WITH HTK!!! (SEE THE TOLERANCE!)
-      if ((std::abs(b - a)) > 0.01) {  //<< TOLERANCE TO DIFFERENCES!!!!!
-        // print the non-matching data only once per-line
-        if (i_old != i) {
-          std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n";
-          std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n";
-          i_old = i;
-        }
-        // print indices of non-matching cells
-        std::cout << "[" << i << ", " << j << "]";
-        passed = false;
-  }}}
-  if (!passed) KALDI_ERR << "Test failed";
-
-  // write the htk features for later inspection
-  HtkHeader header = {
-    kaldi_features.NumRows(),
-    100000,  // 10ms
-    static_cast<int16>(sizeof(float)*kaldi_features.NumCols()),
-    000007  // FBANK
-  };
-  {
-    std::ofstream os("tmp.test.wav.fbank_kaldi.1",
-                     std::ios::out|std::ios::binary);
-    WriteHtk(os, kaldi_features, header);
-  }
-
-  std::cout << "Test passed :)\n\n";
-  
-  unlink("tmp.test.wav.fbank_kaldi.1");
-}
-
-
-
 
 static void UnitTestFeat() {
-  UnitTestReadWave();
   UnitTestSimple();
-  UnitTestHTKCompare1();
-  UnitTestHTKCompare2();
-  UnitTestHTKCompare3();
-  UnitTestHTKCompare4();
 }
 
 
diff --git a/src/feat/feature-mfcc-test.cc b/src/feat/feature-mfcc-test.cc
index 305ac5abe50..43a9b14dea6 100644
--- a/src/feat/feature-mfcc-test.cc
+++ b/src/feat/feature-mfcc-test.cc
@@ -53,6 +53,7 @@ static void UnitTestReadWave() {
   );
   KALDI_ASSERT(input.good());
   v2.Read(input, false);
+  v2.Scale(BaseFloat(1.0 / 32768.0));
   input.close();
 
   std::cout << "<<<=== Comparing freshly read waveform to 'libsndfile' waveform\n";
@@ -105,492 +106,6 @@ static void UnitTestSimple() {
 }
 
 
-static void UnitTestHTKCompare1() {
-  std::cout << "=== UnitTestHTKCompare1() ===\n";
-
-  std::ifstream is("test_data/test.wav", std::ios_base::binary);
-  WaveData wave;
-  wave.Read(is);
-  KALDI_ASSERT(wave.Data().NumRows() == 1);
-  SubVector<BaseFloat> waveform(wave.Data(), 0);
-
-  // read the HTK features
-  Matrix<BaseFloat> htk_features;
-  {
-    std::ifstream is("test_data/test.wav.fea_htk.1",
-                     std::ios::in | std::ios_base::binary);
-    bool ans = ReadHtk(is, &htk_features, 0);
-    KALDI_ASSERT(ans);
-  }
-
-  // use mfcc with default configuration...
-  MfccOptions op;
-  op.frame_opts.window_type = "hamming";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
-  op.mel_opts.low_freq = 0.0;
-  op.use_energy = false;  // C0 not energy.
-
-  Mfcc mfcc(op);
-
-  // calculate kaldi features
-  Matrix<BaseFloat> kaldi_raw_features;
-  mfcc.Compute(waveform, 1.0, &kaldi_raw_features);
-
-  DeltaFeaturesOptions delta_opts;
-  Matrix<BaseFloat> kaldi_features;
-  ComputeDeltas(delta_opts,
-                kaldi_raw_features,
-                &kaldi_features);
-
-  // compare the results
-  bool passed = true;
-  int32 i_old = -1;
-  KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows());
-  KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols());
-  // Ignore ends-- we make slightly different choices than
-  // HTK about how to treat the deltas at the ends.
-  for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) {
-    for (int32 j = 0; j < kaldi_features.NumCols(); j++) {
-      BaseFloat a = kaldi_features(i, j), b = htk_features(i, j);
-      if ((std::abs(b - a)) > 1.0) {  //<< TOLERANCE TO DIFFERENCES!!!!!
-        // print the non-matching data only once per-line
-        if (i_old != i) {
-          std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n";
-          std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n";
-          i_old = i;
-        }
-        // print indices of non-matching cells
-        std::cout << "[" << i << ", " << j << "]";
-        passed = false;
-  }}}
-  if (!passed) KALDI_ERR << "Test failed";
-
-  // write the htk features for later inspection
-  HtkHeader header = {
-    kaldi_features.NumRows(),
-    100000,  // 10ms
-    static_cast<int16>(sizeof(float)*kaldi_features.NumCols()),
-    021406  // MFCC_D_A_0
-  };
-  {
-    std::ofstream os("tmp.test.wav.fea_kaldi.1",
-                     std::ios::out|std::ios::binary);
-    WriteHtk(os, kaldi_features, header);
-  }
-
-  std::cout << "Test passed :)\n\n";
-
-  unlink("tmp.test.wav.fea_kaldi.1");
-}
-
-
-static void UnitTestHTKCompare2() {
-  std::cout << "=== UnitTestHTKCompare2() ===\n";
-
-  std::ifstream is("test_data/test.wav", std::ios_base::binary);
-  WaveData wave;
-  wave.Read(is);
-  KALDI_ASSERT(wave.Data().NumRows() == 1);
-  SubVector<BaseFloat> waveform(wave.Data(), 0);
-
-  // read the HTK features
-  Matrix<BaseFloat> htk_features;
-  {
-    std::ifstream is("test_data/test.wav.fea_htk.2",
-                     std::ios::in | std::ios_base::binary);
-    bool ans = ReadHtk(is, &htk_features, 0);
-    KALDI_ASSERT(ans);
-  }
-
-  // use mfcc with default configuration...
-  MfccOptions op;
-  op.frame_opts.window_type = "hamming";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
-  op.mel_opts.low_freq = 0.0;
-  op.use_energy = true;  // Use energy.
-
-  Mfcc mfcc(op);
-
-  // calculate kaldi features
-  Matrix<BaseFloat> kaldi_raw_features;
-  mfcc.Compute(waveform, 1.0, &kaldi_raw_features);
-
-  DeltaFeaturesOptions delta_opts;
-  Matrix<BaseFloat> kaldi_features;
-  ComputeDeltas(delta_opts,
-                kaldi_raw_features,
-                &kaldi_features);
-
-  // compare the results
-  bool passed = true;
-  int32 i_old = -1;
-  KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows());
-  KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols());
-  // Ignore ends-- we make slightly different choices than
-  // HTK about how to treat the deltas at the ends.
-  for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) {
-    for (int32 j = 0; j < kaldi_features.NumCols(); j++) {
-      BaseFloat a = kaldi_features(i, j), b = htk_features(i, j);
-      if ((std::abs(b - a)) > 1.0) {  //<< TOLERANCE TO DIFFERENCES!!!!!
-        // print the non-matching data only once per-line
-        if (i_old != i) {
-          std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n";
-          std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n";
-          i_old = i;
-        }
-        // print indices of non-matching cells
-        std::cout << "[" << i << ", " << j << "]";
-        passed = false;
-  }}}
-  if (!passed) KALDI_ERR << "Test failed";
-
-  // write the htk features for later inspection
-  HtkHeader header = {
-    kaldi_features.NumRows(),
-    100000,  // 10ms
-    static_cast<int16>(sizeof(float)*kaldi_features.NumCols()),
-    021406  // MFCC_D_A_0
-  };
-  {
-    std::ofstream os("tmp.test.wav.fea_kaldi.2",
-                     std::ios::out|std::ios::binary);
-    WriteHtk(os, kaldi_features, header);
-  }
-
-  std::cout << "Test passed :)\n\n";
-
-  unlink("tmp.test.wav.fea_kaldi.2");
-}
-
-
-static void UnitTestHTKCompare3() {
-  std::cout << "=== UnitTestHTKCompare3() ===\n";
-
-  std::ifstream is("test_data/test.wav", std::ios_base::binary);
-  WaveData wave;
-  wave.Read(is);
-  KALDI_ASSERT(wave.Data().NumRows() == 1);
-  SubVector<BaseFloat> waveform(wave.Data(), 0);
-
-  // read the HTK features
-  Matrix<BaseFloat> htk_features;
-  {
-    std::ifstream is("test_data/test.wav.fea_htk.3",
-                     std::ios::in | std::ios_base::binary);
-    bool ans = ReadHtk(is, &htk_features, 0);
-    KALDI_ASSERT(ans);
-  }
-
-  // use mfcc with default configuration...
-  MfccOptions op;
-  op.frame_opts.window_type = "hamming";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
-  op.use_energy = true;  // Use energy.
-  op.mel_opts.low_freq = 20.0;
-
-  Mfcc mfcc(op);
-
-  // calculate kaldi features
-  Matrix<BaseFloat> kaldi_raw_features;
-  mfcc.Compute(waveform, 1.0, &kaldi_raw_features);
-
-  DeltaFeaturesOptions delta_opts;
-  Matrix<BaseFloat> kaldi_features;
-  ComputeDeltas(delta_opts,
-                kaldi_raw_features,
-                &kaldi_features);
-
-  // compare the results
-  bool passed = true;
-  int32 i_old = -1;
-  KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows());
-  KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols());
-  // Ignore ends-- we make slightly different choices than
-  // HTK about how to treat the deltas at the ends.
-  for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) {
-    for (int32 j = 0; j < kaldi_features.NumCols(); j++) {
-      BaseFloat a = kaldi_features(i, j), b = htk_features(i, j);
-      if ((std::abs(b - a)) > 1.0) {  //<< TOLERANCE TO DIFFERENCES!!!!!
-        // print the non-matching data only once per-line
-        if (static_cast<int32>(i_old) != i) {
-          std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n";
-          std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n";
-          i_old = i;
-        }
-        // print indices of non-matching cells
-        std::cout << "[" << i << ", " << j << "]";
-        passed = false;
-  }}}
-  if (!passed) KALDI_ERR << "Test failed";
-
-  // write the htk features for later inspection
-  HtkHeader header = {
-    kaldi_features.NumRows(),
-    100000,  // 10ms
-    static_cast<int16>(sizeof(float)*kaldi_features.NumCols()),
-    021406  // MFCC_D_A_0
-  };
-  {
-    std::ofstream os("tmp.test.wav.fea_kaldi.3",
-                     std::ios::out|std::ios::binary);
-    WriteHtk(os, kaldi_features, header);
-  }
-
-  std::cout << "Test passed :)\n\n";
-
-  unlink("tmp.test.wav.fea_kaldi.3");
-}
-
-
-static void UnitTestHTKCompare4() {
-  std::cout << "=== UnitTestHTKCompare4() ===\n";
-
-  std::ifstream is("test_data/test.wav", std::ios_base::binary);
-  WaveData wave;
-  wave.Read(is);
-  KALDI_ASSERT(wave.Data().NumRows() == 1);
-  SubVector<BaseFloat> waveform(wave.Data(), 0);
-
-  // read the HTK features
-  Matrix<BaseFloat> htk_features;
-  {
-    std::ifstream is("test_data/test.wav.fea_htk.4",
-                     std::ios::in | std::ios_base::binary);
-    bool ans = ReadHtk(is, &htk_features, 0);
-    KALDI_ASSERT(ans);
-  }
-
-  // use mfcc with default configuration...
-  MfccOptions op;
-  op.frame_opts.window_type = "hamming";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
-  op.mel_opts.low_freq = 0.0;
-  op.use_energy = true;  // Use energy.
-
-  Mfcc mfcc(op);
-
-  // calculate kaldi features
-  Matrix<BaseFloat> kaldi_raw_features;
-  mfcc.Compute(waveform, 1.0, &kaldi_raw_features);
-
-  DeltaFeaturesOptions delta_opts;
-  Matrix<BaseFloat> kaldi_features;
-  ComputeDeltas(delta_opts,
-                kaldi_raw_features,
-                &kaldi_features);
-
-  // compare the results
-  bool passed = true;
-  int32 i_old = -1;
-  KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows());
-  KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols());
-  // Ignore ends-- we make slightly different choices than
-  // HTK about how to treat the deltas at the ends.
-  for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) {
-    for (int32 j = 0; j < kaldi_features.NumCols(); j++) {
-      BaseFloat a = kaldi_features(i, j), b = htk_features(i, j);
-      if ((std::abs(b - a)) > 1.0) {  //<< TOLERANCE TO DIFFERENCES!!!!!
-        // print the non-matching data only once per-line
-        if (static_cast<int32>(i_old) != i) {
-          std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n";
-          std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n";
-          i_old = i;
-        }
-        // print indices of non-matching cells
-        std::cout << "[" << i << ", " << j << "]";
-        passed = false;
-  }}}
-  if (!passed) KALDI_ERR << "Test failed";
-
-  // write the htk features for later inspection
-  HtkHeader header = {
-    kaldi_features.NumRows(),
-    100000,  // 10ms
-    static_cast<int16>(sizeof(float)*kaldi_features.NumCols()),
-    021406  // MFCC_D_A_0
-  };
-  {
-    std::ofstream os("tmp.test.wav.fea_kaldi.4",
-                     std::ios::out|std::ios::binary);
-    WriteHtk(os, kaldi_features, header);
-  }
-
-  std::cout << "Test passed :)\n\n";
-
-  unlink("tmp.test.wav.fea_kaldi.4");
-}
-
-
-static void UnitTestHTKCompare5() {
-  std::cout << "=== UnitTestHTKCompare5() ===\n";
-
-  std::ifstream is("test_data/test.wav", std::ios_base::binary);
-  WaveData wave;
-  wave.Read(is);
-  KALDI_ASSERT(wave.Data().NumRows() == 1);
-  SubVector<BaseFloat> waveform(wave.Data(), 0);
-
-  // read the HTK features
-  Matrix<BaseFloat> htk_features;
-  {
-    std::ifstream is("test_data/test.wav.fea_htk.5",
-                     std::ios::in | std::ios_base::binary);
-    bool ans = ReadHtk(is, &htk_features, 0);
-    KALDI_ASSERT(ans);
-  }
-
-  // use mfcc with default configuration...
-  MfccOptions op;
-  op.frame_opts.window_type = "hamming";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
-  op.use_energy = true;  // Use energy.
-  op.mel_opts.low_freq = 0.0;
-  op.mel_opts.vtln_low = 100.0;
-  op.mel_opts.vtln_high = 7500.0;
-
-  BaseFloat vtln_warp = 1.1; // our approach identical to htk for warp factor >1,
-  // differs slightly for higher mel bins if warp_factor <0.9
-
-  Mfcc mfcc(op);
-
-  // calculate kaldi features
-  Matrix<BaseFloat> kaldi_raw_features;
-  mfcc.Compute(waveform, vtln_warp, &kaldi_raw_features);
-
-  DeltaFeaturesOptions delta_opts;
-  Matrix<BaseFloat> kaldi_features;
-  ComputeDeltas(delta_opts,
-                kaldi_raw_features,
-                &kaldi_features);
-
-  // compare the results
-  bool passed = true;
-  int32 i_old = -1;
-  KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows());
-  KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols());
-  // Ignore ends-- we make slightly different choices than
-  // HTK about how to treat the deltas at the ends.
-  for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) {
-    for (int32 j = 0; j < kaldi_features.NumCols(); j++) {
-      BaseFloat a = kaldi_features(i, j), b = htk_features(i, j);
-      if ((std::abs(b - a)) > 1.0) {  //<< TOLERANCE TO DIFFERENCES!!!!!
-        // print the non-matching data only once per-line
-        if (static_cast<int32>(i_old) != i) {
-          std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n";
-          std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n";
-          i_old = i;
-        }
-        // print indices of non-matching cells
-        std::cout << "[" << i << ", " << j << "]";
-        passed = false;
-  }}}
-  if (!passed) KALDI_ERR << "Test failed";
-
-  // write the htk features for later inspection
-  HtkHeader header = {
-    kaldi_features.NumRows(),
-    100000,  // 10ms
-    static_cast<int16>(sizeof(float)*kaldi_features.NumCols()),
-    021406  // MFCC_D_A_0
-  };
-  {
-    std::ofstream os("tmp.test.wav.fea_kaldi.5",
-                     std::ios::out|std::ios::binary);
-    WriteHtk(os, kaldi_features, header);
-  }
-
-  std::cout << "Test passed :)\n\n";
-
-  unlink("tmp.test.wav.fea_kaldi.5");
-}
-
-static void UnitTestHTKCompare6() {
-  std::cout << "=== UnitTestHTKCompare6() ===\n";
-
-
-  std::ifstream is("test_data/test.wav", std::ios_base::binary);
-  WaveData wave;
-  wave.Read(is);
-  KALDI_ASSERT(wave.Data().NumRows() == 1);
-  SubVector<BaseFloat> waveform(wave.Data(), 0);
-
-  // read the HTK features
-  Matrix<BaseFloat> htk_features;
-  {
-    std::ifstream is("test_data/test.wav.fea_htk.6",
-                     std::ios::in | std::ios_base::binary);
-    bool ans = ReadHtk(is, &htk_features, 0);
-    KALDI_ASSERT(ans);
-  }
-
-  // use mfcc with default configuration...
-  MfccOptions op;
-  op.frame_opts.window_type = "hamming";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
-  op.mel_opts.num_bins = 24;
-  op.mel_opts.low_freq = 125.0;
-  op.mel_opts.high_freq = 7800.0;
-  op.use_energy = false;  // C0 not energy.
-
-  Mfcc mfcc(op);
-
-  // calculate kaldi features
-  Matrix<BaseFloat> kaldi_raw_features;
-  mfcc.Compute(waveform, 1.0, &kaldi_raw_features);
-
-  DeltaFeaturesOptions delta_opts;
-  Matrix<BaseFloat> kaldi_features;
-  ComputeDeltas(delta_opts,
-                kaldi_raw_features,
-                &kaldi_features);
-
-  // compare the results
-  bool passed = true;
-  int32 i_old = -1;
-  KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows());
-  KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols());
-  // Ignore ends-- we make slightly different choices than
-  // HTK about how to treat the deltas at the ends.
-  for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) {
-    for (int32 j = 0; j < kaldi_features.NumCols(); j++) {
-      BaseFloat a = kaldi_features(i, j), b = htk_features(i, j);
-      if ((std::abs(b - a)) > 1.0) {  //<< TOLERANCE TO DIFFERENCES!!!!!
-        // print the non-matching data only once per-line
-        if (static_cast<int32>(i_old) != i) {
-          std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n";
-          std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n";
-          i_old = i;
-        }
-        // print indices of non-matching cells
-        std::cout << "[" << i << ", " << j << "]";
-        passed = false;
-  }}}
-  if (!passed) KALDI_ERR << "Test failed";
-
-  // write the htk features for later inspection
-  HtkHeader header = {
-    kaldi_features.NumRows(),
-    100000,  // 10ms
-    static_cast<int16>(sizeof(float)*kaldi_features.NumCols()),
-    021406  // MFCC_D_A_0
-  };
-  {
-    std::ofstream os("tmp.test.wav.fea_kaldi.6",
-                     std::ios::out|std::ios::binary);
-    WriteHtk(os, kaldi_features, header);
-  }
-
-  std::cout << "Test passed :)\n\n";
-
-  unlink("tmp.test.wav.fea_kaldi.6");
-}
-
 void UnitTestVtln() {
   // Test the function VtlnWarpFreq.
   BaseFloat low_freq = 10, high_freq = 7800,
@@ -631,15 +146,6 @@ static void UnitTestFeat() {
   UnitTestVtln();
   UnitTestReadWave();
   UnitTestSimple();
-  UnitTestHTKCompare1();
-  UnitTestHTKCompare2();
-  // commenting out this one as it doesn't compare right now I normalized
-  // the way the FFT bins are treated (removed offset of 0.5)... this seems
-  // to relate to the way frequency zero behaves.
-  UnitTestHTKCompare3();
-  UnitTestHTKCompare4();
-  UnitTestHTKCompare5();
-  UnitTestHTKCompare6();
   std::cout << "Tests succeeded.\n";
 }
 
diff --git a/src/feat/feature-sdc-test.cc b/src/feat/feature-sdc-test.cc
index 4b99c65fef8..722c1dda41d 100644
--- a/src/feat/feature-sdc-test.cc
+++ b/src/feat/feature-sdc-test.cc
@@ -139,8 +139,6 @@ int main() {
 
   // mfcc with default configuration...
   MfccOptions op;
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.preemph_coeff = 0.0;
   op.frame_opts.window_type = "hamming";
   op.frame_opts.remove_dc_offset = false;
   op.frame_opts.round_to_power_of_two = true;
diff --git a/src/feat/feature-window.cc b/src/feat/feature-window.cc
index b68b8854128..1d1ab381826 100644
--- a/src/feat/feature-window.cc
+++ b/src/feat/feature-window.cc
@@ -96,7 +96,7 @@ void InitFeatureWindowFunction(const FrameExtractionOptions &opts,
 }
 
 void ProcessWindow(const FrameExtractionOptions &opts,
-                   const VectorBase<BaseFloat> *window_function,
+                   const VectorBase<BaseFloat> &window_function,
                    VectorBase<BaseFloat> *window) {
   int32 frame_length = opts.WindowSize();
   KALDI_ASSERT(window->Dim() == frame_length);
@@ -104,7 +104,7 @@ void ProcessWindow(const FrameExtractionOptions &opts,
   if (opts.remove_dc_offset)
     window->Add(-window->Sum() / frame_length);
 
-  window->MulElements(*window_function);
+  window->MulElements(window_function);
 }
 
 
@@ -114,7 +114,7 @@ void ExtractWindow(int64 sample_offset,
                    const VectorBase<BaseFloat> &wave,
                    int32 f,  // with 0 <= f < NumFrames(feats, opts)
                    const FrameExtractionOptions &opts,
-                   const Vector<BaseFloat> &window_function,
+                   const VectorBase<BaseFloat> &window_function,
                    Vector<BaseFloat> *window) {
   KALDI_ASSERT(sample_offset >= 0 && wave.Dim() != 0);
   int32 frame_length = opts.WindowSize(),
diff --git a/src/feat/online-feature-test.cc b/src/feat/online-feature-test.cc
index c5a2ae44ec7..fbdb9c4f11f 100644
--- a/src/feat/online-feature-test.cc
+++ b/src/feat/online-feature-test.cc
@@ -152,17 +152,12 @@ void TestOnlineMfcc() {
 
   // the parametrization object
   MfccOptions op;
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.preemph_coeff = 0.0;
   op.frame_opts.window_type = "hamming";
   op.frame_opts.remove_dc_offset = false;
   op.frame_opts.round_to_power_of_two = true;
   op.frame_opts.samp_freq = wave.SampFreq();
   op.mel_opts.low_freq = 0.0;
-  op.htk_compat = false;
   op.use_energy = false;  // C0 not energy.
-  if (RandInt(0, 1) == 0)
-    op.frame_opts.snip_edges = false;
   Mfcc mfcc(op);
 
   // compute mfcc offline
@@ -204,14 +199,11 @@ void TestOnlineTransform() {
 
   // build online feature interface, take OnlineMfcc as an example
   MfccOptions op;
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.preemph_coeff = 0.0;
   op.frame_opts.window_type = "hamming";
   op.frame_opts.remove_dc_offset = false;
   op.frame_opts.round_to_power_of_two = true;
   op.frame_opts.samp_freq = wave.SampFreq();
   op.mel_opts.low_freq = 0.0;
-  op.htk_compat = false;
   op.use_energy = false;  // C0 not energy.
   OnlineMfcc online_mfcc(op);
 
@@ -247,14 +239,11 @@ void TestOnlineAppendFeature() {
 
   // the parametrization object for 1st stream mfcc feature
   MfccOptions mfcc_op;
-  mfcc_op.frame_opts.dither = 0.0;
-  mfcc_op.frame_opts.preemph_coeff = 0.0;
   mfcc_op.frame_opts.window_type = "hamming";
   mfcc_op.frame_opts.remove_dc_offset = false;
   mfcc_op.frame_opts.round_to_power_of_two = true;
   mfcc_op.frame_opts.samp_freq = wave.SampFreq();
   mfcc_op.mel_opts.low_freq = 0.0;
-  mfcc_op.htk_compat = false;
   mfcc_op.use_energy = false;  // C0 not energy.
   Mfcc mfcc(mfcc_op);
 
@@ -262,23 +251,6 @@ void TestOnlineAppendFeature() {
   Matrix<BaseFloat> mfcc_feats;
   mfcc.Compute(waveform, 1.0, &mfcc_feats);  // vtln not supported
 
-  // the parametrization object for 2nd stream plp feature
-  PlpOptions plp_op;
-  plp_op.frame_opts.dither = 0.0;
-  plp_op.frame_opts.preemph_coeff = 0.0;
-  plp_op.frame_opts.window_type = "hamming";
-  plp_op.frame_opts.remove_dc_offset = false;
-  plp_op.frame_opts.round_to_power_of_two = true;
-  plp_op.frame_opts.samp_freq = wave.SampFreq();
-  plp_op.mel_opts.low_freq = 0.0;
-  plp_op.htk_compat = false;
-  plp_op.use_energy = false;  // C0 not energy.
-  Plp plp(plp_op);
-
-  // compute plp offline
-  Matrix<BaseFloat> plp_feats;
-  plp.Compute(waveform, 1.0, &plp_feats);  // vtln not supported
-
   // compare
   // The test waveform is about 1.44s long, so
   // we try to break it into from 5 pieces to 9(not essential to do so)
@@ -305,22 +277,17 @@ void TestOnlineAppendFeature() {
     GetOutput(&online_mfcc_doubled, &online_mfcc_doubled_feats);
 
-    // compare mfcc_feats & plp_features with online_mfcc_doubled_feats
+    // compare mfcc_feats with both halves of online_mfcc_doubled_feats
-    KALDI_ASSERT(mfcc_feats.NumRows() == online_mfcc_doubled_feats.NumRows()
-      && plp_feats.NumRows() == online_mfcc_doubled_feats.NumRows()
-      && mfcc_feats.NumCols() + plp_feats.NumCols()
-         == online_mfcc_doubled_feats.NumCols());
+    KALDI_ASSERT(mfcc_feats.NumRows() == online_mfcc_doubled_feats.NumRows() &&
+                 online_mfcc_doubled_feats.NumCols() == 2 * mfcc_feats.NumCols());
     for (MatrixIndexT i = 0; i < online_mfcc_doubled_feats.NumRows(); i++) {
       for (MatrixIndexT j = 0; j < mfcc_feats.NumCols(); j++) {
-        KALDI_ASSERT(std::abs(mfcc_feats(i, j) - online_mfcc_doubled_feats(i, j))
-          < 0.0001*std::max(1.0, static_cast<double>(std::abs(mfcc_feats(i, j))
-                                    + std::abs(online_mfcc_doubled_feats(i, j)))));
-      }
-      for (MatrixIndexT k = 0; k < plp_feats.NumCols(); k++) {
-        KALDI_ASSERT(
-          std::abs(plp_feats(i, k) -
-            online_mfcc_doubled_feats(i, mfcc_feats.NumCols() + k))
-          < 0.0001*std::max(1.0, static_cast<double>(std::abs(plp_feats(i, k))
-            +std::abs(online_mfcc_doubled_feats(i, mfcc_feats.NumCols() + k)))));
+        MatrixIndexT jj = j;
+        for (int count = 0; count < 2; count++) {
+          KALDI_ASSERT(std::abs(mfcc_feats(i, j) - online_mfcc_doubled_feats(i, jj))
+                       < 0.0001*std::max(1.0, static_cast<double>(std::abs(mfcc_feats(i, j))
+                                                                  + std::abs(online_mfcc_doubled_feats(i, jj)))));
+          jj += mfcc_feats.NumCols();
+        }
       }
     }
   }
@@ -374,7 +341,6 @@ int main() {
     TestOnlineDeltaFeature();
     TestOnlineSpliceFrames();
     TestOnlineMfcc();
-    TestOnlinePlp();
     TestOnlineTransform();
     TestOnlineAppendFeature();
     TestRecyclingVector();
diff --git a/src/feat/online-feature.h b/src/feat/online-feature.h
index 2978d02f090..d1e3a74342e 100644
--- a/src/feat/online-feature.h
+++ b/src/feat/online-feature.h
@@ -123,8 +123,14 @@ class OnlineGenericBaseFeature: public OnlineBaseFeature {
   // waveform_remainder_ while incrementing waveform_offset_ by the same amount.
   void ComputeFeatures();
 
+  void MaybeCreateResampler(BaseFloat sampling_rate);
+
   C computer_;  // class that does the MFCC or filterbank computation
 
+  // resampler in cases when the input sampling frequency is not equal to
+  // the expected sampling rate
+  std::unique_ptr<LinearResample> resampler_;
+
   Vector<BaseFloat> window_function_;
 
   // features_ is the Mfcc or Fbank features that we have already computed.
diff --git a/src/feat/test_data/README b/src/feat/test_data/README
index 8deadd273a4..e44395c6bad 100644
--- a/src/feat/test_data/README
+++ b/src/feat/test_data/README
@@ -7,24 +7,4 @@
 #1) convert 16kHz,lin16 wav to KALDI ASCII vector format
 cat prepare_wav_in_ascii.m | matlab
 
-#2) perform reference feature extraction by HTK
-# we used HCopy from HTK V3.4
-HCopy -C hcopy1.conf test.wav test.wav.fea_htk.1
-
-HCopy -C hcopy2.conf test.wav test.wav.fea_htk.2
-
-HCopy -C hcopy3.conf test.wav test.wav.fea_htk.3
-
-HCopy -C hcopy4.conf test.wav test.wav.fea_htk.4
-
-HCopy -C hcopy5.conf test.wav test.wav.fea_htk.5
-
-HCopy -C hcopy6.conf test.wav test.wav.fea_htk.6
-
-HCopy -C plp1.conf test.wav test.wav.plp_htk.1
-
-HCopy -C fbank1.conf test.wav test.wav.fbank_htk.1
-
-HCopy -C fbank2.conf test.wav test.wav.fbank_htk.2
-
-HCopy -C fbank3.conf test.wav test.wav.fbank_htk.3
+#2) reference feature extraction by HTK is no longer performed - HTK support is deprecated.
diff --git a/src/feat/test_data/fbank1.conf b/src/feat/test_data/fbank1.conf
deleted file mode 100644
index b751b61d6d1..00000000000
--- a/src/feat/test_data/fbank1.conf
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/sh
-
-SOURCEKIND   = WAVEFORM
-SOURCEFORMAT = WAV
-SOURCERATE   = 625
-BYTEORDER    = VAX
-TARGETFORMAT = HTK
-TARGETKIND   = FBANK
-
-LOFREQ       = 0
-HIFREQ       = 8000
-#WARPLCUTOFF  = 100
-#WARPUCUTOFF  = 7500
-#WARPFREQ     = 1.0
-NUMCHANS     = 23       # number of critical bands
-USEPOWER     = T        # using power spectrum
-USEHAMMING   = T        # use hamming window on speech frame
-RAWENERGY    = T
-ENORMALISE   = F
-
-
-PREEMCOEF    = 0.0
-TARGETRATE   = 100000   # 10 ms frame rate
-WINDOWSIZE   = 250000   # 25 ms window
-SAVEWITHCRC  = F
-
diff --git a/src/feat/test_data/fbank2.conf b/src/feat/test_data/fbank2.conf
deleted file mode 100644
index 604819a88c4..00000000000
--- a/src/feat/test_data/fbank2.conf
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/sh
-
-SOURCEKIND   = WAVEFORM
-SOURCEFORMAT = WAV
-SOURCERATE   = 625
-BYTEORDER    = VAX
-TARGETFORMAT = HTK
-TARGETKIND   = FBANK
-
-LOFREQ       = 25
-HIFREQ       = 8000
-#WARPLCUTOFF  = 100
-#WARPUCUTOFF  = 7500
-#WARPFREQ     = 1.0
-NUMCHANS     = 23       # number of critical bands
-USEPOWER     = T        # using power spectrum
-USEHAMMING   = T        # use hamming window on speech frame
-RAWENERGY    = T
-ENORMALISE   = F
-
-
-PREEMCOEF    = 0.0
-TARGETRATE   = 100000   # 10 ms frame rate
-WINDOWSIZE   = 250000   # 25 ms window
-SAVEWITHCRC  = F
-
diff --git a/src/feat/test_data/fbank3.conf b/src/feat/test_data/fbank3.conf
deleted file mode 100644
index f86fec4b248..00000000000
--- a/src/feat/test_data/fbank3.conf
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/sh
-
-SOURCEKIND   = WAVEFORM
-SOURCEFORMAT = WAV
-SOURCERATE   = 625
-BYTEORDER    = VAX
-TARGETFORMAT = HTK
-TARGETKIND   = FBANK
-
-LOFREQ       = 25
-HIFREQ       = 8000
-WARPLCUTOFF  = 100
-WARPUCUTOFF  = 7500
-WARPFREQ     = 0.9
-NUMCHANS     = 23       # number of critical bands
-USEPOWER     = T        # using power spectrum
-USEHAMMING   = T        # use hamming window on speech frame
-RAWENERGY    = T
-ENORMALISE   = F
-
-
-PREEMCOEF    = 0.0
-TARGETRATE   = 100000   # 10 ms frame rate
-WINDOWSIZE   = 250000   # 25 ms window
-SAVEWITHCRC  = F
-
diff --git a/src/feat/test_data/fbank4.conf b/src/feat/test_data/fbank4.conf
deleted file mode 100644
index a19679f8375..00000000000
--- a/src/feat/test_data/fbank4.conf
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/sh
-
-SOURCEKIND   = WAVEFORM
-SOURCEFORMAT = WAV
-SOURCERATE   = 625
-BYTEORDER    = VAX
-TARGETFORMAT = HTK
-TARGETKIND   = FBANK
-
-LOFREQ       = 25
-HIFREQ       = 8000
-WARPLCUTOFF  = 100
-WARPUCUTOFF  = 7500
-WARPFREQ     = 1.1
-NUMCHANS     = 23       # number of critical bands
-USEPOWER     = T        # using power spectrum
-USEHAMMING   = T        # use hamming window on speech frame
-RAWENERGY    = T
-ENORMALISE   = F
-
-
-PREEMCOEF    = 0.0
-TARGETRATE   = 100000   # 10 ms frame rate
-WINDOWSIZE   = 250000   # 25 ms window
-SAVEWITHCRC  = F
-
diff --git a/src/feat/test_data/hcopy1.conf b/src/feat/test_data/hcopy1.conf
deleted file mode 100644
index 25230348076..00000000000
--- a/src/feat/test_data/hcopy1.conf
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/sh
-
-SOURCEKIND   = WAVEFORM
-SOURCEFORMAT = WAV
-SOURCERATE   = 625
-BYTEORDER    = VAX
-TARGETFORMAT = HTK
-TARGETKIND   = MFCC_D_A_0
-
-LOFREQ       = 0
-HIFREQ       = 8000
-#WARPLCUTOFF  = 100
-#WARPUCUTOFF  = 7500
-#WARPFREQ     = 1.0
-NUMCHANS     = 23       # number of critical bands
-USEPOWER     = T        # using power spectrum
-USEHAMMING   = T        # use hamming window on speech frame
-RAWENERGY    = T
-ENORMALISE   = F
-
-
-PREEMCOEF    = 0.0
-TARGETRATE   = 100000   # 10 ms frame rate
-WINDOWSIZE   = 250000   # 25 ms window
-SAVEWITHCRC  = F
-
-CEPLIFTER    = 22
-NUMCEPS      = 12
diff --git a/src/feat/test_data/hcopy2.conf b/src/feat/test_data/hcopy2.conf
deleted file mode 100644
index 36c7d97d459..00000000000
--- a/src/feat/test_data/hcopy2.conf
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/sh
-
-SOURCEKIND   = WAVEFORM
-SOURCEFORMAT = WAV
-SOURCERATE   = 625
-BYTEORDER    = VAX
-TARGETFORMAT = HTK
-TARGETKIND   = MFCC_D_A_E
-
-LOFREQ       = 0
-HIFREQ       = 8000
-NUMCHANS     = 23       # number of critical bands
-USEPOWER     = T        # using power spectrum
-USEHAMMING   = T        # use hamming window on speech frame
-RAWENERGY    = T
-ENORMALISE   = F
-
-PREEMCOEF    = 0        # no preemphase
-TARGETRATE   = 100000   # 10 ms frame rate
-WINDOWSIZE   = 250000   # 25 ms window
-SAVEWITHCRC  = F
-
-CEPLIFTER    = 22
-NUMCEPS      = 12
diff --git a/src/feat/test_data/hcopy3.conf b/src/feat/test_data/hcopy3.conf
deleted file mode 100644
index 6ed093af685..00000000000
--- a/src/feat/test_data/hcopy3.conf
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/sh
-
-SOURCEKIND   = WAVEFORM
-SOURCEFORMAT = WAV
-SOURCERATE   = 625
-BYTEORDER    = VAX
-TARGETFORMAT = HTK
-TARGETKIND   = MFCC_D_A_E
-
-LOFREQ       = 20
-HIFREQ       = 8000
-NUMCHANS     = 23       # number of critical bands
-USEPOWER     = T        # using power spectrum
-USEHAMMING   = T        # use hamming window on speech frame
-RAWENERGY    = T
-ENORMALISE   = F
-
-PREEMCOEF    = 0        # no preemphase
-TARGETRATE   = 100000   # 10 ms frame rate
-WINDOWSIZE   = 250000   # 25 ms window
-SAVEWITHCRC  = F
-
-CEPLIFTER    = 22
-NUMCEPS      = 12
diff --git a/src/feat/test_data/hcopy4.conf b/src/feat/test_data/hcopy4.conf
deleted file mode 100644
index e51a361cccd..00000000000
--- a/src/feat/test_data/hcopy4.conf
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/sh
-
-SOURCEKIND   = WAVEFORM
-SOURCEFORMAT = WAV
-SOURCERATE   = 625
-BYTEORDER    = VAX
-TARGETFORMAT = HTK
-TARGETKIND   = MFCC_D_A_E
-
-LOFREQ       = 0
-HIFREQ       = 8000
-NUMCHANS     = 23       # number of critical bands
-USEPOWER     = T        # using power spectrum
-USEHAMMING   = T        # use hamming window on speech frame
-RAWENERGY    = T
-ENORMALISE   = F
-
-PREEMCOEF    = 0.97
-TARGETRATE   = 100000   # 10 ms frame rate
-WINDOWSIZE   = 250000   # 25 ms window
-SAVEWITHCRC  = F
-
-CEPLIFTER    = 22
-NUMCEPS      = 12
diff --git a/src/feat/test_data/hcopy5.conf b/src/feat/test_data/hcopy5.conf
deleted file mode 100644
index d280548b91f..00000000000
--- a/src/feat/test_data/hcopy5.conf
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/sh
-
-SOURCEKIND   = WAVEFORM
-SOURCEFORMAT = WAV
-SOURCERATE   = 625
-BYTEORDER    = VAX
-TARGETFORMAT = HTK
-TARGETKIND   = MFCC_D_A_E
-
-LOFREQ       = 0
-HIFREQ       = 8000
-WARPLCUTOFF  = 100
-WARPUCUTOFF  = 7500
-WARPFREQ     = 1.1
-NUMCHANS     = 23       # number of critical bands
-USEPOWER     = T        # using power spectrum
-USEHAMMING   = T        # use hamming window on speech frame
-RAWENERGY    = T
-ENORMALISE   = F
-
-PREEMCOEF    = 0.97
-TARGETRATE   = 100000   # 10 ms frame rate
-WINDOWSIZE   = 250000   # 25 ms window
-SAVEWITHCRC  = F
-
-CEPLIFTER    = 22
-NUMCEPS      = 12
diff --git a/src/feat/test_data/hcopy6.conf b/src/feat/test_data/hcopy6.conf
deleted file mode 100644
index 5e305c9d445..00000000000
--- a/src/feat/test_data/hcopy6.conf
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/sh
-
-SOURCEKIND   = WAVEFORM
-SOURCEFORMAT = WAV
-SOURCERATE   = 625
-BYTEORDER    = VAX
-TARGETFORMAT = HTK
-TARGETKIND   = MFCC_D_A_0
-
-LOFREQ       = 125
-HIFREQ       = 7800
-#WARPLCUTOFF  = 100
-#WARPUCUTOFF  = 7500
-#WARPFREQ     = 1.0
-NUMCHANS     = 24       # number of critical bands
-USEPOWER     = T        # using power spectrum
-USEHAMMING   = T        # use hamming window on speech frame
-RAWENERGY    = T
-ENORMALISE   = F
-
-PREEMCOEF    = 0.97
-TARGETRATE   = 100000   # 10 ms frame rate
-WINDOWSIZE   = 250000   # 25 ms window
-SAVEWITHCRC  = F
-
-CEPLIFTER    = 22
-NUMCEPS      = 12
diff --git a/src/feat/test_data/plp1.conf b/src/feat/test_data/plp1.conf
deleted file mode 100644
index 3465bd20d18..00000000000
--- a/src/feat/test_data/plp1.conf
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/sh
-
-SOURCEKIND   = WAVEFORM
-SOURCEFORMAT = WAV
-SOURCERATE   = 625
-BYTEORDER    = VAX
-TARGETFORMAT = HTK
-TARGETKIND   = PLP_D_A_0
-
-LOFREQ       = 0
-HIFREQ       = 8000
-NUMCHANS     = 23       # number of critical bands
-USEPOWER     = T        # using power spectrum
-USEHAMMING   = T        # use hamming window on speech frame
-COMPRESSFACT = 0.33
-
-PREEMCOEF    = 0
-TARGETRATE   = 100000   # 10 ms frame rate
-WINDOWSIZE   = 250000   # 25 ms window
-SAVEWITHCRC  = F
-
-CEPLIFTER    = 22
-NUMCEPS      = 12
diff --git a/src/feat/test_data/test.wav.fbank_htk.1 b/src/feat/test_data/test.wav.fbank_htk.1
deleted file mode 100644
index cd6083c130dc9487e110c18daf1b9f62df54f4e3..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 13076
zcmWlfcU%qn|Ho}E?#0)hrHl~Cu1HUm2+0U(X`qBsl1kcX?{Us)p7uEH(jtlMJ+IBR
z_uk|8`TcPoJvxuaS)cKGzgD|;?V{SX56*07-mWV-kK;+L?oDc+PDua#i<HPyNM0zA
z?d?K_ohtGLose%)B0Zyxc&~26-5W*fsXnAR?IUSxCNVd@lByVr)W(F2IXjWh3PGZ?
zgm7I2xm%9lHX(~0$<6HC)lA^Er-WJD#P>}aJFSkf<BNhl?k}+U(I1;GE7+{{gXyR1
z(0liZY2S`8d4mMC4yufbiDm5eTd4RiK=r?3#%Nt9H@6deSAE8JXgdCXWfJ}?i+xF{
z`1SnE_Fh3aKYmMKuq#d-eX)&pW2595E0%0y;b#{nWItuxYy-x3bwTaoZpMc+F}D9S
zR9p5kcHeOFUS^Owu!1<7E5z@NBu=fFkggxuyD^BJIR^=K+(g3kKlsZt*x{>&lk5vN
z`bn(yRiLr*Ad@stFwtQ*;~u<3y)lz<gMTp2!~nHNh7`6PBLCbG@~YA(K6#QdNiv1L
zR>=PqYh3t;oJV&k*yo3=?`t9p9uxjodxEVT2++TaUeqV1>ds-l)_SIoyoh@LKt|V`
zMXgeWiQ2~~8nTg+Cw7z^pGZZ<Kh$?PLPg_43U+9cJ>w4rhYnKqt%%%7M@ZeKOuVfp
zG4tw(iaUgr*pvC~!Z4p1%Yx}$nd~@#F@}9m`Ju)5zXwz7Eurk4KIJ2BvwxTg&Fyul
zsq2Db?F9-Ss!+B{&i>!NlyHZfSA9tf7>#U(76}Ii;q?6@CO1FgbmJAvk~^Y3ER{*;
zD;brQh(^K+N|rWK@yU#edlP9?-AzmSJofMTkJ7de6kRW-%*T&<n=s1v{HCN`6!IWX
z(v^pkoLhj0hJ<zc-*InM!sOLM7PLB`)7-|?`Hpb24<+Z_sI31=Wmy;p#|)?C*C1*x
zkECq*9twX3Qo8IOb#rSe`}2q5kNe1O%_hgYgfxv|c&^-!UC9@A^?AtJEtbsqJsZt`
z+cRqVau&_HiNg3Xl}8p(RXm*LYtv|XmrV7gX_Q5(Q~2LiN`t0Q_rG0~J#0nMvy!53
z^C@;8N0#pg0=$;sH1--k{mQUd)F0glHKvtTG3on6mfgO9V(e-v+SI6KF^5hK;c(q5
zY6ktGyjKi`FRoKkdY=8yn<zaxk>X?@$~tRPHnI<SIt7F*_rOp42!XN$%rQbI#DuX2
zrlCFyR_{v}d-js@Ks%~VJmpaB7LIP(&HmVbsnoTmX!KVKGy1XLv=?Q!M^N}wtktYW
z%KB!I^LZoDRR+Yo-b!SLdF*-~g7rgnmh4~4bSo>?Yk8o!Y)*M^9jb2arulOtC&n4k
z(l&&;lM^Uw_ZKBqU1)4pq4CTUO6&Wg*!dqNMMmWIZX#uV0g}nd$d*4R+D{$lwWnEI
zIU4;*|6yt6MCtr1ls;WYb#5bv!nbg)hc~CrcBIj7K9&75DfP~#d6y|i5+zh^x=Kmr
zXB1^0QN-IJX`PAW>MpWuUX#4Ck)0!EU@^#?Wzjv^KC6zhQ+|{*ZJ>rcnul-Tbp9Yt
zrORlvyh-JlM<~WU5%;Nr)*BPpKO~2ujJK4u9HnH6CW($WiGS0c422H~ohRY?X(F2j
zq_MuwVtn8GigjE~@vzC%9G}U-j<-1Few~w&=`>~?qN1}qMNuoLy3~!sO{-{3(4e^U
zb@G=_Bj3f3I17JLyL?98&xVNqLUFh<A3F~v*1zb&j<XJwc~7BmkrK5X9&^y`1Se}Z
zapGkPjl-@`R<oYGvGXaN`-kT3muQTHl7`C^7<?wXd<^l`vyoNZB=h@1VvJYfY_b5G
z#miWEC4+5}3zUzFr!ed)HP!7oXlcXoNA(<!?@q&Jv9^cDk!#aSoTXtjZu(6_WjdwR
zCF0!PN3K{$VrXw<3meIN96+RsF>Wq;tUc$z+P#<Y&`Y2!e;Gx8mQp+I4b58<Id-!r
zZMPp$_oklGFC`QNcSW&zIE_oC92kFwikRLgdi5gH%K@qLA2Oz%CPTM9vFpDQ;91L-
zABWgFI3M4He9D|NDLI)<ZDl@(J{56dvmM9Ub)tUNR?2orDcm|5Mb%{*-}-RSu{-5;
zPbk@Rkb*OB$;<}Xqv>S)dxWIo^90{+!*r9BZL^&S{JE2|vzh`sZc?XaKx?;5P6yc1
z_DqAimSL1n%omuVi9+9&#%N`lZS1JnF`Uw_QVM)y$Q*hE*`+YjD~*V+n~49&ORO6b
z&5o#Af-huJu9r#0?P}_-+@dw{E9ZJ&qHWD7_HS87nePS)S4^P9%9O@RRSxNj&l}&5
z(*I^s*e@K}=cy!>+mN>TEpZxY_@32crDg%HXP*=HTZQuS5ma}~rr}m!TJQOAcJ57%
zeCSAx=Uy>4a>+GKqp<J@`xl8d`Cv%t*!C2UA4<O3BBc5~Nx7*_rqVA`v_7!w!AUG4
zH?ei{3xagDDL)rS&9YN89azsXoei7_YR@6V8&t*4rEq;;awaB`A90%fsB`d4n!o^m
z^4IB-A952}&KHu(J|X?&L)^H<xF&wVe8v%6hCAUGs!K&zH)^#%axmsTCwF(@<jW`;
z=BrZP_J#cJ1Cc-2N^bN7s*YRIczOo~<BZAl&_-TbNUXal`=%HWn^{7H(HWd?`eV5B
z8M~YY5_tKun1%M#IDV$Ns68kDXmeth8x?6L6#f~Cd_*bfTRxMurYEJB-%@!lmCW&<
zNZ!(oly<-HmFD3#B$0hukMNOxWxeev49hxTdus^ZVZ$l8HIB+2zc}zu6wQm4)3_><
z0<&YJMRX<U@Kxd~?MOW0iTsWiIlDR%tgTMixs60T{>(0&#q3;Fjjxe8PMtR}uf_&l
zzxJ%LT!^!dHwBwR$vp5s(n>aw`Ai4t`)BNv|6)gFH1;Juu=q9v2f>AcoT~`5vcNVZ
z6Z?POVWq!<Wy^1|u<LNl9jh@8pMv)78R#zZW@XEBjKW*V`FWo(6FL4NZ`rl^BEBys
z;F$TF6>~!{ihG8!Nk^7Uufn8P3+r^cVe(}ky7T@5M^C0YZe;SMTr|%WGtpIz@z=Z=
z-{TPK3I{a&1+P-=gYCSB*sN&9=2<$H9o}Isc-4vx70eI2gzkSuXx+1Csm3TwbG9>I
z=EdaIgPGVpk8xl3G0J!bY8tL6{qG#gO8F@FibG|^Y({CWB(KL?Ruo=g#f87H*rvm}
zHZK;f$ieuU1@ouZptYbsT8e#`*lS`w_6b@UJJ3ArjAl_jW7Pdoetw&g{|;mLwGRw;
zOkl*$kBmIml~K}v$z71bI>T$2oRwm3Xv*e&cUif&7~?ng%+^a{MkiNhTp5pE`;Qoo
z?}>JdI+J}mGU42F#%4F6)-@L8X%-B>Fp!af<59XWU;J;+s0r)IJwF|bsaM(XK8?*)
zt8l7o!`{D!O*f3OKj6s*=|}9X>#)mr#^%4Jn4P<T>8Hc2{`Q>p$|kISaf(%Qp0Ubo
z6h@2mFm88(ZAB7XUQZ{tu>}W%R=oG$#ml58UT<&Ut=ofLw;kENNMM7*8+L1N#Dg$)
zPF{~|`cZ++n{oBHgv-5nHm#VC>)Q!9XJ)Zc-j{VRHsH0(lMufYa$X!DVbXC@_HQHY
z;bh`m29w#ZFOpNXB+czX#8F>@z0MI9|Cp#-3)$y6jbQ6W0_Tn(@N>O*492&6HNmqK
zcsfjGtMP0CvhK4l=L~u0W+TntO-fu4DGsNRp3EWlqciD`6i6Gt5Eb^FsOYXF<}4xN
zFB8HYz7sb33BmiC@jEdUx7L3N8uEq!-PL&Z-+`;+zXH2m2vR;w!O_d)mK>nq*J>(z
zex&N?e9A6{lYdn~R*M^%9lgju6-z;xCXx^1Nbs{HqvvJP!_tU)zL|(V&4fv0Y<X|b
zy8dMtHay34<3~()_onQIoJxWB4RQmHqy}&-<|nN?-?P7Q3uWhSP$F=kZn!y3lN`yd
z?@2~$BBeF+C~HzdW^;s;Z4M+geqe7{7?xK?Vk&E6-NzcX%oIHSrZLq*3p5Buc*HQA
zGra`XrtYG?rYB{35(;~-q-Ioa8U+W+@euoB)`9Gjt7Q9E60_-lM7caA^27%A+?<Z>
z?QkqR9mMi^CYvu5QSO{d)yU=4o44mk!E4S2_2YPR0}UpEZw`1zf%8<VI<=!|vkeNd
zf2nQrNY^?`((fniIp3RrWhMmA&&GAdSBzyI=-I8oAT<%==DU=Y&Y>!AB@I5WXj^5(
zxdn9`J7Y+_Ld-7j$K?K6M5!W;h9En^KUa|P+laJ%OGtLGB>1}<;W}@K8F-q2#F4CX
z`w2gnV=ycPn^8iGoajq+WKWs~_vUztD`(q!akMRqTFW04Tb&|HHIMwS=2ZHLeR<e}
zoTXNz8IC4J@Pd#EfwjxK5YrM)P)QS;gVb3kX1Rs=QyfC{C~vi-Mm2>4m49<;Za62$
z=5a_Wo<HB8+|j>~R-Ykbkd)lncPVY`Oo_%}@<-gD;O`40zw0Y>PaO#}G6>$agH3;Y
zuvmANjdljOsY%6qU!&%jzu>StID23NZRaLXH)$5dZ!eO*Y7&X&eMnLoh-CK<3deq@
zy#6A^V>(g%L>p<>3naa-At6KP>9+-J^s8t7n@TLA_Tcf?RWXZ?Q~O{Kt%p8xJ~M+h
z%OXk(JjuQCi=?L|#P04#Ok*?AlAWZ@u%JS75Q^BTC>Gu%qrL~y@@kU0eHMB*nk|(D
ztUI|1$FvwcLk3X3VF<MuK^)22!Ks95THc%$diDWX2BswIts!om8!<{sM74#IoG3%l
z<2q%kN)+Qy_8V;y7X%TV{gsf^q3kS)#XkQrJN9oSU`PXHle<&%TZ^OZJvj8$l=`gQ
z0tbpo>u*a!{Q;sf)Cup{m(V5pM6XjLC+!g>gOe!eca_}hi6mZIO4#XV>|J;X$HT3x
zcGJc_WCDA-MNxY8G_}2KIeIObrl(RW@2n@MV|P*xxe)Wwm{8qb1a9e%|5<N>r!Pb%
zGo@I})Ev`3<jkE!ve2K=H~JE=;xOAgj$=u66t<DK@fgs7QmHvLugqxcmO$h46ck&V
z$sA}zLgFgI-+L11vjo3=hIpGk!_R97i5@NFWg3y*m_S<jN8+>A5$j_^(7t}SDjs6?
zGM$YNzTx5SO{unc?=@x|IX;)h>2~A=Jx6+d15u&92%f2j&-(x2`7;3L9|Q2ZKAeQS
zD`b30AXWJl>ElL`l$$}UbqOJ>_1PJo#ZrO4)^`QBQ145rnl{zlRcIM8llo2v$oxB-
z#P3T8F}aD~K(RlT4Y;&F#g<;X*fZFe#J4$QpSgoHY!mXzJ;>*KAk+3Gs@f0F^I8}s
z?!~TiU);S$Q@U^kH5*bnFljTj_mat+*PF!LSA+~(L%`4m+`A6LY2qLp_9f$@xJJZM
zU!>=skhu3L8RvVDmGJ_J<3qwP+u*e(n`JR2c-T4P|EdFu`!lG@o<l>~UaBRzq{kSN
zr1KvERW<lu`Uf|0{~S(D!0v}FTWod^F(;qYK_w)OttIzrJq60yq)m$#965lX?wOeO
zDZyuGHX&;9D2fcJS`<&cQ8-oFKS)ziBXQ$v{306hch|w?NiLh09KvR%HCw}<6JZiT
z>chdr#x|0Z^O}MwA4rRIA#SA+zG>mC+4>N_A!7*gHbUX|SZKUG><@iO#r?mK2|kn1
zu@3<Y$K$K8!SU_`tX~ep+^>a=*8>StUX3*PHWAOSld1kecm~}`KBh+G<ZZazJj>G3
zkNEZ7Lg3ablw513ddMrPdOD-<h(@B{hp5iG@ykAl=esnv8P;I+@U`F)|KM0QnV`}Q
zB&_R+|GdY<Xq_PZ&m;m8a&ccdi#5};(AUqxW%7Bv@48ZW-h`6f!^wYNMta@@B849D
zFTKe2OI>lcHo^AhFif?rFr701%Ohs^I&39qcMrA(Utp{FKK4@tuaFnvXx|B=+#M|G
zDZ}aTE8M0;k~^Rl*?)&f5?CFkq=Nf|KsF5t!Kz>m7Ak^kro3g5*>x7?n6PwCH6|UO
zvSPtYmhC<yp6|@k|78h3DFW_=pkw_TZH*-up3TEZ!-AY8hY7XWOn|K&mXr0cDwAN<
z=@H9q=AgU4l{w-yy25jtUA6+PY0k`kGY{>cLCn*TGP`LQdYwbjPqSd*u(4=Q31E)d
zIP_=rWZ|}p<is~)<=upX-!QBeB(v5o4E_GgF&Y02!yVo%)BG1qYnb<a1q=SnWch<H
z7$^CnH>Vvd7v5rZxC5)-J;wCs8dlU>v9eH=%~z#t{QC$w*XH45S&3u+E^JwB&9>k?
z%p>erKD3SHN+YrA{{oBAVJu%9j!E+pY}}t>?-IwRS{=+^FJ@zr@H{U}XX}}j>~8;u
zJzMV(GVq7M06lV=+p%`BDYn<2vt!~&TuV=}ez|y!-vhQ7nz2P`wZP-$ICPK2{p=TU
z-sA9{rb&RhIBTZm`2PJ3Kh-0IEY2tN@Jd3GB1p0}K(bMpT&qmMJB`?J;xWPUc6ePg
zA>jUY0{$CI@Z2bZ%B%@B>4J}iGQqYALQ<Rv`FB2%Dc!|gOeV6;oyaNa#9a0xKGTxu
zi33P}bD5OGZR9LGK=`pHeBE9VIm(~N3LWt}XF~Iy5iGkzcu!5jPP`!^&WyMzp~SeH
zB)oMtp*=?sJ98t+s>LLH%OKG~j&!LWlJ(<BTmBx|h~ea##Us_~Ok&&+()HJq{`?2&
zHwKbod0t>?GLl1kNSXG47=K?<RmUM6;3M>87Kz&&N!Ag#<?tJcyeCrs_DF-~lM)|_
zeElQjL)%j@x0Ec$9i*1dB&X;da+fSJ&s|6Q>N&EV;iSh6BsH*^gw?%~{5(#w`$^Jt
zK9Vw{f`sQ|NbXQT`d}B*Q>G(#K1pWrN(ypclGn$TJZ)3*Z`)J8>8<dVk|+weOM%@q
z@(TWt=k!^4^mR!4#S))>m#oa)WCu8t@@OXM)q=Ye8<5oeimbQK$zJx5{NrLC&90+R
z_;UHtp_F@?qPTmH%KxQOr`1k)v0~0QzNW1IGD<tElXLqjIU0kJ+?696FbTQc4rJaQ
zWGy{Fs_r7vhR>p)W|hFhk>nc$lGV47LgAO^e|SQL-cJg4ZJ_*s8&ygps8YU5@p4Cs
zgA$Qz&m*_t967<UWI3HcK1zw~$`_<3OOf3;k1X^n($ksb4Rax9`Zls8t)!;kCoevk
zoC(V04_YI*Bplczd_WTkHRg5{)O%BWGmzBG31qxKh%~Sm>Cqh|?yMuL)Rok`dyyMQ
zk-c&#DYvGR_fH6!*Tmy}DH5YF<iFF9zYCz?e}gF*F`bI_kEzWT-e`w76u;Ud-=0m1
z@f;FubVxXPh&bC5#1!up-t-I7w(KE&&r>odbtZP(d{VhbijyHJJ~xQ0nM0DR4N|qC
z6#U~v*|I1qybQ>`dVwr!TQZ*3k|w<Q#96b6eln4Wup+{ay(3gfhlq^*LT3rTaf1(e
zQw@opwgFk*TVkgDA>!3OLMt?gzBq)iF_x6I25?}kHRVm~DX<<-_ETfx6n988x<aIw
zks<Qm_@5L!#ZZNyAYDQN*OI(imDqFQtP9PNJn?@dW>vAzatQ%l7O^Mt5upvicl
z#jN8rEtk{yFr0?3A}5+C{1M$`;sb9HDKtcA?<_o@CE+VC5c5+aIFuvOcP%9J^C7)^
zEg7e+$h0^^+>3(*RDWb!AA17N1rd02D;1lTQ2(=xrgnp<Uo=)|urEk`Yzh4_h~V8z
z@$U5lx8!;}=SAXurjE#&=A<rKAUN(&QiZ=HaqmG=yE4M<yW@MY0lT{$@yYOFui9zK
z{k~IwrUMO6SF=A&h4g{%kY@g1U&q-5?JvjceH+`yRkJPi8(tdsi5lEM_RsaC8uSzQ
zcMO@@Z;%E&Al4)Sk5%7Tmn$K#Z!bceCsQ8ufV!&1H2k-YTCaYjo^V2XA%-B~@duQQ
z#zSQ?u7%rh{_vaKhYk>vH<ALAQR2?LM=o@Ac5Nl8!YhkPdCK-Geaw%A5VCf$*weX`
zulY%xgE5U_|Mm~{C#_jiXk(#k*DWIG`e?kqzGjEoeVmdk@Ng(1zSWe%`)5hNGmoqb
zlab5nN!#>|=urc44=cqsTj=!Qn?xv2NAX4ICYQGy>>#7QxSI6Gi%8WABK+|@LTc^t
z|9A@bexLAMaFZaPeWcI7Mrr6`ftTq>U3HMH-9yradxUkU6CPe)Y%H$g-%l6+@M3C`
z9@FF_@&HOW=&C?5S$J$`Rmm3fNmi6e=v-66njQ)srHxGR@!T$7C~C|hw(4KPtE!1P
zbeQm<GWOodz{cSd22Iy+>7mE=u5&1h4WRV91J%m!sA`u)j#>*6>n9}ik&Bs<E_~3w
zcpDcGuyQsLk{g8lZ6SEm51f>8*lfCujU!^&ux<=%yJfIq^;?XinlQQjo|S`7lbf`O
zc>BXdzJ1K@Nwus_=!UUwPmEfovh30j;mgUG>ClbI^X$+ZwU(JptI_Qwct!7BOfy)D
zR^St+)NEsPk4RKrr!uOS4yuYns0hCzzrzCj#EN*u*<zhji_xk=7Cc+e{9_Z*R{hDe
zDr2Vg)nQ`Z1Sao)#<ZRK=znNp!jq{?{9=cOQUW87s4(I{3o4&aqiWQIvWq9ft_G3!
zqzw}USNJ#v(=jP5>9L1}`7nRNPUZ>@qZOCLbkjDb&x~jMobhN14L9rlNG7zO!<d>=
zj6XXbwT;&q_1d1%#&T5p&P7Ef3gyW60v`=oxy1t0_ztYjo{Q0(#mv+9V6JQ#vpbJK
zE9DL|pN>VdaxIgl`7mCu8<Xs+QD3p1aX!YV{pHH&FZWP+6oBffM3h_>pyZ{4$^@Z{
zbSGi9dJ4-TA~9a<g+XjD<~q5fGp-M_ZkRIb?^<REFKLRFIvS5{m~`|w>hj@CKH3qr
zGv67f6e-?wGs^e=MR~(slv9VHEX_c<rJCF?+gUC0$mQ4WvHVecmh1gyp`I(VwRM@D
zvzeI@f0&XspUEQ}QD6Olaru83AD_W!>&}ewF+t_y1x9|{i^}c}D9tuOS+*Xv34fs+
z=0>htEGvd-VA9!x<r{you<yStntTlE#xi@qDKlOx(A?j`#NQX0B=Xb=nTr@V>Ja0V
z%NgUj64mOtsJ>pt=-aud_?n_(HILEnhoT%YNu0sCn5vJ&M7xZ|26Hf+T#cc71iG)z
zp%b`*>56b>K2$?<sy~zedxNI%D%G}bWBlk(j9EGa_0c0y@e+Hv=MW=f&Y)!9pOJdI
zP#P&G*Z8M6-}=NAO(Ns3Po$5WK=fQ)qW{q+q&^VO<FAPr@gI>@r-{hki}yWGyt>R`
z_x_>m@cPE)eks^3w87?fHr8e;ENXSZDANoh-D=@!<x=G!vX!uR)LmLn<3I;WyG*3u
z`Ykdf_DG&IlKmr(?7;P8M0O#*zb<jwyNG(-MtF&gkk3o;-<HkZnsIo=hO%j#7Ir`L
zuzL2F$gzgf@HvqdX#*!soH>1IHAnkArM@75iarl06FRJ6&T|e{|4qe8k!3vWO;PG5
z(u*sIHd#vI<3bXA_7M2e2DjTsS-brswr<C8I(nYEV-IQUeV-Q1?VS1|=kn?}PLKUX
z<724S4X3=Nkb`Ts(waA#`hagD(=bEvJC5{Ag6oEzAZn|4L@Xw-+zoTn0QB!JV4cVs
z)(Vfa@Yml|AD%?R(L#;|58y<mHZ9w`Q?=BE5?i4kPj#nO_~4B$!ml#hNNH6D`KgbH
ze!L#9u21lswU52MAL6q!i{0vjaJ9P4j$u~p2-YG0i{Pt5r{>?7MMd9iDlTe>%xNB}
zQ}-d8;!JAi-{c$-`CQlW<c3s{RZ)aAY7OzV(fB`VVY|yQ9D+KsRaN-M)92y1U@%Tq
zVQlssO!>UM)GSt~;m{vieWr8fqsU!{YS7ZDQDmYAshk!`L%D*(U%#^dTn*JDJ5e@z
zA%)>r$QiR%@S!67HYZ~Kp#tkeU=`JXbB6GK${ndbu$}q=8#(9@Nt<gWmyND+^}$@)
zv?4`@6ho~}5{GI+Ir4TfjkB6)a_mO6^;SwugDKoACvnh6_Ni90_tG%-T>gfqSs;7z
z9uhdihpM&O>_1;mlSVkLiuYVx*}|2ljhr6nOH1F*)c-h1>*Yxto}EaIQ;W#iW}`U1
zitO8J<k`<AC9ylfm5sRAM6&sFdu-kRVE^ScI}Jut`N^Ex*kBsnhtVQi!nv={x#FzE
znU~odS~Hr)uJ1TfeU_HRE-0ReJm}IC!7J-Y86HPQ*Q3Pyn-M6m%6GXH+sriBbf^=y
zi|%0?`i#o#V5*awsoQAIK_?YX-gV-Ft;ndJjHdaCl*W~bwEnovL5(-$R`jH3@^mD@
z`odFRO_Z9c;B;+xI6h#{=$UNysl)8TA1uqZu{}4BD(O4ne{G}O;|aB$b2zB;lA~io
zXt`lR+1hZ5BVSN;c_+mp+fLppLE5J;F~b}Qd~%4eU`@Pt#^Cm+5%=B8*>fzP4I-Pg
z^lfJQ{QFcqI7E5;GQnZ{hz`LsD&~Bl-0>=kK{jNZc}jZFMY29iNc%p4<j&K{8Yr~h
zkvIa%+6Y{zMbOF^d^AV1_vmf>3^uT9MF4&$yR+BLixNjIN<HgQM75AznoUtcCW>KR
zBEwrrMzJdClg!D`{6>D1GE$*MC5bVlHccaXj?lhCZ3tbviO_fc><gO4zV4OyUL8y5
zfcFIMUm>#SaP}`&X8*6QG#(Y1|CS&sKfa?pdpAWvb!6+zCbeKFxuXP6%rGH2DU!4#
zFTxLd625*d5mi}4F4ZB#eI4OLTt)7XN#G~gvwsdH9~My=7)t%cI2y;Uqh441mU*KE
z7k@y8@N6>26p&@ynVe%`Bucs?St5E0Q8z?BD16|m?MN=Pk@QBBXn6{;V%|p>-XYE_
zf#`>ysTwG>ja52jUy7*gw3Ko$d(n9S1xHDX_a)_v_`dRONJ1J&x+?toH8VtRl8Z$6
zgt8Y`ke=;{M0Bs>3VM?8zao<B7m_t-FDao{s9YxA>&|s*1xK#aE1`aA5oJRcQZiO>
z0OM~WXC97ptoV!_9g)o&j-*8Rhh=w><^DnvBr?(|Q9?s^A;sn*iJSEW{&XU@$eL8Q
zwG{U_N@>53l-*LLa>!MxL&GS3S%|!A2w98&LaHMner^WIbEHVx?Im%dlw{$tB}Y{d
zv%ZixCsQOICrSF#L~@wWy7NDZ{>WF-Pe)Uzw~EToN2u&Q2)W8xk@JX5r>`NI#@$G*
zDMn)4Op=E1U4;&is2h<yR*7WkPvL|8Afi^_VdXg_#RVh}*hEUg5YiYzUUM$;Np~o+
zc}V#-Z7L!=QF1?@th+fv$IM0cW*jNPA4q7?AoBV<5)=YE#NCh_H$?Kujkp_2iTgf=
z#FtA*8Dx!gPA_EU>7=Gvk?pt-`8ErR`*k9-{W-DMW63}IhN4W-1q*nLq*{0uC)bgZ
zpCtMSuZjC2GAXleME{sa?8Co_m-Qw=)0F7i9!Nfl&c)&e(#I#0{waro{hH(|{Ukp)
zj*^3B<W^sxAWufNK_ls<%Z1NfL_)b5@dhnKeHuxEX)SSA*O8PU@`w8t#M?|Ev4@OU
zCo?1yh5qVnA+nzs(k>~H_f$r1<_Ss`1XB=TNx_{wicJ?$_^dlJFC|i{pObK}A92&W
z5SjT(%tYZe_WMEfq<O@zm`TFU&cwY}C85!TxC={>R%{@9av<r0&LBT{k&J_51-2EU
z7}!Wj?Ll(fE+YT?66wC>q^)s9_UtLL=L1OGByx=h(L`%l6K~i`(!9RJfA~Xe^em#+
z8jxrv?$7cqWVp;BO=Q-2YkQEH^ojhB4U{+~kRLNftcx}U1I5{|l_7ttBkpz)NfUn%
zyWt2iBlZxpMC2xg;?5oOBJP#Y14e_ydR;@JyONZF1H^jhlTjl2b;XHfd#h5=a*E2`
z-6<Z_f}(P)$o*D{uHR*f3NMgRxSIr-E-@3VNglF`WaAse>{Jn%%NcRz{D}PHM6_85
ziSAk?*Bm4*<T9xpbjZz`K-LLEa)t;$#rPkxem|#1Z7G#|yHnwABYdGB6x{AXx=sqI
zpBl+f%N6JG0`al#NDG9owAYZBsv;7*DM*$@ka(tC^dwG`HrRkHwUr{nwWPF@6(wt&
zseb1}^|&-@|9vg)ksD=NN6Gi9C2N;CY43_j{ai)Lvy;R-tR`-3mguxh6CRuR4VI@!
zUeid-+a;t=6rNaT;b+veC;eGCWfhAlb$&#BbTkJu-cc<$P3Z~IBT6tJXTvNc<Lyaq
z)Dj%chQwX&#F?}wLG&k5hj=4X4I*w(0EwMX5~KH*;H!^_YraI9qZ%o-#uW9aB>M{j
zU)qsBZaYQa3(1KSXGK$kR4p6gmCJ}#3LrtFACVfuzgw0~bdKn*9<3vK=?B3zKM+yx
zOUMbKGmpI?bdlh^KP(7+AvoKr{^C2`BCqy4IY$?f-Q5`3y1vMcxf1uVE0N#i#J_VD
zcYQc9Cxyn0`zSu6J7G)uu}`lDuGT;BS#X)1E7kEIJ_h&pLh~)r!L?xuIp4Y=U(%UO
z;maqT%qP9$TarfzZ%WME@Tgb<0$qu^B6>&-wFIU$5x8I{@h86!a0)!1df?!8nJqho
z&iio?$4kYS-KfP(bQ&y`Tg6#xPhpb4D)D)BV(v8fPo_FMks_;gqz{Q9v8^82VZj5<
zE)o;bN_goBlBUijl2-)XHN|;N5Z*n{vGLXlwtg7ShGAc^jOl{ImvTyW-9fSYDU}L$
znyMynIMAA=fsvGMw<L4$I3%BUl9O^kV44!r8?GW(8iM>=0Li6lgqr5yu~bUnad7Up
zhAjnwtloPBa~DT;b_}9;@G*){8&WnzPLt?g9qtrM)9e|PMf4;uYCBomO(+n3uc~ba
zkXs7BHA+gB=%OVa9D#rBXm<3t$X>@k*lG8}Vn7&6`?j(E>Kq(al~MHIIfe25l-zcw
zVOleXWR^7aj6`8I7x~mAq_-={mA$00#fIF$GLk;bB(v)alDg^;Jj4<&ttNIm{fCYH
zGi*BigYn*3tbUe(mAqN}RxxYzv?*TdK;3Q`&7QvOx8Fh06?@VaidoiQ%%S2M%0h*o
z{X^icZ8;gIf=CK{M__Ux-W{jl{(dG_2K%sn@(d%@O3XAIS#@GN#qY%ZD;9t2CR4vs
zhvqHcsQK7J!SheZ(pHcZr6A*$8|7UdljqlwxCt9bcV157^#ypArQ`HwEsp1tSQGaY
zyRHeC{G7+i{8pA<@fF^yKKT|$QK;)u-?^No7wJ?s#*z1z5vjc%kvx04$f|Tu2p*N)
z7Dw>sd&J6w2k$=s?~FNk_KnBC@H(zjyRrGeZ!Bl$V*X|pRwMN&oFux&v&$)#kEeG1
z80s3AQkuCy`1$vd2yBR+c$Fk~akpj(ol>pDo&kLbYure{&jefpo!RUVg4ftvBFhV5
zlf?$1E8Wqz$VC797ts$?kW(|5!d;iCGS(1yqD_&(JJSC*ow#9R2z@`2ID-VT=46uC
z+6#|T(IHrCg1hiWt+ki2MqgyPSvsuwJdss*wM2KPKT}h6L|1kLdAI*xlTILa*l`rW
zgDCHlLY7;r$S1!Mx=w*#_hTaCC`8t9P~?kKu?y{p$H+~%SZ~J6`aVV-=3*6kf`wYg
zG19SP+VEG5eHoADwQuBUjS@4_fwVh!kUzIa-ZT=4_}-CKf;X0b6TPpqIDM4h8!!|5
zjbAaBp2e@<YkVZ{Stjsar-uP^U(dkk@ox+_IWwWt0w#6sh3574<lR*v`Ml`S&7Dou
zR9&Lhj3;zo3xO%$@sRAmc8V$HuO?%AcP$qC5?OhwCrif9W$8_2Cf7JI$=8AL&rf3D
z>4@HdGmO;Vh<aBT)nSn6qIM;8%?Gx>_=aQezPPv?#`~}wSLIJENp@nb*56pKQo`|!
z5z9C1#_EC49M&6|KlC^{0cS-o+ml((exd(kAQNZ2WJIzXWA)t_*K-58J2zqbz#B*5
zhiz52z-~Y?o443uyQ&rAbN^r}=IolSuQ49y&ceA%S#c|lrCK#CGS_5I{Vg<lrZAyt
zzVO^;F{!bbQRY6V{#HS)PY-hCuh}%m2Zu&gY~G1pYpyL@<3-QIwU9;b|6t{if=S#s
z7V1QZEO0W5qf?l@K@(kl6{dW#MooB#s=HS)+SUxU)@)Qnb~3tq3#!7y&lC8&QJT)y
zc1!U%^8&lJR?+3!$?Aa4=&#Pf@c2)R^B<u%JX2(gZ&^HBk0lS&(H+o}DML>&G9-ji
zW;0Pwku%<B5lZ`(G0LKzk$a5D8E%WmqlLKsn1IJFZyZ|RVE4}{tTSI?<o%0f1`RC!
zunra)GjGQhmdh(yWhrvXbt21*b7#!?u8bMcmFa69GOO?`BkgxHCd`X5@-O6FdrOeg
z1$LP&V~gZG%WUOrxH%sCuc^YrTE>cL<}CQ@JhO*MnDz4@M*f-@j~L86Q*9RLzGm#(
z`>5O${@58~)IB~k%)1^XF@KbN=8+w_AD5b8ST4AOiElB>w-07h)>hG*)5rAce9U&v
zWa0Bs7|28?&pI7rn`l-Z?E>k2(DqVCZO016^&5uzqP}QcGi2nHS*TuLgqn@$V@waj
zdFl?f>gD4eFV>(`1v^g(mM)bny3&#PjhC2jae!Iv?=fBGz(RW$3_eE*?=%gK;{P%B
c;u}UE{}1);qf!4S6jhf#j1sxW=w*rj2V!Tg!2kdN

diff --git a/src/feat/test_data/test.wav.fbank_htk.2 b/src/feat/test_data/test.wav.fbank_htk.2
deleted file mode 100644
index 4e95c15a30844318cb4a8165a8d9047860c58f90..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 13076
zcmWlg1yq%37lqL=$F4EPAOsXq?7)CW6cq%OQbbBXR0KpoK?F(Z?#rdSOOO`C?i{<u
z?i#!P_g^lTYvElV_r7t?+0WirMn)z|rpvx8nGG^Mky@4_&Cnn%X)BUm*`$ONAyErL
zDmRa;u{X)t+?DL#&ZH~55hKwf;rk8J7kMFlIE#eO1;lkfiuCJYQd?5VtQ$!7%d@1M
zs3z>ldqQfea9f#$tIZ-jU!}1xSdB2*>$um=#AC{GTr+Rt<zI~X&~ogy$6&kkC^Jp{
zpdpxP>ZVNHumk0R*^D<nkMh_!<QEJ?q09lLKHmxNIR#Jc)%baxByhkeB8p}Z^xhYr
zoKhT54PjT}Wdg30V&i`eyWmCG*ekQ<_a>IqnlsVt1QWB4GqK}66V`o0Wsf3CxffAz
z{E4#78q#NuCnfkQaq_0bKm3E}qL&2CIE&|sH+XqD5gCy|+=Hd~v_{}Op`NWnR$}8`
zh@s9>Ccpg5lvP(yeLR4Pdu36rH)lfjE+#4~G9hOcg<-!aFz!xak`m>Kg;b=Dr!fB^
z*;>cRIRBY~y1ytKEg`uufr$A_2pjna`>J#BlYNQ$q%vkaJB^m_7iJ7Q#3Zv$6y3_1
zAaQ2$pdS=Sny8pmMOl<O`<-fN*xNzH)N%?pohK)!E2UO<sO&b8oM*F<thqznGBslR
zwGy#+A|_3@(H>BSS?4OaHJvHft5D2)fc$MUCO-O1(Sb))@7zI^?n0VO{-#aUl$wDH
zD4WztaZ5clyBer}aD@`{269K0Abp)p+WaDMJ_cg*=@G`S|6^NpG%NPrV&45dOx99j
ze0~sW9*Ze?{Ek{PKWe^J(E4*S?fWvRm2^^m<vS(%pQveFOGBObY~NidsX9XT=t9z;
zEhbTGKF&7Fu^6Yq&KXLq96OXHS6Wc_vSNlp54f<0()UBC{rZkN=UKF?x6wZ1AvL$R
zQ2x4#;;0Q&g+8Fcr#EE>Ybou$j+{qR$cxHDViC-a`G>Keu!EhUyD?0f#q749Os#K5
zQJlLKO1CMC)}Z!-gagxz=-8vg!Ri;({98$RY9PgB8B`s)PQ&~!l>OaG`C~_lAGA}d
z+m(#r9qirt5T{2Q@m{Qf>Da$n`1~p}G*6*sqlEtR)08P*rdA<>hJG_R<hhl0-A-x?
z6)BGw>%Gl@O5<-d47^E+Z7Aic52@JQOl7Zg<o3=b@S6gDb!!QDG>o-@>C7t&LTS%4
zCL5_?95<ZOpjFiL3+6zw2b~K((Yfe5_5D4mbQ(i(_f*PGX>p*koH7#|N*8rfIlPjJ
zEGu$GE+YDTBQX*8h|qeB+mvru^*2XHcLcMVUtwmVC(hmxaZXY=AU~eYBj-83;1um$
z!>Cizraak!il^IYU7tkbo|BZne@pqAqm&IxC3n6d$+zT5U3!f)**iq8o`O^4GEB~n
zV(FL|EOweumJ>qNA+g>QOF81<$9b7k9DBclhKgNObre&1>^6t=u5qYr7S&@jC|mpo
z71jGFEqg{v*it0Pm&vv%CvlY~JHAcFY{~-kE5G2pzLl~!Z>XH}n**CqaHv6rv!*XO
z_Fy#)ABR)rG@J6j2GDYRGo9m~QEPFEV(B%pr>`hcm`Z$}GKnodWcmjZr#6@!pUz;H
zJrA=XmH73rqAY)?c<ova+&xD7i(1aujNs@^Wg5jESM1cJBv)3f*EBk<4sjs*CdFcp
z3p2)(|8yuZExkyOu_AlzPQnyi*)nPsHb?z1I~>T4>)MnRcu_VdgNBTH+TZJPI=U-I
zH~G`Bq?j_D*%atqrn<nNcE2w)M6IK=`V@tlVlTIzA>ODSY5rBR!W@b8NyPbUCRXl>
z7;SCCDe5-mvig)heNID2HSNKhIhnANBSlHnZyrgp?MMm~zEK%AgqF+UG}tVrBK{Jk
zEuxmimyz(o0O{&}WSPw+^u`HXlZ!F&QO4B!5gs!iQSQH%vbWu6T=0gDlA#=b{e;6K
z(y43LqU?qxMfzQ-9Q=Y-IX#-U=2PVpL}}f2GIc*7S-6PIT6Z$6tclqWioe<d9JHq6
zsF{avVipx94OFPlqw!8xIvZR$@pTY~|J0}cSUcsG=@gH1p>od@T5pV^rK*F<KHVw3
zVM~#DHJN|?jdbufGNo5Y3_3y3#!QTrj^XsviNGJvs9g4nsyr>4#h;_s`fz$$5S_t?
zIWVS;@|Wfm@6x8?%@bOy$B5_KP#Nn+dG3D{rZ|()Vn*7VUSzlpByK=Yd~f=*Ht!X#
zYEA6B>Oz&W7qxu@Y2NL}@x@-8jk`cc?cda$nMAqDT8a+Mr2PIp+9oWgEijUbaPe8M
zv{Gny0?D??BoCcQy3RaeTm#rE_b&!<*Kpf?i10s8QgzFRx`64l_F2O*|1i$V$<fw2
zni>zW){DQ8KV6m58KE>S+ed5iaY`?~p>+69^3Rr%s**$MtS4kGo<mY-4(_sZ*f4Yy
zjy1Cg(vhL6F@ggghP2%_=Y+vSj`wk-`Ft|fgViZKFrD0rvlRFFo2HEkv>Gj?r2ifY
zePt-fe~M&%6G<TtNb4d)jFTlUiW4z2YrrL9IDWxlR8PvKp;Lj5uF9M~;?1#c4>|C#
z7ZuZN$Y1!4ob7*7D7&5dH`i%0OC+!E1lhyw$WEV2^o-X8*NrD`iX36Z`8aDVV(H0w
zc#LWz@Psv0RtIR<e2GKHA~>B9!Qskus%*qwl{_WO^)(qf%H*`Krn>u9s^zQ2UGXJ3
z?>b3wGI+~9!tG)NA&qi)9!zKbq-88?KZUK20(&C+P!^y{UB9)o-91TLY#j%lJtFUk
z8fis;kysf>{K6`dCm51<WdYewEC@VjOoYq_!o&OHb|Vt^C~bVLx?|sT5-RSo=z<Mv
zD;BeT<x8?E70E0R@9)}kGVdmm9Py68p3!Vqjl#O-9hRQXIHpV>cuW`k90S<+ZxeRT
zdodHeO26L)v|m+YzThM)r6ZX4J`jyR!ZAFThpx<eB2488Em324<Z1RqUt{mztFh|)
zoK+`wu+lLS{h03Po*sjtMt9bwJYm&TMHadBM&sTJrkh`2YRw&{xh+HW;}KMxx-#*O
zJ*o+fs4bTgsuGCR?9o{7>A*($C1x(YG5e<!-7n`^DkJ8bUoTKkmSN5F91PERp!qw9
zDF?@+S}305Fa?F515ozLVeGvG#x+ks_D^l(hrdJN*J(n3=wNL5Ck7r<vCwqEWLOd_
z{<w_(vm!J;I5Io`5%Zn?!KCUg>jED$M{5kzW@a$;m?nzd2P5m6h}`x7#ux`N#(V^0
zwI?BWK+K+|J_OInW$k=lR<)$C(bfdJ%nPi_xQ*Vh8_a!}#LTO^n44-2Ia(}pa%PTk
zPt@%DF=^x)l<x(g^eu#O9X%M6*cG{m{){uIMpkYl3Lk9=asGz6UL00iyW%8skWEj%
zVc({J{U2+w?N`gjO=Z}gyo=rNU08km%=#_6F^<T?q|<@*873GTjKD~7E{0_@(Ftv2
z_4uAR{nv_1!hS+Z?y`M*1wPS#;Whd&9(Vu4`(i2X*^}{fS<H?#F?gPv$}asmxKAB|
ztK}_r#Z}<yx18<LNo*=yg4^M*IG5F8H)a6qpOxYD^CZD;=7fK6C$Z3&G>1^qUb&Hw
zeu}K}LXuvak*HQr=)QP@erOYs_cxKDN(A3aXWy2c1bBQSsIHWNhK2Z^c4l9n{&;xx
zz)8uSfR-VIgnTFIhBs*=ca!w&KB;HZNY5xB|4A~^S9YZIjwGu2F)?x*NqGO0uua_w
z_qHUobr3=R-SBfbg6pP_1RcDKzrhwfZwBESeUBiSBJp<}xr-N&xBD4o<(kx-Suc9|
zcFJ2_#q+z8bzX_A`fC&q>7d}%5m8SUlIUVVR##o2ab^*jtw@BjE}{A3*gXFZ>jvFn
zx!xX3qUT^TB8uu=)2MBjK#RNR!Irl<7H3a;-}BUMZlL_%{gg>6X{s@#@!fSXgMTBd
zbsAObv7&GEMq=wl%Ca;PZ~lSjeh19Im|`+?KI?nGz+u)LY8O^h_iHY#15R-C;~>r`
zdUAN(59;3CqTKBr#Wv<NEMG$7-cYh?7K?hVPu@jQ2P?M{y+4#Ftui8IE$|wnfK8u&
zF!y|g#l2o^Zjq;EhZXy``O<P#mSZP-ioUafBN3meOBZuxzsD4J9ZY?c3C-%KDL$}|
zv=<>{n0_Q-;V*Veq6l13O;Df@E^6D+JAW9BefQCRHh@*3-KhRPh&l~fT0afsMDP2Y
zGmxP(;v#k9_fmRU=%LXYscu?BQ|Lr0Zk3VIeIe=DQW94m7W=u32r~ns{omlPm4-?2
zKWOf3M7wwd8>jT4R?Uoh%|0CT3*}_I3}>6Kb8velHSdp7_{N-^Zo4U-<v{H-1**QC
zB4@DBu7kr#`WVZ;ufvE`9Zbv`Z33;=U}ry+b#I<x?xe;RpL}X(UF5)ufpi#m<Fu|4
z$8WdNwDT$DW!7Y`7W&D>h0NO)6nq^`rNcjz{B25+at{j9yOWf@nS`tM#Bcsd(EJtH
z<$l1t*KIZ>Prz-+3u;Fd(eToV!`F{<Zp9ATGcL0~)0x6+zmQ1ok`P!(a*qH~pM4Pe
z<r9_vh`BA=oRWeyr1W?~(&sV6H#{QX<YG2n3P#IyGZwQ3;n7>0nv?Hn7Ml22jT7fz
zOrZJR8OrMqk>!7wgj4#&D4rx%rJm?weR2Y;sqPTkM`(kxqc6nl;D*HV4vCtY1j)Ix
z<*f{+PWd<tT*z*_!BjuDq2(_Vj;_4MsnZ2Ctjwg)>;@UqT_kEeA@=W8MF09i^lV!s
zePyY*xrp*h^C+F{O7;yY32nWJdYjC?Yjbc7vBz%dS6nBj5YS)D>oe79J{rd1*F9-B
z+(gaeLUK)3khX6uakhtvsPiV=;1Hq1Du^2tK;eq7ln%N`QRyY}=CqU0IF3;1RlG9>
zviY6`M#l}=H1`90UK&w#Bb$~<NgQbwbMbk3%7xy}%9=&;pM!`xR6+3K!vwCC!GB&B
zA^B^_R$Wc0;wJK52rb)X5lMTt5Z(O|ez*Q$+r~HOnnYo1{DNIiw@`I_BhA+`II1+8
z{VO~vy0=)IF)`mYz9#hRE&}F##@BfoUiYf;-?IQoa3BR6t;tY~MtY}%xbz>yj-O4y
z11T=2dt&-R8+%&~JoL1wI{AyH;bk1ryFt~nNup->k}~5d5n+o6QuxK5v1WMKF2ZHQ
zXng)t5&9{D%*r817xyQ9e-nv5Ux-z8AgFL8ZhdUg9p4Y@iWc0y?WU?VjQUOb9Fm(O
zw4K=d{w2igWD<0KD0|;%;rjR<PDN+fRyF|NhEyb*?8zN27}0w%&y86|wpS>UgI|eg
zcE_XW7%Lj8u)X&uJJo}z)_Y5x;d<JlRH*n<Q{3lG#4FUYud6zKF+*{E9nIFuI&9wZ
z8@K3hLQ{(8J`E&sMG#pBMP0n^O-jy1!d8^wc_|KEgG4+huf+dof2vA_c5ODMMQt8s
zGIvS4@*nZ52N6)5jPIA>?6_PiG<!O>nP+g0ktJ49hqQtbBrjH=;7ufjb8?VIHW3qg
zoIoo}R-cGxuS_h#e`rvp^p={L+i5y_jFR*;B>BnW>@CKBb_c%6)7k!a2X?Pu<L;a6
zQ29-q$`#VSb|-$Ks?chR6gpcV-T#}I+Lidqlwo+V06*6<f|lzF4cSi3yx%kqIz(~4
zE~%R4#JbesZ+{W*nXB2N*Aw%a1kCkAaOlyG7_|aYx0Vofa~;{!#LSmnPV$gH3E#An
z?Kfo5J+~FV%?1Qc7rK946E#lv**~uf`A2?|RNzYZsowYsM&@xj2L~q&EN-~7@po^w
z^;t>SuTR8}^b>V^7BM@+2z&pGfZhvmUDh2F9bJ~rn2*cbi|pCGk)qs{lq4%tcr}hh
z%j=^5ZDn_=8ZJwG**<6l8$9DNZWZ&E>~XAZM4i66fq=7za6IVFwn3|~Gtt0patWLJ
zsG-wkFM3}6a8$j6>*+8gC(n?o{FKD_KKM^P!gksD*cp9f!;}Wh-L=s1Im!xM9kk!R
zMSsaw*5vuICUht(^}3+j<pnG4yP>Z=7R|sRsF!|5-ElX|f1gL^wCL0J`UHP!WUt|A
z)=4*Dt=kQ=4NYQ(&1Avvi7YsC0F5^}s5^Z}Jz9tPM6n=O3(ZSTEC~66W_1Hgzqz8_
z%bj`8j<C>Z983AhGI>KndcMLgI*Uy^l(6i&h}AjESf0|4)wdq7yh|E-Lnfi2`T{NG
z`z*8n!J0pP&?}$K;#GYxZ2J%6W!Es7?1hQ4JO+LN7`_<FmZ1&U#itWuFa*bAM%ZgT
z!11>)4x8gK>DR;>Ln&(nQ?QENj%oN8R*##Bk-={^R^+qk(R1t`pJu}qS?o(n*|x<6
zr?p0S$~?ilw47j-;{;E3A-L~*3|{K8S?FB%EjBnAp1|}$0b9JUv3ZUR&aG8A_nL{r
z&7JK0=N>z}&fquoB>v|;@V(j>-?1C<Gt?!x*ojc%1VXw#6k2F6sdEIUbhw8@;Scuw
zXU)EZQrt9m5LDBF|Ngt|J2swu+Aj#P+=W-w6M~zcv9I7GA$z_Ml{1Fe1QjCdUJ^O?
zCo#Wz5pVGy(PqO*etDIYf^wp5R}rFPL!ib5BEt_7HLsn>zM5j5d_w3l@m@TYiSV6G
z_?G|IFl(aT#SlJ7QT$v&+|(kH2N;piaht>)H<1{8Ce=2C^a-Xyb1IOvbUq2|{t>-a
zRWR*nvhJ&sc3kwYKEY&lKTq<HAtcXpB<jgB(na4&j(JWj9VF(jCpkd8E>0@$vI>$A
zah_K@l5%c4Ijic#?6H{K+a6?;E0G~|Y`)7ivM0rm8D~kVyART|mt@?RN8(XK!cYw)
zZT2L~j}h<7gJg|)BpiD|a@<Wa7SAQ)!WqHd3Iz9@Owqox<i86iyWO0!#G_PPQlYfC
zkfNi(6wO#C`l2(r-+LpS(Msy%SmK1H%YAg4>=hcMMkSH)GM&`j>LfiLNRG2Nxjr@&
zhI^9!Er#OD1r!>oQvRci%BFH^GR$bSn$CWSl;WMMsJ#D)(lkTz{XNK;vX>Mm(d(-7
z$hJN~+FC8L)AW$sb|S;*4n>0Z7tOK}8viQUn&R0DB@{MXqRb(lqJV#?d6Y{{>|3f&
zHc}E1ONqFnSr?-yn42MZ!9lW@D3YTixS(zV>9T%EJ1!#mV>@X-{-D6`1i9rBvQ-Wt
zJ<viy(H?UDeL`-HgxWK=IJm=v){&E`ei}f@z(R_5j6}LAoXj#kQY{jZto%Uom_l-T
zok#M^o!n-zH*L{K26j?l=ug(7)nx53Bh`Df=y_YnssBaRgBmJ(hEo%wNWBV_|2cxP
zHdmx)El9amPD0Uh;%DC{zRP)H%wCh)dJ5@;b~68yCF5Wau~+va-Ox#jau+efX%JbV
zMN-5d;kj%kH($)G+NqScNXZE^BU^nOnY||?^;RQpg3voMbBSE}mXKTH33F>Bx<ZZA
zg^gr4#gq4FEiqO%NRwP4s%Rt;nfZiXK0@?CD?;yTP!}ViMYDk7%&rus{!7*bGvck!
zkYFb6Na}5ZeB}wKeoo+)>jXv15cWb!vN$_&CxemPmylEzMZ&v<1P6hCqZU5%7Z6%y
z%x;CD)c<gzsYHj?uldxSenUoo(R+596L;N$@ZA3Fd)ph&7BBp=@8iF6DRH@CzC9rB
zqWxDge6`8E5=7>`BgBrHga3=uIKF&D;3o-zS>LE1F_Z)61nbhfMa}eeq$fNlCAts6
zex3x@p2KryGCMuY@Dyj+$2^Fbk+YDt<dZZd4e9-VN!jm8V)#hHRF>g8y#kvbgYdbt
z8lTQP)a6d&K>kTudk>*%>06}DmZWShCFs{C0?lgih)ounw;N8kPU5{Tk=WpILQAQT
zdhD~{uz_T{ZxZ#Pj_9`=*wyn1rWS$?8a*ah%Y*$tZc$%VM9ZNbR0Iqq?T#EN#-|C`
zDaC*A8}a(zxW-&&d*C3veLs@)@+U=k)kw4z$sL?SuGUe~LR^U`OT+23CmVi$Avj0u
z>8^10+Y0ttGoIGa^;CQ@L7KFUl&S8bE?y=uW(=Mw2XWc{g>9<G+4Cufw7tf{<1i!r
zx(3<bKasO(3ew$Ih#aAYJAx~;2#$Y3@WBf)RG+S-ZvRORzP&{CiAkhe)ROwukFcRS
z1b1H~=9oBKAJpMJcm~1GdyuChbZdZy@G>?cnUzYK;u4ZZw-D0Z4VQ1P*?4Cj{>zWy
zuNOe$KUZlgUPMPhyWsvmC|y!dQAszlx5|<d*GaH{x|m5I{>E#hPDK<N)=)erp2*f%
zBJ$@F6DdpBq3`U;t;T9xEX!sL$0cwEPMgF!bW5gkk(f7S*HSg1f{c2x-inWjdvS_z
z-Ie$RWUzbm4T3(G65&-z(2aHYWOuOjmpb+b!>~(Ug;lTzYu9<Q=AaF$47#x9@CXcU
z1QY9dme_bL!XMPLDSSVxuI)fS=L9R~o3Qes6B^g8nY~7t$@NOiSUi-u>gmGAdCx-M
zjm&U4%e(*!rUe?IP*H*Wl~@!m+oQ0r1o;tzh$tN`=FVU|2I*q7?Ex#serM^e<<M!*
z>`xz=*(;S9x1~%v*~;|coy<Ax%93ris0{v>DPFZqnRpRd$MuXodI<Sfvys353fYz4
z7`@Y#Fv)aG`;}rbx)Xz)YtbppXX!l$v}S0s;Pq+d&R)qZOH*d7`J0KyA2U-$huL2`
z1g|qgX=DYHjEA8-%Z~8}yP!C}4Y~HS$X!~9tcg3pXY4TvaK+%4EyntCEGvA3=J#VP
zI;D+zH!tQLjA8cY0ZbLX&t(0Ts2C_QMMaBAl0!_?TYz%MNAb)j$iMrAg5f&G4ZF@b
zyPn9s66~k>J0?rSjMnWXdi@j79`>7s%M+Q`F_^iJVwrtYo7s=+m^$DbQ;p4;ymLB}
zh3BRgE___+XC{P;HD0|7S?kBhE*E@IUmsbay<~k&33;W*+QmQ63w_I))oJJ(3Eh15
zKj!}wv)~_tnYFW&>Hmpmp0P$%@g*v~BABSPobivIFg|xH^1Z^4`?U=DDJL0wSP|Lc
zC?@nih3x451n<~_kw-W5n(nd6JrM1Iy;%Nm0UAeLg!lG<*;;u_Q&L0q%1WlZ5?oMy
zF%x<`M#W|rik_Y*tXY7<byF0J|3ZGOGx7`FQ9LjnS&a>ZwpC#g(FMIseU>fEMyI=A
zE0G^qDCWtzMy<?tbYRXzf7D!^nby{V+72fs6i#H~m^&y5PB1BW8*)BZ7=Jz=*~&4D
z6W+I+%sIy15%aw1JmUU#ASqXqEcJ5IMvEEDMujLNT>@>q@V%8q#HYSQUP~c-TyH!a
z#^W`|5zj#dxY+H*?pJqgJ5AX*Fbm6CDJy#Apu4RPIxi<tGiWr`g7wy%JU~<KJ)yfR
zgqL`p!Y#^3zrP?YU@m!LHqGsOpL9tnapN0_P5VVuOAz5RuM<436n}+#_*|;MYd|EM
zv`w%rd4Z+NYg#hL&^*9ZcsgpF3OmDTgVnTG-r#_iyYNMIs1l5)Wf!#0I!2|r7L{^|
z6s3uoZ=&d<o9o2qwISYpIRSm9<F+^*lQaix9%Qj?=Y5*&OoS(-&7m(JIVZUNr6vcC
zMLeX*Z8_D=!qd1gj*dYaIMl6<#^I}|cWt7yV!U9Wwgf7kAnHdUk)O^JuwonQ?FO(!
z{{!nyf3f!PB#IKBa3HgrsNa^HIC7U`!5e5hw}Wb#XiBbzQ>^=r#_16>UMQscN2cJC
z_2m0ZBK+ST>{gLwx92K+KE>fPv>uPJ`MCDni_0xb!Erm0+;~iOWC5i{yQqF*FZ8B8
zc`n+d*~gGJEE1{NJdr(cpv-*)d0hj@*6l}1d?Im2cjD{Y1IHgxY+2NVqk0aGjX7)`
zI+|^PrEHcvO!ZPX4){H!<=ZrlZc*iIfgQ)C6|`9za3Dzh|3hC|bGmY<-xBH$4PyU@
zG|KgBDVBJWGe3psxjpbHoQ=8cpIB}CfaUJ~Y=1Y39gEbc)BZ%W&{A!0)H#{6jDHfl
zbGiF04tuPmZQvp5whX0H?H@Wf_NVDzIhsFOQZwFK@Y@~~UV22_|Mi5dir^oac*o}B
z`Q|I$7X}bG%8C6lc{Dau((=z<j>;QwW%MU5uh_!zwYD78yi1eJF^&v=#le#6)J?VE
zz~nuYG<uS^bO`y!dXV`0oXCB=#p&=tHs7Ai#;<~jYm5Af(ABk~uQrUSp(Qqk&Ztr@
zwi$6{&@_&}_>;EZcWC}r&k;YdMysb%7TighQ;Xo&V*XS;M8?@ah`HO9z{;)oB>jui
zUuUpaY7+W<9X2b@Q#;q2x+&*qyz(#YJLNdD%bE*Y|Kez(32mE#XmK9SVK2c<t>eiz
zxj?aCK5<8O5!9$dl-E8n)4jxVYbSdwci<#mYnq{hrRr#$ZS~o2|BR|(qo|o{N5j})
z+8Zx&<nMgiUS?1}^D|{DPq5!xLV@sAlGWFdy32sbd%6U*4<_{XI6U(jai92!oqO`}
z)|rlFRxB1Z%{X`cN!8<ilsn3htFoQ4aDS@zSW;R42PMsoWK@Yc*vg-r7;B-sT#=~E
zBx`yGvC7{F&{iWb;|PH+n)p06V(-bJ_zEv=SM`4U&eY@c!jRk%o2Yga`b2nB`K9TU
zrtcJ47aJ-Td?9VYQ!;ADlC{5>!hU^89{UB!*sn+pstEUNBG|MqAsr%<<Qz@#Wha7f
zR0^N?BBAQ81nQlk_WLvHR~ORwLioDhJ84|Ll9~mhsp=~<RhPfW?h{J7RR#Hlqe!Tl
zDD>_@q<*@DS#KeHK@Jf&1(SO!c!Ny_VW*}Go>@xZ-C=lly-3xEqtwKFqq&Cz&E*DS
z9(STN^Dpu}ca!P-l`NA%<S1SzxBfD5yYxwYQAWy_XyV=qUv#lQsl}xv8FdgX%_P=o
zC=qjy5a(q~bpA_9&-hR}tcvRE9#rQTQz3F#<vV04!JPE;3X&(clg2-!<c=dLGKj>D
zszmv3CatF(k|)A*)r}@~=r!U_cM>lcV9M<NWcL#u%eXTXG+v_etYF~3WogvdME!Mj
z%AG|9LnDpspPR{0Xdva35y@ZMkxrRUa$T*^3dcxmjw0o|$dL)}EoDP3DU*b!F-nTG
z%v0#bBS@zWq}W<Yd7RLGtCFc5>_E*+1&VHeB<Fhs8Nw4ueR!DI`M*gKx;JIwZxUi7
zNf`l2Czlf)m_mGZS5p4VAX)n}$xAwrmY*ZP*AX(dPo|{Bg#FX>s91M{+++WW8qq=e
zJu9-;k0!NpBdG;~ou9r$;*y1=Brhj<mlsL@SP?yCDiQ0RkWjdV)I?vB&KZ$XA#_LL
zI0{<7lYQqSMIz@=z0aSjicOSQ&L-2Liged$B9Guq%Ck?zne``f@JW*PiDxefK~hvg
z%GUKl6Rsrg-2;+F{6Wh2_edrSU10GJX|B-bUT4Ws{YtJy3F&{%BxRv7r9*~Mv>=<D
zWP4I>&n8Xy97$I<lXOXsxPV2(pSVV}n49C)oh80Sn|R;n#4H#3b>nL!pZ1Vp@tDjm
zb16FBOx~#n6g_XHL^7B{{f89PZy<N$ang35AuZFIxKS^NUsXYr;|vnJjw06UILSqy
zNa{J4_yuE0xVD$rX<|<T9Fb^=+@F&_8U3D8@I9G)n<W(bPodaX==^ujDf_&i{2zyq
zgtn0)XHC3!I<Y%;5ph^}IFt4eGvPc@6NUaOEhB#VHR3aqNRa<V+^6BBod_o9dL0>)
z-N|VbcVmxW*q?H#^p>Z%+bzLT3doxMl1!y2r0T<wx|a)$-9pmrJ0t}PZ_D|p_<c7>
zdh(RGwBE#ew-CL07>Qd0NtRhlrs)OJhZ#|@HIA$q_T=CFi}E1_<ZT;Bfl(WU${u9*
zPbare0Fup`BEPbbSoMF2z8NO+PQ!`S-b#$tP2!yT6H!@7^m}7c_xwlldn?jUOebq-
z6h#>w<W5td_*w!rag!*V=uM@%H6{KAlnoKJ<ZCd}em98MEhNS~kQ7rz5;HauV<XPS
zEs-ne_nF8H7vat9A+a@t<kwN8JI+BmxrzLe5VH3QW^_~dd;d+RFi1lEMUlPeyHR+E
z0Tc}sS&JY~!9yCz=>4AboB-nQ2<_gzSlmgWtJaPc-cLWltqziG|A?g3;bd)pg!HE(
z*^`CmmZ(Gd!&1uBUkG1dJGGwf)ZhC|m8QtAWQ-AOR795G(dn5Y2eGpc$)**=NB&8?
zOay876i9N4CCPIs$$f7UyJ`s;v&=*db|iQIdNL%}sVXoK%p`|aeUYP@w1C>r3X}~L
z^<-sFvU-ZURxNU)_wva|X(4X?N8;ARkoada(oVrr@(YP`uOs2fIdSeC1YZavu0yb=
z5G_)4jmUg_nLO!J@@G3xJn$FA1-9fybVusg6UqLO#Le&~cEKd#*Y6?nA46ht<cJwI
zh?q80qLe)eNf$NTvq<Q72@!v6AmryFB7Z~@njS{x8VOksn<-q@pS)FD$^54q(qBVK
z4bCODS#ZY}n&J#@7WL>GQRSILJQ^ToW;;SZ>ag$UMRo|E*++Q;ZWc4~-(-fH?q0m)
zgvV}h9%)-5x$=T<HMWuxxmx%VQd0V!AU5C|VZOr4t_US=qwsNN-o$V4FoLg&J5^DP
z*ZN>Q@AqbNPJg!M)?rz)g{{vGG4-foy<a#M-GV4uxt|iHB5Ko*&}f-WW0{qhU6Ux1
z7uiN%;kDSeld-HDnHS`UxEe;(un{DF8$|f-zX<F&i1QJ5yf!b#E`0%xPu61j;XW1;
zN4DHrOzGNdRLHNPZb%acRabIo+)Ww+Ehrhcn{;`>jdDXNbWNeyr!OhCQ%OFcM(#3I
z;>Q{jVzG^#N975;;DCe6WgO1RU{HP;^JRx|d%KCExJco7j->j(e>mtXa+PMvG<??(
zxh+@nJkJY;Dxu6viJCbt$Pm0HrRXg=)2xZk-G;xB70!p(uvcVsEgBABp52O0-gGtu
zzGlmSsT3q8QhdFgO8ZH)PWy}Yxyyy8UQFp?cQRLK33fS{;=~go%Q~OTVUI{w${@@B
z6|uKv36j1Q{%RVY*&;jh#a84pZnE-$B___duw1u-d>&DB!bRjW`_Oz_nYJ2zYORGo
zIjAd=>w@!q-AmqWIVuZRllkxt35Wg`_fJ;vSykaZPQY`t96Q_2VfHW<YvpE^uP<bM
zjuS?ME|EX7lA=qysj#l6B}9+5c_J5eXD|gLtw^;TK+^F~<Q9Zc73V`%`wrrki!9>Q
z@5F94#q;u3wz_;|>#n01KWo5NH-yzP{V}W(|GK>(-*Ev&9oMNmGL~ll?KIn7p&~St
z+*aX#xmlA``i#s0$yAJ)F3#&f!3mcW_p_8R-3xeEf5M}B82-2V<6QBaEnRP7-sLVE
z9=Tv~*Nfb@0Ti%}iXNskIJR-1mog>i$CCZ%il}whh<cDns_$J2zL=8u>I7aNz7k%!
z4&RMQxC&p_-smu1yGCL?J_&o*NS4{$W@)`DOG8ACBczu69@8m#^M$&-3#mDOP-Gpq
zBgwih`phXpx7U%VUrF{wc_Mo>vOCd&0OP~#Fc^mU+Nqf4Y{L0W2!`r=F&3J2!MZif
zczc}r1`9~t`<0y2{^Y9*rX(`g<tls0D77c1&W?R?>Fm`HB1*1~w2Z&;5Bdwcu&#KP
zUByv)g>~KA(d#be?Sc1LDt`gpZYoSWt;&R<u}tfX6`u1`5)3NIa)=}6mJZp8FNE(I
zOIZ0E0%k?AO?Z>8TN2niOBbtUikLq%!GHH-Jg;X7?Qe*B_HQ(=c%pm#1j~~3nQ%{z
zDGJ_9OY<S+dNfIT-H9DHipYEuBIhI$BJw-_%P-+B<}iz%?U-wOVf$>l(58!7v+Ean
z<CD?71*(%9nUbEsq<gDaHh%+)Y<eO)!4lPhPf&1QPuztD0$)GF;leN+_QkQoX)iwB
znQS?Enq|uSG1F3I<M*{}zBK|phYYNLRI@(dIg2%?p}y@oOP31nv|fp&i5jQ`d}ORm
zG0JX^f>p>6q)~*etv}oH3fS5$5*xdhY#mmB+0Qf#r?z9<cQ<Rh8lt!AF3bA<!e~M@
z%irHdcluA}nj4_@S2reYh+wAR!IO{OL*eOK6fR9cS@hK4ke*my=!L`kPOLKz<CJy*
z$H&^}U*C^`Y7iS<y~3c+T9~p0O}W9WsQSP>TQ@Y;Jw$Ex0VZrHL;hi3k%eE)1c#5v
z|J4J<{sAaVsuLM#ku|U?W=F?*+?PMX;>117jT$gG<RUVAtI)a8ivFNeEYuMG;_<Gm
zY>HyVtq>O8i5Ab6Lr(EI<B!y%dhV?FKEhPSP0nDvo;7lRi|;NRx`|tU9XpS`!flf#
z_ThhGS38IGa~`td@dWhk)X^!kWTDd#wBBsTa8N93G<UIR(I~;;zM{0n0HsAH%(g9J
zjwi@|x`tB39F%m|5?eh3zvriM3u?#4dlYLz8nL^!7OP$ZF?v~op6fyu2dARm?vHv?
zCHh^OS=l>?C5fZJ+XE$$6_rn_7i=kyNzN}AJ^lgXN|rN@TLjGZW!r(@ti5c_YMrf^
z1fFKIU>1h;D_Ebn6caZo+V|sGI$<hvo(#s|?gLh5zk@GE%;|fL3C*%7f7TW3u!JdJ
zjw2iU423H&K~;~CQC4iNS|@sEINOJQVAbh9*iSXX<jh-E?AC=3(^x#<470w7yq}K_
rD~^YtRkEL%LyVcyr4!}R4Jcl2L-nZzlSU6fVcG)5Pt-y2kAeRKX!@mV

diff --git a/src/feat/test_data/test.wav.fbank_htk.3 b/src/feat/test_data/test.wav.fbank_htk.3
deleted file mode 100644
index fb3ab2258ebe61150ea5eda6b659d56e9ff4720d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 13076
zcmWlfRajM9+lF=9-Cd-Dfr=u6fwzeY0t%u6A|Q&QpoCHi(%rF$MR!U{8=$CQclWlt
zzis^EKj45^^BQyT##5a-bqep)*+0FLX{R1Y>5p{y0g~TrBJoU55)Ceq>^`5=<MCuF
zJSDSZ6=_M^h}%DysMrcpY)2ric}~KD#Y8Iqj}%ick}f)vnvy`K(`FLy{v=d$0ta*F
z6WFB#CzVi~x9sQ8-}eL^zk%O_XE^-5iId`a4qPw6rm+KaHz#&3zlwTmK8w86nDgL0
z)4y#*>F!b#8b>k7NC&xsROGK}qxfnH!<Tj^*kvQ`ibwHYe~+NeHiX3N!u#+A9DToH
zU(+9-w_kDkI0(zoI&9>6u=Y(XE1v`~wR{-LU9_1zp%2R4o-o;|fJw^Q$lv_Sq$MVd
zkPjqv>TlvtX%nMzoY-zv1b_U3cl|3Ia^~Pynn|paA^vB5aC&lx-F_djcvgVPEGK4Y
z<}vM_Hq$PwL;3G2rUqR^d5ss!>bFsPkiy7o2gpynOm5p_3Z#aVHD0DzGlcB->quL!
zLDnh{3RWLOn*EUQd+P{ESc(6H@A&QxV(ErtRC8=te192p{Oy<;eiX&_NhsO)GBxB1
z@(cYb_-7`iu~Nz=#B$`c9Mu<RQ&@6^Z2Obs{`8>i$8YkEu0`6RMa)MBBA)CZB6%Fv
zJ7rlsX)Y#rgIRvsjG4tsOuTghximGDJ*G4M-~@_xODX>_k;=0=G~Jj>eg1gL9R8tj
z{{u=;q_(pjrEiv!+dT~F`#vPE8bJK;^Ej-qL%(}3b~)>^_Srr($Llb|pb3Q$k5F;`
zj<TT#C0Q@2tZC(l>tc>wN}%z=L=ImYNzs{hO6H!XZcYgmz8Mt9*O7HRj8yfhNdArI
zfJ}dk!wcDGp^tI<GnPg#Xa0>D%<5MSy+fGUOPs0h2URMcs9&1JiIKNy__&$Fb_Epe
zY@pbF4M*OnQ8Bum;+4^4S8ODER|Ki4+wd$j!79#@eI2JTy|4oHm6fPIyo~JKBD5p@
zQ7)>ZOf`+Fb=zs|HH;ID`80HS%;6hW6pemI@eT1lm0hTqJ&t0Z_Y{?lrO4wvS<15b
zwQXXj`D_kuG-Hc@D+}KYL+vX}J^hU}a{Eya8AWN_dMdvZ)A)TVEjhDkFtnt+?G8ms
zT`01+#F52kC{HP((034J#<wVwQ6uk0GeLb6@p||fzpi^Q_q@x3(Opn_vH+#eWf)4&
zv*4LIB~@Rk?EaNTS~*$pndaUjsTw(p;=a1%ze%8GX<sUw5-FIROxY%9%4W2Zo0&mG
zl^Rh4ZxCjC822InVd1NVUN0?Xoy%v_cny|M%%xOoEER2UY4Fyk?c7>U<~^q2)(FZ~
z&Qa`PLfvaqnx^-mtaTbCg=Z=LmPYPCdnDzuB=+e=^44r3?~h`ysKrgO^I3X)6{cF6
zn2zg0$+a*l>PJ#PC4r94!?<9mM|0UPs%BrOw0S0tlINU^IzYAGREpoOrexkYN?qzm
zs#hU#%|B!qEkhEqk9{{5v-y8t&^@2Q?tu>2rcR>7){TnsuGD?{hcm19a^6Lr=07r2
zkwD2EKdK*hqvdH=>Mli5wC6g-owteFv?Sh3pXlZZq-B*6xA_ZBg<r67&t%hXSv<cT
z!L2rn;wSc$XRoJzdNpkigE_Y=kmg4xsH`)jXhJ8-E7~~zN{*&;yD8StCvVmp@-@#A
zWAT=x%N=BTjvzek6FYreu<m4o$##48CT?cC+iHpv4siIEsOzzrw3+m#BgLBL%#&0+
zt|PbOi1@uO%^LnR>F=f_&5ZniOUO1@PVD>vBzNgY#-t{qXXmnK^9!~rw_~6k!S13@
zZ0%D`iNa0}`>W6}CyA3!k~sTiEX{_B9PV?OoE7&ddX`T87&RI@52mc=3rcQfkmdBA
z`04E=MI0q#VmJ{xWjK9%i;1{@qlLfOf9N^ZwIeC1eMPy(CmM$-b5iQWnN9!ExV4RP
zn}rk@I8r>RFLfux*=01TaP3A(W&|0<1|$`|C-wYF(rTqd%eUei)`MNAn%Ffu3$I?k
za9&tU>DhBs$Z65kPmQ+VAkJK<rZIFF6+`<{I7&isXf^fA+c~;n4;2HnD2_Qqfzn<w
zQU;QIE|s*qV@Qa1Be1U$8%96Hw&*0jLF%|IwBhjJzEt&ZrTLsUXTSd7tk)tMeMHT7
z^r!IRe-y77LtWb?j!p2Oa>!3grzle}B%h3)=Si;GLF&bs#Ga|ZXNDP@qBgVV&?@}Z
z@8Fesi}IDXsE+mFsChc)v{h->Zl<9!RP^~^3Yklh;d|<WvN^6`M|tv8N>A*hu#XGU
zu^)(c{eaXoh!}e}4&C);-TO`KO?D($BN+d|q2fO5IC4&(W3M;Uu{DgdE^Db%`9bkk
zM{<6}kYD(js%Lo|-BZKid<zQyPA0!o3(2z{A&Ctk?b~P3LwdNm$74R_H@184<7=hC
zp^ts2=yjdC>XWqePN%)MDy@CrQa!0F#ZD<?H>r}F`<5!}lN|M0K<Nt`^5%{s@8cX&
z%8wI&t%IcYrNnG-#JSENv)(e;A1}h|Zx{AmOQ*uciw4E%oXl0>+|!>l`&3eK^$G>$
zF=VZ8B>QxK$~QfxalSl7x9*TWWFA?I)QB!VN?>{xQR9viws8m!nzF2Ry@K=XczmOq
za4|VgxtBK$+J|YI{)BVg0;vrhO>y-fvVsSb*7peM!5b)<(2q)gf3g&pkYKVINx^x%
z?*!nOZcD(0w|G^kV`}w?l_|PdG#TNZQ^Brr`zT#DirPe1j&;9F^Yli_HX4%N>jw!h
zj}mJ;j@a*BWXM~PbMY#H6)gmPb092aGOiI%aF%(1*92MYE<a_-&vPs&yuf<dx7c@H
zfaR%Yq7UAZHf25OwSP#J<PZ_j7q=ZL*yx6{x%~&*uJ6X@LoI&q9<a4m^mUd5iv|C&
zMt&A67T?CCx;F+J#xO6j5AzRiXWeQYbgX}(=C_H^$0-CpkHzu(6%K8W$656~M*gm>
z-qjQRvB%IIW{F<lem457V_kL<3k#E2tX9XYFV@U>Va&|8U71#A$&_v;D8JmoRI?dO
zU$6(cow@`rG{Qm}jO9HCtj39*k@kTN|FyArxfzS~YtXPMVx@5b8zv5A*|N>dY%*n<
ziY}9_GEvyn8Kv0qOz3Hk%;rKS)cGMdZx9MyGZ|bznZTbF7{@KgI75%k%MY-oCL6up
z4Xk*s#k|5S=H@R&=c+trKg*fFY!#~0ZlWrBf8y-}$ga*q_Q6!f&s@y7<#vqMpN6c}
zOB9+$F!;%7{Iw=w{HC6DBd4>Y?`TZt%)&tD49qG={q;Z0Q6GesvOe1VF0(+ZCo}$z
zWvZ3~ld|ujWG6u;G?;OBej{tJ4w>$kn9%$R`9BH_w`(Q99n3>(u&ixo@789lY#p#U
zTg7(WJS^UQz&hTTt??t-x>bfvZ?|FmwE&|aH%yE-vR>;QhBIw3td3*#0~K@%#<6=}
zZyaLou}-?3fXemkpR}BVf7Lm#vKOw6;L*zumzY+Z=Q!YUe+(`UX5xHfA&%DXad~_K
zNA(Nh^PjNkwhTu(1@;PTxBZA08%8PMy80Rcb~ZTKDiERcj`*hENdK2iLcJYQ=l4im
zZV`Wd2@#$b3F!2RAh}c`0{0Nu(vE+WCBD@H%ewBsuV5bsUth+5iZ33TL$ICq6`x_-
z32dE=<EBqYqHd9_t3twrM3OEzkuBpuronhpR^KDyzZk;Beu-<lM_8;CAxlaLK44D3
zL=_IbILE$$b@=*C#OLfQ-1OXVusDiu!&Urr!`OLXB-y{7kYoCj!z!<-{N0U;kx~k$
z7LzsHll1;la<h6-kZVH9q65ULnv%M9KWQe@h&Z>Oknbjh$UVir`#m<^yn=RzI!2kI
zH~-s$R-Gah@!L3JoW!xRSz^yyacXl1^<(!?uDzdP&zDp`2o>0;nSz{tr0Hu>l2^-N
z=}1!4FC%f4lAt*X?;f$3Z;uksoWjOFKI~kX&t{1sl|Kc>Ug62{A?>uUyvx~Fsx-gV
zrb4+N`Pmn!aL=J}mDppwJdoJ^B(v@Y*&ZW^{MUx?AXmbt-ovxrh^;oq*lgI$=6$-@
z^i;>J*9I#09;2rD0>|gQp(FMK9aD~S^y>&Jj{G3ssV|4K`->eH$>B1wYc35XwZA+P
z=dpO+e1f;c9sk79I4^C%VEYzWE-!lbK6=^4Xx;oo<)Z!^c|DC2XXkRxQJxOHb2N`w
zLizd!<mM?-Vy;8o+09fnJ|#n6)Khd365rzlya^^aZ~>8vrTBKd#&qrwmLztt<o#bP
zMs~-r;0;x-FR9ZlrS**s=MrjY-|&+<g}#*PekQAn7r9R_Q?AuSRp+1NT(?0wXRqM6
z#sq3SC!}#7kuO8=FP_NGlYiLwyECRo?6KKzfw`mjOn40S729aD7(z#<XPk^UK;`^-
z<b6Lv%8ehScK%EDDhZ`y$|$jvBiAp1{OUr1VdoK>tWRvoO9J$**_k*BGsi_(?{LA<
zM~$s!j#PzirSZ`p&i?m}j)w<n2oIt3TNN4qbSM7Z3gQGmNwiZUPe+Zz&;C-ht$^Y!
zTS&e=llYDY#4QTJuQHAuvjsk$@dcB0aqREo&Q9H2s#Yd*^z0?hX(n*KlMZF+lPHMn
zjKsZ#Xn~a@{Z<g!GoQqN?{nDIh|+isN>q!;sL~R9*qnHQp95)Q*I#os-W7A;RRC`D
zcd_k#Bb7V4aqOy;GdsR>X2ep889>(l7$h&&6I1q{$c3_mKhYrWvEV6(1wVSak)l!6
z<jkrdF0quzVMPQEGGxEyNvs7%+LM0(pB^3T`aO?|+j<=Dn@G!v+cY2Zrr_E)Qq#W>
zf1;F#n)8IX-X$paBjKYX$ZQxz@m|4Y?+1{3WDN1X=7j8jhnM12c4<z+=w~CApFiSp
zUI)v>wN%7T<9MY7El#sJqP&*e+iOTFT}yP`Y=Ms!;TKwt&ul~d^sA7xoTligm=m{(
z$iDLuiK3{%72WYQv0~5j-K-gZl`TgO;L^z%iz+cc;x#!gt)ivXm-5*yWEQ#;|MnSS
zmzwaiT8VdfcRY5?!|T^gVg{Zkmm1O<Vo2>_M6B6pqKa(rPuP$1fFGC+(h{@&87_{x
zSPjai;*23j|2(5bDV>50C!{v}iOQ5G;L``ZjNjm%Re-}}H(cM|CF-s@>56kn>61w6
zmUt4DDH0XnO`yFQ`!3%_&-f5#?jLb>tYy<ASt`VSZ0wiFG4n%Yt-Odtc{L$ffp`~b
zvES!1dllDVdq4)aP02+2^(E`gK$3oZCev^{nc1gEPM$zy*HSz@<I$a5&X&^k?ECQ;
zqmE}(<~dXQ+Jh$JD`aU+Lvpf?kSDtMK7EI)(MERvoym@t66`M>CwNRBl7a>jcT<5h
z%?o6FF(mQt6+$oR;ZbUXL1r1wvCHsTv;@n)Sya^Cp+?7thEA?zJnW5x{|FMgkFVKl
z_H`MEt^6UjEq%m}aCZWmACl5p)R2-VIm4HdFSH@))v?5AcEvAi4x9Ep#_hyG0uD^U
zcC6qBD|S=8IE*^!bJCZ#k&t|jfcK;D?Y$DG+%eeP^uellKend*2n;$)%B@tQmmVgo
z$$|V3ZBnQIBF3?VLt|3d6gCgfqx13mc@Zmbb1JfKQ>7D0ZJ%kR<(wj}`XfP8KjS_A
zB6jw=Sgwo0tX~{k#XRsUTuV~Q3&J}3kY=-yjJIQu=nNp-QVoaH6X<n4jOW!~`1O8<
z@z`sWe-k)4S(ZvOXHpJ!BC1*kzhbdlFAu|Egs9g)zu3~&z|Qf_9NOkjY?gwk_wz)&
z`$K5@Pk|LXa9L=H@%57|z5fwAp;sO3m4R-t7KI-NQrJ_2oX-}7PYWf`xemL1vFvp_
z&5jX<tS|e)`qkYr{Uz%Bs~i67PqAC<z1>M?uwL8;hcCA58sv)3!?CP%wr6MSMWF|d
zWktRYDVF6(_UaM+%8&z#CbGk7K9-|jV&<oc;g$a4?`NQ$;Dyenxfskng~8`0^hX{>
zTXPY*E<ae4yp6>-zBBKY5*p_FS#eX1)n^_uD_lzG{oMov8RINlhgr5BmU0#tb>0f%
zoC^o{Vqu!lU8eM9ZdxayiJV14_a2Kc1vCG>6<VI#ST_3^E84rFsiVsLzt>qB@`x4p
z-lFm|m4NYG*e>2<YxEJ!H40g}P6mC!@m3ekKzBzI%RFP?eLhP^FG0U068+?1Xq_6w
zT7mJ55>ha_BZKiG6V`r#wW7YZzo=lFz({N2r;Az~j9r@nHviPIx4$b}mj$7B=o0#$
zZ7|*Vku8f|&{w#Mf&T$4a*wg~&3<gMO@&7F3hNIq*xfsj-O4X<Ipu{%l{Wsz_Y<&l
zFfN-c@gEw=M(g$L)_Q?Uk7eSn+pt``irquy*?w{xPRGr#v5~TG#snNxuj4s-G(P5g
zITSq%uO8#^dVLfBuPOxnw;umnb;M<)ka+Aoe&=ckk}SZ^!2zFyk@)s=z-!6~{BFnK
zCw~(EE{h0CXynkqX8hLF2_6tdfVCl^xAzkrtW4PHYQm=05gGoDn3ipX-*qQJ^qWNB
zi{SGCL|mAOzxy-M=jRCTT1)uwI6~dE@TpZGbWUf2!lx5vR!EdVBT;i9baf4Zf1-#C
z-bRArN#X|FA#Uwrk`$as9B+ZtT7zW8gG3JTB7K<$F&c_Ux9bRe@QRF`+mLi~6SL?$
z$#<V1xigojh(RPh>p`;pR-u7kB+g_!2?IZoBziLOcmPTJw~>^%TyQ&~fiHYdX7`my
zs$|L5?Ld0|Ihj_w$WHA<X4fE6whC<|yN*<?OQa4^B38U_;zrR2vwM;9K#j!y-H02a
zOnmWuq_;<qDmYPQ(m>LhhYNghfZP$eq<HI*Y4(aT&vBIaou^3X9r^#Nko$QzdDqQJ
z?Khm{sn3Y5??hT#JK6aqNL-$g8u3Z!vNwsp5=w^q4zh$6lJ~etVB9(i#4gHvp-GCp
zFQvO)QXbKn>VrbdC@G_?n++wKUQ>3$fSeca$x+)tvi~m<ckL(hLO+r(|0VrRD$=R)
zq%1#8zH2l2qteK0F(p08kNjKR$#>t0<mwxW8@(wCJV@10H7dP@hEeQEQGRc7Y&Vhp
z<2gC1Gs$?eMR*TW$$H(Bw7umd8(t!1$p{iZ8<EqIO7?+GWIVV-ivBKgrzw$r^eeHY
z{m6~}K<&C#j%Dc!+}%u3*jY+sW|J0YMcO+(l5cG%>1QZ$TYi!ulS1mOA!KM&lPz|i
zM5RADP3mOmrIRtEn#9f)Wcth{%T<=J@_)!`T}b(u!&J-XQCawiQiBNc1Ok#wPa^)N
zyO`%|iMpjr)Qo!K7av0UHd5&LHl$y>LR8dBQU-iMB3X>2<33>x1BiPzjU)ptLS+7s
z7wbl)XAuQQwa85PNXFV*NS`Sq8McV1r4tA#8cOKA7y{?*Cv=<<iS`qbemE$6get+W
z8c2?SsMP{<C>$hc;}Igt1+Hlm+V$CL>cozzU-p`Uh_~bx)g#gFOPu5^ksF){TE7f$
zwRU_%?%=mx%u6SG;+sQ=9??nYY@<ksYa#B%Yl2n?KD9m%_s2d270=?psYh&X7Wz>`
zZ|dv&(;Qw#@hgEJr;jK0N(&KhstM*6o+C5x8l#JMNGf4p)Jb|BN$e6er1|Xv1Lcuk
zSSb7z6?~1w4653TuWCDf2D8}x`5U!vKdHSimF6{X$ysbfa)~)nwgOX^O~Px_I-JHA
z;`*x)5A9WiXoQjc%A5qZe@XH5A@N}d@oSC~I_*7&Ock(}g#)4g;n|_gj-(UR4lkj0
zrxuOZhm#dsLvm>=k*b67FD%7tttR&N53##lfa{4jgu2`(OQ#>w#7NRT`;Z=ag`|5$
zM5{OAYI}l>0*CpG&Lf~)3HR$h99esmBY#iRINe9+&4S0~9VBABD!%s(ag#2_QL+Vx
zeiB?%%8BT7hP<2<($rRwIkYob-}I4AdQQYq1@=VSWAbV!{u?|9R``vZ<49`GPNH_V
z9*s{s$hfzT<bT5nuL#F~*GWA8N^zK`gWbSr_BlHfb?^}d`hwHeW|EPSPUg`GNMC9Z
zY45`RQd2AvvkA$6N9e6i?3*{7s!dC&Q+~ivlUro_*dra+kLUq?2^#rE_)k69-ypDC
z|3CN^2^@H79;MC;1z&R`Da@T@mw_axsS%=8j?<+-SXgc5kl`<Uy4+y5$$J`3me6pc
zniJ#osYr3AFis-+eIlvyw+ZvzMzCEv5q~z593rq(0u(K^BCcJPkp7xPUfE8_myaCk
z^qj2|(^<Al4|}f&_BdZecX$$IiUpJhwo|?R2u0CiCPzIX?&Jm{2WsOpU?vC3j&msR
zHK9%r*dGqvkYl&YYixX`u+8f!7KfLz!7G`yh5w=#W{H8|3u`a-V@1zf#LOrV9C3=k
zJOeSmHxk{J0Ca^<qc{3K%O`9>!(WA&>t6{!>p#?MZ?hoFiG`jX%zn5K4X-3-Z7D<X
zTo2?P?L?tp81i<?$O-H-;=px6Xa2@>^IP`r64?A(G|OW>Sw5(k`2x$S$?QUHL_enY
zc*KnHPno^)8_U)yGh>$r({{EqJ^Q`z=E@i!XN}y&HsoCvBV(|bv7gQ|;AooI#jcnc
zs$wd9KRuV1teP0bvegPK8fV75X)l>$Ihr{OW0|7+kQur$XatU7%EKN^QmA3_4h@ut
z`=TJQwc=WFcG)k;sVqlkP!J=PKj7ct!$u`N3@wvbyINoPGbdQ&Y|a926*QNXq0!F?
z)%ZA6eq3a-Bn*|E@=VSCi*mLBN-H&(xLln1NgVQG7EFko%7n?X$o^W%F!T8Yv_@hy
z_$zB%wy`Q$oh5dL%-?2%hPovh(Kk?+d(NypZKjtPFg>OVQ}=r@bJ|6ee*Qu^JQ4Ym
z_mN%R7a5(-$Yg#(W|uB9Sso1SbB}-z`54bvM{jc#YkSnP`tmat{?K9m1o8a$OPSTz
z4Ap-!n9|XO$z$S~vi4sTk3K+Qy*cvRz94&U19EXN;ow_j`n^GE@JVDIcrrLsc)<?&
zte;bi-jJ>69I!{bEu3Y_>MUIRl(~vc%rfkXO8q-jj!T$o`T^w|KT*EgiAlXPk$-m=
z`Ts7W*cgdiYiHypK0)!z0AvDTi0TajkGW!8yb7J^H7x%*gVjc%u*Q@H8vU6wx;Jyr
zj6!w0J~Qg>pelO-rC?1apZ>_iVSSjYkc9lo5yETTi=1W-<NxR*t9_OUL%J|%MGA4R
zt`YTdme?QGq=&f>TdYFF%w7b0+2MN8OZ4A1!rG1z=6D{jCGoiBFT!nt&<N**vSZ;G
ztlAb~Wf;zu)7h+Y_{eIfORP?r%k1ERRHn|Ly#E+#x=p3wOFornl!O;;M&>_NBwke}
ztFt@Vqpy*k+=bZb$B8z(LBzBsLVG+XaO*|9<MZ*7&By(rCELHAz<PihTk89<)aV>d
zx(8@hwWTdLijMEkIkWQ)4NrPf@#i{aHG4Snaw$inbEx<{h0<dR6y^LPEi#PA=zQYV
zO(brl68<uiab6(DrUWqyZ`rV0?j!nFJ5&GrBh8muI92_WD>3C<taIh)5DRL02=AxX
zfQGt7wE6^6zgUAKAtNaBnoIhU*Z8QZ684{*&_Ax>C$Yus@o`qnGhxHLn{0egfyz)Q
z{<&Z5<FA~kwxxYa5Uu*dsU0k^#OU4RzfGd5k0Fgqh2Ql~gEG@p@>`Y=t@#>vKXqIm
zKEiXH3kQ8oam{YT(JdQ?33)gScE?8DjTA8xvi7=D`m>aZhf<2q4I$Ndr@*YENEvm8
z%uFjv`V0K{LV=7I0VInIK&;Ade77%R&uU9{<k+$6ZYFjIQ`sr6%r3E;ZIaiqaqdJa
z(q2)wU5R6XcW8I3;at``TAfR%UmHi|F*j;kG&%9fka~eTtBmJySY{{%!86Da-d$9V
ztH@>KU@9HPmM_Ves~pDO^%;(F6|8tTkD7UQG&$#S?DHco>bG(;c^U0SM>xJokJ_R!
zG<(Z(^2ll$M{3i!?L5^_%_*MTnL>TxGX^~%@NffOvrcf}vCt|)&*I@f8{cJ%*lQqs
z;2s4u?7G9z^sk)r?Zb@~O`QMOPD{cW>Oalm#2P2z=}x1jYdc3gdQ-N?kgThk<VLBI
zq_LQQ<V+lm4q$VstElyLY#l4K@<t2Rg|tv5(Wma#eVV@|(!TmO*UGK9w9JSTQGaMK
z@}(tEe9hfMSxz;@cN57PFH1sI9;t6^iJO*yZ@V^!x~;{|x&WJniCF3M!D@Lh8-k8g
zHB*n8{5>>UhSD~(k&BnLI6w0PC&s>^K_`ZmneH5Ozd>I4UkV4VM7rRs@U-R<cK-{3
zXFS+%w~Yg7#q7~>!?d#rrady*BXlRduG&=3{vmQ8a#R&s(s1?xEtiaFjp{+wsLvE7
z2v1twfl^mFBpPZY={FFyeFeT}ItXrl&%yAKI5kJGZ{0=?b`HkMMi<j#irAkz#ENOv
zl>dsSXi*foi7g`AqD{F`Ic1Im$q?E^>eVG=q$!dX>QCHCV=_z~h>P2WuYoc?&%y8E
z01m_(;(2f#he|%OPw<{Yo)hpgv&Lj(7jn;hru?B3S#J|4=(~;LtV#~=`9;ppI;7{`
zl6pgiyycllgCKF|8d55U39W1p{-19Wlx0s)@LqyCUn1b|IS#4m5g2QYU*>zv1lFqQ
zIfI&FH5y!=(s<s4>L^#r{R=4hM~&<&i%1bXCfj=w(mDg7?P(!R&L;eS`Ggn=T<+;l
zSX2%Hdv_7CycM5K+4zc??rG@6`a|thSXWV_@{@)I>u4My?q0Qp!qhmyNd*3tc}a#r
z4cXsxkbLWl<mgKh+vgB#FYZY2^2C1+h@6$cMtPfw`Y)Q$$(x8)`yY|s*Kv8)m*N+(
z6xY9}qAr7qQ6gt>MC3IFG?P|(o#fsxNIX4WaMTPEWR4JjWB?IMt&tSDkm4*uQs4uU
z*8U~B@G!Akh3<S^oAh6KB<`^#FwBC2?mwy2y-n?QJ%RCSshD{|Xs){`NNhtoLh!%X
zKS-LMlXNPWq<Ep<84Dck{DZ{%gORLMK(g+Y@CY6emz__NzdAYhd`NknM?_8wMNJlz
zMs`qstDI_8Wy-wQlC|MCS(fWbni4{s;ZWj7tCQ$hOq{e22|IL1Xzxzckkdp37L!={
zg82G-B-Eyn>@b_`=Z&NlX%Kurnvy>!s7jemq0VRWeh6--szaLnG13+)kf?i#g#B4W
z&&nlH<Q)=*Z4g{vV7xD80tX)<X4rHRlf=(g3`b%yjTDj1$@Quu^Y<gdX1h@I)`<$E
zJCwbfLhi+_q+Y&^bowPE5r)M7P$%5_Ix#wrg$K}?#6{x0&+Q_*UU=Kx42e6@3rW{F
zk~Djhto~N$0On+V3L|UeSHS~5lI4Gr<TfkP{V$4~lgLqZJx|K_o+K^!NYa*h#4jH}
z%!71dJ-ZR<7DrU0J~4X)#}DXEgvdrEdjAx;y^Bci>yw(dp8Pcn$@yDF=(RBv>2#rB
zj4TB`a><!yNyg_&lFo;bP&$_w{nJF$Y7q16DlvsaiEEih!uC31f<F^CdJ<7pCP>=C
zgwAslX}2b%bA?A1B2Vtfo5YEImA|2i!nAVomjqHGA4VF>NsPZt{K1JtmvkmPBbsnI
z;kBsO5m8o1^tKw}<S!5%f0Nh`_lT}}M`Gw(vUZ6+J})?Tv>oYc1w=g&Ih9Fyl)cm;
z_fBtt6`D!45S~EWZjx=jA<d{Iw%ve)K0-IT@IYwmPl<b}No;{1QK5nZuTmFtCYQJr
zQ5(W%O}QgxNViX9tSBO8L@oK(Z&0jkPma!bp?hSIU1}t>do$9_8$~AJ5z(Dj5xMuW
z@HEy4ZT){lWVsSErY~U^785@E0Fp1ik?8M3nmL2iYZu9njUZd(5yNMFq9o3o@|ZUi
zS|wB3a)F}$Cn#1B*%+0V#IFh=vVA!5GhLCaSx;n`oZ#aNNN^PTYU(p0ddwwm!34o&
zdkR1GFUeuG<d}(j*|v%>n_SY1rjs%1DP<2|P@{O7!@pf99(s-fg;p{YH;^JZLwb)z
z#0eiWW`#RR|MVoax0}$#J`$UtOG5Ex;sozcHy63j(bGtOvW-;FO2Wg=2rp<dhv)cE
zV-rjD<zrMm+{a<H@8a1#$TBY$xk#at&Dlayw;sgA30>@O2g%Zt;+cbp*St=`L7}-_
z3`Y8;x2S=xWCgX67B5HG5_z!;9#Veh9Y<X|QyY*gvR3gF&+bXS@YfQ%iEPDnKT;h9
zhj%(i^q#%MxOF1ANYwO;B%=P6A@;qL$WzWl+b9t2-ieg|og^`L7Cvhd1s9$|PHQ|l
z_x&hZA4}elIMOv<A=NxatjLE%wTl_o&6uz&r9?@;6S4j$k+)ol`0$i~fEq$u4-+`G
znGpBU1X+F;dSo-f5d*M)Eg>zpnXLEY$bD{4_IDL=Cea6@8;A>@M0lZ?pBr3A>i3Xn
z6CFYm|6j-4Lh#`(1YYcdlgP~+9PWej$65}Z*uws`&vBnIQFsY6F;e_S+PLmyZO<cF
zWD!zg0>w-^LBd@(LYidoA0y7Qq>_ZauL<yZfzQ@w#H^|$z@rWKf5xy=(uVD`^=ug{
z$IgJkm`+q@Q@%cCCnuouj}is;B7=CbQe^o~Q~%3>BZrPs(!GpKk*A0^o<w4%Gnu`N
zh)GQ&*v6XpK640<Fu~tt9D7In!b340n@J<tU9uNTZFkIG++#;f5yl!Z6#Mq1?8a{D
z7N-kLvYMmT3#sh=i7er(B>rAc#+^qL-S<M;=cn*8g{BpzO5$@ng5EsE^_qgvn>OJX
zqKEC!i)_eHXJf#2oJt2^B4%!(mDufWhSVgMaU#11&E;Yb>8&8YwJVt#E6J&RN9Dm{
za(5OWIU#aY+lL}ix`y}ne{nQP#B-Go+YO#$x?2~$`Ip$}{!(~N7OePYO1?$}h4VG2
zI=P)=iNk18`XxBZzvNiHA=z^SndTzP6fjNrM71OssF4;k7s-%b_<LI8@n$xz^JLlj
zN9Y@ev@p1M2E&0_Y#9{Cq7CXI<FsDrEvi&jKB4)mGYyjeQSv^643l+YcRfRT!-~@X
zZj<-m0kNNi{-qW#at^13xBZC&9U<%s-68Jz23yu=35_9;4Og$TUiS*Kt~-*eW<`O}
zda86radh1&>Z3(YaJJBmR(q2$SzsP9kII%+2n{!i=;}3Mj&>s9ng{N-OR)35BeJG1
z*cdbjtB6aiYgc5Q>K_dLQH0Z$<eJ3^JaUez`&Ve%y-H+oPm=%3iquw-vDGjmY0M0Y
zR}0_r(l3IB8WNe+LiktVEp)n#+uu!i8xO&`H3yq{#h7ok$8=9Fn{VY}m_LZ@r`O2$
z*+Ioip(95;qEai4+$uGZt<WLTL}aUX&J<Y;ku7To!E^osf(QBG7e5n6+X!~}YU385
z!S=`hW81ew==Be0sj?@_TFO|wZv)v;BHwuJ0c974Q?q(FrPbPG=ok<i_%9)rWkj@3
zAU#@}L{;H+^xTWj99djvoW<gD1V*R6VQ;jMO;<&BMr}Sz6!xHMa}&*h9?XBFB>c$r
zWY<rn!0i#`qTlk=G)WpZpAe}D-mS;*ZU`gky$ez5>R7LT!~X8+IIen!Nk<y1Mz&+V
zPn(rvs#v|IhB-SwGD+44mCsT%<h}}=bp=UsjM#HxrW%W#HhwvgTMLEmX^Y3kR(2$I
z$ICVyTb&-5O)tQ!+itwvr=xGXoq2Pz(9-J0>Vk6d{27Qy2h)UZKI3CD(=a9R@O0ul
z&4>({LU`GALd0|ZUu)vg|ET!?RAAcr39BjIm<N5rNJ;or5B*tFJDpi|p{R)cI{8vN
z%RE$tP<C195dD}Eeh`H_7a4HyI?;z}@z&poZH6!Q#yxQ@{fnFWI1E3}!cbKa^D7sy
z?c6H9U&LltQ?@Kz$I|}l%(q(1(jKj7v<nUEc`DOS{b9V{cP4I8M>#i}{sTNjj&Kdu
z$6sP!md5so9avv&Vu$h|bUv>X`OCc+53|Fd*E&|bKaD}ePxP8Tp)FUyqQyF>92w2j
zZoioI=O@$c+)xOxLB9SelP=gZ>h&7@OD(XTIhLLC#mtPmhFyLX+uSr+Qrj1E3lG+9
zX=3%z1~l);v#RtJ3*D+%5U$Sb-^M6q^hTlcV-!0zqx9MlxjzXgWb{N{<ex^YttZ6r
z13Uit7Z<xE?CQE=X0eTpEk3OJ@}1?EFQVV|yU48UK(n_B+5szB6CgC9zWteX#UI&@
z9mroh##HTfOi@x|LV`T<U&D~SR>YtQnfRn^!SVHS+)VFb?@++j*8W%vEn4@c0=m^g
zzdCz^C0{m)@Yr<p`pw1YLZr~(UbEoe1x)I$isIWx%$|E5jg<GuhQ%<k-#<)zxPf6a
z-w~y>S!n3lICh_g@uR<(7zDAS?FRa5ZeWy?#4?>U7AU(gckfwNDb%AsupaKbVOe1U
zN*7)r-=&htodz*&(HF+X-ef}BaV8k+GPtic-j9mdS>D9j6>{ips=%uJU#zm~SbKK@
zW{R&^8EVUlFkj|P9g5E5K^U2|L+nT9Pm4lHWVw_jUQ97aXU4v#$VO@*zhgK`@==U5
z+)9AWUv^v&_pST|E6eF@mRhmZO_9~7y0g5sn#Ilz%v)8Cnu>&FN4uk4I+oeDMlvJp
hGD=-+Q9M7DDYw5dRe1jLTl=BV%^F2xcZSXO`9Id?j==x`

diff --git a/src/feat/test_data/test.wav.fbank_htk.4 b/src/feat/test_data/test.wav.fbank_htk.4
deleted file mode 100644
index ec9fae638c012fd4dd2f6f20fdf631d6a34865e2..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 13076
zcmWlgWmuG37lw^vcemJr7}%&7H;5piqGAGqh!TPb7>KBJcg-*`bPe5YAtGYuu{)35
z?YF-l{9rDx*FF2)d#z{P&qJ$LtwLM1_DybO(y9Zp=bMqWcp+Q&iFmU~Nam;^DRv@0
zxst?lA4%z@L8@~!$^NBCbB+*vV<Qq%0}_v}LUwR45p}1C(aIswECWg6PEs_Ek$k)-
zakdHq_V2*y@pK%|U&Sf>B3}9Z2=MxWx8HGg-0h4@i8lKx&*GFAhVjjRu<PK<+L<?*
zZDa|Cn^8Sw&y)^<j15yTrdG~4?L<Z$Jc!DR8iKMGvRB80gIn@BB#S4gxA^}u{PD=H
z#>q^F9j!m%vG*7U2b!|!;!Uhm_1JLX3@d!&nV&kDab{{v(5OQt!iDiIKbSE8A!FYw
zn6N>UgcakFHoYUFG>6FYE<}3#gI~gBJW9LZu=op}+u9P@@(&>^f8qLYK6Z%<uspwl
zEw=;FY-P?QZB-^c+K%dfXHZ!*m`Oug7%!d0<Pdc-6MvC8+?-7H8z^MIDD=oi(N%}E
zrbEc(T}hUEk~6d`DK|5TTym74t%3NtKESW{BX(PxGF@?v+3#D>?D>$XoitGy(T%Z7
zH=?$5B^fal<h{L3?(J%dw0lx@s0*b!6Udo#o3tOHq;E5&p#C_yi)19vv?I240O8%2
z6WO*WzOqZ`hM#BI!QZSqlgI*(Rj7Vn#+czFP>o9`EAk>mN^dBf{Fd^hE*wtjN~KGG
z3P$LY^Xv(VSI$)I>P5-pA7uT?ASpo&S?EsUZ!RS;=@N!Y-`F&CIyQ=qaNn6Z4d)r3
zWy=)FBXa5oQZn=@MXzU2U0cgB3r(umyr*#Mc5+Qx$Q`trN+mnWuLY4mQ<ton$4DBQ
zKw?ThA=f?GSml9r%mSQzKeFWaAGDo4nECr2i#q3^IO##jmBkcS%c;BI$nmjFR3=WR
zaB+}$?kI9zi0A&BM)|hC<R*3|ds;QA751ckG$(THbgWLl#=ct}&if~zx5$9mbt6%=
z?~0~!8o7H8Q*vfAC707UEcfU5>IN!QTqu0*fuiCUiazTozd4ceW&2UgFeZOhcXA#F
zl5@6>7)1x{P0Db&qlW9dZY)n<&4TGxO!{n!Zgvy7S{ErkT}o+`Jx8KH(J;<|im5j#
zdL~10)s5`uDU=Tp>+Wt!mfm#=0uw1PI8XkKrNryL$HOWT&nGwV2nk_XT@5oX{=?`)
zMJ(%Bh~nOKiUNmGI&2R|hFzqw;RaQuZ^T*oNbV(HvUbd%)XJ1{2W7HbrIMdkO@Z_V
z`5UexJ$RmQpP7Uewjto<2JBBfWs}+f7U-X4b*C%jjci5XJu6C$WYoWS%&FJ6IbLv^
z8o4D!KXl36<xcsyP!5}=QyMaae3KaResx8W*%|rWO5(20LRue4eCsg$8i!-oaUIKg
zc(UHIi9FYb6vTF<Y<v^-UU2r4EoW`6QTOf=rI%AE*nWkIHgjlP){nZgQRKf!MR7BM
zBB>e~xf_Y=5lKQ(AChm}A@s#m9NZ7EO8yAz#{<ZppibTlTgtBPq`pfoE#u5N^V^Pk
z!%dWaUP(b4dx{)wIaZ;=kv)ScxPOwY*E=XwSdez!m2j^iNWB*$Pg_m!p4sd)oQ3(e
z-RwK?ki5ggQ7lrUd`BI14icKrbfD!^Gxa+yDb3O#_aA@q-4i(6Rh47eB@}naC+pT<
zvTs$9vh*!s=T?$z8Y}iE8PA*fSkL+wi%@SIraUCS!!NRa#87_tU+Oz-q*<vQEvnVj
zt30OUXGbzC)|35r4b^i8bF|-8O12C}Q4&f<{b%Hvkwov=hCIs~`I(vcoY{n9-<zyE
z@erF_NAhiclHLA4%3V5B_gjf3hpjZn?xXHg8Aao)Ne{e7_HlE{f^(^V-kK824;19g
zA=^ogyzwY8<9Z?w?}dD^H69n;ah%_e6&g-B*;|mGUrJ8!9Lhf_QGYvwlgCHV^v0T+
z%lQ<qIY`dLrR0PKQ+j<EhaViL;$a|#UA~jO&yB>fgNXnCo&=vl_8^|nfzh~s9f-}+
zm$<*#Mu9~Yxi&6T>>0!n)#sdwFy_=ub!tuzp`^nPa-M!c(eE2&VsDN}E>JbEg#0#!
zC{n+W`E(`;kHSc7S18s#n8?dVITUYz(Z_N;W6da36=!gzHkGgXbM$)#Ex(#MWi9si
z?^8;uW}t{0fnwGvN{8ffRO=Geql3vmr$gS-bTX<Jk&tIY!jZisF4#ioln1yLyRa(l
zCEnf<6yCR{c)>iXyuCTLSA%oGGdN}R57n0QDc)&~VxicxDLIr(tm0TVJ*t<wlfQl$
z`KMAz*BnP&%VZ=sUXq}lL9qN4yTT^2GG#Y@59%n|dYsbop42!RaXjc2XT4)-GHyfV
z^(o}dbtfZ#FPX-*6#RRZ`VZHr)=(nniv|jxQd0AV68CNg@qR8OkBuQ>#w~W-7|1&P
zd^{!dDbiX=`9@>vrUr6SHG<{<Gmf|UMVVPg^2Gg4x6md%X%_iLRU95@Kvj+kS&K)K
z)wqN7+4qr@=@Wap2l46O2-;YQT}e3`8vo%?&|``UJgL~~NB!>!G%xH(OQVe1Dg7w^
z_c=KSdXf6IoYZ%EqW_2*Q@Nh<(=$l!l}W0@apbmd2sUxWe{utnxBK9e?1PQiAFcI`
zc>VUK@W>I$Q+zlwAd2Q+TR9y#lHx}J<a(!)recZww)j!JB-5`qrA7D2&H4w)f$K=Z
zvx&_4!|sts*#Ep0{xLtXe|&_sOLw5H?7-fz(I|fOqhweKb;VCPTy~7QAtvJ7eL&t|
zN!+eFqUZc2O4A9M?OifOAB@*=!h3@jAuogQ3(CTIl?QtY`f||rE2f&sEIfP?O_Rmg
zo$NrevVtVPN8%hAk?isX*)a(8{mZW31K8HmACs0}Y(9R2eWzQ*caO!Xrxc4{o7l=j
zR!?h(LE2^X90p^0>^It)$C*9&5?WuLu~y?Q!Qx&A$cpgjwjP(MYIyWGii49js}e(5
zdaICSp2sjSRl}gqYt{&jw069dC2}>ebY`CGRHiP-M=jVHwJSouy)Z!K+#@C^w`IEZ
zXac)uVYX#B7CnYyaeg#rOAT0m;Tvm5|6<{}7__=xM7?b{=05Mtx&uw<ADO^h-4RUs
zkc8^`qbPS<#2CpO#_IJ&Y3>GwpPk3Zeq&HxAT-8%WsFLuvErx;8(-_N!QP#vPSzMc
z`HYTF4D+ftG4rfG8k?_R)<X#cvk^?sT#MQ(M<#vuV|40Yl#1OMsiMNrSsDy;Rc82w
zRVZ&Q#D7dr*7TXj@;jSFZEV2$>p50`w`RqPbu5bC!K{tvnUyvjb$K{Tw8dU_y2Es<
zf0?9jj>?B&D9_!_7`q)PnVe-<i3=lRZZq6h%J_u$1eisz-fKSQC#GPPQ-MRD-fZ45
znQg;W*}B#a^OAvBUi4-2s}ijK>x9X4Z`P+PScMnswM|)9*NfFDtypcIhSAwHbah*?
z#{XY-=fo2rIf?Cd8E${R;5xS>?!Dizx5sk!4eHD8n~CfyyveT1m&9ihb_rd%YyT`<
zmTbZ0^><uG6=Hwm8rH#XxZHBaAt{|r4a+dUJON+j90G3q620RGlD>W<76=Wgq>fyB
zH3<_&61(#=;i*!5B|Gui`irm)^9a;&#rN`3d=0PQ*`W)5bFw+KZy%mvm+|S+6X#|<
zT%;TEe<$XN+&jdJdmC4zOWe9%LUXhu{bUKrr;n2`CyQ`Zp}kA>h*pXuVvjxnIRgnA
zu$Taczj%pSbnwS{_NWiQccF|!+rHwe@{B#D{qXPFo0Rl6r2D@oJNy8}+I=bAD0*2?
zg3wc<mQ=hWN%tgaWtUKNyhF;1ETZnsLegy)DdXQEIh;b!GZTV6Z1K%Y!J_OG#yvNq
zYkmUL*6S(K?@aOhTh#39Kttg!PTqOLiL+0so?K1wXlrtpLdo1F>QcK?HgG!f!@8um
zv7%(=R<aZNAXN)NqW*)Z0SB-@{R@+Uz8H6Xi}fwhx1vT<I?Ige)V(zHc*Plu!<;Iz
zpzetUB~y~gs<fowjw7}0^eF3ON4)1&(IfrIls6!I*qJcj&4f-~OHiu~xEKs)^U!XX
zm`JgicZy<5(Yv~Rr0Pl!8n#7n_Cg<;&q+DlQ+%Fsmu#~f^4n)oW3iY@zx$-xN|B|C
z8Sq?@m<Ok`*Dem9$CY^PAB^4H#pt>8MkA*lBlA*<0vahfCZVS9NE(;xan3D==7Tq>
zE15y@xB{{UzYzL=Fcp<As4D12*1eS^u!qE<zX*3;fVb~iLKp5Oq*{XmqmHw(n}m6X
zEHU|2M{%hKrD4~pJ>7w(liN7^>l3GR?5LXYh61Hqq(xPc_E_k^i<XqFzfRtpv&c<F
z|IPC!GJ7b#N1qYa_adQLGw?8aBJ|p0*7|B=qoGD|zAt5Zi>Y^P%jxhcnm?9s{Cx-|
zd!7p2x0{4N^(4$cM9Q61a_lEiG-M(f6LiVReov}zBQf3j5HrD+h?!IIxN-#xt1+15
zKE>JGo#Gv9D4+j|qq>(kr}UntPq(R--Jw7^f|M?shz;sa^pnR(`aLJ{Yy*YnkI3Ki
zmAsIZWObZL%-R0LwfT>T$(9^4C}Ul}H)sXPakkw@ap!(iJo2Mq*Iq7kU&E>4ODIVF
zkL=2&Vs1Y}q>3rw>r4o1)F4vtJvsX`D85rE=B!2(fya?`w?f(wLge)<4&}XO%haQ+
zcelcI;xvjxZ?Cv@n-kmZIQ?S)4PnPn6xWb?P8ms~gs6$X33t0e=(2l6?6)B$TuRZg
zOmc_DqbSWHA=HH^lk<cedxy8tDC~Ei#;VtDJZkq+xS%%`vz0mV<R?d`m<z4;fo$dd
zB;Hv=?3qD?88#Bsphke(ega*m5m(#^MfF><PwSC0{y52&vx)3~65nvK)|YBn`UJDe
zegv)~`%rNDCKYDIG#=|k-Q;hSo+>B9_z==F@kE~eNnq>C`1aMutMvvvbE*lko=4We
zf5|@IpUid_NqYU0=<O2;9`rAV2F+m0dw-S=(qPx}5DF42sTlf(#&7x58ro7Y&z_VO
zM#LO`OUPv(d~BQXln!VA=Mn7x`3B#6Z%FC!kn}?~$PXkDAGw=|_}+xc#eQ4CcBdDt
zvpB_WhZka=aie@_7AKBbQd2XREcG|Y?r$LMQZ|17Hgc#zU(Ajhap{(iQ{^mt+BT73
zIuW@=vCyweNZ3|F)W%(ek6wpI!(<!+pQ7WhiqrmK6qxD>jatgFk*BCiJw$T)ZE>bN
z@Gnxrb9*#<Z%t-LVI{V~-ElN>#^=u=G1HGErR{JM%!NL>H<zU7b4WGs;J5uPJHEYU
ziO@bSQNt++FQIJpTrq>pq9Xbw$#>=w>t=)hmYH}<7qf3z29Cp)vTZ~bwxM<$cy^c=
zv*}1{FA--@NUD;!r{4z=TOz|J;~6ftWh_bAgIE7(aj)wsnZ1FUaxr)4hmzE%GjTfl
z_;ggk+u%8SIz(a@qsG?8g;>T!;yUdUv0p?zY;Q@t#cHze>?Yl39x_8Og4|Ex**cab
zv5WDo>O_8QH;P~CQf;_^k~iKYhVLTQ<077`j^OpKg5A2ivCf=}WlKDk!w0d?I}pia
z!Nv4G68o_$*$Zxx{`dhhk7oqNMBo<W#aerPe3Raif8q`$yKYhGQ%-SwD-sU$C;FU3
zoR4pKwu!`1IS$KoZ#GUh!*pvk&O6@|J0P3bt&T*k7)$D(B_uxVKy04^yc=7v*Xzi-
z^9S(llun*uS4unIrsT#X^5^6ueWXfgsU}|fxwxH`;@oKwo8u;87I6Zrmig>xN)t2u
z5W>8^63}}k;f6=?-)SSb)@p19-el!92?lG_aZj5|W~bleE?1Bx1-YgJAqJE1{xbx-
zO$v5=xP_hbBa920S-toz#x=dzRy`QsE;BgLwHAjl6R^qj!`ku|E)&MHdAKs#{T^aG
z%a`5T(~x}kB3^k3u|x6+h%9E0UI<%1pTm6P5H`MzXN{^Yx}|f_UB87TuNJbRA{wK<
zo~(FOgw8WJ46nqae`q!aFLhYZRTK699x;FDEtWS;C%E4u{73Y{Ge&|*n-!S8^T2%c
z3Rdp94ZWS1uNBU`oJ1BDUS-zDVCIjGWA^B+ERuUM|M40$zB!}WemYBbEI_Lym^oHz
zEXX*C{+ogLe=5hkF$3$7<(M8{&IX-hEIPlMWq+d4`(lLNo0;ftY0rX*&1klLi-DdC
zMiXwLJFgo$M>?=NsST^+BU$Tg#Ol^@7-i34&ABr;2)!SmIv%_0H?X!+#rp3T99~Xj
z(~1@hx36dU{jscFQ^V$-S}cE>%L=8ntUoNrGHVsvN`u(!>V|1uYi!;x#O{nEj#;L-
z_S(R~FZKjFTH^QZ4J)iKVQJ$l?(1+|s@h|+CJys6`D}McWOIQbF3o${rn`@w{rcgu
zejU4Bq~Y0=g2$<!973H#t9szMM%-m_j{OIBAaJV^vL34lntKL2_n$bmlH)rtf&*=O
zbLiC;e8fHXs-J;R`DOgicfh-KcifjOBA`4M@AnS~*elMWjtk)qQbN+^6C!w5L`4jd
zn@19{b_B9P#-c~BCLp;{u--=m`@bdl>kWcG{6o;C{v7)7nBZPI_$k5((H~C8jshaD
zMH4d2oWT9A1P<s&wEk&gv&DToQA|w!XX3qf5x>C%`F~plR}r7(sT2KEpTs}TB#!VP
zS+K^~3Q<QFv?U=v2g%lrL`C)?%FPh@!TChiZ6|WF8!;({NW2}1UvdwrS#RR?V~FqA
zS?DfbGL+<`opL6rV;5w9f`tzGL#l=qDFzpjZJLNwX9fw+rjlg(nwWOYMBnX;Y}pJ%
zMIfP_DY3(+5YsUU>EK7mhdYq$DfI7P1!?D8QLJnt<<w@ft1~FPa~XxfI219#WPQCy
zhTscXjUPyS@fF#Pr=niWCc2kCncK~Xzk5}@t24;2v?Wfj2I<6BqzaavG4v4`H^-Co
zdkc9QYstU6oqXLk6mPAe(r6!5g1acn{mDyOMWLTNng3iPBXKN=x0*#=k0GVMU?rCg
zNzlDTimNt~1&c{p-6$ANAF`{ANjGaD^|X}SlfmTaY#=jl2SwH`l-MUys(YH;__^eE
zkdQI^25H+Sk+F0isUuF2GXEuMb6S(~Z4%OgyU62u6LUWs`4JtmdUYW6Oc6=BACapG
z=Jd;v<oAau+-pVsz}Xz>+=U9YV`ME8O#Etll6o8#KS?A!lpy{6fjC7Z@qaFps(*sG
z{qbZR^&!#XEQwuOlNmjnr1G`MH)$c;5l`mO-N*wR$SVz|sB8}vlipG|Zzly^#9C{+
z5I6ccu}|+2Q=mt9q=(QUor%z1AZmON(uyghb`W*$OfRCfM1P)ID|DS2vCG~P`t%2~
zk#ds%yrJly3lxR6A#LXWf4@j_?-&w$FGMn{g@}ooVs^eoh~gGLJH`>*x>`JIJ#q36
zWT=lJ`Rh(%h87b)wUiJ)CxQ}k2r{`qcwqv?x93w=KAM6hXUMAfmvo^WBd0VF(_f2l
zvmgS-uf(Hr4c<M+<DK^dAO9jE9L^EbsTVO#?TD}PAx5yC$RGCjT<wHM?k7AqJR!jP
z1f}_Hsa-yrT1hrFZ-$U&KZ)2k?TPXpLD+-A1UyQ`<D)VA=U531`~~lT3B)-aL9$=`
zJ8xf-idT_zFGTRcV+7|3ZRM(ilZqPtgBDP_XbM&NH>iCXO|@DaNm~MlkJwIVze4;r
zyyZ~fPh7H+*zJ~yTfb3)b-yL%p763h#1q?8O`_TvV()Jw+F(0=i#p;KS%Y=^Zg|RM
zloai!%J&mBmyS{C_5it&HnI+)rgZ9q?_@O&T@7KUR2jRRGh+6Q$E(<0%urWJ+geJz
z!jY7R+9Y26iM0KEf+pJH^t&zQS38JaI)jo4Z>Ti!pyqlA6^8$i(BOh>r<{QIUGUDD
zid&C?IDNawPXATxJTV=whKpjh>`CUD8zflkl6LDHDPKa6>U!h*ygPPdwAj*X8NuzH
zDGt~~<+vf#riW7TqBn_6I>-V>65y@C=abNbg{N^!Sc!d*4-P>O@iNySt;C$HZZApF
zauxNboRol8#BUe9z|a)O3+=GqTuAtocnVVfQoh29!>!g)eITEt)_x?so<(STRs3Vc
z+HcLk^>!65*XMC)<$r{@3g2qfbQHUUF44{>-bmC{)A>Zt{eV}a7u#37X6vuT_&nH1
z<(+zJ$G7H4zc>yXd64fHMb7!@q_?s~KC(SwYohT#Tto1x8N}T+756X}#pWu~6@v(9
zNh5UMD8dpw2xvQ&UDx}HUfBafg(rKCJ|OSo8}cvRrS$w3N*dmfVKETtj&DSVv?siS
zD&8G(aPJ<1YpMkS*Zc|m=7x9NE_O{6T4z@ew%HrA#i|=-wmujOhPNX71<Ti*W9@(A
z2#c*CQgsX=`(xPiUlMD(bjEOH5(d4WVR+P(C0B1S&&7kOlNT~6shFAl44HerD;h1c
zn5W)K{9cIqPB$i(FK6Vjj*OnRoYDHnP(Eis$dsiVTm<`ay0NML0QwQRXkPGUaYPFm
zPOePX?8uCk6{u;RWRlWork)?mf}lng_SlBX6<wx$H$`>m2Zm=YLCHIv(e2e4)$bW&
zf+rFbc9@mIb2Ym?1miin=$A&Jvrz}Fy(?Kb;vI87_%b`8kr_+6GDTw)6B3p%dx{^^
zL~T*I@rrSp38+ep8A}CYF6%I6aS9`!s54HcjQ{a=7&Yu+b=5&uAHIivpe>qvx3l1E
zUo?b=rash~x#Mm!ZPq=eB)Ksme<iA=ADLw9%7g*N!Vmqx_}$AGy(x;(E*XrPXvpwD
zO~&a|3y!lD<I46JY%*t=M-bY_{aK`PmboWfn0;3d^$xd~)%1cHi^eh8-w`$a?x^<h
z6CR?1iH1W^xwwUKOZzkGb`(l^VT|xth*Dv!_&$FKxF?>iqsuZ|O_uM8V)@s;=*B)r
z!%LsJ+Mdj*yT|lR2bj9A2Gz7cRNI_J<;!2jmt0`9UmM0wuxG>$A4bg){H{c>!GV>G
zyw?s@O~I9{c4Ks~ndQCnSnjcbB|d}D?Hh<zpOegA62z>Q7G~`1$RxpACmB{Ud3qWu
z7DE|-dJQTCy%^*8g;Akb7$auBF(<DvQs|;lZXZ$cI7(pQEmmK3VfhkIme!`B{jxi{
zf=Mkjy@rPMG^XEoMg6uX)7<wlWz8(6c_=fXvXKdfI~aFuCCcJ#sc61qgy0?HdbBX?
zl{O=cY#4jUo*1PAM2>n*#Cvz-<E|sGP!O)$kx+#XfqpyLn;a+R%x{D!R}$K*AHF9A
zGpz5!zFQC3vr$)QNJlK(La;h2#r#MpCZ2vQjT*p;d-W9g)lht^gd!rTly0PU+h+<*
zx{!0^1}X2l2<_2GGKWcDpG|5vGo=0R5^*Y+aH9r7+iWLL)J>n!f<p(G;yy>gUcncw
zE-Pa#c}eYO52~jm3ZM8X&B1>;vuY40Z#|;6y)h-Jb11l6PU+b=j<o7T)owis+6GXt
z>jmjQvXGV+6I`~RxU<~|pP$MBU4I<k9Ank#5FDghRDTw;<?8Po4cy7;QD$6e>%qCc
z+SHF&O?g8UMJh5Xn%mQG<`wlX4^lPv1ZDH*koB&Z$UXb<Z5Fla+8BaOKXKqt18Xe0
zvv5o~7E?x$(ei?Fr@qv5dBI6d;gfuU<4e9#UNwmPulvcKFFf3=_tZW(OnH$GMHhvC
zVk{-;@HV{0n6fwfIc}+UIoP2H_x9J=Gq4Lz(nahM_bGn%2$HG-g%_+rk&f`Yn?g`5
z(<dQf4e__$BOB_8e2a`++u!6|enjg0Op+cY5YxDuumq7~m>R=&B?UH)J+QyhpY5r~
zvF-Q>$BF=oR;W`xLilIB)^dExGtRtSM9YV28U%~0eleD^McpWKYtOOOeK{PunzCLK
zC|%ctqD@b7Ob?K_NRQwpjX2w1$IRsln?AH=Q)M1|w@4^!*+6ysBI>jM(r`zWOLLcS
z{lkBp+8M^tcW<c951{&w1&zw39LX)C=65qSnPQ&Sze@hl^Q5iOC9G=@KHZgYx9@>_
zk9p#|tiXGn@SuGzQCSj6t&=6k?OZwk`T*B#?{oImE*hMbsbxL&(p@yZPNK?kAmzdX
zDHEPWX2BdXwgr-ws7%DNo_NllinZ`OtQ{`0xwFUu#CM{!TLl$!q|{vS;%I<3XRBSg
zDm2vv>unt0a+%rz)*Pw0LSuOtW!0m}z0i%E`+txOX@_jjViG;A34A#Y?`>l5^-ZzQ
zScsL=HgUe^P;#n;(u+f=yf&43H{rPsd&&jpXPng<#IfJs1p64y(ec4FqzJY*do)=_
zdO}mT$M>d~gRUwQ8Z?`O#`ke6S<Sv_Vc4~I6F$ud>{G{zK50NvRxgSMj;Fj&EOpMK
zIU(*_gQC0exy3B;c{oMl9v1&zMPj#mNG5C{Zs&9S4L{;P{T+U%d*QP9JiA@Sv$yLR
z_AS%J^ulGfC(Nbr!y*b4+NAn7lWlK9k)|3&YjcIRSV&60*W&!$A-SDkrs~PWnBPI3
zbdJP%?gSg1#^c9heB{A+&Jy!#Y&eIi7UCWy`lD_o-a{P(Q}q?vz5}^`1%GxE%=pZC
za?dqW_*aX}J|!gdxrqFn8|h)SqzPslEB%D5>OIn|0Q@f8!N0Z8jVG)LGW-wUh#&Yq
zd%+<$V}dq~pk(BFDhv{-jB=soNpEUz_oU>)Yl_?_k@q2sj4{3>4!0)7F`m@g4@5ox
zgUreksl|5!4+aw$=|hP97J^=Q;rHYQ0f`sz61CswTLFbjv<2txO?8tAHA5|^>1s{h
z=+ET*EJI!xL(=35k|PUAZyiO_kjX@U5_4IBn9n}!Au`Z_IP)Dyckd#4awFmXf{EMB
zA^52S(GinHj%Or_=qTZ14yIVG2Su58$z7;LLESQv#QY%3zd~GyV2t^rh)ZcA)+CXr
zHm8Jt5-mJ#Q>4D<kQqf0xAZ7c`$RAB=!9&@V^X%}lCkh7`N|qp6u+gi*C?u694Yk8
zBX{B?a+-^fou5NOg$j~EU6C}|lF-JJnD)bvtm}v@Ck{ymb0l4xko2BPT!l8#d0r%4
zT1SQ~o~$uu!V8{9Veoy5u87P<OE!6b`Uy^WowQ&bqzm+jn=^!nvo=T*h7fx~cs=`0
zA-Uh1=u?4&27eY>{~d9cMb7M<n7s-&BY$g1X7|HnT~DXv%T$V0&yl=#6FIF4NWLm^
z8TyY%(oR5Pq>9AQi13M@#2!B<u1w5P9W99MGK$~@>Vz1+C3etGq|!gc>SQDNd4i<t
zv1Csb&+jTSEEd8`e?5*of8iIcdq85@dlGz}61Q*(lJ>QPe11uY&qE~7gNU=7g3Ndw
zF=>%RzOW)n){ofHsYp8&i079fyLnIK=BAN3Rp@?;-z2^sL#$psnN#%0?NUVgTj7Zd
z9x3e`PW)?EqP;AMafu~LuZf5sQwTeCoXDN|M7x>_W}-`^$w*`->cmgCBWdJ+Bs>2Q
zxtahnwR6dhc~AP!b7XH2{zT3bk|&A2Al`3W;9DZY787nRCqih@=m_DdbzX)fq?YJJ
zXTgtah+f=|NX=nL<1Qm}Jb)~|2T9T=<fIBd{`xOjcTSSsZ4!zVauhB1k@<{35<iXD
z<-!+KYeVQ?6@s@9AiVW1!tNCk{BkL=4s!{AxP@p%528l*A@WII<X4}OqG*TwmJ{j4
z6-fTAApi14a`zo4?Lr$;Q@)Y#p#zDZdWm^sCW&<$iB9Q3tl1x;H>wkw(@fM=C*l@2
z6Mj#;<1=jt4=5!1Lo!mwW|9`lkn6rAtImdW^)}>fQzzs1a=~m(gzvPJ)IrZlIWkPJ
zmKDTGg2mcB2;UJznBcq-|MVd2{UjnBFA?GAKv0~>2Cf$D_r)ubxe!@2^$Vn~h!uG<
zXEK*%P_nU>9NX69J+dUv&JIQ4a&p&Kk~T}s{$YNEpZ`VV!WBqAh#u1}gNRR;iE|Dj
z>Xl$ubJ`Qpd!}GObCE83Lc-G<Bo?kAOA<)pkX0lf=tIf^(K|JS-uZok;xUg=s2wFc
zP;lbD709BNlJuq%QujI{gZdL=aE(OA0Hijf318DdtZ^i<&0^iRdy~9CaHyvbNYm*^
z{wx)dF{z>KOgl<<dsCt6Ldk>K6vkX5JMJlIw|^q<B_*LS99eJ!(fO%Ft%?=vG)LCw
z8?ho=6YG0~m}|AfObjN`c>@V;&B<8SOi|Ca<o*!-+$5I6_M#^?b)oRPExFsrk)88Z
zaP%|8O-dm_<N;;NRua)HJS3Y~;->v2@wo83&UYbdxA0*`X%PD36p`O#M18(Ql3)&r
zFY-w9Fd}1YFc}YSlH2_x>7}PdZS6<GH#1^1q(prGzu)za@bI&Q9lT0dK`LSY))BGA
zg1~3J2~<-O^}8d!2b&1qZIA!YM8dMtNOB&4e2#?7trF5M8<Y0m1Ed4X#H`+vn9W-W
z-6Qm>?|(?O9ued2OYrlr1h>cuigqOMRv?~V6u1<>z+L?gyLR-$(|-_79dkG^c9zIS
z_8?Vww@Dc%i0R^myw5XY1<#KY?=`5~aC~zQ;i*<j?71w$1~uR@dp04D3W-oLVo%r*
zb{}hxP5TjS%NF|b*#d0RO<31j4~zA}ql|b%c79KhpE*RO>T#iIMb;%PNn}|1kZP<#
z%#UJXCpeSRb0@Ot>+v)DNyMuMM7x;d|L8j&cinLwzk$6bKd}lkz`kJtCI?NhDVQy?
zHrnLf9!|-OY1C%@;n<&p)UTOFv8;@=6@rUsZY5E38j9O>q+6>IGuD>GX@^PKe^cZ&
z-SH8*lzsIp@%r}|mPdPF7kydGG{13tIh`yw3Az1yQefXit?+J+J=sdV(mRTa^U0Pa
zkn(*ksn!2dJbonkPg_L)61``?9?7f4tYWDlJhAKS%$|jNknkjy-^En0HLds*Y_^mN
zHYz3i{crMyDyZ&k$<e`|sdclWP(zuFt%HzlF(c7xJcTFTiJYwP<Nw|endti@%~T;!
zr3UYv+Bo~_vgchBYu6UDxo886+w{Uhy-Z}`CX&@8a!AnwshnKPk+ut{*3}}fryVI;
zJ&3cqh;%{$xzYcU-{U-z;q^!!Xpt1v8~+JPc(ughta*`Lme#C(7m3BdZY-?a&8F+4
zNG~iV%TZhOggaC|m2qUj04h(m6WOc^l0+|$jk+c{@iB6jwNPNJOq`nVkcFoxuQ`eL
za*^*4-qO}pX;^DJu;%YZEWLVTApG@pPvvBE9Yp5Gx#VTP6MAzXb$3H4lW38pZ;w3a
z4wAw9h?`qY)^m{?eqc<v$q;<KrW5tXm_s4%91N;tub2xx%}uZ_yNOMZCF^%B$5Nq2
zn&n5*9Y&Bl_ax<;CsCD^LeY*5WbB$kg2=9gdtW2`{by1(3?e<Xy~qiRZ1vrIf(&-!
zGVLo)_fKQD!W!3tcbJN7@zzRPW?kruL2jHl16HI394DvSf0S6=r|ixqkz?CQ(!*9n
z+w34nc(=hycSyKgfc%wUEiYp@)a@@G-)CT_)Xdu80a&~$$3A8th7Gqd?px2)5wXmF
zI}fQ>BuRfKkiJ#$rTGnFmK8m-ZX}Y1PXzfF<29p-gPUFx`$vMLM@M$}Yhn9o6Z<S|
zusV^-O8XL4z8Eg%;azAYyR!6xIx2Uqm}V#D&h>_be-^AcO5}{S%}ABhi`v<aumz9t
zRxIXVw_D=Ab!WfzEcU3*!&Ew*t@E$ry~q%oL}9|+wqwrMxh$S(A-u2WEZz~#*q5Hn
z+|U=v%Dv(&h<@VSnh>)@Lf46mu18;dUR}nmdM~z<_p<Te8qEGWu=(%`)(LKIFzOY{
zi!9OZv=Ws$A%ac3XX>hv=pOsd?8hyP9DI!_2O@}?_6y&VH`rH{Vy$&S__wzOQ*&ak
z;4Q|QWf%nY#B7ns(r#&H)1I}go;(=KdRvUt%vj|6o4H?BqxG=_4Ye#Z-}y3D@D(MI
zHJhv<azV;BG5=PF^=H8iHDp*euElEc0k+i*L~oobYo1zQ>}tfSVQws4b(W=jJTRUR
zjOHBBAM=ALMxU6hR>VZVlgzMNgsRAKjMhAg>acD2n>Aoz)D4>zCovaufPJn4yGIMz
z;{Fed+AqW8-zyl;bY!{jRn${ope?xVlJtYjN={`#vpy>S{9*jT;eyAkWL(`##yhTH
zgtrS5?g@W?N*!CL#tPpnlD*v?uuZFx^?&tPYxflWPMgr`e-XoTQ&~EDIkSE0Sy1<p
z6%OM2i`>b)WbyY0{WBQh)P`}=u}lg(z_?AdDE)em@(Ym%?Vcmpv+(uzwrB4=vHrZj
zSXi2}Wz~HQZ^fWD`V<Bu_0jFVk+~xGx8#ZrYxen|`?ZiogD*0D=V`{~>!JLQA9Dta
zzZ+2Q%}C{DCVEv8sV{udiyPSGm4@Bgju_gkWTW3PY&ys>I5dDYPo|(fIGqJ48K^sp
z_ZM}X6+NBN?)eJckMmH!afHz|2N~7$NpQU|l&5}YX!kFSRegb1Y%UHVg_xL)V#Nk^
z!Aobd#qmBCg?(67VuWeH9rW&nqVsJe3q^K)uBkF>n|g~}LMJr8J21)B1eFcij9a`E
zwLarez4jEPvs$P|hu|-AaCRc6W_R>7yQ`g8KI#^m&PdsOGZwu<UzUV-5I*b?=6}D!
ujCESf%jkywa!WLVU6`R3$b^=cjIDjgIOhr`TO^{q_&%ea=rX}z_WuAeUb4CX

diff --git a/src/feat/test_data/test.wav.fea_htk.1 b/src/feat/test_data/test.wav.fea_htk.1
deleted file mode 100644
index 0cc28939ef209d8e9d08069b6bc9131629202199..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 22164
zcmWKX_aoO&9LMb~Awr1wN})sv`MmG$^HFIbqC%w<Qc;wa%Ff=qNC;7gQsMKyI~pPl
z(ojkZNkwHe@a;diA71ym=XjiefIxzPU}V0)SsgLR{WQ$1e{luJom+-AS8hkgYYqV0
zb#D>pcL~js$U=*(9>DH{I)JwHP{FFpJYjzmBq!g7byaTS6?>M$K>ZG46Lk=X-JS!2
z6fPrkPzh~M{)CsigP;$S1rLN>rAy*Mn71x`tnq!a;j@i7lPZGvT_yx?)ZB)zxuy~8
zuP-q%Uc@ZO>ctn!B=9Y%An*l1G?`sUq<j-d$?}&-;n7bd^tlkpj@F}~nl<RLLM_6l
z^HJP|=O}X68yO~?;NEz(66tnlVawqsjM*#}w&8lB5}l88OcxN-^y83i<$?5Xk6^2I
z4ix-P7i_L@r&=qF*w4IPd~@P$1JB|P_Wm=##vfmSYm`_#AnHp>GB1-kKFU}Wl@Y@^
zu|ze>0j5Sw<BA@-gZl&b6U(Kg$TgrF9;(sADqh>rzMTdrSYiwnTj-!(*&xI<=>eq$
z<J^Y*1!(RLC9)@{8CR!1fv4Q(lMuUJV)~FFk<mFg_UjvBZzc%q!qyYXElp&WQ~+6+
zACL3;t`gDZbFJ!D?IALAd)b@+#p9gxI9x0<OcECw5OuR3xJ*u+82o3C`xI1RP4-K!
zM!qDO^}!mLEMA8i`&^Lve3m4|d?MZ*Psr1CMhI>EiMqc&g!i0fsie#`?%nUZVS}w2
zx$n9fi>=GVD{8-!ke>r2yTF@VnGB+$uj<KN^$@5ZCP!Y`jg!J}yW!x=ej@v@8b4d-
z!g+UAm6)okuzi^YSayRF6F+}FPVU}{4?Y#g{a@s9Zc`1v3fIB76a8FUkV}&yDyVVl
zF0|~)Yf|cUojeX+2;}$rQh}ahfMaKZ#$MHs<EQRZ{VHAV`+g40GY_Z2SvT0cRt4fN
zAy2|qej@uS#!1+Zak6*AGEzEm3H;ZoOiI^YC-NdrNI*@V=>2W6Dt#Ob_Iron+($PU
zd-p&l{Yx2`Z}SUuc6I>unKrm)pBJ#){s0s#`UWzd9pTz%)zXKgmH2Px!+X<`$anj4
z$_+hC&RQQJKDU&inM)@6GF^|VxYp2VyPCP*{xbm+eH!HD{f`XNV9CK%eI&JX5pi#S
zg6rQpf##*cM9+AH)m<Av{wrS2RBXtHk9)`Q@7EsS`hHswcp()`zXF)OgS8;pu@ktc
zVW=ZJ4QzW@&GKWkz>nn9pjAl>-@NgKYx}B^{_;>DvQq}IRP8Sn&^Sa7g_e^G(-?Bn
zCK-oVilT8(b<(V;Mm54ix#N9Nc+NT_;*>NAtvprnT;By`;RAO(S>i#KF}hIU*(V~X
z{*~F1RD<V+Yyp|ew!)o$)A8*~2f;{IJczS-3*yC-ITk`LfPOY)j(SId^d~-?-e>Vl
z`tAWx?R6FyZfn8a87W-riBvk=HbNG`r%1wCm;#X@8h@yZ3bsGPKG%LWJXrY{Jp-Dg
zGGC1Jw&ZZ9q=s?QEHm6G8wuy1c>=UNA=w4fthQ;j;UDgAK-K;<vh_nYIJL)}DLGaL
zVkHCNOiwkuD^vrJ{sORny#OO^+r?ZL@duZ_Z09JL25_9TEm_Tl?-}oF8^HF}GW>Fv
z9XMz5J9p)_xs5_WuizT3K=|2Amqvy)QP-<8=}OH^C{*S`CbSB;qP2ch@uWGabWY$-
z72m*TkIUlR*Ann}W+&_7t4QLKZ$mNOA7VJu15P*IL!#~uf~G`!CV12dSZ;OzH&;gD
z`J<x1_dhNtPs$SC`>_jTb6Obm@ILc(MJaQsIt}=oTEmVhRxuUpd)elacWmmh8(dBY
zuTjkU1$-&fibQ+-X=#BlwRLz-4Sq;4TiD}tQF}61wC)bMKe(M-eLlt$ti4NQRma)e
z^KxLE-)!RiGK&cC#6b406nxcK1`HZh6FJYrOyP%OaQRCBqY|nGn)e1^W8Xfor1&eI
z8-9-{J)8p-Oj~fof|*RxT73}xU<%5YhhmM`QLJxK!Pe*PW5SOsa4jaQ8mC=egH|h4
z!ze!&+PcJm^!;RL%xZ1;%w#u>tiHsRt$9Y0XMZPeVuW}?nf=6-6avE<Q>Za|f;`Tz
zgX(ST@#D>gc+(LU`aNgCcqD_rG;@K3RW6(7l?1iB*0JLU-vZ-iC*1A!23J0=gHk`n
zaO<6NAn<Ag<ea+#b-wtpwUSa8S5ES?EEh3vR_*7`PQOA&MD8FjEnD(TDTT&6mQvqO
zOX!+fNyrX!$@8Oo&?~uA9G>1mYG1|jM92KdmZue9-?!h4fVU%=P}mLQcaD<{XCC6K
zzk47#_z-3<Z(wf^*N_$VL#%hp7PvB#Fdi{_OiA-rJnFIuXz~nzWSay=z5SdCryx-A
z@HseRwVd6$FalS6bKs;T>sZ-3)o|q>^XUj#fR_0!rwKjDq`B-cEzxqI$KJK0M5|3K
z8TgH!KVCr<yM@S`PHmplCS`*3&B5J~GvvRW$4U0x)zDt`0r8C3#Ttp%!8>K|V1B$7
zr(>{}<Xct%>8)YVEVvigK1CeIeTVR216L?6oQu!*buw<eT}=A7As`{;gY~;6@PGW3
zoDt;0<a289{9aS8Ohz&NGtUc7kO-RJpGhU!_35>vhBWQx3UoXDJWSmj%biyAjh=Pc
zOI|37@nn5tadCz)u(Q@6M$S4ox$!?H<%R)n3w?)^93|n`g-4-+^#%OKTY(s9IC2V;
znm~Q0De!oGiElS!3sbsc9-K~x@!OSazyc-$%u0-4Z>KurpU&6tHOW9QP``+&jdsBi
zUI)42+C}tbYXsyxIZD^hbfK<`OKJHuJ?cMo2EAVr2X)m3(d5^8v>@sM@h#cKlhxeI
z{^*3@<Sb`=V0i>kubaSqN6+AE(R-L5zLBt2DURRi8BF3=_X9u8WguQ^x>Zz?01(#>
z;rFjL0I6H;K-MP}9A!NLre8V>Wb`ZWxl#sy*13<fA7nwfXSW#KK8fwNyK$v@*U`6U
z17Yd=J#_Cc5o)u#gI=r6q-kGnb7%5T!?iZcP?+&qI(>^0dF}LrCq0nGrrh`e0=46T
zf1wRnq|$>2fBnTFZUUhA@;MkQ=tpFHQc3Za^Kiw^G*Gy60vp~J;0Jmean|+R2K%;8
zGB+>q*+`*&rmguT5Kb8+l0zJPI)lTsp56uI$3KIB=dJ8odrhvSPCD&s-;P9KJB`iW
zNu8|w=$nQFx}j$pSM$3K%bRjVKEbh6!|njCIlGA`z0ecyc&i2W*y?~wa`{Y%?sOcy
zegls7y#_8@dx1vQ6OTURk%+1W=p+yc`rnGNLiqx$N2^=CcTq26pDe_nMX#)y3N3+I
zcrAPWO%xIKdCEp^j|BH*%dtneKiGXYgh}<=$QA86NIQ+|(d;%q>b2(zP5hxln-XqN
z`__ltIa4F#T#gY+F}Oo#WG%tXhC6uDVhXtWc^QZcJ_c;why$03C|tX{p4sOp0D`^V
z;d9S3h}eb(&fm6mI5J{AOg#1;NHql4RaoRSe7|VSUX|L-6!%B6Mu*pexhs6J-EaWD
z-gb+zsS*M|VJ>dAKLJcy_A+(f51?-^e^Nfj2o4lTQ&6>?p2}{b4`)<RQ>z<XDK9$$
z42O_b$$GM*Eg2d>JD$uJFB}d`fwz_qqrLJNXt>VcA4mRxCSZUMUg^Z|`-E}5=UlMj
zs~WzZb_<41^8lNbbC}0AEe)bMrYzVH&m4F5VdDz)nUh<V;T@Z!S$AzYPRd>bK%G@_
z)|d#JZQ##Q%*{dHG=u4_lLKhhLR-4Q<1$T2=FqFRS5cq6He5+bL(=f`D9XwBLSwfr
zMb5pxJXyMe-7cB~*0=p;w?Z?}HTIjx^}B(CzgOV4ZJTiBB@cXaRt@-CnMjnTjN$Ih
zJHfo_HlSF_avpRjF#qk-<=hPW&gnNOW#3-DfTtJVV9rPBF*`#Rv(sZknZ)kZxK801
zn`JSK#+W?H_FAAB^N!H@w|Mm0?j^KLBZ|iGs<{&D)5zQE2$VFtm)icaLuPe;ylHYA
zAb<8f2wm0!b{naI$b@qI$2*E$@iiX5z23pJ*v8}UPc}g<jtUWw!ho5!xIy4(4iJfd
z%v`xOkFjZa&$wm{GW})Wu>Ik={O^G~_>rn!obSgYF>)Sbx);B|@yqhD_`#WI%EFb_
zq=~^v2@ldPdzwB}Hl>wK#xw~@aAh{Wr*p<UP{XTm8hT6<Szi<8$=<#I7RC{9N9F=J
zbM`Z6+<6Z>Hs|96cjJG1BoK2q24HI*gKtWI1v;}GaN@HwSj0vRNC^J|iuO9Jmr^(|
zNSz0iqC~Lyl|+7Q@+N%X##YX1kwP4)k_G053$g<aZkV~?izXAL=-H_uwAT6()m~ms
zt7gg57p^iirNE6ly+oA$x85BUZj_<hFAT!hIRQMGw+lf0!Us_P#cJkRrvjX(wS=wk
zRKzq$7Pm`ukr}6p@EP??d`Tn^^J)Ts??ndZWSIi71Uum1{}^-2VnOgK4Iuw!J0m?Y
zz>Xyf;W|}g21PHzE2`U=jMXO?lL~@Qv!+~Oe_fiHFblbR+$DJB7_GG{rT2<5=tX)9
zecr~Qf~kwp0y{ume4UY2tqo7Q@gtOawE|jOjN`ZuoiHkluyWb~c=Fo?BH*J%lA}lQ
zZO<-zGfs%fyDk9EUr-<#K`R=x7RP|Pi=WxbjgG)~p%*v>MFFr7<amNC41`U9?}Kj~
zSBqTc!N!AZq?s4~`H*tuZpl!OP=Z8F#-UuxUixtFLHeRFn_@i`)Dl=scbu?9@#cLr
zsoDTVmu%umij_kR^&}X3?>_Es{{}N8n}Jkp4cPbCj2vmqBIjJ?@btsuxJu+M%h#x7
zo;~|QVmkKWQ1e@$^>HcCjVNGVndgB?)fM2U-E;7FjueyQzX~con#b21D+BM$XW`JU
zCbHPxk~^mfQVmT#nD$o|NfsZYw?!V&sa0*X1)W3*3q|ND%A)vhPw844KeVwvfG50b
z5noHH4gT`}17?)e!=jsi;l5sRP;-2;!TWL!S<TIZJ8xdaX)Q0HhUx&34*ZWwEEB?`
z?<IhJfGkKfNMUt???COOI%p#E0fy))fzQ)RA<8vcHi$i;vX(!)qeYTfI*f6(u74oP
zs>Mhq^*U^HokdgCF4F96q;c9;ITUqu16ACXiL~E+q+xR!;b)0P?kInfn6GSse`Rh!
zla62*yP^V)2rq;!*PUVS-CvkIu7~3v$MLgeAK{zHVWP2tORZNq<FDJ=@TT?yKmho%
z)u-lzp~^i_YX51Nb<_oFyx#;B<~lHUqBlV~{(8XA+K8Vz=c1qPPH6jH1r(9`2_@Vm
zWJ>2hdZn_79vX5*@lTuJr#KFh%^W1t4*9^)#8B=diFA_vBpF`f`NC;uC7^e;8=BTS
zf)xE<p;f<Kp@p9cQqI|i&)e4`(XtufzL^ZQe>#TMt{oy8*a{o7fjIU*Y5sQ~0gzRi
z45eG2LBXs6COdopI5Z$2pKuDQDQp1izg3|pU+s|N#$gm{YJtwo^n}5^vTWn4LfG5T
zjS{w7q4g)4kn41SEP2O}>Kij|HSr>evn@gSPGkOt2Q_f-hYxU|YZd(cNF8nda0JHe
ze*-7_+<<yVIh@dW3fY<G>0J5OoXF=KGIQR3O!PnCs;Y1zJduQF=+xm+Z$DrY*Mz%-
zK7zP+mvC)O2`H1g0k^j8MfE36Bc)jg!Dlt-ypR`?$(?{%m)cP1Ks}n`OrdkTZy>e6
zc;t8c2r9_z=N5B2$n?I8`20R)d~<suN3GWrIxN@%J1#iD(WpBxB*qb*>%0Q`%m-o4
zj}&-K{tYQRT){b2pNh-lDb63*jGHPx5VL)mxI|orq->4AXUiJM`kDrCVY>*Kn|2I`
z1^t5OEYeZgZ$0=d#1a{$@=<+MDAIhA3m?7AM`2&ipb6PFL>DNcwTJ4E_q&UzZ&xk1
zaLHA2FoTcP-OS0-K?e-y7&G%02f$3agu|&300aGbpkZYwR0wng4=u}~WTO!%v=igh
zgaxv-b7o_1wl4m~DJ6#Y*W!Ekn@C!~5h8NRfrP#=<A0jUBaSw|h(WaqEKb)$75DOy
zv10<_UNuF*3jVP2+X6T^`VyrnW})$YJ5c9n8cMmk9Tizjplf(Ow;(Et<RAmK@_G)L
zw|y=Cpt1(PzVi*9F4{u290PblN}BlbuEN+fK5l({AGf#FL3VT*kn1XAcSXFx;>*9{
ztJk&gGmah+Q&>zCLkjV*@N2T!Vhye>?;=iiE6EbS+3-a9J|yx)70r143k*hXMDa0_
z@ZsMJs7n0-Y<;4QUYb~;DBJs}*eMd7zTuCu4>@z`wwuIwXDarT(IobkyGY^RDzf|3
z6iDI5lT&41349<%h1Lb)Jl=G2Ni%~SjOazP#Ur6s#(rEe)&R8WNz5i<e0rlEnSbUq
zp1;llR}&R{dcYb#lW!!`w0;t$bbauw%pBG~It*JD+<_ZTnxO3mRY0$&7fQZ;6^0E2
zAgWRXv#ku!%aYfy&MXj}u6)34?`pEL&uAlZmOIJO^ZF#Gx|gi$c|(NXJi@(eVu+Pv
z7y0O+OLi`2sHo8tsT%%-=ER<Z+xOfgistTsW=`U`mrJqv)xSig&>II2N@DBH8$kE>
zdwA7?RIIe;9eyqUi6ffOhK{aPhAAWM_+-T=eAeHaJox+@cI7>V`;KX&{H5Ncdddn4
zZC^{y9eab;nC|9&J+K<ydp1cP4mpF}=^shw_E-|f`%W(ZI!3~x|B`ermxzDJBa$a;
zNz=n0<onM=_+yh8G)ef62uQC5@6DES!UJ8{b&>7({Gc;_d~*$(?w|zSGn+xTRRlPm
zy#T{TKYZ?44oWeag3{l25&nYP;6m*s5@ve^W-JOvMkiR5Z?+RolPf@)vy!QH*E9TF
zIe|NzDU3vPg6Z@d!EniHOdjnjBj?<ENL-~V6&LwQ&c)pTBOliQkHfo2_$d=AHXaH;
zMLU4nG99evwij$kUIUQNZN}=xbnJSf1{W*HgKZ~;q3R|TF!optyiEBEb|?RbA399q
zip9s%(oqgF-lIlm2+pG`4;%saRe#gC&1vu!R}YS6KSmif+sL)0@9E5yVcbdCS@0dY
zLB9QX38!x~CMVJ~$$?c>B>1o~mF>Al+}J31#(y2WP_039a!!$3R-a*!L@2hN#|JON
zO>z8sH}K+;07!mx39nn#3lh}dLGvZ)ti)a`Ag_HBBzG0S`G+>+ZgqFAO0)re<aHe_
zO&+6O_JuURqXVbRk)|)>UEr;uS~&KAi|QvoQ}3E!x`T6+Cnfw2K95o(=h98tV=Fch
zwWaMuFY+LH`zDw0WiJE!oC<JpmM&C(B88hW4&fJTr$a~MZv6F9G_=xvg)1t~fX_U0
za8u0$$F^UD%lrM|hHEx>Y@QI#ne!dwoic;YiBm*S_AysiSeRx&SJ1jgiaLbV(wL}U
z^glxxTK*~%%{ul9og6%kO22dHk#<YE>~11Yf^!erTGn!SA)dtA*M;5k<rwF0xC<40
zc9k&=R)bvu<z%*FAD{m0#oa~^pmI|_IP5JzRE~dvx>>_aURV@Jd4HH$UUi=_n>+zI
zw|9eXr+lIh4iL%TuR#6ml~DJOCa$a~;G!!|6mQfeYFRU=Oj{ZCnKGicXRoIbk8Ysq
zBN{NHMGKXD@1bia8tC*CXP!vqHQ*+r28L>%<1*PPU@PIooLP}ees$SHEtRM6;X_dp
z^7$c|jnly6;}tNpBo{Asts(#I{|*Mc7sJ`FzcM{Hqw%fR_n>N&1h}KygJT+7@gOY2
z*=G>AQEvhAr^2z8>^ZK<p2xH@y$R0!d6qh4n^N!oX!_!%6s=nPfh$1Mi0vMV&VD;f
z{c8@gK01~>v4~4}xXBER78$}=+0BgM(Et*~ks=Fk>4Bc(=b`e;=cGnah}f80!`0vB
z!@~g@Oq5U#?lIj2wi|tdyRO#q^92_Y!|!d-^~nHdw#7-Z`@I1Um-gph_xcDLUM^!7
zs!iuVZ<6B9iKwLq#4(H!x=8ijucqf3F4BodVbrJ46}?oSNtl>?WMw}|9+w*eQTH*P
zxQia~%y5OaHx!Y?Szq?ijeIJ2eg>RAD+fdze@$lAC6QLAa3VCT1==mY4-bWi05Ovq
zyz5~Lkn9SAn#QyF8Pm)0kn%EUpJ|QdWF<++$$pkvIswnWWlYB*E?%D8)sR0jhb!Ce
zKsOt0L+5g%>C&m)jdCxy(jpa6>N@ZfWnO(wg~1%8?Hx%LfAN5e?W1@SR++@{$Ot^J
zSqaUaX;zn-YDyKxJ7Dlo2Z$^0B|W}s<dLu!UNKh-p3%()4?h~ROLls4JbMM0`j-Ee
zEfUOOLK@nDihVE`Gtp*yKgN<J>Lvhm=>j+PGgxNEAsi8~mVIY>kt_X8g@TpV=<kPg
zYI)teanX1rjrf*Ij|KRlsF-qkD{(zKdeV)U#LP#MLzz6W01mMnk%YT1iX-9bIY4xH
zf^@_yz(O-mj+~|^o&K7T%FbuNIZYZawNwX}Og7^gPuhUcc#&1sUQgVqo4`oczkz!G
z^T5H}9qf@UV>sx$CwOj18Q)AVd}FK|SN6nXWe*!P#w{UfH`=*sO%BxNj%g$OAWwH&
zjnLpVbD@I#5WVYPhqk!c!Cnmsc<5spPfXbZ|FX}59!r0~B+nJ#h@BY`Y>9!DQLC3(
zt<Rz2i<c4kP7(HW2@lNP$MPTQ72_+DvOsE~AOD{Adm^?+fIaQ~0y;O$W=`FD$`5}h
z3?kuv&?z;>I(fU1X%W8oyqGPPyv|4OJeK2vzF6+`oj1uRo4CehDSA{<A(=XbDpQq;
zF<Mn4gSPM50*%d&z-JM6c%l(1_*LR<=-SW&mu)@9ZXfFf7dKeKdqsW>&6gtWV>u0F
zB5gSFatHp_Jwl{@2jV_6HKudSZrRx89Gthd9534T2AZ7iz~<b3zF^&KsBj`1a1txn
zro)=JWm6lDnSY=Ch@YYUBLg_?!#%FH(orhA-=fhp_XV{1l|g;=@6y>f9cXd&Q>4Ou
zPiBioLi=^SJVDIIIr_)ornlWt^mh*q$ovYu`WC?SswAN1dYnK(LG0u96gv&qlVkkd
z#8T0fOkX9|aLquAvvfsQz1rWle`gv5FIMK`?sYb7>FENfFQSB{r8!oOodIl&wip{}
zW6AHmdl_{(Ys0OIRoo>{`P4_UsZmq!C~Wu}Nq6mDMAgl1)6S-Qh|Bnrfz<_g{E{wr
z%CHybNKC>5j6bwLP>m;odSQ~wbnxTtUH19IDz^MEA)3yA@hY)uvS{N`GB2DZs`_ed
z^rD$oVed^DH1?Zy5y}H|tA+6^`!MV@bOx$UC}4|_T8?XRGzj(D4veZr8IvIfwW_Bg
z&HI13i~4J+i^;b}aqWxXd_I@jc+RJ*ZoH;1qPo%i8CK+Tv>a*OrO*A7Bt&LxxePs<
zYhVn?!Ebt4c;-qY4pfN6c@sY1j#vXZpq<H{R}Etm=9b|M?jHOk(UbitE5@IkIm8cp
z)ysF5i)YRLL}D51nK)!!J$U|E(W>A}Fh`c%51O5Sg5-l-=97gTB45uV805fRrCC7t
zR{f#h7WRXJi3c=7dyr}#?W4aLPo!FwO{H>+ahJdx?wDdJ(c1n3ZXR;_H-iuGj=>}F
zWK19szSm96jmF^WQ=>$7=r2yVCIe~~9AmC+{DgD2uVAZrK5SCYR_ro$i!(yI@%#%x
zK+?h#i{3T^9^KP`(=tcSa${Xkawik;PAvy=s>&$KUlXm`k;e6%*GJ<-VrZ+OD9}6c
ziJpu5Or&iWG%BU@kfQrzvi3_cStzW>9Ub~emVLPjtvVM#!Jr5DEvkc{Z8y}9wIH_I
zhoG<HQhcDMj-C7@1wO5^0tH{6VL4&K>|;FGYpbSk&73t%%zjZEx4aAZcf4YD3aWAX
z8#O?@$3d$PDm6fUVHoFNKo@f%>j_HqU5v6Fmve3Jex%VNYbe_&_s>V~qs_YBV57Hv
zqtKIku)l4bsMgrwiQ9X)BgQX@esLZ&DF4k!^8|3^R0k9{c7!JT4-ug$HHi58n1nlo
z?dywy3qF|h-A;_MDvbibTeKP*4OHVY)psB^=M6r)T8QKFU=*ap$YP1V8-dpDJDhTz
zB#wfu7CS8H#~&`#LT-=zQR!PF?#5UpTBbjXR%HL1uVypa6iI-N-wgUI(+$mddmdLE
zbzx@G8{E;)JRG+$6Uv-Dk1y-)XVp)6GKF&aaNcqgEWObPN;i%H!RAOTSR@C&&whsu
zdT(H*-2~J-ZO2O+&f@{IFHqk^6LaTlVBa5g;7_a%-V}eAabC2IlW6vqjqNSN^HZAf
zCif#~v+`SX@xWE?Mm-mrx1xlW{%%G(R$BjZ5r25FHnvfC$P#{X6NjN4Ld4~0G<PH`
z4a6TTU~XIV5?KKcxc}uDyz0F@SW%k=s8l&Puj<Hfmzm;4ZmQt+IX4`3>^kmSO~5s~
z`&gv|5sBGbq0JEt%$vJ#0`n6pczwVw-0Q$e=R3bo)Ev*>w9G2qFB*4V--?3fA#^ot
zjB7jihn^00ruQ87BZo3!TKg^n_De2llzQI-I}gf3c+Hbsl9A^Q_4Yxn>o!2BQkJmU
z8c62%B$#{69fw6thdDvM3~irelpd=!^wBmDdBPvxTYQt8FzjV|ZjZ1T3CHm77hj;X
z=t8L1;*3rE-a<uhLw0dTJDk-t!NCVqu-Pv?X0UD<5e{F6YVHT1f&owNT(J)JN4-9M
zG`t#_)o0KY9b<TKQl5VMGaVMG3&NG}WJ&C<DsHc~37lu)45SWhp~8Cekb2Kyc%}LU
z_C3#oPm@B~viNN<wSO+ozb(R=%=BP>U3f}fjF#eacn?2GcLPrNI-fNT(}Sx6uQT`a
zCICs&0qbXV!F8q($aKGBHmbCPGI|eJN1s8Jmdg=IT7ZT|*CB`5;xyQ2Im|77N4CBu
zQ2XUuI#u}|>3n5itY0AMtK7+bHgkx%sN(|kbTz2Z(K<LX<%8zD$Z9xnNdSrdvV~#`
zhtOQtVdjYER_st;1$Ky4P|@m1qE-HjO;z2+#QbR{5{LJL>P5OBCa9dBe9#>nn|cXN
zEH{9P2kSt^V>|G4*B|D+e?Q_k97H!|?NGxT9ptgx5Q;`5BDHi8yn1IfOgdXf7th#<
zR|K4eb3T3Nn-<A)*&TmK$UaZLqIC}G{96yVTwH{7SIR;8xldvLGW@Sk*o+p5m4Hz_
z0=G|cVf@-S(w^~!Fn@Zm>UcNU^X(*=>70X~&9VnBabIxh%P8j3vmPcSe*jJ|Tmn8T
zT>$s}o<n2jD=5;@12uWiMftD~?TB+l7QbepZHY^e?y3M3s~(Sp*K9^cZWplqhx24*
zf-(1sneIO}gNJoExuo2|5?mCsgrzG2p^0=dj9hgCE`279j50pJ;)Y^Ky5(Wci^X{G
zUOxUZWQgPJe1TutHzHywL$+plGwXu%h|CdDoYe4z$i(!5xWo0pYg!k`yPygq!uT-z
zRSL?V#vrHXuTki^6Ua>UJxcB)$i#Lp%7~f=OUm_;*FsTLH+TshOF78B&izVd78l`9
zTcpXk5(v^YeA&AR-C+D*mLhXO5gd+u32PUgfWrgQu)1nGeE8yDu9=j|1X!ry)2)$U
z+ujZMzAZ~EZ&=|przs*dCXEZU9mu?*AsoHs0<NB0j~AC<c#vI8Bs8p0O@cBKPi9fN
zhY+lu<RkA}rN}%IqFL`Ka?h<nA5UyUq%;W07xB2w`evlVO`ZhFw38S|H;fHk@UZG)
zkXqW!PhR4QdpUQYRHibtT)!30_pgQ2<^=qFLIp}lI^uN_-oT=`13!v-gbO#m!>`)f
zaPy!4*jr7Xi1nMbWb<@PlpG52dxup}NneG|Uuuo^t3|;CeHWBqv=y$xEXoS~gj6Mq
zkwZrX+IdqS_1xQolC(ond)hVbbBSnT(V|E88W@lzRiDX9IZ+JAO=6Uqi8n5;V#eRK
zK*?M4h(hBX{3i4|lR|~z#Nbo7D&hy@{%8{DHt689m77^DMIrpNvkQlZ-C*xk4-xI3
z(Zor7HU6u?A-3axfchhrN*LzCzR)meZYKh<XgK^wzo4AFR%8GOGTr|Oc`N&%v?vSY
zf1igcgjKl%KE5D$_Fb~iHIta`xK1*J#P9(ojBMABA`&IGAnRl>F|LUuT`NQI-=ZAs
zcP#;q&W?w=E8l|)cs&r5_2p093}z=|KH~dj*VufW5e%ANlDVa?F!`q|+4((WUf3Kg
zV|bf9Ivk7?QvZNIoptcTtTNc&umvjkn843Fd{B|{78LYv?gXl8P}Vg8WN3GRJ94}e
zuJKxcH_o&pb3~np^UzJ)ZkR$Yt_;M#=06~_yBCqfw<aWVXeRz8Ge~a#d<#2{MuOVY
z{(wK`&$OQrVJr*_Fh}YQt9ZJbQx(*R-ySF=%6G-U0;?jt@x%bu%G-_iW%dI5!E3N<
z^B{@7`3y`8Ov4#ncCcKI3;sy)5sX-a1v<CFRR3AX<!KJ$>F?$V8=Aqj$`?QaCkOxD
z=#Qrs=aJ*n-N=<o&N$}CG!jygMSdLlMZ71zk@VH0<U}`t7u9YvJ71Q8sM)yy?&<(a
zOE&ON?WkhPegrZB-5J<X)08!}5(YzCUD&G4Czx8!Nt~;e1(b<8GB(;pQh(aQJzqs|
z$&rt2%Q7Dr8Dz>N<ZZzVPT8Sty>@7N`fbKYd<IWUu@TxtD1nP2AvkBF9P!vSOsu-S
zh+6t@?8jeDYBx2J-cvp7$X_Lrdh!S^`@9<_4y|Uh?zJ=1bwfdl%q+%oW+bELPQm*8
zk622uo9SQ3fS4_hK*C{hMqqz1s9gUOU#tv=NBp>m^HP?0p6@2|9WUv0xufLWs5*T4
zDh5Q+T6*yB6Eb(UI<c9d40|Wyd6J{YprmLz5a;AE=_hRQ)A$6edP<WxtbK-W#w^BV
zN6Lu8%xqw4qC;ldPU4o3F*sMWi>*J~#Fo3{gX@ponVEMSI8UMlK-8>UysbqVY%@Fw
zHCcI(`7;m9dXUQenjV5%&K`w7nvId;8$#j@o>RpzZ?f~iAlZ4+p4ryBn9LqXpzDrC
zq8Wm9Fk$I(*j{jvC;M~*ls%BjXg)axg$)>@_d|#17;hv>(l<!L#cbS~Z;X3lreg=c
z9Yj;?HWt4u4N6N**~glT*xKY<;8WWHW>8Iztwr{r>B|T!v+o>Geng>iQ79{QuaRA&
zmdaM&a>YMWi;#e79TLryB35l3xH9eq;SSr=nZt?1;h8pk?y-r~-BLkw%Y|Y1+gBjQ
zI+Z80aR=yccy3kd&;~shJK|TZM~UZ^AU5I7zrC`e2*0_M21UFiphibCF|=*LW5f11
zLb;9qoUY^C%{mT6m6VxN(Y_q@pVOdrY8F7D#_aZfBM>>;3di&6`CUWrvFO7z-1|8Z
zsV);mn{Eiei=nMleN>f3txljuyAZXX`I_px_EHb&Lg-ZLOJ2J=1Lc$jJn5r7Ab(8;
z>y#M}oA3S8uW>HKHg_BJ6MBaQc;;lzn;fVzc8mxWEF_nIUm_MyGKuZoUaM5!Vy5<W
zJopnS&3qqz0xl05K;f{>AYy6|n6wJwX}?e67@>NQIatZ!ukUc)Gk*knw<C^X15(*%
zL+91C(1J~&<czH=IcJqkd}I_!^sZ5~`hFu_z<mkC$#b6Q@FOVNwj5+HH$##ZW;ikY
z96#>PI(YBS08{fm4FArt20Pq`NaUqQ_?lW3fr+go+%1xkv~B{5?{z`FUIKI3QUVm_
zJ!P`Ctm3D2MFQU+l&AzrW8aEGF!sEisZkPTy*7y=#z+XwJ&=Sh+%usDfzfo4XBS#~
z@+rP4xrAyhZN*QzuMlCcU>a~jkBsQ9<q5n!1*L+Hg3FpMNLfD;OV&Ptn&IDIT7Ne_
zxBC?o*sVlPC}UzSJq{Q5FU8;HO%SKbD*kl$A3$iPCX;9B%ovHrvoRM3F#oDPPVtBb
zDF%{c(ZOVJ)vpTlb@qe3yZwP7o<uSGFQSv8;aq{9)8tWeJZUZVMmq78WWZn{aS%2p
zBiH@NpF6VP3>csq`%ZIz4J?B~Dk)IHTo)-7?qJEj9Dd}LcmVnjl0CIIf$npLbe@vJ
z?+Y1d))RymUtUhcJ%sTb(=YsS=aqP8qa!2S*NguYs$lkcFFxTo6R7`*!%Z7g;GB7h
z{24czfZRi05dLlviXTitDHcXtonSBOU3P+6biaUS<9^blQ;V=s&ap<p8DFsVLl2Pj
zYyZDHY|WjFn*#9%bimhPd-&z?R}3xxl9bBtc-PZR5_v=xd`)X6qg*xUe?<cDJH>$9
zk5723bPe9KEQ#$cdWqkgxikFC??mda1#3V2kr9^O!Jdr03l@x-fM5Flc)9y9TW9?R
zIQj2F+0VjI+TKR4`HNyYZ_yAfcx4Hf)TYt{8<a`m7BO1gvKp=uR3lkKPsn*6QSSK0
zxmacepD7pq0Sj)NX8A1{r1f?ac%&CaR>nUC8Nb$$v!Z`sk#r~CTl0^9n*W!zIA@BN
zFpbz(WCoEB<OA)tpV(4Tn<)@~3@$zxX0zMRg9|6N1A&Y#c7}NrwmEnis8ruYVHtiX
zwQ>=6O)NuK&e5UIm8B7=zfaRLMVM{E_Km_8bCB7ATO|8oF6sNQjr*%d5ho8{t6#`7
zh4b`1LCFJMQnEE0u05KFJvQ=~iAS&T^=o->^M?@N_H>*$t^Ui(IR0ZWzI$R;U7eV%
zKLnzmF9p$#uW&GW2jwo_;T+%M3>wB9@Z01*CLzp%%~<+{xho)q0=B!L*djTuL&AQl
z{Ad>aDa)X}pJ&l)OQWC^DWH?<wUOmZ6ZUO-0X@Ju#QnX$kiGTE9=D9_XPWK@gPsFr
zWRrp&ld?=7TnPQmnru`cPT7uNpLHix-@TX^s4f5kfdSxCxGK)HSwXz_odH>n#Zc4D
zjhL<329-zV1I4h#pl5F$S^VoFlTgkDEqf%`_&6SNZ~lZbLUp-oLJO$G@8m|IJ5I=V
zb~R0BmGEbw&PJ)eECiaC!1Y&t(ttx9+`roUq1R<Ql4>kS66&A9ooUzcQUgy?lCuQn
zsK-GwrVhsx`xDFOr$E+@K{j4n7Fdazf_Hlj_%>HE$nq6aAT8Vi9zT(eU+#(oAEug_
zp2HzfFYhr?xVwTasb`@2zrIZ5`)rgpZ3UWZWWa@C!^H1*U87{i0=U?bP>K3^ByX!l
zqsqD`@Y^wMxH;?=^=vQW{#~XD@0jq(`Z`srvZ(<kxSxQ2_hwNsj}lnt!h_~{8{t0O
zLh6^uGiL>-VBX^mc<K{j{-L&@^}Q3m{EC2RXBH+nUIsUd5s1oLz(2mx7Y4axv#q*P
z{I9+b!Km|1;`=F$E1dlshi_-lXzc>1+h9aL{%1{vU5fA>kr9-nA=4;hABy9A?ePkk
zb5w3I<_XM;gz>AjfU1fB`K@OFZv@^&M%569n*_s~o8&;+^2bQvLj_4sR%J3S#=wH2
zI67lXF%kcC9L}u$1%k!qFgljP(82aD43!CD3dU|@Pw{r>y5tp9oyr1--gtsFbvMbO
z>hD}d5hJv)?>gGwW&rzD&QrHL!SvXwzeN1fRoJ=yCbiGMLXCsR;fhbHu*0R8`(bkm
zQ>oqt4-A|kncF8JsV+e4E$6_?gE!%=s9L7ZNCjDaHDt}jk3+?$Vlb*Glsd$x6RVOX
zz+dzrRM;tnH;`RG%v%{AYj<L2bktfEN<9UYiHR_zP7PdKTE>=|@o>Ys4D?(?0bK$J
zy(?af_UgYwi?bf0%*^|+=i3DSB3eXVf5;#bb5!Bkw0ySEb_=(B<0TSmp9X#1!|)fs
zBDl-C1{t?@BB6i2?e2HN-Wth3;F~vWe*UkwulNK*vSq04Oe6g6*+uN5cmZ4(4#I6_
zglIP>Gq1)}@UC-KAg)gu%Zb!LYlmLo{)%mQcdi$T$%Uhv8xEtS$T-y2xg8Z~pG3x+
zR-!PwM<`)RAHFCnhuv~+@V#j@LQe;w-j}D`LantV`urDuzq~1q@)rPCR<*<C`(m(T
zmIgf2`V{Wpl8hva?2w%L7MP+F2M@(DWcd6?KA4e*ub)}PTvoh}pDy`AHtHzhmsLHu
z^VTp3|8bv4xezd0jmKI?SMhK7bU`oA=P2i*5psWVABD7jMB6T=BM)r`g<lgzxoM3k
zcma!ipFThr$wQRup@cRxKH%nSn-cLKhD5O8FIioy!HGJv4m^~33Z&N<f$`Qpm^Zir
zzN}gWn@}0_sLFsB^zv|<bt*`5RKiR8UvbI}`FK~rFj*keL8Q7C<F<eKU?8)S%#%Ec
zNB()rGrNPZ<+K}ct<M>hRlXDLxaf(3+%3^=H8m9c>k>*^uo|7TC`RFOTTqs{9;(uL
zkJ5b3qReLk+&urKcuTl27BSdQRI<Xca^y)6WVQ~>7ck>{wg^D;J)v-?b|8dfqEH0o
zLi+)N&6lqO5j!6<r&A}G>w$4>(N8YEqC)YNYo&NwYB#o1Y9}(kmJmI8IeacV2$!Ff
z0}m5jP^x=AathZ%v3EsKVstf%%q~VngSP+P*>@;eqzIMmo`y<xIHNN)KTwRP4>$LM
zE30nwgTXUe@Jij)oa(obb0@)*N%klP2G?Xk{+SF=5~spe&ah;R;^ToSZDLErO~LxR
zMeM$nqd;6^8=iS16`!6yj&JQW!tn;X@P!XA@DSWa#6(ZyFYQ+_D^v(N-43H1Hw|>c
z#}zpYO`=2LvdBlv9L3*SjM7qOqx0NP=+fpds5&DKUEM2*PVXD$<{E_Y&rH@>8O|4A
zhYa?!2ik%;Lpy6&*VZ2(xz-&tCps|=!gbiT>9W;}d`~<_NQz(5WD4TW`eJgE;cUH-
zfzNtz@uT(5WU0tPBKF-D-<w&9D|MZ5*P|n3)(t)`dEf=z#$TY+>!RquhGgV2Lms&w
z@J506x1!YLFVN)*PgHn26P>SJgK}Q<qO^IvDB;o)Zf?kK(DKh4xUqjRyW}0QE<(h1
z>pk*R<Q$1`@4~Jta_}C!pJ-S+k&M>kY`#VpY%?kV?O~^gcJC<LeoG18e>Rs0UmhaP
zTSkc8F&-|6J%}GDRbdt1*F+-bB<`@%foCLAQTjo1<dn4o9m?2?4xN@lvCFriv_0|Y
z>}@%8)3pMXqjr>`8ile!IXZGTg_~O)1ozXc<d$^?eyg;a6gJ6`$~r|lXKM%*dUK!r
zOD2ibR$r26oKHsNXA+ayhRC6H2^d`wOXi(QVt-lp;R7ZI@o%-~M1SQMqOGjMrhc_0
zqQ0Mj`NAw5{BsVG4mu9QEuW(&<5pz8*fAuxxd{HJ;D8sN)}W4EYlxSTIa=-$fFfs`
zAiYZ~i0b)96t;UFw>wxH-s*~=ipYb=AQ4hISB(t1w9=*DFVh7IONnx<D5->7Fgvf8
zJQ?>V4-HnKn7M;s{qS9E^QsN&m(K^8yD^rj5@wC8?*MZlb-*#=V&dxyUQ2d@>NP=l
zJopVG=ab5nP+LXVOOo&zx=)gFwven`4H})0NY#q3z_hh~Xyt|h81I`!N<S&UeQ#d!
z1P-Oaek(8fUwIbM;T$4<cfOJDt8M5)`%hFb;1Af!zd*)^o<l|dcO(Q&qvDIUA%8D<
zoc51>D}0)Qr!{**Mkfi(H?ITzt(I`DtR@hB70V3kz6UL1BT!>wC=h$UiUbX8;Od!9
z(K1dtd~;lbxTp+My^CY?;+Ynj<o^vNwoeelfOeGCipiW45mfT?6i;T&33xm!ij=x9
zB5`XlloAOixt(jt2i;Mu>C(!)KiWyIYR`p>2Np2zE?1C`zuaNd?|nq2#~RKN6XeGX
z#X|$5t6-yy84MM4hdX8%<CT%4aBXu3-tjdH^d;v(Q!h%&)ttDD@F4Af=LL6dO=a($
zR-{Lr7dHwDU!^XFVd!>_FBxB-jvhvy#z#(UraK<Z=1F((@&54~+@)7S++OrSeuyF2
z{7;wIF`J<NZU<ucX)hDqY72^$^+8OJ8ndBl3l0fWBQw4YK)G3sK<khX(9S)<yh3WQ
zM7IG9ImF?V(LL~Vs0q>d*v?D_nLw8&Q{tMogsZ7HMq@Qxs2Na2i|=82F8pibfB$ml
zOvz}N^lAexxI72B6>Acu2WP4NFGZesl?Hx(z7TJHVn#&uw?peimq}@B9?Z*M1g=f?
zk%W#C`0(o>xxD-WyYO`jTvzfPUola`60_ejemRYxJlKfL*m!|iHTxt?|9lP$CTIcu
zvPh_}@e2QPe94ZT$_M?m0qnj=DfB(3k%oCK#^uqAQGD-PditRVtqT|-YyNpu$;e!^
z{-_w*G||aH7rWqf^Ca#c@71_D+z3}XL}2--zc}KGEh+hZ7s<aUV&ZfD5Vgx&;6REh
z;m^hRT>3`n#+eV^{B;2dul(7;X-An6<8#Dk%>ht;*c^-=nS<liM4-eSPyTa~#eR3z
z!y-cyc<<J~pi1N|SFcu>PTk*#4@o#ev-0DO)7|f2kJ2U@vO5&XyQ`v{A2;xaBU0p;
zm=4iG3OpIV4WQ-YIuf<|G}w0aC}DS~5#+ZCX@4BSe_a}(r=BZ3xh01<))`{k@AsIL
zcl+TkS2Yl)XwLko--g#t=n-KdSLhKo6Mrnb&T>5Y;IBs&QDL|J^KPAd-ry#D8(A~1
zUvF^LyfFQ^p_(Kv?#I<`_Zr28=a6o#e_pj^6Z+4?_}}|50wbbANLx@KJ{6+LQ!t%?
zP<$DgVYrRV(c_Un6Jt=?!wm*BF2|XnXW$7r76hBUBjO#7IIBL3SVg>to9t7WffzgR
zd0`YTay!U~b~nNx%lU*ST?CdkY-Qdql_F}7^`TVS9e%`5Pwa;)@on87T%&bo8->t2
zQW367r)hT5PTP+lJpUK9Jy?T6HJ+mr%>!WkaXeXd(w5vfEXk8|wFCH&9++q{Cxyt1
z?6*0LWKxE4g>eMPRon_?1Kfzb(|IVEx`gdf&cVf|J8+WRI-K*X9<J2b#CFu!!8u3T
zpgdYm3<`|EsNHoO6Rm^qC**?L$8DI#$qkIH6pu`^`pq>Fenz`XEU1;f84a?rrG`&}
zNQwCqYH-yZxrZ-9d7W!OtVAxk7d@MN9_Z)E>IML7nhG;kX_8=xTFi#}BZ-bLY}2dj
zP+oW)^W)$O+`e!K$_-cIw+0#P$NCuJY1ly&%gms<Q3-g$bi>U@m$FlH<FQddGWZmK
z2&bYN9DUV<`RdWjJXyYysVTO@E#6<a=IzVrRc8fSn~_STueg!jg85V;<vy91Jcm;3
z-owv7wUMS;Cb=!#MsB{}&XaTwCeA5b_{(kq``Kp{dRY3yqWwNNs~<uBnFstcfAxsr
zJ_|T)W+bQz3geh5#FLPJJfYx$W8lCWIcT9&0X~|ClmEC)oQ4(YK<4x}d|~%EmJuJt
z`8A0^YRxQuqe%|unfe2+?!T^CM{6|=s;i*WQoD)P)*U2dK{}j~EQyvavq!rtx1*&d
z`qb6ij@U}-@q{fukfxLx_-1btbR9H;6^}Lp3eA~I6C<$tLlvG|9#10H<#W0<V;Vk%
z_u{Yn-u&ae#Bge0Hhb@sG@M?nwoKjJlW3nCXPdw3;eqD;xctKzEO9><-w7=Ownovc
zkW>jMc|RY$JEcx$?kFJF!=}LsEn!l4$CFe|nIawClPGGZ9?U=f8~(|g4Zj9kksocx
zcmnFH$c55TSS_^+UOnUqUX|R$Y8Qp^`9t^r@#;xrPyb8empe?>TyzB0KU{H?$5p&#
zzA58s9*HB)odVNt=HjXIbMU=QmW+9z64~mKgbykF>$YM7@ntN=k+}K>$U5bN$NPoR
zbsbS+etr&F`Em(Nz_*Cj=QeWQstgGoI)Kci+?e%6icr5)hdO@QN`p>i^Mt%dNod?0
zc&INAMhS%A>GRLw9Hm7>@rw*8oUKFRKLFx?%AJ@VJPD^q_+WA8Hc)x57+AF(!D?>~
z;ZtpaM4)*q4tRSFC|#XT^cFSowf3wAZ%nJ$*5)j*OkWc;ZZidE@)OW!k*(liLIV!W
z-br4BKO&vtGwH4)y<pM(cB`^2i%9t80sOF345-GQCueq4^F;R*5yzSHV0ic{Fwzr9
zT!pUV%C)ya*w>Au$-0U#E62eVk0)g1Onc^mpbeC0zY0bwWtr3C+t}oPT<Yu7DQt3w
zDi*cb3lf}T_{_38kY2qNT>oeWtg15^VD|67(yuj4cB~u9&$NL(*GHkXPXMX$`$@*<
zIa5=iA2ikB4gUR4dk2=fL20QqAUN5A^gUY26W>3RY(&E_eWyN?y<C<E=5~>#^&?Q+
z>oZv^bQI^D-UlXwl<~;1=P+#I8w^+51s>`aGmZyOvgN^_asByWkONH_?geFFJXH(l
zUm62N#nr6%XeodTkFrrA`s~so1uW=Nj2tfuA?Jl(zzVHr<caNPsubw}4fCo<&kr7w
z&G0}>`)2a>YBf}9T!ynZJ>`inN+gl_U!ZTEJ&qoG3&MWLkeP#OP}Kkb3eGgH#_fIM
zO&WA6`Dvh0qH;({WY=E9PN59R5QkERCTUQK2BK)tqzRQ$p;4uhr2VY*gi53|YCys+
ziF1%B(|`ZpuFq$^T=#vi>$<<!H<(!R8~$Z;3(AQ`a*tlrhLyhyV27_Y6n8KIRaZud
zeXtJmbdCd>A)^DG9z+pt=NmN7WGCpCxklv+JK(ZU?!;hhn&iDUB>1Ko-scjGJxmm-
zR>3xU`)xaW?7(RXDu-!^OdUMDWe`dJdQMMi>!Nl`RfK;jTFPYm(1jl(k;?cI(qeuF
zZC1GhPVn=grhOwyneqll${x^D^Pdr&reC1HaEPS$x-bGq5pc)TfW$dLZrY4Sn6g_9
zGQASS+@}|w9I^mzvJ)gUXqT{`yd<ZVH4-;phODxChgS?v;s}8%JrJW$9=dN}J$WoE
zvTYObO{<56-<#>2*O~CmVtbS^<V=Pq$1EjBmXOKsmtp$d3AiO=0L6|f!CO;+OtvpZ
zVxgs^`A8(~9WP>XgBOwLtB&O1Vo|bf&phz_MHX@HyA8S|ofxwtYS6VLnejgTgb@_H
z;$|;c0d0s4-104qsZ|L9)#Vn<^-V5Vbcr91t&66M=NDs<!XO$1&e3H{TwqsxEP`cb
z?1qVdXthcx{PgxQyD_!ba>jONx`gKl-V44k=YQ`+68#s^-|x)8_(mOgLt+vJJ50hA
zJzZ!^dm6~`h!S#AgXEf7D=2?c&k&~_peH7flyt9W!n3BhNdsCWQgc7U`^};S(b_OT
zFk*4bEgLdV3pfRQSJe4zJx<gyLD~C{W3jebtmvE{bXYkI5Q%&Be%b2m!TbZPc<&`*
zWT8nC`h6^C7}=2DeHTHxVLYg44uGalYoSM3Ej-BtLeVr?ILjslNq>HX<mCOCvH?vb
z7cIQ|$ONix2xBDe<C#s(H%Vd273TT=aWGNQ$Oy7r$<jU1@T-Y8*DO1V-}Bcr=)Ug+
zcr^g=)b8L@9%osx*Xc~qhx4>_86hbK7m@7ABGeXIgE>LZaO|geymXBi{_EsfONkkU
zWTys%M*<f^6f#D-asWZMb(tRpJHR0Wp=aE^7R8+OLuPG_VENCbC~jQ`5f^Hzp3UpI
z>rQI&Uxo>YsHHm7cC*}~WMdD%tXY%9Wo#!N`;<t>gYRV7R%envD-C4XcA%i_S2)$i
z0(us`ME9O9WtCKAXn^;Bu-LZ`&0QfzkN%c{*Z+8tlBH4P-sc0B5^A%!-)kp9LJlRS
zm*W}gWDkR{--N!(wV+b@H9b(82_w(!;PzG&lZKXGB>FbX#K>pBZC=ay_o`12gK-{n
z#_%X()X{Cu<G2F;%h%jQ5Kl5~{YZ`LBVx4Hfe>yjNQ#<+3T&ltnREwQ8+8m?o^Gc@
z@@2H&;5SlIc|l)?zD2uT06Jyng{-b6<2greS&DDo4R-{$!<8olOy;gnT-v_?YK;qZ
zZRQdTkM5-EajWRoCEfJ+*OzpD;50?<4=o-b5h%U5fUF>QN!Z*wOy#q4%#YTV`e)T!
z!P&!NFvn*((Z8Heq)&A-WpT<RvA_bb4~NlxeLWoUO`1l%@ke*u?@<-qD`4!H4iek<
zm^I#hkLHP_lCvCf8eg1GdlEw}C00C!g$b|tF<(rmy6|n^9VsLWitmE|!au^FNy#uL
z)`Jm;tH?bqPpbB+k~Hg2gONm6&}wpkaFpAa@hNxWyCt7Q7)~->OGKCt$Kzntsn76S
zy)0DHt%I@}7vMe}E71Ez1T}oL#MhN4=<6C^(ClbJZ^u?MSxd~&fc6@ALMoC*k;^2c
z#eoTYmqa~-|Fe`($%2{v!X0;%PPbLq&{aC0=-m4prYXh;a@v)U*bzV2GPVIm7MhTl
zt@}xEdonzIq7}TV?Ig>`&H?^|ETWY)MAn`-!E9<hL>^7%0S&=g*gAF&+FnS5dU6Zl
z?GQtd=C6RYcdo=~=MT}wHRnmBS2SxcoRKc@9$^J>7nJNik1V!cP5m#cvD_RRYGy$#
zXH50LZ~SfKZABwt*)V!=@GI5#k|w%JIb@9OfR~K>(W(45@UOWKz<(`WNGaedEcs>v
zvX_N35fu!qZ=MD<w(-D{8w5a`C1E%h$ZD4^klVSLJGi?AIv+U+T_5-`cNZ&QrT6cI
ztle35-M%1JXLc03>$5n0kl2FeH)hcbqnUVi=wob<whP|wP(d2qqLxx;T9Bb2lYBj`
z1!HS9$PYzts+eaD6!U{=?#M%-&cZ<kFAGVK+)J`DZY`2Kn*k#S7J;(RJ=~&LVP-+*
z4%iv!0}~2Vfc!}TBOzzVG_}<brLIV(D|wo0an}y4o6|<J@73ac6%u$?z7?z36io-m
zjM>$zUjQ`AlIG~RQu&IPSnB-(8ew;Ua)P(P36U5}8B<3jS?EC?-z`9fZkmjoS`rAi
z*bmR|aHhJ_2Z&Ua8T5SbZ{c}J=qYUDz|xpOauWyuexXTbHhRN5FXPb6dSRWm(`I&Z
z!i0aeA%MRxCfkmlf=e$Cl4aAK|L>8IukJ=TK9Iq?Lminwmr&Z~?!zV;bEvo#!>)QK
zOOJLf68Z<C^wxY?R$_lIx+30eDOK!_Ow%Q)q=*rUzBmhc{*r>v*Lo2pNogR^(1HFN
zKLa-{XJR!XLKwj-@MFsz8nDM1yw1KvY!h?1kB64R_(B(`vfiAmXiJCxeXfK$!S@)s
z?f}TnyF?u1g}n5{7%A8_7e^=^!(zNVc$L+r`(?vfb@e)uU^*YIyEp;)`)yHk?-CT3
z>j=jeSfE8>*}Pwqy2w=BiN^2rMv<TYLh+4WNczrtQrvkJZBP9hI!xBUgl-WOC>W)>
z-+t4jH~VRw;upAU-92LE7)e|YpJ%?BH<QbXFGyHtYJJhS1)%?E2v^NP67~w2oSULX
z!mNW%@Mn4^u1fs{9X)O%w*nvZG<O57cYDoB3Ufcq3LcR0hX3J}wvtFoOnBxNjgtKX
zcwba}QS!GTns_D#I-TAJul5O;vp+ki%!_-%TF?Y{sP2OwJ=dT#bbv0~A5E?I*-(4K
zFL1t64KsBkgAB|TkfA(nI<tEYk@x8!{O&t&=H^?>&3B@Zg@%mS^Y_eM&3q)%+k@MJ
zHK=&zEv$A+fMsjX(!owCYTsyss`i|Op?ZDTAdh0B#HVaXM;*J0^EXeh{|PFWMKs#_
zDnFAa?AzI@G|TTU<!vgW>lc^6uBc&<GL(UCuhXVU74CF_`7qq2>d#&OPe1>Rm4Gyr
z1dyuvBBU=Tm>I4=P5zy*g=SY)Fm)jfOo(JFT-$mao=fK-m5vdfLTDa!{rr&sp=$+w
z(0P=-X3)%<S9;N!Z$ohQu`;|my$2ukeZ$&MR8S{xE^pfU8cGxHucah%sjRjPl^b)Q
zRlBcK?f&P~+aIFMDZfZu(kQAcxKC2oHIQ3TJ&1Fym7LDRV4kNX<E614=x$iT_1SXI
z{KBkOrs2sBDD!3qFnkn1$VoStvwerKeqM$La&mcdR4ZA-t9Hn^NXP`|PO&|yIjpD6
zIIHw%7|RGfzb|YYe%2GfCV!Kl_l3zHqFw~?(rjrOe<qb0H6wQJvee5Xkvin2Q>PW-
zXfyK@TKhC2F*$cAEu6bQe^5XcNnBE$5CB0|4S1>70VDTrf%og5ftdweut4E9(|2}0
zOf}7+n${O!wS);gqhdh}oML&}LlbO*sw?$#Nr1zz`dN~d!3LUJu#1{cV*_K1#`E6b
z`msmsCaHhuwWJA4QE>!UR18s<twE3*JPW?-Iz|)=i^-25E2gB6i<T|?jWpLZk)Tf;
z7*O>;Xqlx8q|98&jjB;Nu*wg*TpEOG^%+Ps+Soj|+yv(I)RCi#aWJ|po63&t1?<2W
z)E3!8($5QcYJKr+iWndIX$`<-c1Kx0-=0mro5Cs|R>hWHPB`+a5snNJYQSkvNJ8j7
z-k)ht&{FA2VjUmBg(num5sxvD;CqAW$8^DGqWX+O0zxs44RrJ3CRiu78a8bSB+)ne
z$)=_VG*ed+G)|5(Uo%6Hs6ES-dUp@blKo2d1hz06jGT#^Rwh&Q?+Tza!6B!L+jw*9
zB-wNQt5K5Ndbs%h6SmpmA$vLhFdd&6kF7r?<IKTR*lSxKyR5L6&OE2Y`z38bHq=%j
znZ{VS^HTvcWt4#CclXeh9nt(SAO{R`#*pUVCaQ6X1L7Voh5uO(lE97r{2RN@!1tmT
zn9Cc5J646(@cEl~Mm@@j8SU3#k|)Q>)qqg$m(monsM!k~TbV~xH(KztF3e{m$`ate
z=ZjEH{tmWDnZrK*UPaZb=An+R?YP#q5(fr7VbOp9sA_%WO_#_rA>OWN%MB^Ur!|v`
zr^ukpSaoU?G(>FkQh}S=G~@QK2zjBhj;xotfh4L6$V2ZNATlr%o)inNmyyaR?rXwf
zEJqSl7%d{aQAt3pI>?>B+@apCUBnnIG2f?cLb&@b@D#-B>F-OIIoO&<Z6&MNztcq7
zwuXFm|39zr{v&-jGP?+a^Oh_=@Dd$drpNm+*8tA*--?_X;+YzaJ~~@D0Nv^RA6;BF
zM#?qr!r0QA<k{&%B=J)Rx!~{%4!p6Y3+vlKOSKg+*vkjsD>J!YEj@|<@3pY{LIL!x
z%OKd!oM5sDmPXuUPCvJ1#P178+!I5dx=`bJ#*;*%PG9NQ3yHMD(Uu*Ve8XxToQv14
z9>Y;4jaW(a7#sP(9X&Sv!TV&o35Jde&}L<0m~r4e8MFI>POUyc!ha}}yG3F!qez$R
zvj0vNEEOmDeTuNNR-T4F_hpPXjDvvwC(z9H5R6;eP1-FsA}#r1G;i+=l8i){H}$bF
zFLWOm{B)R+SoM&kPTt|^`F7FE%w=Q{+D}h4DA8*MDeUyq0#<K#CHB6Ri(O70!n}*G
zSdUR>w6(5;H}X6K&e9Vy3x9^tg4F$Wa`LX^U0Dm-R^o$lOx0)vx1WS%k0I~Gc7lB!
zVeNe{T9`Xb;(I-~Zhtw$)k<M-VA!0fJ>G@vrbZFZy_)ZS+@Jq5b_hn8jKYkBezMXc
zfcRQ<@$__>$Xhy%I?nE;Xw4M;cXc8Ak`%I>z(07+kuZG4+7zR}k8Di%D3%^p<qbBj
zCg+N$QToHZaKiI0QhX+yRYks|?&JknG%SR!wlb$CuOgAG;44hsB`ai1oGDwjopdz(
zVxDVm1oQ3H;M*CJFw)~1TJBbXq?f0`>_i(N?y(1a^{7P>JvC$<T*kap7x3l`n4$B(
z46(jsGp*SAfz32cWi<q=u-Wn*v@u!-Yro^*cV}GL<LaSUZNCVwJzIkw59~qsJU|?!
z8z@@A5+xmw#Tu&Kc-FN+wDNcZDS9P~7M2UY3Z+>@ASObaJORnIEGFS6tBI&iH%xlJ
z9Z4AR;IWcCvcBB_sWLp6<9MFPhw37=nl@%6_aWfCPvR-u6Y?@6$IwLlR+O-LDYC0_
zq>uRDan^$CSo+FpA*U0L&5KOgg{#%Dk#ZZ4Zc3(Cj+Vh_Q(d7C@EPS4nj&_48J_pv
z7rluVpb(_U*XkD_CCPp?IJAQ-m~oKaex1w&`yV9vn@$tSI2XQcW)<HtO@jp5CD6GF
zvEcF*RoKc*0Xera*ur*$i<<Fp>$?)%;-HVV%x%YKEY@R(!b*(7L(vPl-S`p?#@}~`
zVBgY9xJ51qh&9O5!M<VM?Pd=3OR;6vt?1<D-8qb6?)9MB*?*(%>}<#yEkyC5$I;O(
z3*g@51}vTN9VBH4Xp(&(j41uY6iG*uC8D(?VUs`kp%g|9#lBJH=O$D!zziHIv1Mqd
zEmJP44{NQbU}5@Pe9QS6IumP;*N5iewFY0%P}Dq(@_ljcz!ctgMghk@sKc#ltFTI@
z75?X$3$MIRj;hY=;Cr;Y!sRO4k%;*(=H@sLIVyJYABj~_8Ik))e?S>||9p+QRO^vQ
z^*m~v<Ou7l%Rth$x7^CDZ%D!CaVi?&N#}|mCNbweQw5(xBqz^@Tmsrucf(Fle=r%=
zm94~0t?B5$UJESQG!rA>hyFd>fe*aZ#A#V_cxQMet_{A9yPito7^;lbGY;`eo@DaV
zpNYV4m#%SNKAH!>`c^7=+?W`I#=yPr&eHPjM+raaggH-`b0+b>FmyV&pCo-k(95%p
zTXLiZJk;tWo?21F^<o#<{KpaM+4&Ja<_Q%&A4bZ~e4+NG0Yq+K5MDS;F#BK~o@e<K
z`#08Nf4wT?Kf51|%+kY`{%l4a6OXWSBM*OQe}xa&=wsUe9<MmA6N+B3hSi&Okiwcx
zU|LC;nd>tHR(9v08K;}5lA02AlWwP*IDvFYQa>?%yoi~dWdcgSWHF9&=J0=PQDuh1
zDxgl~OpqNm7aA=uhxxg}{4A-Tq+#n`Dq;16<hs^FP2ZWAU0#b9tkuIO6pv!NsC2Y0
zw+OMEVN927KrdU@;W??V@tvpgSme<cy!RO7mCODnbJYbzEJ7dVbqT=qNCZsL6-6G3
zG3Zd6F=Kz$pRP~cLp7d6QoHjUa^9hZdwWeQGiUowrZ(z1Gv4rn;dDZ%Ij<IERW4!t
z{(OXjtavbzq{Q#r8dv}ISUgENp~sEraKw-Qwa1#${rJjJH|(lfilyYW@g3`fcxXmC
z$`Z0L4_?_|lf&EKoEfvRXkQ7hw6vS)v%kVE*KcM16lgHDfjux|WiMLkb_0H!s)JAR
za!9sf8P)6jOwLs9BfZ~VF{vLi_@nY)xZe_flN}}MOva)>=G3S$DO`w1f@BAD+HeK>
z%5Q_t;ldoPJ!L?DZ8$Ugdl4?)Acr!;IoN7S0Uw^8j}4=5;rn|Wcw)a+;O!~a*hEl>
z>$wW(dyXb*onOH#$**9FaU-;~SV^{rlrb6CD9Ap>0gCE381KahHf@Mx+Wn4@tuJEX
zIV>O-#f}h*3yZkj4x>;cK#lpLImw(@A)Gf{cvLK=jZ{1<2F?pvxY(@+?k3}~&|x1K
zHn!vcG<c8efdRIC@e#WXwc)i{5{Uny99xFU;rr>fc-6yNG<fwIj;Qg#v%f#bA|GUU
zMLTyeR#`&k#(bK%R;3eRo)|fqVMG7<>_@0)EljGbXVTg}g7OX%_%Y-RP=9%rd21oU
zJ=&rGa&D;bEB8+EOJz5a*)|sBiSAosS6<A>S04czYRsT%M=aF89STJfMDOZoi~Jwr
C*D?J7

diff --git a/src/feat/test_data/test.wav.fea_htk.2 b/src/feat/test_data/test.wav.fea_htk.2
deleted file mode 100644
index 5d68a63d43b1d364aa548708eec3b226ad7578d3..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 22164
zcmWKX_aoO&9LMd<tQ4~ON})sv`MmG$W2A+M3YAhwrJ}S{va<KCB*|V;5<c&{qao5D
z4WqP>G^CQ2um9kFc-`xs<8gd^d<lH~k$HUQ1Z^Sr{TQ?E`4t>@VL4V?u?-!sJpk<2
zzCxT|MKnt^3oW+15Bm>l1KKt~`Kv2Xb@EIkE!%;06mMYL-8L{tuZ!449Rwn`v_P<2
z1u_Fw(Ej8%SkWI0{g^CxAnYnG*jUQEa^qpuum74p*qJe@LWtL6Oz;M^t@xUI8nOP=
zi;2-<W?|MKE-4koH^qY)e)U`EU-l&;?w>%4Y<iL0!*597!zCp3qY(wyu11gK>JUDY
zhvF_jLy=>?$RObaN-zjQI{jJLV(c+vI+ukVxREGE=iwZag~TNNIAq(cK>Fv0uw63;
z^8cp;HdT7!{9+sSgVi9u@%L4emHBP#`+J0qKW>X_6<9nX>`#g^D~OigY%GjQiGfxu
zQHpY69-;SK;X}9aaL|5Yv8))m2lm55wQ5+=XDiyfLm!2RPM|__Z8RtqjJU=Fpg8|0
zO7OXi=51FXyK`D`P3mKK%5woZ?J!789xx;_ItRyodPy8j`C&uYIwH2Yh0GNX#CpF3
zaqiGnB5X6yvSH<JA~|o6z42c>&Pk8Mg_2_=agjbzF&)RH(kev%KSw+yrvzqBL9S|^
z7@7Ol8W=BGi<*brkjes<B*nZZzFm*WleLBjZTN=zKRtkVU8Sg)<TaG=$PhN!E0cTf
ztFXx0Ol({Cm7M-ILbCIH$(4V>RJgB^+)+6V^}?h{pTkdb>GLk;-J(<?^`HhnUF*hq
zeNKs(C@Hc-nfX|1y#f=zU>#2G-+~W5nSqBt%Hs1awY+NF0EBxcxc1;YO^T?bMyWf|
z^2aYovCnn#C}a_k-Q!RB295!agE5-ut0l)z-J^QdIw<{VKFl=>r-E7aY;L<8@f4LM
zVJqH~y_G*n*!WMfXZ><g{I?wZ*FBpQuenZSg<OEHD<Qgn+AND7g@FCOVfg&RddAT+
zh)Mrg3KrN+g6{4vpfbk}*Y5QJ7TfNF{KcP{P?uq@V^$q~K-x*bHXgh?Gl_h4ETi1e
z!{nUx0pfRaHZ*n1L?2~zsiJ!=ow>6Wo%2`({tl^<-h1yDq{@<mD~CvG@nYiH`4~68
zasjQ&1c|QEcUEUjAo;J*hN)bi2aMJZ;$JVk!1ew1An0-`khubwJ)?CX*|{6IDPyQD
zH4|)oP{Z<KG{AWB8PKjE!ejC;aqat>>7<t&k($zn#mavupXwocD71`Rp2?7tcFFj(
zg)sW*twLJmm8oiYD7tu|25YS~BrZw+pryAGp69=iEV}QB{}p+W<%|xLd-|U6t9)WM
zC)MJ4r#FMl<y)9Pg5LO6`9bhKD;~tzy#nzwk~!uAZh(F;V2=7mf%M0IoWZB@O!}@7
zP~&qB7;J50XUf=ft^cOdv5xO#F?@nVT?Hu+`a<Im^-%uKr`Ye>x2F3m9-*f|ja21{
zkioVbRB&`1PMT|qyQL!Gg0qi-hBqWTVVdPujSf8S`4Uv`Pa|92W`k3^J(;3o4Iox5
zh{@RU3-1h71!OoM>|e*nNZ9u<*M$N=`NwS>Ig>z+i>3vuw&)Gxdu=_~wn~y$=C+-k
zTl|x|V(YwSf#5#4S|bR4Fx8=vVJ+1C>KwX4EfWfqx{<#c`CQ?;eN_IW8L4tjK$nlY
z;B&{N@c9>_@OWl7>*g;{;*xJc5v$+CV2&4*G1^U{?u>$#L`Np%hYPUS<ix1juEh&}
z2m}BBxSU*Z3w(EcC&=csG3dcP=96tPQ(lt>{7$WAC*-S{%5{TmYtd_78ri|+bXhfv
zSU-oolI=)%Ab=L<3sQThXH<V&l-bN4r;9t2xxx*%$-U8S<m$8OowD#Qky84}-kP5S
z<Myc&*WN6`XC(r%cc$R0Mv`DuzlKPAA7(DSEd&)G0~y6o4NhwYA2#wI0!s@&;d$YA
ziNXUdC}+}!BNonKlGf;f==)Ppwk#B@#{R&1=9O$??q18V?l7+Tzv||h6|2!Ixf&R?
z&yBV()h9#W7#g!m6FxQGMI&p<xl*-HNwWG^@-jvM6~4bl+{q<iP-_BJf1DtX@*1E@
z$2$CIlL6j%goXQ_vEXMUgFm)%fvDwqHrFSK@!h(X{dw>eFlu$d{T?rI)uRR|K0bll
zZ<hhSzV9IC!WF3f(T}YY6UVsfA1}*dG3U_5<6QOhEA+e2ZRDe2Po65I(0J!!>i>Qz
zU0o*z*)cA8c62xDlTO9q>0P9*FBTP*OOwq{D#6~*zZgDWXYyBW7mVNWldM1c0AKwz
z0Lj4zFx#eyy){-#Y#qO_zHOVC^7cT+D@K<oYTbf=xNQV#R{B7!LlmRIVa{KdU{Lws
z88~Zc!){p=fh#{daZ-}CIihwsT-irF`kgF9%lFyPgn?wzT6&lkX*khiuRBqq<wll_
z{6fzj*;4s_0rIk26II;bO^}`$xbyui`ESQ@l6_|tbX2-eyd!q9hBF%A?b6pUFJ6Pw
zH9AQ0EGmJ-mN2HYTN&6tK^*73hwx#2cQ`}vJia*8&3IVtWYRx>0ixo5Sg+?V{*SkU
z^BuV{d7N6_=I~cs$&5n!d%h3+OCo6Aa3&S))T7sq8ql<Fw&+&+MVPuNmOHcHGd<_F
zhdh@TLDiPdxG+NyI9RI^LsxB_-25MtQm>CYLSN$~XEFF`(NQR8eHp*>l_Q3#&YVk0
zEzCx~wZQ8|InQC%W~SJ7K9r$j_|=NlU?CF$<|an4w^Cj4H`i<UnphARX<W?IMZ57{
zx7u-MXco}k_6W#%e3Y)6<3`<=6w|Vqx-?+wEPAsv4(ce6qJN*})BLFW#J^}Ks`_Ql
zj(0<Fa;_^rU=smU8vf#;qi6B8=-te?e<Z9^h~ss8hmiPH!(gA<au6>rW0|kR2WDuV
z<_)jX2dP^eK-POj9A*6%$dsQ0l6sZ+LNS9sXy3!x_p_k%)0+(L{KxA}%IAs?uBETe
z1;OGsyXl@uA!@g(i(aeCq-h^-ap&;Pz%_QuQJB#=DzjOEyl@#uRU-OqO8qzp(u@ZI
zm+Zh|#Q{7z`3Il&-~)vf7ho*^J|gLtN(wh$gtj}<z@;63c>!e|ydYmg&f0-nVDGkn
z%#F)DHd0`i>1aI(1XD(d*cT2ylfhxy&+G)UKR<xLXYITLwZdF6?R47HxeW=!P8yrN
zgSuD`(U(mLbp60guG&{S)@sTf`Gv$%Rfhw(_S{BPwZs5#f29F-+iQbz={)ANjtmZ2
zw;o6PUjr4^KA@TP#y=idk%;Oh=)xBXhF^*BqPs#^uXc|{--1ELF<F2?i~B5FE?EH6
z@H+P5%P1n~_k@ky776Z3m0_>&0I=)M>4q0C>bSx^2WhuaBU10!M}2l*p^4+#v?ZaQ
zI<`OHYE6A77jg_yivDdnD{CokHQ0_SVL7gORtn-mjsd%uGk{xV6t3IV$n16I10g=I
z@r7p@L}Yyv=TFC492v0=CLVjk)7A8Ds5H-M`dVVdUKQWP6b?tShKJXHdA9!8VJr||
z@3_g>RSSS`@H}pHJOPZ`_E`SAeE@y#{YH5lLpYKzK|%F4dMdkxKA2TWO)TrV;yw-p
z7<@q*MeE4+j%2709Z*HtQydOVfUkxhqq*W3Xu8he@gu)M3(&_0uXN)#LxQ-`dmga;
zq>Qhp-Grety)336AM?nrtw}h?gazy4nd7d0Y+Sw`b8_=?ynRzN>!~TtN!g<hsH+mr
znh;{M^#eGHs&?qJS_r*)as<s?WKY+7RnU}V4!wG7CH325#}yMZAWh$nqMVG6G<NGU
z<T~h&D)j}}ZNfQVUB@qW3p52i6TgV`um?E!%NBQR-H0>Gz3`2>wct}#B2kz!g1a{D
z-~<PD0Qq8;bH7WD`ERcd=SJ99&ai$l`>NtHmMN@fE=K7xJ5DcQWnx2_#Qs&dLGGAk
zqzoULU~(xtXpUyhKSCGWw4&E`Ev2QZQ8dP?hAX-*jl8OfKuPL@)PB+dnKtZ0RZd@l
z?724}ba@-tWvC1y6Uy*!-ze7hQ#^iky^CqHkH=phZ-g2gMZzbJ0W)(6C;Fi`5Q=}q
zT)8=)v1@z7xMz(r!=<0G<KcO{uR+^+kxD+Cug4=Xa-CrMmpsSu%k!GT8%)rYxjU^*
z6M_Fky-26j8Tw$h39V`|qDe@UE4kqf)tc}^O?}}s^q3m5z9xvO-a@b_j)2>em%-U{
zA3*btyV$uk4<~pUP4AH)%-s-(t*sb*L*f(AR(HaQPtRIP-s=FOf|EerQJeKq2nYJ9
z^MOK?5H`D#$cs(hh!50n;j9w6gd-KRz`SsNcErhphc-u`e~IGs+|(De#=4wp+LY1i
zxw7=RyChA?_u$GD3Df`9d7?`jB<Z%xqwqyeAgT&k1>zUohqBLCF;BbY;CzjxY^Aq6
zromFUQ?!T7I#Ynps$}AFp<HZL8wmVM7~Y0uCO{;?0XPLb!ranW5VBGg$iCdhNc<gP
zClUp4gOU-0q8DS^nhqvo)d|M9lJGJf_HYFQbZBP6T;%C>hu{?xw9cWJ-Yv+WCG;lx
zu$4pkQx~I!4uHD(yCRJ`J5;&H6^i%SLTmG%IPPsXj0z*Hv}Pdw_xUp6^V1;7(LeAl
z?;d<3PJqe1&Ic}Dmg9{a9c<EA5(64aKCo3AoPqx$A8-l^17Ob2@djBK2pR+b`=2@P
z=I5FF8xFFOrar9b{uNy5o08NklptZ_pHRAO4}GxbAbox*n_^u>)D~1ox1X>;@n%Cb
zsYV|~7i~lp-}<4dN)n8{dk^<_euf!htw21s7VLdwN{%#Vkqhq9Smy9gTrG5m<*C*&
zPoI8d52;GwP_vt${ZTQ{iO6UA%yPj$C0p>#;Tia&CC=motc3Cp=kwGiO2KQhxj3|^
zg+0(zz}0GjR8>tEru~sZVui=(Eun{WYGns)Lnl$fA|ZN;vMB!Z6S~H3AKK6uh{_ex
zc^cv!aMJfTm{rsW3vT>@dk1HL+T;J4d@FLuDsC>^apNjZYkLk=l}3m}(0^EF9|!+<
zBMKY?r9h&73acG*8)_ysKx4_b@U*T1_#jgZ(Rst=qu3kH)(Bv?w~4W5wv})-uD>O-
zl?st$>UG%cK9{B{m(c93q<Q8iX%uyJJ(b^@i8NooqhVUj@PlYGDpHIhW-Hp@AIW-X
z+!X?2Z7bn-!9}p`x+@&KGl|KgM)>pHPyBTGJNWY77*So%#iwWF;ZIvT@W##qfDiby
zHK!JUFIBst`2I66>!=%4eX|kD&2wUIM{k7Eymf$=wSiS{xQxDex}a@)<WNNFdz5g8
zkSXo|=#{D#dgzNgiht4q-^X!~ROTp|dB_ijCWfN3E7eK%<78NF<qv0`6NSDt9%yF!
zcO)OshgSY_hvxef(d?Y9_@ZMS5-yzu?wLyB$f{bbeC-fX#g^EJ4Z^YiN$|e<@qw)3
zWGK=86!K?{FxlZFz^MrV*@RP2S#CWi_0c)>_>%*2-Y|wjP0Z2RIo>d2P>OBty95WD
z`cc9*OSJAp3v!nM$inIvQhI5M5=84rqPhhr+hN38f4>&)dHWWQ^sI!x9;%>CZ;!y3
z{V(C)ArGL^RR;fRKY{F=i`e$a2~OlQ4w*B5KPGx_admY#5&WBkXK6R!AHMs5U0e(9
z5qJmUUYFy#+9FUYUeCk_zDJEG&LD-k2*Ibd=%RoRl05$xW|eoK(2+(o#hF4EcGV;0
zpm?<J))ADSIgAn_Gl<Mk3BI^@Hombfk)u554V@NlhFzDP;E$-=@N|qbywH6G44I9>
zobeQRP4*>wsmg+LsxcLp#Z#O&vI)0Tz9pu6GjY)jNs_W90-q~wBI|0Kz~ya1WM0}a
z92PvuqzT8M(qFpp>1hjOn94(qQK3lf@p<^LHxGq<Jd6HHbs)M>9<4dlfP7z<prM_0
zC?RN)9L(Th6%RAAY}5%uEhA?Bl0cYAmvT6@d|+fa7c{L1g>pg8;DJRM6l*qIKC~`}
zQyUh<)@i9@Znh4d<P;Nwdu#CBdo3g_@CXq)<wQcCoATaI<q~JRNupn)$gI^$LX~&(
zkdbo&;$AgDA#wq*>hnT4`lA=6$!DRTd$*(RA89D%>NZqh{uf=t3s8dHT9SkG*{bU~
zWd61__^sk<{Nna!c&1=8*?bJ(zv2>PpVd_uo5sWKkM7~lj)vvpZ%lx6Pbs@I;w7G8
z^9f(Qu7RI&bcu-E5+Z;45*`zLK{lDM#&u;q#KmC+S-MZ1gH0un&|@Vu>%}A(jog6Z
zW5nQtKbKLp%6-`WSQGUcTcRlYd#KPQ5}m0JK-q^}QNn&5VzeU_drPVjM~j{0(w}Ow
z>(mrT;l`6wrJo4AFG2;@2H{*Q8B(s6K@LU?av~*ap+?4jTshGMH0epqCSrVMgDzQc
z_6%OI)*ROmMSNz&8b6h7CNnj@5ruR;&RFwO*!b`;Y+HC6u0LsvwjERigWf(U`PNk!
zHWG-aVgbyy)JMHVFJOad5IR$JA0=G4YU!BKL8L8qkfRs%NKVZlSv&BO2)=xX2Uo`s
zOXnW)&P#{vuwkgM;S{MJd(Rn*<b&IG-yrg4o`7cl!*RXKu-Vl=MDda@4jC20)|=LY
z{;zlO%7v*|VfSnNLiRo5d(;LUT{9b|eDB03E8pXD0lwt^hhMNK_X*s4OcUiT^CdM?
zmQY~Z8gk*-OSIZ#7fN~`0`ETkM;?4}1-sJUk?w7=B+lw9shB)Q!lM6>bS{_7c$-Va
zPS%l@2jk@Hw?rVWFdG^t{73jC)_^yr%Q)ddZtU8~PJD6H6+gPMnoW07fS#GHpx-hA
z9M4{eVe>vF-v1R!F`R-DUw0DT!du{ST{#J}zXCHBha<xiEXp(80cT3*Bel87RI}$P
zel|M+CAV;pkah@_sSklmS7GvSXDPYhIY8p7l;{khZ{$K;J^21^HSjvTlZ2l#rXoK>
zf%lP8P*<vrbv^cg&B?0)^1H=Y*2`e`6ScTdP8MuEDF~G|DuRhe%Ahyp57?FbALH_(
zk}DD)Pm6zWkkM{sI*WfkU2)(DxTo}s#%)T2H@Uj-NA@F>QM;8~TlR*|SrLZv50t^z
zsGfWt?}aiOjL3;JHF98OH3>OvM5PAq5)U>Co()(FFW0CN?VMBOrsW54M#&9Z&*y>O
za1$KA&I3Fz=L5+P%kkQkgCIfWH8fkA&Wi4_1hSepKypt$TySVpQ?D%HDn{$mhd$TQ
zvg8Ts<9LbYb#>tsEeYBi?*?yvse=>uxv25q2kKiJLbr2{qLLR!;j<`Zav|M>J!ZR+
zC@<?Ix{(LTtC!~qPpSeq=2U``xjIngu{dtYIE0_CkpVtOR^d<O(a=(-4_8*61s|-;
zzzt<%9NSp}ZH5Ej`fGN0V!i;*(fSH<Pnkm3#3^=T<s?@}P>^OochJ6DoH~Wo(U_=7
z`k#R$E$hofbC30*lcQ%)@mCH#(rH1L-$_JO^7o;=MIFcLv^TN#cVjnyJjVGG?ne2a
zUS&)|lwnU`8Bup0;?Zw|xZm(ToZXVg+`q<06pw#|I$2{(ZdepZd2^VtslLaU{yPCV
zw|0Skmpq~e4iK?leV|c&1=RVi#-<#~;-V`q6mQTW%2~6hWJf9Wn=+)g)Ys97hxMrD
zh$_rz(?CUE2k4r=O;jeu6;+*^03MRc;7i>zTq-pM>_uIev$p5SWRD}%P<#R(JP;<Q
zKRh7nI1M~HUI{~s&NBvjCgi{UU%`m)5~%*-6Ekol8sB_z7b-=Gg4;R+IHtKBkHS)%
zeHMZGMstui70&v*YjcfvKcZFXEpXnqbJQu@g!&Fg)91b7w0g-~E+0)J_PZ%M_xT(R
zs6EK~X<MMG`XoHoVhVl~7{FMmO^m_OKoZ3fCyQ?Cf`Q`~;q2aLq*h*l*qK?wRi78Y
z!-1-eFJ^V%0h5hjo8f!7^J*P0kAD#{_}T&8ACGX<%}<hDZ}f4vL;&x)&pXi6yPRF5
zEW;Z<rOwrgsG|pFU>GA%LUrG)q8FM<=--E7)bElz>Q$LTn3z0d>G+R4Dl-7Wo)f6r
zrj&SRxI_DTc_ezypM6-LNBJ+#f--Y+K*aGEWKKg8X?F=H0(0A-gUvm7=(G@T%i9jT
z^FbRB>j{QxM(Vr_nKJxk_HyW$X^o|&#K`HB!z{IM0p5Q~nXW@zY?Iu>+n>_OmFje&
zn+&(23+JQhvZ-Co(!E<~fub;VANhtduRf!KKnrR5Mv^5Tz2FkZC{%fDCUHLU9Uj=E
zfYj%hHl(JSP`RI7Fyu=Yh$|Z;1OCe7p`Z`8ou>iM>STil?+kbw;=DQDgM3V5+keX!
z^XD+9n>v7^V+fcq)?^3Y#ge5e#sKu_01uV3SaQ}O91*yNXK?#FSK_lG1uLx4pSS7M
z;<|71;-8T;;&UoJ7Pt>Z#gx&TiR;kOlODu4W&slWl8Gv3{vj6M#o(@z8Az~33kZ+>
zC0+4y@RF%FM_SFB%Df<?s{1K$O_PAjEL1?b@g`oUfe8@!Szwv9#~XL+Brsx)FQIP0
zd~opmcJ|2T2^{>@8$2_hjDMyNuAk_~RRi(7qi1iR32qTdtMBA0w>VL|+a}HMtt{PT
z`JINWo(JV*ztB4Y4QR860~}Npg@@jiqRJbhc+xQodMz7=N#3^Lh=VENZ;OFdQLC0)
zuFIh_mMkZ--9qe{A}gT2m*qXsEo5yi27&mZeZ0GxZ-~flKK6|7bLiTn&YZgWgctr=
z5JbZLpj&)`b@BBeGb8-*MG<>m`qFOn+RFy#55;n2cHAKE?c$o3r|437xn$}RI-4q1
zPSEOFNwjU}W@uz~1U`+pjmjk)abMys=-xB{mv1@7ZkrecCF?EV-GY4#%@Ze`6FE(#
zLLE4<q6>fS|4zhz1+l4jelgt>4$CJt<>1^kWq9${m(ci37dGP#^Y|Ooq1=gVz)7rR
zTMnz?wv8P)X2Ct)qO~Px_{a!OdwZ9wsc@7^?Kf{WIsY8mO=eJky*pI>h7&EUd4d$V
zZ-}~ZBy?Omh>CcbI7jao-1w>=3jZ3wftjD6&(K1cUY!J#-H#K<&yW2)o?w@;Mskd|
zi&)6JvzMQEHeJ)#;4HK4X;l8xF@2^{@O(uc?q6%i7N5z7dO`|VLV{!2+#SfqXo|3r
zb{3X=*IH1It0vqcU(H?mgh&0vTAI~#kHV%uk#y&-#Z<-g7VU1ii@1zG8CjK&f0pZ@
zLVg9DBl-^>U;?1yfg1cbcn~JJ$$;@!ci3kSs@bx`gs8dx!7D{-$l?u0$^3AZ4Z#|0
z^x`>|VQ)+rH1Ugd6UYVgY6NkgV;FY%auzE6mBZ$z>p1R((IC`k8!)U9wj7?QLhUN4
zNbTMq?&9G(>Sp}8d4^^QxR}SKcHRr<%K8`dc~n1IFw2sBh?XYpJM~b(U2igLa|QHn
zt%Wfp2frL(;n^$AI7luQ=l=Brw?&%B0nJSIqEZ-}Fs~G6aCftd#mCuiQX;&6bH4Dx
z`UZKf(($b6??^0ZJqMp&+X$Y0khjeL7{ZZa_k&i~Zy@;~*OGimMC8*&1cRNpE7kJp
zp6cK9^P*vp|MxzP&>W>|M~CPy#v3V>W>fL=g}8@L3tb92Ml`lPhnv2*OwZtbynXZt
zJQ))N1n>3}Gs6kE>eLS+_2myvxF!i|7an7-ZFtXm*-EiBR(@>Kz!vN_b(8a*_TvSY
zgMpa22^PL(3cUJf0+;2^92+AYP;@&JSe>$I<Xe}AvI5l5%I#@f|M^2SPAG=98wdm4
z6YuGTxDQ0aeqpmhx)qZ5d_>lKEF_Brby2?JVY2+=RcP705b_7#$FEQW1RcAeUaUE>
z*E|IMotNPQwGHgQ_u}CFN=uOc=_$M2IE2~Dc(K=3PT^Xu)lAHOVH{`E0|L7Gm>vAe
zoZ)6w5bt%+@~vVmkX;nUIT+Z}=v;9OCHgNx+0Hgx`#bMww9snGHcL<Y=)JU6#}{ny
zb!--Rd>0OP{3J@X4*2h_-6;3SL!wuh3-!x>F%njMxN52k&M<O@#`_NufhlE(czc<I
z+k_n&ih&E?n(;hN{9psV^Z;Ms8f-XHgG-fOgV>yx_}nT1j@$hoASFf$i~iXFG<MzQ
zlxZh%<m@%rG5&p)r$-f#$HM?r{K}BKAy$Ew>dmE<+0*mYYD!xo3DDj*i%w>Gpjodj
z;_9Pr%p6*eas#+HZc!$bJa-XS=<H`zPI)tzr1Ri>8)Gc7!4OI`PXPYbNX%a#4Zf<s
z#`=TxtT|Q&jV{~pvZjl8#PlQ7Ggiag1*+J8yaD`<^}`$E?=Y^5w{j9qU$L=+rFcO~
zYh(Gk7PM*hD^zmeDtCjf8_l&XqQ$>jk+!AAbS@GA57xyt3x2VH?>%O~&@KVub~GBD
zy=MU859Tws%m;}SpBLQUdls*J;|Of)vH%q?0~eK?8SZiuyx2nt+`8a_!;W2Nb=R1H
zYYz9YVizK!>RX`Q5e&>)dvF5t4a)hv#ctf|z(xBjZ%Eh-FW9);GJRh(Ppj_<3Z9S9
z)v)RQdGt3u6XHtmI_*bJrGm8Xbp#w1TiPuCW&m~{l!fq`Hz}8tMdu8ipvHAOAW$Vm
z*lblK`RgB;cg+)rMajUNV1I^o{$ms#DK`z#4iI@F0N-75gH5jQU<Ph|XEPFx;a|@`
zLJ8qTP`Aw$n+&~z^1cS_lCDlTx8*MfA5g@mle)}k!*X7Q+5=R3FA(LAcys58bg|=&
zdi3GgDrDN2K~uDi;K6^g^z&~SSfIiWSG<-Yu{*0#wu(KRZ|(}j4{WA_y7Q6Bz+rf$
z<~jDiXa%1ng|embTVd+(Je+q+h&7($#Y|p)!rtue!WZyvUXspwobYJ@YZRsnR|Q>X
z?&bakBuN{ro7)4|nm{1g|C-sL*a=GMU0&qmepF>)gGkas^ySA|<fJ}>hWOdQ^M$X;
zmKOwS_P(N1Rd0~?CkDpu3nD{RJ5ZVrWlFT&fUb@z6*$@ezfbugt>;-y2g>=7@T5Hy
zS@Z?Xb01@lcyGZ@jn!bgNG1En--KwCO|q#<JDHf@twi+jeo(Vm2gC%I@sbaEf@4#?
z(AZ)<sJy=xR6cS5Pj>#c4B3){cufb<4JilI^imsn*%(0Kh(x5EE`(R@sDVl68t9T)
zJFsowIjHsiE6=1r7A4t3a(b^fPu@C*bpL6Dn@biWofXnhcHR>>yc|#W37gPjks|Oz
zm%wfRxG;WA9D6&=gfPDcu+q<du>12#GRHLsKb`9c+~Pjs;@&8x{OJI5I&TEZTv`e~
zC|m~j_C4dY@%N!fXD`&^I}hc-A+$Zt9hpzgMOzb>BAu0iC{`sN39jCR3_UJm$F~>B
ziUcDR*UOt`Gpw*S=R7HMvH&F_7O>bh2pUTy!^oBOaM@EqWSH?57B&?^(k}~ho-bk7
zMJwZvUkq@ZgFo0;`k4q>NRlmCzRcQST_Slz7$-G-B$6?MAntG@@R`{IaxW_}a|8j*
z?n^;=Ga2Oa>;($FZ~~bsy+O%C1R2}!K^al=VNsbL@>wK|8b-^}v6O=--cyE3E-Aq8
zH%pKUMG&N``m=Wu`oYiXEJgCNJUATL3+oo0fMX*Pu%=oDK6w6zC(F%Y0?n22nf6Gq
zb<cWy&z>a~^_DozWr_$)NZ@=;Co;d_3y$7=8Q08f#7l}WXL9FOBC2YMY7=Io8Obb4
z_Y#0L|9HsvW-&60glO(-iagKPqIV~@B2pZTWDBfNeAP+P<snOgB|Axsvj@foZg@;-
z2}mvO=Or)o#)F*OP&{)sv{<(VE(oZD)b0c)_R1g>6?4XGMSX#JVHbWF^$=g$@EZ4Z
zbl}$C|FJh)-V^JWYse-UOcb0h;WthzIps6m=z?X|Xuon4Owe;f35HwXO3b3H(Dz75
ztPnYMRiYg?^w7ZF-6%;j6m_Oeb2UfJiFunY*`u#dmR5ftE2M=nAUB9%Y9`*Wq?-Bp
zx($lmoKNJMZ{wGt*O?S5$f=0qgDWG(8PA9RfKHP(K3BDg)sPp!-@1EncvwAqx8@7c
z{1#1IW~{=0R5`@{=WkAe-7G3<a2^hYhCwq2A&7;;;WYh%a&p^|J|M_s|3l<E+YhBh
znWKPvR;W@?2_=f<gAnyQWUqTBG1-2dWC)1h156m%rWZv-i|j$x$q-^x8%cUroW_3&
za`3)u35>dQAk<m$23*GL0Kb$!Z|X(}`!D7lzE^sU&C~vlL2ECWSKNomw60_q43PO@
zTAZe;PV(?@2$D<v4Sshwz_)Wt;c(MtDCcJkKWz6y1+zD!;OV*JtFA>^*Z7cu!)25-
zy8*8DS%^2xaUfd4F2wcA4cuvvLP}Ny;mHN}iF*HHlK9G)M1GlrCnZP8t#7ZG=pDMC
z?o0sSO$0EVXN4GZgM7>pf62<9>E~1jH{(|aE|J-HM8HDJ0=(hG2-e8m#Tj4*f#c{k
z*t2PrMBjJ{W(K9<j2;JACd~!E#d!!utj2uZTVQIyT;%p72U+RuLaB2$!Zoumg9J_v
z{<R?hPc6wM$7MXom2y`cb7UqtU71D3k4zHZzn@9^svqP;KVe?psbF^WmVzkt^8oJb
z0t!pl^G<EAW=h9{n85xF?5t+Onpg^gFI(K$>P;t@I?hSX1;>S8Hc>%FhC4~>H+#7I
zlMpUC@{Vm=?gt}-O_+q-&3NG{2eft20m-D_Vhm@@LYWH}L%RqCP$G01=WLKBUOUH#
zWxo$mPXC4X@oY%l#uhSoYJmOzM}efCJc3I<>|%Z=7qVG*I~f_BP>>=ym$8@=$tZhL
zurBW%7U%D0h8Hm)X7fXkaCip8w?70_t@~ErsnP+D?BgO%uN3jV*iU4;dZ~=`QS$nS
z3heER0a3J$9{lr|%u`n(cC%)~!N2k7+++w86HW&+IJr#v348n`J^?G8QX@`lp5hxZ
zOK|CtQX)4e8<-etlR5VPaNFtWZf`o*X*}1$mbvAD>yJE{Ik%lSkE8iO)ZFuUYnuev
zYH$*&v9ciZTP~P;Kb4u3InBFk{vM9E8X@PGgv9GVqw-<CWXFL~vg4#9vvqI@Q6EX5
zYmY{vS^Nz!VVMo=%r8N?9W$WRfmBBA@i8c<&k)^lZK7?ofhb7SlZ29N+@5EI2V!Kf
z)4uIQP2?7yQ6a&^EN@^RsV!#fl5c|d9S4|EWofn!If9mt-&x7M7r^X?6wWRPWyS9{
zv#XU;*_xZ~tgep};!|os!kOa4vZD)E#XTq7F-JOQERi@p)r8NyHj;*$ifCS$APj%i
z2V$&KQC?;`7;buIS?tsSy_Y!SzV@TU`${mIaC>^M*cRZI<!MmJM--}dwGspSHas!r
z$nkOF<2|EmId`&-Lt%y4%&BO9j>@;0P%||Ppim=r+pr;sRJX+OR*k%#FK@8$gEUrK
zxf>}h7e*WF`Cv(CJ5~9iM59(EP{W;wI?j1P_1p)km&7ILQtVG&xVwVcDGO1a{v(jL
zI)in|jEAjvr}b-`8?isX748#wjrpw1h}O#-s5o(q2;?s!6~D@f`QuDh)YQ!~)xVIb
zdl3(QM@leX#~y=<F?}c)wh2T`jRNC#emwKnNgN~42r@^jSp4ZVFK;phfx&HvBj1D+
z_uA3<4Q(`kV<<Um?@lgQW)nY2c@n+z2U>NnnJ(n^f*Ir)%8m1d!W}js+r|`$nVaIo
z@C&@S+iT(7+apZvn=t$<#~N(+{6ZqjAL47u)dVKCv${e(jF@!`kbk2C8g&zx3JXzi
zDfbDJwRt5kttS%rk5i%;EP?$iFM)|?olLEQu;tYoiHI>2K=TeHq04uTseVv2UF_Y1
z)|`BTZ-_0W8q3=8<Nhl|&?kfjp3o)Vb=IJqkdsh6_$a7QYeTd3BC%NAW2hGX8Kw>Q
z;|sg`Am1(pa$+_nW)eT)lHq0e^ZdU}o=qJ*8P9PbFh`BawQyw&h2z<nk`c_is)tj&
z;z5eO7+HKU8C>004Tidh!Jb_KoNM$JirHU+P6~%}`3BCAhpq9Xz0enF$5)XN{YAt{
z(1?7$zK{IAEd|bk5vscP3_3S96ACD%Kv6Rtq;P3FOZMjQBCo^)Fno~guDbzro-w5R
zlsJBKiGijA!FWl94X4-J8*7<-<o$GAfp;`JGlD~d`1d76%sv~$C!FU1mEUo=WkU+o
znxDv<Ro?=nANX?)I4?r+qbVrG+>oms;zNB)Pf+vz=kQ$IH=1;6F*eLO*33WaBes6v
z1(GKBPv2o{l($zD#2?TGpT->F$48$qwD?0(s=nf#Pclj55h?H~t(E-XD#L&)qJY;e
z0;I>^GfSML@b2YF>|jAJeq-v%@G`#=@jvFQ<Jdb!P+~iKGWHHwIAIJX^#ZVs=NQ{y
z{ju@qYGaiBGz_KfY37<eFQoGqf1&w(7I0}@Dm}1%Ho3G}gx0jJf-CuzN!FLg<f5N2
zy6{pPOWN|7GQn|}Uw?+>wPld@TP@(BZWLJ&{{&=At|sS%f5QTaZoH><ntxjGhokhc
z121KovA@tPA{)d5njPP;g_tIjKjRT7xj)8ccU}aSPizBx89nSQvnXtLu!1*REC+>U
z>_e$li@B>~8M;DCn?9Q@fk5LunwBZVY!!5D7BtsFrU!14><8z`(A%vjZ&oo*9=q1K
z$jSuH*YgHN_jO3omTb7@Xd?F7V8#4>_yS+QmJ2t%Jq<jb{3I@`{xn6CX$Iq~H)d5-
zi0QgRAo|%d5bfNDL(pp|U2>aqe6uTPnsCCel82atFmpCz*~g|k873%jn;VKPkmfoi
z?5DFI&ZXa^7_{fZTzYL;6ci`<^xry7WYKHPzDm!h2RMgN?)g^s=6grn_I*FoaxVl7
z94I9l<s6ul<$B<9=r7iIgB)?mb_RQ`yP?XiB}89op=Hr_KJY$V31`~b65qXNL6&nN
zRP*p4rmMHY+20oc`LHEmU{5YtGFieTlyO1ZZc&SDwjO!5zDF6MI^5Nv`Bd~*a<jl~
z7v!&AL(^FW{6V0*S$rr9ftICk-IZ@N@K6`Z+ZF+RDjZ0v5kE<2d<u7@UB}Dxy-88d
zQkbI>2Thp<98(xTES{YLS=&e1cugrzNcCFqdXGNO?n(x+v7G{G;pXu8iFDk%GZMU=
zYGnovpN6`*kBHnITehf?fhrjtJYM4{N}Fkm<{9d9Vb~bi_p6~<EMp;D;!LP$<9w34
z#k^T@?PK`)m?qp5c9VK{mZH4q&+xV}kF0A@qKX@vV1nlfxbN;<D&kcH8{Dj*S?&h7
z7q^kdrLxR9{wbLIX#MitUM9>mY75%mxZsLD0-{}6nBZIiZWJOAmAjC4e1ks>cFShl
zb;Nm}{2zcHt~Xfjk5gR1>|Z#18-sq-ErdEvhV<Qk)>P1~0N)n+j*?U*n<X7Xah$&+
zww1g<rI%oIF<%76uhIZYihSgku0E^}x`PaBAPzSUfj2fvgEX5*i0^GBNl#W{GD>1#
zenA{Nb{LWw?~lVdb(0`OM2pe35QI+lcVMVwFq1!V3wzJ#gziiGpwd(pIP}sRtZul$
zuJ6CZl@~HZi-xYF{T=#nSn(qDxE(@|t^7k~lwXD2>uylTyerfw<R`R!uLQf?3Q?|>
z15-7vUk;3%C7Ij)K~j^C)>&x5iqRYJW>g*1V5o>JKN+xQGmb;~Cn7MaAQUf<u^^U3
zOF@9}K`6IF9Iq!kfr#&Hc&yWfoz+!mc}e^Us7g$PryG<($+A+m*wl*G<W_*53CW>y
zfY9s0C1{V{YqTWm0m{t02M0d?#UF(W$cwibL{v)&o=eMPFWGNK3H3rG)G-bEdxqhU
z`wHMr>sn;g-i-t<E5SQo348N<1_GabVe7N$-oElZJe@6xbxxb$*H25ZpZsNTc`O)r
zm=dDdn#}Y~DB_(LEJ56m1eO-6h1O1kz_X8SdVOJ#^QUn>y0QK+N{Wm_9o^edzUE0}
zv~dLrb9jgnru5+R(lXdD?E&AI)FAX^6zYC_f)e-mkm!padBd_MI4Xb-Tv^!(Tkna$
zuDPo4Z2J?qe{(VtD{w&4Dw|=7b{squ$FRJ|<2*1c7hgZSlBtltj-M?3NH%CI;NI#1
z+<kKlgpc1N;%)?}D_gPF(bc?qzaGZg;w8!{F+`rv@1fJ}@6gtgbmXPUpzv$L=zLl;
z3R%b^|0nlR33-6ddnusx&G%8FT^pG(Zb0~({*YC5s+_2^YrzA_CqQDgA^6!o1an7i
zVQ=+H*n&!-S9J!wtee|VE7<{(oE7lW;XY280T1sC93u-QyNGzt65KJJ4+b$S$b7Mr
z`1`b{Ji99xTg<FyG_Lxitg;<wdx<v+_Ow8|l$BA)WI0M(xC)&#FGS(en^BgTE~?gk
zgVOxYq0FazC{cJN-W)E7h4l9m#jJ2VJMtt5HeCxA@R{<w+xVc_?ohZxGYG;7VJL*o
zL&p)))azgZB6d7t&ZPcjt_Q`j1>d;%iXz2Vt`+00sr}eNp_52XE+x9M()dDlFfKbO
z%?T|tLaCm4$R%6@#oiG@iP1GEGP@8JjM`7Xv#(LIPys64H4_zWcSUDw$5D*8A4=%Y
zU{wsq89b{Euh3b=sd)uCw-daXWUoS?e@zPHoy`D6af)o!EDOdkJ|39RmZoptO~AT4
z1?=7xKfnytt$0p-Dn29g6W`onh~xEl;>&NJ<1cV45fMIvKXzWhtiUCXPQ$e3@lZu4
z{M?b#mw)I`xD@i!FhlV-m!Py1b##&Y9+hwUh-xz8(A7O+=*-?Rl#uv>clKYcrNII|
z_KW_0_CQAn=gW>-*1dfkB-eR@)<hSkNw5Liw^Ud@&zsgC0^&TS$Q%%N&L5K-3}?&b
z41CUqiyy9YCCh{s5s|O<`0ku4T&3fRdmbJkbL)Ay=)MnQUi1&8UKd6O)+ZykS+dCU
zfG-NVw*{rzJVzCk-ssY;OmwksHOhHDh|=Z{qJ;9LC}G`N&^GN2JlH>&UHY0>7a(H4
z<t}+5bb&;8_F#A09K0LvC#u#iB%}Q}o2S~ttov68I>Sy8&A}gR=S>BC@98`uSn-9p
zZvIXjj#=US*n{}KLN!+Oe?dfJPU0?0ZKh~c0;L}`LoQj{(V>hz=+GH)6l=2;rR|>f
zQ@5ni4fje^hB{G(QWVMtW$4JA6cpcn9`2`C$xZ7F{7PXJxzr*}sv6{})|S&$;N?9s
zolFw(E&e3YD35%XokNV(4S<z)5BOmlOXi<SVkfPK@B!n4_?PlCqPOBB(VVT#rhc*~
z!u}tC*`h2Q@=c3K1RrPW2M~Jv(~>L@IfkS+6~O=GobaMEs?@n>HSsYtLpFYaC{o=R
z>6Y6PrHjodY}b608CU{u_C!#5<V7Tr5UHA{Oh(<>>9VgCbfMfbGCNk7RKd-doj*t(
z{|q1x^jCtMw|ZdR*d1)w*MarQ7J$rM7)w?QvWC{Tfti2`;23f-@%INW#JWMv>R|jc
z<Rzmq`hY8{yppixV(=-tN0M?jldSWqG&&)XDi>aXX>0bO73)V}ynhxcelG|2zU)QC
z!K>h~r4RkDEQ@G!4v~GgKa;Pk?C2uL_mn^IH`v0vOn!cO2IT`@lhbG>ow0Z;xHvhE
z)27+COHWep%vNv6XeWUMW({Dt-2$$WQUk($vCNpx8_+iK9ja~!1tM=&vImvTxVk1&
zw3L$$UmjN_Zi-`6w`78roNc2?0iRJ~=U-wF*om^*G0{2^LB&2yp(~zN@OV}fDfV1U
z;#OlQE)-7AcdsFDb$(zqw|3^u(Qa~8a~@nWvXFUQQAyrSdIAn8B#HypP)mfL7xN_^
z>Kk4K8zfC(D8DD%KFbKNi2MQ9w07a`pR&MEaxOISp={r~7hFbgln%f4fjhUPvUktO
z)1$6Sn)wB<Qa6JzbSuZ7{9Kog9z>qOM^0>_+aIc<+Q2<{|IZxUqgzBgo)1CZX#=up
zT9-I58=>AVCt~n^4-?&P4+>}NftUehW_|T$*7mg<ne}-DO3!Tu8i)LV=J^v$A5w-z
zI!)k<QyfnDu^XNVH72U>I+=gL#?Y<Bge^*4$yL*xps}iM)D$S8C3i8s5dNw8zv<jL
zQ!E-L^{vPG6<WxnP>m?uKS%W@<xy3PIDU5V65jIIlnCo>gVu{HNO5~E%*|U2uKgP#
z30+0-!KYDDVRM;X^r8)}E&9rC2?@la>aUo6InAIf#E{I|aG6=DeiEjCxPbW+G=N@d
zB-B&w!;{Xv?8K=&FkBbdR4=|9ea&g6VLnT6S@aSVKlqBCc_2g^0>6{h(;ihU@;q91
zR0M7O+s#2GJ@C3&(zHfO#I50mxXLL4%SQdd5m)R<(XTs5_IUvlpYxk2S8RqODN2Ml
z5915z8=wbg0jKv|2uSD)U`J;jWxg9-AbzV4fU?78;Kvay9Iq?{MQ?lao{=o}tE(;+
z`tld=+46_kvpAZoTPH}T?rp$_M4h2&+3{u>&)e9mxP_kH6^dj%l~B%jJ$`#coE#I;
zCK^Z%UCFEfZSU5Ss8wgc)~iPeyIq-}eH)SHyYKjqTQl_5b%!T6=Md)x18o2GE|c<l
zKQsPd9K^|+G2@L}@tVK7L{PvTdWFrw?@F(;9B&@@<5f)**{##wt(#{xx)I+()||%{
z9b9D}Oy8}qA&E<dagE2_<{5%oq+es&tF~=K|9KfrzYpJGMAT{05gde1omNA2btMqa
zSWadcY$aN{R^<2J2`J&^0Rx+DaAxRPctV;5A*Qd%j4o%K)tE&rBi;ZNx{4WzaR46{
zMd1REgN$&0GYqy^K&&JRz_O+-%&TSMMEQ{(6i>U&i}>b^_u(p*FSmnhxb|GL0D4U-
z!<FbvwQkyN{|<!bO;Y=VwJ22e89LEA0)9S<Co50dllsGAs9w?%;6u9LZ;KhZgnY<;
zyTeE_WeisuMS%12TcA{+2a$ET2>DZ&vIDboaG}X|j!mC8&Y5h4D^xeKU9}ES>qrNb
zMK(k~-w^z8xQ=6@weh`#^WfHTJEr;Hdd6PdiY+o-$Tb#xO8bk<simGN4YsqV29JYD
zk=bLaf7KIthA&6C-K#;Y=y`HCTAh3t8Af%3$AL9Xg&8Z=NQh`1W<vvzXxB%!rSCeF
z6<o`VAGF1ti@reVu`2vZKZAYO7{i{QTS4SYO`(cm5qQk>!%at*u~YNnv0-2`cprZV
zr=nUMebt!x<Tc1Vwpqc{7CNwvb_=*>oi_BUs~oM%NTm{2JjgEoJSv)UkNo|20i`&+
zfgiqUA~oeqa!as-+<3DM)%x!yt|?qN>9COf;P(T1Sp>j>{eC!W7(w3I`@FM%bcw-U
zb2xKOB&ZG!<Cx0DW8=dkg#W-XaNwmhG*_qu@65u<f7}*MlWjVXJo6b}-t`kp&iH}z
zYNt8A)pL2x#yRYYkPNQQbl0q{v5E#aRMMHL{X}ESc5-@QI-Hd(hL$gPM7yfCp=HK;
z)ZN#C*o*0+a_mc5QflGLJuT3E)DTua+yp2zW6J-2hc$1j@w~El60tUq)2|lO^getL
zf7<&JZxpq~sX^K7-BS`!rcimgikUajyzrB4{iKUWTKD6!w`Z~Fz4Q2XXaTS{jAjMI
zi<XA~Z}j?<3YoJ#pIi@{2`e=O$)(%gq<YE(Y3rOsQ9E>D-tk}Xcdk186k<unJC31?
z95Hga_y?>J-wCfC@&<iHH?VSvAij9$-ZZbCM0OANl6~jL$m$YjP&4k1qr9$SJsTe5
zZWf6nE}Q~0Z=A<d7q#%+jTVeqkOJA_mV^(<O?O){LAU~oa73@Z1X3<}jLpL+bX{AR
zm|fH&D|(m01bmZdeCQw-ElZKWp##WN+=E$HAP@D5wW;&REj0L4Hp=C^BB60w@X$~$
zjN&_uWfok(ISPx3{6|T0NnM-7zXc@VlqWGccoND)_~99@9qf6#L15W&1S`Khgimz@
z5x(iacY&|20fnmzi0<MRp2qG~;H615+uoW5mg}j3=B*}7(41H3gU}Z6AfX8dW$z%*
z!yl6F8FT2)BZFY^y-v&0&5KER#Rz^-ECQ5bFOsv{Yf#>fmBe|@d>9_S5_}&BBJKj$
zan+hzAnely(qdgrm=!<46|cu+#T-ZGKEE9l?YwIKq}GHv^K&bkJk6!PIFrIAcPU|E
zyFDPmHHOD5Zvg2vTfp^qrogf$lL4mF|CJ_JGjlxXbbf0G2d@8s)_#GccHcMhbG|Dz
z5g4bbPA~DVY3&_U>;WajSA&pbb29XB8OnP-n`}U1Fnxy}lWikK_|Nx{WsTqA44)5V
zjlfZyb7n927d#t(KlTiU{rwEXm3LZ>mY6Wk2T!tPAs=w##W9crO&IRw*}!P34lXF4
z00o6L?2I4904_SpMxEAUmleoaI=&~!xk3QBF8T;;HJ*~k_8+K1q!TpAttJEGR!Azt
z3o)IW$fv6{P@#D_&ffS0UHGg)BJ(~%|6E5LJ@E>JjZ2a_qsmY?U;-qT{R5J#t^q0j
za86Lx{}r5hK+W6Nhbu{>lAi_|B`ViY3QcDZCsGOtWy-7+niCB~(V%(MJaH?LRFZV|
z-rrD(M3W{e!;yq5nJ@P}@4x&1z4qGcvz|u_R{kl5?Os-biW}=dRsINd2-GG|W;vnB
zQ?=o)dl8h^@e0i`+6wxnUSe`3?QrR*LsWnCCoO!rM3DHy1n=7$h~117m}c>2cJuXP
z?)ZT-3{<{gK~wAC(TxL0>i08tT1y8#wp2m*w*t<~w_)=?gd^p#MYP5IEZU%a3!D@b
zK@EpSl>Ea3_>|vcr|0%i?WW(LuVjd(J>5&}UNdmZU7yD6g1pqpjWBtKDkM)Osk!G<
zcxuQ3xXMh>;DBwS_hgWsUfM`qy@=q#l*!n5;42Ojs;~o5di4GwJI<ZYabla-Q?Jx|
zSn|D@&3c&yUoCV%nL~T%i?5?NC-D;f`aK7x-JXCOGy75Wh$6i41JJLJ7b1z^E40}s
zoIM>YB?W;CXyip_dVirf39v%o+4F3=r}rl4l-fm1d{m)JSrYL$^N0wGhk3d4jNw|k
z7H<3&LTZ(RK=n0?`W9DLEWXGaN7qHNg>x@sv62850M4_ei}u3K*k}aHO*p%W?W|fk
z7=C*FkXx7X6lco!u|<4m@J{%dB>vfoB>OI)zuuaFv31(;s^nJ~==2pDcXy(Vk5fUO
zTZD*{8Xys2>%g^F^@Q$n1l>{ow5-d9gl7NXCG>03aE<+h|A#~KBDJ8O@QuX{*IY=R
z6kGfW%S0VLHaK3}2<7fSfhF2zaN@Imu@_1qfJ)w}_s&)04i+8YCOyrd1{NALuFn%^
z$Ysz!y%#{*l2}mD><5=WsfBLkweS@2hvKO+aK_qXBt8BB$;$bX@_r2@8<|0txAZ|3
zyAUGf5KGoKU#BI>`Q+LDF)&flNQBugbn(te_{GSBXObHs=$<axmAku(=kYNJ@l|i(
z({ATDiI-_4;C&*yvXs(f{R>FuR4Hl;uEAyjJvjPPEMB}y0#8464yW73(yi(c`uHz|
zC}@;+<^hUs>X2W>j^MDqC^PO_gQ8A(Ba^m9u<YN(D8{;7kfZGa-J5NA)~7TCgCRmH
zZmCAvu3xh#Th}cpZ`PnOnOmsaK1JGo?>k+(X%9`Ck;+S4#ZW-*Fiu%(0o_Xn(VZuY
zIYpJJ%+KQ=c-gBL%{G=`$No%(fB$!cmMxB;cg7Fk^xi?<_u8)@E|1aWIkAN8a)5z<
zUx!{wwV+bzB|C5>3x=O{<UOspOdDE$)5x2gz{t@BZa%V1aHskt)gR-NvrCQ<gZ3_S
zzL^UU48G*WgIJnn>rHE19#DfdPL%R$c|L8XsMvN2E|+dcYa)(A%QKJJkX$+I)Bl4M
zm7lYh!LQMdy#SpyIf7PSO2V^zZs7C_=b>ZZW4Pj^kYsK9#AAK)pyt?Bs6sMeXk-Ud
zi&@DwE$U(?z6`Rt{y!Nybl*a-#Ry6-ET+cvHVv75i&XZUC%;-->U*j;fpbSCV4mkP
zs+UtlrB8Q}@)#u=Uu?l!e<Kgw)zigc-=tZ@D_?Z$&>g0%lMhCZYa@w$4>`jvcUYlV
z3O#2wiN#(nV%_n<I9;~^mc$JUqCOijHPPL^{icM@yL=n`6Z!%En~((aqTR?OxRTz{
zbZ4r=m9$y!Cok4G6to&0pk_*KWbDTw>b0?mhAsI@Iv0tN_a|at)#-6~zFr0@>eN9Q
z^>nyTdo?enbOUPmV2S@$nqV($yg;+F5xW^(O|lo6pnk1Y@Z^+m7D01pP>U1sf1AME
z1OLGn60KlXpJ>M&rLoNwYuQTePi*#GGtw003C$iWA_*UF*fMGd!%K{4)TaG3@Np7v
z!l)Gt*LKilqvwI(UN+Ut9-?bbo+RsA57P%<3xT?D4Qw4f4{b#mqps|Hcr$1T@8qXp
zthIFoPE9<_8doLK@FS6&gQ!PJ=Rd&mllG#dLv!dtD^up1qssB})-n?dhBGxI;Wxo%
z`nsZ#a$E>IIPit(9g(Iwig|RDYlj(zedu)2D>!}jJ@8LUCsOpg2tr$>K<?5|5>`QA
zee+LHV;c)Bc>w^lSyEz_PEGfAf`X0>yn!7paF5R^=yK1~LWMoTitpZvSi5ta^}Ya3
zdu9Z;ZF~~D7vF;BHfFQ*kt{qj_#xI$-3D*AD<kzTahy3^hL#Ak=$A8^FuGQq{!;K@
z3WZicp(v0Qytyx$vzQ_M!4euEJ4jc=tU<EpGQsiQIiNgvC$BVG<SfW?gdP5#Fs@h`
z$ej`rN!cZ&sjY@8c7~JAq@O&C+xEbER-2$vkd60MNaAfpt2u?HNH#EP$eEfx2WW;R
z%hPsYautJk%DZ_i%>Do~3)~DR#G-J%d<v2(aib4!7o#Pv8bnq#0fbuYhl!4Rn2z)T
zI;F}4y1(<aa6fDWUu-smSE2@ZJNh#Ley%~MHG05XgE44Yy{JyxYmu#HA)-Iq5WwFT
z(#^+C!^JrRbm`BI|IbLs`rjvTtUtj!f}M%~-eA^t$dijVG-H!i6K>^O8Fs95fha!^
zXE)}`aFY9<qWnqCIB(-^v^-6UNr@St$O|)&`|m06*_tC%QA!#J)wQATx^dvDxreTP
zBSwjE82s9Ji$vbE0xxqjsBL^6@8Qs57+bOzD%+S-<F+*T&v+%&4!lETyZj(GCxbf4
ziFoOWQ6f>e0*5Ic#}fQPc#+d$`(;8oHMKe#w|p+LzAym=`)yJ4(?uwzz!{Fsvp@?Z
za&e~Ye6(C`7mMBMfx^e9qu9nHNcxryz1(pTZAtkHI(@Byab04_UpT^azWrf~ulG@L
z**$QZ^&Ps}Ih?v2O(b8;n`w^1a~jf-QeXOQ9_Tw3#8Y*Wf=@+E&UJAEk?Wv?m!&)n
zSEc-h&Tco6Yq2MKQeemGU0-riBJYPu@jW`$@ISo5RtjlKh|XLiQIfA8PBu<NN#BN8
z{MjhD>&!lQu~$gW{oBr_KEETX1x?UVWgq<Dz6zzH18nL3NM^Nft>Aq?D4eTULw;P%
zr2R96bf{2^P3xLP<viP|pz9W#w&4c3{#G1va0!uk_Kw`vC<0OA4!A8)olVNRfmLq^
zu}tkbHqbGJIW(?CRXb0?VBKD<U&ydQ{1Yyyy^dRH_7_f^n2D~*AQowLQIN$Kz1z7e
zEZh4w<F7AeHVeyOXT%GTJd}xUT5GX{ibHIk`3q1}7R$5Q-X}P_T1cD9{Akr&G1{9K
zNM6*Rp&ut~p-H|msS9c#K~k-7P3sAGKFtgyYD@CvgA19<_<h0qPGfej;}~~IznL?y
zJi=<e4Z)em%du%%H$Ld~igTE#V7okcIJfR2N)_#|CCBrajMh{pJL<%$cKpq>`kpZl
zUx+p&|E4hsBdD(UE={p+pf@7A;R5?KdL|2lIqsI^h`J5Xv0KFR+<3=4eMT#3c;pDD
zzH$Uh9{5pu$`$5qaTL|h9A1d^8vZPmN^Z$Tdt_KDVuA~PaNQ|+ocr1_PVvDDJaw)Y
z`pm`Po^C%b>6;|G3pH?Iyffmb+Okx^G&W_#gxVjHVMpBJnNvX;+hrVzHjsazm1iT8
zkUa#YMSb`9dwDpeNl2^X{2<7#0fV~jFnsq$c(=X>Oe^k$#qu{v@42}!WqBUcuu6y3
zlA?LPvIS8nFyd<sO>l84F3fvx9DFg{$I<Le&fnaEThM$4>l<P;R`?3nk3QhmPub2c
zB~0LpFT<d5#Sq)ODFE^UXTZ0e$EiZeW%?^%H7V=mp{4WxAdOW`G~kmN^sD+Gw9MAw
z^-fgLt5qYgf2B9vn=t@Y>obvfq@j7iH6xhUT}O{8#K6e%Tqg5oH{kk5p_bTA;*#3H
zSM81Ek|hMtTeBZ7wLiuQ1P)x%?PN~rs0y|`vI~b_G{E5jq8V`NBN`XH4_{Q41TB>=
zG}`$AoPTmYeB(9>;=Hahy{JyuBd$lB;t+~*ZeSZ0Ho-axQ`of8Ur@TGm9B3JL(_Dm
zK;zdD@+B(>i92w-DR1w<88Tn!PX88SXRwF5YG#qrkH$c8!i*S5C-Z05Npa`<Oi_ZY
z4P1El5!dW=pUWva%EqR}VyjO{IBVcEKC-!&TUzpzO*^lM3r${9yV@!=wJ{oQ{ZveT
z7{sBuUEOR&d!!%)$O8SmQKT`jo~dV;ftdS?;XhUbWbr*C!PRYN;XCnklCw^<V^v}W
zpS_ADY7x80NS``M`Z`80`UUeoUrDA5nva0vD+&eKH7R_}^toJEc^v$hSc+<j9JwYX
zGw$K{DyCL72eo%@!L_!P*gxPAhx&y;Me_sBPZJ|S9xiC()hWcYHH%G3o{F-f)tEuR
z5Vb8#0j{b)iR(u(`drzX+DyHQB&&;wWmzZ)_Ya1rBm(QFPAQ^?R)xZ7GbvDEuz>PM
zqySspPH#;=1a-G<qlRda`93Wpfg@|+%TKCje=>5+uoa)#N>y=xrHXTH4Mp7k?ZbG#
zPcIJ7EyW<wlEViE(ZQv<IInLboa4I*?P`c6HR`==rjj4J_4I#iVfiS%rhXenU%5_u
z&K#!kpW11<({I@S%9iY3*9uyyR|Eas0`R>ui}%ISo%;S+1FO@Ep;uid#rEbD(*^KK
z*mZK|nH8CIS15=(wt%lDVn%!TQb>H)7xpDRp0zvMa&NxA;xrG=#%oMRafDGLRun(Z
zh2J}b9xnfda|$wH@Q4s?P%?y>2j0<9`_JgK=}8*;ONrhtm4KP0I&_=EcRFwJBwEy~
z06S{sNYP)7#L#XG`1L)4CboxR%;GNk*kT>hl)H@P?4C@MkQjMY9}NqG_kn>=M~URh
z`-024k$hdRPL@M*kbZC<JKdnjF6k$8Kc5tHx;rYd$BhEK_tas`zc9?XjqE|2>dJ6t
z`*b)%SHvv*H-zS;?5~rRbD?j`ThQh*Pn5S@m4)&8Xh`lT@`!&-v6nNfy?ca=*jmxp
zr*1sg>3g85VhHSiVNO*aZbSAzMiBo{wcyYRU%|i8Lom!}1ZKwd(G?bc0^1$>d|mA(
z`kMVj?dNtgwCV@@Xj;Mz(h|<h|3AFSCj{qPEyu|J0~ZxKf~8-m;I!>G>G{h)QQH08
zaKimIQs@!&s$$<!SJFHz9umY%SDQ1V;cz4)`~p+9$%q(}J>=K@I@;dwn>^E42j)7c
z!q=0fV7S{Qw9K^vNiR!<x$$ekB)6U5i(4&{?5?5KaH+-U_$Yr?zX?kGy#(t?HM5FM
z@42kyDV(}+B{o^+$QmQHvDRBN{PwI1cS0=~tL_)WiB7}pgnu`}i2#`?UPY1emMGzX
z3|3e1z%wolpcN+?Xz8#Fntx67Rk)H(g%V=qu9uJ&SYD=~r>d#Ab{9-|w*^TW@Zs^Y
zLTd9^AE^*N%yUkpa=|)CwWf`{DY(z;iAdth-x2XLZ;qph*i9&I!(wD#<;)%kzT@n9
ze`D!<QxT^Vip@)xbMsAAv4K(>j;nmn@{g6n$mKes9AF&fl`KcxmU2Ajofmo)DMUd?
zL7>?uM2b><Xkf^Z&YOIYIBlOp0(}qCqV;E}RLow1ZI;N*l&Vhs?c>;N`Dl=nuL4`i
z4<PGW4qLb`a6uy$h_wUU;-rT*&VG!~TG(Kxl1hw1gVA%@9XJCA;_o|xu-BCg+#(wQ
zBpT${K<^72zfO~RC)*Nh;|^Zot)nRFPB*HZ`4{TS&4rx7d=wje0v+2p5AIHCz|xuD
zK|-cbaE@iduq&TPsdOY=BwkD7*89?5iXm)?#5bn&%!n!YnE;<MTf#bQ$u)63SZnoz
zSL3@5-`LZG&PF?6o8UscM*lM!ikO2@kryuL|A9B3mB-Qd>Ts*-O01l<8gK8}i{ri~
zGnHxW0ymN8Wts98Bxe4bTp!~jXN3;I1Bog&RqQU(>sLY^|Gq?>D)mULdQScIW-(Y_
zT@Dg9zvfkLdPR%J$C!ASJDWY}D2+Nl&g4B0)4W1YngO(!j@?#Je=v!s{@NNhwWgte
zo?2k3rfC=fZ}jnKJ3jDQ1E*%o;;o^TxHj-_-1$TbM=>R=mU$S*rPvD6dc@$jj7z-1
z2Xg?hX=PF;45@x_6x{vx9J{vV7!@R(H0O&vXOjO5L1zN{1e2{@;1TyWUYSn~xUbnk
z-8Cbq%Y{z5;Xh}nYwu0H$s;D77(&a>er66={HSdIfcX{M9?add#&ay6VBf}C?5kUa
zd}sEdH#2l`#(x`7`@{phr;(4}KOV*h*6LwfKR%8#REOgER<L@5Hj-bp9{f~PBC|aw
z!^*BaH2F*uQ&d%CuF{X$dNY5vD4~xUK3pJ}KFtMO`J7FhXU!7)+NeTagj7K7%4r}s
zWHvNdb`2I4i2PYo{-q6@b~DM<k7$8Q{q49z3o*B>7SCIwi%%*X!}bwr$hx2uab_<t
zTV#g@Tdnb|l$ZF{6FDsQ;4|KR9OAfrm*{LYA(aTzgN2<!@bgU=Ox6)cZVFN8aGN1<
zIOof3Qg$-+N8!vq(TpZKwFsm^CYiNmE2)imM#dU`5wi{mHRjZU?8-&N`@auRm>mn=
zBq$1cH^tO{IUY-sPwL+Gne!4q{OEu+r2BCGF<0!Oas^M3)55o`4&tH7X((I7!rU8P
zi;a$MhO;Km#NxeWIQF_7>2=8GUDInN{}ro~TK{gCx#B5W;d&K*`%woU73R@gg>t6b
zF;34`?xRn?4GRq1#RMaApLyTn{!qs<HIli&pPU{sq$TqajgxAJyX^9zm)vH!CsgFo
z+F1_t)`XfWYE<FNcCsid)C{lwA&-y#oQs!4-oSTvI`Jib8{;j>R@g{bg6nzm=zE?9
zYMonwV^5`%%eWERTCAX3g33weB?fYjn*jy2tHfg=f=zbe<gvF8-Sj*fp2tFZLBdA>
zI5A$A(+CvvQzf4@zLJy1qJCp9pGicu(TbkSV9$IGE_CgNJLniJaoPu77}}eIK8Wjq
zKDK@S0lN;h;WgQkNbvp|whWfVchhX~%KNow;Nm45R^x_eet(9=-cQ9bZ?nkiY!P!~
z{*$^?rBRVjjGoF|%eIeuQ|4X^6YA<oYTE~Jt=$NI2s#VY2G7;^QzPE77I~0&RasEE
t`-k9)%z8R=tp$Cg^P1XUyG-P&eSlq!30&SD4fSpYL$Nq<^RmBo{vSqDMz{a~

diff --git a/src/feat/test_data/test.wav.fea_htk.3 b/src/feat/test_data/test.wav.fea_htk.3
deleted file mode 100644
index c1c577d5749ade282c4bff7c329f056486eb8268..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 22164
zcmWKX_di!(0LJYZ$t)pRDJzMr&pqdS`jQkyLRL~~YEVc^Wba+d4rP~y@VVz4G$^H%
zrqGbp(2`Pp{RQ`j^Sbvw&--~FK0dx=KK|%JzO#b1P=+^!BMtk=QY8tp?Z6u3{Hy`T
z+6p6sz29ifpAE2c)fi0wwihfdv7)xOg-~6ZG!hxH#^)m6<MH-MsP{vkWPUh{A7?1w
z&(H55Y2hm%bJjW-bgC5U*qntMQ*7`#_X?nT!<4NxZDWci_5pZM6iA2JHQO^mSZa|e
zQJhM|2YUj+>IX_#@3JON*A8a*=iNXf|Mrtw#~W08To7tJ`N{iWe-;;YwZiPo<!HOi
z5#(H9jg)^MM<%K>-~~-3l&l|&)K9;_M{CsCkK3O@vbK|Gr-k5`=Y>dg2!wGut5`LW
zukiZ5+fc>_fmZz#TzuY!oiRCx!vaIVy!oZP*OnZB9$E8Vzc|L?ghmo(@)_Tl{0Bsu
z0DgJO9se~EU>=}h^y9-3Qf?$mMQeQElU!5c6<SE;#{^)?Hx7(9k3dp}<>>rbV<ful
z4mlw51SM}PMS8wcWTjynj@X?7PtVy%EZ#Ht*(Yh@Q*;K;yQ4>d<1_ed>Lykwh{Jg$
z?O6AhAl{+01mDj+4)hF;;Y*+DfUx=$E2SFMoRXMJR>t4Q*D7<_p7u`sHS7)UbRPuL
z{~&jEb_)r0;1Ri79;iZc6p8h9lS_Oyr0;JebuZfwlV!D0M{WY*Z?~YcYMoK?0|VIY
zc7~LHSPSf~z5?-oGH{>LBO;~slyD>p2=9OhNmW}3^M=()QXL`@8y_?8j53Je(TAAa
z4`Uv#wkE0{y4fhbR`%LodobtDD=hYD9Xp<W9Cv%3!r6-+uopT25Pm+vb!gZ{6W41~
zAie|{8Sf!i+U3cIt1Gd|`etgBe4RwOOQ0VW)mZ<<3u<QY6=gjshV}9yWGH$aaM|z*
zj~3VxwF7NLG;9OW{`CmI7Vjhnf7n8?*9K%uvIXw769PJ}ge>|K%1Rm90*8z3__Cue
zE3~?V6)ZDlEEflZIy{%@y?X|K>)Oohh~E#qx)v}I>xZ~&50=ooA*Cco{yl7JD<nVe
zrqOkES>$6+7x@@!K+L+b(CD*aa@Y7AU3zN-o%J$;^7;Pc9z6#Zey$=&z=cFE7bPyU
zV(>$exlpp=2oa98<NcioC!&W6IAf~M!D5F&B38r)GVEl){D`Z}oODe#=gDcnx2uO)
zz=(lwVKvO{4Q)6jiGs8FiQv4T4v#4=;jRgKK)*L%A$Cebu-xt(`Jt>v!-Ku3V7nU`
zXXlV7ox12l+$$oX*-K4ZcB7(#OPIf6kT`}8!sVoqUC$p)lv?KDclMAt1_LNvbeRZ<
ztFnG&gLqCj7aWP@W2OZC@sq@C@M0na_}mBrt4~q(lmr)?+9ulM?ClR$=KN##?&xME
z??bTv^aY^MxrUXNapc-AN}*%7{*Vp6Uy;n2!_>&+FinmwqGpo{B>tg0$&Tei9g9_Q
zZ(KALs~kn=4msfn_ijx4>)_I+Um)U;4{^FP#mqeDkB8-?LD$Di#OwDA!2h(Gaaw&C
zq;_m)vVDHx;9+fWxz!Q)1#h+5l+n!?>yCnAJp)eb4inxnV8X^trZYO-H-XW0HO~BM
zA9n8fpIp1<GxU#n9Mremjf59B(qnZwbkkuo8W7=vFYen#UH2YAe@mqa{%T8p6~9BJ
zht}hab0RpsXB0-7)iE=sx=F&=Tqr8gLFO}wP$j*B#7F-IY2QYh*Sm*<#XBY$Ra*ya
zJl+bt8;m$zOgDbw{Susrxy9+clgQAEH<)nOVsPT<F82Ib2opbt#|Agg=Vg*DToB~l
zB2YdF?^hi_isciuV3<(%rHiPG<wK(M=PZqOA!s67pLi)tkPm`)P?=#nk#yS34oH1~
z=_;ed{--0Esb&G<W(?sL{f}UvLW9VnC)Tt1_&`M0BWBF_Dd%=JA2t=R1v)?lYlMdo
zwIy$%n9)w0yl%ic<OLsaPgo08n$_6ITaH+**NQDYzQ;Q9aU|D@*WMyo{RP>MM#5{$
zFVNdI=gD`yR9fKa0(U26(n~GpxB?b+xOT-56}{?&%7(9y4LA3JS+mulenKAU>QaXi
zje)p7t_8Em9eSMq169^_<NNuxOpS2~=VOm8vwfoj`%gp<7zoDT-Up{~>$T@l%wiCC
zuP$f&1q?uh-CVfjKp*D;b;sAhC3f@K63)S*!`%7maddo@Kia{6hSZe}(ZsB;H0;R`
zjqD8v6RnZdGGQCKRsIdc+6fV!<`z_5HIsNAlmxy?Cd_3&6UwKt4@Qg*kd0>1M20Sg
zRZ@#!HJ>w>Gnq^N>x^Yr#nv%ZcY>Jpo5PrE6P9?Y)es0Al?C!6iP%0ap|OU`2hL}U
zf@^C#I6rdgah0Yen_H*F5wpwX$~_gJpHn0dcfnp7UG<r?n^n;o&_GMq$0PR(%gH`B
zLG-xsGI@|;Oa6^`qUw9Q2-1=O?eU&Obwv*;ewGAn@7a?O9}gTkOAuaj)`1O)nV>rN
zGbtIn$7_CV!c;s~26p9ZIc0|tj%t4hg|C_7oK@Y-^5G54#kfeId0rNa?sdh6+m$W#
zF63G^JqYJ{pB&&yKd7ePFJUCI?JUi0T}-8qc2GPYQ+j#^I$LWG_W#V}%7s&U;!YBk
zSgDQbtk>ZS@A-homM&u0wi#EJ$}?-aa`A}qVcefT3I`W3(4jOG`!@O$W*&?8zZqoQ
z`5b`vp}V{-eK(otiQ`aFK?pxl>;VRY>w#78Zg$SUZaj4VG9HwF4<29sz|`#t<GsFZ
z#}$kFMtk&sL&va}boJXD>g^jruSrbM3xR#8S?w8v^mlNDBO_?~<uLLk_ywx{Wr4+>
z+JeJ>J@8T4JwVa-5+2lx#9gNKO}7mF;nmb|u;uA@636jlvKEvufBiOF7i;kW;oZHg
zbh0Bz+wTi1=d8mK-!*}fxE?S+os3WTdg3qEmvN6{E|l1R2UyO0%6pMo%$;EsN?&?a
z!^*-5y7{pUU7aLETTJiJ+kD5lg69<A+^3IG+}kQLl`laJqB>Bmh#p&)55X3dAh0)U
zGw>a}jX%69!nvOtfWJ^Q%=8h$g^SjZOJc8}*0OMrx3!cPSlP+5efX6#!!!#VkY||D
zTj}i9*(%IH+5oWlSA&1||G?#2rWpQ>Yk};MU~uWcHs1bvVXj2rGWz)WdL-%1p^>?B
z>H0mR^ux_CTI@TME0w>a`B&zAv_G_i%FjDOk_;zM?NWWb&Bzvbi3);?f5Mobi?8Ca
z6XE#m%|)Ov-yTd(F2_yWERq$`46RIfP*t&q7uOxm+Wrc+-h3^Z*>0u5cy%aq_Ush|
zD=Le5i6%Gj)Y3m#C1McV=+npR6wE=^%Ff1DudZ@MCcNpxWraw4n;3Og45S&Uz4VdM
zGFoJexpMZyq%7zWN;}g`{)~;0h!8_m1FP^wKQoXpI}f<u{S8i(h2jepzRcq7ouHsP
z0o&MqCE~j*cs{+k_-c+CEQ;{s>CE@He#MEj&RlQ6R_0u0iVF|%o+!QomZO`ns_$;x
zY#YEE`}7#{HLGBJ*9utWaIOE|+>gGP7t)4TJKzt^Mrt^<g&y-MpnWq8Y5e)y=vUSV
zSy#Ug%}&)IyPW|j@1KvVE1%#at^^!l9T@H&0`6wb!*7IEfsW8HEc~1gH(WQsFHT+t
zMuqBlWJnGs1&mn!gM7^6W1Fm7Z_BW{d<smm{SFpAa%M_>zv7K6H?SAg$9PqnIzZPh
ze;k#zgl%6k!I`aUhrameQ0(^w&A)0%nPE+uHnW?y)I6g_-`}I3w`0lU#d0Wbyox3t
z3PI+<J5i0E0IM&m4EB6fWu>2f1>Zs<@ZY8U;AHc5eC~oaj!1rrpUW=+Lw0kC##dt)
z=HSmc64VK1Uen;*>}p}GEBzRyH_Mnc7Jh8w)qcF7S%k5yc4Q8=eBn(bdNbWNtMQG;
zme$cSeCRh=M6Zi{Mk@ZRsLY27dTH31UPFTP>X0P*x#KjM5^zPy0)f=+%W9;i;ecwL
zzW~`v4-hkN1jMRYfve$F_^x^hPe0QHj|T9VOG*Yrgla&IWwyBYW<QA8x}Fo)<<I=i
ztY(T0OBjbn!h|&~2jZQf*y^1ZYfu!$`=G`$t0F}(hxy6e(GSKY*rfSnqY0OPLk_LF
zr;EfhUXj!?E84E5LfO+}l#QN5fBW31*_0o;zFwXlS(b%NWQ9=eTL`oqO+d@DP*AaB
z2N=!2%--ANfn6W;<J#@o7#Y}Oc(@FA6pI0c-Y$In<7?{~cRPVl#z7#}@Rp6r^#N)D
z8bI}1I99RQV13@V6K}mU%t*?V;dru)=@yH^w*6B)<Q>K3JF=hVe0+?&Oh1#)0Y7N*
zCm;IyrV_pJC7vtLxR5T>S4Jg2hpDS{EPQ%qHmVJ`2eDycaOUNC%qah2X!gR6;XmS!
zGnPPHAsa-*-|xq#89rS8)E-~))MmcRsq@^HnE;^xP2d#u1+SYa1N<tF0ig}0%pOA7
z;Ku8??oA})dSNH_<@?T&+TqDM+O6becRl9{WmB3VDS>vcT}ga}b?F8D?eyMJW%>Y_
zphxnvsl-Avv|z<qx_9Pr#P6?#YIeIqNyBPr+m(XT4OC&A)duE(g$Ztw(8Uc#mL%3Z
z9rIR=;V0`8uvUo)Q}jB3H*#o@m!5GG)Q$gPN3LxFoB$8teE1(&a-^CO8%@HVH9=ry
z&jTJi$Diq5UdVpZ>So3Et>Dgb(WLH1PtZ(a1G6V)P+rek+Pg-bzOK83uH~oG1I8y%
z;<jV7{8KsdUT%r1zxBd7+LvI09}f>(>%q+VCxPC^U~uhR5P=0j<n;IvY+PJ{-&rc+
zKow^^_c)I|s49hXR~-Q7r}~&``Ke|B8xz1&;({h{5OmvpXH|EqfiGE?*v>o#jK`nB
z0bbkK{mtjNTDx87jE^z!{5wIUKg3Tj>1?CFZ)eistK}#}l#j-2Nkb>PrD<Tx5+wIJ
z8C5A{u`P@a98-^E%;g}={GtFoKi&q_oBbJS34apsz5rUw?#J&p$U@~khsctP8CYg7
zhs;<%pXoUF5NOR2$LlN+=v~8rZy%SzbtP*-rap#qW52Lp4|hR{r)PQ9XFS-LO&7T8
z0ekVkM-51EqXiN@zmP^&9-!&XKU(HUH6y>gXsYisi6oBwM^iot!IiyrsC;%T+0~{G
zdm6<78*u~bo?ZfZj4nK7It#YBN0HUu-mrxeMb-wy!~1pRL?>4tpAao10*|V2_N;dt
zKX*NRH2wrg;RS($EqU<pv1agH^%595TElBFjt375Ye3x$S$1AyDH?h2iB_v;q7c8e
zC`IEg`MP8a<>kzyCI7Y~8^us~Kz0>usyj^*XDDKEwfpGI%6X(gPzZ*p+cI6ZML=fM
zDR|qn5#B=x>3SIe^G|hf*#01K`7{LYr-*>fZ%^Uq+Ip<$U_|E4T8f=67~l(LO6-JG
zE->|u0IwE)1D~7f8TWe~K>foQPh*)flvo(g$#`%Uv9?Y~e$GW?>M;jxO|-z+-I@H%
zmLq&0T#*>L408nf;k^x?ApLm)UOg0nlEto)b7?lr&a|8OQLr(%>YxYD3}(WTOVwzZ
zRwC4R91d^Z*1^2!C9wS2E|_$AKei1$%H)WBBLbUt;OvswIJ|!lzp^l9Bf^vL^UEti
z>Ml!sp&$d)6)(lSFnf@4OovGd8b;^lErKN*-QgYGQsmJd3U0|)k~E1ic)m3m)%5Ek
z>+nvn#9#`R?+!$^*Vdur=xm~WXcm6IQJ5$^62(>B5#UDCHJJVFD7>M67%p{L0uS5m
zW!`a&py}ytFeTZ6EvdC+Y8K1lohucvRc{I2a3_>VTnuL?enyf*v*WOSUotT>0O0kO
zi-bSQ6YmxgV={%~QFfd!WNZZB<)4>OQl%wysmUjbOc-n_5Jq*wJQT636dueBfSyUw
zC_DZuN)DbRfu%FCn!YPR8z1BJs1){r;ZK+#w}Rd7^&HgOsDo6ki=e+@GuUzK49Hdc
zV)foBmq~kaxq0aQVm4!|Bd*jLB_dL7*lFSmIWlzsx5f{VJ%U0^!^f3GLw6UMZT6gT
z&`3o&9t+`%^oOup#t|ubtt7J6Z^+W(Pw@8o&*<#$MJUN~Hd0LLKubil(ZQ_{CEGcW
z920LgY-SzNURaIaCtbx8N%AmBZWXaT`jZHArAgu~J$UGd0d8tYAR+^GRuXSafXVMC
z)=`al>@vB7_`q&Koa1=_-&pK~`z(@iY3mhYk>rG(wFZfftte4H6UV`3lJK%fA-ur(
zL=O4vMS8Vv_`c+Q<aG5e*`d;mqTGYv2aX3a7$NZc4OgUPp@EY3X_18|-(lmIJi^pw
zlOhoX5~3XjvWl$9uFnNzq2)O$c=8#pU;K)kuQ4T2>(w~XlJ#(*K><z|cLG6%_i<Eg
zIhNC|Cem&h?8$6HyhS;H{g@<yeVhgG)wXB&T}2jWyu}o5R7{43$2{?y8V|((_8_^O
zFNIWM(nw{0HS)vaq~Sslit;WYF&0tqhLSN#F1XBGOESYx`P|4_KPwV-Q-UlMnZ%dc
zYq5)nEPk;13u&%DM}%iAA_E`qkSZ}1&UiE*-2Jzf@E>~$#-E?SqEDiEk4)t7z|LYU
zzAuoi+P(~`9<OJ%%SAB{Eqn2?cZ-<qhis5b$t1WPcaRhjXYzEihHiOdPfQa2k_YoF
zk;S`*)X8f<dFJa$gZ|b*t!o6O4u`{+D+5Tq#TF=J_MMdG4ic*agoL!8B8%p~B~Evj
zvqK%_Oy|9B;&~{Aw4s+kLO~I7Rvg0v@pque#U7xx<~H+cPyo;BTZ&IL$FQ;=Rbi;@
zKBzW24#q<#d6^6QnWQ}fC`MeANf#KQvueDE-j^XN>n4WB?kmGBraEY2!Y$kpb`SoZ
zJ)5rb(Ir2eU!b&B4wBh)nf#HihY14bN#gPA#Pa5A{Mq0(X+QOacr|W<;n!2)N&Pgo
za)k|PU0DzOLn}a2-!ClNwFMk$l>}ibT;|)=lUQ|L60Y358tBO=!dYtG;OApcFgl?L
zQb-xI{#6ZEc&i?*{N{(|$8Dr@->jwEjdrlFQUqzGxgkUSJs@{n4aKi_BjpPhQnxfd
zRJ^|uPImkycZ+Vpy;%?NNN)tbF&c{-Jg$>ow|=5JcMiPy=`Q@E>jYN&G?Azkr$Edc
z54>?m7TgIl#^-b;LBpHZ;6&(ioF%RW1m8wMO}&@wzz-!(;>T;u4^=HFuYI)n#Vo>A
zb~r#g{+o?#UMf<X@_bq((~j-hCg|w#MNm`fD7@W!2z4+;)G+fajVQ}O7hWBL?^A_H
zl(Z&ri_O9z#;SO~Q#!fQX+o+_?*ye4bD_v(IT*@&%vn5ChMAIJur1V{2;EkMi+Bms
zd9x6dHkyLS>!<MODLy#k&LzkhFC>b-yNFQNa?YFQvCu>^gmtf(<Z8d!N)vylKr6Q-
zYCPUUPrBz(-QuIP?F)uQT@GlI3O^z~{FED2M}5BPp<20naKl?&W_(<V=*&%JDxJ$f
zI-f0>T;q<#XDY%snicrjdkf%WVusHgwuXyaLYRA3_=xt+cqsErm^b^9J22rp!shAR
z#)8Z1;B5UO(3wz9%wJ^@?mG|W=?YmWwML3fKX{6}{ND~*A1#5)?;j!0cN9{G<9W2b
z=rgVHe1cND??ca+7=+DTsm%VF)I>4>)t;RI8+vuXzlt9?Lt7Ob&sGM0=6Pi5gDRlf
zi(z-!a<X68ix|2hkhkn6Ogfgt=<Avg!@%31y&FS$x!+7d#54Tk={hKI6@mMFhWJ?c
zE+YIU0{@;}46JgOg7}wB>>f`|?$VN@^xW=Dcyz^8YT(vMS3q<6Y;QE}TWW&(ud9(0
zp84of>QTD9JCR8LHbk`zsd#d%8hlvg2GcI80fRU9$?<{}#6n{xXgTc-Eu$UC1xE)G
zzE%>-yZwT3F7ZvT<U8@uD=A>s%fQXEu5wb>1{2H4MbJfn0{N|B#7oQx*Q#E{YZi}z
z@XDKP;px}Bq2u$onqO+E_i88DI?c*I>m8)Y^2YRE-*#Fo@Boo}f62zXmdMWMJIQf%
z=4BmAKy@}1B%rPka*;EV+SJbUj7L*GJ6|YlmJA+Cbl{H1BjmxiBgALe0sIUyf?)~e
zJfF9nc$dB`Gn6a@Rf_-P99pv;|EeE`&Vidao4q+CIOI28!k7U5PX&yVY8^fz@`krB
z{UKNS<t=I-GmL8cHOZKZZ;P1u6s>xHmnN@SkD?@YQ)!+ZQl4rd1|wDA_wmc9CQO=m
zm<Ym6XWNj*ZgFsE#E#0K5V(J^9?GI>f?K)dZJ8u7(Rl>D)hEDpGZCI!qCdx{t)8jP
z_vXdvc`!!S=fNz6HYlXW;l;dPLY90<0Foh=z%$95oyhXQr-c=H`ZvFGXPil=CYz<W
zq5<E?y<M#>x>3^%uH-V!w0w__rRLKoNA9B4eYzx~QV6yyMWGt$DFO#Vpy#EFNa*YV
zkimOI?vh2YY)=`ezuHO#`5Dr|sm1nr0pQ7eH?ZrY3h$x*beC~oWgWuv$9>^;ytgX~
z;QZh@Ak*j??`%#Kj{Lk645Tar3%iYQe8CNT@xOPxLuam|pNuJq$+*DP+_0V8_ZDd}
z-Ta$|H2Tu`S`{e0T!Hp)=|%#d-C_3Q3NUPa1Jzs?Bm6&-pj+(^n1W?MR)`XwTDTTA
zjr)P~cjl9y2d)skzdks&NY;8H;Wdk{o3OT)gFuX!@@_5@B;tmNc*Z$j=-5`l{N^9!
z32prfj$d{K6Q2f{zlKuy<<1UV!k5I$GJT9*E2?8({ERz0G>=Grz0|UJcRiK4+)3Tm
z6;Wj!V|sUa0#cCs$y+F!2{X2tqAJO?c!>OltEG2CtG+;X&y#x~Zu?F6+-(R9OeB$}
zmA2NEb4~Ec*`M&hp$;;`#-7c%^^3W;DXMAPDid7jU5dSrN<+hi*;r!RZC<Hb1C&n?
z1MyIiF}=PL=j3y6+V+z?BZmuUXy#9RDC8b@A$I}UwQsyduSybblz`N;?k-iVkD?7>
z0Ji-0A?bG|z=yk;sGOIBD@DSfeXt`Gl{tmuo^6F2uFZwjd!510dQl>g?}+CgxXP{!
z=_j78R>aNb2V2_X%hTSu+`8n)SDy44f*sx^g5J~t{9wg0%*PDEMTSc3nI#cSn92*@
zN5h|N?ddJnd{<i0)3k5U*3E(ojJ8vm6$33w!zbWC<xOfn>n<4=>!BlOmm*2A_f+ti
z3CUhijmr2HaFw+?++?^7I<TQcaG@Pcz1#~8y^FzO=K!F#Boa@$9c8QUcM<iXR$~7x
zoejt8Y+Pclb+-E;BRIfg^A2AJX2%TiK+sWq%4Q9ey{3Z|&VS=r)(e49)!D$l^q}?7
zL@m0NY=h<;TEMk5Swck@57FP|*Web9^;A5Cp~_MJ=ty7_{JdX;3SZQs+KHOzT)RI}
z6B2~$V)9|s6$?Bm@f+sz_OU07gR%DGH&FIN5OJ9L&f3f{V@<vJ@D<Ustg*yb_Wg_w
z-d}Zp-jmR7R<Wy&J+`cDy7-ykhTZX?F=oGYMc{Lu`_Y|%XZ0A+xlz{Sa|&ucsfNtn
zUF5D^l}FWY8q?>Z{4h=bAYEx+Mmlp{Tlh{p!(J6Vs^@TxoVyx<N`k|Pj?!=Fu$2Qf
zhGlT2N)HVFYzaJU`H6W4A9S~J!JW?yvEw#B(6}iU*cUxuH`_|F1-fHw!t0NCgM~9^
z-Q^#6!!|YM$ec;+N%VnAf(J0-#WrU?X#wRY`9XTaizYs&LX_dM37IWY;vTSBOC6Vw
z)20Wjpif6RjSM;iNauzY#UH7VRIMN}HPWOY`5P*p9YQ!CU7&txH~3!ILnL0f!+8-Q
zP^b6`@v)7Ad%eElC2Qr_9SU_|>Y)LgQRl+?EeU6$KgO{o&3kcW+9Z?Pa|0Lb@dt~C
zws6`8(|OlRmxBQRkIgrSGJsO$9p-4#;-+=gVJN<D7^VFg=dPb=NZsoE=#|Bp$dq@T
z)*0)<{ZC3;gifD=RbNvH(l#a@$=gtVXct*noC8&jLpfbxxwvO&FQE2va6WY;Gfd<m
zM{pLi<&Fv4z4jI4*xd#2uLv9X`8n9}KoM))9L48^=YTU0uHZ8{J9+mu?g!a_uVd|y
z93XU0zB!{_o#U`O8!K=Jtxt?8pmnZ2D9@*o>$4<*CYJr97rtyp&iBmeg%2I@$dbGk
z{sl+i+c}?!JpRG@y>UVLfn1!@4WRI<CR|oP*l53g9=mT4Ds9<@1-}+T!I&1ncLrjC
ztFNHI(h+vPi92h7mBBTyZCC@P;Lm3+!9_#WSmFK;9KLM;{1KDDdra3eo}RJ1WFH^)
ze2NDS-|W&<<<yECOTVMDiQl;1K1*nLyE!dOS%YjW73c*qUHImgdyA;pTNvjx3$A!C
zN%obyqBD2(LA2`(rm9n!$jHUQz#pDCfU}(Wl=K0#&HW9IBFx-qAQ*AWKu=8-4%E(N
zbyk~ziWBl!=9wE&sNg~?^V`7n%WQmRy%`ir-;7TNPk<GUrR=H8eRyYoDLeEqlc({r
z2kmpZfG()?ab4w~Qb%WRdh@vk3NW6cXUg=^eB9h3d8r+S-)M%*bUaACIg8F3I77u*
zuYlt5MWp?5Iucc@19q<ouKThBre#NRl7k$W?77P9y}=w1sy>K+-;!t38akOAixRfy
z=|M7kr7gI7<ONjvq=w@zE&)BmDtKY`K`68)lU?dIgwyZM=e^}W#jBp?8`WoLqtp4l
zTz#bv&|dKmy>o8`a`+`g56s>VFQ0f!KbanYtIi8U<Fo5YP10+0TE!90D)wS9t4wF@
zyMn||h{N;wH}T_@kKpB%UvS>^-Wem~g@47E;KZQySaoj;+xEB{UknLg^DA!S_J#qx
z-Q)(;@{3|O{Cfu$2$^%L|MEliDiKazs02IGI1ivj?Y!v8edwICD7v~K3ysH}L;Ju6
zy7$!=h^vp`tCO!_;g9qdLH-DM#dHYjC!eHp8+N12ZIq3Y(BVmx=TenJWw0V}KazO4
z9ZxDo!N$mgtWuv3lE#zRp6Ms@!ozrzE@h|om=Mk0BrJTRuUYnaCt0xRC$N~akohq4
zIkpoqVr;K}1XB}7z>d{FfaeP_P`_WqI@~80RV{yk&J72ls=!j@c%dK6whTuir&I9M
z-hXh@K0rNt%}Ix_Fq!pXBPpB;L8*?Aq^JME{<Yau#Et-G0Vnv6ZwFhn@Dx1xbs=aO
z??U2#7}oTbD~NQ_g|hRuvo|A6h^3ARUU+E*ILI6%3W>h>?}B(B^I!*VF?z~d>ZQnL
zY)=M%wS2)q*=x{uU5nGf{}OriuSLZw<LF?WE0Vfi1?%LT;K=qBko(dB={&j$I`>M#
zT>Ee2#%X=3=^cy`U+_r(#~1iw!d}v|ssb}t%;9pga3(Uk71aIIWh>uH!hiFp;M)85
zVe|o6xOH*^>lCMqpP2o`I}#$n`5%`s>G(oc@znw8FR{4CC<d=D+eIX18iUOFGe8#o
z1fD9KW|Ra0l&IK%j&DAWL`wsZ_Kr0$vfm9YQh5lS#?6uLs4F>Yp9T%uYLHD(2fWu-
ziIRL}sPw!uMCMr>Y5cf^=cVh9$7UYkgw{(j2~rujdOJT{>%AVbChah}feV|rXYgim
zbD1ZHzT)L_nV@!R9rpg7LS_mJv5#b3apAvlEH><dCs$u&Z?E;hy2HPlYgUJICbw=R
zc~Md*?a)iO=BWu<xq=Uq*dlo2KX+*M{3*Lg<r|dNNk<8n_MylHY7kx<MM<^CNY`;|
z5*D$TB&TNK*H)O!N{R)#$BXgos~*HWBnCMB69N;o4Gc|ehYc%3ISCgBp?sbhdq0eT
zGT%+u|C9;)rtv7Y7_7#M6)C_jpbGb|y-sF-d5g*8-T2s}Qcjh$2X+7N9n$uf!FOW=
zk$(1WQZnv{<XH(UY^4na2GgK)nGK4qEkcs5?@+QYi;@pn5X0e3#H&n$=)SilaGE(w
zJpD+tV~ufo+&0cHXGf@-<AIwE_u%+gFOVn-IMs=KaDBlkpq9jd+`jKv3Q(Z;TO02P
zi*9~3t_*ypUgNaUdqn279uA9XA@Y`LoMbzBYB(qceccY=&!5EDwaKBlARD8l2|K~>
z#f?mYPzPMV>4PIP<zUXL-^j6|1f_@<1JYegmL_WALDWz7M?YdrGNg&1B|r91s)nYw
z$Jlno`=r3F5YJrA1t+S`F!N>xK?9uu(Bb9{c4boLj+`<hP?3mtw=4w$x5Am4mu~oh
z{Z72h`V%(3)<y*Hec&|LKBS`W{a}l5BoV)P54>1(0+fu*CTC8{<A3qW$Y=>f0x=~_
zzlSwiy`&EIImx0_#YPx7x{XQi`-)RGSmO6NbxdY(5m~u%8x!4{%t#;oi$^&We-GXU
zP{RY_5~Rk&ZP5YE_cFj8mlHtkco8$j$HxwE8yLxo#Xv)1F;0Ipmq>|Chu9VR*uo?a
zukI4&^fQB;_G3}-=9oXJ6+VsoSG*(#Wfw!>{|GNq3qx~Uw-STp-C%Qg5VBY!fZT4b
zM;S`)aPPAODEed}R+TklTkZA{L60`F>k9)4!}j2ueV<5KN*2CyDVu0TE+Gom>zKY<
z)tu28b-??R0+1_N%X|_vVIwsHtWM9U=gG*K<BTW+yz0N3;Ih^}yw-UX^GVxq3f3C(
zJasIP&iOZ_YV0~xAN@sUADl^K<@Q67-vRi$sxpZ?U5xa+1<|ZST41Nu0Ln2mhMP7o
zf&7QVSV^BZWTxFD?%Fkk6Qpi|kkhY-|8YLj*0CS(dn%IMyNht%M}6jZS{Xa$2U+cU
zQ?M%bA#3r|kG*isjENI*#b;mqVv93mm@1q1OylfU%vtuGxw<RA;h{<=+?t|_z)V71
z+I|pq8c&s<h*MGi-|)tjIbh{&eHu4?UU@q`NIUl+j9ZS-*~xGy_5MB3wi*ITI1rz`
z8^bE(e<jP$mf*Uxy*RpP0g>W7nD$Y2kU5>J$^6MArZ>w3OQ`+8T7tho++Gp(P{KX-
zUY{)xdS-zi=iTBZ$ZiI2rn!i~Rx^&OPm=Yret%xO#V{Owx)-gNaUq8$GU%-0rR4Be
zOwteKVfi0N$s&eB51yOucXw_=$3#U~JwpcNcZxz8od=BfiFHs$%aq8Dyd?4`XA{vb
zK++<6Fung1ci*~>^SVzH6nGIgXdGeUts1dJ(h002VG258TN(TNaoG0jTaXcb4=e3h
zVtu9E5>EfR*c!(swsGzw_Fm{dR%e?t{G_oF{_F53Ygcy@q3zE}p!`X?#8HjhoGgUd
zYiy`s;9)q<tcCO5-NsiG{-VO1EbydwCTmKc!_9BTar=T?vgbI1Wq)ds_)2-A#XJL7
zKHUNT5(J6C<y%Bjsh6|OnU7U{q-oh5A_WD{iZXFq$9PXCN})#CED+tXo2MM(0ct&c
zF-)#z9JeiF)iZTiP36Z(o~cA`iT}aIHLX<kdj*X-SV_%G=1~7WDY{J3kj4g`#7`2;
zsZL2M@nnvoLcIqd{%`~q{P-ERJO9LS#r0&5%R=b+{2><HQces%^27Pohw<wt(WG*#
z7V!*P%8HqJSbu!k&D8(IP{6wpsGS@F*S?uS`PkjyZs}@pM#A6v+)xG<u?PbX_J?s2
z1Jrqilj&&jzbD8jzXZ*B+DWy+c3N{hko*d5q2l7s<lE3;Dwi#aqyjwX+G&YO!sa;2
zPuva#B(y+8m@<;8xy?@H##(pRcfuEg3gDA2i|2=@Le1<zvd}S--6{8#$Ope-b%dTX
ze*Trf_^=`9l&S)<-m5{USRH`-E0}*K!=OFuK7QY~AIq%3P;yHTn19&G`ttP@WYnU7
z<bIB$^B-ENcGfEDegPpP$>(^g?HSeBrA}n#wvrw5%IT3LU26L90?G|P2Bm%8fI6v<
zNM-ta5cQM`!73HlVDHCV6xa>7w45iREmF8`jSUPdu*MMso0@%_J2{SuNnk<TFNRE?
zTg`G6&Ta2tTp-Ye<JVmS=N!g~*`yRG5z+$hdy_!ytu34@^ahGHlt)2U23+Ce=H#e^
z0jXU02!Yf2#5Q*&*!tr=F+JT!vTF~(6DxhF%#|*5c3c`t^Cd!ssR*R1l@HW*MuLF3
zevm&%mdtVvgyMZ0NX6Xic;D0S&@Z`!-TWz!^TOXBt9S(RlFv;ugvSRt7e8j;@3|Vd
zw^|lAZ`sPMtCuFTm_OkA@LD$3m0=2;qB#548KK1GDJboY2UjmGml{hrQq72caMZ1l
z1`o}E;*-nhkEhf73x6R{mz{Q><YQ4`fEdVcD1;)j1mF+x22QHU8?x*13by{wIug80
z63SjtA-`S@!@cPT!BEN?;GMLFS?Vl>57s1Lk!LyhXKNqhDtm&=ZdiZ~Di5<6<3m{Q
z*tO=X4dI}{^%7o_H;CoUH#hxUwFKqN<VR_zFL8lu9(gHhMYEgxU_|r~T_<*)a97CD
zD#4l1KW7Q?>((NjVp*u5PZNt~M=}M0BCxRM3`e1&jNIDt3KXx}MmAWEgQz?M!h5s^
z#yj+4f$BZ@`I+aOIbEIDM*kp=DYPVWnPtq=qVG6n$b(nbXAhE-g0Wc7ez5D#LC))!
z_gO6g9qjd1pQk9Ei%v+`qO|rITqh?k6-#fT?W6%YD|yfpu13%a2e$BqJVS<cTZ!e|
zom9AXIVzMtk58v=XG7H-pqioyxEqy2DvN8OQRy5k9-j(UTuQ}1Hn>BsojMo~XdtVM
z>YL+;KlAr>EpEH&KrBb1!KS%=z^Ba<XB0exVkPOEGaK%JhnAZ7bMSl4uM#P?ILom4
zR<;QWFx!rz=aq5Yw4+Fo=07?lb_WG_^U=I=NqF(WP5O6wU(g7QfottW=<)xqqx`(v
zY`2a&ew$^=<aQ~8DJF*a-I(IUUC#v%?FE^j(4%B0uIGuK%Y_Dt-ekUPwRO249~ge)
zi<6}I$&P{lz`kkjH1B;IF{$1Qh2EZLe)#f1kts?RPc*ZoXS~6!&<V@aYy;ZBSAmi~
z8FQUxDU##tVfwEu4+RGX(*RNo25ik+qzpTdg2`t1Sjdu|Nbf;~o1);(;S`eQpi9y!
zJD@M$B5V-dL+a8hVcff2@V}dtm>Wchk3~6fMha|kvppxg&H+3oD$~Rxo~$cL0rO(h
zVGvgZ4{B(Evrcimlhg0HmfZ(3CwvF44yuNH19d!J(-=y9^bgH8`pdP6cmzz<Qd)#s
z9z&;>r^sdXSe#%R(K3gB0>&IMfMqtmbieu@R2cUO-Z(Nrmg`8-xg;3|diBDvJ2q6p
zY%NTey$9vIGhq0?QgZ8$2)?^yJ<R;I)N0pelP1U4{-AP=H6GsE#pG@ehDQdjg4hiS
z%$IE=c+>f}(34YwXOHu6<lHfkzx^c3{W0yhZLG!~fh*8o!$YuTLV`Y;=Rrku-r+|9
zFH!C7=oW#ui;2(o`^=6F)>Oyv5h^Mcfx$!9Ih%8)`DT+33`)L&)KawY7mE-Wc_0YO
z*@eP^&sL;KXErf%PKE2P7_;LckSHD015+<=0HLQ%SnNqAoFA49eH+qwIU=3-?F%QU
z+S&{L2<I{b*OoHEcrEMFTf&vm`-%)?kE4UP1dvo!B;_i%(xac=5uev{;j8Zbbj?o}
zx=ZRTCf_hj@@q!<8f&odnJ%dFUXwhuSP5O8*&)NrFX7fjo-i^Ym?+LPMzcEu@ksPH
ztnpI?e0X{sFOsn&+#}h*;O8GGFt8S<1Rmo(UnU1VKpK19YY>Z+c>%?*FX6tG(%|Ov
zqj>I?LSD1SIfP}_qxesW)4R|kWVAj6zMkQSLb<bGMu0UrmuyQ{j=0kW(`;?`tb0^(
zw*pFT5F)usV^BRti1?)*1u@$vkjOs_d5te&lvfJzD`(J*!FCvZBo1nPUkZ9ZpT}A!
zOmOGva%}T`9cUZ<j!V+y$b64NrdIhU_P5yqQmRkmIU`4)Qx*k1u|v!R&yq9Mv=9~C
zU5MPI?xGUyKD6gRJj}Q<6Ukl6L)I0`01zt2Qb)saL&{5dOxKazzC49ec5fqD-@LJJ
zRT7cu&}3g1%fZN@Mwlv90}VIt0~wz;!qy#|;isY1aML?|sH`B(@*aI<*1P_~apx<6
z)7U&5nZJ~n3IyVE#sgP-ZvwX$7~#9e`a$x4GC-&CKVa>JnEx#MQ1muuq$BtXnZNsq
zgs-c>;dp{1-H#*l^Q+<L<W5*jpCiK~a!4~W6>c}1fl}-`h;UC75%Sa_zCm}eiTo2_
zGC;9bS|oF3qabwlGKLL3dtg(7E!=!b8P*(*X{?{o3Cfk#*v>tR0e|8iHq~ez5j^k|
z-(^nYM7skZa@J1##;goqnEM}A+H-{`8m-Q#U*3aGSbjxn9U4gMsv-QV(S(HWOy4;v
zSAvhTUc&oFWRS#gI6^(Mk?ikRXrA;Nlp?$myA|=+AG=D(yg6-n@8f8oz3>B5@zN4o
z))qs-2h#ArPdsSh_8E$<_Jq59mo~pxV**m*b(!8(Z-C}V2In)g0{htQ#2afe+0&zg
zyi%SWX7x0Q<cn+g$Ujef@~8<XqH^)HyU>VcB>zF?6PJ*YOgxhO&lWLWFObBCxk%)O
zHL`S&Mo2>wEgP^yD)KW>a&I>4I;z8*$qmL^-zM|Ar9^n7!GI&ZQki+=y&R+(T?g02
z<aiIB`ZGD#yTG1t<>qhSO@Lps1#jf+JYY34$nL!mh393e;iLr^>&Ka4m&zOX>hBl$
zUFA32UtNM*UiNUb8dsyp?`nv%;t>3M!wpII%z<zBc_aDvPtlykACRuo24vpbhE~`O
zA)l}dNap!*l$`Raxu}qjbE}w-ZJPCiRg>;#midOW&d+{<ENBi!b|e9}51#Bh%Us6S
ztQrd_oABmD=Yo<R7ku`XE7M!_0P6*1<1ph;BBPJ-gXC76s_Bi_9M{FMiyHCoM+`oG
zWf@~p{udpxpYEXw`;lOS0~}o(3O~*~hvfbxB5p|^GV1C_hLTT^X6iDeaL5Enc1xjT
zCkOE3<a%NiJMH_GUn4S?cM!?!!=!mr76}iW9!M`nvk?b!r(L*RB(}Ph7c=cSJN+#K
z-;~veX!bkKEw^m!_b(Oa3o8;$jW(jkXN#BL{DdR7WH5^)igD?uSNK@c2va^LiH?0L
zLkjDxk;YtEH1mKbQsY=6ZLe!ce_1Ws+GvPePTC>;&Ou~)*%1Db-ing$<iV86Fj7$1
z$DSK_O&pfJB&#d_l7E7tq-VDR(cNmr^7;7U{<9BAjr2b3q~H&hYfgLjcsG&zoX9(u
z#AWf(jrjadH^T3`p9p`oWCHH&!S}?KKwiZfR<8diCU=yWhJJ+FfAx}#)^><%^w>5*
zGqPpapQdhaqu`1wG}`KeW-DdFt0x7hc)%Mp?{g8#3AzAZb1|9dd(HONR}fgWm86IN
zrczurDr2~n{g@v>R{yvS>bjN^*RIthG^+vRzSRZm&0b>gR1oW&&;h4?4`PXBhV1t6
zqX3?;1hE><xcQbLpi2wDlXV~Pn_WeW`q+K$v^z*D^5x*Cc|XY-M{BZv{5_4Cdx~=E
zssMYS5Lt9ZLso^K#P8~W@goxG{1JP^A1p;>dJ^&R+orf%Cx?{nm7r3GSCMCD#Gsk`
zOmc1WIT%-DgAWe;BW-%~K+)t^Tz~#A@V76<$$P4y?0iw)p&OlG@0*=)esi()?tl@t
z!?KVWs`dkIx+84KoI~sZWpnNVznQe;Tq+!^Pr!Gd@1auOw`m28r&Z+>X#deM5|<Z=
z&K}ZcI<2ax^hN=6(RcZDhn`6?OIMSi{)5nZ<NygpBjjPz0P8gE6%1#kkleka&_;73
z_-bcD+Bazd4k#n~3x7dD3nOg)@d+3eHRH9b61Y76Ftkhx#rK6DLb(}V@Yvmt%=07#
z@K$^u`|{l@F7#eV`{lpDq@H!)!oADX?Phrk|BQb$<G2H=-}#%0o;Z*02!%kwn+?>Z
zxgFI9?Z#;Zbwr>*oJ2cJ!1A>pNpNx~Nt~1gZQ|J^K+zYU8xvtK{0U(1N3X@XoL{W%
zYY#GiUkLc_n$5V?wsHpk?P2rPykUCVD0qII#qSF;V9<^kM8C<Noha0XrZ0c8<ryov
zYF+Q?NwF8yyjC8eV>UF!<WY;kB2zkd`y+7a)IXA5!$oF`DVg<f3ANv`4%Nm>;DOI9
zKK`eUs3gil!#h&s+8t9E<7NgfJ*^_=AE&_<nG57%WeUa<-=O9oWPQSeu*mQoW}}KP
zsD0B!<c+m?uNCBAc2XMax91U%by0%ww;ZmQ+Q9yB^?|}&Z<-q<tkCGE`!sR(4RXSp
zAB7+ILsK)mXrJ9bs-b2@H75ST3V|(1xKE!HZri~8_j1#8Mw))6X)7*u%*X1FSKuS}
znn<JIHY71-1@`+dCJPUb!e6gz$ys?<-1AHs?kIENyf_;U3REQ7Fk@HXa^(<l>!qOK
z3=8K(ox&=&6QF)}FjzGu%3PcFe9IIQcwGw~GrNuBxC>Xv((lFgcy>%1%$9%MA|)YD
z<ofk!*3<@AnD+ynp3%&lu`3{(CmV=q9tT~_sRkp-cgW$S98j`KgVd!0;`K}d>8v@8
zBZNGrS^0W+<=Ys3C@08>5I)fR>JRhv{#TF^tPf<$EbziImI$dlfV;Y@Fxzp6U7siq
zl`M}Fk-~Z4=kRABT2hKrFQs!HS$1+&Q8j)3`7ViioW{AHF{efJ@D-}C*MiE~Hp9_h
zwkZ4WCip<0l3dl$<T$HFp{rLfK&Ku*JSfe>UrQd4&pzo;>eLdLy7&WI`dtJTi=Bk#
zg{|!G7H6i@{wOY6R|!<8J#)KfCwTX2H$J~xp7-wAJ$RyKC$SXW$BYZ+0`YB<L_c8`
z7~U1exxy8};fFF<zWh$E(L)zH`Ph||2!1D@y?g0%2?MO$T0{>htwyfq?@(;aB^bFN
z3zz)pB4G#hP{RysaDD7K6z1DZO5Q7z_;4L0QLBp6rt`#+FQ=g1r2#zk>?`<VBG2i+
z+=x}lCXUTZe_T1SA6gxc#)?fjP{z^=%H7#dmTJuckLFXHsii@r{FK2_J9$RbAFw)E
zLu|R3A$O^b4Sn$DIbHfwiw19Pp^Htu$gRVRsN0`q$Vk^6l`S|0!`<hQlemmDBwRyR
z2M+^X#~_&XYJi0Q_kf5SbRiK?&a3m;0A*ip1~!M)raea$I5+(mK6_7%waJrX^X67h
z`(4wn_kZo+irgdU7*FurQgs|LgMzN&RDAAT5N_Z0)FMwIw^>8|40~2$H``>lh->-K
zj+To>(JS+BQ@NFgiR;!OIy+yRTBenvDE*^IXh<9_`>%%tf}aHDwW0bwyT}&W4L{j7
zW7Tt(@IR|_u+Z!<z9^~;*%l8VnAJ^SrvMmJ+y**hZJDMU>Uc@W2oXB(4xA6JfMyZx
zU{Z1p(F$qcS^d%k3+G1T^5s!jHO3s*dc`wNwz3!vh_frgv$<ODA5mF;Ga6zhMpZ(e
zkdSNt$c6erW}K6Z=9HwP&F&1^H}3;oDDI4FZkMAfJnfO}QiOMH_@QOBBFw9H2H~4$
zv4erXV8ye3Hrq&qBtP59K5ffm?+B$57p-g9UCb6IkGx?U)+s>606wO`-iyc<h_GI3
zj^f9euDG6q*^&9__@U_=z|S^d*_rdK!ht{P7yC};9B?76tFOR9duMWeV>$VE@(28)
zD2}3}q~HYwQ}}Dm6*O<vED}4}f{Hleq~enateDQWSJDl^$C^NP@+`(VUaHX8WFv9e
ze2VPZ_;UJgvJF^PG@ZSt`Jt{2kI9`Sh_|bs0v0+qaKBL@uAAo4qHSKpMP&~z5YGT^
z{rhpP=2mu3i8FKaYCL1p6^rU*kC4reE|G$lx1iYCesbz*9F;fN1h=|mBBidqy!k~0
z2JBry*NV)d)#py4e9izl;c@{U`K1Xnbk|!u_wljCn}%>vk}OFMen;XZW654;6=HYk
z80g=55;q)q$>!M&g4HX9vCz3?c+(pb{C(jC9OQ8tSnl{p7L-b`?{;JXJU9#MMlEK>
zc87wmOBXT0N&{%@sSe}{$pOAivq<BO2~u&TlDZ}H0reCY&Kr?YvU=4n-o*9wAR=0d
z44i65g<Dn<=cH->LUBEmFwG~96Y;oU-F498`JG&S=}LBszXoo*S+av~J=WN^9MtaB
zx9F)iVYZjuWZ9~l_|uWi?0CHp-cY>)n17XHJBwYwx&cX$WW1WW;wIOW9%BOPL~R)*
zFN%s-YxpPi0gSn5K;k^S$dymoRBFtNrl)9;0P8XK_(&jJXWRo9Ju4#b{$59guNBEA
zO?#Ng>|&yqUmzm>8pN$F6AYwEl1(%B5Ji{k05ttawjCCO+UM+GM9hTs*aZ_tYb=%R
zd2EO~8#O@W`CGh}m1fMkU9;ibTT`In&r#mHIjul@nhP6*+~o90Sz50hCTNAJ7P3G5
zopUxDlfN6)srlb6j7|Ovsy^MFPg^ZQa}si?M%QZ`edZ*|`Wb=>KB<%BA7yYYj=?lf
z2+U0JBo3*<VCRSkll6U&)w$-u4(V3mYln52WH|y#r04VMuKeG?nTBK8gkiX3--@zC
zh*BY1lx*)aGf0bwl2T|_l9a6|S+i3pM4~JeQV}B0%skdo+7(i^NQz4GDW&iIKmX=9
z=AJp{IxlOcX1fhpt*{t3wtGMcxsxDf*<)h3W-pxI+ymk&jY(HaF$l@@!m@L+$*z^5
zY~n9Nv~kRn?@Jz}E-M4+ptvn}evvZ$B(anekGe#Y#gEg)vz<7(cm(`Yl&G3dGs?<d
zN3}c?;7s=u<niBC&^C51V^QwGD6YQ^Q#x0H<OTiIt#}Fv+0HWk@u!L7Qa8Mzk23G(
zd?hC<v>BzeOmMX&77U1Q!cE;y;QE><U@2jZ$0}BVC%^qLx7mQq`gey-6A?ms;&)Iu
z_>Wr8-%e_tpW{5Q=5tE?x`8FuFLC(wEE-veiQg{)XuIw(MK2OjW?~-s^j-lJ-L3_R
z-4^h2U?x;($|YxKE`nx#PkBC;VYH&$hBs<si4z``6A526eB2T;V@uByYa<C}%&Us+
z<)whAiL<<OJCgDFJp#ODt5$GS{tdVibcVIO@{{>t52~9E9Dw6-K`2gmHx-p#gd|fW
zC{oF%8%(c)$L0YrNp2cvbkKu7@_z}JE%4_oG}fW?>FreK=oY4RM>f7};sOQdi$SAn
zAHW8m4)8|p9=L13fRl7BJiJB)1f2K}3JC<_$S`Xr@0=ih=2p*CN&F;LIdS-L-cr*?
zW>qB9B$^mb4~J3%Iv}iV01J%`f^5HB)5)-O$kx3@u}-^Tj=3_DH0`9rF%xuH-JQ4M
zsuP%c=NNY=R)t%p`;TrJil^$8TIizmMLJD7j<F1@VglRpfznn9u(mxM92?PQ?lml6
z9M<-NpXc5|^@!j2P~;vcF?txUY-$G*6&cvZ{4p+gcYxIYcarZh_%j<-{fJ467)iAi
z0uA;@*(C;XxHVxNdrV}IcjW6yC^(*hymxHmWbcnt(S!?hyse$ai`^kF8uH<^?_+Ry
z3XAp>yhXD4(r~oiAEjEw6Zcd-z*l2{*18BXR4zzDa*BxAs`>2AKVejTT@iGpo1p1b
zb$ke<!;s3CY^H`QIA9aQGpjbn0y|BJ*sgo{qhAX<Cu5wo(itP6FT#n#sbPM8=o4|t
z7bf-7gn5Y;6uOKUq6@nIpphpD-=8ey#2;nQ*lB)XNzHO75XjG5tNDTw?PVm^JDg6F
z4?(GIou>EF+8LoRV-nK5p4gFBU{TX`uvo5^34g0WPt?VOWxu?cb9@)9Pa~0hdi9gl
zcW?z;y-e8f<AX%qcN+F+QN+{L2TjM8?_wr(x|m1rj}n)4O{5^dm&|eA$DjK%cwSG9
zV4Qakx;}Crn*L2?p7|f6U*$G%qLJOeF@6pw`H}-^o6MoGjt?_gpAN@MKcLjKSYX}Q
z3zjV^!B-T$c`RuL`jdwM<FyH}?~Tbz)`Tt)k*51CEvRO7H$5PiZOU%Z2l6_1i0bx(
zWLHlnc6S=W?S_9%esogi!qU^AV(>g}tDK34kFLT(Yb(i_CT*TgSr)vsPYNCD{z~b1
zCv^0APNRn}(y8i+JR-c5?<WRuQ$|$jL1|SwS)$CDBut{zg*705(LHu+MKe{Z93gdP
z1!Q$e4Rc%j5}a0d8}zJD0=giE>~3F7h0js)B~hOjzap6VFL^Uj#Q8Y-;%j1bZv}C8
zHNj;^HW7hI5s<I_3Y>9%1LO-Nfl<{KkoGQ+7k6VLEXt`xIRO^*y?Yq%tz0R6vSb1I
zGHeHHGz>ta{{gz+c^bKFk%>zbF4K~Q#wab(65N_RLB&QzDe5$(T4$Z8^8Q95eS0zU
z{mBCGSz`p?$Oe!WQcjF?I?4H)ZoE<bW@h4LHc@-`ff<hNBm%R8$SO^7EcnfYw8;P*
zZukM@NF4=+V-JCbwj*eor^P${y&EaSh@lHl+-OOhH<@88&e=9krMSx!3H^Kv%Xu@v
zlZD#krd9yAyz>i9ez^>#t3^Sv{#;T%EI@)~4pNIZ(Nv+`6Zf@UAbknn!H&vac-%Y?
zIHZ+;0|DjGxbq<s+$_u_^{>ZS8Ux_2VH%ScTg9l_9$>5*&yh!gOGva`JS%D&iWfj5
zurNIU?D^ztsz{%p>2VWCH03vESwcBoY|VLh3UYF>vc%)&1UJ*w9ePVg!*`Bn2sif$
zd88$S(z^@cN{`v({UigftvW?s-;ASz6)%~I1TA{ic#NtY=ARqcnAis=k;9X@;D^&<
z=0uwk<NxP3Th|c}+VaFf6<GoTvqE?w1s!;Q&;agNHz2C_*AN*QG3-C)Q@y)Mm~CJ$
zA?MLX<Z}NfC-~qwefoKfGu@X(rrrNVi^S8ZkDoIf(MqSgD!)*J53|8uMFEsO{XCqa
z>qG`r*YiDVDg0#lPQ2oJKCn&JpldE2CKdZ1F$+tIiQa$@e>Uv}5qI)=JKm=A?s|?9
zUZ*68YZL(Ax*EWR^KL}&w+$$=@M5C;&4{P>LT17+lsIs4q$#Nc+x+!MF(b#&9^o0_
zXzvMnW8yLwo7O-@u1Rv1;gz(`=Oo{|@}kkLt2kNxT98_sh_dYNL+#(gbk3o}(DUXW
zxVe4;l$Y9)o%<b44_itB)0OFrcc(7dov24ebtiFZw=<5auw**6mJx?3I;?$Q0SLXa
z2uwGxAxDPCK=(&;usvxxUJ+*n8s-)fWk+{XcV#*jxN3lWQXas!xh1BXkC@X!V;*M>
z2kC9i6vEbzQRq|w#lA9N>!(T*eBlvcF5E=vQkqc5&X8VQEC=m6??B(jLZCyflq%Rn
zz}d@V!OWk2jK)S4Xm6EB7tc|k{zk9JRH<FSr*<Kkb+?!p<auD@AzyODw25pxhI#cK
zuYi_?8Dr2^0!8Zn5etjm<nYrVUWS4=D*F8bt8Tmq#rRAj=i(_k5^|B#_cWyX2Y3`O
zBXG<_ohejS<Ls7VI>YKFN?wr&&p()?@k4Gv!^a)u_YLCxLUXBR&>9ewJQb{d@R?6$
zgkh$_Hu86y9NnB@!VVn|29`1-*gfGWxf5N=$7z0KD2nfPi1w2^SyzFAOa!*HNdbdz
zy}{IuJB-w0M<#k?6M85*MprfMgI{aEz^{!l+|-9l=r4oqppVZv!{Xv#%ZM6UEEUY{
zch2V`2I5iTs4T2J)Ju;TK4y9Xi*fd<&op>$J)OtTGOY!{U}%>J5H`F4y9RI3MTIqd
zo-W9g7sfO4&d$7!xE~}}Q=KF!_7l2vC7ZXL?_3;N3FNDq+4v4u*6&n0*zqs|++SwE
zBx(qnNjG{^tDI4Wgq)=L?^L*J2M0Kp@Ir1ziwm{=C5FVpRFF@^6K-|icKS)j6J4(Q
z3bPKx(r}55q^B;9lm%>|S^Vsa<nJH!%!)o}E;d2js=HzH-wLMIBcBK!=>!XGQ^=W5
zvv{N;A19|@H*FHwQk~x2z*|<TPlQyBm=xYc#y{m2dCk;;_55sJvU@BqaP1niS-mH@
zdDV?@jnPfgg|2hY_iJ;J{73CB@vSiP)(jZqdkTG+t-~EWtICPZu0mJOI>U{(T4`L5
z8X5cDMW!Asq>dlIQQod>8s^jv71Jkyd(>$-W1b(=ni>J8h8zPTjs2v~!x@Ojh%k2c
z-@#l7A8^G<j*V_s0_T%^h<^GP5MnAo4Fn><?B7mc#YYD$b4A}weK?zoYBiwel6F8X
zM@8=b4<jz#*oZqBQ2?7%m0)zQD`Mw5am%E;xarRk%IysU%lw7u!MO*R8bwV|JJph?
zF07^E_jE}@WE=R?5CCVDsgY>y1g2lBlzFDSoYyv5O4#eSLBqolV17-3x%>@)FQNa~
zI{8eH>%?EXtseL^9i)nudDz}1ow*i!1G`?RF;jli%AMV#z?>g{55muAbNJbQF0WXO
z^K+j8Yd4D`hmON2q1cnN{%%04b7WAif*=#w56M+&mT5HY1clGqcms-4s4;j8l9{t4
z;ZGC{log~qh6k9N5)mNSV>x?cOEcN|XfYJdQ)F7o<`9|wTJVR{=bfl+07j3Dh<=7W
z-sRj*g0M9HXgJAxC9xbY6-_pqtzF5*HuORFxU(QcFO+-e7Q&TxdUGz7pCG-Fh62vt
zL_P-Jxh1?N`r@-Xx@y=<JV!mCVAxZz_ZuSX)s3Or?wwRu?>%ONUJ@A<Gbr!dN*6k%
zlQ|t8;91BYtXr?o-i<N?-<~6^QtL$$jr~DSMjbC>dI5GFKUSS~;|@u4s9@$Ui^D%p
z9A|R2A7w9<pEpxI@4<yhTm!zBw!wRS%3Onb0{2m28z<{_nFu5`pyZvs=){Ip&c>sI
zrU~bx?35`uYg#b0o2SA)R1Bt)<qKe@*<Gqw_J=sir!n%~wR{g{Ci%0Wm*n?}ga1UE
zuz6uH6a3{R2r`J`>7{!S$H*ZNZD_}Q+WnmvW+5P{?LuyE=m(Z=zWB9<3_D_M%sS8-
zGpXl1`mIhMKkk*F?z0f5wIhjZt?A(M5{qFQ6M`IP+9Q(-%G~Z2Pf}($g0kA}fVhh_
z+|?3@6;ysuMKg0)RmATC<KLAoPc8>pUs_4yY>tG!k|O5}mxH=NDZJOZnYrWgm>1SI
zjj8Rd=T&_!;O~7OI44&CoI3-F!J$bKxbGjxlY52jjn?27OWW84zXfK>Hs17k$7=BT
zY%u+Ce+7NvewrKAljTB%OW~7@cC;-Z2Pyx(&m{z`1{d~!K$%z4!MfQmA$U0o&YoFE
zMdpV<Z@cTnch7Iir#HZ0r!@&E^}`h!zX{=YZw)HH$2mF;IAS=Bm-|`;2xjC0ue%y#
zG#~@~O%Z~MPh&}3mn$(YNChE3beT<)k8zUg2e#mHl$mDcESkPV1kTlXPb0)SX^wqp
z%~ZWEE<DEzNyb|u%{2><?Zr3T9+f0A)E<n|Uq~{i3YDOG{RSwxXBDPB-c;k%B$V2A
z6DB4((D<c4spLNj(Y<e^p;nj4R^Y#-tt`pKGzBbfmdy0riv-oPrOC;eR#10l3^dam
z=iMp+*e@s_Ecz@80s<$9sM7#zwL{BHV_yVa@+2GP6l|sE%GYrsKTEjrE6!Yjj|`mV
zw-0$)KR|YiYq+a_F`b@#8l`NnAnauxJY%p0+)xVy^-JY&{|p;g<X?iOsl-wBlnT15
z!Ue?Gim@le(m_JNT0HTvir*=sOXQ0@vFJ)2z^+aP3;S)Lva3Fn*nR@s9&qMq2hU~Y
z46VQ~9F8xYj4>S?88n-|G8S4pt0U>2>9lwAcCIA$Gk0q51(@Vn%Gqg5fKJ=9=wZ?x
zF1P*{7}CCl61R5Kn8U+xhP^$QD0>Fi@y|f>m~JG986>6H12<2NB}7dWG`ed*^BE1K
z^1m)zceJ16jz1*{o<9h>Y5*K0b--ZGD{xp;mYg=Of`Z8}K+;DUvgf`ERLL%9zeg?O
zwMHhHiFThx^Lnh|wPSstJ|!MI?b73TTRYGm%W;^t*^Y?{;%6<j*K<eYV&Em2n<$}V
zh!$^n%b43Z1DV%VaQ9(j7%kt71Xf1EH(uM|F`cR8oaP9##wH))+ua!EzQ9|zD`5vl
zm1K_!5UFB0_MpCX)wbYfvU$A~)jE-aBbxQV#nO*hbK3_{zafG-uw*|o<pqN(%7oe3
zv#ubEgMLV)tsHU!!MM}l9h#H*8xi{yG{5x*D$f!m&pWcX^+O8ithE}A=oBOo9=l1#
zDsi|%^cjrWv>O&1-es=LRf76+w?MhCmBiOM1CGxKWytz&HiD*u_{>qPN1ox{_&(xj
zW=Nj9J)+7t)2RFnE4o1CI{Qqt0FR?6B9UYd;*vyp<=)olvV{QFbGQH>uAdKY*!Pp{
zbsb<ls|L*xxB+LE&4JefW07~sYSztv5ezLJLJ4EZRMXZAe?R?<nJJeK%Kq*kk(#UF
zVkH}5HFJRO?{@?<bre9IkQcn+eg+iCW>nvKFa<PkSq#MA^La48_ixj%72zzb$q!R!
zvS_HBe0q0-nCCl^836-SZcLoHAXvat>066#B|m}dLnonJ{yup1kre(%OJL$a6cUOQ
zg2D;GXi3^;6mzi_Ef%qYVYOZ;;k*?-U_`;YmIzkwqdvRQ-<qnZoFYj|E5OEUmnjtz
zp$0*i)cK@<g{S(U-N&u0&{|gzHXzTO+b;)J1cVd04g;bj6^lo8r9slTFkUjnf?OPO
zBBHpM8h($#E$=N%imX~uafls!I8zhpmCGW{LNh9zn@9F^Mj$;WV=DVZ2<k_;p^P?v
zSX6i&uDQ#ggk>sV*0yPY^D%<+{#k%6DIL7(AU!blXAoMorjz5>UQl^=T`E_)no3#4
zW5!jOl@xbnsQwi^dr=-6DiMrd2J8n~V;`93JbtuI+6uh8Z-gBLB#GI)GE$$~PL8~F
zxu0-Q8|5dcz@nDtNcpE1EKlvC_ZF_9shb#Nza^P^ZV;qPc2^@?pI0PKJq1Q=N=6BL
z^2q$Vx<viX1c=vlU<~I*0Np(b&}rUc7&<o>s6Q~EQkJEpwI!6Ap4m%$7rbLdnRIs8
z`vJakQXUU0x8T)Xtw7E89+PBbi<?3#Ku>cvnD3#&`|r$A#)seM6iPMjdnxsz%Fzt?
z;Jh31b1_D;etW^-=ynv+<$@~9WSH@AmMPY(hGo4oXkNGjTx0VF#oyV0Co^uciYAK)
zzZ{a!Xf(j`jE`^*|6iut>;UN6X-C#SOrR2p#YF9)8p-IYVV547!u~6K%m_ty5a++H
zxajE@9Jx@65Izrmp!totRhGxJ-E(2u{p~>Pf<`PO&~GHCT8hr83YhK%6qe?c!58&Y
z!S=^~X!D|Obazn;Jl^KZ_b_bGDd{F~ZJ#gEemjEVgHmyf2n9!UU5UZfEqHf+JX5yB
z#&o<x9}nze0eS6)UuS+NA*KN!^2B;_?c)T?a8r1l(<_05Zy<h}>W9^uHxc1OT2zQl
zA#L;&wpSHnctwKDgGpZydLotgQ2Vh7(+*Hx%St49<2afn(F<$eou&dk=}<v>A3C=1
z7SxW9p=<cL`~J=<5bg7rh%wty+@}oep(h1ykHiq?<DO)p;dBz^hp68VVXDv>&s^C$
zLC$Kl6G?*?;L;~^vMTCA^;@FPbJIJ_Tn}ku)if=u%bJVuU)z7=rR-J`>ih^leRUX%
W@62Ry*IuSUNRpWsxWS}Q$LW8bD<<Xu

diff --git a/src/feat/test_data/test.wav.fea_htk.4 b/src/feat/test_data/test.wav.fea_htk.4
deleted file mode 100644
index 0b8667e1fa3ce777821990fb69af283a8b62e52f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 22164
zcmWifcQ}{d8^@81WMq$$l@%IV#^*WbC?X*Z4YZRYC6zSo9YPe@Wn@OS&vVZC$V#L{
zr9?}mzDi0o^z-}k`R82Ebv^fUpZooKzi&Q1z8F6K&`iGTGpuOak3gz*NR>u}hEm;Y
zThSVihsgAY5z6C>C&N3QpzzZJ$Td?J-l&;Jg-;fsvda=kH&Gd%ZTSH9|2q#ejs=p-
z5g$m3%OSE&*dI16p=@C{fhxrYfY{H4c1b_+jcvD>hRPwV)L+M4u9M9Kg-J02#-?mV
zK8p`T-5|QbTX3#a8PKaY$AJ>BaZRHp!#}?XNrm}RXD*^AqKlyM_FK$OzBkY&`Y{qY
z6NL(Z7)t3aLbl42X!G6`NdD3~6k~7-p`BN8qDmNeFSZ^2akeJ=uDFx-Ykx>v$9AUq
zvo4XF*a%MsmoVHS3+DW}GdMfn61%#`<AN>$7CCF+)PZm&N$H5Wa~vPmjQ1eN1^e;q
z%v(U>c?=OS4kluq70g5Q5or-Xw;l<hJGcknc$^ZsG^9iY^81lSy%ds}T!Hr0Jx7nZ
zsYo>9J}&G&hGGuhL?(r6iC-<l?Apl(Q{*<2OLGE9?Q3UpKgpB?RVR|VrCBiRWIcgR
z2Z*}2HP-z#1CMwo<6&=o*4MR<=v&-pGG;3SDQ+CM{IMxnu#&-rAqH$(#8*6#U4k1Q
zngWS`5Jj!mrw32Zr>Y&TDBVyVt_T>Xn&v!GP`8KfDC~t#4HHn$dJfF*T1E|SUP3Vs
z4Pj+z5or=LViFoPao{^kV$-mRgr5i}S>r-v>;RW^+U|sBGEb9^rP?Ir=Q8Hqq6D(+
z<VRc^mC4TI6DFJg_Hd4x8iU;`y&&k<Vj#3_)VzJfA72f;inm5@X6)2!!OXs&=xw<l
z{j@!uCYdL|tUJbZ$z~&}|FxEcT$xXG-W(*=L31tm_Ix2+vo&;4!BceQNjAKcIZ3_;
z+cIz3&Jf>iyUF^i_lU*lMsn%>6j}6QCCNCa1a{n5O;VP);%^!PK--BBvrDRszDO^S
z?ij%3rT>_WDRZ#>IRF=+DwL>j0;!i*;)aRMz<EJD$jdEa0=IlXuUBaEqz#kkva9y^
zoT?999oS97>=mg}!5uQF`GJ_MK8ZdX<<eDQK6HMQ0J`q72(;bpAP?0OfoGQ(xx_>f
z-+~FU^~_5mvh@^{n_)_Ht0Nk^?D>e<rxoC+NDWwQH-vv$od=a)4}gVse}HLGSwrV)
zWpH6n9<Xr!2W9qN1fI?=c=d1~cw4Ii@S~MnCOZc``M#Z}5o$vXU>;6y6sPuTduZk+
zH##pnpZuCWzdlwSeL6LVydEo}#^49a$|%Pdw~3Qg{&H}Mk~{kqz9I7s*AYHFLQJ$%
zprF1He$<`A)(c%?zb1Eq%O$IrKQr8>zo!8BL>B|QEG1AlCx8u6=>&It?3kjn7l3EN
zLr!3FE@!;f2b6SWGk+Y5SP3b6)Kyc*)7D-~Wj_jn?Z-N)-=zUs-k(PuU!Epg6@^H9
zvITlM!lUAnU4%G9q8omWI6LAVuATv*s!b#C{JxX8`<FAO-9ALXjs?Ulo$UHl3yk@d
znQ~bIE<0&6Njrbx9k2#)=NST%gfC2fMJ}ho`v}OVC5*tS3*7N{6|8ikBZnRR3Qpch
zWur37S(W?=)D(4uw_w|O!hJ0b-l+`IlALTBd#{*=Nv*_FH;w7)*5hcR#f~bC8juf9
zbkI$|Ex1DLDL$tZ3O#<7FbezclZ4WPP$K0GF%~X>(hh4#*x6+uTFjVfxR}O_dJ<-y
zl^vE^dI4B*t}-^}pYYS>H6Z?qA*U_8oq00yi7Bi60k;2WW7D&bGA0<au_|}CiDW;j
z$+*NbbeY11^X;L?QdRoET8)NVEv11A{t@M^el#LA(t-~wQ<2k=q_w;X<r=k-`SMZR
z=GVqB@Z~$Q=Kc&KKmR<4kr2jBt2;qwZ4#M*li7r?YM|!cLT*#(f1K7NKCE!1A4nb>
zXTQyTK{UPuLV1;a_$ucf<G*A#xBxaoY55QAop-D8`hhrh{hQ_HK|MjJNZg#a^!jJ$
z(UyQOPfpPB!_VldKR0Q)as#xE6Qf11|FaO7_)Lm&=1?&&1C;x*lGqv8GdDTn(A>g{
z^iQTiZ3S`sr~DEQmN)>t{u~0ML$`5d=RWQm(Ojk~E|+oNWye;n83)2MTXEysx%hr~
zKb-S!A?_Wy$2{^q2vXho;QSr#oV6e1@#7v1Mi5=&oXk3lPRRK3)K6N%dpBp&4dd5n
zRkZ-koPV5Nmehx$%^4J&JB;q%t0PB3=8_t=4CNI`lFk2#fR&G|nDO{dDroc>#^&ae
z!#)PMXQvsgZa0Rh))S1yDQ&XDzk#uw2brRFA0|FJf{D;C#a}Kaf>~}qfK;m`UPWe^
zdq$^#%F%dmr8eBWb#^8${&vGWbk_-vh;<6GS-y^^_2ef^7f+)14qTwxX7Th<&oi1o
zsE*wBDU+g4OHkj2C~D(uM5-IdQE{gSSsc;=GMvPT$lYBe@B0k6b$=G|{xgLQpH;y&
zXES(p<53WuEk(-0{{TPHTIN=dGBA4cnmN!@h*MtlLK$W^zVp|SDGr^*_+_pJ{E1)i
zjG-8O;NUN&<?woLeMvvpHTV@Wia5wy@J5VzqkNb?y{}AnG!)Z;?8CI}7)Ceu+2Rl@
zTMJ2_23j`Dkvu!T1(lg^!Z|tLfVFTgF*?t~saZXo=71r5-)IMZlq?5F44*;y>a(~i
z!IJ2mD`jsky2NbbvjfbZ(~QukL!89N@^Jo-SUl>|32eS80sZoEwrzMGe*9M#e>|uH
zXS(h+|JvTjecft}R9CI#DflOofcqzC_6KvCczPY}{`G_2mKjI)N*)l$&`lPC9+H&J
z(<57I-lEc9CU}F-f8fshQ<$7x2}06?$ef;ZoUg;jEW3OO7Tf#;QvAUrd%DNhNv&e$
z-`ZfFt;NR-D>DsQ^Fl$$26<4Na|1_CZU9;u3xMthNt`^f5ceI|#dS4&P&QYLshDld
zeIA#MG;ITTivBLdc*7l9^unLU>#d>#&bw*n-BT9A*C@^siASf$dg=1jaa7bj50wfn
z#B-8&gPT5aKp1*~DEI$}@c9Jn^KLOn&&q|Nbw#*T&x6!mzYF(RE&$+EE%$g~hw1*K
z#%3DF_&|spA5)*pVMRb3^QP-Rpy|TFe^*T5D@A-@#*`d$-)}a^@fhNISIk7>Wixn^
zmRk74@1wLTl}A(ipYvw$T4;uDl7-muF{1kB69QRXRNpU=1giI=(q#sCQ*k5MFG)a6
zZ4cAvBZE)0Y{CaZW`HZq2>5g682*0a8MzoF1Y!GDDE5KjMs)jgt5)-|v&JHs?!~7$
zw>md6Hbz>26XnV7V(RgK<SrITP5@s|AHi`(^FeldU){@>RY=s{j3=dC1@2zTq&a&o
zQFg~H`f1`nn%VuyLgr}{6+AVGBDGcNq7MngVBJ$x0*i3ARW%3`tpXQseg?<WigDV$
z1I&|nQxNL60Go=|5h=TQK<Rrbu3oSXW}WuuYOA|(c7Tms-}A$4%-|q%Ezp|Vo7o4J
z8kb>k_b@KgHmPsNd{E%DDt>gR0a&&iGM{SphL6-ud7>{Cf%ByW^y=;!dLu)Kj)woF
z*W52#2o-8kxp@U>&2>a2{@#Fz@heer;S=osx*6;U-@)9>m;!fai{O`aR^VCNFYHJ|
zam5W|{Ft_YNS+3MHkAycs_jkxK|ZGJ`*lv1p)?!2&6&wx5yz(6dov+Z1$bkz1>5lJ
zG3UekZJ=m%Hx9Uwif8KOa1`fR!_1vCctR>0LF%JSnlxuGWq+2_XMup`rKh3skN4@E
z2lG*aO&CpS@IY&lub`5J0&GaUJ_yj`vT2|X^7Y9O$x#WAX1x#7+9-TBas_UbD+M2S
z2oSmcLAYmABj=n?2biNZgVVjVol&(r&IqN9aquxSwyEeH*5~9g@!~%jQN=;lUgZ!I
zZ`Fm{_N_AymEwcE(O}+;Rf~Y*gJ|l{Cr4lDap?WqEwmxx4Em{UMzw_mQE2K*di3jM
zq_y7&m2UjTwC61Zd0W1Lr6zX(JK~7{2|1e!e&@qI+g>t(!OFNVy&H;1q~ZHpRGD)5
zP)<ab8xR`V&g89fXS~mCWcJ?BWc~!s#I^?yvP)N8=61I3=Ij=%!t098FqQS$_=;a-
zLvWoj{CesJol4xpDRp(HjLZW1ODvgoaE54m$8n39Tpm67bvq)TcGI-IrU-?fM5S*b
zIMnV9iufwPy$@f(gQ-ZY@i-3a<%#3ZZ@;pFW!kvqk1jqULb&@E+hK<$2lLtYJHYIP
zl|Wo#kTpNW57Zo{fS&zkjIJ7(uTB%k!ET2+#ez?#?@W6j;~0-;_lR+kYY6;Ut3!vF
z8!&5m20eDdj`p@l(VyGIXxpF97D5>V)T3+z$_#a)p+dp%f%P6#>c0+L<~l>!qbWe*
z_(izfs)~ElGzAyjipRVX4<dKO2$$M*V*IxsN2o6ZOF~v~x2`k>{KY9?quC$4-pdR4
zC^`X!|8{fk=}fU{XB==#i#3yCB8iU(tY%UUWtm-HJ;qJy>O+#zMRX|E2EJ1IOP74S
zPW!L@qc3l-q{E+&qM;H!>hUukSzPI(SH3hO4be}i#M221J@A7YLbP$B&M=H!xPa3Y
z(t$^}cjCsw*(5b!0B0AK5J9uEOw`j3p!nehci3-;`$69k)Wy%hYM)jDSIJu7xnmqG
z2=ZdqA3BADGR#3rs|maL@kXH6>5gaH7_uVA)}qBjUbM1UpAk4%NW{Mx&~Mj`=)_-7
z`myUgD$*^Y(Kf;8#Iq2Z8#RbL(qvKb_kJk1>ne=7)`G{q+NX7vC{WQ11u=e6WUKE9
za`SZod;8BA{&`e`4e%lCt-Vg{$$8><uRsqd%;7K-vO-{{hdyW@o(uY{*MbLq1x&_-
zA(ScG%N#r70Dh-U;&Z3AvEB_gP=L!zniw%e9GcI<z}h1ELBfM4_9=%>eE)-dY>v?E
zD<@IF#=kUh>2<Vdb1N#6yTbZs6~k{c@37hC4lw;#26S&+28v$)<-~UTlT(I5@Pb7z
z9_aFhEBu!di&K19>L>^I`wB2$4(|Zxo3gk!R=R;-yIY}R&Nb-fn+#{J-3Zsrl42hD
z?T4EFl8n?r6E<v50Xq3Nl9~#b0&%-<pyoput#z=YB|wCy^)L^)KeDDfhV+rpE*+X8
zparAZHk79rPK1nw;PcluP?I+TbN5|^jrb=l&sTx9SGhQ+?gbpZxPqOKu7M-J-xKA&
z5F8+!iTm5n;K&y%nSR%m?1d`}LFtcq;8oRWm_T+w<2!;-ZL=;=nJ|YkcX*Irpo*Pe
zcN5-zPz7BjB4A$XTV(!5i#kTXq@AK?XpQkZ<hU;$u9w$`&tHX*{CH{3y{yeB)n-14
z4BrW(kH&x&+sE+qE*bb%+#XJH%Mi0k47zy^BlTDgHrE|O(!b_FdG#b5T3Ue@9}*|Z
zL=#IXSK(;a61IEvCh*S?g9~;WLQOkUu<4*JfV_V|{?{wGI#`sG@bEfpm~R9$e&wK9
z{r)J!<t7=2{p6Q5CE=~dP=GIsgr%EM^r{rJ*x?i6`&5QvM5;)Hg9lKJ&cntICt#45
zC9L-U4SW6hk;})8@W!gQXilFnP+>kG{^{pebzE$9`aJvOL?`}tUJz5EU3hH2JAPQl
zW%G0@aI)+<uy$_+&P!ASv)u-9BS#E0Cca=U`FzBttL@0*t%sN~qdRE-%**&vj|DLo
zEJmtpBrW*kSakJ~JN%ou6}fK`K)Ej*P)ukN(cY7eD{QndF_SX)bm;)E<Tt`PnG_`1
zb^ul-5jd<90~*%+hNVlw;FsU>Y)+{uds#FGADkG$p~Cy{#FukK<di(#l*E!q=WzU5
zpo$z0uK?az2E;Vf6E|GA$0W{-B<5moNO6KBo|Ed0awQwtiF|z~PyG#A{!JErKkbfi
zr~xwaQ9&!s*P-DJ)+pxGB*|84XPq@Ph|4D{qVUTPsB{CE4p~sFsSa9x+d-un!XV+n
zJ@9OG8q{0lWcJ=Mg{$VMY`$o}44$DeMPyIS#NT`+@wJ=3$aPz3vT8*n$%@{`mbr=%
zkGWRF&3+bRrx8aiG^4QKlX@s0AB$FQJwyH~&A@0DL3><;EkvunS@0=;MA3GqP^qdN
zdh)~)#aP>s!c$9_=)@Kxv-=s*j#kH~@BD<9GgQbf?mFVKtDU5#^TQ)Px<qV+5;30G
zXg2GOF<82yl^f@g#&(2c;_6Zz9BeRzzs%f6jOQB?gNA5wd42+}f2vG;LdD3*ix)U}
zg&2+BwH7yDyFeA~jgW!@gEJR?L+59jF|9QUsJcrU?HtoW3FFVu`2~B><;pP>b4-g^
z4cudOt51@Z^S+U$zI>w4ejeN^R3%ZL{*o)7HK|%cG?A3%k&a6|QahW=2^Fh=E9=+c
z^1gRqW0xBadeMf1FNzS|wq5wdNHQ+zWO2C+AFg<EnMf<#Bbse%IHSDfG`;E@*?e#=
zc{Hm5zNl0r<+(-(X$KJ@n^=@w+sZn*twojIhd^B~i!5EHIa0<QZtE*U{O0m8;^SLI
z@>k5pKSW>Q=b0%)6ory2wwhGhk)MQISVa|{sF9ITZ_a2aA3X7(m#E8r0$mPPcvaj#
zyeC4NNXX5_fl_^3{{#O)en)>+;amV~uiS?tlr0!{KTE2vyPTwIdQt6L3&<Z)d%8_+
z6OaupAWHEpI?{WEs())HE7bL=&T<ntzR3&4ee{PtbCgI^!gEk*vw<{kpJr|nr$~;E
z1PSt3OA>$lAu77QczoQO(7_4vYyS%{OHLlz;FCmgt{xO@Kg(2#%ws$2KH{U-qHym=
zdBBKngC~xUK%+x_pys&{b5&z6bII!!`M$l1Ol&l#0W^>_t=>zQ-&}*ASE|FSOes1Z
zT!VWmcEV4~>Zny)HOW!<iY_;Dkc6%em3=h<J%dF^^PYnw&i6OjA|6D=c8Cz~0|TJ?
zG6R=wI7_zLt|f2FJ_EPYx4`4@F??9+4@gWY1)E0|IqwGuRukTd3vV`qClSk`?qO%B
z%gBSj*2SQAOpw{~vII%k7SYl4M5s8~PmkGD(j;O{mM^lS4Y7HkBe)Z4)DNOyl`-=5
zM+jZ5@)u=$7b1ZaODY&+0FU)2lXLsu5X*b_iAL02s=#}OE8CKw_hD`Lr$~s*;k!&?
zvJnVV+JUzoKLeh)-oaOzMnRX;Cy>hYV%K?_L1I)9G<)-fle}^fkXtze#2vT~7d;AW
zcs_?9@9aZ7*}^lh%!x~Hn60Gk?@y7H0iwLwoCokp%n_h2IE<cF+@hgz4BcuvgbH5z
z!I6_=q~LHp*i@%QBn|}O&qH$L?u@OZNuvphaP`1pr6(}p!y=3&`7l!G1P4#ABcj)C
zK%1%n{Jhr-yzjRHcM3mal6wd`>XpM4(L%U6{01I*st3Z0jzQEJ%Wf-~L@ARz`X@@B
z%)9MQbJj1VrCJy1s@rGjmrV~5NZtm&^(&&@`8VnHn^sgi@dPTB>4fV_W;0$_{0aDX
zoQuXPfTmCv<@1SVp4`fWugw(jyN*z%ZhZ$XUUv%`-W^~%EBT0_=UZs(BFFvuSO#3(
z7GvHzvy64s)rYH713-51MWU$ZMpkfZLCz*eC{rcF#-F^38f9klgqFOZ%0b8IAwLei
znty~249U@|$x3vS8De;5rRa`f9CdA9O6TerqSEU>f%>0gaJFwYrnVBmePA~*eCbO0
z8|C1_gHy2O@m{ibe<)dWw;b?1-QcA;>WqP|F)?UL1s`2}p*;U}X6)h~JdgySbVV|#
z&0UBu7ladmB4tc&uLtonqd<zS7whG$i861O)A43!lKE(mUTzbiNka{E%(jER@Lh|(
zd=DmVZ$r_I|M=*+6VmK|6E3K<CJz7Qp8$XRTw&A<hIwTlNy2=;5#0qBK$$%YS8ZKD
z8hnch6E6z2zYfB<S%dX2Wjk>1l}KQEH5YoG9&T``=pm~T@4y3v|5z={Tf`~M58tzd
z-2VdJgD?jNc7^L0_rt~cD0WQ=ee4yDEgZhk@N4gA!}DuA;V(M0s`V{;fAtnsZ}^6S
zj*HW!!jtgeDSK2leb&xx>Cim36^Z}(z^(1=paT1Sp>FO7m^b4XuB);o?>`LUcfY0?
zb3KC5IttvKZ#(do=<UqMk3w*PxUKn>1Un+YxeQ(JPhu5k1#&dh3fuErfWv_vjB#!h
zK5)OAdo2DDvdzq+ZA*?K&(9GwtkILFx!98ypOB(8o6^zk9z=tOGmzI^3-V`50~U+a
zp^`HaWS#RDxO=WF(hAmOS`OZ#av!YV<sI@s`@TP^>rErSMso1tJ}J2W<stBCpE7rA
zv>QkJz8&*y%F@iO!k7u4EC!N$5cn-J$=a>WC#Fh%fawwdE8bdSzVByoW~UO@p!o+{
zsCb(eZ+1k@dnD)?%Hgf{SwpWK%AjRId??~a9Q~2yfr4av$>ZwPNCuuqB@%y#d0-#(
zn$wMhZvFy6E)t~w_Ct7Qb)VT_KnmsOs*p$3m%(D575Lu#9c1d}a~~NPGtUk?GWv^L
z@te3soV=aU)BWro*ybk9EczpdOBUaQ!djoqV_JK0)K-c+zR7X@QfuKK$258_=&6O!
zk<T>jVLWg1#$VLCP=+=|TEfTgm+@u~{z4Ic5v;gGHsnrvqmo(?BCziR^zx8}@xw0Q
zy2M{1&~Xe_lC{9_rW&2;^p4EGo{P1O#F-1_X{@WLHfv=%#5~Z<<)lO=5UEY$EcR%G
zF0&dqDl7Py?`hr4NKq41jBR5lMr`mSx&JUNY2{v7-UBOEAEoP7Y(oFG7f>);g}1>Y
zpU$}*NK4<l6QB6+JfY7TC`s@RDcP%tl<FR!BC!ql``exH@HA^%t|!NB&dmiy%YtCr
zZhcVKHjfNmm;{>2GPwV|7jCu_Ai5nBY(mQ~CUI&BXF_cZmmBur4YNGq%C<J_k-3>^
zw5$h1CmB%w{wmfodW%CHC2{_@PVOSR0+`JP)5S>!79tN$Qro9uJj=u*;0fmjO}m&z
zB{GHSi_6MLXNZq{aCrcO1%IJDZZhtEJ_-H)DT5(vd+Z^j0zG<iVDyVwP*x?41h5P<
zQdNw<=cN*pglQj5+Je2=yPKPI?g(eG(QdBocYo~gaW;6IB!J)idCnGS4M3e!EL(r~
zlzDPwB)3w)l1+c(YtC2M2-WMpk*X$h3+3zAX#BEV-lAbA=;xqFBlXTwwbt|W7q<y5
zL0#0)zKB>#dZ1i>IsAn48is|Qg)7dd;oqYTFn-=E(9<#pL|Z!HjPrFkqP-c9+TO>n
z9PX0+dcCYaR%5+OFLH~e?lEQyud-2TO+a}ugP#sh;`m_^sPkO_yR3I-JT`9vKC(u@
z`R8Wy4?jymd~*O3zo6S<?x+(@SZ={no@P)FW}8so(lM$tuErDD?2A@<&7)vm8F{>X
z+WTm8BkCI_;GwZb7&*<i#>OUK;{C(;uvQ~pS78N4ijBxlb#wN|ei?2>oH)*Xb)8)@
zYmDvEzQsm;m1p_C`g57h%IvzkNmx{{4c9)92cN&N-0$=_;~=F0YMNv~*$P{8@--Ga
zsdT_qimet#ISXj=^<TVM$9de5s{d%>yI3mo!ICHGC5XgN{-R8j1zBF`iE>VzA^K}0
z;MRC`sG6#ReNXg4-|yqh_aYNw<nI7&TrT5hCd1QS?M(2_E0pQ2sAu<CiL;6C*RXeX
z*y4Tv#5n!KcG!6F0WkB~8f<gG5x5H40FI|1=YvTJxDj<3_*Ayn^Eqacx_U2Ye%0S%
zi&s4@J)uO$Gl%is@*!H?kz_X5A<WY++=XOLSd(`;XNcMhX_T$#OV;0h4jC(7P&8dT
z-FtUHi_{b-drgU0D5gVbr^~Fgu)|lMAA%YM72wivKD&3RKXW=~HJfqt0Ipdx#6%W|
z;3%hH;Qsv{bMnyxmfx5I;#VHz=HK-K3-@Er*$;2(Hx-{DA60wN+_nUZMI*cDEqso?
z_4qrj0!QdG<|XrDwKq>f>nyDLc$9>@Y7&dwWR!loi!A*&3oct|$bBp8hMQk)0$<MZ
zpnldQetusUs)?-t>$f-KlcPG&>GDW}Nr5SQ{A(ZZUZR6l{RDABdjtr+vlQP7Jk2T6
zUJ8<=Zm}QA)R}ty*Nn)#>)bsjeAwQpkLCfN<*3O>Au<zAuvqE%A8jw^(#Ni=k?_+_
z`si0Bo8jEZo4NlheARA3Ru5$1b5Vy-`f&@K9(({Q@~7iAZE-eu*$}gZ+Ch!%1a1?5
z2WVk}zy-z%hh?il)jLXf?eJpO1S<o&Rur4q7~@9~l2Fst20JSf?C#?Pz6OM0g|7|l
z!thq+Am<zVbJP^8X1dfDIW|(}sU)(o^|mnc-A-F>NYm$w3z3M27;T@r4|m)h=1D8B
zf;o-vq49|y<gl(2O1*CY!vAOk>603GxN0%nvw;tn?)?C4o{NB<y&B+t<N$l=!WcGQ
zwgEKjjbQWDw^;4<#^BDwa=c(~0ulV32rXT=fq8zzI5olo&d^N9s~p}i6ZPd>Cu=9X
zN9=Bcl~z7i<3%r3S+9kh-mkD&Y`&frZuv^Ty%R!<cVNo1%7<;ft9Y{AIWY8V5$H4P
zC5;z<qU(kmp)xgM#$wkIvMULRE;eHtvomoH?+d(g^da+0Nq{@yWz1U7XF%ZV9^7UB
zm%Uum!Q7S0U~4pm$qac7$X_`CRpmSJk&g$#pF=NMkJdmarP#-LeR3{7viBU57WbZ8
ztoneM%yvQM=`I!$_iISa(MaCR-+@S}Vw9#2o5Gy>%{;O13edes7)0-iCLuzO=-OO+
zI9DtPhzu}P@ccZauqYm;hR9$%WDoD&ki}BBHpAn~EpRvQFPr&1fT{S}%--$k#)aGB
z*?E3(xTO3Q8)_;K9fA)s>P?0q1Ds<%^dEv-LOqzLm!>!YA@4u{LEO;EV<aGU2Ri5(
zfNu5vgTI6(Y0>Syp!xA4Qup>1xFMg&oApi>_Hk^W$uBV~{x=FG9;Ef-oyI`X?<o~M
z0%5;SCsLRzgHJ6Uf?w}N0M>B@(hPNM__kIXU+=%pIEOD}|9BY_jevb@so!>Hj?ob!
z+O`BVCaeXgqYtp|A3}jX?t!LmCqeOyB9Jq86`-e=nfvcdAzF)4kw<4N3JtA>KdwuV
zFL763%_%Q(vC#!u?Mb9bw2H(eX#w@|M_91$KNM#V$tAULHpsV^e0op@EpF^UT9Y?H
zWkn$DtP6)*`(=?aISp?4eT4x1gFANBu+2fn#O#YUHtE*{mMbQRbeSr?J!=Nod9@rD
zc6u`6{Ubm?rUWXC41x)j<Dlp1D$YIr7gROiFY@}a80o&sgB^#?!BgB`B%t#c7L9u$
zkB5A)Zi^b??<panN)hC_yFQA3&Lz(s>habTZBk>I0s`<eSa4Pc3Li{>+xUXPe>)A)
z!jsuB+nf(p-_3y0_mo-32xa^tA`vI9>I6Oo9r*jwXtF7?haoFt$V}5kIALsxNcCWl
z7*Ywso+^Xp-3u9o8Gx3lWFsvHTNticfHXK_@YW+;<dk+6_DLwA6SF>pL%P*S*|`E;
zFQ0`N2Ohe#TZ)QnULuOhb4k6xMsQ4io_Vr_8_3Ja26Qfu>!zv<dA@1zhn+oaNJ@a+
zNp9RZ7AZ_S+JJ+OTmYv({KYNWKL{|W#%DQ)@%P$YI3_ZREPmsRPfz&c8#3}N4jJW4
z9@s_SHWZ+>?LTqioFZhp?=|L`CnCY&7x0c&2#mGvMUq7?QR~+%bX)T!?BD(iT`Ijm
zI-K^A;Pq1^{--2vV{hZ1-P5%*{l9fR<|A5DDS-Q_0czL1g}N&TV0+e5PSouoD0$>Q
zUT8G{_8o1+t<MhQ{6<yG{?o&^-o56=wp}C0RG4hNSx@ACo8tR=4xA#19Xz=W7hzzC
z6nPl^8mau<Ns@9c(dvu|=-hi8PVHWb=DVFj_a>I1uv49=!~6}3@iQTMOOnXZ+&|Nr
zxs<rLy5j(831S+33a4DFH7{xzf&%=WL`Fp!k3>oX$xSyo#nF7weqJ^6?!;r@B`J!d
z<A+&+{9fF>K?hsL-l{u%X^5!oA!K9NK0FohgRINjz=^Sz<%w?5CVDc3WN>8-&X#?M
z?G9F>?Q08RR#Q4G<8wq}K3~!16QfA$gguJ8F@|DAvq4sg581t{j2O6_C5d4&*m`Rr
zv46h^Ph@F>`tu8j{Nr-+?8FceTF{ASU%$Z2m-K;}7sfzUZ5J?di!tvnA7hI@KgKmz
zzH=)Aj<C&7C5ilILtJcDg-c$)CHhz9avDk>@%X*p<9)FcRB&enR$UvyJwvV%9|1lP
z_TvgVa)}=qFK&Z@yT(yKDnWDFy-}Qe9duc<lYKA#9}zXZMbJhUd`*2ON%=K^?<H;~
zGodpHj#MRqsekd4hn=Lx+n<TpuMNunW`m}p^+5bk4pUzeh-EIFX?W|<$HWimVBSO$
zk=Y&qOom6W<r7V86%))EWQJ(7ESG3}51}r4C8T|_nM|l|WK_eB5)&6bH1mcLaTdt~
zZ$I`T=o60C%0QH$unlgOo&v>r&+%iA0o?7&BYUq6ll<;CSU1y#xK5Oickv3u(e^uu
zaH=9cw#%7;mSRRtbOrE#C<T~}Qef`&cy36lF_WM%i*dQAjCZ_LVpEs@1)m~bv8{6r
zIPre1oQw@dv;~QQH=5;ir}YbByyPw!>U~M_pT59+IfCSLn<u$a8iN)(t%Kj?8rZVO
zKTxvK66kO%4Kxc3;cW-AiB+v1QL7##0tz<l<H!Z%##}A(+SZ7Dq4Am|>d4@yNqd;z
zmvh;gXG=J`H7h`voi*qCnv<M^uhf942EqDP=1jKDe&FpS0?rFoaL?<AgGkF8HIL?Y
z(84bdpmEP^x^~!@3QhV_ua^;Y`bD6|<2<73GMiqj3n2H;Y#=YLZ-73VFuFeJ4@I?q
z1K|LF26r^#CPT`;ml-7MUuojn$W(lz%77@#?FO=&caa5x_lQIsAJc!un0?5{W4k+s
zK(T2FC(|RLE@G`Q@EfVbA!6r&_KY4V2VR2k?bXb0xhG85Z(nYk$wwNqFb9V0GN&6=
zyQz1l5S2Rpk^HusLA=HWNzuw_DyKLA$<gCbt?vfRP%=U39l}szX%=JP)&qq{Dv4^H
zKAC5qLnMncNZg?!+-`9hzaLzR!z~e65p0PQ7R+EG&Fa{}Fk$x5xntn<QDvs%u0H$U
z-3y?uFqS=hJDqv=#0o0tns6JG@3B^AjoE0GwXF8RjWqhbERJ>@rfaOC$zSiibWTnf
zJtVM(ik#p;{p<2nxpWA=66b-ZTmxLZNg8D)Ujc7jXEs!4RKSBzPvEvKK4kxmEo{Q&
zCgPJr@xKeL;J-L=sPN`AF}hJj1ZM5!9Nfsq?U3KZ8IT--!kN37w2Oy1{&C^c{cjIA
zIddu3`Jfai%uc`?%tkm}=P$8Gt94jS<sN#rJsifbx<Iy3W9qvloK|c}r#@4zG@9w9
zZaaR_L>W_DP@_nVI(CuW6G13*;X{yiW0>`_UJmaIad4#GTjJ==gFZWT*~a_(i2A@4
zsC`!gzs)#J%6Qwzy7{wNkrg{Qa^N1ro7R#1**QS+`wLKevmHvdxPZdO6yX168M`n&
z9-A)M52(-XhC_>txtWvkbi2n6IRC*Vz&y;S0W)K1SK9$Pw|9gp`5KWaJ1Z*Amw}|6
zqN%I02oYXaj?$yup-6io$Wbdq;z5Sk`2Arf($EU_JADP&`N=pE|AI4f{}Su@Yp_{k
zA5mKRlhqdNV;0H!0FAevptbca(-ZX^l;kta<VZBbSl$6QzxWUlseEh^lmSJY&H~xz
z3(W7-#?l@9sURiU4z2F}MuWgb8m7ZVv;9IyPVP%;8|_A>F$THdph_d#ji`>0I7;!q
z03|NTg316Jqzt{ekEQzH%Dz5Wv?-1)T{=B`9Gpp(@N3}7Og?zTVjT`qv})Mh(9y88
zWeCiM4NUcbKjX3rvWIrZ;F|ZwIChBz$aqpmO!qAX*-odyXYc8mbj<{(k~Wclht|WF
zX}1y8ctIDrtfji9TM(a$1{IqBmmGK{KpuCUCLiA9fOOxBbY8PMx;`oaMTL{0obw$t
zZ;OC=n|TMbJi{N9m^Tv>-#GB%eLvwn-hzwxpTSM}%UE;OP|kBVH>~+?E<5wb41A>c
zImh*vAenh`7w);*gR^SiF!yAC5&o(OD3a*GUc5Y?sT|JXcyC%nKL00!<Okv{1pVA-
z;+al*K4b_kahpxk@224gQ3kx3#e!s|vm+<p{}D-fuZ1#?h=8Ocir}yKaromFWE<8)
z5>+XNwT|s2eiC<~%)=(~n<oYj`>h5~6H36QO$V7}8^!VQyp`-!Vk;g!^_K}?rihGt
z1^dK(FDo0`i6xfHG8uPDz~j2B*mw1Ltf*#QKViF+;LiI<ZE&&0yf>B9>uELZcoS<j
z;TujvYF?7m=Hs-H{~q{K?L_eSr|BHeHIy-+iAB@(nUauhc=MeAd(G<-dAP(Cv}-&h
zj_#{K+R=-o^szEbzO01(tetW5o(zssR|n>l+{9<D#}NgAFMz*D2Rj>XVl3b+5c};j
z7A^S=;<hYi2Gx12i>5htw_VSb7fqpJZ)YRL@|zY5pYYQ_`E7J!=qXZ?Jw|hwKrAX|
z$`gtFism|ak^F*MI&<ZFlqs8!lN2#G{<%NYm}d_P#D9|lJ#Dz|{%*Wy_bukLyc#CE
z2jS*pBOuU8hiv}yt0988ff*^^FcvyTmOqmPem*QXzanbd6V!wvMs=K-*AD>7*Mxia
zYBFvvF6>Pw&4!jFV><ueMkIP`mBq5lJi5JZ22WzmDP&#YMH}xdf_&e%@q{G{kotiR
ze8ToAJwMIj($iYmmPm1Yx7-7$1Py}c77xfF{VPo8s6Du@ILMv<sEoLu4*`LTOQ4o#
z0?}KkYM!^34+<Z9fZ3=6WY64vAR^!hoEN~7rQ>(NxGx3At~mnU{amuJQ<vepJq%g}
zbxg0ZHFTM&7@EudZLzRDoZ7rF;mLXTA<LL{nmr{+7N6eDlltC-l&0Q7z2Zq4RAGuT
z_k_U1f91*LGpk8hoe|tM5P^3J|3?b+bzz*86}0LP#951G5aXZPAf%IJef#Y={$+Mh
zSTBjKIN3~0Eo}j(@+I`}dXMi$EdjOTCxK$cMJTYkp2!Duuq`)d!#Pcu%dP)Rdb&rU
zWRZx4z9gmsKbP<{9R`4=-*&36kweanT;?qpTLuSzo4{3No9WT|&nPqE3mkj%mZX;W
zQYoehUif$#?&Ee-fx=OEIJ65U9GC(A$qkW;r<QDW({$a5^O<?<GiLHWZ3f)xSe)&<
z9q4smhk>P2px%!KiOd}Ko0BK>*l->@WbNhFZPNyKKYV2^#xKIDR4b}B-yPM2*<x;`
zCvS$86E#W}A=gISQ0$-;Pue<&EN;rfzjvRZ^Ydy@R<;l<G7e=mKUR>Tym;s*WQ!D)
z_}TrD9C%aj0Ej*J1J=h(k(_6bfXI%yFk0mrJL(I`Txoe|&=w8?FMVvtv9EwAd=Q?F
z-UhrQ%ke<&PUv=QHWbP}078aOFg{8O?B@O)l<?bz)UEJGYIDB<&ao#n)Ac?LSyxJT
z@*L3Nd+RAzwt;d~YuHy4&M=m2K<OF|%sn>@J(B83xDh{0DYQf;$C6?0nys*>g~4~)
zuOMTIC_JjY4k}cez(vcdu)dTj@xI>;US?@R{$EB|<ikOLUP{9Kat1*E!&@9P3ZP;{
z6buVi23&(EmK@-58+P1)tGE9}8;<v(+UY$AMQK@fpIjHp5I2XfPq~xMcO&#XwjvL6
zf5QP=Z4&YFABw3FBx#b<e1YU+9?uLiYyU%<_e9``TQVGPnniSO1tN<LYhhit6ihQ{
z13#ZRV66aSBA;!HSDtkMC$!Gv6=Nm%@wz}}nbjx`OgIC=#ZCZ)iEMar&sT6Px*2F^
zZRPx_*CSFU`;fHiI#dvth;~2O2R1%9hKwUhr?XocAmt|?47wu3{?+}1w5)AOuz?MV
z^*l%#e|b-PWy82n^#FLiemxx6Rs`!5l412)EwKO6Rb;Tv3aKw0f+@q-V8Fj#miu^&
zHIYA!+q0Cpo3=XPxFwf~?w0eoMQuG1xA_6a!woT+Gy)%e4lypmRv^q<lv!gsK$=U>
zK<>K#(7adj@QKxY__s|GS#AG>;{U`TlUHg;>!dTvRc}NQpT0u>Qx{RJ^*v&(sX(+R
zoXL@vFW~d@7Px0AVs1a;0MYO+SRiJBr1CZ+@vI_vu{sbAh?mz@%<cfib{aVN<uBIy
z&>w8mJU}$h?ZaQhRPk`v3Q+5|pUnO66F=NFgq=QpW4vsSGirCd$d}e0c<o5*w3ZKs
zpGUqRgF|^Jz+x25k-Cc1qy15Y!adZYuYs;ewWFI07NgjiHn?Jr23GsiN_G@_vOXcV
zz>sDQI6X5QTXMW$c$hTIjfsWT_LrdFy)anj{IlV?gE6Q-p2tzuDrfTXW2_NBg`a@M
z`0DT^d;iL8CU>lv=$H?Y&7UzYk2;LI1D|sO3m4P*hqpjc;)RwxoR5xjo*=8u8tB?|
z_GpK7J31}ximD1cP^*eKs#>IrE+Y<#=}*FZo7;eZUlV@h4mgrG31{y)2t?HvfVYDz
zG#gZg`gK<9LTz!7eDoVs6j5&Y{=*m~mvFfya1D^Y_yZ>#<HzLn0^D7jfEB&6aQTuV
zVz{q{xRxI$`a<eNRauLpRkxme8VH8JyT8Gm&yS<oGh*Pkq)B8uvk{&2Fhw`oe<F5J
z4s|VlkDjWlqQG8R6chWB`zhCsW8!R#qj;X|zXy>_sySo>lx3jREives^b_pM4<a*G
zreIT5J3MywCRZsm1yr;h#CP{OFxB>5IA@tS?n`hb7E7v#Ot=nFF!UsnxyfV^yhxlg
z^NC2)6~-iQiq7_ogZ(*U@WP&1u*E45%^jSDLeDHj)KeYhZ*M}`v%aI0k+&$hiihIX
zF(}5-4vPMfCR^`T5`J+-;@!5HI2wK<!~BILT6#Y&-#o3I&DuzO$W<~he-`fD*3URj
z<$|9}jmd%@LGG5BgSb-mFaCFb6WMJrN<t)8lG#T>iS)T1JQvy$mEdi7==KsO@3R=4
zm3$xS#($soWM3j#>n&*efqBS3LJg(FRHDvv4ybB!2a4jJLSd^ckg=>Hy40QqBYbp7
zd*C-*<Lp9`Qdg7g06VI6Vk?#L*hh+VM#=8{2c*XQ9F=}@l-NX6gVmaSP%yxOEVxy`
z+9rzN<>5c^AN??*b8w7o%$|#n%{oeCz5<ZBP?(*x2*9MRgsB-spk8|wMf>#F3UzT3
z3O&io{igKlqia-?T?@H=FOZ2>FT7u^N?y%=fh>C`P_j<}e7SEem3aqn|Ne54_US%(
zKk$C~!rn&hF7lCGU5ez<spYujt{Qn#yM+)pUy$-v7X$_5;d74z*gK6DAUAgvTbTBa
z^*7uH{7W`~kjJ&Sc<>TjC>{s}1h?XvjtNHX^8+N=a)P??9^sbt4OB^afXa{DrZ*?A
z&~1CYL1wK8(uZZx%jglQAN>Sf&dQ?vbL-%**AjI8^zM;F(Np63Qjm<k7@>=$|Is<`
zOgO$~6;$MTIn>^DkT_O7BOi{x0a=q{cz@tda6&>B3+pRG6P^OlYJClwHywtj$5#R)
z89tz<Ap)jK>!*3G0T9l;$DUALg(8gj>9{fnTi#2gQpavkubW-;Zod<47(Iy2?_Nl{
zyjP%9J9pxAX9E@Ro{4VnUJc9gb4iy~CE3;zJw0FiBuC`<=!{j@fby(L5R;onx|`%-
zf}1RJ%wmA_kz0TRa*5KBolxPpI=fX{0_s;B0m~<D!3*ijVEF7n>?EEG_uMbQLG1~k
zL2({b3(#O+ynBf%<NA3DTj!7@g*0-fZIou+E#XOZ`A_E>R-%IGyoh121i}SYxL{@z
zJ@Q^1Rrq)Uv7bYPb6S>cP^g4$q6x&^Wr$o_r3-hzHX;#PpV`{8njpnKfE}B)iCgiC
zkF|QegJ_Ozf^$6AG9S$jfE71pHh9eKfOmM;z`zqD?7u}0CS;70B`5bWw=Emtx^Y)F
zFTn=gF3IA}xZOd|2DrfJZ^<+>BbetHx`c$<4T9$S7;2yrh*HiLQUSwFbX^!mrIEAn
z?Qb6VP<RfRv+4odQ}&(Q`CSJSjo(8d%NSCo{TjZUyh`fSPhs2Oo6uZtC%e<%2d4`O
zfDI8{AYyGDkzRP5eUlLc?-=#t>b->^Sep-8MN|=$TGFun-z6|+(A`io%M8f~uBWeb
zTIhlkgXp-35^WVzrtjSmskF7D8~nS_q7_4EZf78=-#-NNt;(inq}XX+I0%>j8^;0-
zu2{u!j6D9TgbejfaBNK@eq`?he~L;ITF!@qrhTI$s&_cguloZd4|&#nzyU0Z>>+`@
z!vL?YfJ!cUxTh}@!gto@NiR8U^rz2^RJSr-C8fc5E{Q;4*HU?E#e1o2sV08w@S3N&
z+>m6y*+er-)nKu-E{gITBMY9}kgVTZ$R4jn=yq~3Ll3GEyZ)s>xxbgxr`3>4wxP&!
zyE_TH=M2;10%58}7dadAlX>fv3)TQXX6(Tj2%q<zq2*VxSR+d`mY;_HV!}k=d^fB2
z;vt+PyoqR?6Nj^0hd8;d(~NMah4a|71I5Tbna-qIQXPj@QvU1}&miPBsSbWeBl*?P
zVw=loNAf3_cx_s0EV9Ihj$A=iRRwU(u~uS4w&AQbsZ`<A7_^_e03PX2z&*Kv@ODEI
zuqm^~G11O!b5#`4UVI$PrR$i7){S6<{}#SAxsX*GREP270mQHOAh^rjV=nsDl+0g)
zpv*Itb!|?<J`#snzVr^19<Y_CH@%Cp&gcqt?>)y8-1(Ns1j*9tldIAG=b^~Lc?Oh}
zUr$obLQ-9vg=%J-gNM!=z@yZ4Bw=!tBz@Y2<kR&CUtR%d+Byj(Li34ot|(LwD!|L5
z#PKpIK91!JH=Md|9kfOlF(dR9F04|5s?+(e4ecAD?xSqn(GiOuycPxA*gnQ^_%EkH
za|fHZ!U)woZsKXg$J3+!BD6Dp8C^gBDm82TOZUCGg0zZ{plGHVe7IXd{&N11?rI%W
zHFTESW@!vde(oXh`xoNh(}-K%9^yv6ad`MeHG4cH7f0<#fpa#w;k(z1*)&^GHch3N
zXb-i(#UJv3?`|V_VVfI1d$<KFNxcEz^dd0(Yd$`nx!2snTbpTpV#6`*z-+y>KB~wy
z<ng^dO-EOMp+3J#sYLBrYGHAj?vzhKVW0u-*trj_6mFuk0xU?#A6Hc2<w1hBmY|ty
zN^$ME1JK9z8cZE*!o@A0VCz#3i};6#oJKQvt3bdPD{)S)*?GLwcbEw0RD;l}wNStN
zE);qDf~>Uu#^gCHVxEQE!r3co*)Pm1d=HHR%~NYJzxRLaTK^<;QCEv5kIK=zF|E{F
z`3s%#HI@o}Gsdr9M<S6B9b~hI53ShLOkIxdCic#;s0h20H%r#TpWhdQzvH^_Zf+2e
zZGFRZEE|T!5exCs^55iQ`YdcTVaHr{<C4V{c6ghJ750=az!HWvQ0=@Yvmog((V7>*
zbv0EZ{83kNa<v6k6SKt~7PG+e^NnmWBV*<d+~CU025Ndfk$T(3VUPA$y6BY-T{x5m
z%kB@Leb0j+=jSZgek2c$c$SmFAB8B3BT7mdw!@m}HLyfG1B~mLvj3*Dr-hr2!ll90
z(@xe8vT4~*;&=5FXmk5R`0S2jT}v+W?b#K4?d&4pd+#U_$WO#Y*BzOh_<z&>gf>3q
z5(YRPqd3jOfwh!Y1atJY8OyG4qJ6=ST8#drHv622RP<pgKKnP-ew7Q?b+4J8h5d;5
z=XT(|@+fuZu{3$A45f2k5&x+}@WRq67#APP?$Q>YuJ2WN=)Mg}%DGJ<r*~1k$~O{|
ze_?QzP!{I!6xcNDA#m(wKDO6eij6zx60zUA@aEThfynJ+WT8$Dix?w--TByRPkZM1
zvT@LNLWnt~@CsfkuO-i%vZ$os5el#O(4~snG^X@8QF^CDq(h@f<)jc<zT-OEIh0H8
zZM}#x_uCLhWeXTKw-X9~d_nAYUBfB8HK4E|iPZmEM%Ej=0O>BtWW%nahA+Hf@O{<>
zlivU5qs%(khr`<VLsAPnoV5aTW-kT%YnHK<6Su(G-}^vxMlw*99pUo+n*!lWbr}U0
zN;0mF;?`Xpl9iW3^{(|%W+gw}d6G+S@t%=uJO1F8LQ0_S=|kZ4Ta<j1H$a)M<;gA{
z3u7M{Fhb?ZMEq3^v0vx~`u)4eLEm+F%GhUmFEWoTozjE$zCQ4T<tg*e1;&iQ7b~`-
zYCUdWF#sa;zL<Bs?qfc=eTM2%H^5D=1FUK2OrSkh!d~0;kzF6vYrf$lp=<W8fWajX
z>5Q^})NbM_JuSVJ$S6OcCP%v9Rh6emBIOX(czcB%ur4P~Rq80?iyDcpF@mm!yKrsV
z1)!d;Ld-wS03{p4m@l(LxPR^vHs)RwcK!VYNGV(a!3D$IvPwIqiWiADs>ZTUuWW`w
zPEY^Wz<K!f*!^MrTeOT)2`w4PsF0#k-}{`SBvC0Og;W|O5=k;z8rpklFN!=$rF7rt
zoYK@Hk))9F!;@JE55Mmpalh`*xvuy70w+K0#o8W!;i4^FK)>cO5z_AlR^ck#v*&?0
zAz~pLKcz<vs*GTU&IW26wv<|Q16nmQk1k*h=`uA@Qo3lI>`NA=awD5TNA5#hscD3A
z1l`N>=D%>B+9-Y?RSeA&Mwy_kbC@4RDKI7O0#LGDN6wsa!8ZTNF}1i8mxwdm_EEz0
zsu~OY)W6Kq7#ENizZE<gzs&vJ@d#vp{RHIxI%3s4OHl6Fhm}ux;KjDxY?_z|b^3D)
zzCG1Ow`yvW??R_&!QEAK=ax89ku#611h%9_`w@O|D;`e&{*VM+4M$n=MR<~(2hs{n
zz@Y`AFur3s_+NMmZdz6WRn+|PKK~BV_Cbr2w)Diz)iIp=w372Rfy`I?GuZ0ILBP9a
zz}94`gW$jZ?8b>!F6&+nV;trN{A?b8*j<>do+At7)&1(*-P~w|;&~)-QlGBH7olFL
zAP-uZMo9HAwysSCKi{O#z5E4a==46ob?l>il!j4;)BsVR`<wab+{&F+{|d!VD8TiS
z_rTgU&7eK@1-Lq`0|fS;fQN^V0Q2u(z?%<MTv)Kc2Uk47H95^-dQz6+TfaQG8(Uos
z?vHBY#QGs_*uxzzc+m|4Jc8@9va<yJtEb_g;0&5>e-BNIzeJ)kvLR2$klIHYQNuMW
ziQoGwvYr2mMx`WDl+Z_loui1G+XPCNOvir@x-hA6W6a$IQJ}AK8XRdn3qqDjFjJ{Y
z%#RcAp~Rw5s3LNieeiV*E}R|9t#2OyatW6?X;pjfgmw|GD9pe!iyfGn^KMwO!VDiA
zz5rg%*=IN<@{02*(PO+M{_u}|^ryF<M!?4!#xyE@6L~FtiGstB4ERhVmQM>oLv#;(
zn`(+2*9m&m!zbZ*VLM9Q7LDC>WWoNiaIorP41VkV44?aVA>PdU&4yh!B~k@9ppU08
z)P<W^2gf5YTqK9hTH*@aQwU#kYZoUxS&l{e&vX4<5_R3WJl0pG3P)($VE1M}JfNJ3
zZPbt9J4@{N@y3Lf9rH$tsY&GHRt#K6-Km4$5i$`duyb~8*!acCmvBdRG7B$c5aDm~
zWXAh$l=^g-{~@W9i4Tm!e9ta!<hY=fI5Y<QMK1!;m5a#%=P(el05RRg1k;zNaPb>k
zR@=rE?Ckr(b`17mjoNxHOw5g2c3#}@%SaY8Jkye~0ev{^W(BU+ScJ7dXy9%iF}`=V
zJ}q3g6>VxYBu%MRq-xxYS{hnV=G7}AA*@U_@2YeA*2{uypZ5&RQ-!Y@tx;-PB-pWQ
z5m>XYoh#XF&+oo|8K}Q80y_6YL0)DUnZ7wzu&i;8oO%U`x)>%7S?>(l-P!^!RTr<)
z-HyEknNsXM7w)Ob3ch)<F;nO=2+Hps<EE-SxsTDM9BdcImt^($7F9VkOY|HZeCtW~
zRZL(n8x1<`p$>_7@Cv^-T}WL;j#DuQN)F$hB;z*6sEYRnl)AhD6z=k1-;8FEMYjrY
zi<d3d3kqOfl{Z4(Tqkg+coP#MW{3-K&Lpz8_3_J)VScnu05eQev4*q=*W&muw(H!9
z&)EiXZK7&exEF!aOZp%(u@%g-X22R3JHcD+Ge7E@CCv<QKzgU*Xn@-aQhmsZ>V~9{
z>`nsFisiU_vYvG2JtkXwr0}n8^5kAQi_+pvz}2PRWKLoi*@|_@s*h3v$1{n$nEV24
zpLhqw`VzocgAcfA--+ct^0Dq@1%E=jllizUAFtr=X8OPH!E>A;hJjnT&EI5k@7%NX
zEsZ=-A0i1%o<)H5dyj#h^40tcpI_2512fV53U7KM@E<a$V?`4#ic!hLBzUuT4^-Pc
z6St_0k)Xl_)O5uFiOm0wGE~ChG|TO{(_sZJP-r5%^&H4Tm1-=s))9|+%m9%!A7Jn|
zCBRsi0f!+CsJH((6VNH}C1#%GV#;=dIy)t%wBCXdH;&@}eVc~c)?UQ!n>O(q54E!e
z(SLw}m_2aX*<-kXbkU1%UUCoq^`@m;F%5`JqUmOubhU^H9(#9#&b%89%jAmD^ruB!
zs6-ObGBrXOFUz2obORng8xB%HGJbbnmCU{h8N1^Q$<O5z-jg(_*vrE{azR+HW<Qj>
zl+K)as=&nVGpoNY_7%K&Hv#DBJa9(Vx9)|lD;Ih{7*A_M7`cSwWplH+%gBJOFtcEr
z*(+4*v?<KoaGXXRjG`v#cJ$JZU&MFh1QqSyNXo9Nz}89&Qnj&)_-(ZYK`skWu2d2f
zzO^3@lu1H0qRM6W^{~mmI)RNiOEwk@;b3ch=JACkSmd8Apz`-i5Z6}7w||zw>}blw
zI%Naks&^0M^<{(fi*vbK?b|_J6vr$Islq26W`g;O^4MSL5N3;4a~A*kk)L;-!i42(
zsg%huMFGvUF2;{^Zkb0<k7kl@l?vdUz?6I4zKSlK{S|Hy-i&gr?n2F<4n%fJ6`mhG
z0&QnhfQsj(*t6m_Uo*ZED0;7Cr#|H3E4G3grDYKpGy0GVuQ35jJ^QgwUmU}K_7g;`
zx&fq(|6={sd%$?43^0=!=4?jvz^kSd9P;}<u9Pw5g!4Dhc@5II*|LC6tLdd`)eSU6
z;Fk<FdJ^f-PO_;Z1j@V)0wG#YusY6R_U9?1jM>Ye$pVgCnb-)=u6_bTmPdj?)z`R>
zIzfZgf}MsNXMvhR5<K<4R${GUK`iDx!$PwUf-rFxyd{yvnmdKLf+h_dnzjPls;}c0
z>WhJ;Ka2RS-`t>t*=n5ca1`&pF~QHAKZ97R%F|i93`nDQ4aYQjQ={*Hse|B`ntV8r
zJoCB<pR5VTiGJnOQpbtQxO@#I>cqnvz5kHBVMkDV_5dhZ?ZHK+?;sL+5um`)8^o6d
zf&4rrSOet9p_B=-aC|d6?i&C!zJ_qQHwJLyPZ8X#y$JvMwS;@5uZ}DGPlM%W`#JZk
z!(b>qiitfXW)N<n%|v{$h9dqgbcgJ5AgitqWfV%N-MAwao38?P<lZLy+8+4IcokZH
z`5&61twL>;f1~&b8OYU)lQ5qOX43g5cK9?%VxQ*`t#mhHde#a|lxQ$^hcw|6+vg<Q
zSONdAtzfFlqB*1ArA(mIUEEociwjHTajVHsuH(lxJn@eNSp4z`=lW?H_wDg2khUfb
zM3&ky@k^$oy{g;j@o&ncF~^cJ`F`}lqCgsa*paThkVvLAw8Gjr9~7#nO_R@3qUkG*
zvg-uB2goF$UsmDYN#poK_Bbi+mL*EJ<;i&&J~UtaAHK9gR`Afi&Xri1V3+?51GOC~
z__m@B(>DJImieC{KX*ocy~KvEY}=jvSk&tQ<F_h?Nz3{}MAvNw^r<MYvf=U1n{GiM
zaRI&3EntvKm8j&<?gn0HJH7JCfF4i234;sfz*7w)sI&DxO>1~er=QbC`LX-qV$UHG
zB=?>uc)4QTOcQb_B8WJ8r;&ifUZ~+90yK3o$lp~CE(s@qJD3N=TE=kW(nzpGK8Oj}
zQwq$j(m{pjr@P<HGeG7~8NBpO8;DcUAsc%AL1sw}&{NpON#|)JZ-e*rk<Sp_ZXHYf
z1lt7Tl`gbMI)jGn?-wv|7I5qSG*o(h6LoDnK>q#p7Zr{K^Z$)JNwyt-WB90y2fCGP
zvC02V5^2HEjQhMx;FsNXxOn(95vsiaZiv*f|9sQtKb@$=Nslr?`IRLgxJZ#<rwKT;
z`p@j@xG_*B$TW}rc>*qIYY`D{s^0clG??WX&$%WypaWNY>7ZLR6;87y*WPWRAEvFP
zI7fyC9rA~Lv+pBA|M$rKSPb>It0LzFY+m8~=|CyI4BwyW1q_N}K!wi-W&=_n`s6XF
zdASg)TW*1Y&y9%VGh^VNumj|L%V4kV?!+^SuY*s6g+POOjpqPeD1KR;|H0N7n9u3P
zihpl$%!x@XG3kw^g*=%$2_J5)cp|cXE=iy4njjfE<>a<m6`iURp&i|`shh_X?9NC*
zZhn)<D0ep9`{oF_HsOcz^+q_8DNRtGH3B-!9^eT7#}Fzu6K3sp{@i?R#-XzTN={xQ
zLajAi9U<UH;0;c*N!w6Vb^~~S)|vgbz7|K1r-E@ccLO264DN)kiJ|PHVths89`jsM
z5wC1J$o#HY&t9ocLIDdz=|JyQaxdc<j>L$5eP2ueQxc=Icl0wiT&7TB;dW%ESWgez
zjtl&S9+aEHtN#r%pz)PG%qrm#GE3?{EY8s)%W8wT?~{#S`cNX<zn~LW2iD-@+hw5?
zUB(%e1sJ|mZ~}$#H#yVj2YAEs5fGSvn;F#5#Rl$|nIDe+_@<&gP<tMQMZ{(rxWP`=
zhBP3nv1>HDJciUJ1k(-kKTsF@2>PFvCCzO$LkpGU(3--fXww#P8hUOm_-LVzat5rJ
ziHK-;R7;#o-kDFtvppeJav%zs@i=xU3{=U5V6G?|d-#UpfI1cMSt5*cHt*zTs~%$p
z`4{*#QE!+8o+-{XSpu$ow*Z#8BG|?E1-2FuWchyWoaXo~u3UO18*^F>?aLFV{uM4{
z;IlQgkxHeTLP{FM0zT6VK2q>=yE3wz9E2Zaf729m9z0r5i?Z@EnBFtN(8I?YtV<Hc
z4`y$HUPo<l@bD7+(CHXRGTwnbQi8DcM;<O}^an)~-CTisGZ!&Eh_TT41$aOogmj4D
z@eOmKv`Yb$JFAHE6MC`Yyi*{&OO=V!&E{gyb+e_};pmd57(L`Rm#~$|bV*JmU11Z{
zAj90CAx?fsh2Mn~fAyfHi3x)5HUsGH@<bUAXVrh}(}w2Zy<j9?j4wEs$&ytOQ18(n
znC$n4B$TM&y8o1+UXczCu@MEjuKL{nwwd72<j0)+rFO7<bQx&+Ergl0IJo)c7&O`(
zRc|Qo%Km(655f$-gSek-@XBe~>^A$ADCl4+HRui%WV(4&dC-;aeHbp_Lx*Te#(dZ~
zV+L|NUyJ5nZlTx0`{4z4JxZ~w!7(*$@bVf69ym>aF9AmwlW1+&*rbi7*C&wmX79+>
z&>v8UH_1#@5m5c$Jol@u4&N#2!RtCknAo?1+|#gLkc~IPb>f9kcDV+qt!rj(1&Ran
zG9fsFJHvI=t}}f1<sCBc3?z42#DQ;x8r^Q}NgwLGqG3;rV9MWf)VQw_8j8<A9Y+#q
zvA!C-Cg6kO_Xu*Ni<5A9Lj_!{$wHq~D`CKYV`zQ|AIYSYLz|=ye6Gh88t%OTPl#(^
zcKjK4&ua{aU3bCB-5y-ySTcwWk%J<a)j&dG8kVQkP{q9tbRo*gmCS&e-FEfd;tYO&
zSOQX4JpsC;a)_9mB2lZ1BRhwGP%FbhbbS6P7?*DYR^&cL%B619xlR&^ml>d#igA+O
ztHFG0+yI{UJcP*#KjH2Dl1Sd|2>g0nut9;^fNbhYXm+s)zRnJ0sP9^CkDUTHduJNX
zF0kW1WPIZPSy0M0BOSbTRD{S5t>ar6D}X19t(jc|z2M2b`%KluX@>WZ!ScSr<mZnu
z*crW8z*cm_SkEua(rre_L^K}_ia$pNr$SNF)HqHX^QKblUKDGtLJoXCiuK-=a|YMu
zz=+TwSah=$4ldM$y!#tqn1>8Z^{oZ%M(dEQZ!w6I+0BNM3=sQ%9=Ehm6-!?Dh7Xn;
z#FL$O$to_6%<d;-Wls}pCR~N*%&o;5w+2Cuxi7!^q&aSPlBd&e$3UC;aWL}uJ#1NM
z1oKqRq0=h%NVMu6vUI+UI-}!ZYECiQVKjtdz6soo%SPPJN<FYbRvpSL&&T^`{e}nT
zc`^kH&JrYJ4COb*!sCk_Vbdl6zHa(af2)-Ts_&i#VjtC*jqh7=ddF$}_2DTp@2MRQ
zIPFRlMK9xScVSHJqliI6H9x*So3A`-$_&XQkmwO@qHd7~ODf*t2N9k~q&f?^Pj$iP
zV)m#Y7oZL!XLMd=I-0ru28v1A#+)2ff-|n^Fj1leEV=4R=7tSm%ee1guuf9o+shCM
zwF++WoB-go;3&Kh@5PFkx&n*%In41KQ6SWF7za(W!mHsOyjj@}sI_}x!{0+hb{oNW
zZ;q3Lp?|Q-4>yDB+xj_eh5b0rqaD^L79&a37bIcLS7u$-8ssZ~iwk>n0!`O6MlYA`
zMndK#NY)?`#b_xrpR!s(e|{HKxb4h*-!zAl6p;ZBhYrHkA54kzo?j%WEr~d8Yb7?X
zcVI2SQsk@|uFQeqH=NbievVh^kC)C(23wBgF@r8I!6NlI(E0ozK4cw<>xvBUsKhW%
z>Z-aM<GzX=l|PE-w=~1vEf%oNOPZ9lOyRp8&rk{X0vx+O7uuY^fT~7ZL9@elwEcHF
zig7B!dd4=GcU}?HA2R1pM9YHPb#CySx+x42+R0pcvzs`n`Vg_<d~!%?5^Fzv#)>l;
zoZZGvT#G|E7E}JnWi%fJi@bI)O;^LX`|el358G-W@!~n#_30m`-v2SqOZL9&tuO*E
zSups8*)Y;5y8~PL`k1sQbtwL{GwN#0#!mG-5b-J=%^!3jS*Bv}<CH6kzO^0mzMkhl
z$hePX)^uRi`ad99MI2go?SazGx4{@|hHu)RC8|j-SV}b?x51<A+9NzRHGL(sbfgA5
z^9DGFmv&tFk+-;JxgXA!G6cq}*Mq~2nZWXZC%~S`?B}Cv^yL;-k`out2|T|*DB)!T
zN5x)4U^$4=JX_GXvj!4(b%Y)B+5`u{8W<<m4XMcq6n!z3D|gfdfy1%bK{|`e)!N3W
zw*6rWuFJEtRJFmaD@E8+$O6ZQl!9xK1A@JacPvABjJn50pk%s>`^5)r4lKh)ms5zO
zmLIOmvo=)siw1KQ-GSD=wIH9B;MebJGhhY)-ZxMT<(qq9fru?E3A;mdg*U=`-F2w)
z@d(U5S<UuL8Y1^VXLz#zF-(O#6!kWft?5e!L&7rHcJmdktz`zztg0YK(=u^|p$o7)
zWR9IS4e;lU1%StwT=AMDt@>wJn?LyPA?E(if7#tG_xb9x-eF~}ay+WR!v}6QbHUm{
Z{F*zVU};(t^X<77lUBIapiF(={{h++v*!Q+

diff --git a/src/feat/test_data/test.wav.fea_htk.5 b/src/feat/test_data/test.wav.fea_htk.5
deleted file mode 100644
index d5164ad76a4bba6ab42d1153a5066d30f4a2d5ce..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 22164
zcmWJsc{J8d7`0~)g=|qIOG=3f?{8)lDUrTPQA#OoLMf#rgzS5=g(O7wsPO(~=B*G3
z6^hbIDW#CIRG+`+oH=uzbLKwx-se7ie0(u{{Gr)=8G=^yS<g-?8mU6Fg@f@k%Vf0i
zo&!oRO+m5?A7g*3d#GYTGW`CRk2HVz%1C($qOy2tq;qkU$aVEV)kk7*g^B{X(-1)l
z%K#{J%!g4=n^+fpTNo^m0vC$Pv74Xe;N0zn%#~7zr49;lT7H=_{-H9aA?__$Qr!uE
zd~Hr-Eo9AGM0c~fe>M<7(*(x1`zXV&{s8{C=uIy;?4>rs8{kdr2K@1`8*JWXh&*O;
zP<f*U3JJ|YiT*o~dUg;w;}?Nq7Wp9f)s)Oha)rLVMeufE6}hdDO@7S|COcL)bN>3y
z14`9#(AzNtud8^B1G?OCPJty3<T~Puj3S%(;0z9r%>^Rwj<ceM{ba`XcCu~Ai(Q+t
z7e{w)CR=B11nW{Hn07RPW>!9>0l|{gB&G++31re4T}!c!f(Wu*q>dDh-$uA{H>$d3
zkMW(iATHxNiaBr{E&p9fvYj78>0^#i?_xW->tjn^%2tsgEnkxQ!2|QXGlBZ%)r4JY
z1a7_DiFGCfiQscnd|S;HD;e1krIQQ6K@l<LzIir|ZCg%MGe2PTd@rs>ABg^F8L)8J
z1El{!<Zq%)LoP^?l)iXWyx5pzY#gGpuI+U0&>T|1grIw_E79fak<{kT8glb|I*MsG
zfH}v{kOsaq5Nr0CC<pB&)|RVD{#qTvx*3wmyh<Dz^B5c6ji#zPt<dz#45n{MB4Km`
z@C88=yt>PWSnRt69Mg}16W(qh8pbo4haGTOS}2ZvJD0t*U=cGhkOG9ef1$_54|&2*
zIMmiE4ep(7P7kb|P5Lidkb;9p=(4q3<iD^FU0>QwAGC|Zzv`LjQdbW2u*)ake>wrj
za%FP$sw<g)_9D5odXk{auH@vC^~A=f5Cabf()2_Mw252=+75)63I7A4_YQJGWDelO
z-(q0#<CDOBoi=!AwjYWF`s0N%V{F;__mHpDm;F^}$pmfvgzm2v=BcdqA&=JnB&YvP
zzb$fz2FrAkng9*b?G}wrC3>L%<vH|4kq8`+l|>mlmjHk68`8k@1Zy^ikcwV6V&&>b
z0xJ>ln6;aEW7|m_PiAso<nj?gL3vQL!V4JM_Tvvq`@sEwyMT~-4={Q8uGYQY4O}^E
z2`(QEf)Y}8?1O9zV4-FN^%NFh^`*PGOwKh_vqFidm*z&Aou83y;-mC}dMnM*HX@Om
zET~+QCbBLXK-bPcqOC=P(Dd;tbTz92Uxj@{UN{6U4j94rVl;`;jw8fm$2)xVgaxSA
zo<kOl<pJH3%1mIFGP`5lc4kV@4d;AM2E#%Qz^M~~i@QX)4Fz$ar{HznjmO`?snN}>
zNTePJV!S|c-wIY(k!7W2>`>V!1)hP)FA~xmLrh92P2VzuR>wAy+gEOo$ZL;~Z23Wy
z#O$IKBXgnmz4s{BcM~pp@Eyl~CeX0<7*;O3NNoSCGr#z$1xJ*IG8way$)?BzsN`76
z93_0fY=JD3ylVn;Ub};Plb^t18wE!HRx;CNKMwBlX0hK)Lpi&WT{*q83xV{N8ZhG)
zV5{LuR;A!Ky6!*D(?4v&ro{P>#K~fctFF_89S^AbJROpR%;CLtMyM)HkhY2%BC)$K
z(RJUgxbA`>o}YOX2FvLX85c>ydGiZ=uU8>6n)N~0gA2s>lmWCXR>4c0j)H5245Mac
zi)9zufxX+?+5JW-IRAwTxGNrEeq??Zknvm2d@ue7G;MsaW@$Ln6nBFC^7RiliR?ou
zKTLT`x0}PL96%a}U(n7~PBe0Z9-Xl?fDEr2fwOZiqRu{X+E%#_@dx}ud5a$s?Jjd5
zJA=So8qy@n^*c6Mc^EkOJ;xhzYJs=Hc_JmC0cx{3Al!BYr_7J!G$!+5UA5=H=(IMD
z89s&Ut-ixqt<t#BY%55rX#{t19cVLe#%}|4I5SOzSvlu8^9vmpP~^HWo-w0^)Zfpb
zc2B?2myU!w&NiZsRftF@WfIqfa`fj;K7G0LGweGui1G%ii1X#+AS=BLWIKzKdi_bb
ze})6uAkh!r*xJH<y~do=Tq`2v>kK}=-h~@Zt1$<6+p>@UV}MY23E`vpY~v*^_$2I)
zM>o`g`??*Veerwn{d770<*R^|{qBNe9tSulubxHvrxSRpzxN|Pj}UrYjiEQrG|~K@
zs$^=&5^oB>3{6{NP+_||4LsS0tK@f}{F^h$p(~Ey&ISWqzA=|PGUUOlk3N%vykNL2
z$QnL;A_`aT-cC+#{f)Qns06ofm@_w<y_r1itE<!p(r~YVB?!I5FmsYeFef++SiPPP
z9=b;}Bl}O`9Sei;WmXXk{JF;wwN8V>M|60qHy*>_mN)dh<y9*6rj*CuunX6C7$dcB
zqwwCzPIRn{Lra#<hZV>K6+b>gIPKp7Ppgx-KmlU=y$#x?w2(yat<baKD{LQifgIs>
za@He`EUAozmpT=h!VYDysc)3k`D}tU&*_8lT@v_(lQMA9t>GHns9_#yPT<tWASO`4
zkyT1b0G@UtT<74o@UD^wPopz{X}B3hM<Sn5gB?$KVjKS9+1iFEcH<1h*bAZa|Fn5R
z#nLDsq6(FnZ^l>dl{2sQXcJ?lZai1XnQ;zrA!0A&pzxJc`1!yqobzcd{$cA#wBF^w
zw6q{*JD)9Z-!z+*ZTSP1oH_ztir*zd$#$Sd_d1&~wG-P-I1u^UmS9a<8Ngp>;3KXh
z+;@%E@M_35-t6(M<mjfGlngAUA+U@NU3>~^?IF5RZVJ0}63~&~c|1W;AyjfF0hLay
z!gAY0z)_I^oIcPE^pYV__tj^AM-wnS^Ax-~<UvjqB;XI5JHgnOJD}=tq<M}OACR{Y
z;YOJ+0@V?zK=;1-GzR&ApvDR|J0lZ6?P<h)2VVfxm;=RMJi`B!HMl)-Ik0AlCvT2k
zCy})GrH}u4(7>XDJRzC6AUv`R{g{(aymVh9R9r#pk`&RQ+gnhnh(3FC&-A@hroc@-
zkBRhZA*zq6x!y1Mpv3)r2)`?ko8La-u%o>&V%ceY??oN=cu@<-O!+Y*qA~--&z;HH
zaUhOcVA=;nX9csW3ojC-C2^cvr4mfXoKBD~HqBY|pL35@3d7_>aXjg>^RV{9BeeU2
z6TPIinJ2BX9rj%qLZ8l!lH#qZNI)x|UOapjIV|I#(&dY=l>7p)hff2%O<cj2mrCJq
zZXuTFl7gcDU0~j(Ob|aCUo2IU2$M#i5|QAg-0-IX+@iZD>c&KrnU7t5T$ycIOkcGD
z(;9pi8x%(3$qGA=_~91wbw?}v?sN)}|L9TE+glA&V@r86GnYb#yDMqW4G&s|Bk14U
z#c(L*EBfSrhsyLlMZ0rDY5La|w0=V)DuFj~VAnW^(W?hHbEO%L+tJwVw<ZvM-~!UN
zRTFt@Q*3j^k?pLohcnLPfd`FNT<v*ood2wcxoM}W*keUU8LzepW@d;VQ@St(hknS!
ziK%zNt+ywbN3tF`zbO$k$|suttv?2h?D=>zimpR>rC{24vWKS74?N+SHAp&i5yF`Z
z=z^VlkYRNRUAcEXvg9{F#YJ6M>AV&&JHQA2Th;(1p#~0mWdpKMEe;HtLqsfCwi!Aw
z+lEcSCjT&KR5H`_ALL`olgrHV6QbFRvFn(?wN`A@X?2cFUnD+0aD*#&t^lks2nG^O
zzi`vt7vPElmorDr8oUtW;|cHRgR6f{()OVsdTpqWPPPcbhNvB=!0a$7xPAhOK5w9B
zMU7GQSTHKl7vQWj=>{uJjTj5bdJr(DAHR244A5ykOuNl6w=jYG-p>h0oUvqcE{wpF
zOMY<tyjy?>Y34+)S<LD9^NrE&Nn%BGpJNe+G(5YA3+Cy%vKOTinA>h=u(hgKUEid#
zd8iB@v`0BSq2FfEE!mtt?6RTv58Ki&uWI2EOJj7a>I0o!t%I0D&h$vbTvT!HIV$~c
z7--XXAnMUQ5VXmGnJRZ9e3g6HpY2C+c6c<qFenc%e`^e$53M1B{0HDFY03$2b7Mw+
z>}OKOUNE{#b(yYmdrr860EPyotkWDdhA)X>BmFa3?;1PqhabAE*BGZRxW)*kEjhvy
zNOXdSGrMWU_iyy=*cv(%u?B*EK~zy3N$0CHqaecynqL1Ab?&c0r5_+THxdkT?s<cE
zA3eeOTeEPO>^Xcz`2l{|{*hJ52*4Aqd%!#s3HCM?0VSSe=8}(EfEaH-^LAx5yXuw-
z@Z`_qd}wDdzs7g==YTj?v&-TRsi<Pb8HGT6&o?&iU@sRrhrrm+DYRexIegz(P4BN*
zN1GP6^JHsMV3>hCs(dj)P5!PzXD)uGncuTe{qg0fG+;f*?_3DRliq<H;`^ZPw|E?q
zbPw|^rHH`iLL!=@kGDQh!9fpHnAoG*aD}rXciRdhFw;>9Xmo7EoS!ZrxAzUBC%2CY
z_<RH>J`cyX(la<eU+u^19CLs+@?v+?yK$4-x*?U1r33eb(fm?|-fVWE&56o9(Ws|H
z>F-69bIF44{%(iV^!8DYvcKs4`OT=r-2p0_MnUzgUwD4|Bbd;xgI{o)a8~hWvd88L
z*|#?i&+~}DA(NdTV_!2=X)xn{_U$(>JWvU$=0C=qavmr!&;SDS-+}!xXRxU{mm}9N
z575dD?3r!O%wf4p*m>V5E9$it%`(43A6{#KNqX|s^3V_+ciPR9=xZgZiwBY2-5qpg
z>UpHG?k4qpblPI3<rP#s`U)-y6o3ae^<(zPcbI!(C*B!h3f7IDA@un+qIPm4wzM0@
zl?z;%_HHSVyUc<;sV0SOhQER~#d0R-oi+%Dqks<EK!L_Gu)0DKWS=+>xQ|l7{NXgP
zZs04?xblcSR+ozoa+c82Y#w;!ahjA>#Pent_w#1uRML5NmT0!~V!D}BBIkQ*^wf)G
z7Q)@V=%)N7hSypRzncbPi#v~Cla~u9ztRsyl0FbygG|OE!xVf_f5JX4=7E)}=ZN?R
zGc4oD!M#mY;Ka)t0R5TEd|cTHXL(gX^Qj7GYtjIvoFw23#j{|y@CqnmY(r*&UF^j@
zH_+~LJE-eqBJ>SCMtYvL(dL!1^mn@&Ess5q&aR!JXV?n(ENnN8LgA<{?-t6R6NaDZ
zB*9_ZL?{{64tr1c!mP{{@Xf>Xpmkk2lrjATiw0MLFUOX_Sfl4;aH#?goRN*c6hvS%
z7|J}!Oys}=E#SS^b-3i}OPJzX4*Az6LqU<hz}~tVu3yp3Djl|D)oZT9@~RGKobnlV
z?K%Png1c$a`~mvN_c>L$^&VwO+#+b9B~pAlMHf73LO16BK<Vq$NzlYYc%$JG$Zt`D
ztV|(%nGl7vUH2pL`e*Rky$7%db%N)XpONyz8{~5O1P(2h*7iRzAFw%;cS7hL8r
zH&$K(SaS+GYL9^1cmD&P)yJ8G_RHbi2@Ls<9OERmXTX~UQ%rE1DcW@383m{;rB=g=
zq^wgJTIXk?%W~h5rf()HGW3U`&b=_lSr)~JR+Ed&IuL$vEBkF=09NQ&!h-96(SoL6
z#1R~Z*Ry-!tEOVmdif4AD=Pz^uEk-i)90{|WGb0;BpEm4JCj*8%kiQ0zwt7=IMzjb
zK4a>481J*<u;G%6*v~_8&?-)siS{1A9pVi{-`^D;Ier1z;3-mLe;&@#b3(blsx5@&
zKO_FNb;#q*AylR=g>KfIMKPhtWU<>zTvLAncZtL^uWI<<s)%qn)n0(aFMo!06Hj5)
zXC83q??!W$bil3Mf7xrLrfgW&6P&f}E>8LuN>qoma7kqq5es-m{1%53f#|nH?9zTl
zQ|kjc?^6lFw~aDM!V$zZpr73P_6b_PIE3Q=VA73W!{_~nP;tDCg+xpcau`{O;Dtx1
z;p8a`g|#9m#%G*dIaz^a6zs_bF9-bjw;xnF@(-r|HwI?(h{C1oYXL6x21gZ+LzxGe
zFlac@?Bk|1^V>0L+`sE9@yf*)i5~weoKap&mWfS~^qW&;_caT$$4mhWG;ASd&I@tV
z=x4@uejIT#ScY><QsCCcV@No7h^ox1f#d&OM?6;piv`(Ah~2A&5~g;d&YUIaW}7>T
zv9={S@gm%oPxVB9P@F8hHVN4LT&QYuf!uUCN={qsz?WsFpz)pzQl6}Z1upZMiN7}j
zrb?f<+w!N_k4L;PH{=YCx==xMyg!qbnYW4OlL@lZ`WbF2cOp43m`E?!!oemIH2&LO
zVqbZXd}59xnbYauNwgk1Gx`VmHAkVoZ9YgXoDcC%4x&vtPthZ@U=-t}Ma)JHvIiBb
ziR*h|QZIaomEU*|Z}RLVrEwq0;J{k)@mM(iC2*b!)nF1bAj=7rsD%0kvvFpO2iUj9
z6knS;h-*tHh^DeGo_+N_e*f(+j?Hwyljl5%UO_z(mTczy;2F~^_3b1vYmy|Kw}clz
zeIyI_NF$9;3Q+TT7rNLtANpH=LANW77}Ls~sL^jBipjdgE!fD1e@s0hp---o)C1>m
zUY!R~Q{O}k4X%-2Ew-djHIv+GolA8lUCBWq0nU$5KImMwhbV~IfRA$n@uel3amoxG
zBJ5|*-aZ%vLiW~ylH)IMNcaaV-yzC&U4FwH^tGh(TQ`x48^`G)$!My#ekEO6(hnL$
z24OC2LGx@sQrnrAIaL+S#BQZD^0-ln;sydBdqIWN#!G^j953>|!HT#X|3=DY)e?6#
zV-o6Vk7IA2BS{l8sF>J6a=GIL5SLegM)wbq+3EdI>3lL9x^NcT?O=|J_W0wVUy4vK
z<S}&Hz7G@+4g$WBSs>RihKcrgOZl#=(mCJX)AQF`$laBMs%={Z7Z{Eqb%iJdO7pP3
zXDF0@bcM!*&PI|!0Vw_+2aX>KrZTg3!T?Pv^3UCtEM1~ZZe1Cp{Hx}&5&ECu0=Y39
zbk>s!``8otQy+la=|VtWPvNt7CP9UxHq+GpyiP?C;*5xUYz~QsIv3NRO34Q(ac>29
zK5`L^y6G}odrM%|S6}-5-US$N`~khW{SsZ{A3^y&4%7I6J~aROC~U9lLMB@d(CdxP
zNL^)`?Hwzc*7qZHTDQZ9|1wG4kSl(0|1IIVRFc1QW<rUqX&gz}4>jkuke7R}gC^cS
za8bz#FIam4+}$mR6UD^9zu`H|tI{jjHqaZYFM0@`N&}YnB^+E^Q3NF`|AL<>J#{^@
z1Q{#N<H?5%Aike7=#R?7^xT*K=+?6awB1G%`Ci-u`v#oQ>V1SKIHwz(8@PsU^!mac
zf$1(gZyY458xyDZ(;C6xGZk*OCsHrNr)#VQ@YYYD!@iA}nfF7(6T{%Z>Gk+SX)*-P
z9r$O;I;dr3%$NZycH$`mqfYuTh1Zmb!UiQqH}D5sy}bl1kK|{!myDzHZ;tVVti^HC
z4j20RN-g#K*H5#PlWFk$B9waK7qYpW1Sf`<(I4&sC~aFCDxLKhZg?ihUT+E@8$PQ8
zCHpbl&>Thuf)~OXbr-m-&VbCF7-6V}Hyi6u;H;e@%;PFPq9hOxO@mLcmvqd)o$aES
zFZ4aQAkzrpmSDVYOAN7o!^hfv-vf;!F0+HbG}wfbDX75sD^G^8C!<*p=@s80dZpnv
zPiTaLr@HIWC6Oc0gFgXTdfU@iHF79+!D>{R@e6D}Eea;z+v0&&z2MP{UOc#YD|x+W
z5q!;c29dcrBun=h*!NZn9ysL=w!Cd&7U>v~c`|RHWZ)?{iwLuU-S6<l<xX(zSu56{
zG86X<$>U+STTJnVXz=>TT8tcfS&to>D73JG$AA75369UCY`{$#Z+VBOQk{TPL&MRn
zkE=-VI*2x(SxO(6%A;#BBdGLF9R6^@6lx?3z{ss7INd>lgoh?$EZ79+Ha)|amAK@I
z;s_Yq76b#D2;;G_yS8_B3w|;j051KkhsWpL!(|WT@q^|_c&5gP=s%7odjA+aukjby
zBqRt-Kb7GPD?+%R&a0#20($hdnlsZe4^tlJCB^RTJQb~bAijPIrIiTK+44<j_vbJg
znQevAGr~}rWg*#ke-NHauSU{W8FF0m9cg$R1y7GqT+wleL=J}#?yEw$Ah`h^ipnw%
zka6Sg`p|;6?aTruUbCT&pe;V*W`?&m{Dn@@xx}o<hv+4A0D0rTK(wfteZKJ!=D)U{
z>y_{XX{@%S_ur19;QWR3PkjMTvgs|YR{*5SwH?_#LDccZaulMhNRR*eimn{lfJ)9t
z6Oi%%#xIgY3j&vr13O~KyJsgL9@8gI+ib{0#bR<D>B0l6+Mq+hVSFv^D0f?w8%L|#
zjk#;Ki<{Nt#B@eYf&(7Q!Rr<U+%qp2x9dxSR#qBIw$@=3GKMvXJ$F(4PbA3PqgTb6
z(X)^}Jn`7`Jl%3xnm3+Ax}Y;?*)L6R<a1G7_<rhY;fE5s^ihfQ6fr!!6vo9kA)!~R
zh;U6ZNfPga%}%?>W$k-ZI!uHZ=8eL|GbX^7(?$5$XEW}TMMlhv>`&a3ozZyQ^ACtp
zPl3vzr<m9Kp0X1MNbRed7RWcc5O)N%;Mltj*uvk2>zjTT7J99wj+&b+#06z}a?c8R
zOAp)A*m(!&{-p=W*y0?DJGP;xzx3!Fjbd~t2cwd^qWH5)2Mn~`2*a*?;K)q5VFgty
zNMiL!v`syESA3SM$mPTPEM!39qP;}LPMWnc?Pq$dBUrikBSd|04{!{xhUVAm*`{Ub
zV9_={kXJ~-v(|JXSh5G-_bkD4KE2>xGVXxx62GazLM;oykCHsT?+HBJuK{%7f^izZ
zT#I^a+eBmjzD89xx9QRViXv3|&`k+@+$|9YPZs@wQk%}OqTRMoYxqA%r{0sT&F&;s
zTogPUI!|0AWbn?<-Ne>;6PwsD!DI<t;pFW}!yOk6Fk&+WphD6uoD+8mlv%$7Puw|7
zx2+@2J!69N^K#kIVZ>cxdjnbnaVqosrG@y1**t+ivAj8MuQ>yUSJ5)dJnH(&m!3L(
z53N^QL6^LeLj0b#D4%;7mz`Y;L*@m7{qGFevL(teFlYpZoScKrOI*ly><`X#IFo%^
zv4k^XL9G4z*z27x+}#~qX4AJ?_N-ng+w({pl)WD#;_LPSx?m8LCRAbH3yw_FOE<iF
z#z7`pRMDKT>K?GJ4IvxWZnlu0yv~zfXwMVKmtkJ7ZKN3~Jesnmlt!hxpp9RHX^hEp
zSfTw4<?+ko>fSz>e(f!Is>+A21dhQp%Pes8(mv+;C2cZ)w>gdv3?_<MVMI48iZJ6*
zYyj3^ZCXEaY@T^Dx-ZJvU6otFyc>Z;WWWM!@Z19KUmC)7h4Vm`_aCNwD-SH%49q|M
zD&-F5&Vf#swJhYmdGThZ&!?YOxRAMbZqbN6x%8ILGn(2-(02J`S|FtbQy1r;+=p&t
z=GI0SUs(jVWSike!7-TUZAhd=4uL!wZ+tV_gt)rB10qtr_#MA3PO7@kE)^ePYmP~>
zmro0r%ZzGs|19-kOv|3)!1yRu^6Yx>@xn*!a_$sJtmcB`X^pW!&(oZIkHw!ecf(Is
zr!6#k8Qv^Lk>1{@L*i@`=w%asTI)KW79Z9{+Os-n+O3E1{O!Hyn$H;`t5*VjMcl!_
zwar*HGa6nOpTxN<-PxLUC-8kk9@%rv2P~0DX2K=&@D?>$*40XijrcZ(C8A`ou@>Mo
z42H39o`f>DRQTD4!|k9mP>U1#{u*H9o`cMPc1+@uZMA%xvdQypN6?JoMvJu{x_O$*
zn&?F?Lq)ZUsK2H)O}5FU<0sam;8V+J%EmFc1`VQ|Ij6|d{$jZP&Q0J{{1xZiPlTy9
z{mhOZmvDrl2i)0kj!5}v0)t1>UUPFZ_LKO<?p+qZtbUx0CHG&(H%@jlb!ESBx-S4%
zzg}Q73jYCDu})yG|B|ykd?na%sfk^al~=pD_zd|N@C@y!FtX6#|H)Il{F>UG86uq9
zO0?{S8eJKjM+e#-AdlsBG}7fblr#B<GEcV=jVUK+Y$%8qCe~orjc1_o;!#ki{e%7T
z!T|2vnSi-A8X$-77#MeWgWET@vB$r6gN^%juwBhOd_XiG1R91AzT)TLf;W%3pCZmi
z|C9ixQTlaV&5t?XRCVgw+RMxX2j!`;;3gC>KfyxlxIRz&<xe`#qMS&0n$d#W`E<Fw
z3VkMJjSg>_q=!r4v>zydGLKu}q|*e-KaC-RMaoR4d_Qx^K@%>n3}ZR-qv5Q>@mQcv
zocZf#1iFs=hkJg1Vpm~hP<fXh`zRXVSluY7n=VO&)w-AuRo8)6Ai}40MS*aD948nw
zvfmdRWXA4C)!y86k6LJYqG;WR77GF-c$)ev=xPwm=Is=xNo^;ndSWh}wA4b;_6MnX
zWCnpgo+$m%B5-=*1QQk%jjyEF!2MX8=x(or+Ty=Jbej$cTd@Y4?S?=r?Kv>NvmYxo
zePXpY7=dc*P1wsq7vI(5gL^02@RE~HSOZuH5`IPEweU4?DL%;#%{jrkozeyu{_1n*
zzwD&*9u%SYCl4%C`L^-oC0EmPygGQocL`1M{g3iBB~AN=EQ;p5rP~zGk(FC>P=>*O
zP`$~Ct-h8<VqC|NO6N<m;%+?nafk<v_c832Dl2mu!3PRvrLeKd0l@921RH;+g&7Z#
z!E#>8_{Ag#GH(`xnr)d_RuhBf9r?@;CJ6F7KfuQyzhhJh7r48Ja*OA-5@z*#wA;bW
zLLx1KmLL30?~Z<kZ!CE<Cf|%qNVD`?L^KL+?4|n?wLyI6E0n5g2bWgO!h)uRbk13g
zEQYIr-wGEH@@g?Ww&en_ORa&r$}2E;jvMG7eG8?03fTJ{PjU9Mhpg&>v$*-R9dHaj
z3?U9>eRr{->U=tQ`p*>Vjvd2V_nSc}*9S0zn%vNFFXF^_qJS&%sO-@PGQ0K%y}Ta7
z{YUf3K(#mBI(~q@IGsQ3x$ULyY5(B*nQu_i0m|T)r&*zRWil}~gbXhphj(;W1N|)v
zU`vS$Of{Jc|0o<LHZQWlPsL-?Gaw;$%EO3gzIni|cqqYgZ!W;EuPuS1wWiGB<qT8y
zMhyJeod*pzz5$}I)4|WKkId!1cjf`R((t1<<0x>oHwv1}hk8s7xg%Bxv&+-TP2mx;
zsf<VS-4`Lj+L=W7o+-?JCV}GYAh|d;fmIzk$&mLm*gp>={>#<iuW~GW)Nus%%pF5w
z1#j8B!3fx<#X#e(Cv5!%BVu7)i={gYnQwc&$a4KS_O+EP^ORA>yM{-AoWls<-K~dm
ziq)Vn%5&P!@8UG^zogO+uA>XjCy-4030NB107G0mkwElS_+^C(avaitYI+267tg2a
z69Q0QH3>!aaLEhTzc>wqk-Euo=rUs=4AT^anjJ6U#s|u<-dzOgZax8>*8PF6HO8Qj
z-wt+DxHA6IkcWe*4e;z;Lcs6!M2>?QfiH$c{I3WmwQsPM<Vo;DXf;qWe+D*nC@_kG
zfEJZjAbqe41~>+xxnc~gej<quJMD%&iu;gV`+eMMoQ7`8wIM&BG30wd5=FbnP$~K4
z#BivH6edrCq2FcfdoG6JC;ozfw;LeG*#t=^N5E(9Ya#ca2`p0x=E_>6F&XziVP~Cq
z5WKXAs9h#RDasQkM6bj{NCbb%Q6XaflbEyQDXyzpi*47aa>n=Xrag%hXoJrxoK^h~
zt(q><wx_OW)q+vDe4`bbbFvr(q{pMlhAFgcbtn?d?nlw3{-ixemR!ubK`x&0$N7eO
zxZ>+`kRk7cU;U25HXruF)AT!B((w!C&D{^fBIj`;%lo0&t`(Rcr+{_e1@YwI4t&Dx
zA?|_0*eS*c7hjhoegpEvNK*m-+4&TYF4E=Ply>6DRsk3?yp!ab{X{c+jYw$BGPHN@
zTQGJs3pol)q0o34)Tf<?!d94~Z_2k(jPEL9e5;K(oz5le27eKQ_t$VZe;!%mv6%gI
z#|m7X?*r0asS<Z}2Xo%n79uOYhEp8H2W`MHAlQ2tEG+qk<JWy*b^l7?w^3HC$E6oI
z>6$V*EU!fjB<Bz{jjiOr?!%lI>)AZfMt*#^t%dyXX~)W&=Rna8TXb;c9S}c$3CYh@
zL+kHUqR@rg(Mbn0bkX@eiWSQN$ERF~Q^W(ZdXWbSJo1{|?EH#s?;d0?S$Q(y60W3h
zUq2~6K{&$UMr6N(8>2qc8!q@91PYl-W^}C`%U8q0CP~L}eDW5?Wn?aXnixv7Uf;rj
zRfMP)OW<!>DxA8~Cp`Xn+X0j)p<*wVz&R@diPCZxBL40NEM9gLrKLu~`<Cv=JWC1f
zeD;ldHzNVXDb&Et{f$_u^%l`Rl|)oBvhk3o6^WQUj`ye+k?m_+NJ?%OF%nu#ykftQ
zw_ZD$@O|2#_6`>mKh0w9KP@uP-|K|UO}8>0C(~Fsdp}tqwjD>E9tWHBd@v>Hz_py^
zykYujz2R}9H7HHpWm-v*OeMMT*%>b5b7${oYQukVZ%AH5E81{x8L}(-1@jZUQKI5@
zxHTvP1XODhkqc@>=;sJY5DzCOb(+}c>>%l4D+tjrC5C^d$jzwx)BUW0dEHRVoQjPD
z$G+YKde0Xzynkoe$g(14l{>@D=sAhc3HyS*50ycxs4d<dzY`3fU&G0=Urg^8ih<Xb
zoz&f$U>m3~?IZ2OJ<a=YciSJLmwytB>Rdx1+5>o|&|ijkHy2%AycBL;VFfO&pTgZ|
zZV<Qi<wVO#6L$sw#W$KM6_^!5{Dh*gM}Q1TyE*`R^p-P|@p)|J$CI2CS2u8e;0r5#
zx}Pb|{=gY-v&V9E%D{6jA2^uQ%@nlV1cL7s;S4t5&J)!ZTKM@Ew3HsEp83I4;ZzyD
zSUN<~D__6`?wZuWUYmM9lmvoT0!fVBT<APc7iEkGKymMCraS5#YgXlk8!TjThMO!o
zA!x()zxhmN&0mZ2-%hZ^q>y+ErvvI(&Ahr~#P$nkvk9GCP@RR?rBgo4$fOJ7{ck@$
zbD|#3^!g13(jpmK8$Pg0b1T#QDV_Up)c}orF9^NhI~ry?ea*HfR3XfrJU>@L4Cb7p
zdh<=F=+%X2%MOY=D{dnBhAfoXG6RbKN#LB8cLf!)(|_R6D?DhujCiFg5|6eo*aLVG
zt=o=7{*MFx<0l5VH-?!Avl=|(gdMAY%pUUPC2?y_`Z%2)1)xYu950`m3I5Ks0fnm!
zambnsrol-8M@F-(_JRLsl*BcB*K(X5Oi(3V=TB2p)suARJUb#Lx}6zW=}p>!BwANl
zM#}7EkWk}ZlzsUU=(zoX^Y`mlxa(sdS-qJ@=2uMON0ZLvLYfY6-s1#SRW{;_8|%s3
zvNc#-&4F{^KR$Nz<5uR0q6CyldCJOdRsaj%Z-C2tyjWAKV6f<D7DL-M;UJGrCM(nc
zUwyWK)l}}Feq#nO{96O%3tLZ9!bEBDQ&k!e5k=!fchG%u*XfDTUL3p2kj{PO2#fF!
zl&#+m+!~abpW{1W+x7{vx%?9@Uy7lx$_9c)yBWvgP~ae|M(&;qB6~Xxi9w|vD{A6Y
z$JI;&hKb=&Sk8eNex(Px#D79r^=xo#n<vA@c4M=_OrSK=2?#$wgmvvgx!L0h)Va_N
z?vQ^-giQa?(!KL(-5f_M7}-x3AX)NhvoKS(Ydut1{Eyn4e+Lb1no(xdLGZeL49Xg&
zAkiK<GV9YBFi<}VTdQ8+9==`7-^gih`@)!bUYU(+BL5=>S3Oy6p>AePr3$dQv<Ou2
z)iV8lnV{0}C$phF0c>0Jm!;PN$jr_*Ks|MUefDF9yU*SH*4<d@-pfD_4M#LPXbru!
z>NnNq>_>B-C()fND@e7!7#UBSO?5v@QNQX1`1as8losF*{w~}MRi+HkY|(UF?iCFq
zJ4NAM8Ao<^j}(YW7b7t(m%!ibKF*|@Hs<h!*SXZSaQ=<DF<(9|WE@i67{Tad)~n+W
z{&_wL2oE*kMSfBk4?lv8P%86TP6UJ<yUMAe59n+tgfu1cP{!p+8hm;g6}S}&t9SmQ
zw$ht%!GA}ofBI5f619Y?J*mRqk_J%54{0czGY;22bwjd}yYbr)7tpls9otn^iwivM
zFjsmvxqmMNYB#QBz5kgqxkH;cJ#KF7s96eo@8D;=R;rr4DOHaLX13ri+gJ{}{596%
z1hV1zesD&25r*D*K<7*&=h)^YRNzSy+SD3^dQchd^IAlHq@5&1wG}kxX(}}hbfRBI
z-Xi^r?o|KnFg!2rfU=K@g5v8#P@$It2THFKbDbEXGIfT`JboOjR)rDqr(a2MJ%+WT
zd3Z=A21F(JFw6gw!rLVMux+#szQ=#uta&3JkrwM`w)>`VV_)`g>+L+ise;vDJ~JQ3
zj_0xo65+MKZI+QP>nCXcp(zW|=3M#;R?{$!EzwWfOpmtirKaVZX@)z1smgcg1-~)y
zlzu~5uQl1+*G|l%q3`gP{|;h%XPBHCqTtlcdK~Jq5QlYykcCro;g#Yq<f2S3Te5#1
zN2#p^n|Ao)jPoz?Fq;evo;Bl)UELs@9Aj=-Z(#!ZGQdGyQ%;qM5nK6X9@d(>kE<Y-
zMrY?bAct?aEtI~6&_A9tX}sP9G__qp{o4Ad@#taN(VC6UG-=S1hi{?W+X<9Cy8s8S
z{Kq<{ia>`|MtGiJ069N73;J5_!$w0RO!pi&T%M5v&p&e@F1f<^bMCylaN@?a&&a^f
zoBN2Gl{$E38^o%nUgs2uF(6mJ42WDf#azjZ##dkL1^&gt>>hpBx`t#Ux<FYA!N70}
zoly-sF1ekSPvh33+mC2U%|r6y=5adWyaDCy9;3~hrpRLBvncaQBTEC<<8F;A(9ovI
zjHXu-&&Cg6Saq10=<mcl5eEV)mEn0!Q=Ggp5m%k)p6=~@U`jCq_wKPGo4fL$LOB3-
z_x*{8>s|0mvKQ?6oy&OKK0>6WZ@`7#vw?qOiD@c(hnn<6q5no2EHwPj(C-x=X+z{H
zq%_Bkrp<_?V22|8HMJg9r8iR6eVl;c`zU))2;705ldB)}i1~@X&}T$|*#8?Oa=cua
zJyix4ANL?Ci$9rH%JG4wbO|69VZjL~vxS0N{IGys95MS{1*NN9pk|UPe)w<|sNH-U
zZ2OuHf(JZ^cGGnT6D&c}IXf=5c98st5QMC@h=o?&651A&&Er403+}w}g6d7Jq?fbD
zdHe}q(T%GPv@qA2EQ@VI+2KQQpzAlOE%`)#A6o^RWQ*91XT8Z|{#)?&nJQ@edNuZX
zI6*>s3}KVpak%qopV<*tBc@$M1hDtF<NHR6P-yX1cy!eXa4zf|(^lpLg5>8ylRejn
zNOA_81)efKHI}T!*m?Lkw1rB|_C!3Jm(=6VEc#_bD0vqX19P9*;2)iiJjD}Rk+1bp
zdOkZAW}lltS93&QvE+BWc#Aga$P9-gtHa<Mc$_glu^*lf7lE=h&*9QL<0R8b5{_H@
zgZogH{c#Es+3Z=+tZ4}F`!oTj`5)Z<OBrtRYsB-kCm0|1GcZinohdq13~aR)u)~2T
z*e$QFp@>xr$-~_i$k;8Md<f;y0v#n9S+|-j{8J1szCBF;gciY_b}DdY)nlY{<Pgf7
zzY#2&>j+&*54o7yilhgvkn(<iSm`4JR|`V8bbb&L_gY5+U7X-Y)5q9&@-fzvF(pTg
ze?$J3AE4>fVLZ=C8nivAf%cX!*dIHdFnitIV8{szxTxtUHuJtwr=Xh2t#itS8*-zO
zzf%h;c67k^hql1zI9Zf$<4?APbx@0@mo)N@73_W~4!7~o#eYR|P|O`6l4Q9EX0!$n
ziBp5Hu_F%6e!m3qwYWgLz&a>0^$xjQ-3{6nEr+@57BT0`EwNUh5m6eq#QM`Xz0al*
z*WBsEkv}B4?GeA(m4X~lvdbSzthf(199@Xd*bReQe)~C7wYo&}ZX7a^JcSC6?LthX
z7!KaD4MqN7kYtRo`MYDnK!2+-EIKy@bC=5y6Y&8Q>wbXHkUqew@Q6s7A3X7<4!*dv
z4c1ldgkD)UVKm=Nw9oGZB!gR_|I=$Q@a!j+`)q^_znO)*;tM&a)=dyWftf_~$#l;;
zpvu1Ep9hK#$z#tC1yC_g2d|kg1nTFyGiyv=lNZOQwWv-661i|2zWR#b`%kmbS!reD
zYnh4`A!!t7s*I{<=prFL1+;O=ClqVlMC@xC2*|M{|LG0D74sHC&YjQfPlNYx$*LA4
zWG9OZrn8nf?mUdc*I|A1lbTA&7Etiy0FFIuQr9&TVw(4m%r>2ie+-qgFP{f7uhZ4Y
z-nSiW{I3nzP`82;JUEZhxaC3qc1FP}vET5UZyWrr5Qa?M3eaKMP1E_*Fy#KZAKhMI
zg<8av(Ul@eRI}v(iWOdmi=XS`!+IrTd)RiSb=OuX>N3P6U;-jqMBpRKd$3}G9;|#|
z58a<-!obW!bv+x6KwSmp?4Em+DOK5mgO={a?Yn)jy}y?^r=<nEy!9m^nQMqde<ZHx
z8NkCLe4L;nL#iG40Uoz&Ll&(g=xF32WapEJvQmtY$=)xh1pJ2{4Jx7zg=gqeKnXe@
zIf`OlCA0VUL^A)R{@{*Q1LpYxagJYL3#e?}0F@Q~!Of>NfIfjt+w3#obM0v$ck*4`
z=uab%(4|(_Sn-SLY-+&eE5G38KhF4OiW}C8%^^y;--+`d8{+V$oJbxO!jobTIa)Ov
z$dA42knrw9P{SG{_58iC%c&ZL?#@F&y9s(4a0Ic28fZ$|7A0=Fg#34`MKQ6x+`&*c
zb5mf1Q<LX&HPu=;8Kp*Gui{te-#-Vc*E?cbVM}Jr>B7%~YRH1;DqN+|G?4i+72mtG
zlqq<xi<_IY34eeH*@!cV;1+SRXy6!;NJ%1IyNZa)@7K6EvWr=j|99G}@kf0844l`y
z6aINwjpl+?s94egRp&XQqIz|dl6V`H)a#<Si#t(<6h$$cY{6F(4YFV54v}qYBPYv+
zh=0&M@;#%M2;2?BeP$(ua|4oBzju*i3axBx!W(AO-#pOW-cJ^Ne9h?u75JBtJdsey
zCsEecWaH0ZGUMn;eAD$QzU%#ysDHbL4NSe5{6PsiJM=DedpQR-b_O9sJrlJ3ssW16
z*@E(SpF=${LMW++MUiv>nVqXfVsd>by7>z9$ZsTV0VnZo*$bpNbq`^O_tO<sJ4o;0
zYLejRNcOmXrivYFNL7C=nYFzatk&!XLwv5p@VhT#P%wc@yFL-Vz#g(yRh38$#^B_s
z5F80&!CfI|JegHuz8xzwcitd2qJU7d34){AT1k3d6xG&lq6salaNL3(WU0?02PHmy
zA^8v7(7OS}zez=xy>Gw<t~FH|7Qn%3mq`2OUu0ZLitgX&Ooe||5l@>Oa`oLzvMz8H
z72o4TIQq*#+6NtQ^3G$Nba)o$*QNjveEuAsy;+ZwFR2V_^Srs*%L#kEbr>GIyc|0o
zc4B+v%^8isR`|SagzkS|M~p!p)&B+Q#&cOT_xBiSwNyd;6DyI7UIK|zg+!~-5oTB1
zLIr;7;hWANs$Xyp`^GDf@=jOEHzr0csR8lR6Co;2(xmRE9kesGr!w`M@a2C4;Oh7Y
zURP5Gb}CA+^Zd3z%bmWQS6T$F;@yKFu94BGc@J`}Zo!pF1EA7K2h8{8u_u&QqJ65W
z^v};KFedLycm4O2`u!(BM+ZGfVWtwY`kRicO;u6Y-FGw&%E30FcvSAP8fL54kWYn6
z@rYM0sJ!G(YEFlcj$mbYT&9n#Ib=;7XY|5~E6t=SZICcg3V;Lhh{PI8s1iQHE);5l
z%<iQ?y?Q?^*JuY1{$>H$v!!sZb|Vgn4~F{+<iJt^DfVSwFG{+3l&7Ix&wax&r)vx*
zD2c4#$v+Q-TUZZtLO_(bj3uJ`iz?{t)%<A2@Ih4R?as8lb|%hhL-@niGDyl7kVik8
z@P@Fj;J3Uv30ACwg5T}{nP;|)M&VJg=jt5R>YWpro#O+Q?i(_J^S%Ici@VdA5g}O5
z@Z<PDS3$@4Zz!``kT@<`20g45IRU30v-yeZP%`<(lakY<AZ#TxmH)z<@wSC$ps$DQ
z7TS>~w=cp)Qi{m(eGg5re+M2fI)_Rl#BrgP557`%3E!#Vhx2dxlhW#Ca9FaA>>Ewx
zqNFsqMOK5nHC_f2Q~a^$@kw@9fHzJ&bsQYZcLJuVWkf&f68MyL9VX_#CAybHpuJBA
z(|UFbSzyryel@4C?+?GPyCZG}+sgUqkdzHcC8B7_HXq*11Q(jH>L97ho?dd=NKk}g
zH(0`Jz|XQu5NASqnj^*Hs+(_duYwxqVEbI6+O(RO_k2Suer6N4tr$ihuYi1A!{qDG
zHh4km9TCy`#OcWh0P;uGxjZbvsVrSie0N#G&6D52Jg+Xc`E)weuR4z}OSy2BPw7qP
zZw}#>*@KMx(s1Ntd7h{LEr^(3G@={0UwQISmCWG<(4;qeQDOFTn6mf>dGNrTRQ!4a
z#SW{Y^2^1H#v4fXZO8%t1r3qk9>2i->3V7j#E`N;bu?4)3NFfQAzMXLLD@Nda`d1r
zGtxQ&{CiB8cVDjJWXnjLFMbxDIwCin$q}ArcT3s%n~br4P%V`9&&8bzn=$J8%Xwzn
zg7zGq;u*P1kcd`e8nPyuH#^;(?e=>>_4_4It*$UEi`q-JyM3UNJ}ZgLoENCN`UYIE
z#fTWTUm<go9jMTkG59RY1qAN<0FI<EFm+2Sron@ltGpW5=sh9jn>BzcT@Uz%WWjGK
zZ9EcU0P1cEK+}TPBwX+-w1`h4T6Lc|<nuo;saK8D#nPDJX)VW>*@DhRX!1->xl;bK
zLA0#(DQ$7j$38#f=o<Sf)ZkqJzUHkYGd&-WiTiC-ZK?#_ku(QSo_z(=3uJ_z62+lE
zpCg(4gCu-U3yj2@@rBjdoGxc85c5bL`%ddaZ9N%|<@Buk&{_j%Tib*^kBz{?LJOG7
zZQ4X}|FqY-Q<ykh*$5<#{sq<Fbh)MDbHR<1MQpywVw6yNo2PrBnjRXGr>%!Do$)xE
z?i$@sV%ndeRFi9H+0k(-emjYbk4;cDy@jZ{|149r)EO2B7!p#h2^#kWz$t}YL}%a?
z3_mMDoSOx~TTekoZG$(qzHG<F-VbH3s1%c#ZzkZ1ZBM~93mv%M&zeYo@B=Oqy<o9x
z2_CgCVF#bC1lxBeb9;9df#wSyY^}8($`m!=$$ec(->iN@9ftRk$MSDzSd$m0^jro?
zNcsrXZuLQdQg6CswGw$7r;92*j*$P1ejt%6TZr)5I#`kO1}X(^XU;tlgw=1p5_jJR
zY>?u72$fy{HTBa>>`D{7?9^w%H!%woaMGbExC{L6r;)XG{^0RGdwg*E+-7|-zN(VJ
zZ4)(P)p`FwSP){@1|*|hr!Ld-AYpp%@lyI<_8=Kt)lbTshG6`c<4AsF26EZxL{b*o
z(PhRuBxkV~x`_{xdJQh(&pQu(o@#;*OEPfkW;wXl>kB+YXA-AdD&)}G1n^PGjVvqq
z0Zt!K#@j`$@CBK5>~(WFsC_IF?+QLgWW?8T6IGnaf>=}Tg@+EfYR(S!WPb!Fe!VBq
zar$63JwXKR1JcyV-kv%#p;%6(m%6r{B4@%gk$J^>WE`~$Z~uK1EgC*RtV%|y!F6YJ
zl_N&33rNDQt+g;-qYl(JmE*tv_JVy2wHUR<yGizSJF;6mf!*!B4nrp&I799L*0JO=
zmwBIYWQZrQ){!FfCdJut^C`S;w;$Q7Dg}0sncO6`>73g*0COKM0hY0bjAdIG*{8XR
zIs{y&Yjo=I_7k~O7c`PgI0pZ6f~WJ`uSl_z842hZp)GNPWX(~1l*xHZPAZ><Nv&_;
z5t-+Bjbt{yEa^bD_?;%Tp4MdTgcNbV8VO|NC7Hla&QSRMPWFm*KiGP10=q8O!j50o
zky#z1+|A9d?ASsfq7m`{VChF-XAqB_tXT%e#=My(`j_!hd<zW>s>u*MPgi(ep&S@Y
z-Ct<Zjry9{p*NGHKYm8??*#(|9ZT}{$@JFF>Q0osZyjO2X2WDTcX0gQVY2tw8m{pb
zWp+r_iu7}AnEobHtT6C^NXAD&Nn0nl@8`-@ozpqj>$|RD-N;S2?dS&frJf6$=qmwi
z!G3Ohm>X!Rc?OpG-Db4cn1Q`s8(5|Hs~N?el-zl1L-;oZl0N4+YTR*_+VgFu(Q~)c
zMC+yC>$F~xwR{2BfAJ+_Nr!Q6xhKkgr$DTR7<lPNE0(yPNfz}1q7oqiD(1S9yzUT?
z&+WtO3|Z!;{uI2qIu%O2y<$Fi!-&~+#{#RpzlpOC&jT4!f7oE>Hc%y`0vA3@XLN8X
zx4mo)^Vmm}z4lWHyR36Hw;v$X;`SuG5NkqBByLfAQ8T)EYBi@JcLp__>4HuzSwM;&
zwUJ|vZ8*Mq5Z=GE9Ayn@5MMSHI;Oi~eBFlOzJ|o;#z8p$sSy^Qx;34X>A^M2s=)GJ
z|3LKB{~I{ZxSYE%j#t`G3KfZlC~0UZt^0q@AxQ%jrG!XBDuhRbw2P+pQlThRN>RGc
zNJ3;}G>nj>9@(RedhYk<#rb^Bo9lP3@Au~g%0{F4HT6#1S+bMt7=L7aVwVt<R#?q@
z(`d!-i4BHk9iik}L^X+>auujZd6J6f@#IF;1V4SuoLSb+hw-Zh*v`WTnHT!YnkW0v
zw6(r$b?tpHW?aV<j*8<2gNMkER6p3Z^e!qcSwW{M1;NSj1EhN6Ubs%mk{qilhjO*S
z@Id=Ds=6tb?y4^ZX)9xh(@%49{O3*VI6%1ziP|L2QG)9d8v|G7F9gLg;zY502N-np
zCd=i_xtrq;xU%9jl2|Q}1k7{zxe{V*LyIW9bEBN?Ts4i!Y}(A4tDn%Gb<1e$J~y(x
z>n59`sS6vI#Lzrg3eUVefQr(~NMB1EIC0huG?jb7I2@1F^uLk@p9wg}j!!FmJZPeO
z8~4?I7CB${iE5<B;eED{Yi)T!mU|ZhDGLQ_xx>96S5AqDsC!wDyC`#YybqxBojI>Y
zd7RUT590I(Uo^M-`?IL)3h+<Lb+%Mn1A4?ovo!^6ls9pk6u-5hiYAL#eq|~=@=}+n
zh6*^!_(qg3*Gts`W&yEz<-EcXS2(Xw9jb1n+&ftzD6jdM`#3BDwN&+>=<ZxlbW5AN
zCaZ$uqn)_KGY$CE%Wa&^R~d4lM~2HSlf;L|)yeUIgWwoE2n`OK;F$MaVB>uSP<Wu(
zdOSLxWgdTuBuZz}8kKuQ;%OqwQ@lXO5*O0M6CaqY+!<CCqDeO$cuV)MufhwJf1^Cv
zJo3q+m%Ai4%>Cye4pvX_!TIKF;3wn3y_Qb^iSLd>KV%Pdb{d1tXRJWY?KW)M-U}pm
zW?|`NTkzkYZj!%bW|MkX2iLt<l_X~C0}-jM;IWt!(KpE9$;|KsYBra7As@q8)f{u!
zxFVhvaOJe5Prw~FOpq6^mr#{~Z%EYP51O+Np<wM7FtpGg>3!OXavYM$rW<Y`UTP1o
z$>bM#I#P)*wJBpwf!nd<xdq<b+zkbg9ZoQ^g|klkLuke07istbFO_H3#G2xmt`7Xm
zWe`92KhMkTcOklObx4}CBx#yeL25ULk_?qr0#)sJ={A&|UM7m<-HX}Om#z3)uLg??
znMo}JcG5K!vSe+77rnguAPiBfr-RO4;fA%hP)<)D?}hjgE?4X%iT-DhtB`gAC3PH#
zkn<uHM|7z2!Vst^U}#=c){_1-6~Ze$z&CO81CGy)u<dPMvcMz)JL&$$7kUskNurgj
z{+5hy?*>#jXgb;JX+usn{YR|sRPsXaS+JshP2|`q$yCSmX!)Ne7Rlz*8)A#-#ch)G
zLpBf2duU1ms%2ofoGyGg9*J^tkAc;_B|t~A0e|N_IX+*JyJGALY%-2xlLQgE-fEOa
zt_3tG-kCbwc}L}+{jlb*H31UhC&`+UBC_Hx!Oek-@Xv}~US~}h;Fiq*tHZ}gnd=Eo
zT|A$-_rBrBl!WnC))%wf^{w!3b`9HmB>)uUy=P{R&lCCf>1?`&EL%N3m)=2c^ysEZ
z%&FLoyqV^Ta^|;yQ*9@4_iJxDx7Uso1x_c6woU_yAqC*W&sJcx%ZrefZmPV-l<L$x
z;rQu*m%K>eQ|tXA%hbzoy?ho)E0ZGjd|BM2w~h-l-2@)fmjEvL0Q_&B21!~qAkiwF
zmsGKeW$dm-^E*md)~-T&q{^Q8eDMV#Mp3vV?hiGnInQQS`f*o7q?n>gFZcu>qug{`
zkllEf8kioX%WTW2(cOQ@fU5~t(s>wEiD*H2iyNSEW;*Z~Sxe6CIt?Yh4f4L3ba3~^
z%E`Rn?c9*NDe?R{g4-(B5XnUf<dl#%$dir`_}O*9eqIP5ZvTP&GrM?)Kfhu@MILC<
zk3B4Dqaz)(_?J~iHjq@V1bS?4<&uLh&{xa5m|SKY+oWg=zxRcpe6<Adv34^VQ94NQ
z^#r=(%D?3P-bkWk??yHoUV}EHuc4ceJD8Lf1C3Hnz?s*IxFa3nTzFzIp5#6Z$nQP8
zdxJt8U$A#?9;_p`)#~|C3MaX~(ps*2Y9z=zx`khEILTUx-enQRh`cdLV=WRFS;*p#
zthDbkO;WO?r@g!A{1=yzhE@Qqj}D`<rRK=a+#BV;ItS-2rsTb_E+`#%M;9DSBdJ=s
zBt9gbmIYn{RcaGZGQykW|Mv#m>~V(r%|l#Rk23e~EjP|mFdt0p4g{C>7=oqNGT6m2
z4WIStB6Bje$&x9lL`QCvx47m9u~m-bxA6<v!X#z5R%sP0S@?#bC7)SLp(F8aRbi(8
z8Nn&P1~~km4^5W-L+2EGz@kM4s6;Lk49&S9$SV=h@NWuP5!V8oK6&%2W~fk;04)Kd
zFV0oFU|`8V2tMhx;R^c>-ukEcoapvABK7?z=&r4W!tg3k>XpH>Wr9KHi?3v(#R&dv
zcn3G0<p_V9FG<)DhgbgHPrp9Ohas73n9=KxEO--U7tj2psdC3ywpk`zI(n6w`q<F%
z2~j%G6Nuuf<598mRj8~sK_-S$VQ%JgXj~x;Rw~UU`<fO4{Us;3*qsZB``1c3ZyXYx
zW3iZB7{v*fZGrT`Yh>38ac)BL8c2|supTON#TWE#;H<bk__3HRkWZ85_K%$-Pv9uo
z9s3rGoHS!9EjOsB-9a`l>m@T0`NX1jN)U2+G2N|y1U#yd0GXoS0kJ$p#k{29zLON?
zPu7H*ZRcpJNC!Nl?f?VAws7q$)JfgmGcb8jftVeb2UR2f5|!w1>b*;u)YS2b@MJ+o
z44g)GUmM2bOP`Th+l@$?#TDZCWd*P;_u{q%dV<3ZF<`+rKKZfmIk)<<1+QR+G)=kO
z&89lK(6i%Sq(*Zli)v9|vS-KW#l>PIYNaczxxIvG)P~b5qk_!kG=MS|rNe?}zVu|T
zIh@ll%ni9}lV!fUso32}=x)%!3&jiJ!L#{bA-zST_luC0dH(#7eMf-h`}suAFoxVS
zk|0CgN#x35Iig{eOdK2Mg1rXSTq|t@W8cmYH|GwZ@JpRb{NlpBFa4LTxM=|$MD2)X
ziMhb#jHmU(n_=wJO@Mdv9Fj6urFZNNS^4x!V7;a%O8+(uUMjywa~#fsy1bcm!MpV|
z(go7Z)p<C9_XLU^X@!=*1@GQIjnaaAQ1WLP*KjVGA6Ge<f7kaR8HpYxRX<WmoN5P=
zG9M;g9<|`~@4dw02L;*k&cK;Y0*TF6xO5FsWTD;0qSNYW`NMdYQe41p=S*cz_AL|_
z&V+lmL?c&KL!xs3JIhd-g!JYL_JB=-3?}@Pp4qd4)a#$74msaxY@a5z!FeRC+8%Dp
z`vp9I+=TW1->9FK0w~bG542WilhfrRTq8R~90J|A^AE+F`x-K=CT4UH8`bY%wLv(E
zx!r}Qt#}O{Pj>=pX4iR#?3N?Vf&o@zyq>y7XS2vRGHl|!2TSy3v}j)<Qoc?R*C&i*
z#thj#VFkh2G;}g`1Dut8iWWX@q+(n4lZz@<)b4>V^}Y3;3<|m;!ao#pHzg-SLy*Q*
z-bjYp6`O&?jkkoXvjjVwwly6s69Q-S3_*a(79gx+!!^~p;Oe?5_~2<m9{+8Jlh;k>
z=Xz|%Q%g<In#m6A+iG7nb-0nuxP6R$^2uYF>k??wZ#Cq&L>>uGtV9cz*szL{>uCAf
zVRX7bnrDrLsN>XKT&|-mV3TA?m5>q@URuZ%G#!AB_x!CN8JJ*4Wpfbo+ZFhSrto^c
z)sc)0EAa8Z=>lJ25V&}bfYz&K+^WC`ZlcYS9C>@1tKXqVrv+UBWktq3;}s>?FRKMv
z?@(l)QiEv4ZUtu5=Er_+)L`YnoEB9Np$KCI<Sh9Bym_3(k`FW^m-k)h^bAq%;q5?@
z*lG!+vp;}qUohDoo<^>HiUr%(s{y5TDNy|Q5QY9FP~X4{#46|VE7o@4@keQJGTg-_
zyH4hpe3k{@Z~Vn`-RrpLFT_cvml01&(vsxvsO2((+<0-<G_j#%1~LkH!(NZb(}iu;
z%<hC%%Op8vRw3k1lOGEhsnBQ0aTW$&g_YP*$1=3x?rC(=ydV2MGZFkp9T0T$JR}ko
zMR2CTxnA}}i__G0=Z@>W1+Opdq2_<4gRYN$Q1*Bo*1vATJ2jC7-p?Mu&&_j)&RY@i
z@xUCSdg51ej{jle{v-i^eRZ$-os=!!QHVITIAeZcLnc}$@{09;cB7$<h`ze3-6GSW
z&zhJ2CehbdAzitAR4QslJ~b_7Tc0YT;M7i3k}bq5w&{iISKcL?I&=v7r45g_Ytw&|
z<hb6ptGFZQAud(sk&!)vpi$sv%6oO;rRR?DdLxEF%7f=PW0wLE-rNYXv_Ijv4mBKA
zQiT^(YZA}yOdv0CMQwe?@&2J0z8h^pi&h%3(zLDg@dG>NUt-E4=5K5fIXFT4l4hg0
zEFI)-xPppA1TlY)*{DI~Eh_GH1}gSCFjUo@Wc1b%gM?Y|%+)9K5DejlH@_jG-gn9I
zhiggTNl$LPEDgjJ6<{By4&IH?I&Mde373!tukP9IL}C_y2R*6|+@^6=l796i@9C93
z;1FO!=D*y5-#f(dQ~uRL$Xtyj=ta{Xvn^Oa|1lP=Ro5b=_gm0uVTU$+nv9HHLTODQ
zpYh5?QR#PWR8*P|BozOFA)bX-Cu#YS?ehI_W6v|H9PL7cb%Z(H0U=`MA4XE1t|QYv
zMRWIS_3<g)HhiRQ5@+u14a^kmfl~Pk(xQ0}uF;(X_WuYY?fh9lr0M}!5xE6>xsKxd
z(%$^a6A38zcPI-Tlc5hDU1UoS>}C<3`&(qyy2w}OT4XnaL*~ZiWW&x!?8u^DD0=f(
zl;0`CQ+-_rSM{9&t?`Zc<MkRUJ8dD%3M&N9mYdRsd0tRGFaVfEC=rEYy<p=kb^Ol3
zmPB<-<p&%1f!(#MLDk6ueDc)|7+w+$zGg<@x|va&pN9x2ywT6O*yWKmPNsZ^^$U>i
zmPgE1Y&qTMag#aj)MF{j0$U^t3g|V7AhcdW3MDPw!-QI|vuM>JRI>Xn%J#fWwyq0?
z8RB!nIpq;3vB3ve4ZlF*Nf%*OO)<T^A9J1CPJxr+Mqpda0N2<#f`8v?A`LCtWV)m-
zP}^q-hCL5}vn%DHwL%#9Ji8YhjabX?b94dU7I=cxi41bw=%e-UmtmyoYC~_nH79|;
z+*$J3t*rB24O84^3J>U%GxKIS)Zml=jgtV|6*h!=j^9V=?yqPl424ViccE49c9`+t
zEG+4misqHJ!S{<E0E4BXL@_oA9!cB;mVbFjZB8-Vy7Mhb?BSEt(j@-AdRx%)^D`7b
zk<J}k?aiMOQ3IDR&F1}c`yP0fw-9tE8I!x6hP>|h<4Dyo6OjCCbkA@+IiWI_e(G1I
zl@rrY4jZOMo~PjFzlTVa^<Fw^FN$I}FG4A`BQ$&SRNxb91C;|tVZ;0a*wG(~X2glZ
zmJ6$((#LmP27Cg$OH|;oo`ZC)vm<u*RK}~N8p+w7G+gLv)y)668~Z(3Nb19+saQ`c
zU_MzO-Sa5Oa(@Y`Qj&1wk=dM3Cr5_*-q7AOQzR^K{7(J*6h@k;0G1Yk>TXLSwk!sD
zSWlpE)XJg{@u1;+OO)!QM!i<H6OnLXVD#~pV3)4~t5(~=8cAQ^zWW+%`lkgN*u8?C
zM-*V|EmwN3B$yvZ^TE;W5a<1wAmEfCsf-3xUfi5cFH56y<J3vNw=Xa~+Dm-bRDt>4
zi@}AU5njV?C%R1iIsGoO3<haf!GxZ0vU2!49BrM2cAj|-$5aVwE|fy~SW&?FdZ7LH
z-lCKr8B}cQ7~Z%RgAk3&;C}LaqI}^p3}z3gdag0y_6g>=3udrsku)qlw2IrGoYZ{r
zrVywfRp3S<2Kj}$gmh8Cv~lSec@(#W<fap<t<z1SvR6@FelWRw>NF@Vlj5li*x`21
z^)!C`1j(xngYCtoRMk5OSv37ca~wO-(hshv{|-fWgzC^go@pT9xfn{xbl{2$??Ul@
zF@A!6V6%F-3AJ{(18UztB1!vn=p(%fu+?$8^_rh1FgRTceETrQ7qjyN_U(7L==xUf
zV&?;rx9<V&Ef2tsoep3nD5fHjFR0MEDC#)!n5s0s1MZVoS)F(2CWQ}Mi1(jKu-YIR
z-pagAhc-?^A|GtgieXbC`BV$l3SC2u!5d)V!$T-L;wnlpQUQ{44}!1ie5maGfbT3T
z_`j-s4Oc3<fMee!Y1)%7H0&Fux$fGey}gBKFOBERNc(X?YgMtH!bSeuvI%_g%LZ`t
z^&ZZ|x)PN2*8?slgmgvuk#~>3laX~w#Lmm-YD$0($(z>4o!Yb(J{`XeBL%s9q+Je)
zl2MeNWlHnU>!Pe&akQcEA~e`=Nx<lBK`C3x$ch8caT=Tfy8UfAXC!c-ZdAgYFBQ<n
zM}?|wdqnN*r_q?CL2CBmH8BhJ=Sy<=c=4Jaxc5c_k@(!nuTbv*i;UCpg;l?}d!w~*
zvg80>@vsGeL#AYw$Qb^XpL;b_xxcyc%0IZ@asjgPYl3B#m!W&e5W2&gg`)Fj(OdV9
zqbZgzKvkJ6$kUq#M-Gak<cp4&?z+Os6#PR(K5AmG+P~nZ@K<P=>;gWY+6w|e6#08#
zGRX_+B|FQ5>GTKYe8WJY<^xwbo?%2WN!0j(J%Abs7#JcSpQz(LT?|^oIIt&H2TFes
z1)FB10LiBV7K*BMw5ThIHcMv0m}Qah@8N!U=gMOAc-s@yJZ(MH^%8Werm3Ouc}6tE
zzaB2R=!cRca<FJaB=FSICZ$Vm;VNfwyh(K>p1NNhNUi=09>0qvac1J+Pmmnfc+i_8
zSC$G)IUz1YaChPMd&JUJ9v|&|L~gooqtg?g5l{eu<?>jtT&4z$jK+e<ZXe+M^1T(;
z3&;s;2rWMTfr(}pp^Ie`O&72Z9*Xj)z@m@~f1`^)ktT{#>VfMU%aQ)z9+WgxzzOGm
z1#NTZkY*(f;?}=Kun&bacv2E)zKw!};&oVcgFIZasuyIwx&)ec<upGfCY-SPL++Bi
v81^`qfsLO%C0R$)NS6r4Y<6AqJMfEp*DwR@nKB;;8Q24F4QH!!x?cYShGnTz

diff --git a/src/feat/test_data/test.wav.fea_htk.6 b/src/feat/test_data/test.wav.fea_htk.6
deleted file mode 100644
index c7d52ce013cda791d26b5a0ccd35d2d49e570b6e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 22164
zcmWKXi9c3N7>4bHtci#a6{4a=c+bpvm7=t2)24{fB9*?RvQr_1iYy^yEn9{6%rmc6
zB1I}iC21ioB2xPN2j`qQzkBAso@+!zL}Ell!!txKO_id8_2cP;_w#8UYfUTl7C>#+
zJDBiJhu!n`7!6s-BZsqAvPuh`Sa-{%bXtWMe^{rJov!tUEU(N%n;&e&Z)Ub}v8Q%J
zNZcFvbi)vf9UFid8!d^YiwQ}MI*N<5Ln#01T_ob!!_nfsyl0ahAa(6K$Yntn@^ssV
zPMiKm0+%KbZb2UMSX0MvTdT?2@%rdjMgXg4JW3ZS9;O$cRzl&8b#$V|S2}l=C*2Zz
z6~a2ok*UpRG$(m3t}XFmqjsF&U)l43Ws3F45uHqQJ4L{ntjWM@tNh7ORVToVStN1V
zZm4)R3n>RLWzuz3v9(qUofVc(q_=J)ZiT;@uoDM~YhyMtVzN<Q=P;H(>PzC!A0c;^
z>!T-$Cm8n!8nE1VCQh?AX07_GDbL=7Mo)>rZl74FX<Sdwty6_B+fwP`Ng8zD>*MSs
zlPT=tZ+EegO9B6?V>4@e@C8i!Qi8(hd{*@OWvsg56`5-I0H&TvBqgrRR3=6R+cz6<
zD)M_lrga`H?$9LVV!5PHe-b0GFb2H^I|Kt7H&LX~1uS}5ic8OY$2Fveq3nZqxR`o5
zm=)QBYp;p1aRaNUV{aq%5w@^VjmE6OcL_Mzd>%p$&!j?=BwD6X%Pw4K$VOZW2en-V
z{Ia#Lar<H)I1~R3OL+0Hh|53Jo@|LwvNpQ%&xHJabP9Uc@^R^0d1Cl65MEecVnq&#
zfz~rIa-qwaxo)osJ5;L3Wqn)DB-tA|RrsKgZ(mW4c@Kjg6_KqM=8z+u);Q(EO1`9H
z3%$1T2|YVY1m8dC3o*B^QI*xt!Qzb`UAtvKDB^LHO}+`Ngiba!FCFGL<18#d&!Nh*
z8&5XSCuvb{1vT}~-1t{&WbN%(<Ql|*@3srrQ8FFItyh69dE;1xI8U(olFS(#^=JB-
z@uW-RHrbKT!o7-4AyUb`C_-NrOKw*ual5Y)*}PKZzx@s>`f0?Mb}XYGuHL8e>eKO>
zBZ1^rNipRg^PrOoA5-W3dbE8-JS(tSic_0xsQSaB{5RE|c%kW9$d3?UgQN8@UOtb!
z7o7%d{~0i?s>AdD{v`Jov?77eVZi?q$>l-~-0Ncp3Rx%7n$|UpU2rN>d@`A=aqHkT
zMlFd`VI(@RAP1cliy+TtCve&$|Io{ARiy564PWx*Dxr98CLQNdh2!`m+~C0h>Jy+&
z<=>^zH9zC&@S{QYlBXs9`O=Zj#+&#(<z+<XyfavDdyXAmUqU%2UUJv;bBTL%9xU^b
zN1RJ830Y*z4Yp*Er#qh3%u=(!ft)1y{Iw6IJ)OeTDD<JQL!Zcs?az_Z-`U(=pT*2b
zWg<82=6Vvg`2bUSdJyeCW=MwkqkOUW2|}4wrtor^FYcRzh_25l4a(D`UVTgGl!h;K
zz^I%}`gIJ;EoG@(ngjpapD!dZx0F23>&7N(tMO#hCQ#D*%iLC5MV`&!W39t4VA+Yi
z<V2VVX<7e)cVbC0o|+Lz;#cUSsD?<S-t5lnFS^h5cAP-58{RUZ<u1HAvn#o?S96K=
z-L(j_ESmazJt-2q!w&zuER;F&5zwDT)<iB7o{V}?t8qy*Xmv64?{pL@^lxU9<4>a;
zx7l=Ib`QUQ^GT97OOxDOD1{?twV~#qxo~v+Av|Gf2I$UPfn5ia!T0VDvgrO_61FCu
zd9@}GH3hhk8I6@_Zoy&ZU0^ndCXHj7eu#1^q3=1FWJhl5G##|>xf7>t=!GUYl_Hg;
z99OKN$$sUZ5UMUQL?4f~vUZs&RMS6~BIUJo*RpCjbxc8MxKN8t+WCw_yHluwejR`C
zXe?-@&LAq)b~pq7BVC8gKuuZ&o%|X}-ZZvhG2e7jmpc)}nx8Xm2ezVD^3LefcTF-m
zdk&guX~QWVjt7<$qGpM5lKxtSM4mlHHp$(?D%Zo%4vo*u1(%nM#^VI$bnH>KQ$rxs
z98hG%=j~w)$A?kBx+_rHK1vHlYSFvuH$sa;QtYvhi}7yhcq$(^k3Wnj!IF@8o{_OT
zj+ogBtpaz{A^U|4`f0(zn;mH9^T~K#Yb#lJ@I8Zswpi3~HJ&Nb&*jaQLk^cRNU2Re
zjJv6hdX6ZPfzCJNR89mrs<aAg99xTqTZLTb!(pO3`3dt<#EmU^x<e=%Y0cX2JVdTU
z{0AdW+o)^i6`HeC0B5g!6-tk0u+t^ZVtco3<c_$Tt;mdOaPXdh_AQn|e<Eg48JkMH
z%w-kqKUN6!X?MBrw<~bkQv<wlElcdB!ff_Vw!j-Vc`<3CO-S+RS`ZWSVX#pc5@*B7
z3GGG_daRvSGhmKxnB=0)gG-2fbryLDt~I3_%Ggx#pF(-@kL<F0OKG5r8A=+_q8abE
z(|>m#z+$;1`oO=3O_!-e%6Hu0O5Z12iQE@p*qDYo75vDhIiYYvP=lxL4uVwgE+{>`
zRKS;egHx1u;`^VX!RYb{(yO@zqkST12GwUc_rv7DylYr<KO!MbH+XF|pLt(DIie#a
z^-M*v1IMY$RM&1yCZ&6P$ZQ?Np0ghm%IG_>6O-#`#P1~_v2_Zqo+2rnn&k-*D-7wA
zG-tk4_%bj&m_Uv0EwPoE(m~Gc6+y*cH9>3D2+7}cmNWMn1nrk%K!bHK@3}0FU)Y4#
z&RtDB=JVi;?p<_0(E^z&U$Th`$l(&IUg7zNdq_o14`M}3(7BQ*L@&PQLJo?9aYQ!K
zT09`Q(ff~_nH|AK3RcnY1%cT6(?U9<KoqWL_R)vWLWJ5Yj8J*i0a{D%@D+^w;Q6LO
zx*>+OmA<G-)?UAhp1Eq08@EHz+e>w%BPb6<+dE+6CKViLP=zo4ivs<WZf0xBOPI69
zm(iKgz|?H;<6?JfqMa|?(bH$HWL0cE+V(0E3BDSjL#l)@-wa?`%@-u<P=$n_?vssG
zkJvNaS@iq4Zk)bkKQ(@z1O=g1lnc@q^7gJ|MWaWk<y39<Li=m1(m$78RQ0!&3j0Bf
zHPTSTfj*?$Xo*I>)5tr!Oj6;~N}i{+;Y`JRyeGj1u8+TkR=GT~`Lx-BOx;~3C>r>{
zg+4+k<Y*@I_+q{wO#KCt-6TXkDvBK0v70o~P;#)rj+|EQ!_)sXbDlvH*ps$Jv`cv*
z9(puN4X?IA^-VweZQF=&e)1A_=80WYCZv{i|GS7dNIKI!`-*I(tb)l22ti_rt>~HO
z0d%nP5wWx^C1>)Rxq|=tu~%;!y8AF5ay6fjV=vqBt~rLBLc9nkBXyb!)Hh)or>O`&
zUi9V-ooGVNU)s_6se0tR@pJO8TMQ*eW|Plp&(M$HCrpH$AL}1cPJ6jI><j}}=z1rh
zGt=Z~r$Us_vUMgKVrNH>i#xKRh6x0w-=NxxMYfV@T7*a$q3Pf3(7EYW=xoFS5~S|W
z?faI48a>+4<tM%5+mbNBuik#BTo#JU=DQ-5H}1SSkE?3(UH!P+VFhN~LN~$6w;5<V
zyk|O+y~(YYuSsjBG#arVB2T}t49VWh2zGn3tAa3nUHqL@+x!~7ANda&KJTa9m9vG@
zmLFn|!bG~e`8;cqqDChNHgX=?>9*3&_sOX@@<{LVIVN+a5^CBmPw4j))Z7q_w*7Rx
z^~}NoCb{+@4YMk!G<}Wza}&_E#0I83Gl~)E%;Yj237LCIwt{X=A0+?$0#{w@PaH$6
ziBI`UB+OhU=zq9^qz~8I{GRB+E~v|*FCtE|Q}ma^qg9KkUe_}EFuzY|xOSAqN`v&Q
zYCoIVQAdj+-lBS9Z!6O>hig(ehukiX<!Q)oJi+fNX?n&-b@S#6hBDl_n{MG?nsN)3
z45vemxE*S*2}g7PSs(?P#cTBlWfa|BGnvaon1^gTqu?<@7FtWxT>O29aT~uvAhJUR
zmE{DI{&CXWjbk@i2R}2~vb~4ZL|<U4BcE>ZnM`lKi4mH8Tgt91yH8V|&1R20$)gwT
z9uarNFSas6Zj6Vc3kohaM9!)eSappk`On@5;U`ig=VBJ~O(z4)NCSE-K1e<n+Y;+H
z>jchH?#%BUznPR%_n7tNaty5YMQ&1doO#<GPNwoa&$=oAnIBujy-sRG;peZDO1-;e
z(h5y>tLz|cEXrZ!whGYj!D4#IL!aKdUn89Vy_r2B9Hq~eFJ<%8uG32K?PT@Q)wVJl
z?UBKcsVKlw8`XukVfD)lnxFRsdFDG4Z`*Tde^3n3xSLOw%S02cE@yJTte3F28j#qJ
zFf{4>M=ns#2pNqh==2)}-mdZMk(v>3PYiX?@~3NvuY(*?+EGg)JQtG{Z<N@T{ra?K
zu`lcDwvr1}dPwCQ2z?^;pKw9g8TMq)ODYHpViQ7~=($bWFw<nIt+Z1z+Hi9!ma6%~
zJWlh++Zq<4P8kNpR45?L>CIegq7qMJxdjn<=n7^Yh}_jXPD&j#na^sT$Y*Ch@z^Us
zj&Xg+uFV4N)2~A7oX3$Jo*u|DWI0*C+KiFCs!3Myj*#f5?d<OJEtE6)#U3?q#o^!N
zsq#)snv%F#XyKL3`ZjpdB(10HUYC#be%o#+Si;&$wjV}4Z~gH4174)4y#-&EoQt&<
zE=H{{>`+g{IWqEHS-_cFfJ)e0GAI)d_LmaD_-F%{H}Nehj+#JRB`-2>hy0N0fzxPb
zlmTL7Hj+5!S;%_cI<D*DMs(%lD6#EY1tQ*4*iiW^G}7w|E1|uW-5j}}iWGU#lpXm(
zgS=XH?)0U!+N^*L3=pS(UOs{P*N<$)nNR3v6A#BJZ3g**2{`>!4;HU{g)>Tg5odlE
zbefN#DasSc2iqR*-}^uaT-^W(0cS}5>2$Ot=Qrn}sKX63?_z8}eMF+WUZDMpmy;EG
zJJ7{ECo-`PW8S(gXh~c&EYA{Q^UdAq;u%i(Qqfd4W8@@kyZMsF{o)C=&dIRSW~ua*
zSutxWZ%!KxN~w3=9scjB*5tN!8h+AlM()nw<3qO=;w@=HyzzV@o_OF1><!Grv%mU*
zhH(_Wuv`Jc<_}VpTrp6Zvx~c3rYHDrKZCqoWk>XGJs{Ruw@{+O$?9wB4d|1jBAK#(
zDxM^#gf%xufb)!C_F{xPJn2tmWd^phVGnnaOQN3Sbym30@Rk?u*!7$SUG`)xb7E=s
z^==yL<--4@pbMe%U2v3B8kus-3O&nm#p6#O#pa<~aYpuiaJ#u2pK}J-)Xrn&Qa9lF
z4efNNQVoc_K0@9p8juB92qeC|AlIV$L2k_}l&tv)86NdPl_kf?D(MQISc(@`TXh1&
z*)V)vO%De|>*4|K09#ye4WCc3r1M&egyJ29Tzc@7%3QpFbMihw8#|k3raJPux}ET_
z;R6EyT`)_e8l+R_k(I7$WLv!$uE`xDw*t!Wn;vU8w~4_;$3=08l@Z<V(Z>CB=mYJ#
zC*;t@IUqWx3}kQmkP+^%U<0ok?cXB7RK-+tZq??XonwqLjI_{-*_C+ja2u>jGspXD
zQrY~KX5`HG4yurUmtI)#lytt>OtoKaKs~?j;;EY1tcmjme$wj)a6{dhypPs_99?hF
za-R*RdkO5<w81z08aQG1SzIno;DSXHF?m0N=N%^$eFet;lOxIRX(e`_+(>2Baw6n6
zarYZm5H4abvdHKmXICpB$89sf`d%(6heRCaOaN^#gGb|fp}z4y{Nc4b{P?q+UYO)U
zv%g=&rHZ~d#%~aMPF$i>bF@kOkyZTYizi^ot9&pDX@j73rV#!(A5uv^EF0jX_S8P2
z_azty?{9^<+uLCN*QHo0qZ{tV#xUJ(ayHXN%$OHT^f`}>#+*f9FcuG~Ahj8vnQGsC
zfM%6q`RU4J%T5z=!{IO%Gy`N0KB9)Hoz!sXAj`<Vr{l7Y(a`2e^n9otYCgCf7u}Qu
zE^`jO-g}6y5qR=XU8{nNlPb6^JX`P+*g(|iNpP}Xf<_&Zx#C5d+>fwH%&$4$xcKnb
zq<bg`^o%;O+PJMquXHtvJYbKmHCqW(kG(-WcRy^lQITkEIFEvlHNtFL2CI)6AV;ZB
zoNLWmd}pkWmtV+-oAT9ABK{PYO#eWG&=FdleTNF4_CW3I6FA&98`Ayksob&8SjO`L
zKf*}>=NkNw?Z`%w)jJK8Qa6+9Qcn1>izvD>XD{<oBLnhlOL0hWHF_50j7}^V#@}`O
zQPR0XXj{J(iV1OJ8poxgT<2JHMcxA8o1Um<%T1DdeFYkweh3}!u;xyiyu|T}L-f4(
zVn|xBh@QMQo>BeRLf3Ed!7FY4(4@{$^gM7HeBSOw>;n4W!`fgt`1>0F`mzl2amN<)
zdD>;BAg=;;?Q0`TOT=-;<w|Vzl83{7R>KyxIoR`+H;U+6hm~J#VwoHcFWuoz_(xx$
zwtg|DsV*BW8qy{yn+6z-9-W%hPYz6iswi?-`h?amyFnD}Ls6a2JbLl|cPLR*qf<^w
zqmRq7DSJ;B`^rwEu8WiK!22`wcy2dq6#tg0$v-DIyVvpWOxu963@y+svHe83*8rv)
zxG}|<A?Wqo&G?9GGLFhEA>v0);A}q?WL#!Jo*ZLYo~bVmxHbV49vwvAw6^jFI*l1c
zkSFh+myxg7AH=2?qZKRu+IapdL-XDbg7o=HuD)g}^cfG3LZb!HxKbIyzdfV7D^(%u
zYY&}xHG@4SUqd<`4&vC#r!;IlXJZ#Y_;0@%;J-1RaDpd=Pyg5n@>b2<)n6jy+3GE1
zM_eCFJ@B6UrKW}Vtonv6d!67!`Y%@W=pStN<rGZ!P()8%A246<U*b`NNV3eo9ag`s
zhK;_WM0!su(>BIzHt{OSXp1QM)X)RzDOM23UZ<<K4?*GQt8|;RC7tnp0oD2&!YasL
zA_+f3*x0c7#O&HVDk6KI|DNe&4dja9YuOZBD~Dj>Lm9Gr{$_MyhZWrZxf1Hz9)mgW
zD-JUY!>8BWg-2@ha9`$UbT_1&q+ZEHCSv=LPTn}C<I;9AYtB93jn^ZuxA>7Dzd}yo
zizzpG-dWDU&lbdrUD-_|&a}?)GX7*kq5IGhy7S&2^e=cet?zH5R`rW%WNs_FoxOk;
zmPykrkH`EUCKp)M2R88DYzfYP@}86&b7a!bV^r}phDsl8fzKagAZ`5-e81KOZ;SW}
zdD}B_+DBCq*`9z(qaKlFZ!fg$p9wm<LY1_*sgb2;>e0XpSL^AU1G(W>f5{WJ1ig}r
zBfTGO*+L}+`YX+lb+*!{(g{^GJU0R7Ou8gg6>gx@Z!f0T+6~#Kr{%ETuUE7t@4T&e
zLoF*g&=3DPmSbg)<Gj>PQ!KxA2COK$3-4l9kzdX3u<o)aw$Ih!?nU>I(f1vA*(Lzp
zUHkFep0gl(RSX^buLmX1-U#O7FQLn*6Z_nqflip+CW9Ymg3^^3Y+mjQV!HR(`foi{
za5xsfn&(J05+6`Y{i9HmG%D1%JfF^6zMBRr53+yKLx}T^g*0KjzOCrInRtiJdUVnB
zAc(A5gjAC^;=PSuVTAO;X`4$pIjezNJc7|%@mS#XGFU@zKJwq?2#fUc@r0Qtxu&o{
z6f#SL6GZML1$$eu+}9wy!_yjVmH7q8c`fPEoi(<fT9W+3@$9eg1C$m?65;z{@bkR}
z5%o&6QprxJ)m2E2?w>->P0?iUD>xGT=OaDu9LN9jJ`zoM>5jE;eISuXBvIXgQS5x2
zPbIddkimbSas5go&f&5<tZ@4XK3k?@uby*cfowAH4c4L4Gp}OB2aiy&TmY!so=43w
zcd(XR7rNp0gDWou(3EZ@mM2_LZq{6)`!AVo9ymZNJpSSo*&%2#=^)=9%hIaey+X?s
z>G;?FsWhEm$kvqHrv_zLsp(GxTajElkT?H`(*9WEbmcD?tLnhG_?=_9eu5JT*2J}=
zav<Nn6G~F{fsg81?3S6p1l78e!Ex_+tm8E7d-gU;)G32?#iMxl%5pq4P?;D_;Yiiz
zBjou88Rmgy1@F<XHcmhO7)w^E(u+zQt7hc{qVqB!-Z7NE473v(horOC|E|!hvou&w
z$r5V0&x<Mv?%Ik31wp_(29Yg+tnQLD9B?=kTJ=@1&V}E2VfRuTe`!917#Tv|>N;=_
zFu}*&T$#Fc$t3W4hfS411~wT<L#wu>!esww>{C&Pb?UrG)9^A<>b8(qpdo_n)g`#?
z@`Hji_aCycd+ce(gtzRI<G*lR`A>SS#+#l#-zQXUS;DS(=Rq4LII(JXK2g`0FHo9z
z&{ouUJ{(@O9u4fB!OlOl3+Kw7hQF!#I8{6VQ)3aV8UGrtdv!v^C2MkePCTCF8p#P>
zhzJgSn8uWk&*4moi(reXJ8y@R2v)tc5sMy)0_klPywi#kQA$WM3Kjd!Iq_1s9}#=l
zBWnX_?5P62`q~KA;DjW7)lfv&&ugY1RcqPg!ly!|32nG);U^lt?K<h)`Nmd6S|2?A
zYGOrKO;+V_ACB8#0D@D&xb?p)xHn6RXW3f@B`sE9nzxxG4BKJ!Or9`7B22@438rYK
z3CM2?CW%MRpwG8$(eW*@DA=fk{JOgkz5VhXeg7<m(#RdIS;3m9I=*KuM;_8mKU(;V
zj~lyAr;L7+_y^VABD5m&7#pakD%8Cbz_mKn(^Wk)$V#am{-|m(7#$dgXUw$3ae_*$
zRXZD$Vt3-FD`oMo#&(b^n?MrLJ2I>2C4&~1VOH%Qxzpg!v_$`4^44#G8Q&8@ME)Pv
zDGLyM{$^D(NG*ulzWLa2c??##*@WUWe~>BC`-tdj8&)aah?<M4^5;BoV1o~v(C=-(
znW8@n=<?&D?6ltZLc?tlxMuq}dUl^Yc8!STkNn;MdhPmn-mGz0=YTj7nUly>94W$&
z17mTQQxjNBn1S_o<)P<)-=UFP+tJ%aVdO_p6?4jEfJy1!M(&<<A|rV=Sh1p%gv)o2
zor8HuOyM<_Sel5gIx3(<wFSgkbv@U%Cz%!T{zeUtrSTVN7P6Z+ItoRWin1#FPN?Va
z#f|-MY4^S~TvK?FE{oB{ZT-vm1ODqk^;imCEtQ1sB##HB#(4C6hCYrq5MnsFj!fxk
z$LlW-;MFS!Fnhd*3A&;V6JoathE<DfIxPBYCK&~iH1|!YO?)f4n~+0(4ppE}mrwGN
zO-?g@C-a$Gm6=4ZNx;d5*W>u;QMjJBoWDG%9RJ8VNINUSSl-!Pgc|>3^}5c{kCq!*
z^94rGHCqik8dmVX?`{NbEla#*pA=qr{{q;U?!)V*GuUv%32$qaK_BM&VMDz@G%%o#
zR~}5~4Sa8fDJxfT5h|VB>fKKmzUp@F`f?HEyZ#W#j;w?kS2m(|-vUT5{3F3Fhls4A
zg`mjsB%`?Q3Rc!SCdi1m$Y1%B#r2PqXt~@hcKXC@)UoFPyXs*w?ak9<oxb?PIx-FE
zy$I#^8g{`Nfil*%qWIMAPoO{32uIubp`8``tQr*>a6)JeS$PRFjXwubv7a(BHM&j)
z*V=J4FE(-y&f0VKqJ{z-h6S(71G$H3n_=dncWCNZPYGT%iQM@;g@hXbQ`OkPe4lE`
z-N^qB`IXl5w|mdRc~4Skd{;Z`5NM3le0Q=zJ5A{uZx_}kbt*K>^T!|8SMfi939QeX
zi^pf1WAO<EAZ4nFS&?G&?MFNCb*|uT{8nyQm`wiO7|&~P(?zjTp`>)p7ml41#-*N#
zC;nQoJh|d-5*WUm*AZS#K3I)%1D-Ru9~GNOZs1Qc=if)((8E1Q^O+5Be~ys6m|}iV
zEnxX|i0+3_c1@N!^qz=jpBn!VO70ZmAN@1oe%KHzKDMSGlnh}{%}@06-FtN0IUKBB
z7@{0^2Rvu`2?+ao1GVRRaVr!1LAP=_F?4T74=&6g;uV_AjUzh*3wj$!b=n5zR9prT
zE?CO@wLF-++PIY@ysl;*Yi%J`gZp`f#u?m|m2S+vp=nffMg_jjAL4H=K226OrqM{_
zvuyh8LzKN^%U(GupuL69aInA~uB=wVm*d~@yCp4Pxj$gJ&E{NQrzRMDvE!s~jlEaM
zD%gtK&{oH20kn#M!|^R+=k7yH%;Bw^nY0B8%dh6nJ&z}uyRW0vq<17EVj3~pm4e*j
z7m$fIvdo8P{sQL*$;_7wGos@)fm8O|LKlC{#IL%1_`6bb;K~O@8n$~odu7>U>Ub=Y
zO-XB|eOXrc-0B?YHdkRMHRSR?bU2c_HCwTg2qnF?Q^~<aJo50wQj~vzBR6}k8A<I1
z6uQ#~j3t85EgugeUY0`CmRq2^KGB@;bTCLC-iNghRdIzm(WG*4G3qscFSxgC2?O^P
z81kY8O}AJ>!d&Ia{!dwSjgAujRcX)nxVHrSdje?OJ|#9?O^$9)sT&)RZ=yeN8R&1h
z2~v^vc+}!B|9y@mTG2U(COzfAa_%5rbYGQxn5{wPxs2!i*CB!m4tSvib1jftJ`<HZ
zj3ax_?E(q!SXAz<%XO@1g9XP;v6`n13JX3*Y8O4k(ymXr{HzY->Dh}Ghi&F<>$M_*
zf0f9m<%M+K5gArRWeI;{Y&HpuTujRhhuE}|an$)k9{aGSk#<~a28~7DWc<HfI4u7S
z|E-r0i}Q<+<HB)NQd15;e(;g$uA2tdYr^oF@i{2)cRULJkVs-1PT~n`oPj^*K151(
zGNlsfTv?n0SXQ?ot;!czF8C@bDHK7KQ|^!`11Z#|UWO`b<!aP!HgnIL55o8(u@Dzv
z%!;^S{*3Gs?2<Em^w#M*Hhg+3op0XG3Qw5RyDmFnZHzg2WK@prw+`}KVvMoa=Bdbf
zjE&82tHk-gtZ>llm9Tb`HNIdupQJrLhGSwD2%@g<L0Wr<!9_X>`gXh{MUxb{^v9;8
zU(%O+Re6C%oD+$vmNB#PRV-;9xPsoc7^1y}A2`QfiOj7%&q!&+HfYc|%g$VRoE2R9
z&8E!mp-lEZ{G)m&Rt~jgYvmWvHg5@HGO-zlcJzY-uZ4f-qZ{|B`Wy3V=o-cH|DnU8
z%vdgZL4;2*c8YI>pz0d@XTS|_-d~5hE)SD}e_m8^${d*N^oorC?;XRw&;&~*1;#Ax
zGP9yYg%reylD}&^nOiyYYObar?qZ`eDzF-lrl^cSk*y4CH2XLkXxhix9~Qz=DPLCo
zw*Y;1m0-{Hn^B3Cg|J`!A*ra|2{ry9{Jf8;kmTk`?AEu@8AtZPR>3lSx%4qu``$zm
zCdm*zDHK1xF~~5oBS?RmE6*r$5L)>HFg5y4mR7i+yfdAod#jKPpHXB!e=H?mQwR~X
z4ZuVTPqJli0&zGpgd)d`1&6~?bggVLt7{>FyP0Ub&#0D6$eD>%1q$HJ^|In6_IPuH
zA-lieDJ!wh1^pG>!%wY~g%_U9B>84D{C4yO=k5fwVyhEKl_-<@O_Rplnk@7&>jf!Z
z`w}^Nu14RLMTjuRg_N{yBtGjrP|vAuu4i!y%#xjmw)Y$&X9GTSPc+M5gULKnC!b3?
zy&m#DT)TkdOMU4bo1Zwg?>Z<vbHiw+9Jni3;tTtJ!%~}<tl@!~&|5Z+PF=W$h@UP%
zO9$fkSMma=q_HljUw#Dby{kxd(?76MUkVZ#cR*G>5Z=n@Kt$9e^4E3^%qIrSy2>n^
zFr0`MWH}Kvuf1604P_j~e~>%#*E9S!Wn!kAiFBt8k_s&+F74%c5`6HCpzHPn96U*m
z4o+Bue|=Yn`+qOtcc#Pe^I;NEzPTJ0KWv~PL&`L9o-;%|cf^B232fE;AOGgs!*J~o
zg{^ne;GAhIs8yT?r^g1|IqjXq)OrD2+;b0HuA0NfeFcy&`<QG=5o0ADJ;&NdmvHJT
z)u?A6hx@Pg2#-^W6?oov<Q%hP@ubw<$ouaWL5JB)?)=1CJSUGESbfiZp~kQSr>O5k
znXGr%&Ent~|Exu-zD0C`;vy>J%+hnn4+0~Vsb6|N<QwPn>*jQmH?DqQFwFoYE}jLv
zD2Axrcn_B4i@`iS74#kF;BiJXxXL*vNEsK4od2k@a;JOosvi@P>xTi<-W(uE&=N<<
zBkMSxzX8)c&JRzzdL7+8IE9HVe1@KD^fRNsTbQ02S)p`?4Wv8i(pg)UQ;CH~DH=Bq
z^Xk_@pn*3$7<fm=H3UOO>2hj%bSARYv*C9{*kFZ~&ZIZPf`sO!z>&k5ypFtZ^7nBc
zD7k;flZ94zrD;EyHohfg(kbX;donArM;q(3PDbq#Us2JB00q8(hz_k?%tdFuMF~Nl
z&}g*><0V~yyrav|(@&e3{6;aduwnuojTj`qEX}Eb^msV4=o6gzrAOy~m!e_fso;1c
zoV>W=LKphSppMVy@SRyQ{7-lG;7}_-S+Z2HrdxzWZrX(^`%7V5!ZI|+vkw(^6*9^Y
zP24By;NA1rprW=W{8~m8WgY&2p6pUX$5)(2@iL>#j5E8@?Ay9nI=mMJcYh@JCwQS#
zQ>=MTi_FOaTN`ppUq~yTk0X<IZKsPWlc|K;E{eF7aCRM^8vp7>w+(pIbfOz|Id>dJ
zLjCdm$6fpp$3-~c`(C{A+W<QM-W9zX{?5GBumiE9et6@5U$A4!EK(xkkGR`>B$e_Q
zDai5g^+b1@@!b|^MfnvpFS7<6pB}}8otw|RkxNHjz8Rd|PZM%EA`{W!3*5zQKa!GC
zN%A|4=`F1-I0$D`OEoL<rf>&zR5w7Q`&XD;WCxWK4QOcfO$gMAMrn34NUn&Atq7Fk
z15-z_U8O8~TW2G<-7CdQXiY{M29Y@3{uGvvxBweU@8gC0cY@UGbs+g#2A^Nn#Pul}
zbBU%&Xeqmx+5Wzer}2i5j21PLT@5<iuZhz*U95n#Z=4~gEX+vY$VF0`>p&AWzhxy(
z`ax8E3E5}uL8WIRsugk<SDF50XFa`4MdbI8;#_eUt=Ww~@XKu_Re#~hf@GBWgQ5mr
zC{pi=#gc8$Nuo;&V+prW#KiL?@7Q6?%ehkX?`;F<&$h*8tRoq3xrVsU3PIbyb_xnj
z_HgN5U(huAnv5hZC3P+%%&phDjMhtcUWm3DckPw|X=x0jzEjj$l>-@2sIN?ACP=`|
zk9(o&hl-6sVH4&rXK8w29Bi?QpmII0;6i1;t<0k?JoV03v_ho}dzK6$4U0kKz-vR6
zW4R)~|ENv)f_`G`){kzEEG4&kqd=xu0hQfxCI$!RkiF60QC`UvM!~6y1bX+NZ4F)|
za62LAcF#vmYfG37{wt7^u_Q8^HwYrXEvRQ>CwAQV6+U+bkzae?f^@hmHR{?*XIrS@
z$uUm!m~T1rFSiBQv`|PiakQ1TF+kmOwFGx9$K!J~vFPwOaZ>Yf6FS#k4)gy8f&0bl
z#3S4aGiPIwMD-hTPUSv{8XeDldeTY6e~dlDs+(L2mqXGP{)e7ln2bEKy13cTE~3w#
zo4A7IipYLXB+?x!CBLU_po{titkJ~?Ry?W?z5JUBchc)&l0SirsfZLz5T$`Zy;$YK
zODf&H1*pw5TdC5uXvcX?vgy1deyJWl)`_=(14_WYR*R9KX@I1*yuoopGjYIbL-h7o
z4$`sx3Y&W@xJU2GxVyjPQF>Shk@C>z6b`>bc{fVY@z$-VeB>Zn<+X%l&0mRDvZ=hy
z!I)G$8=$lHbl^_sY?jeaB54VyslQ@6R4l&@L}>()UwcrGIC0iEbUKtg48YQxb8SU3
zno;06A@@^sGAri5lJ!AeAnmvU-`sW;FWB=9tscJ*=koXROx8DGl`m~%k!v0#Xf<+5
z)q%Xo2lXgC<)@%@Fa$XrS&POoKN($iDORvKNcQh(1c|-T+)RlWZlGc!S8d9tOPk(e
zM%RM9__Y|GKa-++XBM$W*P8I9-tS1<KO8HZYGy4|H=-QLCy=>Nfj`i(3GEquN_?xP
zv7(;}No;)}(|jvr%;~xys8_#$FO6Ho6xFh@^`Z=F`!5;9K?jb#`XbOSThCOre_|So
zzahib2bn)+k;GB*8Yy%Fl#i`o+ZS<?D|3$QY^~zlVs4_TYO|<%$S<4_e39+w`9Qu&
z#8S!TR@PeN6nT840GF+EM_Hl;us=8uyWWhYiWPDEo*x{F?E1sR$4y{Wn;xPZF*oEo
z(H*aFo=CD|M{&#<S=cuI0(hlQ=8~dM!+2jwFbO`$l|*YWW_dc?lO;(gLg@wBx~-C&
z+5ChQSG{LeeA)$T-~ERcC5<C;=6obFtcYS)@Tt`E7~JVq!=JjU06Z>#gJ0KE*zNl_
zz<_f)8*{FjUjIBib{5(WXHPrRnGqxWZ`Tvi^uJG$dS4$asoIPUx+}(Jy!|NLJ_eR4
z6=C<~x2epeH^k~cD^k*anc1cAn_Oy5CK?O&RU6}j<d({G<ZJVqyy=f8O8$Iqcm76l
zr2IY!Fn2+94-X)X^eAMJXp8)>oP~k17ToB6mCw_&!d?|Q)LVF#l^=7TYBPOU`Gf!H
zrP2@R^AiNYYnnh9a-082E{CXE-sI{e-(ynt2OGMchU|q(SXJ^jXvwa_+K)`(Ro^14
z+vA1Kjx$A*H-vN7-7UCR9^<(0Z>&H~C!480yqKIRC)|+yQncjbGoop)jOLs*LsP89
zQC`(A#{1VHWb<*%{K`qhm&dvDm-~<4bw?s-g_JIP`p^R^7me76O#!s;?@xTeC7ZnY
zrVoXk?))BQJ#wiofarY4$GY7;m^aS^3Ua<-qj+y(Ts{wl&JcqI=^ChnO~ogkU*KtH
z1afPmMbP5Dbh7uN8909NXC@AMp~VXu$rt`8iu<^p*KF2~)dydZxoN#<%Bnh|1b>*J
z?Y4A5S`pr(|AN2fx;_3eXFYAn-pLk9`qH&$KC<Uj2WXEE59{xa!^0vM=(zVO{GMgQ
zobt^#q<VBK^ZUCywh`w8GIg|ibt@SakM$U{4Z1M*#1qos8I3*C#K{ruk4QvQ1W&wt
zoj6P}hoglxf+6{aF_$n7RwW7;Qh$f3)9Ap~+P>tL=pu5@J(-K2l*-+^T|gZURpEja
zv3!T723FN=6Me50#^%&pK;NoZHtrxx|E?5}tGykp`p+9wP45r?OW+xN>i&Fi$qs{z
z+zWVVZ5x{KAqhI|=$Ho-j8DpFqpL?cK|Uds8D3UMKDyeB`4;Y23SQg1t~dxA1-po!
zNCaMOtVli=RwKPVZ#ehgOR-(tWwLqFX^;yaV!lWQg7%y((3Kxg!q1=NFR3N0UolIc
zu6d8&>{H>TU#ek?r^X4zXJ&ytznhg>!%;>3R(@~G04w#Z9TxiTqgua=(D(9PI6r4M
zRhf5<Ts)qMdD;26>~<1ZAD4yH_?f(SylyZ*=`P6f-G|~1>;_5ad{Vw~jCT(7pd*3j
zQHEj^h^>8!9T)jBK5e(jJRfBg`$YjFen+#aMu_Tf)@5I+h_MGEKG0OJ5<0WpjoNOC
z;LE4D(86=WuyNIRMi5_4cPXXte<+*ajMzrT`(X{8P@hk(?)`$>rWTXorT+MU*%NMy
zVh$_Sum}{}{(zCE68gC27&Sd+42!f>(W|C+%-rm|q^<ld(*GHVmmcg!KiGA!HB$%q
z{qscOOSd2{ScrU@#`^YKPd2ry1#BmrV2=j&u{+eWAgauSO5V$(;nzN}cW&vz!yPen
z&ibEpk*6}0YEgc}6iRm8)W&K(MKEk#PQF>$U}4lftQ=8_X)=X^gB+`wQ^_@YHlio<
zcVNrS)pXyVjSwMQkLG&3M8yt)q~xd!$?7Ip<M>C!8hV2L@MF^TQw`m>^TCpWd7NQH
zED>y)NCF@KhrekE*aGP>4r=lUDy84DNp1_-w8uMG|0z{4>%Yl#T$B}V)=4D4?ycm<
z_L)HaQcXOiG6%Bm&43-D*YGCE>DV${h#x6;Q<WQ&aLDj1)D<hl33<y9xb@K8=QBak
zqMG;*3DG`vKQ70xi+rBl!hJ=FV8Aqvbwo8-y+s0rulgbgIe(X^RTN^4y>`&_dp`R7
zZ#nBCewRfv*1@VO2Jf2xgLSJN!sjop!L==FFxBZIyWyP+v4$1=NEr@ZU#ushmD3?(
zcqts&8;T6d=Q1}P)}TvXpQ)sG5lPxV_Pz96(6=48h-|`iD(Sh5S*zAYE{5kLjcG4y
zY*gJzhg}W1ezXgA-(r!qbuid}T_s5QrA=BqFL6QLLVV)Nays!thM+6I7#j+okZN;Z
zSQA$Z6Gvlkd|&~YXE6=BqaLE{ng6jS-WpWZ-;W={`v6Nu`LMDf3gFWlGW@WMxjWez
zoO<s<%bg2QcftX5izq7krU;8eQjw;n0VsTPMXjeUaUT+V(9HB*NP9shNj_9Y;v7vu
zv8xMf9aA92qbbB``M5DBT@oojxP^BNNYhzOn&jQ27;bujDlFU?GRBP5aNya0u=vJA
zD&Ln+b%W=U$JeDG&Eh^J%x~ZaCQTrMz>Q;b&&aX+6^dY{kw&6=2gu!y(@?rJ6P7fH
zfjP}3DhfGZ9$bSZME@ffUVcN3PV1Q8oHt6!PC_O@vD_c^H{7E;YRm<-a6A@-sQkDW
z*JE&qi8DNowz~AA?;=IieP1xN=SY%gHapP1N=XQ}Ig5rh?~gH$N~+4OgAZbJKq;w_
zxnnAZ*<%m*0a<U!!h2^JE_(+^nk^$CYt2dL#XRzP<RK~POk!%998vubGcr}_0;$Y!
z!nRslxK9!N=vD=b#Pm(jbFT+Tw$h%7oERm@Iq@6i=7yr5hvd<-d%(1)Utvc6>TyF8
zH)F*Eujm4!x9I8vQ>gGi4)c%eQAtSw)U{uu3O5%+bAS^3e(DUnV_k6)O~zZ@{P@SU
z)Ht#I;~DAj7IJoNF7eO(MtJTVXr5Zji{a|<dU;iRsCX}`Q{IBL8{_cQ2Q>_8&_pBa
z&X8$iZsX(iZfG@AiK5mIa@S8*@s1l=2)tr9BJatTXsNmw_wnd;LO-p*kx9p>qrC>R
z{7e{J?x)aP^pmR8=|FwREHve008dTk23N2tlX#rAC+TIm=&yG>|Ku(iyyf>IlJI5(
zIexSge6ab5ZigR(wXR8E)hk}JM8pD5AD@ILLKdttQygQK@vKDXTWqrAHmTh|jIP+$
zA)A#~P_)tjw=8xx`4-<pPNh8|_vY0h+0Mm8;=F+LSNz6hAup-jxd2E^3xVLZi4b_c
z9d;Eyg`>9t3Ofez>b6l_ergLY4RB?}Eio%Q9LNv4vkc$++6b%3R;>QI814Rb9h)V;
zgQ<I1IGeK(9KYU1SIh|NDe{8Ql3w76Ik5qU46*U6eA2n~GSZiwjU3z;G0|FkxFM&H
zApPVdSLQw!)+Lo#M^-g3?sMmn+J_VH+3P#-x<5fw>*XGF$La(+m^clcT{NBsoc~4h
zPIOY&$4ANTfKr@l(M_ZN{-8R)D*2C2X|w79-f+E{;2HDxqP8e8EaJKZW?wx)rC&cL
zuRhO$)W9!j=(RJM*A@wT@5HcaJN(dT$>${SnI_rL@*DX{)^a7nIC6T?dPbw{BdT5&
z40gTK1ezJ)Sg+BWC;csut51q$t?k38-hLJ@oIVaDcWX12S*6t7VVGu{Wzzo!14z`3
zN^E>=5Ps*yLxGt)|EJeecB;1tG<mC{xG~Nmq4u6>wEhpWqHn^G*#SJcmZ8cqkI{$m
zLwJI#2}Fj}uo2@QlkD&L%!Q62($iyvxKkNq+4?Brep~_T`=%ntC5~iwo*#EuWf>~<
zZXfH{T5!pJkS$9=^q%wud`?%04jr{7A-R@xUtX+m+++o`HX)i8p1;Bx?2N&VeG}*;
zcCxM1%m?^o+)Ri+xfjeci*d-cJao+Q5PV7TCbGqM@mU3Z_!ZxS<(dcZ?LUi1dcq!j
zqt+7moAXd<q852%7l=;k7LxB13Q%0nJ-p9eNGcZ|#_POYiQBYTvS>&aHJM6*@WE5|
zW6OH_#U+v#Qoj>Fh)pLV7OwQG;RWGDhx7Pf-E-Qj(abW!bXILBh&q}W*h;CJpy*aB
zP>ythQ1ldc-8LcChFjshZa<E&bp+M)HKcdVUL<AhP9{%R<Dw7slg5lTSX{mmUAwBm
zMR%rheW@LQ2Hmi?>=P`(h=QKy5YB&ZMUJmCCVy*>W7SucuqS;x`yrK~l{Zh**<wH0
z)7ReP`g?PPb8|K6!7r`g<+GjEPk6&xtX9Kuc|h$x2l4+-*hUh*77z>fXqb28G7hb3
zf|nAac-D<sI265u#!pqOn0N^J8rchr2Nz(?(L{2+&yn1ivxB*@uNY;mNQSutgX5SN
zlz7>hlx`9up0keO;Adya{I<6wH5Q}Mtk1l>xjWhIU%Y5p%N9B%Y6e?RufY4$HS~sZ
zB~|d7NAEs3N~IbwPM(`UEgyb?tGA;0Js&&BCG}`>H&mAM?r!H|Z|tSU-Ic6Z-e=~}
z{DY8WvYpkucZb9jhXbQvii;17@%k~xY_7GK;IC@{%KtGA5@$T+GTv2S(XUBh@az=&
zp_h&}zb{}S_fJLz6?f4kxpSyN_9shE8q<GI%OP$so@E6(^xujeNHXiAuM-{Ufi-Qc
z%xq<RR?LBP6)qvEt3qt09iF3Yr?!H9z*F=%c#Iz|&ZByAn^}z+zsFeXN)$cJ;*M1f
zMA>*fI&WZwt7_-rGbIyI=$B5UUH_9iYIYHJ5{eINzan+`5Q*OQ7*GETU?#T&MedG5
z-0^G7$sO8^&Y`(%{bUKD_}&ya)e?)pjtmRsl*Q@TUr)bTchJcum(frD5?1SY2&l%%
zvEx_U+RBCNVvp$$$kR*FOqz^1ohAN;%u#K?Vgm(m==W>9VR<R$@6kZXW)=AOe?d4T
z(E{(#bz}?#^;kr~l?eOClgx;NIA5d{9LM&r>>ujrg2_v;A2LEAhkqa?87tzdU`3vr
zEo5IcjTcJB#X<RPA!Mt%(eDmD5G}f0C@MY(71PIjtj|MO5PA$(6>VcRyHDDVODIK+
z!RC0f$$mKP@RN!+FTkI^n}C$cPh@cKIG(XG3C1<GVbyWTXs1{XKHXABN)HE;Hra4&
zjw(r|$}uceq>nx3j04`;M!fKCrJy)}BWUiEWYpI<a6galBe~QJG(D8qANTa=x4t;|
zv*HHLZe33;r^L|Z)m8M7P9k0QB#c#fHiOk`{fc9z&t@k-`e7^OE<@~3y5I_pE=akB
zp+dq9cbh*3$%(Dl{hTW^;=P&N6Zwk`uLM^QIz^*Sk$iAZ$^g4pe(3irQ%2ii1wOqk
zw7OcSn@oBdiZ$M0#v)#m?D{#88?g1^0=2tAGL1*>1x#c6%$L!tv)|HMd1dMooe8h=
zo#^`O&uFV#E%)wt2HgCwoej|SAxL&QoBq_^R;<$zik&CmcNs6hvKgWJE5C5iEh|`Q
z(2lQe*^Q+y=fK>t`IqFB`^aw4lKE8I520?Ypy8>^Jbt?dYt59xE?x}zT55^34IR)q
zr4dFt&>Bf@=qIJA8cfCUT3A`&%d8Ur&1UwEQsd7DY4WT^^!PGKI?ZqcJt(P6y*sZ`
zowPT^bzqc@?#n^G8&ue(v$FZaHzrdlhhconJ(Rh*U@E>D*^YJx2)VmEXX9HNf8w)m
zu9CCMi<#})3-IiF+abz%3M_0)C99Krh<R{6whX$5K9xt2)G6yx>BJ6VU5vSuJ2gx+
z+dvuxs)9O^4lsOKfUYgMj0?;(;JfoMbzjlOyv<3c({10-6hnDBuXO_6abN;u?D)ft
zZ0=zf_?NRrf<^p(r_J!FY%-1xTa9xhYVodx8t6={1S9S)g0H$qqU;ebX1~Z3*q*x`
zWv2Y4@&g-)mEU-@e8G0I^1KWB`lX5N{t`-N_vE8ro|i#(>`v9A%U_Vg8#&@K@eN6T
zTRFyj0X@G{hSWPtV07;<I>&Y=GZHKdxxss>b>DYLxGsi=>{KvMN}f&5mSOp0ZcpDl
zPyW!z2RIXIjdvLR!PlHbu}(w;*54C`mmjJW1S@9{ZJ)d3MelbA6s;y`g&I75dj{3X
zSD_5XjSCg;U@TuYk>^#9$?n=CC}Q0wm^8>I-{*QTKQnKUs1?si=bLhLqy833ZvKqB
zuljNgcOyaH!yK+!q(Vrt5A_)O3Wbx$a{t61Xu4btx)-voLDih@@E_ohzPJF}#$5aG
zWg^($teWKMrE^;g%W=YvEimofTG-sR8{Kei19OitC;#eFIK=v4$*PIW_sofc&LAst
zZ@U69S<z1_V>Vmu5fdSoL>_W+>*Sd5jy5JTzL%_x%)~QX&M<MjN>Dewh1Xp@izm81
zBop3VAzNeZNlS-3joq6;C;yyGWnCjkhMW$zSr`p}o95bzc5vV+R))_BkD&eogIr+e
zJ3!(bHd^EaDvQm*K<P3vIeH1oZX5#lKFXXQ`i45pfLW5)#@+461Tm>7Cbm(DcwhG?
zd8y_o?WO^`cPN$Dtl*DUuXRMq&*RXj!E~aabDPfS8pgx^SI6cetx)>jmu^0Fgk*?q
zpwSohK}yd$JbJPNLiaz1DNiO*@uGHH(LHP6?DcN!^xqI!R_cWwHdVmtcZ6vYMl<Oq
z)6hmab*3u(2`MtULT>16Mqbw?(d1wY=IWf2<o^cF#2w2misGV#28v7}LxfZ!uF7}L
zc|WB{i3*ocGL*V$PMQoQW04FYg+if3n)u#*PNRrQku)JRh=?>u-S2<cdq3-0d;Qj4
z?^4JdvcScw&*JsF)ZqFB6*BgKCDATWgTUvZIBd-kTs;2=POiTVj@5E(Ou=f-^kg8F
zIz19JFTQ0v^Y7BLg=I|ZR5=Y>dIEyR8`9zawe&=~A{#ICuo-DGl3JN*6NzpQI<ZTh
zOs=T}nKEC<JzzwV3vZBYqZXJpvjlV|ti(I7N8?b^#(zprN5%XqIG4?XscSLH>{i08
z%LfE!obXONbsSi&0!ux5ak=~p>|_4{J$wHa7Fhc+@-u;3n0S!JNCuFZm9yDO;Sn}{
z|9<*AGZUyvB$MfwLPgals7tv7Q`i+@Gh*cxI?h-UD;Hc8{aK$(A})Iq+ofwkGif%-
z^tnPVsP@w-k%9dB%3N_i<Su@`=O}p3*^Zi@{sW=BzTn(IFBpmc5iTuLtobHAFy2`J
z2`zv5oNNg^)Aft!?VJw5&m$pxxv2p=__u~LGm@Y`CQl>X;7PaC&!Q_$WzZ|tX3l-c
ziUy+9bdAJ5PV&k|s<Y{X@GqG}7e?jsKZzyu`XpM59JO)T8V4AwEQh>&?(yz7gfQ^r
z9`>!P<CFO9_^*6C6o(`suq)>)?6k4EZYpH%a^-7Fy9IrP?og-GN|EL!<X`jzJ>O*x
zk@T;q_p<SPTgRpBSV|i=U5;R5saS5-#sC_6r<~^B+k~quthlAS%c-4D5B+lQ9nF3u
zO>x6B;UDpzMt&fQ|0K!<wN0)3-*;A`Ws%ZI_wO07duBq%nvMfir2>4;Y8n*%_=+RC
zBETYtVAESMg8N~*h-1MJo0bh`d#<B<dENZ)h}m@9zr%QS++_5#Kak%(se`W+-9)a2
zUF>}25l*>l1oIl4NRyzDIqofDMju<4+lOGbAS8vYm}g3-zyC_}V^_kD2adwuRV}b@
z$P`<Id*JPlv*6OdO0fIgEr^a^2eP4)>8;VT`To*X_#fLX)G<{W3$iDHv}H3s+*FS0
zE#9Dp>+OPk^-O_M!8JS~stu=T&%qk*1~{tYA)2}<9Z$p0VDctoloxoIolAA#W`@6}
zk;C6em4TQA&rhaR=cd#2??3TqK7clv92InXou}@z%;@38rNZA2i;%rdKN?6_4l13R
zFjZ3uJ4{o@HH$GZSB{|OUT1Nf*K?xi>5GpPccGq38T_`eTD-+297U`keC~zCmHmsK
ziW>SWL?hE`@q*Dc_}$+1*k8*Q9ec0|`<JGHy<7o05p|4ZM)q<W^f$xUhcRUS<N5UN
zL`x?5umo?*mSWS=n&}a-j@;~9$Hom>;nsm2!e4#w@J8K!92XsfRwy@Oi7j>bgiAH9
zb{#{at#%N*)d57m!U?B2KgVsaB0<Qj3uG5aV6*X#JoS%-MQ(_{(&hjr(X&A@ToWe6
zHskvmlhLHVJqX$4V)?q+kaK!8YN)--jxLer4E0i&{O48VKx+bBEhxg3%@4SdFOJZW
zq8C(lO%qAvYKhdhmsH2~kZ^cz4d&k$VjaV;@G&X@s&8AvlN}{+!Cnt*m-LWPk0uh8
zBP~R8nLO4H4Wn|gh3IpJIhL)z%MUzl=f(CoSed9oeBnOv-lotIm5X7V(<c5+iI`8E
zrVAT#e8I9Z3-!$1$r8@Z;kNc>FhedAg1^czt;7gg|Ev|~KjKK5=4`50>4+oCi}1Q1
zk+lAwOyTbjlW|vLDxBVJPL-bBrN(hJbkQv@I^~Xts=T-Y^Y?z@Zz-+7RheQg_*5H>
z4#7Ah&=!y0>jabEC*sj@eSB<O0?0gfLnEIy(Ggp>(2;gKKy7F<SpV&WB83iq;ml@~
z6|#_dLo~NKWFi}NqYEW@^s()6j?81*80houBd$f`$dM<Joc00_n*Caq&CsVd5>^gm
zvioKbR5kEhwxm;|k8i1gi9D2r2GLNf8XC}f6FvI#3D%@3Q|(#b;X-E-hWC*uv_uIc
z!*=1^zE4<7HxX-bzBpayD9EUaz3@|>FygM5PwaXMlH#nPgS3dRFP*>^C`ysW(`(qG
zF9S@bcm~@)oW+(8eWEuP4ztKN>)9kLH+<>(Wm*@e&GwAZwviP15!+cQkiD;mNZtC)
z=T(})A*WYFVaS25+3iJRGJazPB|TWRHIKUXcT<_>RX9pAUGTCk4<zMEarUX#n9icO
z{Lld$NMf*K=~MjAB_S$xJ}oM%sX=i@%{cGaeJF{3$YdqTIa7T*Hu3d0S~9nZxhia-
zil#i1%#&cs#?y(@Y^jRDNN3vgyPc-2zHcM-eIXeQA{ZzV_Z&PF^E|tAQT-}^qFDD3
zDi%4wrO6jT&;AGWtu*2DLq}l&UkbM(`UTsY%prY-C6;$}!1pig#|0jPeDmBFV*O->
zD{kEu_^h(S%;YaF^xuPizL2NV8XM^OSH+y3Nf+W|r=ZlXi|k;yc<#)*n3Go~Gu(6^
zBiF<DYu<de_2UMXbHdzaM0+98RT@tn8vl^#4&#XnG9j9^2a&IBBdQ;4!LO$*!+mpV
z@#DTZ0yo8*I7PC99zKMy#@q{#_&J83Ha!@R$Xtzvom%0pWG&ug6^;K}Gm>xleuv1d
zoQjLLjzq8AZa@X7)0YbZh}J<_YWnyZ)f7HqPM?FB)W$YiYRS`Qt260=tM;fs<0(~9
zT}M+>#|XcwND;DAncmpApD3)2Co9TB$#_18YFG8*!HDB{=GL9omp{vq;x9u)D^DH9
zb_!`tOc~Ay>IX^3L;S&4&iu;9<H4rO26A+IU{lKpJ~}Q4Qd%gHQAkJ0AF@E-^aJYA
zd&qYfxX?2LNmMCSg*5DVP5+zM!{i&IX!g5crgmf|*{r;XoAi2!N&C7p^<PxjSv-$i
zJ$jIy$-9E{ceY|H-3&aMDNqhS2j%$lV71Z=hJ5Cc>Ji!a@3kq|@ZNx!*%09n@sD}E
zxF0|erULs`=|OXE5I?AvOqCX?fcZFa=HNdk<kd<={LNGl-%TRhafpO@88Q8&2I6v?
zks*cWOuAB;%{X<9>KWG4I&*2<S2cmF2w6ZB=AB{=OQ#DPTM2o(;W+jA<Ay(yBCMfX
zO*M+DsoL!hakobn@cxHsQqX4du&)LgT5Hqs=Q6Oqfx6)0>=10xuoI$I27%rdZ8)WK
z1{pn%pc*Y(`KO*g;Xw2ac;UPie1o>}A6w>$wdxn{s6jRx-8_~AcFRDvqC5+^7|b^Q
z(qVCS6>MGNK1QZnipihDv`4a#EjJ1fermHPWiMV)2ko)2tiFc-&ohPwHMCIq@snt`
zjU3GKdqeBWHV}DD7mN~@($V@##Mb5vT(p{jUMmE{iaqa8xJ<ApWP%w=(JBKC?{5{q
z6eVz&(@d)1xgRFv)bbZETjHaYv0PL#W^SA9iObOsB=Ly>%aB{bV(y2q=6SZv+Grj-
zY0=6(&ijv!SdzomMa&Tn2K^vjYj;q4-9{>L;~G}+Sp#jpH|fZ_VEWoM7Rs!gsY&-v
z2pjZ-;v^HevEw9h`=AUjbG1<iJ-~~U?9hYw*Esgg+RCD{;tZ(#96tHLaWZwg4-E7s
z@ukn!qvLNg!N@6zJFoGUWvCycHQ`F6{f{a;a`h8C;(Ckq=qfS&M}{n6haGpa22<U%
z0+u;VPWY`Q2=y$PL2VM=K!7-b_P6Gb=+lFx@UpIgqU<fObdesB$<~82EJJkJFbS@8
zxS=Z-$^crLaNMnWY{uGoxq3bPe)a`H^$%Mz?pPe3n-fnow|oY<o_%~<yB9j>;|Uk9
z&EihISjo;seTL^%+sIQf<9Vq2BFol2%6=HEW;=&lm~5ghcPcfIzO{J9A}2Hp`)#*k
zbv6i>JcxA_eTpk=+wjQ)B0780d2Auh_*ivXkbPcCQ2yUhNM7j=37wPSoLoA%qpi5f
zT6}9K?ts|wk8p3@ZFFU?BCfUTMMdl#woeO2eisJ#%o{5OO_Dy4UYN?I=<Z_K2a55t
zk3ur|%ALgx?O~Tfb=W{~3XAYBrBAcwa49j7v?#HGrTSvwmp|+H)>&eXO|=>~m})}N
z+!nHZ#wI8YSjU^NX1Ft;NOTNG@$tLwLDdd32=DKP%cpI5<2MGl?Ro|}E4v@6e$T|O
zH*Lpqk3vxG^=;TC+z>2Ro<(_0nLKO^Lz8@)z<8TGcY)kur=O&Qtm_kQ?*8R$dCL>l
zvE~`OnJZ#9e813hqiVR5C*o;sQ3A^j{2=_i&zLXFPa-yctr+_rr1sT=q`b<F|N2sq
z&lpYcv^&W-YVunC&Q5<S>n8yTn-ZXXYON@4f)-8=<dEvnHrTiFH};-7A3NQ5;SUY#
z;K*yg!0*^w)YPFQI1nC!%ndBTB(IK3blT1AE5G8>e}B=D;qy>fc?HY-(Q`}o$zJ9Y
zR!%#HR&WX6%#5AX*)@NWu**+JP}gWq{7TzVVf+*7^DjpptXJkWCdTvKW14YY^=@nL
zBc1%j=kemqy%tzR&Y&~P#?42=?kF|U9Vu^g!(-NGKxMQHx?&N5sa@gxf1wO^>^X_Y
z-O)r*RzJ|dS_)wae%$e<N;dIp85uKdMPt415zEz#iM$uGfnpI`an+9EobOzsUK888
zY9h-uixR%0W|b2AB#7JM+xS}NL8?-aO`^NcVvB-bd}~t@Uee@)E$8-PU9S`f7*yw<
zsVC8czUq8?`X*7#tXe@#xiQY|$fCyk&Y{$*UShfNm37H&Z7>QpMpuLKkm%t$oTusx
z>FuAmb1emQVB-)T9_;~tU0g_IP7Hf+GKYQq5y4ieo}~k_QCzb0PA2W(#!6rN3tuSs
zf%4sSGPB_!k&27KA<mi9YS|7xz)6C1n;eIlrhGceRGbCw90eYRT{!gVGFmwL9Teq#
z1e9;ZKe<tW4}JYXwSG^)GF#+{TIOvWJ>Q;JpZ`^mw3^4M!5u{6k`f$-joj&iSF}uJ
z2dVK6=2OP_kn*fwtn{KHYt?LHnH|1RvZ0GBd32j`s}k82E?-#7x6|=lAhEOlh5|zU
z@t)oYYU4N$AHKef)1I}EsD#DA$uJ#MXxR;FS|ubwVHbV;*ce`owSlq`xsXs^jB>ss
zI3_WKt%)~YZod^j_@XI_U;Gt>8l!N-hEYTsIzek&Bj;Z}9+s^b!KwL_W4qKg7Tg=n
z3f(W`lG<r3H7JNBnfGyzIV9fMdKP&~Ur76%=w*HR$~et*h=1T)8L6H?H6lL}<q3o2
z!A~7>PwfrzTsRF?#M+YRaBcF+<27yg$wOj&Ak?IBaM@asFYKduVMZX{TeXBvTyr0N
zzkLvE<Ve8%`Ya;hvI<}C%C;Wr?Bopm8pxtMzWmRvFTwM_Pw?C|j^&i=lfx<#@s(Xl
z)c<rm7uw0QH9b-=?N+YvV*Dj~$H<q;FIY|u(^v4vy4L|rY9qG0&BP4BRdUj^8y0Jh
zrONgvNXGJZ(h-wDYe|Qoqizix85PLG#hFOsi8Fc|+=v<_45@m(h)VZw=NlZOA#3pv
zwkS-4dEJHBLGB%uOHO2*^JQNCX)ttHdQhVeSu7wKn5SPec{dnNi(e^m71=MiwF^yg
z;yh>Jx#|J2wp~MYIxXmGi9Mndmsg;r>bFVY?kb2ce!<@j@xTkdy{E_4t|xU??&MY0
za+=>Rh3}L};i(VD<4M=PVZA;1$g;c|CYasi@0zyL*+Caz+3;pqYVrjJe_259%MN@g
z?j=>r(PL(Tp73jgGymlLO(ykXEt}f-6O}h@WpcIB%zA}@o42-<E?6N$VjXgX88d6B
zLR&L<=dI-j|4GJeCj#kMM=?XU!4*FqJxDVHROz~V&D6sp1ozIKjoliK(d7l(adUtl
zPER|COG2Ab`^`gw@<+emA1^hrpVI`gcB2uoUx^LOA5s<9HB?I16hEHo1uhX&Y4yw?
z`trUJvfDO=&DlPWjrh2nDohMz`^T!XX@@;YMf6Q>YJDcRjc~%0>EX!Z`D~&yHv|j3
zghV#K3({2#c+2%mN%>4mYGIK?HDn5@ZK5OHZCHe-jt+t=64H3nxKF5m!z>(L)5f1a
zX&^FApU(gJBJPF@F2vpDFYwY|!?@wqRQMLQ9BwK_;!Ewjp*km*9@1HW`YQ9tTbRQ}
zIR`K^>jv@&YM9lRr7Ts>kNi}cOwC+BQ4f`~!sOe^Nb28fIM>;P$PGQfuH+(rIV%ZY
zU%Q=@_63kt>&kJfzam=IDDDV|s}rnk@8p+Xlt9ZIZTL`GQ(oR~HpIR;j5hXu6&?F!
zEc!A(kErR5!jEiRa6%RlT#Z`A@6^o2<`>2QowSqUH_b#@V;?El;K1a@YOo;P@0{gM
zO<LM*z}Bj&ahgBdXkoK5Cw2O!FnvoM1m61>PEF22C2x8l`|)1b%r)YseRU+etdOK-
zmJk`S7L`g@gFIU!>k^Gx!JSxh-aEUOA3SRd+B65e4rK_IU79jKNK+l!1|_ibn`Ky{
zbvK$7xF4r`3&F=;R^ahsAzZyBq_wq&NOVjP&ffco&8fYH&bTe%3Omb)RgxbhFVEnN
zhs5)*_J$j=W}GlpavPXd3;9sbc-RYV@Mn$;q?|p3(`F^(qIv6y)A=n#L0g(H|GgO>
zwwxoHdOH*((w5GT^;SkVPyYiGCRA2FjeU$}ZB2n)H=XJD#szS=*A@1PJ+p1Y1!y?-
zG$b_}^9wxtXo<{vHs^I9ZN2`4E>Emy0mk~YXqP1yyzn7*R0SrN{f^r>Hkuq?u!mFW
zI4VpD3+KN?C1UR@_rW>d7Z&UO0a^Dl(U@v$h;clI3qs!U#;crQXq_q+IKQErYrSFr
zB#Jf$Yhs@$!F-lXuu2(2H7O$@L+dDfO*;w~PWHhW)$fp~y;)$htqGFOhVokXH&Njd
zH8{1-42AEq!{<E}*_=-nY)X<eXO=0CA4TkE`qSPJ|Al%~Z}Sft#n}muUwI%pd0&%A
z$*YP^yH%my_hsmeGNC#n{HXTGAN;Is)$ljHnd;nV<gdFeqr0SJL95ms87$1lij}D-
s{N+!q`K%D#k$eXGEylnGatd6Mqv4SGLw?Bg5`Vw)87#~p)^4oxe<ef-xc~qF

diff --git a/src/feat/test_data/test.wav.plp_htk.1 b/src/feat/test_data/test.wav.plp_htk.1
deleted file mode 100644
index 3485889d10d194a2b7e1568cc966798d1970c1c4..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 22164
zcmWKX2|ra`6o(O#sbrQURFaaE>7KpMZKOeoN|aJb8Z;1<l6gwz2xW|j$WX#PYwt@!
zL{cP;Z-YiuG*IdF5zg=Iwf0)i|JfoUA~7POVOb&<b!DitbT4%xDU`icNu6ITz>7I&
zIy+MrJ+m~TS#%If6s*Bhj_MHc{WHl&(@}nvsTh8nXvfTwmIU+D$MKou_3$qKG?pkn
zi|!x#z^cxEfoG?P<CP!QAo$jSjPEK^>8t%r1Y-kJG=n(fj8B}-@nY`H<TMDjN`o76
z8ASD$8|+F{V{)S7U~>E??$_gdR6g>LRo)&)H%0{0Bg@vo@$eSd7yJ=DiyCI-r!uTx
z-y{}mPQbkvZ{TaL(~0&MDgFtcBv#DxN{!rB5iTv!2VXbdPec{1Ig=IAFru{zU%Kmn
zY8~2eXk!hsKKKIWzKJJ4M9aB<J*H48SH)T90$jsC0B^eo_ljy@@PZvV{mKKH^_psO
zT3aD@@?21UxD{`|<<HJ4okMNc^iY?=0aUcNj|2udl4j=#xH<O(J2~w)J8R-p)^GeM
zx^vx?H2xUipN>gjMQ{=^9ukM%i3FRsbdZzRrofQ?DA_9+#f5HefQt{#<8TK@2)<|y
zVdy!LPT397cjm*$tBZnL4{C|RpBG?uGZ`-J)Pid-3yGat0^CW82i1pzaJYUa7r0;t
zjyN}kJ@D`abty2QR>$;k=jY{Q$MP`h*!_vl2-KsK<xk_1r_b5YsfR7ePH(zU><d5q
z?jsywu!vk^i{Tu6g|`75iRqOl0ry-;eq}VNh%P4fx5ne?2C>A{b}^J_e8A;Wa`5(!
z0@QC?$H;8HPE=y;IKdJf?yBc)#!6=wXvK!XLYHR{Q}cp5aN-Tq9()mR`kcse7A5rY
zy|=X7Qi0f>?xWrg7SzfsnmoC5n94hh!iGg3+2ofAB-Txyj?=2(r%N2byZ1Gbr@daV
z><$oz1zIFD;v-Q_h$Fk3YvAs;a<XvdAry2;3o6de=G^s0aBXNS5s`bsQ&fybo0o_J
zKjkb_HM*C1;nsu}y%$3_2W~MZZ(71hg{esWd^J+ES&lpw8L`3&B9!!xrD1bS$&l+O
zdbafg^<LmX=cLXdEBnu(6RNM+1QS1U)3=XGoHXU<od#5H=0_?THghs6y@bE7g&f?!
z7}|YjLBN}IY`A6*TzFQ+Y$^`mm|idBImZ-dz5%E?aR%iZtwHN2vS?GFJM;8XISP=y
zf?OR2(5DC&=Fvhsu3mZxI$B(T#D26ur0*n_sLT<n{5nNX)@-8Ue>G{$uS|O8)G2EE
zLLWXFi$hsr37ggOjp#P_Qn{Vx{HyP0f_#lHnLf~ghTaLdh`#w`nlKm+oZCs{ruSpx
z;|C#hBWAweu|c{uPROuJ70;J*gL4{|sLMSY*(yFlahKlkT4si!U_~V+a_bdlcr{y-
zT-nOgDJep-d++i}&i;YR1_^BO+z6qfAJE{<A4$#PO*CJ^oTimo(e;sQ(V9SOt~61c
zjW`ny{i@eV!yZR|S;u$qw)_Hd1FKN;;upwqYX(^v<G}>2w*@YL4yp)rBa$Kv+FrX0
zx!24<tl=s2iPM1DA9_(n;0`o9_#_jMJ(rm=?HO`-cT}LZLWAoOig8wpOL<dYH!_*a
z?K$CwpK#zn9eeHCWuf}GR=RdkDk+`fLhrj<(IYFL(9j$y%z6U3I%N@C^1hL*RhvUN
z|LOd4XLkr*stn7mT=0~?MoiTrMdDXyf!<Y2Afh635tu0uEs@t~mqI&QHF}3h)jW>k
z=QeQPx~CwWgqgg*!SawAauw~pc8&M3+Jm`ZW?2)eC&_#*@aBfP4hUF(0I5`4&g+m4
zo1N4m)SR}8>RrnuslG?)hgfy$S!_j*ty_SqwNl9!F>khH$|~5Brc3J5SMYDlYlH5L
zP29!WR&052DG^sQAu{bp(8ZO{A@{x=N|Pu9m=wgUzSfKK3JBm(D|CK1ksF!2m}%=5
zaybh{;fr=F`rx7fI`6`mUAeKy@sbrP#1}x`VhOB1VGGKO9hrqsUD&i|ib5TQmE?in
zC-SmYQ7AbxlFkUOq&XR8IR9c7)$!)DWv3m{p5Suwx&0i!GQO4Qem~B5cm|;U2U|&M
z_XE`Y%LFziOM$h00A6ru5CtxGggdTIjMEl*$jom=U%S#cfyy)HVXX;7bX|mW@#CmZ
zuoyy@xG}F2Z=fV?NhEMp;ffx$169o9^oo?2{8iTMp5|hqvPwM3Y{;WNlTXu^-__}f
z^FB1XaTr(5h$4kQ=CZL1)3NT<qjYk?556E}3DFOfXBw>2n7H-6gsgMIYnGgY)<fRl
zdmfF|<Z8TdRR9<s$|J@XHK07{DpoPCWTdxeFoz5eK=!w1OoG8Yq}SmClKhLzHZM(-
zd=(ISNVo#EjZpeskLkU3-LkPFn&oSL6so@IBMsM9(>jq@nrwBGrmTEGFFb$4dMua;
z*A1?)rymCspJzAd>@8OOIyq0G=JOhPKbC}+j*G;5@hYqz;Rb)X)x1r`t+-g+8PA<K
z#ND|0fSlWUo-^yP$Fk#{kWx_^?@9Gen3r3Fgx3<l9<+G}{dAcVzaBE>6;)jQimxz<
zZ^VggXhaUX>$%~*f7v;kUkTNHa;U<lv-IAU*);dP0ZqPrie8@ghAmcc#B#BOy}BTm
zF3;_!W}c<|h7a8kovqJ|miEA-m^O(0EQQ2szkofF2CXAsaJ9%RRHmXp#;tEBYj((@
z6Co$ixnt4DNNF9<$kUF&2K}gKuQyzt*~*0W-DhsxJPWG7gCIq80i0<&fNp${Li|t}
zSSjbgDsMY1l)bu!TDy<Xe0wb_99>0|CU($sb`#k;HzTe?V*`8c?lASR&>(N<O@2eO
zKKH2l5;9HpfQfxVCOzC2(%Qn{V5Tz@GQ9zZTwKI0$#_o=ygSORpRp9ZHYnsR{x6Mr
z6{%KpLuw}CP5g~~?d4&c^m@i_`x@p-@NUR`e+15iz5&Ohzp%)|046RbAG9NctX`h6
za1tG+dWs+D4aXXKb50P=JncaXZH?Lf*Agi5Rtmmz+mHNxWln@nXZiIHFt_6PTx5}I
zj~0j(qH)v~N^KT^-SNN3*d-ksm~MeCGc^(&{0xtGkVR#=J|N~|QuD1#f;W`9A4%&C
zFt@ZFxcDi4jKWSs=9~6Oc%>EtQ>HCp9`_EQfSf&yXr%+EQ}TnIURx`a`x8hd2ba>i
z@-hlxNwnH@F^xT$#{TyH2XD2daMrgz@~YjR#7O4yZ^eHG?M;J-Ib?&jYs4^K7D8CE
z>j8M{{zYMV(nvCA7bJK6Bu*s^HjZC`f>(SLyjzsUtsJW1<+Y^ptomm%o7<#o(%gHQ
z+g@jRfh@qSm*04Ly}syjd@QHF{W&wYkTCAvKUn3wE}`_=XfpKQ3|g#wl0M_~XkL;b
zJ^Nac9r?14lzeE%4GtW+UaLm9g`4=d4labmlbg|*ye~}3+y}^Z@G!*PD;IQ0X`rjw
zn;_%k7D)7o5QNutFxS+VqQ4D_%$vDs0{4*|p2Y?&F1z6{6E5x`7!p-w1Ob*@)T;t;
zi+|3%RLw)J;vL|aQp4m<c+aE?@8QnS*FsT4TdEr5NVA9j(7VbBw90NWJ@QePz3={%
zED`;Onx0k@Nh1RyadZp6VM`%+<m^F*x4H=iKV_i{N$F6TejMq}2xd-ZG(e-ZGv^;T
z32C1W5=1l)Vx^9iXrYfB6Sp<YVl=@`@W*~DGbPBA+j?LkBR9MW<YWzbEoSb_sw)&_
z&R79<o#SgZ{GQ2UJWO!oz&N4kX+0_~v54kPj-t=>KG9sQV(Py{neE^^ks!(2_(y~u
zl?ism^E`6+jjxw-`O((MF?tf$xTFVNm3syEAJs9Nqinb{D+|HK&55^mhc9ZfYys0R
zrdYXF3@O`tF+<hgEGCC)Fs@tA2`noP^3F4JxQZbJ$|3$tw@n|9c%0xZ|Nft#>XVRr
zqi(}_<OSfDS2xjrmcOY?=}ele>`s5Y-b1sza_L!XSN4ARK5}TaEAE@!OgQHtY@YLn
z-{dOFz?Tx_%J)F(OR^9zeIqm<+0S(|zTirwkc~$dtXgW1M#c7nZC*CAp0|v*{C*nq
zT4{h;5#GiGDymr)RG2UYtK-37{9i6BSJJ|*FhG#0$Z-8O!92lDmP<c2!aexS!(X?q
zp+j?D(P^i&sBhpiq4<zJy(0gO#*Eiud+y927s3lz<HR52<I$yfqis3AiCv9U&bFc0
zDLfR`xdv_6X$wZnyPz=OIRx-6xeOgAuJfQJh&(T0!cTW0$(~Kz?MEiee~ZeP=0sxw
zT=<VEHE3a;Yn#KY@R70qzXg^Uzp6PSk_m>V1dI^1TV9AT0mTy%_>X}%?NzQL74<V{
zO7tQ+^m8U{AXjLT|9@;B$|4<wIjrUlee$GfE2_(X!*95<5^Z>Iie9VLFpt#EpgUzF
zT<Go^7^+ePZ}Dh&b8(bw`@IbeQgsDQ-z`!1*!{6%TL9B+u@~vhdk>S}G$3Oi38u9$
zS@2(!827rn6mmPaGiOOM$cv6Lp||pxU3X`J&9W)1tnLVX)wPqv+dQX1`POvU?>lW+
ze3)KR-^Nx+$C2}<6#G}Zlc&Q~xYm9Gzy9@p)S>$jtJUhjQmd<Yw*FI2;z=t}GLr;i
z{EFBw`N_GP*l^!B%|tf6UFfe>C45Zkv$$8$i&DJ4bBD{`AV&2VT5_;~A^9f-|CG+c
zMu8%lanz5~Upk+`pPzxqylkkM*v`(@?V!Q|QL23UJ=MQqDHN}Bq|fa?(Sq?K>;>;U
zQgiA$9x^H=9j#wjZHb%w+GTw7zSA5#m;Q#5ZSvTzyoyl|5OP1(y(H!u4J5>GJu?zM
z3N~5;%uZ^6k|Mpyx<wCQcA_FGoEL};7d>bG%FRR%$A3gOf4ZU6TVdQR`w<j+c^4N^
zFq;_}s)X#XVnk-zbJop|rTNy;6kk^-T|u{nVh4G2xc3GXE+}CASHzK)-b<`zv_9Dy
z<i<{H%H~&Ps57EEFLC4AG_>r?e!O?347OCs;;?Kq7+6XY`xFJN8kGrIKVP8`p)Nex
zo<;6i6hU6>IAnLi1f{Kg$pp8FV%7E{tWoBKdHFBVZXpk)2(NSbKU0zFz3s5et%gim
z`k3A6@q~I_oJM83R*=5nMf8L9b1F1gMn}f|Vy&)VvSHE=Y%$-F3{(m+=W~sJb$=v0
zJ^2{7Pgso?PjkgPzfQ&NDxNs3tO%bwVo7EmoR4={=s=rI3ZAH?K^81Cq$=Izka2oD
z_i!U0t$rHBtXyk{C6BMhiv6qce!l_K85oZo)wc5*JHDg&2l|<I_Ap$Y?}Z2FY=iEx
zzT_|Zi79k$r%5&f`n0Z=p1XdSoo23p1qbAK7cakooDH$u*X$qsnCM!v)ZYnb+JD5$
z?oPs9`}VO?JuSHJTqLVfG{kh8H{<X{7@q9^h3DUT%>5E}Qtj#e+}*$X;iY~RSCn#s
zYa0KLxuLCr+8cGS*aQvqbOXZ+-JgcEN7pf?fge%r&~~J&bPN}r+YJ{tHZyU}pYXma
zNirlHM<-0ofbqtcSe5s!`1<>19G!C(yR1z>HaBAUhbIh^1ya#i<i{=~s9uW0mM_L5
zmoo7CtJdtSk^!uI<vRA_O9@KMMzKkdIi8UfMn*e!2+}n@;nVV1$bEAKTwg`Qot^`N
z8>5mC`Cf>QJPUwH@&dH_&_eElR5v=V*o6&m&%!(CS;A_|LAXXH?mmA7)Ib|Py;+5S
zY*u0~o_xsaZ9K!gd6kYYRJ*YXY5siY;~U|s%^fcO!8cA)a~PHMm*aqKb+}E&9CtX*
z!;9vf!@4eiP;6T_>hLhbtNHaLfZNIKNN|GmeZx?G=Li%_nUlHcHV~uyh-@0GOO@rj
zfNT7Yj^aL;Gk6eIZnnnWH;u4D_fit0Z~{uL<k=aARM4@Xogke%#EPh`VAJ}WSqE-D
zo~$W@o$pOzT}sXQj=K#B@<|o!i}ZyL()mpC$*X8rSRszUCy}9tCE6F&if(WEf{ttw
zN0&xjnSDXOQRE8|E>s}}rVZ|al&baccfkx|E<Fo6AHN|7k1PUW-A<I_yKCz9Y=Q6#
z)$rdqP3%8)D%M~kL6~+H><pc-x0VgDJh23+;2Es;{9SD7ltFg?>aDoj-vhr;T*Atx
z`tTio%^^X$Qrv_^wq(*sECi&QLH!36yxC<r(b8T8?>2`LQC1u4+>m8vuh%8(R;|KG
zJER$;Z3>g(5FFYl3*GV;;HSxFBDH2UnJDKEoa{KFB5@H8TfKmqN(ZQy`iuE1ovFO~
zRWk9NB%LUdf*-hjBx2ugu;!XB=#kqKcGtOW?1FV2m~Jm-<v2quS>?(PG4LYubbrA4
zv^QkSr8^|f`Y<_S7r?wZ??l`u{UOE~A4r8>JnXG=gO{$C37cJw2j7*W{-hrOdsZ@{
zb4Q`O>=<13SwqzAJ7B*G4|dMk1W~c|u)-=CZb=n_Sk7y7L92k8WOk60iJRzztRlQ;
zRut(T?PMpVcmrM7&#n*of{J=0*@Z(Rm~C-@IO(JO_>x$V?oJ{)&t?$+KW|9V=>W39
zU?EZT6eF4=!^EI1j8tgvAhRcKBOweXW~D;>bE!NwnJh{q|NKEA_TmuJrv#e~*THaa
z1*~)F<ILWLp)-(US>N*;j>ey6##`<a1iX_YakzuZ+}lR?Z7QUT!{<<cGZms^(oCXW
zy=SH7A0Q)#K9YDLA@-xgIH0$ee_5dxCu$m!rg1e~-aH%9+$>7YFUlepwfxCRdp)u&
zNQI0SOy`Q_jY!z|ZlZnI7*}{q!b)j8X!BA=G-VQFN3^-MiQ2Hr=^1QE{mwj0EXDIe
z)zOQbIf8cAeT+z>4g}2jPNuE6NsYL#G`6~r>O69$)=AHZufub+!r7da-cv=qBwHYP
z%TzMBO_sFZUCS@9*@c@td#RXXJGxr*i`<BQPOeOBBp$!QNy}3sqFW(}9_GKts-yFX
z<ASMVVOA?HI=dES`px5N-tR-(U00!PsbS2)@L$~7^FKjssTIn0PQ!9rKBI1}>&UWh
z1u`7Tfeh<PcFOHiI;dn$1b0u+c;92Ra!Va)HMXTGrFrPA*f6$IDq{6R+sI=%G4j*l
z5Wlof3tx-4K-xCQp;G7P1YcGoo^M-;$&@{Wq;7-7i#>4kI8j_UQG)32HX&-Cj$^M`
ze2@weM;(QApk3UHnwplP%<FqO8KW2!EOr5FTof>8Cm8U|AL*me!+TKN7!S&>QD@hk
zeJPaAmm`fmt+cpwi1wS<lflw5dhg>o9N@bRhqmozec3F!w8?`i*HrUspH}1i79$dI
zO`02CH-(5Slq0it9wy0O-jLHTR$#55bSzW23S;L}kbR?(={5GinpYa2a)mUWQR)FZ
ztu0YilmTjZG#f-dOJkYJRaoUzE?hmV0d;Z8sB&iqy0JwLh@uSZCHatkP#d6*FD&Sl
zraIc;GfeljjiZ-mm*M~F{8-(Snye$eN_QSPL)sNg_|)1QYx1OdvnpGO4z=J4QWF`o
z!~l~0I=$Mp!UW%|c7ecJJ??>cuRuSc7u7!6hgeY)_^fsx%epo(F)Iv^b!`XJuyTl*
z9<>FF)E1-l<~d}Xo)sDAybj&*lS9$$69|0Y#ukV_5srJ}O+ywvrAfOLDCv1jYd+S|
zgsqxvoW}~h>t!Rm<ZJ}hZP6kZ{YUuKY0k(XTo*O8HA9TX5yoME1v9#-m|Sx0L`~O|
zak+;rQC%-aR1M1Tl-H-Q;v)<bVl?1imNdG(gF#=8$1vqD0^t0ocj%VEDU{q)2sgtg
z0jzlp!7mo0oS(;;_d^f3-pG13);*H`V#29|$`o37*PS*zWhl84O>bUEXCLz8pyTCM
zR@W_={L7Lj>J!@dT*zjq-mQ#UduHI}vPU`nL?_}DV+QXX9-u2<tI!*zr^MA5h{D+X
zw0Q_&*`i!#+i4RhtW!WH4MIGJpIW_V(2`6pw89c64Vkl_`DCrae3)0>%q-raf(|8V
z@Wc;U3WD#~u?N5Yq4zFaqgdw@&8`1TUrzr_-$Z<+5hsk;GIlBSB)-F?eHV!J;95qe
zJ%|r4G>M7WH$1zq7vGY9$QAGFA>Aqn<$A}Uj1fYV(_WF2eJo@uoW`5$gRz>wCG+W@
z2+WP8Opnqb^l_>wFXz%8=t^?LVqyv$+c}%8_7fEh%%6{31y;<eAJ4gpS9}X>rokFj
zgwXnq9U!x8A1x?J7S1r8O|O`2rThOe?6!-mNhZ$|mvU0T_c29(%5(YDPKiu=bPMy9
zWbnIV27+@td&$ti5-g*20M*C8Cf6UD6ZfxQYW~X%z@a90(EXS91gXo@cp*;7Ombf)
zPto%-lcze&a9R>5gMXH*diRoyb6<v(M!k@-@(~bwFahSi{l*n6>0oC)yhm@E+2Aje
zUea6<d*Sl)HZ)<e77aWpWLLFprF+(evEq5J@N%^doc4AtpS$NvWa%o*YL{S=cL+_q
z-cG6ljq%b8K}d2mhCJBjODt^<Bi~Ca@RH=8XyeolPWZN!c{IM?@<@6wB-GAgR>&$~
zvH920!k$yy7o9x7iqh!dg$7Pc_aS646%e(ghI`-m34ho9K+o<pW^GS9(h{Xh!u7&a
zG_J>so^(8k7i(Xp4pD=wjK*Jf%AwELdAbLmTay5lYYt+|f(3ZPNE`CsF+#-S0`Rum
zYJ#6LZAgo0KO}`1aNoTapg_sPTsP%Iz`L1Dce1*m;C>niCd6@59ES1ih(snWvJ*t*
z4<VU-Q;?FJ6{wtE0lhaLKwNYo*WHzbD`u^w?&Jn*{NEiKm7XWG(Tt)8-tVQ`mCH$S
zxC!O6pIN#64!9@66YnC6`GRy8Nbd{B##`QFrG=3^s@RLP17_k-W(9Y)x|CGz`2*u7
zbi#vE0?4b=BBGZL!&O@$lPCNpnCsdLwt=#s3Q|}$Hiqji(c`*orlay38G<t}y+HHH
z6u7!14(7c8?&jTD_)b$k-DTjzx~9a^v%fNgt6r9qYrf$$xaB^T&5Ne|6m4u8Ity#~
z1Tp_u;NN&W8Jv$l!G=jC=*6vhpmyH?tMhcR^$J-~_NgG7oz}pxZ6Q2x8YOe`M~L1Z
zTj)90S5teZvZk}lhF7_-9gN;ALYJaHLDHZbSMTx-eH&Z@(>rqoY39=07K_!)45b0V
zExS0J*}sX-i6~(cGXiL7$2TG4XGD&qpQe%fs;E}2kVZdWjF|_TOpvgbc-`sdmnVFJ
ztp+XFJVPGOF*ShJBResEn}lMX?t((D0AfU4h}oo_Kp#lKr@?C=9IK-b(}kRSDN{4o
zN`&Dl2655#mP}uQB_vb_bHB@@(AU$8xh(<Cysr2uXf<;c?HoPI$RDV{?pLY^wN7E<
zI-O~SQ<2bM?ltm{cauh_I#J8nIW(tz0^aPM#@*j`id2cq@~;sCc(!#ho>#H}?;a_E
zmd)nadLR^>Hh+LxJ#VzxV1%sfodX9O?3m3@R`Y)QB|>I|7k9VIUXb}tS74}MA~4EV
z<m9jTasI!SbN5;_P@(EfWH>k8QXM;?vyK~4#NCSwFh8*V*$*WA_+j>X$wFF{B`2KT
zwwC<cN@$L70#&QqPs^;$(3#cK;KZLC*z7-#fAy#_{F1qa4f~d2#S9f#opKoK)gq?*
z=2D`tCJf#4Tn)bc=|J||p!8|4P`L9LbGWR*UA*{}LqmR`e@mayd-90uUdux=wa06M
zYFv=YT33|xX{TV^vsH+{?HUr*Ji@$N>ILVVdWq+gCiZKf25m~u5vqHvBtu2&wBl4W
zNij;KcN@Nd%c0je=)ooqWc>J7k2Daa-bAc^<Oh0sJOM5gFGquiOR@aXDP+R(5Hu)Z
z&kQb5gto~CQ1+Y82!`Ks1wtD}>6923kns<q13xoKO7FoyL;@kNEatzs9>I;?DJZf^
zr#fSOFK;Mh8K?hd0~2PRO(u8FM2}ZCvoz`uZ4k8<D!BZE56ARqTlhCpu;L`mQd1!+
zUz~7A<|?=_+nitIo<$_u<M8+wx0!4idpPC$5s|EDteEx;&a(aJ@~TpFeohTsQ652M
z=aw-=1@mi4+(nRabvcZC_Z-4KN6^-+iLf~MtYB$y0rH~W+@~|?Ol-A)i6!p^FZI{L
zoLlDwcPGk_GQ~-F$uq*XukoPIY)yo+P6A$PeIsog?Iw|PnrVvre8Lo;#+%BrP~Qr5
z{?%W`;1QvN{=1|LhmB?m)UJsz)od#o{F2Y>;kTlRIrS*Pw*x|F96**yQ@9duUrt~V
zfD+ZNaeq~lp!y#l{k$Z`C4>pUbHjIZ+4u}h3AE#$T=|LA-`5IWo1{V1mtmM^)=RE0
zYr-hroW1|dgLYOA3#DWHxeS9cT5tE9*x7OPqD3Ex&0U9^kBM`!jobM}BPY>D|Nod*
zPk+OMn#-7{KaP9et-+KYRYN5u@-_1VOn5gg*l^t4V&p0O24#FZD6vjM2?{5;8!Jvi
zYq~U^JdlVyzm`GNz1^r_e=7GYZjg(PazRp$^|;1$UqNO6W4I!WAt74V@!6e&Y~5It
zREPfwmD6NU+@w5u-TndLeQBdv1D8maTLpd?Qb=@;U*{JDRHG3;K+BZpk@)`@+%8{-
zZhFPRt?ixI%D0RAnH2%C&->B-kZH){?=E81@s=zdF<_p|QWOk5cs|B#@=)94eOSxm
z6ewPPimu!!VDPI3w1Q4%KKuv*+dX_sHJ4~;8$O1T4NLGk-+nekYmlPpLZPyE6KL~y
z)9l7cgr<7X6zdIS)k{14u6BqJ#d?00WGR-BE<}#+4iJ#|jNi=)!j(=0s^<N}r&F?E
z%9NAn;KN%C*$@r-4}9RlG-q<9X3JQ=w&$5Xl;XY_#aKQs*T9x}Z;?u73yRZuhXS=t
z(AU&HroM71_vycO6kJpfk(+KJM+Ig4D6oVT%|&Ep!Y=yIcM|zbB&mhbASv3|M;kZg
zqto&-`0J5MI`8p#{yBRQBqMHsd?Z{+L0<<JT<B#rgw2A|{7PIbxQ5;?I)xKANTcnY
z(}>PqhO2wOmfUl&hp)7XGaVT~lV(kY+zGc(KG}>);1Kimxd%$feu~wHt|Bx%j;Xt7
zfM{SQBitm5jrM)QwR>OTw2d3#X8$m?P|zpqj^|MwomO&be-pLZ=SZ&OsaWq<0hzEq
zn;$rS3Rz$liWE}v$i8_#c(ctzR{M}A*5iM~HQi$P#OCw3bN(Q@bJ3lt73gENjtwL$
z{s2+YoXtIcT7>LMKf}$12rl`qLUD%;Iro>XXet~*$vy{A$;v4xrE(=&s;~{q1l%IB
zIj8ZkttR%`X~^kB<-jkCDlDSo%KbOa7?*qHjOF`YNOjJA;z2D*bI$?(!F^sNi!HKr
zsct1^0n4$d#C2R_O|jiy9~>X8gxR>qtVq^Q9PkNY#g83WSEvtHdM-iZxb?7SVKtJ}
z^o8rvKp43JCVBB}I3F1RW_Gth@9=%Zd+3dhuW4dZ%U<Jkoh{TP=NB&P%x7THR_t?U
z9rJ1E6aKOw3p>7h&Waq$!6Ik-ak}U!_JCzH_Ru`eKi<BSbblEE8-FpvKHY&vn+H(F
ziTPNYp2Kz`ztKQv9nSf+6jxYp!{+V}P~5?%c=5c0Jo)+x2tOyoczU*gpx%}kcin+l
z$3b{pEr2rv5%BogVX*a109|ep_oI0$wrt!$BlENIe|{H9fZ0NvcJ3K*?kU6`5o__;
zdcj!Vb7Vz|AFzJUlvrEs`4E1+mLKWtPV!HP6SMSG!tPMv27Nw&_V`Qay7gML@@W)U
zIe23kdy1z&OGe?GJbLkO9==)l2-W#L245>RUf{M#@NBXN+$(<%k5kLxrK|zW@M<NR
z?FwXii4B~a8_VV28AeY#cTq4k!Rn2Hq{;a>78cu*#0xvv@vaHjC-*ik{qHR+d%=?3
z|FWE&Fl>UhIO_6~b(WK9U2Vi}O*7H(ok!;GECRhNcZo#BI#|gwW^NRWpfee};YD0F
zk@Z-~+{#wPwYD>{@=0&*(6EH%=RzB}omCBJ`(GGTl!o#%_dzHB29f%vMAULdz`kpM
z`>0#O+;%ObWfm4BlRrR4uJFj73)ji?mMV6_q}h1=!-tr6b3J~YQpHNB5}YJ^4|^Y2
zz`q1h%-iD2#62*REE>5=B64QJ)uCWwac3iue(H=Q_V*GwtVhzz`EdBK8p;dT!_{UN
z&=2`v$oiudFS;Tb#&_O>#hIpXNBS`+y_Eya`aAF^&;q<&&U2@n_JYp1E!<~=5_+Zo
zE4dIQOEnXsNy*CsvetPDDH7E1<YjlWQ!AVCACJ$1O~0e@=-;b&6+Gh?WozPb6+Y1N
zNQ>}n2gWk-EF#I8leAO0M6Ps<4UIYyr+|Jk<F5u6Y#IXx#~s1x0jp4qU>!<2bdWjZ
zti$6QxxmcTx44rU$E$STw?dqx2{eS7Ae+&(jM1sf99?$>7C5Y<fsSQlAXbJ3mxWNn
zJzjKPVHr98pFfGWlET~7&Jdl-7f?Fm8Y^w-fFne^`BnbTSn}W!X5&6xqIUl>(OZ>A
z_MbjOa`yg(iqG9-&esZ3GA;`~1%^O^*e6(`e-QhhdB%j4g(Lq~d1N7Nh$j9`;JLpH
zV@gdQFlU+%awKuHz~OKr5?s#`q-ex)Jxl}a@ynt?ovkFYPL+mScBE>*r_y-=om6#R
zJ+Z&Ak;EN!B+k02_`zEhT!i)!|KVGFZrBS?nm!**_sIb+crMxSq=n227bEhQmq5~o
z-(=r)X_9?66Y*l2A;XHteX0z^&NDU%zWM%O(ye4sq)Q)j``1Qh!$x0p;9VKF;?N=_
zqWlsaVf_%Y%@>56EnrqX&xMm8mFSMZTO{o6LmD*NO5ILBq7fwq^nmSFBB2mMHcVSW
z-A>&mVZR5MuyZ-sUHJsR?%!n8@g#)_XfNk2IG+YRLxI3gv?f~Z=b-SX825A0ad;pX
zi%?Aw*p1hK$h9xfzkg}ml4;ijL+7uej6K#&rl=V2#*AVVedag!xMDGycV#E~K2(GB
zGDSGC+q~*daTiE?FQoHrOvxs3aT*zwMRRzibn6~-YCOG+{Ffg>a>C-N*$xGqEgp%f
z!3LCkyPe-Opo+S)z6-K6P0+>5gedi=lj-r>fR+o0&6iM+%i98KjiwwcA_mAahS_d9
z3HgV~aly6S+;YFuNO*pb`Q5~FNzq?X=CQgO6KNr%S3QCrMs4Q0A2oqh+#F8xm=(a{
zB04*DBk3xBPW?q%==LBTYX3Hj9=nx8mBSCAV!IOZSjh}q{glDs7I(;jfd~ACkYIGv
z@qj?qvlnajPKMgrMPy^q3EuVTazs0#3^vNGLci8uLmyw15Yf6Mh|50>8(iJ03=BmW
z@$UEN>B@)9mCNlln%R8RJ0%iH1ZV@UZbCXIWFUmhg-OeDxV(rPP&WFCYVgd-*8H>7
z;MHqdwEj75Ng1XgIs54;<L^|x>oe68IfX@aZgN*<zD0{?Y4dLz|3s1N^|_@M&+r8m
zGdMQ4hgdMX@!Y!`LATO_7|Nuf>M^h9r&S<{`&|jCGSA3#&s~g2#A&92l|-Fn2a?<w
zi+l$tD&qjDmWp!|t}OtuTssIIn=7VG%;7p^bHKA?AsKuN<V@Bf^6ASiTExFWUxejT
zi$(rak3WN4&Rs*U42k2m&c)=BRujAz-O1<TE76F$E;{;j5cjk!1V(-ZOg-@cpKZTU
z6SK#JSpRK8BIExft_g49g_0xu6?&32_C5?RqLcZ4s|%f5>dRcev;(af-p9N1Vh+bU
z;=w!#t%L@zP%xUe7rja6GcGgCIGx5@<jsY{<XU7so_}b77EO_&`9^z@+`dy(GixvP
zdJsi2B!<bIz)EVea)jJ`Jez;LBm<9YS%{J^{lTB-BnmjxiUQwdV7t>RL8$avkg&yz
zOxa&g46T-8d8Z1v`81a3KA9v~m2r#d8C}jZzTd-`bm?)g-!6q1^+7Nj-Ha}ZL_t&O
zJrw=B6^owvjwX&XL(<Xbh=iygxUaBe%}+McXnd2Nt2;-=X&t63uA5Qwb4ld6Y%`s`
zNELZWy`nQ7R`W|9&p>?#)}u$OQ}7F>g2O`^kf)l?B%j|)*yt*Ybpaye*j*1uJKTwP
zq@Uy!2X;Ygm<wpn3lW6*>A>-oGk9P3u7~!CIuK=OzzsCbL8o=Tg0<I4)U&Z3DLkxU
ze%t**>(@;s;~jgD#eNAkOkau;k;$}aXEwPgcudQJ{K(^;7<x4;fbhih@$)zrGU0kK
zzx=4g*i{&cn%~Rd%zK4gwQ?Fsl@0{!MGJ^gg%bK$-9y5>Zs94pK^V<BgC=di3z{=c
zz-i1&&G{$`x35PsY4x^nYw1qTab!F5YyV5$!isJ*Ip-M)*y;hYVXEAP1t-z0kKM5K
zYBM^a_@2!btD{$zRtrU!n3B)4)M*MENL4}()1cO6ko9sO`mk#}S^at)zc?!xl(h<&
z$I8xlb@J;PgBmwd^Y07te?1SLubz*#bxV-x?EzTb<`P2AOJF?e7hE!M1MMwKVc~!)
zJo`S%{A-)ct@<3p;IwX(pi|F<a(rZc*PeHIb^(ZJ%m-6LZ{}S~8QDBz8WKupveg5+
zv?#_*sM>lDjM-l_yZ9h^v-Tss(l;BWi78_18XeMp!iay(_z0ZRF=MtWxuZsDT~z2H
zLe9WiG>~S_q|CDC;!A#WM_#!h2`>$_ee*Rk+jgR0!9@|KU12X=FnB~nq(36l$+D<$
zwF?;eL?cehPf(Jhg;d^Z!GS4!F0qwxN5VpwuufMJ6{v#jH5A#}lw6wp%27DOOB8J$
zyKYN49r7}Nn5O@m2eYj-@cm9XI(PnN{<RWcu8ihD(q>)DdkrE;5c!g5FX<QDXO5th
zp55GuFm3p`YagRE%@K>wItT+sk_>$lfNHW{L7?wlq9d7zjBi%qsizVAe7X-^P5q8E
z3U4FzeldvjEa7rbJVXhv?{i^)q{x*u@n}F;&EAMxLgV|(h5Gjxru@(*nz}=Ssx)=e
z0^eOQRpJjjap6O1{{0BQ^shT!5adqw$5xUp7ZUMQW<0Yd)`slUo`J1XDEjSv4jRhF
z^4Qpe$T2FO=W^1G`5rEUsz$yDycJD|NS`rcK7PW^lb6BSgJoz_bpR3>7lOhJgkTo8
z5^Q!$ph2I*aNj8cIut&W>>y7zf6+(k^QA>-(6x_BCSInFOCC|xatqqtS;cgBJ!Tbq
zLa4TS3%~TlC{9}vLFT;cBri^NVT<GXILx4)z?OS>cl<7Ftf-6P59}r~<UTlkuEWPf
z72x4s5$2p@Fe)AQ3{+1aLkHLO<Abe^C}I2}WN@ckkQnhB%eIUP{O&9OhxL}I(BF-i
zP0nQ%of7Hx<FD{fze#lNZ%3iR)U|YX@eC?knTvhvQ)pYNIXP$3hihHel84P7`4#8w
z@IF~<WZ0WcvfTA>WbJZRefKn2cuNA?^^~JUujTPJwMO_BvlF`dl5v2<3(|W=3rK<?
z9v>Wxve-B@(KHv&dn1CEKlMgZ5hq|GIgeGgd7@k!yXx+$UPfDIHnBLZ#ag}HLXrlY
z$U29WROCe%t$B5l7ERqvl_P8!1;H@2{M|&=Rj=Yh<Abm;{vSX6&U$3<*%a^i^^Q!?
zn};tixyzbu3dSK)W;n*x&=S%`@cpK{OxU4La9J*o7w%M|)3=8a=~N*K_{QOJA3q7w
z=1L;*Kfmyr9~R8BY(-wT`k0^gW(}4e+=}KTE&xBbYKV}k#ZY(#=a@FKQ<y(o`xbL@
z%9LRH0uPd=X+jP9kJEX@*(7Jk0o#{nF;9O>@m-S)$P7tI?4{txwAWw8!CTu{-I8M5
z{dN*AdeD!??;MW@B1LeBV+yhto<_x03#ew3C6rz{1==He$Rp-7?48g9Ls_Sh=`tU|
zr>+Mm;oLkX+4uOE)7^m<shh$YT}SjlK^<=__=uNZQ(+ZZ7kp*#12p%(!J9kfac4sg
z5uY9gR(=uacg#lIxNaMg$qC?Zj$BX7zlEb!hTD*4_Afl^b~0}Jw;s3HjK|530B_Ep
zk2y;Nd_&9?Wlk<<Ji?pFjT}i%l_`b{{g;9lA7i0-q=zi1xehT!Yay!p8`_hy6}<f{
znB^ZXF%b{MxHm@Av2oZvw0lAYig%L4yH;Jo=wBK37xBbz##~43yzTh=+E=)~ejclL
zbQLTAX%9Ag=)m7lEe?Xn6>v6=2QMdxpio0^JiYWMdM=c}Gos^gia`c0csznVb-mFV
zk1HtO`Z^@ZNg~tfb71ZMa_&jlc8D4bA=4$I;d8`9C<_n@j^;|kJrgr1Q+D8@Z)J1)
z#@)plzckRq2v3ymIT2e}J;V;1SL3YQr|kH3YWT5d3jUKiz$(f$u=9^8;x}7&V3Iw|
zU;q0EXy>?dYsTM(HobF#DBr*6s%IODT;RjhR?Fcj0$Xh6s){O}RWje+-9l3X|A17U
zBwDa?xaLjdGC|7c-(WaT2wa&IRIOHqqd}gqUGW(-_p|V-$dx0OFSux*A#_15wnnR>
z4T*`2A3Ha?xGG7AuO6bTYSKbh<mWS1^!}c)H!A_G(zRgxM0GQcum`?vf&!<hwwnnJ
zT?X2_&eTNlOqk8;+Kido3ZxNn2`y|0L2mAwVAj(n&X`?gDL5*}g_Ss=3H5uqC$kpv
zs&>wW&2Ns4@#!BRY~KnAmmfpy$SkOqQXF$g;{ZSX!5Ma(L76r&XqLq*tTApj+J3bk
zyM|TZm~E2yThS|4ZS)VTCELWBYE5EQiICM;eGBt;MDo`S^a_FoYikBJC~|oTLeBBM
zG86P%L=bvG1Raz*iNb{T%;X+>@ZIani8FIS^z&IktgQ($zkq<MN^>_?tHITlcksNG
zA?gFx@b+CA_%~X?A@f+6_U9iYYGheXwaLdb>lJvViSN<pAHJw^SOOdWnu`nf#IxfI
zWLe8rOICMz0;|>&jek{s#69CT<2YGqzU|2obR;{9Y$%I>U(L&iRj&kDWqOh1>+}()
zYFW<o!%R?Bl7xM=PDEe-01DO?;xtbY6cg=B6ysE3rB5&19ruk)5F8~2#a?9gjMJbr
zE*>26Z}FxdEQf=MmfTQV8s67m#w^df%REq+fM@L;#h#B3;^r88R$4BD)i7ShE^bj_
zXMAnLPleXF-Esum>;B-|W-Y{1LQWIG8-HSSQk@hW|3Ged-=*Sm%Sgw7E*W2Z4Z?e;
zL(7gF66jIF%yji;r<p{d64e?q*{%W1la#^p<bUvMZV?fg^OuNTo4{yI-wlDS6Ol&r
zX6{JT0!UcJVe6J4oH03<9=X#*?gi|k)3%M$MJdN<<k}IcAeD@7*-mCf;<jU<cr!Ua
z?GURta{@obB?*V_m8TMC3dp8N9a19LM84ID(Mcoosa%>Bs2gRGxqFszn&0OVa}i@=
zIMT<iU)_mxnlE#sl49Kb2QN_ALLctJOhL_@A8trL>LpsNln)CdJ|ou*eH5732b^g!
z@5cfmJ5ypNO}wQ<<tOf;!4qZZ35Bin@|!YhyMG83D#qcS%XxT;g)B)O_C@83Tlf`j
z&+(<7`{_iZQL@&loE-T&p1d=AO2wkP$i0XVEM~Ec91DDb9<II!=PwIM)bZJ@S(gK>
z!DIF8^GKk&K@ra?l0#?WKcUJM+wk0*Uztm<2@_}V5UsGihw@w-IGJU&ggsQs`W+6Y
zgTe7s-C+}T`>aTdm-x}r{Bn9_u#sI^G`6oh&t?_>!!UR<hl&g?=HCty!N#RwB;r&r
zIjmlbN-a*169;NZq3u%6Y|K@x9XLbg7Sy628`Mz1nR_H?W)M!2nFBpvtFemJHs&U~
z7HcX$LRZs5u%*K{yk;4NsEbnQn~y)JXCFaI4?U2u<{Alne~r!Qy)B$vGM}=mZK%!I
zu628s7VW!sfQF_mXH%@t6K~IU)^XY}*cZ3asa?1D^<%s7*0-&Yd^m&53*CaFZJra!
zGY1KO_7XfBS;L(=XQq5<0?Irj&J6ZjGWzN6uzH;aJQmMDf9;PUqbxt<d@E~=?Ow-6
z$0eZ;-*!V|f-~0c{R6Fk|KrqhZ==4vhh(N>JnOu;k@j5QMj?BQ1zqZ<9WH-_Q{8@1
z(XVzm?LZy3>QWA?-KGxR_vVw*jlO)c;UScL*5dxG>4SIOC$N6$4B|MneT*{>V4X&9
z68>r@KG*l2Fxexp=F}k8)3^?i**)C#KZvnw%tb*#-{EFi7O%sz7$4Cc<cim+3w}54
zMlJp4q3P%f?rG>j<gBH^&2AsXuV%Z`dvjmVWBjvd_i<@TCJxfapSF;e>tBh1=X3mZ
zU_3kSuQ}0~ZHSJDoAa;9IzvuECmc>sgr%2U1X&Gk<Zwk3eynJS@^<7B^Gz-|D{~h)
z)&B~Duc)Hdp=MN~VS{#SxN~39C_`;lkXakdkX?u$`uqGGBowBga&;#`<l8~unJ?!i
zJy;KxvKlCSehqs(b&F74e;*Ck5v5`#T|&*^Jya(34BawiIhHs2&3eagBa61Xk!<G#
z(A%_-e>=JdU08IJY`XNEsrju)cCFn?bbV&C6Gomu$}f4;P^pP6OJj+Bo+sBbGlTo`
z*9p&8iW|$Y+L!>n4({XIz0h&P9qZjRfg>|E!I~y(^fK26KK=ZRytdmj`ZKpc#_NrY
zVNN$|7y4T`F>xnN^|?bW8s`a>bXHR1QYF$Ks)S+a9_#R6%mJ5vPa=<+kO=M9{I;B4
ztoTBmNXqoW@R;&fE+K=Ot7hWBU)69S`6Je9TZwj9CldKqHSU?r4H#}W#d;Ms%*~JE
zQQ7LwvAJdsXL6U~rKcI99D2K^b9@_`-Xb6p&laM(bAj9rlPjQClnKeArfk^lEkeyh
zb7+=^BMmBFCKRn&N+RPs=;m%aR{Y{6*7VjtGEH5Xn22bRg5a<GX4wW5=eGkD{<9%2
zuPzYNL=XIUbr1J+r!BW`jRyK0p+%;pj%8U2dR(;IM(E!xgiZBcaA3s(tZf{`i5i?h
zcherD?+@+Cw7RV*mFEXj-tC6&($6TTu}+}>iZS0(od#X5RcvgvmvG`ED+=OoX=Tg~
zni?2IC$Bf7)|Sg~S9LDCt^Pl<+(<}jR7FVptXceqh)+mi3*f!t2gy9WG{~Q2gL|Iy
zI2g-14fcnQZ8m9SUjH;y_d=4$PPzem9kj^YG3NXq^2Xw0nd((Xgw+ivadG=}A$oZd
zDsgxSr%KZxt^FyJ$`kOoP-Es;%5gYvYQ!d+XVX9FvxFi;t7w=@A>ENUj-slKbjf`u
z*4XGNHrP3UcRf!gH<+1Jj<<?mYc_{W|J8=??5^egdSQbVhof+`k0o>{m0+LU;apB*
z06DZb10`=?#Y9W}fKzc-2`fGc>U=LDn~HiY>k@?0l8fQru1v19zZuOM-vcS~F5E_k
zZLqN~5=}a!fy4&Z@rpPD_OQPTEy_JlJKfIGSnI2lFP=;%|96$7Nj=7wgXM9Ue*#YX
z5Jk7X_(5Kdiu0?)YDm&VH+-{G52Jy@_-GV|il5hj<M%%7x^oz=`F|nt?h@eWUC;Sh
z)WCUlZ8E8_6kH_aIPTzT)bY_?&?luqq$RYuM>h>1dH~??`wHfZ#|hYy?Sm4IN+XT0
zFVQKXE54!8PY+s;z1@+UP2Ra?(rr}&YSnrHeRg)h{XP~%<NHULT&9in+T5wu@G*YH
zF^2e5Pr#9vZ{kGXAvA5%YVLKH2Ye5(M6ItR$i@Z{qQ60ch>hx_JN}c=2KNlmW22ez
zVoh*Z;tg|lY5_E^C;@?g1ao=A9HMl>5LA<$Q0wAFpj>mC>6-Bv?X;bZ()YKb^lu00
ziN!0a|A`ND{P$F9bz73I-*pfkx%;9{ix;Fxe;O&+c8f0H9#fQS%&%-SBP-V}#S2Cw
z@%m^2p~}yo)?S_{mowyW&?2(*R}38P5P@@&o!C5RK1v&#iNhOeka*lyn2_@ic0)Cg
zJO5z)P&MK+6XA~M0bYOXB{Xu$wmLCN1kL#J8*R8FhdeT*@ZE?1sO{8a)V*;VP1Mn%
z0hede3xjt_<@PAz^I?P(tO2-}c^B>alR*@;#rXoauSDPII$o|Jfr^x>$&#WD;AEa7
z%gJ#hsO|-sQk9Ig_UFJ|n?ht`7KA4{%OPRN14iNgX-?B_EnIiY5(Ij$;NtGRMB#TL
z7{k!#Xw!Br6tXLb$y%FlkuXa{5Pl_#Icbmq5%C<A%yOq}s4G3olcb4_Tj-f#j%MTv
zNX*e1w10Lf*SX^as(yQvq-75C$z?Y(=bHGKdyaVnqC)6()FSfS2x{JRa4d^=hfI-U
z*wB3+ob~#QH`{q)1)H@ff0rAhW|PNFTm*1N={Sl=I?8l6enIkDPtefVH?C@%3x@lp
z8U9r{u6c|H%>63PQ3*_%quR-`m&Me6RyK`wJxCMg=94a(K-K2j!h^09+|uGrjnW&L
zD2Yd8B8c%xQYdlwS%z06KY=a(5Hc#|1ML+-mGOVjWfLPo;MiHPa5Mt3hIi;{WTN2z
z2F}Bg3N8%e5>j7AN(sq`NE)uJd(OF+GLjUfLE2G90|}K>TvTM$mr68flSsMuyyu`n
zN_`Eqq+uj2RN8&_k2vpn-sgROk7^c1@2quER%0Egy_`cg;-9#&CmFkLRt2kTrnp%1
z9%^_5Y|w{esBT~!{ju{9EOV}cVQ;oEscJG)vz74pn5R6oA&!^LZQ%}+z1d6uMpjl*
zi%X|YWz{avNEK2B#Xo<<)gC3R--CWcdD|S=QCURvj#T5(W1EE+UDSxny74qK<U8{(
zJ_gQ)|6wEC3X(S~Xoh+zI$W#{-c{yE)3+K&j}_x|&)+a)dJj52y&Y9JMMKWgci1{$
zB7`l?rC%%OgNE-S-Y-)JbWWsjNzgAIR(*k=n?H?5TCHWJLOr&!O$49L3gBExA~{oM
z$Fh{~irM5FP__0W-cwP4r+8MAS)NbuW~W{7rB(&?1-<~E)8E+{ooY09MJnAj!ifGk
zZwPviD^d9mEfDA~L}g9iWicH)@yKEMvQrjkXouxPw619{k|o(;bHNK3ct8*3HEyR8
z<z#Luc!^!NwDZU{1?*YdQtp511ozn2%WmzdWVxGua;L5(TyvogQ3}Ldy(vgc6@M}1
zWmmA)Ua?SjUMYTc%AO25T8|pW&BTMV0&(~<YnnFkIqAoC)3%{ch43gCBHtcF$$qW0
zeBWY}@pCHd$i7Zv54}ap-JXJi#dcw4nHT=rorIT+p>*%xYCJFZl+-VJ4tE^74zJY`
za>HxqS!>E`o@RN0Z}Hf|-uEc8Yp9mbpWwy^1b5-pp%+1AjiFd}>^an&(nb}d0#TJ#
zF1EE2V7sm;yzuo|bVRom{XBgD$IgF;FEqu$%zjZga{3rhylRHtFZQN_DVU;~myqal
zgzh0SR4=<KT^*|nD)ygoWwt7Ewi-r5I)9>lE@5b3f-Vo*%4JWd*YTVm4SYuB6~1t{
z79aX>0$bp|6Uh{0q#~A*-Yy}jOfX`~UI)ce1^He+u>f2Cu#$V)5-5FBLqE0rpsoS=
z*it+mw67JBY3}QZ`2J`4d)P~K7rmvo-F5MlzPmDGfs|_B8!x|ksnEQMA7t;FFH4Vo
zzW|$F4uJlz4ntJ5FAW`@NfS@BqvaLPcz~@fez2!M_i9n&Dpq@Vy3un!W0NH-k@T|l
zSKSy}w4Y6S8V)Ak^VzJ+7sb*MV}*CM;*s@xeflNEgI0G*@LEGJbn~(+DhpC!mZ7~^
zwJiu+oC=1LH_g;j+m;3_IfzZx6w1CB?}veo<#gpiHCf(?2-z3|xhLSOE5|dV<Xo&3
z>>ag@j-9FwljAMXf-j%=^bO0%hE4W-V%<M{?#C9s&37XYk9)_qM*4Hjcnky8{vn<|
zuJRdhl<ci+6koZu4x5yJKp!&>(4-y{S&g<C?icx0xHGa6J=H2>L+^Gn^Y9N$Z@wCw
z9NiC$E519pk35KO+&B)yY>HsUt4DO^Y9o4a@mb-K0msmY%hA}@Q5zJ}2Z2lIEyqd0
zy)Y@Z5oPJtawo+$`p?k{o^a?JA8c&NMXJNO)%sX=&b66`Ifk>|xW7!;q`_vabY(fM
z60vmH6dZHAo({5q1Koa7NJ?BoP3)|&^1njNERLWa&(*Z?T?DX<UvPMQ4Qw@RMJ1mD
zQO1}y&|9#IHeD?S%YqHyJ@d5eM8iWm=G=9hS=^07ElknAocrkXH+NLw8-|V-p5@MK
zF>yEKe1HEWK06ihWn~NasIopd?QoBk-O*+DhOS_##c|lF9N6CZ)?(=(7d-dYC8p#j
z*JZ1Hm{PbWG=$$l{fR4{-%yAzeha_}q6=uPyCn>gdVoo51$Nr_1f6wJfuYV6d9GEW
zZ>IbQxNW5@=zs-8y?=<(JL2)>n`f|f#srjUAi*;N+fl&NYOZy%78k~K@vPhVJU~mt
zbKlP7p8Bt`M#Wj?lGx6UrslBQms?qjpp}KG)`)5M4qUD~3~rSUfTD9R*u=Y`OewWU
zW_q$6%<FP+&h&CJ*JKr*F}X?BW~Ywtgx|yQS9(#}{#-fxW{CH!u15)7nX*-}R>G}`
zZm=u-K7Ab5hkh!@paT*y`aH%5k9Db~`>Z>-!Du(uIc6xQ=iB({{(k(pWh`IN@q}4*
zEM~ur|B!jX>7=^u04SH}W30YQ%(_+(<+GomX>cs+zAk|3lPyf+;c{WTNWLB(w}<@4
z^2~Tbm8>A8kT&nvB#PeN*y(vMD$9sKqi%NMnuscVe6%rkYSfasSskUvA8w;re%EQI
zUOi3!@j)iqKUdfscmOsAd$8)VvCRIuke_^ZmuF@B^Bh4pA3Ah4`y1ZEz81|Q@t4N3
zB#Thy>ehx$Bf7*@7f0gfr%yost4yMhJXWT0x}J`-SHl4|<<PacnHKDlGk=BmkkJ%F
zoThXauRl2e()8<5)RrE4I?oCpX<v_Dj+luhE-&cOFBW*_f^ww4dl>BQKLmxghJ%@$
zJC9p@8{nWVoAvDwo3TaSFEL4jXYPyQ^|~YY6gzkBcR<ERe)b~T6T}QwZD#Wf2a{!&
ze8e0T6TQxEkj1%>h10JQlx2pq#|dDF?F>5ZupYK|973HB9D#s!ick_=jlzTPFjDdr
zU5l`WVWwfo!gMf>n)4Q`iMwGSjKv4mU%}R81~5+6gTtcxK}dB192&V6wns&<a0fT`
za>6$5k?@{Jc}4QHj}y7h!&2svvWZ36p2C%Gf6>hW=F<3&k@S}2ytt(3Fwy@qg3YT;
zB@32~BoTccm@bgO$=PQ3)n6qX<FN%M*d!9|t}Ny-OpT2idyQqt8FHQ5g<!UAAb4%c
zq5Fi7Aok!$F!s)N4Ebw-O;)<n(It=Yv8<P{NwyAV-TH~H8JxhGKPGVZ)L(r3!uRa=
zrrSKS-#os(Op~!vQ&v+unFJFR_y?-F#z#-0AJQd0Z8V3hIew2VOI?BW^OSK>?+==z
zQwrCY6yZ5bf^l7IA<CXDB>|Aloc+$TVb`}Y)8ehD!u~V;=o1cC2fl#QOPrx)!)H2n
z>mTr8E?D^dA`<OciVu&sL>`0YQN0Ewd}H%H5?45rud7R7we#{>W}`h%YW>aU?)k-V
zyM(2G?33Buc}#N79p@pJSMkXkri=5!L}aJzF<X4<GZ=OIgP^&c?b26*H+OxQ=Bp~A
z_hcM$GISxkepxYxM*v;K1?#J;$r9_2%jmjhIByURS&y#6scIG3f$$3;@w$L5kNv@l
z2h3&Ljkn^UTLf=ia{zbVE)q@dImac3G<jsmc)9PO&Y$fw;3@vmTz9lPoZtSIWR%D~
zF2N*j4XteAFC}qNOb_u3MQq*TD5mye3Y66#w(rb+NZ;kfG~1&{u+b=#H+(Qjl8m7@
z%h!XixPc7*E(MQ*Lr6toI8|~3q0zSY!oNY!rAu=BWa(QUp~2_nT?+aUkk&F92aS%x
zQ9jS{B)xAU^gD-NoH3Cv93ka_yrfdu<TuZlzm9wOeJ0dLi2pV{7p1(;<nzsSSj#yV
z@g?2IMB`K?3!5GXKkp0$trj09%u9go#pmJcZC&Cc`G!9X+=TC2okjVT5_ulplBnsv
zhV4rL>)Tz#YU>7LtH?9>qW@58p4yDlj;Mf>Z4egaPlM+WjE9_6!G15)K{0KMXeCYK
z&x(iejNIScNZp9{wwCfk^Zw&*a$fLB=QbiS(H8|C(&i3w?|6fMsJP5~H#Y1xW|IO&
zU}If|`6nMr?cc-A2Y=Av3FY|R7$F&0HUp;qZl)?!o;j9u3K#YSLe#H?_{^j+IB;MQ
zDzd+YAO06jGxsaWZ?ze)ecU)iHzK%hZRdC;{VV!2Ycs5*VWNdinf#^2Bpz&XiCcBO
z=H2ma{9O5JE|@l-{6I0H;oW&6v$(f>mToe1`KyXArWT@3#e)!%J{;SG>fqc3AJHEZ
zU8Y-=N&1DG(`K!5A}-kn@5_%;=cIbvDEa^a`{QAF<yw3Xtwvq{hSF}|Z}_^|6`AbW
z9O2IfTNqg|i2m_c1VMtQ%rk2*(z)gePJ0H5eD4k7&4GkFpa<Og@_)SVydIY_Ay=E~
zL7p{T5IMvh6%GFP8+V$q2;SeT6<=__3QCqexa!Vqy#8!B=-<@FU;CRgt;e^eivEG9
zXL%F8dDaG=e)5-oT3e4FDW!qq4T0>~HwnHtdnUF2_W|ta$iYV{snD)U1$;cOBGoMm
zLGn6U>K#%k^jIoH!h>$K<wb;O_0y}ok?C{aOA^*+mdrn=sPb2fF0ozD?MUdZERpZA
zP|<kXG`>8u5<OaPAuigqh$=cq5N*pqWWT@>K7J}8LBBr09}_8L4{^gWYX$V2nOwV_
z2ymRQ6NvxjdcePFKag<6Gd#W1Uiy$C5O|-)4jQ?rXR0|QoAzO)0D0fj1Z#A1#CVyT
z+Y_o&5en+FFNy^3q`Y9RF<*}3c!+}=FW6SVpUt?*<MyYDY`lMqjINoJvdm%JdO;Xj
z?jsNvYxZE}BUg#(irxyU7|8m6TT7Dj9HF*;7gIZY8Fxf#z?J)pA=%c|am5b>T;VH#
zUndPw!TND%nQb-7i4`Dmo&ii4QHRrx#bV=c-5@+Q113DR!m0Bf(pA)!K06u<3mgTa
zrH4~^UY`MXLz{Wlf46u_&Tii2cZ<(A-61j`bx<_m%~Ntz<i&lHj7ZTEZShH;6?oyJ
z8)UST9+ntRfg4sDMEE_LK1^<BgO_AuW@ir*b`1ozu+{WUs}+`*TwzWzTV*vHmCz+~
zAFQG%|2tNBLXtxmerb|N6qfZsndW5Bds2=QKYo?A5BG!NHkz=aGEcPqu`$mGapB?*
zPk8);1AIroJ^uM`2;Ug;Q{=wCgFG&7CUrX%<T)DwvCCE!=Nn9x&w}g37zYyt?`~xN
zMhY1=Psr#kDkOBN2U}D)2paqckOc)X5L^@I&=(iTwrdu`%$E^pXHf+nH(C?lxKvHY
z?)*cB$KE35%9-F=*Gs!@uS1`1?nKRdhk!)Q7HnJwi*`sB^Vm71Tr2r954Dfs=|=|e
z_T2qEX7h8=*jJ?@htxpwMbm|EsZAjKwLtvur7AF&h{?{MA934KL)@*LjT1``k^^ts
z2yF0V{)gL{x6J^2*P{WQx3HoC7j#&bdnp{6zC<SVx{Bx~Yur%YfivollfjGM;_hWR
z=!Hj*45n?RbM-{{pj$XZ;Fq*;*%OhEM4cO-y2h?7YvY?`C;8ogF23=5C0Q-#W318+
z1&X$c&J4NFR}UXb=Gfm8r|r^U6AKBM>U*7_=tpE*KY12dnn-%uTFBFND(Kp^IyOqd
z8hM%3;n&{{*z~`D*#5LHuzo`@j0*RrhWtKGPh5>VLVn>b5g%#eB_rJ4Z#h0@KLR4v
zP4S&*16nowGcvWR6irLMAWxl}vxjlzd}!};*5<N_`-HZUhVXb|Hm93qbcd2lv1Qz%
zrkW(&Un1V2P1*7wQ~Y`|A-XkY<nWgBL{{-uq@Wr|%HCebFXv4K@x)k%sG|o-(&+VU
z0OYZhUzXH1;tIsfcBEzZE+GZ)NzyYz9>70Jjc_$708iSUjXUZ@NPd;aISR$7t-n0q
zc%+1+*M8%g^~tzQ-v%dDnWFG`x$hBqf&4H~6=e}8GB)^!NZIKfySky8iyCXho35%e
zi-3XHI64!rwzyAH{#!|kA50|`y@|Lv#DO4()x^)^C}_R(A%**EAWV52n}0VMqh4+L
zq{s!V@99Hwt|Bz69%O2gSI`us3VYRGV4J0{@vxOkg&iefxc+Jyj(gHWQ1~MDHc=bQ
z^4)R4xZTjXVHa6He5J_3>mTwu`!IPZTOwNSIz*n&eNLPgIf+9W*TL=VQ|SF>Yn)iS
zj12b-MYXTm30Bm>3o`QI=>F5hHs6Z4N|ur=w+T2X?=|Zm?uZj^c*t@L^=RUR-*EZ4
z5>q%`%G9ssgQU<JWcKr5WlbKPCTHF*)J5Wm&T_ole=W(<$Yc4L?^x0(9jyA#Mc6wr
zhn)1!6`7hICU<jwi5zbY6D^N!A(zbJz@;ElysExdS{*3BHyz(0^;wFtLY=X|JELKP
z-X*-?p*ain4gg`~dVFJ(78#_vi_G7z3;x&)8>IiGmK{tuW<V|^eC&ZeYm6cG%5R~Y
z##ET2RRf>ZYrtKzg(>8O3MFTp@zVGrl5FG1rahX8OYIhrNx2(Y`klj&UwBBQ>mNll
zq&}pes92Oa<{kMjc^a7}VB(cpYtXsZ7x3ERT3qQcOm=gW1STPK6d2@%eIGAo8v1kC
z^bJX@pWNTi_TG+uH)hj*M1W>{HqgCV%W2SnJldD@6zwTaN5kHoLCAS07CbNmiGMVd
zU6gD66Ye1B4o7b<=#q7R+Sy!T6;8R?Bs<nA=S+rIFsr&RB6p8NL`UlkJ#u%jXsJge
z{`It$9DMLl9Gu_*N}-&-xV8!}{TYwW9WKU$8#2j|`aJwwa1JInx-hqUQSjyCGx+gN
zf!<VjBXmzWD61MXg2t<e=^06(Oj&y;`qudvHT{gD>c+G1Q5ywRdfu0IOP<q-g0bMg
z=&>*?@dP<q8^V^EY$lSkDL6GGh1pe%5vlF#7WwUsVG0j8bH#OJXNww}66Z#&Vw%K(
zpZn-p``^N>W*^$|*n_4OyU4rSMiQm%H8>}AKNdSafSl~(OntN%ENy0k<C!<q|BkLq
ztXzY}1gpT1YbU5S?<aG1FaWz%6`*)Th?jK?!12N**vN7tUVP%X(CANt3@FBTR1j-{
zc*vD06X!=4v6s>|k;BLcQAR;8Iq~m4<YnO?a`<F`2aZf<Cg&B!tL9r#lR<N^INluw
zr$$r#zr#>Kxh;xKv5|iGEWl~DBB5(XD%iZIIBCi{dT3#ke76l0cHW<k9?f1skJsLz
z{$EGX`*I_-sQ3*y7PX*OaW0-y<BO*Ttie%vCsB#5T(?|TCTm_vVQKUVlzpmzx^MSl
ziTg_M562*p=L0X2C9RitkGvHr-|Zza#;wFHd4qU)K)39WxgXAWnF2=nX`sD(B{)0$
zW!g8!gRPebKK9KKr7nMimFr*QCWpn?%t(_SN)Hr94Q)r;E#AoleNSW!i}dKN)IE^#
hS4P`q3Bo?p8|c`r>DcnaF0AUi3c*WLhn<sW{tsnl#9IIW

diff --git a/src/feat/wave-reader-test.cc b/src/feat/wave-reader-test.cc
index f9a71e8af34..ce8299446be 100644
--- a/src/feat/wave-reader-test.cc
+++ b/src/feat/wave-reader-test.cc
@@ -72,6 +72,10 @@ static void UnitTestStereo8K() {
   std::istringstream ies(expect_mat, std::ios::in);
   Matrix<BaseFloat> expected;
   expected.Read(ies, false /* text */);
+  // WaveData scales data to the range [-1, 1], so do the same. Don't
+  // put the scaled values in the string expect_mat, since
+  // representing floating point as text losslessly is tricky.
+  expected.Scale(BaseFloat(1.0 / 32768.0));
 
   AssertEqual(wave.SampFreq(), hz, 0);
   AssertEqual(wave.Duration(), 3.0 /* samples */ / hz /* Hz */, 1E-6);
@@ -118,6 +122,7 @@ static void UnitTestMono22K() {
   std::istringstream ies(expect_mat, std::ios::in);
   Matrix<BaseFloat> expected;
   expected.Read(ies, false /* text */);
+  expected.Scale(BaseFloat(1.0 / 32768.0));
 
   AssertEqual(wave.SampFreq(), hz, 0);
   AssertEqual(wave.Duration(), 5.0 /* samples */ / hz /* Hz */, 1E-6);
@@ -157,6 +162,7 @@ static void UnitTestEndless1() {
   std::istringstream ies(expect_mat, std::ios::in);
   Matrix<BaseFloat> expected;
   expected.Read(ies, false /* text */);
+  expected.Scale(BaseFloat(1.0 / 32768.0));
 
   AssertEqual(wave.Data(), expected);
 }
@@ -194,6 +200,7 @@ static void UnitTestEndless2() {
   std::istringstream ies(expect_mat, std::ios::in);
   Matrix<BaseFloat> expected;
   expected.Read(ies, false /* text */);
+  expected.Scale(BaseFloat(1.0 / 32768.0));
 
   AssertEqual(wave.Data(), expected);
 }
diff --git a/src/hmm/tree-accu.cc b/src/hmm/tree-accu.cc
index 80041d275e6..5131d6a4cf1 100644
--- a/src/hmm/tree-accu.cc
+++ b/src/hmm/tree-accu.cc
@@ -86,7 +86,7 @@ void AccumulateTreeStats(const Transitions &trans_model,
         EventType evec_more(evec);
         int32 pdf_class = trans_model.TransitionIdToPdfClass(
             split_alignment[i+info.central_position][j]);
-        // pdf_class will normally by 0, 1 or 2 for 3-state HMM.
+        // pdf_class will normally be 0, 1 or 2 for 3-state HMM.
         std::pair<EventKeyType, EventValueType> pr(kPdfClass, pdf_class);
         evec_more.push_back(pr);
         std::sort(evec_more.begin(), evec_more.end());  // these must be sorted!
diff --git a/src/transform/Makefile b/src/transform/Makefile
index bd745599032..3899ac52334 100644
--- a/src/transform/Makefile
+++ b/src/transform/Makefile
@@ -7,7 +7,7 @@ TESTFILES = lda-estimate-test fmllr-diag-gmm-test
 OBJFILES = lda-estimate.o \
     cmvn.o transform-common.o fmllr-diag-gmm.o \
     lvtln.o mllt.o basis-fmllr-diag-gmm.o \
-    compressed-transform-stats.o fmllr-raw.o
+    compressed-transform-stats.o
 
 
 LIBNAME = kaldi-transform

From 6d5c87bf334a87e4e971fce1de95ace385bdf153 Mon Sep 17 00:00:00 2001
From: Daniel Galvez <galv@users.noreply.github.com>
Date: Wed, 3 Apr 2019 09:37:50 -0700
Subject: [PATCH 020/163] Fix compilation of posterior.cc (#3200)

* Fix compilation of posterior.cc

* Replace TransitionIdToPhone() with InfoForTransitionId().phone
---
 src/bin/ali-to-phones.cc              |  6 +++---
 src/chain/chain-supervision.cc        |  2 +-
 src/hmm/hmm-utils-test.cc             |  6 +++---
 src/hmm/hmm-utils.cc                  |  6 +++---
 src/hmm/posterior.cc                  | 14 +++++++-------
 src/hmm/tree-accu.cc                  |  6 +++---
 src/latbin/lattice-arc-post.cc        |  2 +-
 src/online/online-faster-decoder.cc   |  2 +-
 src/online2/online-endpoint.cc        |  2 +-
 src/online2/online-ivector-feature.cc |  2 +-
 10 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/bin/ali-to-phones.cc b/src/bin/ali-to-phones.cc
index 5def11ffc79..ed7f99758cd 100644
--- a/src/bin/ali-to-phones.cc
+++ b/src/bin/ali-to-phones.cc
@@ -98,7 +98,7 @@ int main(int argc, char *argv[]) {
         BaseFloat phone_start = 0.0;
         for (size_t i = 0; i < split.size(); i++) {
           KALDI_ASSERT(!split[i].empty());
-          int32 phone = trans_model.TransitionIdToPhone(split[i][0]);
+          int32 phone = trans_model.InfoForTransitionId(split[i][0]).phone;
           int32 num_repeats = split[i].size();
           ctm_writer.Stream() << key << " 1 " << phone_start << " "
                       << (frame_shift * num_repeats) << " " << phone << std::endl;
@@ -108,7 +108,7 @@ int main(int argc, char *argv[]) {
         std::vector<int32> phones;
         for (size_t i = 0; i < split.size(); i++) {
           KALDI_ASSERT(!split[i].empty());
-          int32 phone = trans_model.TransitionIdToPhone(split[i][0]);
+          int32 phone = trans_model.InfoForTransitionId(split[i][0]).phone;
           int32 num_repeats = split[i].size();
           //KALDI_ASSERT(num_repeats!=0);
           if (per_frame)
@@ -122,7 +122,7 @@ int main(int argc, char *argv[]) {
         std::vector<std::pair<int32, int32> > pairs;
         for (size_t i = 0; i < split.size(); i++) {
           KALDI_ASSERT(split[i].size() > 0);
-          int32 phone = trans_model.TransitionIdToPhone(split[i][0]);
+          int32 phone = trans_model.InfoForTransitionId(split[i][0]).phone;
           int32 num_repeats = split[i].size();
           //KALDI_ASSERT(num_repeats!=0);
           pairs.push_back(std::make_pair(phone, num_repeats));
diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc
index af28ef85a33..c702b4b1114 100644
--- a/src/chain/chain-supervision.cc
+++ b/src/chain/chain-supervision.cc
@@ -229,7 +229,7 @@ bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts,
 
 bool TimeEnforcerFst::GetArc(StateId s, Label ilabel, fst::StdArc* oarc) {
   // the following call will do the range-check on 'ilabel'.
-  int32 phone = trans_model_.TransitionIdToPhone(ilabel);
+  int32 phone = trans_model_.InfoForTransitionId(ilabel).phone;
   KALDI_ASSERT(static_cast<size_t>(s) <= allowed_phones_.size());
   if (static_cast<size_t>(s) == allowed_phones_.size()) {
     // No arcs come from the final state.a
diff --git a/src/hmm/hmm-utils-test.cc b/src/hmm/hmm-utils-test.cc
index cf282ac03c5..fddbd82a9a0 100644
--- a/src/hmm/hmm-utils-test.cc
+++ b/src/hmm/hmm-utils-test.cc
@@ -221,7 +221,7 @@ void TestSplitToPhones() {
     KALDI_ASSERT(!split_alignment[i].empty());
     for (size_t j = 0; j < split_alignment[i].size(); j++) {
       int32 transition_id = split_alignment[i][j];
-      KALDI_ASSERT(trans_model->TransitionIdToPhone(transition_id) ==
+      KALDI_ASSERT(trans_model->InfoForTransitionId(transition_id).phone ==
                    phone_seq[i]);
     }
   }
@@ -305,8 +305,8 @@ void TestConvertAlignment() {
     KALDI_ASSERT(b1 && b2);
     KALDI_ASSERT(old_split.size() == new_split.size());
     for (size_t i = 0; i < new_split.size(); i++)
-      KALDI_ASSERT(trans_model_old.TransitionIdToPhone(old_split[i].front()) ==
-                   trans_model_new.TransitionIdToPhone(new_split[i].front()));
+      KALDI_ASSERT(trans_model_old.InfoForTransitionId(old_split[i].front()).phone ==
+                   trans_model_new.InfoForTransitionId(new_split[i].front()).phone);
     if (!new_topology && subsample_factor == 1) {
       // we should be able to convert back and it'll be the same.
       std::vector<int32> old_alignment_copy;
diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc
index a70dc5275c2..6e1bd483580 100644
--- a/src/hmm/hmm-utils.cc
+++ b/src/hmm/hmm-utils.cc
@@ -751,8 +751,8 @@ static inline void ConvertAlignmentForPhone(
   int32 alignment_size = old_phone_alignment.size();
   static bool warned_topology = false;
   int32 P = new_ctx_dep.CentralPosition(),
-      old_central_phone = old_trans_model.TransitionIdToPhone(
-          old_phone_alignment[0]),
+      old_central_phone = old_trans_model.InfoForTransitionId(
+          old_phone_alignment[0]).phone,
       new_central_phone = new_phone_window[P];
   const Topology &old_topo = old_trans_model.GetTopo(),
       &new_topo = new_trans_model.GetTopo();
@@ -944,7 +944,7 @@ static bool ConvertAlignmentInternal(const Transitions &old_trans_model,
   std::vector<int32> mapped_phones(phone_sequence_length);
   for (size_t i = 0; i < phone_sequence_length; i++) {
     KALDI_ASSERT(!old_split[i].empty());
-    mapped_phones[i] = old_trans_model.TransitionIdToPhone(old_split[i][0]);
+    mapped_phones[i] = old_trans_model.InfoForTransitionId(old_split[i][0]).phone;
     if (phone_map != NULL) {  // Map the phone sequence.
       int32 sz = phone_map->size();
       if (mapped_phones[i] < 0 || mapped_phones[i] >= sz ||
diff --git a/src/hmm/posterior.cc b/src/hmm/posterior.cc
index 3089be237b2..4742c0f7824 100644
--- a/src/hmm/posterior.cc
+++ b/src/hmm/posterior.cc
@@ -303,8 +303,8 @@ struct ComparePosteriorByPdfs {
   ComparePosteriorByPdfs(const Transitions &tmodel): tmodel_(&tmodel) {}
   bool operator() (const std::pair<int32, BaseFloat> &a,
                    const std::pair<int32, BaseFloat> &b) {
-    if (tmodel_->TransitionIdToPdf(a.first)
-        < tmodel_->TransitionIdToPdf(b.first))
+    if (tmodel_->TransitionIdToPdfFast(a.first)
+        < tmodel_->TransitionIdToPdfFast(b.first))
       return true;
     else
       return false;
@@ -328,7 +328,7 @@ void ConvertPosteriorToPdfs(const Transitions &tmodel,
     unordered_map<int32, BaseFloat> pdf_to_post;
     for (size_t j = 0; j < post_in[i].size(); j++) {
       int32 tid = post_in[i][j].first,
-          pdf_id = tmodel.TransitionIdToPdf(tid);
+          pdf_id = tmodel.TransitionIdToPdfFast(tid);
       BaseFloat post = post_in[i][j].second;
       if (pdf_to_post.count(pdf_id) == 0)
         pdf_to_post[pdf_id] = post;
@@ -354,7 +354,7 @@ void ConvertPosteriorToPhones(const Transitions &tmodel,
     std::map<int32, BaseFloat> phone_to_post;
     for (size_t j = 0; j < post_in[i].size(); j++) {
       int32 tid = post_in[i][j].first,
-          phone_id = tmodel.TransitionIdToPhone(tid);
+          phone_id = tmodel.InfoForTransitionId(tid).phone;
       BaseFloat post = post_in[i][j].second;
       if (phone_to_post.count(phone_id) == 0)
         phone_to_post[phone_id] = post;
@@ -381,7 +381,7 @@ void WeightSilencePost(const Transitions &trans_model,
     this_post.reserve((*post)[i].size());
     for (size_t j = 0; j < (*post)[i].size(); j++) {
       int32 tid = (*post)[i][j].first,
-          phone = trans_model.TransitionIdToPhone(tid);
+          phone = trans_model.InfoForTransitionId(tid).phone;
       BaseFloat weight = (*post)[i][j].second;
       if (silence_set.count(phone) != 0) {  // is a silence.
         if (silence_scale != 0.0)
@@ -405,7 +405,7 @@ void WeightSilencePostDistributed(const Transitions &trans_model,
     BaseFloat sil_weight = 0.0, nonsil_weight = 0.0;
     for (size_t j = 0; j < (*post)[i].size(); j++) {
       int32 tid = (*post)[i][j].first,
-          phone = trans_model.TransitionIdToPhone(tid);
+          phone = trans_model.InfoForTransitionId(tid).phone;
       BaseFloat weight = (*post)[i][j].second;
       if (silence_set.count(phone) != 0) sil_weight += weight;
       else nonsil_weight += weight;
@@ -546,7 +546,7 @@ void PosteriorToPdfMatrix(const Posterior &post,
   // Fill from Posterior,
   for (int32 t = 0; t < post.size(); t++) {
     for (int32 i = 0; i < post[t].size(); i++) {
-      int32 col = model.TransitionIdToPdf(post[t][i].first);
+      int32 col = model.TransitionIdToPdfFast(post[t][i].first);
       if (col >= num_cols) {
         KALDI_ERR << "Out-of-bound Posterior element with index " << col
                   << ", higher than number of columns " << num_cols;
diff --git a/src/hmm/tree-accu.cc b/src/hmm/tree-accu.cc
index 5131d6a4cf1..2a3d99fc5c5 100644
--- a/src/hmm/tree-accu.cc
+++ b/src/hmm/tree-accu.cc
@@ -54,8 +54,8 @@ void AccumulateTreeStats(const Transitions &trans_model,
         i + info.central_position < static_cast<int32>(split_alignment.size())) {
       int32 central_phone =
           MapPhone(info.phone_map,
-                   trans_model.TransitionIdToPhone(
-                       split_alignment[i+info.central_position][0]));
+                   trans_model.InfoForTransitionId(
+                       split_alignment[i+info.central_position][0]).phone);
       bool is_ctx_dep = !std::binary_search(info.ci_phones.begin(),
                                             info.ci_phones.end(),
                                             central_phone);
@@ -65,7 +65,7 @@ void AccumulateTreeStats(const Transitions &trans_model,
         if (i + j >= 0 && i + j < static_cast<int32>(split_alignment.size()))
           phone =
               MapPhone(info.phone_map,
-                       trans_model.TransitionIdToPhone(split_alignment[i+j][0]));
+                       trans_model.InfoForTransitionId(split_alignment[i+j][0]).phone);
         else
           phone = 0;  // ContextDependency class uses 0 to mean "out of window";
         // we also set the phone arbitrarily to 0
diff --git a/src/latbin/lattice-arc-post.cc b/src/latbin/lattice-arc-post.cc
index 63d25383aa5..57a761b4157 100644
--- a/src/latbin/lattice-arc-post.cc
+++ b/src/latbin/lattice-arc-post.cc
@@ -85,7 +85,7 @@ class ArcPosteriorComputer {
             if (trans_model_->IsFinal(ali[frame])) {
               if (first_phone) first_phone = false;
               else os << ' ';
-              os << trans_model_->TransitionIdToPhone(ali[frame]);
+              os << trans_model_->InfoForTransitionId(ali[frame]).phone;
             }
           }
         }
diff --git a/src/online/online-faster-decoder.cc b/src/online/online-faster-decoder.cc
index c403a3eeff5..222c529ce0b 100644
--- a/src/online/online-faster-decoder.cc
+++ b/src/online/online-faster-decoder.cc
@@ -219,7 +219,7 @@ bool OnlineFasterDecoder::EndOfUtterance() {
   SplitToPhones(trans_model_, isymbols, &split);
   for (size_t i = 0; i < split.size(); i++) {
     int32 tid = split[i][0];
-    int32 phone = trans_model_.TransitionIdToPhone(tid);
+    int32 phone = trans_model_.InfoForTransitionId(tid).phone;
     if (silence_set_.count(phone) == 0)
       return false;
   }
diff --git a/src/online2/online-endpoint.cc b/src/online2/online-endpoint.cc
index da057f8e62d..39ef6228f7f 100644
--- a/src/online2/online-endpoint.cc
+++ b/src/online2/online-endpoint.cc
@@ -95,7 +95,7 @@ int32 TrailingSilenceLength(const Transitions &tmodel,
     LatticeArc arc;
     iter = decoder.TraceBackBestPath(iter, &arc);
     if (arc.ilabel != 0) {
-      int32 phone = tmodel.TransitionIdToPhone(arc.ilabel);
+      int32 phone = tmodel.InfoForTransitionId(arc.ilabel).phone;
       if (silence_set.count(phone) != 0) {
         num_silence_frames++;
       } else {
diff --git a/src/online2/online-ivector-feature.cc b/src/online2/online-ivector-feature.cc
index 6898c38ad12..f210d304273 100644
--- a/src/online2/online-ivector-feature.cc
+++ b/src/online2/online-ivector-feature.cc
@@ -624,7 +624,7 @@ void OnlineSilenceWeighting::GetDeltaWeights(
         // frame we have a traceback for (probably a reasonable guess).
         frame_weight[offset] = frame_weight[offset - 1];
       } else {
-        int32 phone = trans_model_.TransitionIdToPhone(transition_id);
+        int32 phone = trans_model_.InfoForTransitionId(transition_id).phone;
         bool is_silence = (silence_phones_.count(phone) != 0);
         if (is_silence)
           frame_weight[offset] = silence_weight;

From 8fa9d18407095cbab32374dcbb165421557bb2bb Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 17 Apr 2019 23:52:05 -0700
Subject: [PATCH 021/163] Kaldi10: more tensor drafting. (#3246)

* [src] Tensor changes, still very rough draft and in flux

* [src] More tensor progress

* [src] Various progress

* [src] More drafts of tensor stuff
---
 src/tensor/change-tracker.h              | 139 ++++++
 src/tensor/op.h                          | 195 ++++++++
 src/tensor/storage.h                     | 123 ++++-
 src/tensor/tensor-common.h               |   8 -
 src/tensor/tensor-functions.h            |  98 +++-
 src/tensor/tensor-impl-linear.h          |  25 +-
 src/tensor/tensor-impl-utils.cc          |  52 ++
 src/tensor/tensor-impl-utils.h           | 102 +++-
 src/tensor/tensor-impl.h                 |  43 +-
 src/tensor/tensor-pattern-extra-utils.cc | 501 +++++++++++++++++++
 src/tensor/tensor-pattern-extra-utils.h  | 272 ++++++++++
 src/tensor/tensor-pattern-utils.cc       | 121 ++++-
 src/tensor/tensor-pattern-utils.h        | 603 ++++++++++++++++-------
 src/tensor/tensor-pattern.h              | 324 ++++++++++--
 src/tensor/tensor.h                      |  44 +-
 src/tensor/variable.cc                   |  34 ++
 src/tensor/variable.h                    | 259 ++++++++--
 17 files changed, 2600 insertions(+), 343 deletions(-)
 create mode 100644 src/tensor/change-tracker.h
 create mode 100644 src/tensor/op.h
 create mode 100644 src/tensor/tensor-impl-utils.cc
 create mode 100644 src/tensor/tensor-pattern-extra-utils.cc
 create mode 100644 src/tensor/tensor-pattern-extra-utils.h
 create mode 100644 src/tensor/variable.cc

diff --git a/src/tensor/change-tracker.h b/src/tensor/change-tracker.h
new file mode 100644
index 00000000000..1c0a1af1e63
--- /dev/null
+++ b/src/tensor/change-tracker.h
@@ -0,0 +1,139 @@
+// tensor/change-tracker.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_CHANGE_TRACKER_H_
+#define KALDI_TENSOR_CHANGE_TRACKER_H_ 1
+
+#include <functional>
+#include "tensor/tensor-common.h"
+#include "tensor/tensor-pattern.h"
+
+
+namespace kaldi {
+namespace tensor {
+
+
+
+/**
+   class ChangeTracker is something we only use in 'debug mode'.  Its purpose is
+   to keep track of when data was last changed, so that we can make sure
+   people don't mutate data via in-place operations in a way that will
+   invalidate the backprop.
+   This is a replacement for the 'version
+   numbering' of Variables used in PyTorch, i.e. it's a different way of solving
+   the same problem.  The mechanism is (I think) more exact than version
+   numbering, and less hassle for the calling code; but since it's slower, we
+   will only activate it occasionally.  c.f. SetDebugMode(), GetDebugMode().
+
+   When a computation requiring derivatives creates a graph that will (when
+   Backprop()'d) require a certain Tensor's data to remain unchanged until
+   the backprop is done, we put a lock on the relevant memory region.
+   This is done by LockPattern().  Conceptually the locking is done at the
+   byte level, but without explicitly creating a byte-level map; it's
+   done by detecting overlap of Patterns and will be reasonably efficient
+   unless the user is creating a large number of different views of the same
+   memory region.
+
+   The same piece of memory may be locked multiple times.  This is not a write
+   lock, it is a lock that prevents modification of that memory location.
+   Attempts to mutate that memory (assuming the code calls Mutate()) will cause a
+   crash.  The solution would be to remove the offending in-place operation from
+   your code.
+
+ */
+class ChangeTracker {
+ public:
+
+  // Increments the global counter and returns that value.
+  static uint64 GetTick();
+
+
+  /** Constructor.  A Storage object is created for each allocated block of
+      memory, and each Storage object has at most one ChangeTracker object.
+
+      @param [in] num_bytes  The number of bytes allocated in this block.
+                           Only needed for checking, to make sure that
+                           the patterns do not overstep this bound.
+   */
+  ChangeTracker(size_t num_bytes);
+
+
+  /**
+     Record a change to this storage region at the current time (obtained by
+     GetTick()).  Just appends it to the vector of changes after canonicalizing
+     the pattern.
+
+     @param [in] element_size  The size in bytes of the data type being stored
+                             here: for example, 4 for float.
+     @param [in] pattern    The pattern being changed.  It will be reduced
+                            to canonical form (c.f. CanonicalizePattern())
+                            before being stored.
+   */
+
+  void RecordChange(int32 element_size,
+                    const TensorPattern &pattern);
+
+
+  /**
+     Returns true if any element covered by this pattern has been
+     changed since the time given by 'tick'.
+
+      @param [in] tick  The time (obtained by GetTick()) since when
+                     we want to know about changes
+      @param [in] pattern  The pattern that we are checking
+   */
+  bool ChangedSince(int64 tick,
+                    const TensorPattern &pattern);
+
+ private:
+
+  // number of bytes in this storage region (or possibly just a very big number,
+  // if the size of the region was not known).
+  int64 num_bytes_;
+
+  // The size of elements in this storage region (e.g. 4 for float).  If the
+  // same region was accessed with multiple different element sizes, this
+  // will be their greatest common divisor, and all patterns
+  // will have their strides and offsets scaled appropriately.
+  // (We don't just store patterns in terms of bytes because we don't want
+  // to increase the risk of overflowing int32 storage).
+  int64 element_size_;
+
+  struct ChangeRecord {
+    TensorPattern pattern;  // The pattern (offset, dims, strides) that was
+                            // changed within this storage region.  This pattern
+                            // will have been reduced to canonical form.
+    int64 tick;             // The time, in ticks (c.f. tick_counter_) at which
+                            // this set of elements was changed.
+  };
+
+
+  // changes_ is a sequence of changes.
+  std::vector<ChangeRecord> changes_;
+
+  static int64 tick_counter_{0};
+
+};
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+#endif  // KALDI_TENSOR_CHANGE_TRACKER_H_
diff --git a/src/tensor/op.h b/src/tensor/op.h
new file mode 100644
index 00000000000..a3f2028c8d8
--- /dev/null
+++ b/src/tensor/op.h
@@ -0,0 +1,195 @@
+// tensor/op.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_TENSOR_OP_H_
+#define KALDI_TENSOR_TENSOR_OP_H_ 1
+
+#include "tensor/tensor.h"
+
+namespace kaldi {
+namespace tensor {
+
+class Variable;
+
+
+/**
+   class Op is a base-class for objects that are created when we compute
+   functions of Variables; they exist as long as we retain the computation
+   graph.  In fact, the Ops (together with the Variables) *are* the
+   computation graph.  An op may in general have multiple input Variables
+   and multiple output Variables.
+
+   Every base Variable (see variable.h for definition) that is tracked
+   has a singly linked list of Ops that changed that base Variable,
+   ordered from most recent to least recent.
+
+   When a user calls Backprop() on a Variable, the backprop code works out a
+   topological order of Ops and calls the Ops in (essentially) the reverse order
+   in which they were created.  The backprop code also frees gradients of
+   Variables when it knows they will no longer be needed.
+ */
+class Op {
+ public:
+
+  Op(): n_(GetCount()) { }
+
+  /// InputIteratorBegin() and InputIteratorEnd() form the begin and
+  /// end points of a list of Variables that were inputs of this Op
+  /// but were not outputs.  This is used by the backprop code when finding
+  /// the topological order of ops.  (Note: output variables themselves
+  /// refer to Ops, so if we included them in the input list we'd
+  /// get a cycle in the graph).  These Variables are expected to
+  /// still have their graph information (i.e. sub-classes of class Op
+  /// must not call RemoveGraph() on the members of this list).
+  virtual Variable *InputIteratorBegin() = 0;
+  virtual Variable *InputIteratorEnd() = 0;
+
+
+
+  Op *GetTail() final;  // returns the tail (in a singly linked list of Ops for
+                        // this variable); this list will have >1 element
+                        // only if in-place operations were done.  (If, later,
+                        // we need the shared_ptr to be returned from here, we
+                        // can change this code to return that; we just return
+                        // the raw pointer for efficiency.)
+
+
+  // Connect this Op to Variable 'v', which is expected to be an output of this
+  // Op.  What this does is to ensure that 'v' has a TensorGrad object,
+  // and add this Op as the head of the 'ops_' list of v.
+  void ConnectToOutput(Variable *v) final;
+
+  // Checks that tail_ is currently nullptr, and sets tail_ to 'op'.
+  void SetTail(std::shared_ptr<Op> op) final;
+
+  // This number is used to determine the order of Ops in a graph; each time we
+  // generate an Op we increment a global counter.  Doing it this way, rather
+  // than via topological sorting, avoids the need to think about having to
+  // track implicit dependencies such as when different Ops operate on Variables
+  // that are overlapping views into an underlying Variable.
+  int64 GetNumber() const final { return n_; }
+
+  virtual void Backward();
+
+ protected:
+  // Constructor, to be used from child classes.  This base-class takes care
+  // of storing the list of input Variables for purposes of tracing dependencies;
+  //
+  //  @param [in] input_vars  The list of input Variables (meaning: Variables
+  //                   that are inputs to, but not outputs of, i.e. not modified
+  //                   by, this Op).
+  //  @param [in] output_var  The output Variable of this Op, i.e. the Variable
+  //                   which is modified or set by it.  We will provide another
+  //                   constructor taking ArrayRef<Variable> in this position,
+  //                   as and when we need to support Ops that take multiple
+  //                   output Variables.
+  void Op(const ArrayRef<Variable> &input_vars,
+          const Variable &output_var);
+
+
+  // TODO: maybe have a constructor of Op that takes an ArrayRef of the inputs
+  // that are not also outputs?  Could use that for graph traversal.
+
+ private:
+  static int64 counter_{0};
+
+  // num_inputs_ is the number of base Variables that are the base Variables of
+  // inputs of this Op (but not of outputs).  These are stored in the
+  // array 'inputs_'.
+
+  // inputs_ is a pointer to an array of shared_ptr<Node> of size num_inputs_, which
+  // will be allocated by new [] in the constructor and deleted by delete []
+  // in the destructor.
+  // This is a list of the unique Nodes that are the Nodes of inputs (but not outputs)
+  // of this Op.
+  std::shared_ptr<Node> *inputs_;
+
+  int32 num_inputs_;
+  void *inputs_;
+
+  int64 n_;  // initialized from the counter when this object is created.
+  std::shared_ptr<Op> tail_;
+ protected:
+  // Return true if this is not the last Op in the list of Ops attached to this
+  // base Variable (can be useful to know whether we need bother to scale the
+  // derivative in a scaling operation, for instance).
+  bool HasTail() const { return tail_ != nullptr; }
+};
+
+
+class AddToOp: public Op {
+ public:
+
+  // This Op corresponds to the computation:
+  //   \f$  b  :=  alpha a  +   beta b.  \f$
+  // with broadcasting or summation depending on the dimensions
+  // involved.  Obviously alpha and beta are constants,
+  // and differentiation w.r.t. them is not supported.
+  //
+  // The Op is only constructed if b_.HasGrad() (which it
+  // would normally if a_.HasGrad()).
+  AddToOp(float alpha, float beta,
+          const Variable &a, const Variable &b):
+      Op({a}),
+      alpha_(alpha),
+      beta_(beta),
+      a_data_(a.GetData()),
+      a_grad_(a.GetGradIfPresent()),
+      b_data_(b.GetData()),
+      b_grad_(b.GetGrad()) {
+
+    Add(alpha, beta, *a_data_, b_data_.get());
+  }
+
+
+  void Backward() {
+    // Do: a_grad += alpha * b_grad.
+    if (a_grad_ != nullptr)
+      AddTo(alpha_, 1.0, b_grad, &a_grad);
+
+    if (beta_ != 1.0)
+      Scale(beta_, b_grad.get());
+  }
+
+ private:
+
+  float alpha_;
+  float beta_;
+
+  // We hold onto all inputs that are not also outputs
+  // (here just a_) for dependency tracking.
+  Variable a_;
+
+  std::shared_ptr<Tensor> a_data_;
+  std::shared_ptr<Tensor> a_grad_;
+  std::shared_ptr<Tensor> b_data_;
+  std::shared_ptr<Tensor> b_grad_;
+
+  Variable b_;
+  bool must_scale_b_grad_;
+
+};
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_TENSOR_OP_H_
diff --git a/src/tensor/storage.h b/src/tensor/storage.h
index 936abded13d..c4f88adbbbb 100644
--- a/src/tensor/storage.h
+++ b/src/tensor/storage.h
@@ -27,40 +27,123 @@
 namespace kaldi {
 namespace tensor {
 
+struct StorageExtras;
 
 // 'Storage' contains a single allocated region (on CPU or GPU, according
 // to 'device').
-struct Storage {
-  using DeallocatorFunc = std::function<void()>;
-
-  void *data;
-  size_t num_bytes;
-  Device device;
-  // 'deallocator' to be used with external toolkits, for example, to decrease
-  // the refcount
-  DeallocatorFunc deallocator;
-
-  // 'device' and 'deallocator' have default constructors.
-  Storage(): data(NULL), num_bytes(0) { }
-
-  // This constructor tries to allocate the requested data on the specified
-  // device.  It will throw if allocation fails (for now).
+class Storage {
+ public:
+  // This returns a reference to the object held in extras->tracker if it is
+  // non-NULL; otherwise it allocates one and returns that.
+  ChangeTracker &GetChangeTracker();
+
+  inline bool Allocated() {  return (data != NULL);  }
+
+  // TODO: we may need a mechanism to automatically zero data when it is
+  // allocated, we have to figure out the right level to do this at.
+  inline void *Data() {
+    if (data) {
+      return data;
+    } else {
+      Allocate();
+      return data;
+    }
+  }
+
+
+  /**
+     Creates a Storage object for device 'device' with size 'num_bytes'.
+     The actual data will not be allocated until someone calls this->Data().
+
+       @param [in] device  The device on which the data is to be allocated
+       @param [in] num_bytes  The number of bytes to be allocated; must be >0.
+  */
   Storage(Device device, size_t num_bytes);
 
-
-  Storage(Device device, DeallocatorFunc deallocator):
+  /**
+     This constructor is intended for use with data allocated by code outside
+     this codebase (for instance in external toolkits).
+
+          @param [in] device  The device on which this data exists
+          @param [in] data    Pointer to the data to be held
+          @param [in] num_bytes  The number of bytes held in this region
+                              (does not have to be exact, but should be
+                              at least the number of bytes in the part of
+                              this memory block that is going to be accessed
+                              through this Storage object).
+          @param [in] deallocator A std::function, which, if not nullptr,
+                              will be invoked to deallocate the data.
+   */
+
+  Storage(Device device,
+          void *data,
+          size_t num_bytes,
+          DeallocatorFunc deallocator):
       data(NULL), num_bytes(0),
       device(device),
       deallocator(deallocator) { }
 
-  // If 'deallocator' is non-NULL (only true for external-to-Kaldi tensors such
-  // as NumPy), the destructor calls it; otherwise it deallocates 'data' (the
-  // method of deallocation depends on the device pointer 'device'.
+  // Returns true if the data has already been allocated.  I am hoping that it
+  // will never be necessary to call this.
+  bool IsAllocated();
+
+  // Deallocates the data.  This is user-callable because our autograd mechanism
+  // deletes the underlying data of gradients that are no longer needed, while
+  // keeping around the metadata in cases where it is instructed to retain the
+  // autograd graph.  Conceptually we think of this as simply zeroing the
+  // relevant gradients, since any data that is deallocated is implicitly
+  // treated as zero.
+  // Calling this is an error if a deallocator function was provided
+  // to the constructor of this object.
+  void Deallocate();
+
+  // Destructor that frees any data held.
   ~Storage();
+
+ private:
+
+  // Allocate the data.  It is an error to call this if data_ != NULL.
+  void Allocate();
+
+  // 'data_' is either 'nullptr' or the actual data pointer.  Due to lazy allocation,
+  // the 'data_' pointer will remain NULL until it is actually needed.  Lazy
+  // allocation makes it much easier to set up the autograd graph without
+  // allocating the memory for the gradients.
+  void *data_;
+
+  // num_bytes is the number of bytes in the region we have allocated
+  // (or are going to allocate).
+  size_t num_bytes;
+
+  // the device the data is located on (or is to be located on).
+  Device device;
+
+  // contains some extra, less-often-used fields
+  std::unique_ptr<StorageExtras> extras;
+
 };
 
 
 
+// struct StorageExtras contains what (conceptually) are some rarely-needed
+// extra fields of class Storage; we store them separately, holding a
+// possibly-NULL pointer to struct StorageExtras, to reduce the size of struct
+// Storage in the normal case.
+struct StorageExtras {
+  using DeallocatorFunc = std::function<void()>;
+
+  // 'tracker' is used in debug mode to detect when data that might be
+  // required in the backprop phase is invalidated.
+  std::unique_ptr<ChangeTracker> tracker;
+
+  // 'deallocator' is to be used with external toolkits, for example, to
+  // decrease the refcount.  In normal cases it will be nullptr.
+  // If non-NULL, it will be invoked when we want to deallocate the
+  // storage object.
+  DeallocatorFunc deallocator;
+};
+
+
 
 
 }  // namespace tensor
diff --git a/src/tensor/tensor-common.h b/src/tensor/tensor-common.h
index 268fd1721f9..142df0e4e1a 100644
--- a/src/tensor/tensor-common.h
+++ b/src/tensor/tensor-common.h
@@ -147,14 +147,6 @@ enum UnaryFunctionEnum {
 };
 
 
-/// This enumeration value lists the unary function taking a single scalar arg
-/// that we might want to apply to Tensors; it exists so that much of the glue
-/// code can be templated.
-enum UnaryFunction1ScalarArgEnum {
-  kUnaryFunctionFloor,
-  kUnaryFunctionCeiling
-};
-
 
 /// This enumeration value lists the binary functions that we might
 /// want to apply to Tensors; it exists so that much of the glue
diff --git a/src/tensor/tensor-functions.h b/src/tensor/tensor-functions.h
index 0584651847f..47da508d857 100644
--- a/src/tensor/tensor-functions.h
+++ b/src/tensor/tensor-functions.h
@@ -45,8 +45,20 @@ void SetZero(const Tensor *tensor);
 void Set(float f, const Tensor *tensor);
 
 
-// Return a transposed version of this Tensor that shares the underlying memory.
-Tensor Transpose(const Tensor &tensor, int64_t axis1 = 0, int64_t axis2 = 1);
+/** Transpose the two specified axes of a Tensor
+
+    @param [in] axis1  First axis to be transposed; must be in range
+                       `[-t->NumAxes(), t->NumAxes() - 1]`,
+                       with negative axis being interpreted as an offset
+                       from t->NumAxes().
+    @param [in] axis2  Second axis to be transposed; must be in range
+                       `[-t->NumAxes(), t->NumAxes() - 1]`.
+                       If identical to axis1, nothing will be done.
+    @param [in,out] t     Tensor whose axes are to be transposed.
+ */
+inline void Transpose(int32 axis1, int32 axis2, Tensor *t) {
+  Transpose(axis1, axis2, &(t->impl_));
+}
 
 /**
    Copy the data from tensor 'src' to tensor 'dest', allowing broadcasting
@@ -98,39 +110,97 @@ inline void Exp(const Tensor &src, const Tensor *dest) {
 
 /**
    Template used to implement binary functions such as division,
-   taking to a power, max.
+   taking to a power, max, min.
 
    Implements c = F(a, b), where F is some function of two scalars
    that returns a scalar.
 
-     @param [in] a  First source Tensor
-     @param [in] b  Second source Tensor
-     @param [out] c   Destination Tensor.  We require SameDim(a, b, c).
-                     'c' does not have to be initialized on entry and
-                     is allowed to be the same Tensor as one of a or b.
- */
+     @param [in]  a  First source Tensor
+     @param [in]  b  Second source Tensor
+     @param [out] c  Destination Tensor.
+                   We require Broadcastable(a, b, c, true).
+*/
 template <BinaryFunctionEnum F>
 void BinaryFunctionTpl(const Tensor &a, Tensor &b, const Tensor *c);
 
 
 
-
 /*
    Implements c = a / b, applied elementwise.
 
      @param [in] a  First source Tensor
      @param [in] b  Second source Tensor
-     @param [out] c   Destination Tensor.  We require SameDim(a, b, c).
-                    'c' does not have to be initialized on entry and
-                    is allowed to be the same Tensor as one of a or b.
+     @param [out] c   Destination Tensor.  We require Broadcastable(a, b, c, true).
+                    'c' does not have to be initialized on entry and is allowed
+                    to be the same Tensor as one of a or b.
  */
 inline void Div(const Tensor &a, Tensor &b, const Tensor *c) {
   BinaryFunctionTpl<kBinaryFunctionDivide>(a, b, c);
 }
 
-// TODO: more binary functions.
 
 
+/**
+   This is like PyTorch's slice() / narrow() functions.
+   It selects a range of dimensions on one of the axes.  It is similar to
+   indexing with a range in Python, like A[10:20].
+
+      @param [in] axis   Axis on which to possibly reduce the dimensionality;
+                         require -t->NumAxes() <= axis < t->NumAxes(), with
+                         negative axis interpreted as an offset from t->NumAxes().
+      @param [in] start  Starting index; must be in range [0, t->Dim(axis) - 1]
+      @param [in] end    Ending index; must be in the range [start + 1, t->Dim(axis)]
+      @param [in,out] t  Tensor whose metadata is to be modified.  Its NumAxes()
+                         is not changed by this function (unlike Select()).
+
+   See also: the other overloaded version of Slice() which accepts the 'step'
+   parameter; and Select(), which also reduces the num-axes.
+ */
+inline void Slice(int32 axis, int32 start, int32 end, Tensor *t) {
+  Slice(axis, start, end, &(t->impl_));
+}
+
+
+/**
+   This is a version of Slice() which also takes a 'step' argument to support
+   things like taking every other element.  See the documentation for the other
+   Slice() for more context.   This is related to indexing with a range
+   in Python: for example, A[0:6:2], selecting elements [0, 2, 4] of A.
+
+      @param [in] axis   Axis on which to possibly reduce the dimensionality;
+                         require -t->NumAxes() <= axis < t->NumAxes(), with
+                         negative axis interpreted as an offset from t->NumAxes().
+      @param [in] start  Starting index; must be in range [0, t->Dim(axis) - 1]
+      @param [in] end    Ending index.  If `step > 0` must be in the range
+                         [start + 1, t->Dim(axis)]; if step  < 0, must be
+                         in the range [start - 1, -1].
+      @param [in] step   Nonzero number that indicates the subsampling of elements
+                         (and possible axis flipping).
+      @param [in,out] t  Tensor whose metadata is to be modified.  Its NumAxes()
+                         is not changed by this function (unlike Select()).
+
+   See the other version of Slice(), and Select().
+ */
+inline void Slice(int32 axis, int32 start, int32 end, int32 step, Tensor *t) {
+  Slice(axis, start, end, step, &(t->impl_));  // forward to TensorImpl version
+}
+
+
+/**
+   Select one element from an axis of Tensor 't', reducing t->NumAxes() by
+   one.
+
+       @param [in] axis Axis from which to select an element; require
+                         -t->NumAxes() <= axis < t->NumAxes(), with negative
+                         axis interpreted as an offset from t->NumAxes().
+       @param [in] index  Index in t to select; must be in range
+                         [0, t->Dim(axis) - 1].
+       @param [in,out]  t   Tensor whose metadata is to be modified.
+ */
+inline void Select(int32 axis, int32 index, Tensor *t) {
+  Select(axis, index, &(t->impl_));
+}
+
 
 
 
diff --git a/src/tensor/tensor-impl-linear.h b/src/tensor/tensor-impl-linear.h
index 322225a7e00..7ad98f13aea 100644
--- a/src/tensor/tensor-impl-linear.h
+++ b/src/tensor/tensor-impl-linear.h
@@ -79,14 +79,33 @@ void Copy(const TensorImpl &a, const TensorImpl *b);
 /**
    Add elements from Tensor a to Tensor b, broadcasting or summing
    as dictated by the dimensions involved; does
+
       \f$  b := \alpha a + \beta b.  \f$
 
       @param [in]  a    The source Tensor.
       @param [out] b   The destination Tensor.  We require
-                       Broadcastable(a, b).
+                       Broadcastable(a, b).  It's OK for b's data to
+                       be uninitialized at entry if beta == 0.
+ */
+void AddTo(float alpha, float beta,
+           const TensorImpl &a,
+           const TensorImpl *b);
+
+
+/**
+   Version of Add that computes a weighted sum of two Tensors and writes
+   the result to a third location, i.e. does:
+
+        *c = alpha a  +  beta b
+
+   Requires Broadcastable(a, b, c).
  */
-void Add(float alpha, float beta,
-         const TensorImpl &a, const TensorImpl *b);
+void Add(float alpha,
+         const TensorImpl &a,
+         float beta,
+         const TensorImpl &b,
+         const TensorImpl *c);
+
 
 
 /**
diff --git a/src/tensor/tensor-impl-utils.cc b/src/tensor/tensor-impl-utils.cc
new file mode 100644
index 00000000000..78f62479100
--- /dev/null
+++ b/src/tensor/tensor-impl-utils.cc
@@ -0,0 +1,52 @@
+// tensor/tensor-impl-utils.cc
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tensor/tensor-impl-utils.h"
+
+
+namespace kaldi {
+namespace tensor {
+
+
+void Slice(int32 axis, int32 start, int32 end, TensorImpl *t) {
+  int32 num_axes = t->pattern.num_axes;
+  // Map the public axis (possibly negative) to the private 'raxis' index.
+  int32 raxis = (axis >= 0 ? num_axes - 1 - axis : - 1 - axis);
+  if (static_cast<uint32>(raxis) >= static_cast<uint32>(num_axes))
+    KALDI_ERR << "Axis out of range: " << axis << ", num-axes = "
+              << num_axes;
+  int32 dim = t->pattern.dims[raxis], stride = t->pattern.strides[raxis];
+  if (end <= start || start < 0 || end > dim)
+    KALDI_ERR << "Slice() parameters out of range: start,end = "
+              << start << "," << end << ", dim = " << dim;
+  // Advance the data pointer to the first selected element on this axis.
+  AddToPointer(stride * static_cast<int64>(start), t);
+
+  int32 new_dim = end - start;
+  t->pattern.dims[raxis] = new_dim;
+  if (new_dim == 1)
+    t->pattern.strides[raxis] = 0;  // axes with dim 1 are given stride 0.
+}
+
+
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/tensor-impl-utils.h b/src/tensor/tensor-impl-utils.h
index dbae890a0df..7d5592b1e46 100644
--- a/src/tensor/tensor-impl-utils.h
+++ b/src/tensor/tensor-impl-utils.h
@@ -27,8 +27,7 @@
 /**
    This header contains basic linear-algebra and copying types of operations
    on TensorImpl objects.  See also tensor-impl-nonlinearly
- */
-
+*/
 namespace kaldi {
 namespace tensor {
 
@@ -44,6 +43,26 @@ inline bool Compatible(const TensorImpl &a, const TensorImpl &b,
                        const TensorImpl &c);
 
 
+
+
+// This function moves the 'data' pointer stored in 't' by adding a number of
+// elements equal to 'offset'.  It casts the pointer to the element type
+// specified in t->dtype so the memory address changes by the right number
+// of bytes; an unrecognized dtype is reported via KALDI_ERR.
+inline void AddToPointer(int64 offset, TensorImpl *t) {
+  switch(t->dtype) {
+    case kFloatDtype:
+      t->data = static_cast<void*>(static_cast<float*>(t->data) + offset);
+      return;
+    case kDoubleDtype:
+      t->data = static_cast<void*>(static_cast<double*>(t->data) + offset);
+      return;
+    default:
+      KALDI_ERR << "Unknown data type";
+  }
+}
+
+
 /**
    This function allocates the appropriate storage for the Tensor described
    in 'impl', and sets is 'data' pointer to the allocated memory address.
@@ -112,16 +131,85 @@ inline void Unsqueeze(TensorImpl *t, int32 axis) {
    Showing just the dims in the tensor for an example:
 
 \verbatim
-    Squeeze({1,3,4}, 0)  -> {3,4}
-    Squeeze({3,1,4}, 1)  -> {3,4}
-    Squeeze({3,1,4}, 2)  -> [error]
+    Squeeze(0, {1,3,4})  -> {3,4}
+    Squeeze(1, {3,1,4})  -> {3,4}
+    Squeeze(2, {3,1,4})  -> [error]
 \endverbatim
  */
-inline void Squeeze(TensorImpl *t, int32 axis) {
-  Squeeze(&(t->pattern), axis));
-}
+void Squeeze(int32 axis, TensorImpl *t);
 
 
+/** Transpose the two specified axes of a TensorImpl
+
+    @param [in] axis1  First axis to be transposed; must be in range
+                       `[-t->NumAxes(), t->NumAxes() - 1]`,
+                       with negative axis being interpreted as an offset
+                       from t->NumAxes().
+    @param [in] axis2  Second axis to be transposed; must be in range
+                       `[-t->NumAxes(), t->NumAxes() - 1]`.
+                       If identical to axis1, nothing will be done.
+    @param [in,out] t    TensorImpl whose axes are to be transposed.
+ */
+inline void Transpose(int32 axis1, int32 axis2, TensorImpl *t) {
+  Transpose(axis1, axis2, &(t->pattern));  // forward to TensorPattern version
+}
+
+
+
+/**
+   This is like PyTorch's slice() / narrow() functions.
+   It selects a range of dimensions on one of the axes.  It is similar to
+   indexing with a range in Python, like A[10:20].
+
+      @param [in] axis   Axis on which to possibly reduce the dimensionality;
+                         require -t->NumAxes() <= axis < t->NumAxes(), with
+                         negative axis interpreted as an offset from t->NumAxes().
+      @param [in] start  Starting index; must be in range [0, t->Dim(axis) - 1]
+      @param [in] end    Ending index; must be in the range [start + 1, t->Dim(axis)]
+      @param [in,out] t  TensorImpl whose metadata is to be modified.  Its num_axes
+                         is not changed by this function (unlike Select()).
+
+   See also: the other overloaded version of Slice() which accepts the 'step'
+   parameter; and Select(), which also reduces the num-axes.
+ */
+void Slice(int32 axis, int32 start, int32 end, TensorImpl *t);
+
+
+/**
+   This is a version of Slice() which also takes a 'step' argument to support
+   things like taking every other element.  See the documentation for the other
+   Slice() for more context.   This is related to indexing with a range
+   in Python: for example, A[0:6:2], selecting elements [0, 2, 4] of A.
+
+      @param [in] axis   Axis on which to possibly reduce the dimensionality;
+                         require -t->NumAxes() <= axis < t->NumAxes(), with
+                         negative axis interpreted as an offset from t->NumAxes().
+      @param [in] start  Starting index; must be in range [0, t->Dim(axis) - 1]
+      @param [in] end    Ending index.  If `step > 0` must be in the range
+                         [start + 1, t->Dim(axis)]; if step  < 0, must be
+                         in the range [start - 1, -1].
+      @param [in] step   Nonzero number that indicates the subsampling of elements
+                         (and possible axis flipping).
+      @param [in,out] t  TensorImpl whose metadata is to be modified.  Its num_axes
+                         is not changed by this function (unlike Select()).
+
+   See the other version of Slice(), and Select().
+ */
+void Slice(int32 axis, int32 start, int32 end, int32 step, TensorImpl *t);
+
+
+/**
+   Select one element from an axis of TensorImpl 't', reducing t->NumAxes() by
+   one.
+
+       @param [in] axis Axis from which to select an element; require
+                         -t->NumAxes() <= axis < t->NumAxes(), with negative
+                         axis interpreted as an offset from t->NumAxes().
+       @param [in] index  Index in t to select; must be in range
+                          [0, t->Dim(axis) - 1].
+       @param [in,out]  t   TensorImpl whose metadata is to be modified.
+ */
+void Select(int32 axis, int32 index, TensorImpl *t);
 
 
 
diff --git a/src/tensor/tensor-impl.h b/src/tensor/tensor-impl.h
index ef3ddadceb9..7b0594f3175 100644
--- a/src/tensor/tensor-impl.h
+++ b/src/tensor/tensor-impl.h
@@ -39,7 +39,33 @@ struct TensorImpl {
   TensorPattern pattern;
   DataType dtype;
   Device device;
-  void *data{nullptr};
+  std::shared_ptr<Storage> data;  // 'data' points to a shared Storage object
+                                  // that contains (or eventually will contain,
+                                  // due to lazy allocation) the actual data
+                                  // pointer.
+
+  inline int32 NumAxes() { return pattern.num_axes; }
+
+  // Returns the dimension on the supplied axis (using the public axis numbering)
+  //  @param [in] axis  Axis on which dimension is required, with
+  //                    -NumAxes() <= axis < NumAxes(); negative axis
+  //                    is interpreted as an offset from NumAxes().
+  //  @return        Returns the dimension on this axis, a number >= 1.
+  inline int32 Dim(int32 axis);
+
+  // Returns the stride on the supplied axis (using the public axis numbering)
+  //  @param [in] axis  Axis on which stride is required, with
+  //                    -NumAxes() <= axis < NumAxes(); negative axis
+  //                    is interpreted as an offset from NumAxes().
+  //  @return          Returns the stride on this axis, which will be 0 if
+  //                   Dim(axis) == 1, and otherwise nonzero.
+  inline int32 Stride(int32 axis);
+
+
+  // Returns the data pointer corresponding to the element whose index
+  // is all zeros.
+  inline void* GetData() const;
+
 
   // Returns true if this TensorImpl is valid, false otherwise.  It is an
   // implied requirement of functions operating on TensorImpl's, that all
@@ -61,7 +87,20 @@ struct TensorMeta {
 };
 
 
-void Compatible(const TensorImpl &a, TensorImpl &b
+inline int32 TensorImpl::Dim(int32 axis) {
+  if (axis < 0) {
+    // it will usually be known whether axis < 0 at compile time, since it's
+    // inlined.
+    KALDI_ASSERT(axis >= -pattern.num_axes);
+    // num_axes - 1 - (axis + num_axes) = - 1 - axis
+    int32 raxis = -1 - axis;
+    return pattern.dims[raxis];
+  } else {
+    KALDI_ASSERT(axis < pattern.num_axes);
+    int32 raxis = pattern.num_axes - 1 - axis;
+    return pattern.dims[raxis];
+  }
+}
 
 
 
diff --git a/src/tensor/tensor-pattern-extra-utils.cc b/src/tensor/tensor-pattern-extra-utils.cc
new file mode 100644
index 00000000000..57e4449b14c
--- /dev/null
+++ b/src/tensor/tensor-pattern-extra-utils.cc
@@ -0,0 +1,501 @@
+// tensor/tensor-pattern-extra-utils.cc
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "tensor/tensor-pattern-extra-utils.h"
+
+namespace kaldi {
+namespace tensor {
+
+
+class IntersectionComputer {
+ public:
+  IntersectionComputer(const TensorPattern &pattern1,
+                       const TensorPattern &pattern2):
+      pattern1_(pattern1), pattern2_(pattern2), intersection_(nullptr) { }
+
+  /**
+     Computes the intersection between the pattern1 and pattern2 given to the
+     constructor (must be called only once); if it could be computed, the
+     intersection is represented as the union between all the (disjoint)
+     patterns in patterns_out.
+
+        @param [out] patterns_out  A list of patterns (in arbitrary order)
+                         is written to here.  The union of this list of patterns
+                         will, if this function returns true,
+                         represent the intersection between the pattern1 and
+                         pattern2 passed to the constructor.  These patterns
+                         will be valid except for the code, but won't be
+                         in canonical form (the user can do that themselves;
+                         we don't do it here because in most cases the caller
+                         will only care whether the union is empty or not).
+  */
+  bool ComputeIntersection(std::vector<TensorPattern> *patterns_out) {
+    CanonicalizePattern(&pattern1_);
+    CanonicalizePattern(&pattern2_);
+    std::vector<int32> common_strides;
+    if (!FindCommonStrides(&common_strides)) return false;
+    std::vector<TensorPattern> patterns1, patterns2;
+    ConvertToCommonStrides(pattern1_, common_strides, &patterns1);
+    ConvertToCommonStrides(pattern2_, common_strides, &patterns2);
+    patterns_out->clear();
+    // Intersect all pairs; the 4-arg overload appends to patterns_out.
+    for (const TensorPattern &p1 : patterns1)
+      for (const TensorPattern &p2 : patterns2)
+        ComputeIntersection(p1, p2, p1.num_axes, patterns_out);
+    return true;
+  }
+
+ private:
+
+  // Attempts to find a common list of strides which can be used for the
+  // combined patterns.  Returns false if this cannot be done.  This is done by
+  // taking the union of the strides in pattern1_ and pattern2_, sorting them,
+  // and then checking that each stride in the sequence divides the next (it
+  // returns true if this is the case, false otherwise).
+  // These strides must all be positive because pattern1_ and pattern2_ have
+  // both been canonicalized.
+  bool FindCommonStrides(std::vector<int32> *axes);
+
+
+  /**
+    This function converts a pattern 'pattern' in canonical form to a list of Patterns
+    whose union (viewed as memory-index-sets) is equivalent to 'pattern'
+    where the strides of the output patterns are equal to the provided 'common_strides'
+    vector.
+
+    This function requires that the actual strides in 'pattern' all be present in
+    the list 'common_strides'; that the elements of 'common_strides' be positive
+    and sorted from smallest to greatest; and that each element in
+    'common_strides' divide the next element exactly.
+
+
+       @param [in] pattern  Input pattern in canonical form, valid except for
+                         code.
+       @param [in] common_strides   A sorted list of integers >0, with the
+                         property that each element must divide the next
+                         element exactly, and also that each stride in
+                         'pattern' must be present in 'common_strides'.
+       @param [out] patterns   This will be set to a nonempty list of patterns
+                         whose union (viewed as a memory-index-set) equals
+                         'pattern', and whose strides are equal to
+                         'common_strides'.  The patterns in `*patterns` at
+                         output will be valid except for the code and for
+                         property (iv) (search Valid Pattern in
+                         tensor-pattern.h): that is, it will have nonzero
+                         strides for axes with dim != 1.
+  */
+  static void ConvertToCommonStrides(const TensorPattern &pattern,
+                                     const std::vector<int32> &common_strides,
+                                     std::vector<TensorPattern> *patterns);
+
+  /**
+     Computes the intersection between pattern1 and pattern2, which must have
+     identical axes and strides, and must be valid *except* for property (iv),
+     i.e.  it's not required that axes with dim=1 must have stride=0, and the
+     code does not have to be set.
+
+
+        @param [in] pattern1   The first input pattern.  Must be valid
+                               except for property (iv), and must have positive
+                               strides.
+        @param [in] pattern2   The second input pattern.  Must be valid
+                               except for property (iv), and must have
+                               the same strides (in the same order)
+                               as pattern1.
+        @param [out] patterns_out  The output patterns; this function will write
+                               to this location a vector of disjoint patterns
+                               whose union (viewed as a memory-index-set) is
+                               identical to the intersection of pattern1
+                               and pattern2.  The patterns in this vector will
+                               be valid except for property (iv) [i.e. they
+                               won't have zero strides for axes with dim=1], and
+                               they will not have their code set.
+  */
+  static void ComputeIntersection(const TensorPattern &pattern1,
+                                  const TensorPattern &pattern2,
+                                  std::vector<TensorPattern> *patterns_out) {
+    patterns_out->clear();
+    ComputeIntersection(pattern1, pattern2, pattern1.num_axes,
+                        patterns_out);
+  }
+
+  /**
+     In this recursive implementation of ComputeIntersection() [see version
+     above for more information on pattern1, pattern2 and patterns_out], the
+     user guarantees that for all axes with raxis-index `raxis >=
+     identical_raxis`, pattern1 and pattern2 have the same dimension, and
+     it may be assumed that we are only interested in the part of the
+     intersection where the indexes are the same for pattern1 and pattern2,
+     for all raxis >= identical_raxis.
+
+     In this recursion, when we get to 'identical_raxis == 0', it means pattern1
+     and pattern2 have identical dims and strides; and if they also have the
+     same offset, all we need to do is append one of them to 'patterns_out'
+     (otherwise this part of the intersection is empty; but note that this
+     function may in general fork into two branches each time it recurses).
+     This is all part of a process of trying to make the 'offset' identical
+     between the two patterns by discarding some leading dimensions on one of
+     the two patterns.  On raxis-indexes that we have processed, we also make
+     the 'dim' the same by lopping off trailing dimensions.
+  */
+  static void ComputeIntersection(const TensorPattern &pattern1,
+                                  const TensorPattern &pattern2,
+                                  int32 identical_raxis,
+                                  std::vector<TensorPattern> *patterns_out);
+
+
+  /**
+     This function, called by ConvertToCommonStrides() converts a pattern in
+     canonical form to a Pattern whose strides are equal to the
+     provided 'common_strides' vector, and which is valid *except for*
+     the axis-sorting (property (vi) of a valid Pattern) and
+     for property (iv), that strides must be nonzero for axes
+     with dim != 1.
+
+         @param [in] pattern_in  The input pattern; must be valid and
+                                 in canonical form.
+         @param [in] common_strides  The list of strides.  Must be sorted,
+                                 have the property that each element
+                                 divides the next element, and all
+                                 strides in pattern_in must be present
+                                 in this list.
+         @param [out] pattern_out   The output pattern.  Will be equivalent
+                                 to pattern_in in terms of memory-index-set,
+                                 its strides will be equal to 'common_strides'
+                                 (including the order), and it will be valid
+                                 except for properties (iv) and (vi), as
+                                 mentioned above.
+  */
+  static void ConvertLazilyToCommonStrides(const TensorPattern &pattern_in,
+                                           const std::vector<int32> &common_strides,
+                                           TensorPattern* pattern_out);
+
+  /**
+     This function makes sure that the axis-sorting property in 'pattern'
+     holds for the axis numbered 'raxis' (in the private numbering, of
+     course).  I.e. it ensures that:
+
+       `pattern->strides[raxis+1] >= pattern->strides[raxis] * pattern->dims[raxis]`
+
+     If it does not already have this property, this function ensures that it
+     does have it by modifying its dims for raxis and raxis + 1, and if necessary,
+     moving part of the pattern to 'extra_pattern'.  This will be necessary if the
+     value of `pattern->dims[raxis]` at entry is not a multiple of
+     `pattern->strides[raxis+1] / pattern->strides[raxis]`.
+
+         @param [in]      raxis    The axis on which we are doing the check
+         @param [in,out]  pattern  The input pattern, valid except for properties
+                                (iv) and (vi).  Its strides must be in
+                                increasing order (in the private numbering) and
+                                each must divide the next.
+         @param [out]     extra_pattern   This function writes to 'extra_pattern' if
+                                and only if it returns true.  See documentation of
+                                return status.
+         @return  Returns true if it wrote to extra_pattern.  If it returns true,
+                  then it guarantees that the union of the memory-index-sets of
+                  'pattern' and 'extra_pattern' at exit are equal to the memory-index-set
+                  of 'pattern' at entry.  If it returns false, then it guarantees
+                  that the memory-index-set of 'pattern' has been unchanged.
+                  In either case it guarantees that property (vi), the axis-sorting
+                  property, holds for axis 'raxis', in 'pattern' and (if applicable)
+                  in `extra_pattern`.
+                  The codes of pattern and extra_pattern are not set.
+  */
+  static bool EnsureAxisSortingPropertyHolds(int32 raxis,
+                                             TensorPattern *pattern,
+                                             TensorPattern *extra_pattern);
+
+
+
+  // the same as pattern1 passed to the constructor, but reduced to
+  // canonical form
+  TensorPattern pattern1_;
+  // the same as pattern2 passed to the constructor, but reduced to
+  // canonical form
+  TensorPattern pattern2_;
+
+  // patterns1_ is the list of patterns we get when we convert pattern1_
+  // to have the shared list of strides.  Will have at least one element.
+  std::vector<TensorPattern> patterns1_;
+  // patterns2_ is the list of patterns we get when we convert pattern2_
+  // to have the shared list of strides.  Will have at least one element.
+  std::vector<TensorPattern> patterns2_;
+
+  std::vector<TensorPattern> *intersection_;
+};
+
+
+bool IntersectionComputer::FindCommonStrides(std::vector<int32> *axes) {
+  // Note: despite its name, 'axes' receives the list of strides.
+  axes->clear();
+  axes->reserve(pattern1_.num_axes + pattern2_.num_axes);
+  for (int32 raxis = 0; raxis < pattern1_.num_axes; raxis++)
+    axes->push_back(pattern1_.strides[raxis]);
+  for (int32 raxis = 0; raxis < pattern2_.num_axes; raxis++)
+    axes->push_back(pattern2_.strides[raxis]);
+  SortAndUniq(axes);  // sort from least to greatest, remove duplicates.
+  // Each stride must divide the next.  Comparing neighbours (rather than
+  // reading (*axes)[0] up front) keeps this safe when the list is empty,
+  // e.g. when neither pattern has any axes.
+  for (size_t i = 1; i < axes->size(); i++) {
+    if ((*axes)[i] % (*axes)[i - 1] != 0)
+      return false;  // the previous stride does not divide this one; our
+                     // algorithm for detecting overlap cannot be used.  This
+                     // shouldn't really happen in "reasonable" uses of Tensors.
+  }
+  return true;
+}
+
+bool IntersectionComputer::EnsureAxisSortingPropertyHolds(
+    int32 raxis, TensorPattern *pattern,
+    TensorPattern *extra_pattern) {
+  KALDI_PARANOID_ASSERT(raxis + 1 < pattern->num_axes);
+  if (pattern->strides[raxis + 1] >=
+      pattern->strides[raxis] * pattern->dims[raxis]) {
+    // Property already holds -> nothing to do.  Return false
+    // because 'extra_pattern' is not needed.
+    return false;
+  }
+
+  // It would not make sense if pattern->dims[raxis + 1] were > 1; that would
+  // imply we started with some kind of self-overlapping pattern, which would
+  // not be valid.
+  KALDI_PARANOID_ASSERT(pattern->strides[raxis + 1] %
+                        pattern->strides[raxis] == 0 &&
+                        pattern->dims[raxis + 1] == 1);
+  // Split axis 'raxis' (dim orig_dim) into 'ratio' x 'next_dim' over two axes.
+  int32 ratio = pattern->strides[raxis + 1] / pattern->strides[raxis];
+  int32 orig_dim = pattern->dims[raxis];
+  pattern->dims[raxis] = ratio;
+  int32 next_dim = orig_dim / ratio;
+  pattern->dims[raxis + 1] = next_dim;
+  // Any elements beyond ratio * next_dim must go to 'extra_pattern'.
+  int32 remainder = orig_dim % ratio;
+  if (remainder == 0) {
+    // We didn't need to make use of 'extra_pattern', so return false.
+    return false;
+  } else {
+    *extra_pattern = *pattern;
+    extra_pattern->dims[raxis] = remainder;
+    extra_pattern->dims[raxis + 1] = 1;
+    extra_pattern->offset += next_dim * pattern->strides[raxis + 1];
+    // we used extra_pattern, so return true.
+    return true;
+  }
+}
+
+
+void IntersectionComputer::ConvertLazilyToCommonStrides(
+    const TensorPattern &pattern_in,
+    const std::vector<int32> &common_strides,
+    TensorPattern* pattern_out) {
+  int32 num_axes_in = pattern_in.num_axes,
+      num_axes_out = common_strides.size();
+  pattern_out->num_axes = num_axes_out;
+  int32 raxis_in = 0;  // next axis of pattern_in not yet copied to the output.
+  pattern_out->offset = pattern_in.offset;
+  for (int32 raxis_out = 0; raxis_out < num_axes_out; raxis_out++) {
+    int32 stride = common_strides[raxis_out];
+    pattern_out->strides[raxis_out] = stride;
+    if (raxis_in < num_axes_in && pattern_in.strides[raxis_in] == stride) {
+      pattern_out->dims[raxis_out] = pattern_in.dims[raxis_in];
+      raxis_in++;
+    } else {
+      pattern_out->dims[raxis_out] = 1;  // stride not used by pattern_in.
+    }
+  }
+  if (raxis_in != num_axes_in) {
+    KALDI_ERR << "Something went wrong converting strides (likely code error)";
+  }
+}
+
+
+void IntersectionComputer::ConvertToCommonStrides(
+    const TensorPattern &pattern,
+    const std::vector<int32> &common_strides,
+    std::vector<TensorPattern> *patterns) {
+  // Start from one pattern with the common strides, then fix up axis by axis.
+  patterns->resize(1);
+  ConvertLazilyToCommonStrides(pattern, common_strides, &((*patterns)[0]));
+  int32 num_axes = common_strides.size();
+  for (int32 raxis = 0; raxis + 1 < num_axes; raxis++) {
+    TensorPattern extra_pattern;
+    int32 num_patterns = patterns->size();
+    for (int32 p = 0; p < num_patterns; p++) {
+      if (EnsureAxisSortingPropertyHolds(raxis, &((*patterns)[p]),
+                                         &extra_pattern))
+        patterns->push_back(extra_pattern);
+    }
+  }
+}
+
+
+// see declaration for documentation.
+void IntersectionComputer::ComputeIntersection(
+    const TensorPattern &pattern1,
+    const TensorPattern &pattern2,
+    int32 identical_raxis,
+    std::vector<TensorPattern> *patterns_out) {
+  if (identical_raxis == 0) {
+    if (pattern1.offset == pattern2.offset) {
+      patterns_out->push_back(pattern1);
+      RemoveTrivialAxes(&(patterns_out->back()));
+    }
+    return;
+  }
+  // we'll be modifying the dims and offsets on axis 'raxis'.
+  int32 raxis = identical_raxis - 1,
+      stride = pattern1.strides[raxis]; // will be the same in pattern2, and positive.
+
+  // By the '?..:' statements below we possibly switch pattern2 and
+  // pattern1, thereby ensuring that pattern2_mod.offset >= pattern1_mod.offset
+  TensorPattern pattern1_mod(pattern2.offset >= pattern1.offset ? pattern1 : pattern2),
+      pattern2_mod(pattern2.offset >= pattern1.offset ? pattern2 : pattern1);
+
+
+  // pattern2_mod's offset is larger (or the same), so we may need to discard
+  // some leading indexes of pattern1_mod (on axis 'raxis'), increasing the
+  // offset and reducing the dim, to get the offsets closer to being the same,
+  // and then take the min of the dims on that axis.
+
+  // 'dim_discarded' below will be rounded down in the division, and we will
+  // also need to consider the value that's one larger than that.  We don't
+  // need to consider any other values of 'dim_discarded' other than these two,
+  // because it's possible to prove that if we recurse with the remaining offset
+  // being greater than 'stride', we would never be able to get to offset=0
+  // without discarding all dims of at least one axis numbered less than raxis.
+  // The proof requires the axis-sorting property.
+  int32 offset_diff = pattern2_mod.offset - pattern1_mod.offset,
+      min_dim1_discarded = offset_diff / stride,
+      max_dim1_discarded = ((offset_diff == min_dim1_discarded * stride) ?
+                            min_dim1_discarded : min_dim1_discarded + 1);
+
+  // Make a copy of the relevant dims, and pattern1_mod's offset, because the
+  // versions in the patterns get modified in the loop.
+  int32 pattern1_dim = pattern1_mod.dims[raxis],
+      pattern2_dim = pattern2_mod.dims[raxis];
+  int64 pattern1_offset = pattern1_mod.offset;  // not pattern1: maybe swapped.
+  for (int32 dim1_discarded = min_dim1_discarded;
+       dim1_discarded <= max_dim1_discarded; dim1_discarded++) {
+    pattern1_mod.offset = pattern1_offset + dim1_discarded * stride;
+    int32 new_pattern1_dim = pattern1_dim - dim1_discarded;
+    if (new_pattern1_dim <= 0)
+      continue;
+    // Set the dim on this axis, for both pattern1_mod and pattern2_mod, to the
+    // smaller of pattern2's dim and what remains of pattern1's dim.  We start
+    // from the saved copies (pattern1_dim, pattern2_dim) because the dims in
+    // the patterns may have been overwritten by an earlier loop iteration.
+    int32 common_dim = (pattern2_dim > new_pattern1_dim ?
+                        new_pattern1_dim : pattern2_dim);
+    pattern1_mod.dims[raxis] = common_dim;
+    pattern2_mod.dims[raxis] = common_dim;
+
+    // Recurse on the *modified* patterns (passing the unmodified inputs would
+    // ignore everything computed above).  We would have continued above if
+    // we had discarded all dims on this axis.
+    ComputeIntersection(pattern1_mod, pattern2_mod, raxis, patterns_out);
+  }
+}
+
+
+
+bool IntersectionComputer::ComputeIntersection(
+    const TensorPattern &pattern1,
+    const TensorPattern &pattern2,
+    std::vector<TensorPattern> *patterns_out) {
+  // First ensure that pattern1.offset <= pattern2.offset.
+  if (pattern1.offset > pattern2.offset)
+    return ComputeIntersection(pattern2, pattern1, patterns_out);
+
+  int64 extra_offset = pattern2.offset - pattern1.offset;
+  int32 dim_offset[KALDI_TENSOR_MAX_DIM];
+  // What we are doing conceptually here is shifting pattern1 to have the same
+  // offset as pattern2 by saying that on each axis, instead of starting the
+  // index from zero to dim - 1, we start that index from some number less than
+  // zero i.e. we shift those indexes to the left.  The index of the
+  // intersection will still start from zero though, because pattern2's index
+  // still starts from zero.
+  // We are going to express 'extra_offset' as a sum (TODO: unfinished).
+  // pattern1 and pattern2 are required to have the same stride and num_axes.
+  int32 num_axes = pattern1.num_axes;
+  for (int32 raxis = num_axes - 1; raxis >= 0; raxis--) {
+    int32 this_stride = pattern1.strides[raxis],
+        this_offset = extra_offset / this_stride;
+  }
+  // TODO: not implemented.  Return false (presumably "could not compute";
+  // confirm) rather than flow off the end of a non-void function (UB).
+  return false;
+}
+
+// NOTE(review): tensor-pattern-extra-utils.h declares this method as Rebase();
+// the two names need reconciling.
+bool TensorPatternRebaser::Convert(TensorPattern *pattern) {
+  if (!needs_conversion_)
+    return true;  // An optimization to make the common case fast.
+
+  pattern->offset = ConvertMemoryIndex(pattern->offset);
+
+  if (num_axes_ == 0)
+    return true;  // Another optimization to make a fairly common case fast.
+  int32 num_axes = pattern->num_axes;
+  for (int32 raxis = 0; raxis < num_axes; raxis++) {
+    int32 stride = pattern->strides[raxis],
+        dim = pattern->dims[raxis];
+    if (stride == 0)
+      continue;
+    int32 pstride = std::abs(stride),
+        product = pstride * dim;
+    // TODO: the conversion of 'pstride' (and hence of this axis's stride) has
+    // not been written yet, so strides are currently left unmodified.
+  }
+  return true;  // Success
+}
+
+
+// Maps memory-index 'm' of the src pattern to the matching dest memory-index.
+int64 TensorPatternRebaser::ConvertMemoryIndex(int64 m) {
+  int32 num_axes = num_axes_;
+  int64 ans = dest_offset_;
+  m -= src_offset_;
+  if (num_axes == 0)  // No strides to convert: only the offset differs, so
+    return ans + m;   // shift by (dest_offset_ - src_offset_).
+  // We visit the compressed axes in order from greatest to least src_stride.
+  // What this loop does is to reverse engineer the indexes into (the compressed
+  // version of) src_pattern that we'd need to get memory-offset m.  The 'i'
+  // values in the loop are those indexes.
+  for (int32 raxis = num_axes - 1; raxis >= 0; raxis--) {
+    int32 stride = src_strides_[raxis];
+    int64 i = m / stride;
+    m -= i * stride;
+    ans += i * dest_strides_[raxis];
+  }
+  // A nonzero remainder should not happen; likely it means the memory-index m
+  // was not covered by the src_pattern passed to the constructor, i.e. someone
+  // was trying to rebase a pattern which was not covered by src_pattern.
+  if (m != 0)
+    KALDI_ERR << "Could not convert this memory-index (likely code error)";
+  return ans;
+}
+
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/tensor-pattern-extra-utils.h b/src/tensor/tensor-pattern-extra-utils.h
new file mode 100644
index 00000000000..6088ad0a09f
--- /dev/null
+++ b/src/tensor/tensor-pattern-extra-utils.h
@@ -0,0 +1,272 @@
+// tensor/tensor-pattern-extra-utils.h
+
+//  Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "tensor/tensor-common.h"
+#include "tensor/tensor-pattern.h"
+#include "tensor/array-ref.h"
+
+
+// This header includes various functions operating on Patterns,
+// particularly ones relating to set-theoretic views of Patterns
+// and obscure, less-user-facing ones.
+
+
+namespace kaldi {
+namespace tensor {
+
+
+/**
+   Returns true if there is overlap between pattern1 and pattern2,
+   meaning that pattern1's memory-index-set and pattern2's
+   memory-index-set have nonempty intersection.
+ */
+bool PatternsOverlap(const TensorPattern &pattern1,
+                     const TensorPattern &pattern2);
+
+/**
+   Returns true if pattern2's memory-index-set is a subset of pattern1's
+   memory-index-set.  See glossary in tensor-pattern.h for explanation of
+   memory-index-set.
+ */
+bool PatternIncludes(const TensorPattern &pattern1,
+                     const TensorPattern &pattern2);
+
+
+/**
+   Returns true if the two patterns are equivalent in the sense that their
+   memory-index-sets are the same.  See glossary in tensor-pattern.h for
+   explanation.
+
+   This function works by reducing both patterns to canonical form
+   and testing whether their canonical forms are equal.
+
+       @param [in] pattern1  First input pattern
+       @param [in] pattern2  Second input pattern
+       @return  Returns true if the patterns are equivalent, otherwise
+                false.
+ */
+bool PatternsEquivalent(const TensorPattern &pattern1,
+                        const TensorPattern &pattern2);
+
+
+/**
+   This function tries to compute the set-wise intersection between two patterns
+   (i.e. the intersection between their memory-index-sets).  On success it
+   outputs a vector of patterns rather than a single pattern, because this
+   intersection may be empty or may not be expressible as a single pattern but
+   only as a union of patterns (i.e. a union of the patterns this function
+   outputs).  This function may fail to compute the intersection (see
+   documentation of return status).
+
+      @param [in] pattern1  The first of the two patterns of which
+                        we want the intersection.
+      @param [in] pattern2  The second of the two patterns of which
+                        we want the intersection.
+      @param [out] intersection  On success, this function outputs
+                       a vector of patterns (in arbitrary order), where the
+                       memory-index-set it (conceptually) outputs is the union
+                       of all the memory-index-sets of the patterns output
+                       (which this function guarantees will be disjoint).
+                       If this vector is empty and this function returns true,
+                       it means the intersection was the empty set.
+
+      @return  Returns true if the intersection could be computed, and
+               false otherwise.  This function will always return true if,
+               when the strides of pattern1 and pattern2 are sorted and
+               duplicates removed and listed in increasing order, each
+               stride divides the next one in the list exactly.  (This should
+               cover all the immediately foreseeable cases we might need to
+               compute).
+*/
+bool ComputeIntersection(const TensorPattern &pattern1,
+                         const TensorPattern &pattern2,
+                         std::vector<TensorPattern> *intersection);
+
+
+/**
+   Outputs the memory-index-set corresponding to the pattern 'pattern' to 's'.
+   See glossary in tensor-pattern.h for definitions.
+
+
+   This is strictly to be used in debugging code, as it is extremely
+   inefficient.
+
+      @param [in] pattern  The input pattern
+      @param [out] s   The memory-index-set
+ */
+bool ToMemoryIndexSet(const TensorPattern &pattern,
+                      std::unordered_set<int64> *s);
+
+
+
+/**
+   Outputs the memory-index-tuple-set corresponding to the pattern 'pattern' to
+   's' (see tensor-pattern.h for definition).  For storage in 's', each tuple is
+   converted into a single integer by a hashing function that should keep
+   distinct tuples separate as long as the memory-indexes were not huge.  (We
+   may output the actual tuples at some point in the future if they are ever
+   needed).
+
+   This function is strictly to be used in debugging code, as it is
+   extremely inefficient.
+
+      @param [in] patterns  The input patterns (the pattern-tuple)
+      @param [out] s   The memory-index-tuple-set, with each tuple hashed to one integer
+ */
+bool ToMemoryIndexTupleSet(const ArrayRef<TensorPattern*>  patterns,
+                           std::unordered_set<int64> *s);
+
+
+/**
+   Returns true if the two pattern-tuples are equivalent in the sense
+   that their memory-index-tuple-sets are the same.  See glossary
+   in tensor-pattern.h for explanation.
+ */
+bool PatternTuplesEquivalent(const ArrayRef<const TensorPattern*> &patterns1,
+                             const ArrayRef<const TensorPattern*> &patterns2);
+
+
+/**
+   Class TensorPatternRebaser is an object that converts TensorPattern
+   when memory layouts change.  The main use-case is when a base Variable
+   (c.f. variable.h for definition) has a TensorPattern that is not
+   contiguous (see tensor-pattern.h for definition of 'contiguous'), and
+   its gradient Tensor is allocated contiguously.  This class is
+   needed to convert patterns for Variables into patterns for their
+   corresponding gradients.
+
+   We make it an object rather than a function in order to avoid repetition when
+   multiple patterns need to be rebased.
+ */
+class TensorPatternRebaser {
+  public:
+  /*
+    Constructor.
+       @param [in] src_pattern  The pattern that we are converting *from*,
+                              e.g. the pattern of a Variable whose gradient
+                              has a different layout from itself.
+       @param [in] dest_pattern  The pattern that we are converting *to*.
+                              Must have the same num_axes and the same dims
+                              as 'src_pattern'.
+
+    Let t be a valid index-tuple for src_pattern/dest_pattern, determined
+    by their 'dims' and 'num_axes'.  Using t to index src_pattern and
+    dest_pattern gives memory-indexes:
+       m_src = src_pattern[t]
+       m_dest = dest_pattern[t]
+    View this object as a function from memory-indexes to memory-indexes
+    (m_src -> m_dest), whose domain is the memory-index-set of src_pattern
+    and whose range is the memory-index-set of dest_pattern.
+
+    The purpose of this object is to modify patterns in a way that maps
+    their memory-indexes with the same function.
+  */
+  TensorPatternRebaser(const TensorPattern &src_pattern,
+                       const TensorPattern &dest_pattern);
+
+
+  /**
+     This function attempts to modify pattern->offset and pattern->strides in a
+     way that does the mapping of memory-indexes m_src -> m_dest that is implied
+     by the src_pattern and dest_pattern passed to the constructor.  That is,
+     for any index-tuple t valid for 'pattern', the memory-index `pattern[t]`
+     evaluated before and after calling this function gets mapped according
+     to the function (m_src -> m_dest) mentioned in our documentation for
+     the constructor.
+
+     @param [in,out]  pattern  The pattern to be rebased.  Must, at entry,
+                          satisfy `PatternIncludes(src_pattern, *pattern)`,
+                          where `src_pattern` was the pattern passed to the
+                          constructor.  On success (i.e. if this function
+                          returns true), the condition
+                          `PatternIncludes(dest_pattern, *pattern)` will
+                          be satisfied.  On failure, the contents of
+                          'pattern' is undefined.
+
+     @return  Returns true if the conversion was possible.
+   */
+  bool Rebase(TensorPattern *pattern);
+
+  private:
+
+  // TODO: remove src_pattern_ and dest_pattern_ once everything
+  // is debugged.  They are copies of the src_pattern and dest_pattern
+  // passed to the constructor.
+  TensorPattern src_pattern_;
+  TensorPattern dest_pattern_;
+
+  // If needs_conversion_ is false, it means the patterns don't need any conversion
+  // at all (this is an optimization).
+  bool needs_conversion_;
+
+  // The 'offset' value of src_pattern_compressed (i.e. the src_pattern passed
+  // to the constructor, after it has been jointly compressed and normalized
+  // with dest_pattern to make all src_strides positive).
+  int64 src_offset_;
+  // The 'offset' value of dest_pattern_compressed
+  int64 dest_offset_;
+
+  // num_axes_ is the number of axes, not in the original src_pattern /
+  // dest_pattern but after the two patterns have been jointly compressed and
+  // then sorted from smallest to greatest stride in src_pattern.
+  // src_strides_ are the resulting strides from src_pattern_compressed, and
+  // dest_strides_ are the resulting strides from dest_pattern_compressed.
+
+  // src_strides_ and dest_strides_ are thus the strides of the compressed and
+  // sorted src_pattern and dest_pattern.  As an optimization, if src_strides
+  // and dest_strides end up being the same, we set num_axes_ to zero and skip
+  // modifying the strides when rebasing a pattern.
+
+  // Note: all of src_strides_[0] .. src_strides_[num_axes_ - 1] will be greater
+  // than zero.  We can guarantee this because src_pattern and dest_pattern as
+  // passed to the constructor had the same dims, so any axes with dim=1 would
+  // have had dim=1 for both src and dest, hence they would have been removed by
+  // CompressPatterns(), hence no strides would be zero after
+  // CompressPatterns(); and CompressPatterns() normalizes the signs of the
+  // strides so the first one (i.e. src_pattern) has positive strides.
+  int32 num_axes_;
+  int32 src_strides_[KALDI_TENSOR_MAX_DIM];
+  int32 dest_strides_[KALDI_TENSOR_MAX_DIM];
+
+  // The basic algorithm for rebasing a pattern is:
+  //  First, convert its offset with ConvertMemoryIndex().
+  //   Then:
+  //     For each nontrivial axis of 'pattern', we are going to modify
+  //     its stride as needed.
+  //     Let that stride be `stride`, and the corresponding dim `dim`.
+  //     Let `pstride = abs(stride)` be the absolute value of the stride
+  //     (we'll modify that, and then restore the sign).
+  //     TODO: the remainder of the algorithm is not yet written down.
+  //
+
+
+
+  // Converts a memory-index from the src to dest pattern.  This is applying,
+  // to a single arbitrary memory-index m_src, the mapping (m_src -> m_dest);
+  // see the comments above for explanation of this notation.
+  // It is required that m >= 0 (otherwise it would not have been inside
+  // the source pattern).
+  int64 ConvertMemoryIndex(int64 m);
+
+};
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/tensor-pattern-utils.cc b/src/tensor/tensor-pattern-utils.cc
index 69d9e6d0e82..56d567e58c2 100644
--- a/src/tensor/tensor-pattern-utils.cc
+++ b/src/tensor/tensor-pattern-utils.cc
@@ -1,8 +1,23 @@
-#include "tensor/tensor-pattern-utils.h"
+// tensor/tensor-pattern-utils.cc
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
 
-/**
-   This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
-*/
+#include "tensor/tensor-pattern-utils.h"
 
 namespace kaldi {
 namespace tensor {
@@ -361,27 +376,97 @@ void CompressOnePattern(TensorPattern *pattern,
 
 void SortAxes(TensorPattern *pattern) {
   int32 num_axes = pattern->num_axes;
-  std::pair<int32,int32> strides_dims[KALDI_TENSOR_MAX_DIM];
-  for (int32 raxis = 0; raxis < num_axes; raxis++) {
-    int32 stride = pattern->strides[raxis],
-        dim = pattern->dims[raxis];
-    KALDI_ASSERT(stride > 0);  // see documentation in header for reasons.
-    strides_dims[raxis].first = stride;
-    strides_dims[raxis].second = dim;
+  // Reorder (dim,stride) pairs by increasing stride; reset 'code' on any change.
+  switch(num_axes) {
+    case 0: case 1:
+      return;
+    case 2:
+      if (pattern->strides[0] > pattern->strides[1]) {
+        std::swap(pattern->strides[0], pattern->strides[1]);
+        std::swap(pattern->dims[0], pattern->dims[1]);
+        pattern->code = -1;
+      }
+      return;
+    default: {
+      // This is bubble sort, which might seem super inefficient, but it avoids
+      // the need to create a temporary of pairs (or implement an appropriate
+      // in-place sort); and since num_axes will rarely be more than about 3,
+      // and never more than 6, I don't think the speed will be a problem.
+      while (true) {
+        bool changed = false;
+        for (int32 i = 0; i < num_axes - 1; i++) {
+          if (pattern->strides[i] > pattern->strides[i + 1]) {
+            std::swap(pattern->strides[i], pattern->strides[i + 1]);
+            std::swap(pattern->dims[i], pattern->dims[i + 1]);
+            changed = true;
+          }
+        }
+        if (changed)
+          pattern->code = -1;
+        else
+          return;
+      }
+    }
   }
-  std::sort(strides_dims, strides_dims + num_axes);
-  for (int32 raxis = 0; raxis < num_axes; raxis++) {
-    pattern->strides[raxis] = strides_dims[raxis].first;
-    pattern->dims[raxis] = strides_dims[raxis].second;
+}
+
+
+// Swaps two axes of 'p', given in the private (reversed) numbering.
+void TransposeR(int32 raxis1, int32 raxis2, TensorPattern *p) {
+  if (static_cast<uint32>(raxis1) >= static_cast<uint32>(p->num_axes) ||
+      static_cast<uint32>(raxis2) >= static_cast<uint32>(p->num_axes)) {
+    KALDI_ERR << "Invalid axes to transpose: raxis1=" << raxis1
+              << ", raxis2=" << raxis2 << ", num-axes = " << p->num_axes;
+  }
+  std::swap(p->strides[raxis1], p->strides[raxis2]);
+  std::swap(p->dims[raxis1], p->dims[raxis2]);
+  p->code = -1;
+}
+
+void Transpose(int32 axis1, int32 axis2, TensorPattern *p) {
+  int32 num_axes = p->num_axes;
+  // Work out the reversed / private axis indexes that we physically use in the
+  // arrays.  A non-negative public axis 'a' is raxis num_axes - 1 - a.  A
+  // negative axis 'a' means public axis a + num_axes, i.e. raxis -1 - a (so
+  // axis -1, the last public axis, is raxis 0).  Out-of-range results are
+  // caught by the unsigned comparison below.
+  int32 raxis1 = (axis1 < 0 ? -1 - axis1 : num_axes - 1 - axis1),
+      raxis2 = (axis2 < 0 ? -1 - axis2 : num_axes - 1 - axis2);
+  if (static_cast<uint32>(raxis1) >= static_cast<uint32>(p->num_axes) ||
+      static_cast<uint32>(raxis2) >= static_cast<uint32>(p->num_axes)) {
+    KALDI_ERR << "Invalid axes to transpose: axis1="
+              << axis1 << ", axis2=" << axis2 << ", num-axes = " << p->num_axes;
   }
+  std::swap(p->strides[raxis1], p->strides[raxis2]);
+  std::swap(p->dims[raxis1], p->dims[raxis2]);
+  p->code = -1;
 }
 
 
-int32 GetDimsCode(const TensorPattern &pattern) {
-  // we may not need this after all.
+
+void RemoveTrivialAxes(TensorPattern *pattern) {
+  int32 num_axes = pattern->num_axes,
+      num_axes_out = 0;
+  for (int32 raxis = 0; raxis < num_axes; raxis++) {
+    int32 this_dim = pattern->dims[raxis];
+    if (this_dim != 1) {  // keep only nontrivial axes (a trivial axis has dim=1)
+      if (num_axes_out != raxis) {
+        pattern->dims[num_axes_out] = this_dim;
+        pattern->strides[num_axes_out] = pattern->strides[raxis];
+      }
+      num_axes_out++;  // one more axis kept; also the next slot to write to.
+    }
+  }
+  // It is a requirement of struct TensorPattern that dims and
+  // strides for raxis >= num_axes be 1 and 0 respectively.
+  for (int32 raxis = num_axes_out; raxis < num_axes; raxis++) {
+    pattern->dims[raxis] = 1;
+    pattern->strides[raxis] = 0;
+  }
+  pattern->num_axes = num_axes_out;
+  // Caution: we are not updating the code.
 }
 
 
 }  // namespace kaldi
 }  // namespace tensor
-x
diff --git a/src/tensor/tensor-pattern-utils.h b/src/tensor/tensor-pattern-utils.h
index fe4e204d5e9..ee80fbabf23 100644
--- a/src/tensor/tensor-pattern-utils.h
+++ b/src/tensor/tensor-pattern-utils.h
@@ -19,44 +19,17 @@
 
 
 #include "tensor/tensor-common.h"
+#include "tensor/tensor-pattern.h"
 #include "tensor/array-ref.h"
 
-/**
-   This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
-*/
+// This header includes various functions operating on Patterns.
+// See also tensor-pattern-extra-utils.h which contains the
+// more obscure and less user-facing functions.
 
 namespace kaldi {
 namespace tensor {
 
 
-/**
-   This function returns a code that compactly says whether each axis
-   has dim = 1 or dim != 1.  For purposes of the code generated, the number
-   of axes does not matter.  The lower-order KALDI_TENSOR_MAX_DIM bits
-   of the code might potentially be set; the rest will be zero.
-
-   The rightmost (least significant) bit corresponds to the last-numbered axis,
-   equivalent to raxis (reversed axis-index) == 0.
-
-   Note that non of the example `dims` vectors below have any leading
-   (dim=1) axes, because they wouldn't affect the code.
-
-   The examples below will use c++14 binary literals, although
-   the code doesn't use them.  In the notation below, in dims vectors,
-   x is a stand-in for 'any number greater than 1'.
-
-    0b00000000  0x00  dims=(), a scalar
-    0b00000001  0x01  dims=(x)
-    0b00000010  0x02  dims=(x,1)
-    0b00000011  0x03  dims=(x,x)
-
-    etc.
-
-  See also GetPatternCode(), which includes the same information but
-  also stride-related information.
- */
-int32 GetDimsCode(const TensorPattern &pattern);
-
 
 enum PatternEnum {
   kPatternContainsNegativeStride = 2048
@@ -80,11 +53,25 @@ inline bool AxisIsTrivial(int32 pattern_code, int32 raxis) {
 
 
 /**
-   This function returns a code that compactly represents the same information
-   as GetDimsCode() [i.e. which axes had dim != 1], but also encodes which axis,
-   if any, had stride=1, and has a bit that says whether any axis had negative
-   stride.  (No two axes can have stride=1, due to the uniqueness rule; search
-   in tensor-pattern.h).
+   This function removes trivial axes (i.e. axes with dim=1) from 'pattern'.
+   Although in a valid pattern axes with dim=1 must have stride=0
+   and vice versa, this function does not check that property; it simply
+   removes axes with dim=1, reducing num_axes appropriately.
+
+     @param [in,out] pattern   Pattern to be modified.  Any axes with dim=1
+                         will be removed and the num_axes reduced.  Will be
+                         valid at output if it was valid at input, or even if
+                         it was valid at input in all but property (iv),
+                         that strides must be zero for axes with dim=1.
+                         CAUTION: the code of 'pattern' is *not* updated.
+ */
+void RemoveTrivialAxes(TensorPattern *pattern);
+
+
+/**
+   This function returns a code that compactly represents information about
+   which axes had dim != 1; which axis, if any, had stride == 1; and
+   whether any axis had stride < 0.
 
    Let
       n = 0 if no axis had stride=1, otherwise:
@@ -105,12 +92,15 @@ inline bool AxisIsTrivial(int32 pattern_code, int32 raxis) {
    most significant).
 
    Bit 11 is 1 if any of the strides were negative, and zero otherwise.
-   None of the example bit-patterns below have this bit set.  The
+   (None of the example bit-patterns below have this bit set.)  The
    underlying BLAS in most cases does not support negative strides so
    we deal with it by copying the data to a temporary with positive
    strides.
 
-   The low-order KALDI_TENSOR_MAX_DIM bits are as returned by GetDimsCode().
+   The low-order KALDI_TENSOR_MAX_DIM bits have a 1 corresponding to
+   axes where dim != 1, and a 0 if dim == 1 for that axis.  Axis zero
+   in the private numbering (equal to the highest-numbered axis in the
+   public numbering) is the rightmost/lowest-order of these bits.
 
    The explanation below will use c++14 binary literals (like 0b010101), although the code
    doesn't use them as we compile as c++11; we show the corresponding hex codes which
@@ -279,23 +269,110 @@ inline void Squeeze(int32 axis, TensorPattern *p) {
 }
 
 
-ybool Broadcastable(const TensorPattern &a, const TensorPattern &b,
-                   bool b_non_reducing = false);
+
+/** Transpose the two specified axes (specified in the private/reversed
+    numbering) of a TensorPattern.
+
+    @param [in] raxis1  First axis to be transposed; must be in range
+                        `[0, p->num_axes - 1]`
+    @param [in] raxis2  Second axis to be transposed; must be in range
+                        `[0, p->num_axes - 1]`
+                        If identical to raxis1, nothing will be done.
+    @param [in,out] p  TensorPattern whose axes are to be transposed.
+ */
+void TransposeR(int32 raxis1, int32 raxis2, TensorPattern *p);
+
+
+/** Transpose the two specified axes (specified in the public numbering)
+    of a TensorPattern.
+
+    @param [in] axis1  First axis to be transposed; must be in range
+                       `[-p->num_axes, p->num_axes - 1]`,
+                       with negative axis being interpreted as an offset
+                       from p->num_axes.  This axis-index is in the
+                       public numbering, not the reversed numbering
+                       physically used in 'p'.
+    @param [in] axis2  Second axis to be transposed; must be in range
+                       `[-p->num_axes, p->num_axes - 1]`.
+                       If identical to axis1, dims and strides are unchanged.
+    @param [in,out] p  TensorPattern whose axes are to be transposed.
+                       p->code is set to -1.
+ */
+void Transpose(int32 axis1, int32 axis2, TensorPattern *p);
+
 
 
+/**
+   Modifies 'p' in-place by removing an axis with dim=1 (hence stride=0)
+   located at the specified axis (as numbered in the public numbering).
+   Equivalent to PyTorch's squeeze(), including its behavior with
+   negative axis indexes; axis < 0 is interpreted as num_axes + axis, so
+   that -1 is the last axis.  It is an error if 'p' did not, on entry,
+   contain an axis with dim=1 at position 'axis' (in the public numbering).
+
+   Showing just the dims in the pattern, in the non-reversed order as
+   exported by the API, some examples are:
+\verbatim
+    Squeeze([1,6,5], 0) -> [6,5]
+    Squeeze([3,1,4], 1) -> [3,4]
+    Squeeze([9,1,10], 2) -> error
+    Squeeze([7,1], -1) -> [7]
+\endverbatim
+
+     @param [in]    axis    The index of the axis that is to be removed.
+                            We require -p->num_axes <= axis < p->num_axes
+                            (negative axes are permitted, interpreted
+                            as an offset from p->num_axes).
+                            We require that the specified axis have
+                            dim=1.
+     @param [in,out] p      The pattern from which we are removing an
+                            axis.  Will have its num_axes reduced by 1
+                            at exit, possibly its dims and strides
+                            arrays changed, and its 'code' updated.
+*/
+inline void Squeeze(int32 axis, TensorPattern *p) {
+  if (axis < 0) SqueezeR(-1 - axis, p);  // axis -1 -> raxis 0, -2 -> 1, ...
+  else SqueezeR(p->num_axes - 1 - axis, p);
+}
+
 /**  This function returns true if the dimensions of tensor patterns
      a, b and c are broadcastable in the PyTorch sense (meaning;
      after padding their dims on the left with ones to make them
      have the same num-axes, corresponding dimensions are either
-     identical or 1).  See the version of Broadcastable() above
-     for more information.
+     identical or 1).  The previous sentence is written in terms
+     of the public numbering; in the private numbering it just means
+     for each index `raxis` into the dims vector,
+     either `a.dims[raxis] == b.dims[raxis]`, or one of them is 1.
+
+       @param [in] a  The pattern of the first Tensor
+       @param [in] b  The pattern of the second Tensor
+       @param [in] b_non_reducing   If true, then we do not allow a dim of
+                      b to be 1 while corresponding dim of a is >1.
+       @return  Returns true if a and b are broadcastable (with
+                an additional constraint that `a.dims[i] <= b.dims[i]` if
+                `b_non_reducing == true`).
+ */
+bool Broadcastable(const TensorPattern &a, const TensorPattern &b,
+                   bool b_non_reducing = false);
+
 
-       @param [in] a  The dimensions of the first Tensor
-       @param [in] b  The dimensions of the second Tensor
-       @param [in] c  The dimensions of the third Tensor
+/**  This function returns true if the dimensions of tensor patterns
+     a, b and c are broadcastable in the PyTorch sense, which is
+     the same as
+     `Broadcastable(a, b) && Broadcastable(b, c) && Broadcastable(a, c)`.
+     See the 2-argument version of Broadcastable for more information.
+
+       @param [in] a  The pattern of the first Tensor
+       @param [in] b  The pattern of the second Tensor
+       @param [in] c  The pattern of the third Tensor
        @param [in] c_non_reducing   If true, then we do not allow a dim of
                       c to be 1 while corresponding dims of a or b
                       are > 1.
+       @return  Returns true if a, b and c are broadcastable (with
+                an additional constraint that
+                `max(a.dims[i], b.dims[i]) <= c.dims[i]` if
+                `c_non_reducing == true`).
+
  */
 bool Broadcastable(const TensorPattern &a, const TensorPattern &b,
                    const TensorPattern &c, bool c_non_reducing = false);
@@ -329,107 +406,104 @@ bool SameDim(const TensorPattern &a, const TensorPattern &b,
 
 /**
    Compresses a TensorPattern by removing or combining as many axes as possible.
-   This version is suitable for operations that do not rely on any kind
-   of structure, such as zeroing or nonlinearities; the only equivalence
-   maintained is equivalence of the set of memory locations covered.
+   This version is suitable for operations that do not rely on any kind of
+   structure, such as zeroing or nonlinearities; the only equivalence maintained
+   is equivalence of the set of memory locations covered (the memory-index-set).
    The order of the (dim,stride) pairs in the input does not affect the
    output.  The output (dim,stride) pairs will be ordered from
    greatest to least stride (note: all output strides will be positive).
 
       @param [in,out]  pattern   The pattern to be compressed
 
-      @param [out] data_offset  A number that we would have to add to
-                          the data pointer of the source Tensor so
-                          that 'dest' would cover the same set of
-                          elements.  It will always be zero if 'src'
-                          was free of negative strides.
    Examples are below, where we write a TensorPattern as
 
-   `{{dim1,dim2,..}, {stride1,stride2,..}}`.
+   `{{dim1,dim2,..}, {stride1,stride2,..} [,offset] }`
+
+   (the offset is written only if nonzero).
 
    (the curly braces in our notation imply that we are referring to the reversed
    ordering physically used in 'pattern', but actually this doesn't affect
-   anything as the order of axes does not matter here as long as it is constent.
+   anything since the order of axes does not matter here as long as it is consistent).
 
 \verbatim
-   Input pattern             Output pattern            Output offset
-     {{10},{1}}               {{10},{1}}                  0
-    {{3,4},{4,1}}             {{12},{1}}                  0
-    {{4,3},{1,4}}             {{12},{1}}                  0
-    {{9},{-1}}                {{9},{1}}                  -8
-   {2,3,4},{100,4,1}        {{2,12},{100,1}}              0
+   Input pattern             Output pattern
+     {{10},{1}}               {{10},{1}}
+    {{3,4},{4,1}}             {{12},{1}}
+    {{4,3},{1,4}}             {{12},{1}}
+    {{9},{-1},8}                {{9},{1}}    // offset reduced by 8.
+   {{2,3,4},{100,4,1}}        {{2,12},{100,1}}
 \endverbatim
  */
-void CompressOnePattern(TensorPattern *pattern,
-                        int64 *data_offset);
+void CompressOnePattern(TensorPattern *pattern);
+
+
 
 /**
-   Sorts the axes in 'pattern' from smallest to largest stride
-   (in the reversed numbering physically present in 'pattern'; would
-   be largest to smallest in the public API).  Useful in testing
-   equivalence of patterns, as CompressOnePattern() followed
-   by SortAxes() leads to a normalized form.
-
-   This function requires that for 0 <= i < pattern->num_axes,
-   pattern->strides[i] > 0.  This condition is satisfied by
-   a pattern that has previously been compressed by CompressOnePattern().
-   If in future we need to relax this constraint, we will do so.
-   (The assumption of positive strides simplifies implementation
-   because to normalize the form we'd have to make all strides
-   positive, which would require outputting an offset).
+   Sorts the axes in 'pattern' from most negative to most positive stride
+   in private numbering, equivalent to sorting from most positive to
+   most negative stride in public numbering.
+
+   TODO: decide whether to change this to sort on abs(stride), or
+   maybe create another version that does sort on abs(stride), if there
+   are situations where this turns out to be useful.
 
      @param [in,out]  The pattern whose axes are to be sorted
-                   from least to greatest stride (in the physical
-                   ordering).
+                   from most negative to most positive stride (in the
+                   physical ordering).
  */
 void SortAxes(TensorPattern *pattern);
 
 
-/*
-  Compress two TensorPatterns by combining axes (and possibly
-  flipping the sign of their strides and changing the data offset)
-  The type of compression involved is the same as for CompressOnePattern
-  (meaning we are doing some kind of operation that doesn't care about
-  the structure, such as an element-by-element nonlinearity).
-
-  The difference from calling CompressOnePattern() twice is that this function
-  needs to preserve the relationship between the tensors whose pattern is src1
-  and src2.  Suppose that a tensor with pattern src3 was the result of this
-  elementwise operation satisfying Broadcastable(src1, src2, src3); there is
-  only one such pattern.  Let x be a tuple which would be a valid index for the
-  tensor with pattern src3.  Let us use an extended indexing convention
-  whereby if an axis of src1 or src2 has dimension 1, we allow that axis to be
-  indexed by any value, which would not affect the memory location because the
-  stride is zero.  Then each such tuple x leads to a different pair of memory
-  locations (p1, p2) in the tensors corresponding to patterns src1, src2.  The
-  invariance that this function must preserve is that the set of memory-location
-  pairs (p1, p2) must be the same in the output tensors (with their
-  appropriately moved data pointers), as in the input tensors.
-
-  What this means in practice is that we need to do the same operations on src1
-  and src2.  For example, if flipping the sign of an axis of src1 we would have
-  to flip that of src2, and if merging two axes of src1 we would have to merge
-  the same two axes of src2.
-
-    @param [in] src1  The first source pattern.
-    @param [in] src2  The second source pattern.
-                      We require Broadcastable(src1,src2) == true.
-    @param [out] dest1  Compressed pattern out corresponding to src1.  Will
-                     be free of negative strides (but dest2 might not be).
-    @param [out] dest_offset1  Data offset that we'd need to add to src1's
-                     data pointer before using the pattern 'dest1'
-    @param [out] dest1  Compressed pattern out corresponding to src2.
-                     Might not be free of negative strides if some dimensions
-                     of src1/src2 had strides of opposite sign.
-    @param [out] dest_offset1  Data offset that we'd need to add to src1's
-                     data pointer before using the pattern 'dest1'
+/// Applies CompressOnePattern() and then SortAxes() to 'pattern', in place.
+inline void CanonicalizePattern(TensorPattern *pattern) {
+  CompressOnePattern(pattern);
+  SortAxes(pattern);
+}
+
+
+/**
+   This version of SortAxes() sorts the axes in 'patterns' (which must be
+   nonempty and all have the same number of axes), by ordering them from the
+   most negative stride value in patterns[0] to the most positive stride value
+   in patterns[0], using the strides in the other patterns to disambiguate the
+   order only in case of ties, which could only happen if some strides were
+   zero.  I.e. it's a lexical ordering on the strides of the patterns.  Note: the
+   most-negative-to-most-positive ordering is in terms of the private, `raxis`
+   numbering; it would be most-positive-to-most-negative in the public
+   numbering.
+
+     @param [in,out] patterns  The patterns whose axes are to be sorted.  All
+                    will have their axes subject to the same permutation.
+                    The ordering is based on the strides of patterns[0],
+                    but using the strides of later numbered patterns in
+                    case of ties.
+ */
+void SortAxes(ArrayRef<TensorPattern*> patterns);
+
+/**
+  Multiplies all strides and the offset in 'pattern' by 'scale', which must be >
+  0.  For now, will just crash if this causes integer overflow.
+
+  This function is used in the memory-locking code if the same storage location
+  is accessed using different dtypes (which is unlikely).
+ */
+void ScaleStridesAndOffset(int32 scale, TensorPattern *pattern);
+
+
+
+/// Hashing object, used when we need an unordered_map containing TensorPattern.
+struct PatternHasher {  // struct: operator() must be public for containers.
+  size_t operator () (const TensorPattern &pattern) const;
+};
 
 
+/*
+  CompressTwoPatterns() is a special case of CompressPatterns() where there
+  are exactly two patterns to be jointly compressed.  See documentation of
+  CompressPatterns() for explanation.
  */
 void CompressTwoPatterns(TensorPattern *a,
-                         TensorPattern *b,
-                         int64 *data_offset_a,
-                         int64 *data_offset_b);
+                         TensorPattern *b);
 
 
 /**
@@ -442,44 +516,20 @@ void CompressTwoPatterns(TensorPattern *a,
    The difference with just calling CompressOnePattern() several times is
    that CompressPatterns() preserves the relationships between the tensors.
 
-   Firstly, we require that all pairs of TensorPattern in 'patterns' be
-   broadcastable: that is, Broadcastable(p1, p2) would hold for any
-   p1, p2 in 'patterns'.  In the explanation below we will use a
-   'permissive indexing' convention whereby if a Tensor has an axis
-   with dim,stride (0, 1), we allow it to be indexed by any value
-   (not just zero), so that all the tensors represented can accept the
-   same set of index tuples.  Suppose for example that there are three
-   patterns, p1, p2, p3, in 'patterns', with 4 axes.  Let max_axes
-   larger of the num-axes of p1, p2 or p3, and let
-   x = (i, j, k, l) be an index tuple that would be valid for a tensor
-   with that many axes.  Each such x, when used as an index into p1, p2
-   and p3 with 'permissive indexing' as mentioned above, will
-   give us a tuple of memory-offsets (o1, o2, o3); o1, o2 and o3 are indexes
-   into the respective data pointers.  Ranging over the set of index-tuples
-   x, we get a set of memory-offset tuples; call this set S_in,
-   and call the set that we would get if doing the same procedure
-   on the output tensors (with their possibly changed num-axes), be
-   S_out.  Let us represent the 'data_offset' output of this function
-   as (in this case) a 3-tuple o.  Then the invariant that this
-   function needs to satisfy is that:
-
-        `S_in = S_out + o`
-
-   (this equates two sets of 3-tuples, in our example) where we interpret the '+
-   o' as adding to each element of the set.  The '+ o' above would only be
-   necessary if any strides were negated; it is a tuple containing offsets, in
-   elements, to be added to the data pointers of the respective output tensors.
+   In technical terms (and you will have to follow definitions several deep
+   in the glossary to find all the definitions), this operation
+   preserves the memory-index-tuple-set of the Pattern-tuple, and
+   also the memory-index-set of each of the Patterns (we have to specify
+   the part after "and" to disallow swapping the Patterns).
 
+   Note: while the first Pattern will have no negative strides at output,
+   the others may.
 
       @param [in,out] patterns   An nonempty array of the patterns
                          to be jointly compressed.
-      @param [out]  data_offsets  Pointer to an array of the same size
-                        as 'patterns', which on output will contain
-                        offsets to be added to the data pointers.
 
       @return  Returns true if it made any change to the patterns,
-               false if they were unchanged.  If false, the
-               data_offsets will be set to zero.
+               false if they were unchanged.
 
  Examples are below, where we write a TensorPattern as
  `{{dim1,dim2,..}, {stride1,stride2,..}}`.
@@ -494,40 +544,35 @@ void CompressTwoPatterns(TensorPattern *a,
  {{3,4},{4,1}}        {{1,1},{0,0}}      {{12},{1}}           {{1},{0}}    # combine
 \endverbatim
  */
-bool CompressPatterns(ArrayRef<TensorPattern*> patterns,
-                      int64 *data_offsets);
+bool CompressPatterns(ArrayRef<TensorPattern*> patterns);
 
 /**
    Compresses a TensorPattern by removing or combining as many axes as possible,
-   while respecting certain invariances that are relevant when constructing
-   'views' ('view' is PyTorch terminology; the NumPy equivalent is 'reshape').
-   The "C" in the function name refers to C-style arrays.
+   while preserving the memory-index-set of the pattern (see glossary for
+   explanation), and also while respecting certain invariances that are relevant
+   when constructing 'views' ('view' is PyTorch terminology; the NumPy
+   equivalent is 'reshape').  The "C" in the function name refers to C-style
+   arrays.  Basically what this function does is a highly restricted subset
+   of what CompressOnePattern() does.
 
-    This function removes axes with dim=1.
+   This function removes axes with dim=1.
 
    This function combines successive axes if the relationship of their
    dims and strides is what you would expect in a "C"-style array
    when the axes are listed in their non-reversed ordering (i.e.
    as exposed by class Tensor).
 
-
    Suppose that in pattern 'p' we had two successive axes physically numbered
    raxis, raxis+1, with p->dims[raxis] > 1 and p->dims[raxis+1] > 1
    and p->strides[raxis + 1] == p->strides[raxis] * p->dims[raxis],
-   then this function will merge them into a single axis with dimension
-   the product of the two dimensions..
-
-    TODO...
-
-   finish this if it turns out to be needed for something.
+   then this function will merge them into a single axis whose dimension
+   is the product of the dimensions of the two original axes.
+   (However, they won't be merged if it would
+   result in a dimension exceeding the range of int32).
 
+   TODO...  finish this if it turns out to be needed for something.
+   I'm not sure if it will be.
 
-   with dims and
-   strides (dim_a, dim_b) and (stride_a, stride_b), with dim_a > 1 and
-   dim_b > 1.  If stride_a == stride_b * dim_b, then this function
-   will merge them into a single axis with dimension (dim_a * dim_b)
-   and stride stride_b.   (However, they won't be merged if it would
-   result in a dimension exceeding the range of int32).
 
    The output pattern 'dest' is what you get if you keep applying the
    rules above until no further change is made.
@@ -586,16 +631,226 @@ void CompressPatternC(TensorPattern *p);
    @param [out] pattern_out  The output pattern, if we were
                       successful (otherwise undefined).  Its 'dims'
                       will be the same as 'dims'.
-   @return           Returns true on success (i.e. such a view existed),
-                     and false otherwise.  This function will never return
-                     false if 'pattern_in' had strides as for a "C" array
-                     (i.e., if its properties' has_c_strides was true).
+   @return           Returns true on success (i.e. such a view could be
+                     created), and false otherwise.  This function will
+                     never return false if 'pattern_in' had strides as
+                     for a "C" array (i.e., if HasCStrides(pattern_in)
+                     returns true).
 
  */
 bool CreateViewPattern(const TensorPattern &pattern_in,
                        ArrayRef<int32> dims,
                        TensorPattern *pattern_out);
 
+/**
+   This function returns true if 'pattern' has the same strides
+   as a 'C' array with the same dimensions would have.  (Note:
+   we are referring here to the public numbering of the axes).
+   For example, an array of dims [3, 4, 5], if it were
+   a "C" array, would have strides of [20, 5, 1].  As a special
+   case, since our Patterns use stride=0 for axes with dim=1,
+   we treat that zero as a wildcard; that is, if there
+   is a stride value for which the array would have "C" strides
+   then we'll return true.
+
+     @param [in] pattern  The pattern we are checking.  It is expected
+                     to satisfy Valid(pattern), but this function does not
+                     check this.
+
+     @return  Returns true if this pattern has 'C' strides, and
+              false otherwise.   (See note above about axes
+              with dim=1).
+*/
+bool HasCStrides(const TensorPattern &pattern);
+
+/**
+   Returns true if there is overlap between pattern1 and pattern2,
+   meaning that pattern1's memory-index-set and pattern2's
+   memory-index-set have nonempty intersection.
+ */
+bool PatternsOverlap(const TensorPattern &pattern1,
+                     const TensorPattern &pattern2);
+
+/**
+   Returns true if pattern2's memory-index-set is a subset of pattern1's
+   memory-index-set.  See glossary in tensor-pattern.h for explanation of
+   memory-index-set.
+ */
+bool PatternIncludes(const TensorPattern &pattern1,
+                     const TensorPattern &pattern2);
+
+
+/**
+   Returns true if the two patterns are equivalent in the sense that their
+   memory-index-sets are the same.  See glossary in tensor-pattern.h for
+   explanation.
+ */
+bool PatternsEquivalent(const TensorPattern &pattern1,
+                        const TensorPattern &pattern2);
+
+
+/**
+   Outputs the memory-index-set corresponding to the pattern
+   'pattern' to 's'.   See glossary in tensor-pattern.h for
+   definitions.  This is strictly to be used in debugging
+   code, as it is extremely inefficient.
+
+      @param [in] pattern  The input pattern
+      @param [out] s   The memory-index-set
+ */
+bool ToMemoryIndexSet(const TensorPattern &pattern,
+                      std::unordered_set<int64> *s);
+
+
+
+/**
+   Outputs the memory-index-tuple-set corresponding to the pattern-tuple
+   'patterns' to 's' (see tensor-pattern.h for definition).  For storage in 's',
+   each tuple is converted into a single integer by a hashing function that
+   should keep distinct tuples separate as long as the memory-indexes were not
+   huge.  (We may output the actual tuples at some point in the future if they
+   are ever needed).  This function is strictly to be used in debugging code,
+   as it is extremely inefficient.
+
+      @param [in] patterns  The input patterns (the pattern-tuple)
+      @param [out] s   The memory-index-tuple-set (as hashed integers)
+ */
+bool ToMemoryIndexTupleSet(const ArrayRef<TensorPattern*>  patterns,
+                           std::unordered_set<int64> *s);
+
+
+/**
+   Returns true if the two pattern-tuples are equivalent in the sense
+   that their memory-index-tuple-sets are the same.  See glossary
+   in tensor-pattern.h for explanation.
+ */
+bool PatternTuplesEquivalent(const ArrayRef<const TensorPattern*> &patterns1,
+                             const ArrayRef<const TensorPattern*> &patterns2);
+
+
+/**
+   Class TensorPatternRebaser is an object that converts TensorPatterns
+   when memory layouts change.  The main use-case is when a base Variable
+   (c.f. variable.h for definition) has a TensorPattern that is not
+   contiguous (see tensor-pattern.h for definition of 'contiguous'), and
+   its gradient Tensor is allocated contiguously.  This class is
+   needed to convert patterns for Variables into patterns for their
+   corresponding gradients.
+
+   We make it an object rather than a function in order to avoid repetition when
+   multiple patterns need to be rebased.
+ */
+class TensorPatternRebaser {
+  public:
+  /**
+    Constructor.
+       @param [in] src_pattern  The pattern that we are converting *from*,
+                              e.g. the pattern of a Variable whose gradient
+                              has a different layout from itself.
+       @param [in] dest_pattern  The pattern that we are converting *to*.
+                              Must have the same num_axes and the same dims
+                              as 'src_pattern'.
+
+    Let t be a valid index-tuple for src_pattern/dest_pattern, determined
+    by their 'dims' and 'num_axes'.  Using t to index src_pattern and
+    dest_pattern gives memory-indexes:
+       m_src = src_pattern[t]
+       m_dest = dest_pattern[t]
+    View this object as a function from memory-indexes to memory-indexes
+    f: (m_src -> m_dest), whose domain is the memory-index-set of src_pattern
+    and whose range is the memory-index-set of dest_pattern.
+
+    The purpose of this object is to modify patterns in a way that maps
+    their memory-indexes with the same function f.
+  */
+  TensorPatternRebaser(const TensorPattern &src_pattern,
+                       const TensorPattern &dest_pattern);
+
+
+  /**
+     This function attempts to modify pattern->offset and pattern->strides in a
+     way that does the mapping of memory-indexes m_src -> m_dest that is implied
+     by the src_pattern and dest_pattern passed to the constructor.  That is,
+     for any index-tuple t valid for 'pattern', the memory-index `pattern[t]`
+     evaluated before and after calling this function gets mapped according
+     to the function (m_src -> m_dest) mentioned in our documentation for
+     the constructor.
+
+     @param [in,out]  pattern  The pattern to be rebased.  Must, at entry,
+                          satisfy `PatternIncludes(src_pattern, *pattern)`,
+                          where `src_pattern` was the pattern passed to the
+                          constructor.  On success (i.e. if this function
+                          returns true), the condition
+                          `PatternIncludes(dest_pattern, *pattern)` will
+                          be satisfied.  On failure, the contents of
+                          'pattern' is undefined.
+
+     @return  Returns true if the conversion was possible.
+   */
+  bool Rebase(TensorPattern *pattern);
+
+  private:
+
+  // TODO: remove src_pattern_ and dest_pattern_ once everything
+  // is debugged.  They are copies of the src_pattern and dest_pattern
+  // passed to the constructor.
+  TensorPattern src_pattern_;
+  TensorPattern dest_pattern_;
+
+  // If needs_conversion_ is false, it means the patterns don't need any conversion
+  // at all (this is an optimization).
+  bool needs_conversion_;
+
+  // The 'offset' value of src_pattern_compressed (i.e. the src_pattern passed
+  // to the constructor, after it has been jointly compressed and normalized
+  // with dest_pattern to make all src_strides positive).
+  int64 src_offset_;
+  // The 'offset' value of dest_pattern_compressed
+  int64 dest_offset_;
+
+  // num_axes_ is the number of axes, not in the original src_pattern /
+  // dest_pattern but after the two patterns have been jointly compressed and
+  // then sorted from smallest to greatest stride in src_pattern.
+  // src_strides_ are the resulting strides from src_pattern_compressed, and
+  // dest_strides_ are the resulting strides from dest_pattern_compressed.
+
+  // src_strides_ and dest_strides_ are the strides of the thus-modified
+  // src_pattern and dest_pattern.  As an optimization, if they end up being
+  // the same, we set num_axes_ to zero and skip modifying the strides when a
+  // pattern is rebased (presumably in Rebase(); TODO confirm).
+
+  // Note: all of src_strides_[0] .. src_strides_[num_axes_ - 1] will be greater
+  // than zero.  We can guarantee this because src_pattern and dest_pattern as
+  // passed to the constructor had the same dims, so any axes with dim=1 would
+  // have had dim=1 for both src and dest, hence they would have been removed by
+  // CompressPatterns(), hence no strides would be zero after
+  // CompressPatterns(); and CompressPatterns() normalizes the signs of the
+  // strides so the first one (i.e. src_pattern) has positive strides.
+  int32 num_axes_;
+  int32 src_strides_[KALDI_TENSOR_MAX_DIM];
+  int32 dest_strides_[KALDI_TENSOR_MAX_DIM];
+
+  // The basic algorithm (presumably that of Rebase(); TODO: finish this) is:
+  //  First, add an offset to its offset (presumably derived from src_offset_
+  //  and dest_offset_; TODO confirm).
+  //   Then:
+  //     For each nontrivial axis of 'pattern', we are going to modify
+  //     its stride as needed.
+  //     Let that stride be `stride`, and the corresponding dim `dim`.
+  //     Let `pstride = abs(stride)` be the absolute value of the stride
+  //     (we'll modify that, and then restore the sign).
+  //
+
+
+
+  // Converts a memory-index from the src to dest pattern.  This is applying,
+  // to a single arbitrary memory-index m_src, the mapping (m_src -> m_dest);
+  // see the comments above for explanation of this notation.
+  // It is required that m >= 0 (otherwise it would not have been inside
+  // the source pattern).
+  int64 ConvertMemoryIndex(int64 m);
+
+};
 
 
 }  // namespace tensor
diff --git a/src/tensor/tensor-pattern.h b/src/tensor/tensor-pattern.h
index 42d46203188..dfc3e714a8c 100644
--- a/src/tensor/tensor-pattern.h
+++ b/src/tensor/tensor-pattern.h
@@ -33,31 +33,271 @@ namespace tensor {
 
 
 /*
-  This struct stores the dimension and strides of a Tensor.
+  GLOSSARY
+
+    Axis:             An axis is one of the (dim, stride) pairs that form part
+                      of a TensorPattern, and we often use the word "axis"
+                      to refer to the index of the axis, as in, for example,
+                      in a Tensor with dims=[5 6 7], axis 0 has dim=5 and
+                      axis 2 has dim=7.  See also axis-index and raxis-index,
+                      which are more precise terms for the index of the axis
+                      and clearly disambiguate the numbering used (public
+                      numbering, or reversed private numbering).
+                      Caution: some other toolkits use the word 'dimension' where
+                      we use 'axis', but we avoid that usage because it is
+                      ambiguous.
+
+    Axis-index:       An axis-index of a Pattern or Tensor (sometimes just "axis" for short,
+                      especially in code) is an index in the range [0, num_axes - 1]
+                      that identifies an axis in the public numbering (see "Public numbering").
+                      See also: Raxis-index.
+
+    Axis-sorting property: search below for [Valid Pattern], point (vi).
+
+    Broadcasting:     A convention whereby for an operation on Tensors that would
+                      normally be required to have the same dimension, it's
+                      acceptable for, on some axis, one Tensor to have `dim = n`
+                      with `n != 1` and the other to have `dim = 1`.  I.e., two dims can be
+                      different as long as one of them is 1.  Most operations will
+                      take place as if the Tensor with `dim = 1` had been extended
+                      to `dim = n` by making identical copies.  However, if it is
+                      the output Tensor that has `dim = 1`, there would be summation
+                      or possibly some other appropriate reduction instead of making
+                      copies.  This is different from other toolkits (the fact that
+                      we extend the concept of broadcasting to encompass summation).
+                      See also: PyTorch-style broadcasting, extended indexing.
+
+    Broadcastable:   See documentation for function Broadcastable() in pattern-utils.h.
+                     Briefly, two Patterns are broadcastable if their dims (padded
+                     as necessary on the left by 1's to make them the same size)
+                     are, for each axis, either the same or one of them is 1.
+                     So for example, comparing ([ 3 4 ], [4]), we first
+                     pad on the left to get ([3 4], [1 4]); then we say they
+                     are broadcastable because 4 == 4 and in the remaining axis,
+                     one of the dimensions is 1.
+
+    Canonical form:  A TensorPattern is in canonical form if all axes that could be combined
+                     (without affecting its memory-index-set, obviously) have been
+                     combined, there are no trivial axes, all strides are positive,
+                     and the axes are sorted in increasing order of stride.
+                     (Note: this is in the private numbering; in the public numbering
+                     this means decreasing order of stride, which is consistent
+                     with "C" strides).  See CanonicalizePattern().
+
+    Contiguous:      A Pattern is contiguous if its memory-index-set forms a contiguous
+                     range of integers (no gaps).  This is different from the PyTorch
+                     definition of 'contiguous', which also requires C-style strides.
+
+    Dims-vector of a Pattern: The vector of dimensions of a Pattern: e.g. [] for
+                    a Pattern with num_axes = 0 or [2 3] for a Pattern with
+                    num_axes = 2.  Note: whenever we display dims vectors in
+                    square brackets as opposed to curly, it implies we are
+                    displaying them in the public numbering.
+
+    Dims-vector of a Pattern-tuple:  The dims vector of a Pattern-tuple is
+                    formed by taking the dims-vectors of each Pattern in the
+                    tuple, extending them on the left with 1's as necessary
+                    to make them the same size, then taking the largest
+                    dim on each axis (i.e. the one that is not equal to 1,
+                    if they are different).  For example, for a Pattern-tuple
+                    of Patterns whose dims-vectors were ([4 1 5], [6 1], [5]),
+                    the dims-vector of the tuple would be [4 6 5].
+
+    Extended indexing:  A convention whereby if we have a Tensor with, say,
+                      `dims = [5 1]`, we can index that Tensor with an index-tuple
+                      that:
+                       (1) may have nonzero index values in any axis for which
+                          dim=1, so `index_tuple = [4 100]` would be a valid
+                          index for this Tensor in extended indexing.
+                       (2) may have more elements than the Tensor's num-axes; the
+                         Tensor is implicitly extended with extra axes on the left
+                         (in the public numbering) / the right (in the private
+                         numbering) with dim=1.  See also: PyTorch-style broadcasting.
+
+    Index-tuple:      A tuple of integers used as an index into a Tensor.  Must
+                      have at least as many elements as the Tensor's num_axes
+                      (see Extended indexing).  Elements of such tuples may
+                      not be negative.
+
+    (valid Index-tuple) An index-tuple is *valid for a pattern* if it may be
+                      used to index that Pattern, allowing extended indexing.
+                      This is true if, after padding the index-tuple with 0's
+                      on the left and padding the Pattern's dims-vector with
+                      1's on the left as needed to make them the same size,
+                      for each axis, if the element of the index-tuple is
+                      i and the element of the dims-vector is d, i >= 0
+                      and either i < d or d == 1.
+
+    Index-tuple-set of a Pattern: The index-tuple-set of a Pattern is the set
+                      of valid index-tuples assuming we are not allowing extended
+                      indexing.  For example, for a Tensor with `dims = [2]`, the
+                      set of valid index-tuples would be `{ (0), (1) }`; for
+                      a Tensor with `dims = [2 2]` the set of valid index-tuples
+                      is `{ (0,0), (0,1), (1,0), (1,1) }`.
+
+    Index-tuple-set of a Pattern-tuple:  The index-tuple-set of a Pattern-tuple is
+                      the index-tuple-set that you would obtain for a Pattern whose
+                      dims equal the dims-vector of that Pattern-tuple.
+                      See "dims-vector of a Pattern-tuple" for explanation of what
+                      that is.
+
+    Memory region:    A region of memory that will have been allocated with malloc()
+                      or some equivalent (or obtained from some memory-management
+                      code, in the case of GPU memory).  Objects of type `Storage`
+                      are responsible for allocating and deleting memory regions.
+
+    Memory-pointer:   A void* pointer to the start of a memory region.
+
+    Memory-index:     A scalar (int64) index into a memory region viewed as a
+                      linear array.  For example, for a Tensor of floats, we'd cast
+                      the address of the memory-pointer to `float*` and then use
+                      the memory-index as an index into that array.  For a
+                      Pattern p and an index-tuple i that is valid for p, we have
+                      a memory-index m = p[i], which is equal to the
+                      pattern's offset plus the sum over all axes of the product of the
+                      element of the index-tuple times the corresponding axis's
+                      stride.  (Note: this becomes much easier to compute and
+                      explain in the private numbering, because no left-padding
+                      has to be done explicitly).
+
+    Memory-index-tuple:  A tuple of Memory-indexes.  This concept is used in connection
+                      with Pattern-tuples.  For a pattern-tuple q = (p1, p2, p3)
+                      and an index-tuple i, we may write q[i] = (p1[i], p2[i], p3[i]),
+                      where expressions like p1[i] evaluate to a memory-index.
+
+    Offset:           The memory-index of the element with index-tuple = (all zeros)
+                      of a Tensor.  Offsets will always be >= 0 because they are to
+                      be used as an index into a memory-region, and negative
+                      index would be outside that region.
+
+    Pattern:          An object representing the dims, strides and offset of a Tensor.
+                      (see struct TensorPattern).  The Pattern has
+                      an 'offset' which is the memory-index of the element of the Tensor
+                      whose index-tuple is all zeros; the Pattern also
+                      has a number of axes, `0 <= num_axes < KALDI_TENSOR_MAX_DIM`,
+                      and for each axis from 0 <= axis < num_axes, it has a dimension
+                      dim(axis) and stride(axis).
+
+                      Search below for 'Valid Pattern' for properties a Pattern must
+                      (in most circumstances) satisfy.
+
+
+    Pattern-tuple:    A pattern-tuple is a tuple of Patterns, say:  (pattern1, pattern2);
+                      we require the patterns in the tuple to be broadcastable, meaning,
+                      for example: Broadcastable(pattern1, pattern2).
+
+
+    An object of type TensorPattern, representing the dims, strides
+                      and offset of a Tensor.
+
+    Public numbering: The numbering of axes used in the public interface of class
+                      Tensor.  We use the index `axis` when in the public numbering.
+                      We use square brackets when describing dims or strides ordered
+                      in the public numbering, e.g. dims=[3 4].
+                      See also: axis-index
+
+    Private numbering:  The reversed numbering of axes in struct TensorPattern.
+                      For an axis numbered `axis` in the public numbering, its
+                      reversed axis index is `raxis = num_axes - 1 - axis`.
+                      This reversal makes PyTorch-style broadcasting easier.
+                      We use curly brackets when describing dims or strides
+                      ordered in the private numbering, e.g. dims={4,3}; this
+                      is supposed to call to mind a C++ brace-initializer.
+                      See also: raxis-index
+
+    PyTorch-style broadcasting:  We use this name to refer to the fact that in
+                      PyTorch, if an operation is done on two Tensors with
+                      dims=[5 6] and dims=[6], the second one would be interpreted
+                      as having dims=[1 6].  That is: we pad with 1's on the left.
+
+    Raxis-index:      We use the term "raxis-index", often just "raxis" for short,
+                      to mean the index of an axis in the reversed, private numbering.
+                      This would usually be in the range [0, num_axes - 1] for
+                      a Pattern with `num_axes` axes, but for broadcasting purposes,
+                      if we are doing an operation between Tensors of different
+                      numbers of axes we may often use larger raxis values for the Tensor
+                      of smaller num_axes (see PyTorch-style broadcasting).
+
+    Trivial axis:     An axis of a Pattern for which dim=1 and stride=0.
+
+    Memory-index-set of a Pattern:
+                      The set of all memory-indexes obtained by indexing
+                      the pattern with all index-tuples in the index-tuple-set
+                      of the Pattern.  The size of this set is the same as the
+                      size of the index-tuple-set (by the uniqueness property).
+
+    Memory-index-tuple-set of a Pattern-tuple:
+                      The set of all memory-index-tuples obtained by indexing
+                      the Patterns in the tuple with all members of the
+                      index-tuple-set of the Pattern-tuple.  See "memory-index-tuple"
+                      and "index-tuple-set of a Pattern-tuple" for more information.
+
+    Stride:           A stride is the distance, in elements, between successive
+                      elements of a Tensor along a particular axis.
+                      For example, a Tensor with one axis having dim=3 and
+                      stride=2 would have its elements laid out in memory
+                      as:  `[ element0  xxx   element1  xxx  element2 ]`,
+                      where `xxx` means an element that is not part of the
+                      Tensor.  Axes with dimension=1 always have stride=0
+                      in this toolkit.  Tensors with negative strides may be created,
+                      although they will be copied to temporaries with
+                      positive stride in linear algebra operations where
+                      necessary (since most BLAS implementations do not support
+                      negative stride).
+
+
+    Valid Pattern:
+                     A valid Pattern must be as follows.  Think of this as the mathematical definition;
+                     see the declaration of struct TensorPattern for additional details about how
+                     it is stored.
+
+                          (i) The num_axes must satisfy 0 <= num_axes < KALDI_TENSOR_MAX_DIM
+                          (ii) The offset must be >= 0.
+                          (iii) the dims must all be >0.
+                          (iv) the strides must be zero for axes with dim=1
+                          (v) the strides must be nonzero (but not necessarily positive) for axes with
+                                dim != 1.
+                          (vi) the axis-sorting property.   This property assures that no memory-index
+                              can be accessed via two different index-tuples, and is sufficient
+                              but not necessary to ensure the Uniqueness Property (see its own entry).
+                              This property requires that if the axes are sorted from least to greatest
+                              value of abs(stride),
+                              for each axis i < num_axes - 1:
+                                    dim(i) * abs(stride(i)) <= abs(stride(i+1)).
+
+   Valid+ Pattern:  a Pattern which is valid and also has its code set.  See declaration for
+                    struct TensorPattern.  This is not a mathematical type of definition, more
+                    of a code-level definition, but since we frequently need this notion,
+                    we give it its own name.
+
+   Uniqueness property:  A property of a Pattern that no two different index-tuples,
+                      when used to index the Pattern, generate the same memory-index.
+                      The axis-sorting property is sufficient, but not necessary,
+                      to ensure the uniqueness property.  (The uniqueness property
+                      is probably not so easy to test for efficiently in the general
+                      case; at least, we have not found a way).
+ */
 
-   *SHIFTING TO THE RIGHT*
 
-  The main thing to watch out for is that the dimensions of 'dims' and 'strides'
-  to look at is not 0 ... num_axes, but KALDI_TENSOR_MAX_DIM - num_axes
-  ... KALDI_TENSOR_MAX_DIM - 1.  The last dimension is always located at
-  KALDI_TENSOR_MAX_DIM - 1, i.e. the dims and strides are always
-  right-justified.  In addition, for unused axes, we always maintain dim=1 and
-  stride=0. This happens to be quite convenient for implementation if we adopt
-  the standard broadcasting rules in things like PyTorch, whereby the
-  highest-numbered axes always line up.
+/*
+  This struct stores the dimension and strides of a Tensor.
 
   Below we describe the the properties that a TensorPattern is required to have.
+  Most of them are described in the glossary in the entry for "Valid Pattern",
+  but there are a couple more that have to do with the specifics of how we
+  store things in this struct.
 
   These properties are stricter than some other frameworks, such as PyTorch,
   which allow the users to manually add dimensions with stride 0 and dim > 1 so
   that a lower-dimensional quantity can masquerade as one with a higher
-  dimension.  We require that it never be possible to access the same memory
-  location using two different tuples of indexes.  We also don't allow zero dims
-  (i.e. a Tensor that is initialized must not have num_elemnts==0).  If you want
-  an empty Tensor, just use a null pointer.  In addition, we require that the
-  stride equal zero for any axis that has dim = 1.
+  dimension.  (This framework allows the same kinds of operations, they are just
+  not done by the same mechanism).   We
+  also don't allow zero dims (i.e. a Tensor that is initialized must not have
+  num_elements==0).  If you want an empty Tensor, just use a null pointer.  In
+  addition, we require that the stride equal zero for any axis that has dim = 1.
+  There is also the "axis-sorting" property (see its glossary entry for more info).
 
-  Our requirements on a TensorPattern are:
+  Our requirements of a TensorPattern are:
 
     0 <= num_axes <= KALDI_TENSOR_MAX_DIM.
 
@@ -70,18 +310,16 @@ namespace tensor {
        dims[i] == 1
        strides[i] == 0
 
-    ... plus the uniqueness property.
+    offset >= 0
 
-  Note: in the public interface of class Tensor, if you ask for
-  dim(i) it will return pattern.dims[num_axes - i].
+    The axis-sorting property (see property (vi) in "Valid Pattern" above)
 
-  The uniqueness property requires that we must not be able to access the same
-  memory location via two different tuples of indexes).  Recause testing this
-  property exactly would be difficult in general without bringing in concepts
-  from number theory, we test a slightly stronger version of it that covers all
-  cases we are likely to encounter.  This is that, if we take all the axes with
-  dim != 1 and sort them from greatest to least stride, then for each i,
-  abs(strides[i]) >= dims[i+1] * abs(strides[i+1]).
+  Note: in the public interface of class Tensor, if you ask for Dim(i) it will
+  return pattern.dims[pattern.num_axes - 1 - i], i.e. the interface uses the public
+  numbering, while the axes are physically stored using the reversed "private
+  numbering".   This reversal makes it much easier to implement
+  PyTorch-style broadcasting where in an operation on Tensors of dims,
+  say, (3,4) and (4), the (4) is interpreted as (1,4).
 */
 struct TensorPattern {
   int32 num_axes;
@@ -90,17 +328,26 @@ struct TensorPattern {
   int32 strides[KALDI_TENSOR_MAX_DIM];  // the strides in reversed order,
                                         // indexed by 'raxis' (reversed axis)
   int32 code;  // pattern code; see ComputePatternCode() in tensor-pattern-utils.h
-               // for details.  It is the responsibility of the user to keep
-               // this updated (i.e. don't change dims or strides without updating
-               // 'code').
-
-  // Returns true if the TensorPattern is valid, I.e. that it satifies all the
-  // properties mentioned above.
-  //
-  //  @param [in] check_code   If true, the check includes verifying that the
-  //                        'code' has the value it should (c.f. GetPatternCode()).
-  //  @return     Returns true if valid, false if not valid.
-  bool IsValid(bool check_code = true);
+               // for details.  If this is negative then it means it has not been
+               // computed.  In a valid TensorPattern the code will always be either
+               // negative or up-to-date.
+  int64 offset;  // Offset of the element with all-zero indexes
+                 // from the start of the originally allocated memory
+                 // region
+
+  // Returns true if the TensorPattern is valid.  This includes all the
+  // mathematical conditions on a valid Pattern (search above for "Valid
+  // Pattern"), plus extra conditions related to struct TensorPattern,
+  // namely: dims and strides with index >= num_axes should be
+  // 1 and 0 respectively; and the code should either be -1 or
+  // be the same as ComputePatternCode() returns on this pattern.
+  bool IsValid();
+
+  // This comparator induces a total ordering on valid TensorPatterns.  It is a
+  // lexical comparison on the offset, num_axes, dims and strides.  (The code
+  // does not need to be compared because, if not -1, it is a function of the
+  // dims and strides).
+  bool operator < (const TensorPattern &other) const;
 };
 
 
@@ -122,6 +369,7 @@ struct TensorPatternProperties {
   // is_contiguous means that the data form a contiguous block in memory; it is
   // not the same as PyTorch's is_contiguous which is a stronger condition,
   // implying also that the strides are as for a `C-style` array.
+  // TODO: see if this is even needed; it may not be.
   bool is_contiguous;
 
   // has_c_strides means that the strides of all axes i with dim[i] != 1,
@@ -131,6 +379,7 @@ struct TensorPatternProperties {
   // have stride=0.
   // has_c_strides is the equivalent of PyTorch's is_contiguous.
   // this->has_c_strides implies this->is_contiguous.
+  // TODO: see if this is even needed; it may not be.
   bool has_c_strides;
 
   // Sets the members of *this to be the properties of pattern 'pattern'.
@@ -139,6 +388,7 @@ struct TensorPatternProperties {
 };
 
 
+
 }  // namespace tensor
 }  // namespace kaldi
 
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index f992964e82b..96eb8ea68f8 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -43,7 +43,7 @@ namespace tensor {
 class Tensor {
  public:
 
-  inline bool Initialized() { return data_ != NULL; }
+  inline bool Initialized() { return storage_->data_ != NULL; }
 
   /// Return the number of axes (a number in {0,1,2,3,4}).  In mathematical
   // contexts, this is sometimes known as the rank of the tensor, or sometimes
@@ -63,32 +63,31 @@ class Tensor {
   // stride info.
   const TensorPattern &Pattern() const { return impl_.pattern; }
 
-  // Return an array containing dimensions of the tensor; equivalent to
+  // Return a vector containing dimensions of the tensor; equivalent to
   // .shape in PyTorch.  Dims().size() will equal NumAxes().
-  // This cannot return some kind of const reference because the
+  // This cannot return a const reference because the
   // dims are stored internally in reversed order.
   std::vector<int32> Dims() const;
 
-  // Return an array containing dimensions of the tensor; equivalent to
-  // .shape in PyTorch.  Strides().size() will equal NumAxes().
+  // Return a vector containing the strides of the tensor.
+  // Strides().size() will equal NumAxes().
   std::vector<int32> Strides() const;
 
 
-  // Returns the dimension on this axis (which will be >= 1).
-  // Requires 0 <= axis < NumAxes().
-  inline int32 Dim(int32 axis) const {
-    KALDI_ASSERT(static_cast<uint32>(axis) <
-                 static_cast<uint32>(impl_.pattern->num_axes));
-    return impl_.pattern.dims[impl_.pattern->num_axes - 1 - axis];
-  }
+  // Returns the dimension on the supplied axis
+  //  @param [in] axis  Axis on which dimension is required, with
+  //                    -NumAxes() <= axis < NumAxes(); negative axis
+  //                    is interpreted as an offset from NumAxes().
+  //  @return        Returns the dimension on this axis, a number >= 1.
+  inline int32 Dim(int32 axis) const { return impl_.Dim(axis); }
 
-  // Returns the stride on this axis (which will be >= 1).
-  // Requires 0 <= axis < NumAxes().
-  inline int32 Stride(int32 axis) const {
-    KALDI_ASSERT(static_cast<uint32>(axis) <
-                 static_cast<uint32>(impl_.pattern->num_axes));
-    return impl_.pattern.strides[impl_.pattern->num_axes - 1 - axis];
-  }
+  // Returns the stride on the supplied axis (using the public axis numbering)
+  //  @param [in] axis  Axis on which stride is required, with
+  //                    -NumAxes() <= axis < NumAxes(); negative axis
+  //                    is interpreted as an offset from NumAxes().
+  //  @return          Returns the stride on this axis, which will be 0 if
+  //                   Dim(axis) == 1, and otherwise nonzero.
+  inline int32 Stride(int32 axis) const { return impl_.Stride(axis); }
 
   // Returns the number of elements in the Tensor; will be > 0,
   // and will equal the product of Dims().
@@ -298,10 +297,9 @@ class Tensor {
   // cached properties.
   TensorImpl impl_;
 
-
-  // The storage region where the data resides.  data_ does not necessarily
-  // equal storage_->data; it may be more than that, e.g. if this is a view
-  // to part of another Tensor.
+  // The storage region where the data resides.  storage_->data will equal
+  // impl_.data (we duplicate it in impl_ for convenience and to avoid an extra
+  // pointer dereference).
   std::shared_ptr<Storage> storage_;
 };
 
diff --git a/src/tensor/variable.cc b/src/tensor/variable.cc
new file mode 100644
index 00000000000..b3ef347a076
--- /dev/null
+++ b/src/tensor/variable.cc
@@ -0,0 +1,34 @@
+// tensor/variable.cc
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tensor/variable.h"
+
+
+namespace kaldi {
+namespace tensor {
+
+
+void TensorGrad::EnsureGradAllocated() {
+
+}
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/variable.h b/src/tensor/variable.h
index 7cfe9763ede..08fb8b75a16 100644
--- a/src/tensor/variable.h
+++ b/src/tensor/variable.h
@@ -26,16 +26,75 @@ namespace kaldi {
 namespace tensor {
 
 
+/**
+
+   Definitions:
+
+     Tracked:   A variable v is tracked if v.Grad() returns non-NULL.  By "tracked" we mean:
+                we are keeping autograd history.  Being tracked or not is actually a
+                property of the base variable (see "Base variable").
+    Debug mode:  Debug mode is a global bool, accessible via GetDebugMode().  When it is
+                true, we check for invalidated data in the backprop phase.  (This,
+                which performs the same function as version numbering in, say, PyTorch,
+                is quite a slow operation so we only enable it occasionally.)
+   Base variable: A base Variable is a Variable that is not a sub-part (e.g. row or
+                column range) of another Variable.  Every Variable has a base Variable;
+                a base Variable is its own base Variable.  A base variable is
+                created when a Variable is initialized from a Tensor or when we
+                call .detach().  The base Variable is the unit at which we make
+                the decision "is this being tracked?".  As soon as a Variable
+                becomes tracked, all Variables sharing the same base Variable
+                become tracked.
+
+      tick:   a tick is the value of a global 64-bit time counter that we increment every
+              time we mutate a Tensor.  When we create Ops for backpropagation of
+              derivatives, we record the tick at which the Op was created.
+    invalidated:  if some data used in backprop needs to have been unchanged since
+              a particular tick (as recorded in an Op), but it has been changed
+              since then, we say that it has been invalidated.  This is an error, but
+              it will only be detected in debug mode.  In effect we store a
+              record of what time (in ticks) data last changed at the individual-element
+              level, via the ChangeTracker object that is attached to the Storage
+              object (it's done in a structured way, not via a huge array).
+              This means that the change-tracking mechanism is not defeated by
+              doing .detach() or by constructing multiple Variables from the same
+              Tensor.
+
+
+*/
+
+
+// Variable-level Add: forwards to the Add() overload on the underlying data
+// (presumably c = a + b), then registers the backprop step on c.
+// NOTE(review): assumes c is already correctly sized, and that a, b and c all
+// have non-NULL grad() (i.e. are tracked) -- confirm against class Variable.
+inline void Add(const Variable &a, const Variable &b, Variable *c) {
+  Add(a.data(), b.data(), &(c->data()));
+
+  Variable *a_grad = a.grad(), *b_grad = b.grad(),
+      *c_grad = c->grad();
+
+  // Backprop step: each input's gradient receives c's gradient via Add().
+  auto gradFunc = [a_grad,b_grad,c_grad] () {
+    a_grad->Add(*c_grad);
+    b_grad->Add(*c_grad);
+  };
+
+  c->SetGradFunc(gradFunc);
+  c->SetDependencies(a, b);
+
+
 /*
-  This is the 'gradient information' that class Variable stores for a Tensor
+  This is the 'gradient information' that class Variable stores
   when it is initialized with requires_grad = true (or is a result of
   an operation on Variables one of which had requires_grad = true).
-  This does not give you access to the underlying Variables; doing it
+  The Variable holds it via a shared_ptr.
+  This does not give you access to the underlying Variable; doing it
   like this makes reference counting easier (no loops).  The GradFunc
   will store any pointers to the original Variable that it may have
   needed.
 
-  Users will rarely need to interact directly with this struct directly.
+  Users will rarely need to interact directly with this struct.
  */
 struct TensorGrad {
   // The version of the underlying Tensor.  (this number in the TensorGrad
@@ -60,11 +119,17 @@ struct TensorGrad {
 
   // is_view is true only if the Variable underlying this TensorGrad
   // is the result of an expression like foo.transpose() that creates
-  // a view to another Tensor.  In that case
+  // a view to another Tensor.  In that case, the variables
+  // 'meta' and 'offset' become relevant, and when asked to create
+  // the 'grad' Variable, we won't allocate it directly but will
+  // instead create a view into inputs[0].grad->data.
   bool is_view{false};
 
-  // The device we
-  Device device;
+  // grad_discarded will be set to true in the backprop when we are done
+  // with this->grad and have deallocated it.  If a future user
+  // attempts to reallocate the gradient, this will trigger an
+  // exception.
+  bool grad_discarded{false};
 
   // This contains the meta-information of the Tensor for which this is the
   // gradient (its 'data' pointer will be NULL).  Used to set up 'grad' with the
@@ -76,36 +141,103 @@ struct TensorGrad {
   int64 offset;
 
   // This stores the gradient (if we already have one), or nullptr if not.
-  std::unique_ptr<Variable> grad{nullptr};
+  std::unique_ptr<Variable> data;
 
   // The tail in a singly linked list of TensorGrads... used in case this
-  // Variable is a sum of several terms that were added using an
-  // in-place method such as '+='.  (Syntax etc. TBD at this point).
-  std::unique_ptr<TensorGrad> tail{nullptr};
+  // Variable is a sum of several terms that were added together in-place.
+  std::unique_ptr<TensorGrad> tail;
+
+  // You call this function to ensure that the gradient has been allocated.
+  void EnsureGradAllocated();
+};
+
+
+// Bundles the inputs, outputs, needed Variables and callable 'op' of one op.
+struct TensorGradOp {
+  std::vector<std::shared_ptr<TensorGraph> > inputs;
+  std::vector<std::shared_ptr<TensorGrad> > outputs;
+
+  // NOTE(review): presumably the Variables that 'op' needs kept alive; confirm.
+  std::vector<std::shared_ptr<Variable> > vars_needed;
+  // Callable taking no arguments and returning nothing.
+  std::function<void()> op;
+
+  // Constructor; only declared here.
+  TensorGradOp(std::initializer_list<VariableRef> inputs_grads_needed,
+               std::initializer_list<VariableRef> output_grads_needed,
+               std::initializer_list<VariableRef> variables_needed,
+               std::function<void()> op);
+
+};
+
+/**
+   This contains the graph-related information stored with a Variable.
+   For Variables initialized with requires_grad = true, it's held
+   via shared_ptr as graph_.
+ */
+struct TensorGraph {
+  // creator_op is the op that created (or modified) the Variable that
+  // this TensorGraph is held by.  (If it modified this variable, 'tail' records
+  // any previous operations on it).
+  std::shared_ptr<TensorGradOp> creator_op;
+  // The TensorGrad (gradient information); presumably for the same Variable.
+  std::shared_ptr<TensorGrad> grad;
+  // Records previous operations, if any; see the note on creator_op above.
+  std::shared_ptr<TensorGraph> tail;
 };
 
 
 /**
-   class Variable is somewhat like class Tensor but augmented with autograd
-   machinery.  Because autograd requires a rather 'functional' way of doing
-   things (i.e. is not super friendly to in-place operations), the functions
-   that operate on class Variable will tend to be ones that return something,
-   rather than in-place operations.
-
-   The overall design is quite similar to PyTorch, and the structure
-   of the the C++ code is similar to flashlight.  If you are only familiar with
-   PyTorch's python frontend, class Variable is rougtly equivalent to what they
-   expose as af.tensor.
+   GradFunc is the type that is passed into the constructor of Variable by a
+   function implementing some operation on Variables (addition, multiplication,
+   etc.).  It is at the core of the backprop mechanism, so we explain it here
+
  */
-class Variable {
-  using GradFunc = std::function<
-    void(const std::vector<Variable>& inputs, TensorGrad *grad_output)>;
-  using GradHook = std::function<void(TensorGrad *grad)>;
+typedef std::function<void(const Variable &grad, const std::vector<Variable> *input_grads)> GradFunc;
 
+typedef std::function<void(TensorGrad *grad)> GradHook;
+
+
+// Flags passed as an arg to the constructor of Variable.  The enumerators
+// are distinct bits (1, 2, 4) so that they can be used as a bit pattern, as
+// if this were an int32.
+// TODO: figure out proper way to do that (a scoped enum has no built-in '|').
+enum class VariableInit {
+  CopyData = 1,
+  CopyGrad = 2,
+  CopyGraph = 4
+};
+
+
+// Shared data of a base Variable.  Each base Variable gets one of these; but
+// non-base Variables (views into other variables) share the Node of their base
+// Variable.
+struct Node {
+  // The gradient, held directly as a Tensor member.
+  Tensor grad_;
+
+  // NOTE(review): the list is presumably chained through Op::tail_ -- confirm.
+  // op_list_ (may be NULL) is the head of a list of Ops
+  // that wrote to this Node (most recent at the head).
+  // TODO: make it unique_ptr?
+  std::shared_ptr<Op> op_list_;
+};
+
+
+class Variable;
+
+
+/**
+   class Variable is somewhat like class Tensor but augmented with autograd
+   machinery.
+*/
+class Variable {
 
 
   /** Constructor from a Tensor.
-       @param [in] data  Pointer to the source Tensor
+       @param [in] data  Pointer to the source Tensor.  Will accept a
+                      raw Tensor* pointer, in which case it will construct a
+                      shared_ptr.  (??)
        @param [in] requires_grad    If requires_grad argument is true,
                 the gradient w.r.t. this Variable will be computed if and when
                 you call Backward() on a Variable that depends on it.
@@ -115,31 +247,82 @@ class Variable {
 
 
 
+  /**  Returns shared pointer to the Tensor storing the data. */
+  std::shared_ptr<Tensor> Data();
+
+
+  /**  Returns pointer to the Tensor storing the derivative w.r.t.  this
+       data.  Obtaining this Tensor won't allocate the memory, thanks to lazy
+       initialization.  It is an error to call this if this Variable is
+       not tracked (search for "Tracked:" above for definition).
+       See also GradDataIfPresent().
+  */
+  std::shared_ptr<Tensor> GradData();
+
+  /**  Returns pointer to the Tensor storing the derivative w.r.t.  this
+       data, or NULL if not present.  Obtaining this Tensor won't allocate the
+       memory, thanks to lazy initialization.  See also GradData().
+  */
+  std::shared_ptr<Tensor> GradDataIfPresent();
+
+
   /**
-   * Creates a Variable which wraps the array and inputs specified
-   * @param[in] data array to the stored in the Variable
-   * @param[in] inputs a vector specifying inputs for this Variable
+     Returns pointer to the base Variable (which may or may not be
+     identical to 'this').
+   */
+  std::shared_ptr<Variable> GetBaseVariable();
+
+  /**
+     Constructor that will be used by functions implementing mathematical
+     operations on Variables.
+
+
+     @param [in] data    Data to be stored in the Variable
+     @param [in] inputs  A vector containing Variables which this Variable
+                         depends on (for backpropagation purposes; will
+                         be stored in the TensorGrad object).
+     @param [in]
+
+     a vector specifying inputs for this Variable
    * @param[in] gradFunc function specifying how to calculate gradient of the
    * input Variables
    */
   Variable(std::shared_ptr<Tensor> &data, std::vector<Variable> inputs,
-           GradFunc gradFunc);
+           GradFunc grad_func);
 
 
- private:
+  // Returns true if this Variable is tracked (search for "Tracked:" above).
+  bool Tracked() const;
+
 
-  // The version of this Variable.  Generally will start at 0 when the Variable
-  // is assigned a size and will have 1 added to it for each operation that is
-  // done on it.  If grad_ != NULL, we mirror this value in grad_->version.  The
-  // version number is only used for checking purposes, to verify that people
-  // don't modify a Variable in ways that defeat the backprop.  If we wanted we
-  // could keep the old versions around and enable the backprop to work anyway,
-  // but that kind magic is not in the spirit of how this library operates.
-  int32 version_;
 
+ private:
+
+  // data_ is the Tensor underlying this Variable.
   std::shared_ptr<Tensor> data_;
+
+  // base_ is the base Variable which is non-NULL only if this Variable is a
+  // view of an underlying Variable.  This needs to be tracked even if
+  // we are not yet tracking gradients, because if any Variable with a
+  // particular base becomes tracked, all such Variables do.
+  // If base_ is NULL, then this Variable is its own base variable.
+  std::shared_ptr<Variable> base_;
+
+  // grad_ is a pointer to the struct containing gradient information (for
+  // Variables that require a gradient; else NULL).  It may also be
+  // NULL because someone called this->RemoveGrad().
   std::shared_ptr<TensorGrad> grad_;
 
+  // ops_ is the first in a singly linked list of Ops for this Variable.  If
+  // this Variable is not its own base variable (i.e. if base_ != NULL), this
+  // will be NULL since the Ops are only stored in the base Variables.
+
+
+  // (there will be just one element,
+  // unless in-place operations were done).
+  // Will be NULL if this Variable does not require a gradient or if someone
+  // called this->RemoveGraph().
+  std::shared_ptr<Op> ops_;
 };
 
 typedef std::unique_ptr<Storage>

From 4a6b739d5b36396ff2445bc6f332b95ec195edd6 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sat, 20 Apr 2019 16:44:34 -0700
Subject: [PATCH 022/163] [src] Tensor progress

---
 src/tensor/array-ref.h                   |   4 +
 src/tensor/op.h                          |   2 +-
 src/tensor/tensor-common.h               |   6 +-
 src/tensor/tensor-pattern-extra-utils.cc | 288 ++++++++++++++---------
 src/tensor/tensor-pattern-extra-utils.h  |  70 +++++-
 src/tensor/tensor-pattern-utils.h        |  19 +-
 src/tensor/tensor-pattern.h              | 140 ++++++++---
 7 files changed, 377 insertions(+), 152 deletions(-)

diff --git a/src/tensor/array-ref.h b/src/tensor/array-ref.h
index 47cc96a31b7..873496f34df 100644
--- a/src/tensor/array-ref.h
+++ b/src/tensor/array-ref.h
@@ -72,6 +72,10 @@ struct ArrayRef final {
   // We will add iterators later if they are needed.
 };
 
+// Converts an ArrayRef<int32> to a string, e.g. "[1 4 300]"; can implicitly
+// print std::vector as well.
+std::string ArrayAsString(const ArrayRef<int32> a);
+
 
 }  // namespace tensor
 }  // namespace kaldi
diff --git a/src/tensor/op.h b/src/tensor/op.h
index a3f2028c8d8..ce70c998d1c 100644
--- a/src/tensor/op.h
+++ b/src/tensor/op.h
@@ -124,7 +124,7 @@ class Op {
   void *inputs_;
 
   int64 n_;  // initialized from the counter when this object is created.
-  std::shared_ptr<Op> tail_;
+  std::shared_ptr<Op> tail_;  // TODO: make it unique_ptr?
  protected:
   // Return true if this is not the last Op in the list of Ops attached to this
   // base Variable (can be useful to know whether we need bother to scale the
diff --git a/src/tensor/tensor-common.h b/src/tensor/tensor-common.h
index 142df0e4e1a..d55cdc09e9c 100644
--- a/src/tensor/tensor-common.h
+++ b/src/tensor/tensor-common.h
@@ -161,11 +161,11 @@ enum BinaryFunctionEnum {
 
 
 
-// In practice we don't expect user-owned tensors with dims greater than 5 to
-// exist, but there are certain manipulations we do when simplifying matrix
+// In practice we don't expect user-owned tensors with num-axes greater than 5
+// to exist, but there are certain manipulations we do when simplifying matrix
 // multiplications that temporarily add an extra dimension, and it's most
 // convenient to just increase the maximum.
-#define KALDI_TENSOR_MAX_DIM 6
+#define KALDI_TENSOR_MAX_AXES 6
 
 
 }  // namespace tensor
diff --git a/src/tensor/tensor-pattern-extra-utils.cc b/src/tensor/tensor-pattern-extra-utils.cc
index 57e4449b14c..daa7914b1aa 100644
--- a/src/tensor/tensor-pattern-extra-utils.cc
+++ b/src/tensor/tensor-pattern-extra-utils.cc
@@ -41,10 +41,10 @@ class IntersectionComputer {
                          will, if this function returns true,
                          represent the intersection between the pattern1 and
                          pattern2 passed to the constructor.  These patterns
-                         will be valid without a code, but won't be
-                         in canonical form (the user can do that themselves;
-                         we don't it here because in most cases the caller will
-                         only care whether the union is empty or not).
+                         will be valid, but won't be in canonical form (the user
+                         can do that themselves; we don't do it here because
+                         in most cases the caller will only care whether the
+                         union is empty or not).
   */
   bool ComputeIntersection(std::vector<TensorPattern> *patterns_out) {
     CanonicalizePattern(&pattern1_);
@@ -74,37 +74,6 @@ class IntersectionComputer {
   bool FindCommonStrides(std::vector<int32> *axes);
 
 
-  /**
-    This function converts a pattern 'pattern' in canonical form to a list of Patterns
-    whose union (viewed as memory-index-sets) is equivalent to 'pattern'
-    where the strides of the output patterns are equal to the provided 'common_strides'
-    vector.
-
-    This function requires that the actual strides in 'pattern' all be present in
-    the list 'common_strides'; that the elements of 'common_strides' be positive
-    and sorted from smallest to greatest; and that each element in
-    'common_strides' divide the next element exactly.
-
-
-       @param [in] pattern  Input pattern in canonical form, valid except for
-                         code.
-       @param [in] common_strides   A sorted list of integers >0, with the
-                         property that each element must divide the next
-                         element exactly, and also that each stride in
-                         'pattern' must be present in 'common_strides'.
-       @param [out] patterns   This will be set to a nonempty list of patterns
-                         whose union (viewed as a memory-index-set) equals
-                         'pattern', and whose strides are equal to
-                         'common_strides'.  The patterns in `*patterns` at
-                         output will be valid except for the code and for
-                         property (iv) (search Valid Pattern in
-                         tensor-pattern.h): that is, it will have nonzero
-                         strides for axes with dim != 1.
-  */
-  static void ConvertToCommonStrides(const TensorPattern &pattern,
-                                     const std::vector<int32> &common_strides,
-                                     std::vector<TensorPattern> *patterns);
-
   /**
      Computes the intersection between pattern1 and pattern2, which must have
      identical axes and strides, and must be valid *except* for property (iv),
@@ -161,68 +130,6 @@ class IntersectionComputer {
                                   std::vector<TensorPattern> *patterns_out);
 
 
-  /**
-     This function, called by ConvertToCommonStrides() converts a pattern in
-     canonical form to a Pattern whose strides are equal to the
-     provided 'common_strides' vector, and which is valid *except for*
-     the axis-sorting (property (vi) of a valid Pattern) and
-     for property (iv), that strides must be nonzero for axes
-     with dim != 1.
-
-         @param [in] pattern_in  The input pattern; must be valid and
-                                 in canonical form.
-         @param [in] common_strides  The list of strides.  Must be sorted,
-                                 have the property that each element
-                                 divides the next element, and all
-                                 strides in pattern_in must be present
-                                 in this list.
-         @param [out] pattern_out   The output pattern.  Will be equivalent
-                                 to pattern_in in terms of memory-index-set,
-                                 its strides will be equal to 'common_strides'
-                                 (including the order), and it will be valid
-                                 except for properties (iv) and (vi), as
-                                 mentioned above.
-  */
-  static void ConvertLazilyToCommonStrides(const TensorPattern &pattern_in,
-                                           const std::vector<int32> &common_strides,
-                                           TensorPattern* pattern_out);
-
-  /**
-     This function makes sure that the axis-sorting property in 'pattern'
-     holds for the axis numbered 'raxis' (in the private numbering, of
-     course).  I.e. it ensures that:
-
-       `pattern->strides[raxis+1] >= pattern->strides[raxis] * pattern->dims[raxis]`
-
-     If it does not already have this property, this function ensures that it
-     does have it by modifying its dims for raxis and raxis + 1, and if necessary,
-     moving part of the pattern to 'extra_pattern'.  This will be necessary if the
-     value of `pattern->dims[raxis]` at entry is not a multiple of
-     `pattern->strides[raxis+1] / pattern->strides[raxis]`.
-
-         @param [in]      raxis    The axis on which we are doing the check
-         @param [in,out]  pattern  The input pattern, valid except for properties
-                                (iv) and (vi).  Its strides must be in
-                                increasing order (in the private numbering) and
-                                each must divide the next.
-         @param [out]     extra_pattern   This function writes to 'extra_pattern' if
-                                and only if it returns true.  See documentation of
-                                return status.
-         @return  Returns true if it wrote to extra_pattern.  If it returns true,
-                  then it guarantees that the union of the memory-index-sets of
-                  'pattern' and 'extra_pattern' at exit are equal to the memory-index-set
-                  of 'pattern' at entry.  If it returns false, then it guarantees
-                  that the memory-index-set of 'pattern' has been unchanged.
-                  In either case it guarantees that property (vi), the axis-sorting
-                  property, holds for axis 'raxis', in 'pattern' and (if applicable)
-                  in `extra_pattern`.
-                  The codes of pattern and extra_pattern are not set.
-  */
-  static bool EnsureAxisSortingPropertyHolds(int32 raxis,
-                                             TensorPattern *pattern,
-                                             TensorPattern *extra_pattern);
-
-
 
   // the same as pattern1 passed to the constructor, but reduced to
   // canonical form
@@ -302,17 +209,67 @@ bool IntersectionComputer::EnsureAxisSortingPropertyHolds(
 }
 
 
-void IntersectionComputer::ConvertLazilyToCommonStrides(
+// Returns true if 'pattern' is regular; see declaration in header.
+bool IsRegular(const TensorPattern &pattern) {
+  int32 num_axes = pattern.num_axes;
+  // For each axis i (except the last), scan the later axes j in order.
+  for (int32 i = 0; i + 1 < num_axes; i++) {
+    int32 this_stride = pattern.strides[i],
+        this_dim = pattern.dims[i],
+        this_prod = this_stride * this_dim;
+    for (int32 j = i + 1; j < num_axes; j++) {
+      if (pattern.strides[j] >= this_prod) {
+        // in this case, 'j' would be the 'k' value used in the proof.  If we
+        // fall off this loop, it would correspond to k == num_axes, which is
+        // also OK.
+        break;
+      } else if (pattern.dims[j] != 1 ||
+                 pattern.strides[j] % this_stride != 0) {
+        return false;  // axis j has dim != 1, or stride not a multiple of i's.
+      }
+    }
+  }
+  return true;
+}
+
+
+/**
+   This function, called by ConvertPatternStrides(), is not declared in the
+   header.  It converts a pattern in canonical form to a Pattern whose strides
+   are equal to the provided 'strides' vector, and which is valid--,
+   satisfies the uniqueness property, and has positive and increasing strides.
+
+       @param [in] pattern_in  The input pattern; must be valid and
+                               in canonical form.
+       @param [in] strides     The list of strides which we want
+                               'pattern_out' to have.  Must be a list of
+                               positive integers sorted from least to
+                               greatest with size <= KALDI_TENSOR_MAX_AXES,
+                               and all strides in pattern_in must
+                               be present in this list.
+       @param [out] pattern_out  The output pattern (must not point to
+                               pattern_in).  On exit its memory-index-set will
+                               equal that of pattern_in; its strides will be
+                               equal to 'strides' (including the order, when
+                               numbered in the private numbering); and it will
+                               be valid-- and satisfy the uniqueness property.
+*/
+static void ConvertPatternStridesLazily(
     const TensorPattern &pattern_in,
-    const std::vector<int32> &common_strides,
+    const std::vector<int32> &strides,
     TensorPattern* pattern_out) {
+  KALDI_PARANOID_ASSERT(IsCanonical(pattern_in));
   int32 num_axes_in = pattern_in.num_axes,
-      num_axes_out = common_strides.size();
+      num_axes_out = strides.size();
   pattern_out->num_axes = num_axes_out;
+  pattern_out->code = -1;
   int32 raxis_in = 0;
   pattern_out->offset = pattern_in->offset;
+  // The following code relies on pattern_in being in canonical form
+  // (so its strides are in sorted order), and all of its strides being
+  // present in the list 'strides'.
   for (int32 raxis_out = 0; raxis_out < num_axes_out; raxis_out++) {
-    int32 stride = common_strides[raxis_out];
+    int32 stride = strides[raxis_out];
     pattern_out->strides[raxis_out] = stride;
     if (pattern_in.strides[raxis_in] == stride) {
       pattern_out->dims[raxis_out] = pattern_in.dims[raxis_in];
@@ -322,28 +279,135 @@ void IntersectionComputer::ConvertLazilyToCommonStrides(
     }
   }
   if (raxis_in != num_axes_in) {
-    KALDI_ERR << "Something went wrong converting strides (likely code error)";
+    KALDI_ERR << "Something went wrong converting strides; trying to "
+        "convert pattern with strides = " << StridesAsString(pattern_in)
+              << " to strides " << ArrayAsString(strides);
+  }
+}
+
+
+
+/**
+   This function, not declared in the header, attempts to ensure that the axis-sorting
+   property in a provided Pattern holds for the axis-index 'raxis' (in the private
+   numbering, of course).  I.e. it ensures (for the pattern we are to modify) that:
+
+      `pattern->strides[raxis+1] >= pattern->strides[raxis] * pattern->dims[raxis]`.
+
+   This function expects that the pattern will also satisfy that property for
+   all axis-indexes `0 <= i < raxis`, and will be valid--.  This function will
+   always succeed if the pattern is regular (see IsRegular(), and "Regularity
+   property" in the glossary).
+
+   Ensuring this property exists may sometimes require splitting this Pattern up
+   (i.e. adding extra Patterns); the union of their memory-index-sets together
+   with that of the modified pattern will equal the memory-index-set of the
+   original pattern at input (these sets being unioned will be disjoint).  Any
+   newly created Patterns will be appended to the vector 'patterns'.
+
+    @param [in]      raxis    The axis for which we are ensuring that the
+                             axis-sorting property holds.
+    @param [in]      pattern_index  The index in the vector 'patterns'
+                             of the pattern for which we are ensuring that
+                             the axis-sorting property holds.
+    @param [in,out]  patterns  The vector of patterns in which to look for the
+                             pattern to operate on; we may also append
+                             Patterns to this vector if needed, as mentioned
+                             above.  Note: the newly added patterns may not satisfy
+                             the axis-sorting property for 'raxis', but they will
+                             still satisfy it for all axes numbered less than
+                             'raxis', assuming the pattern at 'pattern_index'
+                             did at entry.
+
+    @return                  Returns true on success, false on failure.
+                             Will always return true if `(*patterns)[pattern_index]`,
+                             satisfied the 'regularity property' at entry;
+                             see IsRegular().
+ */
+static bool EnsureAxisSortingPropertyHolds(
+    int32 raxis,
+    int32 pattern_index,
+    std::vector<TensorPattern> *patterns) {
+  TensorPattern *pattern = &((*patterns)[pattern_index]);
+  // We use 'i' as the internal name for 'raxis', because we want to mirror the
+  // notation used for the regularity property in the glossary, and in the
+  // function IsRegular() that checks for it.  There is an index k with `i < k
+  // <= num_axes`, that appears in the definition of the regularity property.
+  // The algorithm used here iteratively decreases the value of k until it
+  // equals i + 1, adding new patterns as needed, at which point the
+  // axis-sorting property will hold for index i.
+  int32 i = raxis, num_axes = pattern->num_axes;
+  int32 this_stride = pattern->strides[i],
+      this_dim = pattern->dims[i],
+      this_prod = this_stride * this_dim;
+  if (this_dim == 1)  // This is a small optimization for a common case.
+    return true;
+  KALDI_PARANOID_ASSERT(raxis + 1 < num_axes && this_stride > 0 &&
+                        IsValidMM(*pattern));
+  int32 j, k = num_axes;
+  for (j = i + 1; j < num_axes; j++) {
+    if (pattern->strides[j] >= this_prod) {
+      k = j;
+      break;  // regularity property is OK as far as this 'i' is concerned.
+    } else if (pattern->dims[j] != 1 ||
+               pattern->strides[j] % this_stride != 0) {
+      return false;  // Pattern was not regular (same test as in IsRegular()).
+    }
+  }
+  for (j = k - 1; j > i; j--) {
+    int32 j_stride = pattern->strides[j],
+        stride_ratio = j_stride / this_stride;  // will divide exactly; we
+                                                     // checked above.
+    KALDI_PARANOID_ASSERT(j_stride % this_stride == 0);
+
+    // We can prove that j_dim will always be at least 1; if this is the
+    // first time round the loop this is easy to show (else k would be smaller);
+    // otherwise we can use the fact that the strides for axes i, i+1 .. k-1 are
+    // strictly increasing and all multiples of this_stride (hence stride_ratio
+    // strictly increases from one j to the next).
+    int32 j_dim = this_dim / stride_ratio,
+        remainder = this_dim % stride_ratio;
+
+    if (remainder != 0) {
+      patterns->resize(patterns->size() + 1);
+      pattern = &((*patterns)[pattern_index]);  // in case it was reallocated.
+      TensorPattern *remainder_pattern = &(patterns->back());
+      *remainder_pattern = *pattern;
+      remainder_pattern->dims[i] = remainder;
+      remainder_pattern->offset += j_stride * j_dim;
+    }
+    pattern->dims[j] = j_dim;
+    pattern->dims[i] = stride_ratio;
+    this_dim = stride_ratio;  // keep in sync with dims[i] for the next j.
+    this_prod = j_stride;
   }
+  return true;
 }
 
 
-void IntersectionComputer::ConvertToCommonStrides(
+bool ConvertPatternStrides(  // See declaration in header for documentation.
     const TensorPattern &pattern,
-    const std::vector<int32> &common_strides,
-    std::vector<TensorPattern*> *patterns) {
+    const ArrayRef<int32> strides,
+    std::vector<TensorPattern> *patterns) {
 
   patterns->resize(1);
-  ConvertLazilyToCommonStrides(pattern, &((*patterns)[0]));
-  int32 num_axes = common_strides.size();
+  ConvertPatternStridesLazily(pattern, strides, &((*patterns)[0]));
+  int32 num_axes = strides.size();
   for (int32 raxis = 0; raxis + 1 < num_axes; raxis++) {
-    TensorPattern extra_pattern;
-    int32 num_patterns = patterns->size();
-    for (int32 p = 0; p < num_patterns; p++) {
-      if (EnsureAxisSortingPropertyHolds(raxis, &((*patterns)[p]),
-                                         &extra_pattern))
-        patterns->push_back(extra_pattern);
+    for (int32 p = 0; p < static_cast<int32>(patterns->size()); p++) {
+      if (!EnsureAxisSortingPropertyHolds(raxis, p, patterns)){
+        patterns->clear();
+        return false;  // Couldn't be converted, because 'pattern' was not
+                       // regular.  'patterns' is left empty on failure.
+      }
     }
   }
+#ifdef KALDI_PARANOID
+  for (int32 p = 0; p < static_cast<int32>(patterns->size()); p++) {
+    KALDI_PARANOID_ASSERT(IsValidM((*patterns)[p]));
+  }
+#endif
+  return true;
 }
 
 
diff --git a/src/tensor/tensor-pattern-extra-utils.h b/src/tensor/tensor-pattern-extra-utils.h
index 6088ad0a09f..7c1206c2d02 100644
--- a/src/tensor/tensor-pattern-extra-utils.h
+++ b/src/tensor/tensor-pattern-extra-utils.h
@@ -139,8 +139,74 @@ bool ToMemoryIndexTupleSet(const ArrayRef<TensorPattern*>  patterns,
    that their memory-index-tuple-sets are the same.  See glossary
    in tensor-pattern.h for explanation.
  */
-bool PatternTuplesEquivalent(const ArrayRef<const TensorPattern*> &patterns1,
-                             const ArrayRef<const TensorPattern*> &patterns2);
+bool PatternTuplesEquivalent(const ArrayRef<const TensorPattern*> patterns1,
+                             const ArrayRef<const TensorPattern*> patterns2);
+
+/**
+   This function returns true if a Pattern is regular (see Regularity property
+   in the glossary in tensor-pattern.h) and false otherwise.  'pattern' must
+   have all positive strides, the strides must be in increasing order (in the
+   private numbering), and it must be valid-- (see glossary).
+ */
+bool IsRegular(const TensorPattern &pattern);
+
+
+/**
+   This function returns true if a Pattern is valid- (see definition in
+   glossary); see also TensorPattern::Valid() and IsValidMM().
+ */
+bool IsValidM(const TensorPattern &pattern);
+
+/**
+   This function returns true if a Pattern is valid-- (see definition in
+   glossary); see also TensorPattern::Valid() and IsValidM().
+ */
+bool IsValidMM(const TensorPattern &pattern);
+
+
+/**
+   This function attempts to convert a pattern 'pattern' in canonical form
+   (c.f. "Canonical form" in glossary, and CanonicalizePattern()) to a list of
+   Patterns (see documentation of `patterns` below for note on their possible
+   non-validity), whose strides (in the private numbering) are equal to the
+   provided 'strides' vector, the union of whose memory-index-sets (which will
+   all be disjoint) is equal to the memory-index-set of the input Pattern, and
+   which are all linear in `pattern` (c.f. documentation of "Linear property").
+
+   This function is not guaranteed to always succeed (return true) but it will
+   always succeed when people are doing "reasonable" things with Tensors.  It
+   will always succeed if each element in 'strides' divides the next element
+   exactly, although this is not a necessary condition for success.
+
+       @param [in] pattern  A valid Pattern in canonical form
+       @param [in] strides   A list of positive integers, sorted from
+                        smallest to greatest; it must contain all strides in
+                        `pattern`.
+       @param [out] patterns  On success (see documentation of return status)
+                        'patterns' will be set to a nonempty list of patterns,
+                        the union of whose memory-index-sets equals
+                        the memory-index-set of `pattern`, all of whose strides are
+                        equal to `strides`, and each of which is valid- and linear
+                        in `pattern` (see "Linear property").
+
+                        except for property (iv) (search for "Valid
+                        Pattern" in tensor-pattern.h): that is, they may have
+                        nonzero strides for axes with dim == 1.  Each elements
+                        of 'strides' dividing the next is a sufficient but not
+                        necessary condition for this function to always return
+                        true.
+                          On failure, `patterns->empty()` will be true.
+
+        @return         Returns true if pattern strides could be converted using
+                        our algorithm, false if not.  This algorithm will work
+                        for any 'reasonable' request; it doesn't attempt to
+                        cover the types of cases where, to solve them, we would
+                        have to output a number of patterns that couldn't be
+                        bounded given the number of axes.
+  */
+bool ConvertPatternStrides(const TensorPattern &pattern,
+                           const ArrayRef<int32> strides,
+                           std::vector<TensorPattern> *patterns);
 
 
 /**
diff --git a/src/tensor/tensor-pattern-utils.h b/src/tensor/tensor-pattern-utils.h
index ee80fbabf23..af3c7370a23 100644
--- a/src/tensor/tensor-pattern-utils.h
+++ b/src/tensor/tensor-pattern-utils.h
@@ -460,6 +460,13 @@ inline void CanonicalizePattern(TensorPattern *pattern) {
   SortAxes(pattern);
 }
 
+/**
+   This function checks that 'pattern' is valid and in canonical form (see
+   glossary for the meaning).  CanonicalizePattern() will modify a valid pattern
+   to put it in canonical form.
+ */
+bool IsCanonical(const TensorPattern &pattern);
+
 
 /**
    This version of SortAxes() sorts the axes in 'patterns' (which must be
@@ -808,9 +815,9 @@ class TensorPatternRebaser {
   // The 'offset' value of dest_pattern_compressed
   int64 dest_offset_;
 
-  // num_axes_ is the number of axes, not in the original src_pattern /
-  // dest_pattern but after the two patterns have been jointly compressed and
-  // then sorted from smallest to greatest stride in src_pattern.
+  // num_axes_ is the number of axes, not in the original src_pattern and
+  // dest_pattern, but after the two patterns have been jointly compressed and
+  // then sorted from the smallest to greatest stride in src_pattern.
   // src_strides_ are the resulting strides from src_pattern_compressed, and
   // dest_strides_ are the resulting strides from dest_pattern_compressed.
 
@@ -830,7 +837,13 @@ class TensorPatternRebaser {
   int32 src_strides_[KALDI_TENSOR_MAX_DIM];
   int32 dest_strides_[KALDI_TENSOR_MAX_DIM];
 
+
+
   // The basic algorithm in Convert() is:
+  //
+
+  //
+  //
   //  First, add offset_ to its offset.
   //   Then:
   //     For each nontrivial axis of 'pattern', we are going to modify
diff --git a/src/tensor/tensor-pattern.h b/src/tensor/tensor-pattern.h
index dfc3e714a8c..906845a0fa0 100644
--- a/src/tensor/tensor-pattern.h
+++ b/src/tensor/tensor-pattern.h
@@ -52,7 +52,16 @@ namespace tensor {
                       that identifies an axis in the public numbering (see "Public numbering").
                       See also: Raxis-index.
 
-    Axis-sorting property: search below for [Valid Pattern], point (vi).
+    Axis-sorting property: search below for [Valid Pattern], point (vi), for the main
+                      definition.
+          [Axis-sorting property of an axis-index]:
+                      There is another sense in which we use the term
+                      'axis-sorting property': for a Pattern whose axes are sorted
+                      from least to greatest abs(stride) [in the private numbering],
+                      we say that "the axis-sorting property holds for axis-index i
+                      of that Pattern" if:
+                                 dim(i) * abs(stride(i)) <= abs(stride(i+1)).
+
 
     Broadcasting:     A convention whereby for an operation on Tensors that would
                       normally be required to have the same dimension, it's
@@ -76,13 +85,14 @@ namespace tensor {
                      are broadcastable because 4 == 4 and in the remaining axis,
                      one of the dimensions is 1.
 
-    Canonical form:  A TensorPattern is in canonical form if all axes that could be combined
-                     (without affecting its memory-index-set, obviously) have been
-                     combined, there are no trivial axes, all strides are positive,
-                     and the axes are sorted in increasing order of stride.
-                     (Note: this is in the private numbering; in the public numbering
-                     this means decreasing order of stride, which is consistent
-                     with "C" strides).  See CanonicalizePattern().
+    Canonical form:  A TensorPattern is in canonical form if all pairs of axes that
+                     could be combined (without affecting its memory-index-set)
+                     have been combined, there are no trivial axes, all
+                     strides are positive, and the axes are sorted in increasing
+                     order of stride.  (Note: this is in the private numbering;
+                     in the public numbering this means decreasing order of
+                     stride, which is consistent with "C" strides).  See
+                     CanonicalizePattern().
 
     Contiguous:      A Pattern is contiguous if its memory-index-set forms a contiguous
                      range of integers (no gaps).  This is different from the PyTorch
@@ -174,7 +184,7 @@ namespace tensor {
                       (see struct TensorPattern).  The Pattern has
                       an 'offset' which is the memory-index of the element of the Tensor
                       whose index-tuple is all zeros; the Pattern also
-                      has a number of axes, `0 <= num_axes < KALDI_TENSOR_MAX_DIM`,
+                      has a number of axes, `0 <= num_axes < KALDI_TENSOR_MAX_AXES`,
                       and for each axis from 0 <= axis < num_axes, it has a dimension
                       dim(axis) and stride(axis).
 
@@ -218,6 +228,9 @@ namespace tensor {
                       numbers of axes we may often use larger raxis values for the Tensor
                       of smaller num_axes (see PyTorch-style broadcasting).
 
+    Set-equivalent:   Two Patterns are set-equivalent if their memory-index-sets
+                      are identical.
+
     Trivial axis:     An axis of a Pattern for which dim=1 and stride=0.
 
     Memory-index-set of a Pattern:
@@ -232,6 +245,52 @@ namespace tensor {
                       index-tuple-set of the Pattern-tuple.  See "memory-index-tuple"
                       and "index-tuple-set of a Pattern-tuple" for more information.
 
+    Linear property:
+                      Consider Patterns P and Q with the property that the
+                      memory-index-set of P is a subset of the memory-index-set of
+                      Q.  If i is an index-tuple, let P(i) be the map from
+                      i to a memory-index, and let
+                            \f$   Q^{-1}(m)   \f$
+                      be the function that maps a memory-index m in the memory-index-set
+                      of Q to the index-tuple i in the index-tuple-set of Q such
+                      that Q(i) = m.  Then we say that P is linear in Q if
+                      for all index-tuples i and j such that i, j and i + j are
+                      in the index-tuple-set of P,
+                      \f$  Q^{-1}(P(i)) + Q^{-1}(P(j)) = Q^{-1}(P(i+j)) \f$.
+                      [Transitivity]
+                      It is easy to show that the linear property is transitive;
+                      that is if P is linear in Q and Q is linear in R, then
+                      P is linear in R.
+
+    Regularity property:   This is a property of Patterns that is relevant when reducing
+                      Patterns to a common set of strides.
+
+                      We formulate the regularity property to only apply to
+                      Patterns which are valid-- and which have positive strides
+                      in increasing order; the stipulation on having positive,
+                      sorted strides is for convenience, since we happen to need
+                      it only for that case and it's easier to formulate there.
+
+                      For the regularity property to apply, a Pattern must also
+                      be valid-- (see its own glossary entry).
+
+                      A Pattern is regular if, in addition to satisfying the
+                      properties mentioned above, for each axis-index
+                      0 <= i < num_axes - 1,
+                      there is an integer k with i < k <= num_axes, such that:
+                        (i) Either k == num_axes, or dim(i) * stride(i) <= stride(k),
+                      and
+                        (ii) For all j with i < j < k, stride(i) divides stride(j)
+                            exactly and dim(j) = 1.
+                      [Note: the condition that dim(j) == 1 will anyway be true if
+                      the Pattern has the uniqueness property.]
+
+                      The reader may notice that if we were to restrict
+                      k to equal i + 1, then
+                      this would be equivalent to the axis-sorting property
+                      (property (v)) plus the requirement that the strides be
+                      positive and sorted.
+
     Stride:           A stride is the distance, in elements, between successive
                       elements of a Tensor along a particular dimension.
                       For example, a Tensor with one axis having dim=3 and
@@ -245,6 +304,12 @@ namespace tensor {
                       necessary (since most BLAS implementations do not support
                       negative stride).
 
+   Uniqueness property:  A property of a Pattern that no two different index-tuples,
+                      when used to index the Pattern, generate the same memory-index.
+                      The axis-sorting property is sufficient, but not necessary,
+                      to ensure the uniqueness property.  (The uniqueness property
+                      is probably not so easy to test for efficiently in the general
+                      case; at least, we have not found a way).
 
     Valid Pattern:
                      A valid Pattern must be as follows.  Think of this as the mathematical definition;
@@ -254,28 +319,25 @@ namespace tensor {
                           (i) The num_axes must satisfy 0 <= num_axes < KALDI_TENSOR_MAX_DIM
                           (ii) The offset must be >= 0.
                           (iii) the dims must all be >0.
-                          (iv) the strides must be zero for axes with dim=1
-                          (v) the strides must be nonzero (but not necessarily positive) for axes with
+                          (iv) the strides must be nonzero (but not necessarily positive) for axes with
                                 dim != 1.
-                          (vi) the axis-sorting property.   This property assures that no memory-index
-                              can be accessed via two different index-tuples, and is sufficient
-                              but not necessary toensure the Uniqueness Property (see its own entry).
-                              This property requires that if the axes are sorted from least to greatest
-                              value of abs(stride),
-                              for each axis i < num_axes - 1:
-                                    dim(i) * stride(i) <= stride(i+1).
-
-   Valid+ Pattern:  a Pattern which is valid and also has its code set.  See declaration for
-                    struct TensorPattern.  This is not a mathematical type of definition, more
-                    of a code-level definition, but since we frequently need this notion,
-                    we give it its own name.
-
-   Uniqueness property:  A property of a Pattern that no two different index-tuples,
-                      when used to index the Pattern, generate the same memory-index.
-                      The axis-sorting property is sufficient, but not necessary,
-                      to ensure the uniqueness property.  (The uniqueness property
-                      is probably not so easy to test for efficiently in the general
-                      case; at least, we have not found a way).
+                          (v) the axis-sorting property.   This property is sufficient, but not
+                              necessary, to ensure the uniqueness property.  It requires that
+                              when the axes are sorted from least to greatest value of abs(stride),
+                              for each axis-index 0 <= i < num_axes - 1:
+                                    dim(i) * abs(stride(i)) <= abs(stride(i+1)).
+                              (Note: this property doesn't require that the axes be sorted that
+                              way; if you need that, search for "Canonical form").
+                          (vi) the strides must be zero for axes with dim=1.
+
+
+     Valid- Pattern:
+                      A Pattern is valid- if it satisfies properties (i) through (v) of
+                      a valid Pattern (i.e. it may have nonzero strides for axes with dim=1).
+                      A valid pattern is also valid-.
+     Valid-- Pattern:
+                      A Pattern is valid-- if it satisfies properties (i) through (iv) of
+                      a valid Pattern.  A pattern that is valid or valid- is also valid--.
  */
 
 
@@ -312,7 +374,7 @@ namespace tensor {
 
     offset >= 0
 
-    The axis-sorting property (see property (vi) in "Valid Pattern" above)
+    The axis-sorting property (see property (v) in "Valid Pattern" above)
 
   Note: in the public interface of class Tensor, if you ask for Dim(i) it will
   return pattern.dims[pattern.num_axes - i], i.e. the interface uses the public
@@ -341,6 +403,7 @@ struct TensorPattern {
   // namely: dims and strides with index >= num_axes should be
   // 1 and 0 respectively; and the code should either be -1 or or
   // be the same as ComputePatternCode() returns on this pattern.
+  // See also IsCanonical() in tensor-pattern-utils.h.
   bool IsValid();
 
   // This comparator induces a total ordering on valid TensorPatterns.  It is a
@@ -351,6 +414,21 @@ struct TensorPattern {
 };
 
 
+/// Returns a string representing a Pattern, of the form:
+/// "offset=a dims=[b c d] strides=[e f g]"; this is for debugging
+/// purposes.
+std::string PatternAsString(const TensorPattern &pattern);
+
+/// Returns a string representing the dims of a Pattern, something like
+/// "[10 20 100]"
+std::string DimsAsString(const TensorPattern &pattern);
+
+/// Returns a string representing the strides of a Pattern, something like
+/// "[1 10 200]"
+std::string StridesAsString(const TensorPattern &pattern);
+
+
+
 // We may later get rid of this struct and just have functions to get
 // these properties.
 struct TensorPatternProperties {

From 99873c635a50bccae908b1609df38d082100555e Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Mon, 22 Apr 2019 18:42:19 -0400
Subject: [PATCH 023/163] [src] Further progress

---
 src/tensor/tensor-pattern-extra-utils-inl.h |  66 +++
 src/tensor/tensor-pattern-extra-utils.cc    | 575 ++++++++++++--------
 src/tensor/tensor-pattern-extra-utils.h     | 145 ++++-
 src/tensor/tensor-pattern-utils.cc          |  42 +-
 src/tensor/tensor-pattern-utils.h           | 283 +++-------
 src/tensor/tensor-pattern.h                 |  92 ++--
 6 files changed, 700 insertions(+), 503 deletions(-)
 create mode 100644 src/tensor/tensor-pattern-extra-utils-inl.h

diff --git a/src/tensor/tensor-pattern-extra-utils-inl.h b/src/tensor/tensor-pattern-extra-utils-inl.h
new file mode 100644
index 00000000000..78c40d13ea2
--- /dev/null
+++ b/src/tensor/tensor-pattern-extra-utils-inl.h
@@ -0,0 +1,66 @@
+// tensor/tensor-pattern-extra-utils-inl.h
+
+//  Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_TENSOR_PATTERN_EXTRA_UTILS_INL_H_
+#define KALDI_TENSOR_TENSOR_PATTERN_EXTRA_UTILS_INL_H_ 1
+
+// This file is only to be included by tensor-pattern-extra-utils.h; do not include it
+// directly.
+
+
+
+namespace kaldi {
+namespace tensor {
+
+// Sets *min_mindex / *max_mindex to the min / max over all index-tuples of
+// sum_i index[i] * strides[i].  NOTE(review): pattern->offset is not added.
+inline void ComputeMinAndMaxMindex(const TensorPattern *pattern,
+                                   int64 *min_mindex,
+                                   int64 *max_mindex) {
+  KALDI_PARANOID_ASSERT(IsValid(*pattern));
+  int32 num_axes = pattern->num_axes;
+  if (ContainsNegativeStride(pattern->code)) {
+    // Read the test above as "pattern->code is -1 or indicates a negative
+    // stride"; i.e. here `pattern` *might* contain a negative stride.
+    int64 min_mindex_sum = 0, max_mindex_sum = 0;
+    for (int32 raxis = 0; raxis < num_axes; raxis++) {
+      int64 prod = (pattern->dims[raxis] - 1) *
+          static_cast<int64>(pattern->strides[raxis]);
+      if (pattern->strides[raxis] > 0) max_mindex_sum += prod;
+      else min_mindex_sum += prod;
+    }
+    *min_mindex = min_mindex_sum;
+    *max_mindex = max_mindex_sum;
+  } else {
+    // Faster branch of the code that can assume all strides are positive.
+    *min_mindex = 0;
+    int64 max_mindex_sum = 0;
+    for (int32 raxis = 0; raxis < num_axes; raxis++)
+      max_mindex_sum += (pattern->dims[raxis] - 1) *
+          static_cast<int64>(pattern->strides[raxis]);
+    *max_mindex = max_mindex_sum;
+  }
+}
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_TENSOR_PATTERN_EXTRA_UTILS_INL_H_
diff --git a/src/tensor/tensor-pattern-extra-utils.cc b/src/tensor/tensor-pattern-extra-utils.cc
index daa7914b1aa..c31794eee22 100644
--- a/src/tensor/tensor-pattern-extra-utils.cc
+++ b/src/tensor/tensor-pattern-extra-utils.cc
@@ -24,189 +24,34 @@ namespace kaldi {
 namespace tensor {
 
 
-class IntersectionComputer {
- public:
-  IntersectionComputer(const TensorPattern &pattern1,
-                       const TensorPattern &pattern2):
-      pattern1_(pattern1), pattern2_(pattern2);
-
-  /**
-     Computes the intersection between the pattern1 and pattern2 given to the
-     constructor (must be called only once); if it could be computed, the
-     intersection is represented as the union between all the (disjoint)
-     patterns in patterns_out.
-
-        @param [in] patterns_out  A list of patterns (in arbitrary order)
-                         is written to here.  The union of this list of patterns
-                         will, if this function returns true,
-                         represent the intersection between the pattern1 and
-                         pattern2 passed to the constructor.  These patterns
-                         will be valid, but won't be in canonical form (the user
-                         can do that themselves; we don't it here because in
-                         most cases the caller will only care whether the union
-                         is empty or not).
-  */
-  bool ComputeIntersection(std::vector<TensorPattern> *patterns_out) {
-    CanonicalizePattern(&pattern1_);
-    CanonicalizePattern(&pattern2_);
-    std::vector<int32> axes;
-    if (!FindCommonStrides(&axes))
-      return false;
-    std::vector<TensorPattern> patterns1, patterns2;
-    patterns1.reserve(8);
-    patterns2.reserve(8);
-    ConvertToCommonStrides(pattern1_, &patterns1);
-    ConvertToCommonStrides(pattern2_, &patterns2);
-    patterns_out->clear();
-    ComputeIntersection(patterns1, patterns2, patterns_out);
-    return true;
-  }
 
- private:
-
-  // Attempts to find a common list of strides which can be used for the
-  // combined patterns.  Returns false if this cannot be done.  This is done by
-  // taking the union of the strides in pattern1_ and pattern2_, sorting them,
-  // and then checking that each stride in the sequence divides the next (it
-  // returns true if this is the case, false otherwise).
-  // These strides must all be positive because pattern1_ and pattern2_ have
-  // both been canonicalized.
-  bool FindCommonStrides(std::vector<int32> *axes);
-
-
-  /**
-     Computes the intersection between pattern1 and pattern2, which must have
-     identical axes and strides, and must be valid *except* for property (iv),
-     i.e.  it's not required that axes with dim=1 must have stride=0, and the
-     code does not have to be set.
-
-
-        @param [in] pattern1   The first input pattern.  Must be valid
-                               except for property (iv), and must have positive
-                               strides.
-        @param [in] pattern2   The second input pattern.  Must be valid
-                               except for property (iv), and must have
-                               the same strides (in the same order)
-                               as pattern1.
-        @param [out] patterns_out  The output patterns; this function will write
-                               to this location a vector of disjoint patterns
-                               whose union (viewed as a memory-index-set) is
-                               identical to the intersection of pattern1
-                               and pattern2.  The patterns in this vector will
-                               be valid except for property (iv) [i.e. they
-                               won't have zero strides for axes with dim=1], and
-                               they will not have their code set.
-  */
-  static ComputeIntersection(const TensorPattern &pattern1,
-                             const TensorPattern &pattern2,
-                             std::vector<TensorPattern> *patterns_out) {
-    patterns_out->clear();
-    ComputeIntersection(pattern1, pattern2, pattern1.num_axes,
-                        patterns_out);
-  }
 
-  /**
-     In this recursive implementation of ComputeIntersection() [see version
-     above for more information on pattern1, pattern2 and patterns_out], the
-     user guarantees that for all axes with raxis-index `raxis >=
-     identical_raxis`, pattern1 and pattern2 have the same dimension, and
-     it may be assumed that we are only interested in the part of the
-     intersection where the indexes are the same for pattern1 and pattern2,
-     for all raxis >= identical_raxis.
-
-     In this recursion, when we get to 'identical_raxis == 0', it means pattern1
-     and pattern2 have identical dims and strides; and if they also have the
-     same offset, all we need to do is append one of them to 'patterns_out'
-     (otherwise this part of the intersection is empty; but note that this
-     function may in general fork into two branches each time it recurses).
-     This is all part of a process of trying to make the 'offset' identical
-     between the two patterns by discarding some leading dimensions on one of
-     the two patterns.  On raxis-indexes that we have processed, we also make
-     the 'dim' the same by lopping off trailing dimensions.
-  */
-  static bool ComputeIntersection(const TensorPattern &pattern1,
-                                  const TensorPattern &pattern2,
-                                  int32 identical_raxis,
-                                  std::vector<TensorPattern> *patterns_out);
-
-
-
-  // the same as pattern1 passed to the constructor, but reduced to
-  // canonical form
-  TensorPattern pattern1_;
-  // the same as pattern1 passed to the constructor, but reduced to
-  // canonical form
-  TensorPattern pattern2_;
-
-  // patterns1_ is the list of patterns we get when we convert pattern1_
-  // to have the shared list of strides.  Will have at least one element.
-  std::vector<TensorPattern> patterns1_;
-  // patterns2_ is the list of patterns we get when we convert pattern2_
-  // to have the shared list of strides.  Will have at least one element.
-  std::vector<TensorPattern> patterns2_;
-
-  std::vector<TensorPattern> *intersection_;
-};
-
-
-bool IntersectionComputer::FindCommonStrides(std::vector<int32> *axes) {
-  axes->clear();
-  axes->reserve(pattern1_.num_axes + pattern2_.num_axes);
-  for (int32 raxis = 0; raxis < pattern1_.num_axes; raxis++)
-    axes->push_back(pattern1_.strides[raxis]);
-  for (int32 raxis = 0; raxis < pattern2_.num_axes; raxis++)
-    axes->push_back(pattern2_.strides[raxis]);
-  SortAndUniq(axes);  // sort from least to greatest, remove duplicates.
-  int32 prev_stride = (*axes)[0];
-  size_t num_axes = axes->size();
-  for (size_t i = 1; i < num_axes; i++) {
-    int32 cur_stride = (*axes)[i];
-    if (cur_stride % prev_stride != 0)
-      return false;  // prev_stride does not divide cur_stride; our algorithm
-                     // for detecting overlap cannot be used.  This shouldn't
-                     // really happen in "reasonable" uses of Tensors.
-    prev_stride = cur_stride;
-  }
-  return true;
+/**
+   This function, not declared in the header, creates a sorted list of all the
+   stride values which are present in either 'pattern1' or 'pattern2'.  These
+   will all be positive, since pattern1 and pattern2 are required to be in
+   canonical form.
+
+     @param [in] pattern1   First input pattern, must be in canonical form.
+     @param [in] pattern2   Second input pattern, must be in canonical form.
+     @param [out] strides   A sorted list of all stride values that are present
+                            in either pattern1 or pattern2 will be written
+                            to here.  There will be no repeats.
+*/
+static void FindAllStrides(
+    const TensorPattern &pattern1,
+    const TensorPattern &pattern2,
+    std::vector<int32> *strides) {
+  KALDI_PARANOID_ASSERT(IsCanonical(pattern1) && IsCanonical(pattern2));
+  strides->clear();
+  strides->reserve(pattern1.num_axes + pattern2.num_axes);
+  for (int32 raxis = 0; raxis < pattern1.num_axes; raxis++)
+    strides->push_back(pattern1.strides[raxis]);
+  for (int32 raxis = 0; raxis < pattern2.num_axes; raxis++)
+    strides->push_back(pattern2.strides[raxis]);
+  SortAndUniq(strides);  // sort from least to greatest; remove duplicates.
 }
 
-bool IntersectionComputer::EnsureAxisSortingPropertyHolds(
-    int32 raxis, TensorPattern *pattern,
-    TensorPattern *extra_pattern) {
-  KALDI_PARANOID_ASSERT(raxis + 1 < pattern->num_axes);
-  if (pattern->strides[raxis + 1] >=
-      pattern->strides[raxis] * pattern->dims[raxis]) {
-    // Property already holds -> nothing to do.  Return false
-    // because 'extra_pattern' is not needed.
-    return false;
-  }
-
-  // It would not make sense if pattern->dims[raxis + 1] were > 1; that would
-  // imply we started with some kind of self-overlapping pattern, whicg would
-  // not be valid.
-  KALDI_PARANOID_ASSERT(pattern->strides[raxis + 1] %
-                        pattern->strides[raxis] == 0 &&
-                        pattern->dims[raxis + 1] == 1);
-
-  int32 ratio = pattern->strides[raxis + 1] / pattern->strides[raxis];
-  int32 orig_dim = pattern->dims[raxis];
-  pattern->dims[raxis] = ratio;
-  int32 next_dim = orig_dim / ratio;
-  pattern->dims[raxis + 1] = orig_dim;
-
-  int32 remainder = orig_dim % ratio;
-  if (remainder == 0) {
-    // We didn't need to make use of 'extra_pattern', so return false.
-    return false;
-  } else {
-    *extra_pattern = pattern;
-    extra_pattern->dims[raxis] = remainder;
-    extra_pattern->dims[raxis + 1] = 1;
-    extra_pattern->offset += next_dim * pattern->strides[raxis];
-    // we used extra_pattern, so return true.
-    return true;
-  }
-}
 
 
 // See declaration in header.
@@ -236,8 +81,9 @@ bool IsRegular(const TensorPattern &pattern) {
 /**
    This function, called by ConvertPatternStrides(), is not declared in the
    header.  It converts a pattern in canonical form to a Pattern whose strides
-   are equal to the provided 'strides' vector, which is valid--,
-   satisfieds the uniqueness property, and has positive and increasing strides.
+   are equal to the provided 'strides' vector, which is valid-2,
+   satisfies the uniqueness property, and has normalized (i.e.
+   positive and increasing) strides.
 
        @param [in] pattern_in  The input pattern; must be valid and
                                in canonical form.
@@ -251,8 +97,9 @@ bool IsRegular(const TensorPattern &pattern) {
                                pattern_in).  On exit its memory-index-set will
                                equal that of pattern_in; its strides will be
                                equal to 'strides' (including the order, when
-                               numbered in the private numbering); and it will
-                               be valid-- and satisfy the uniqueness property.
+                               numbered in the private numbering); it will
+                               be valid-2 and satisfy the uniqueness property;
+                               and it will be linear in pattern_in.
 */
 static void ConvertPatternStridesLazily(
     const TensorPattern &pattern_in,
@@ -385,11 +232,11 @@ static bool EnsureAxisSortingPropertyHolds(
 }
 
 
-void ConvertPatternStrides(
+// see declaration in header for documentation.
+bool ConvertPatternStrides(
     const TensorPattern &pattern,
     const ArrayRef<int32> &strides,
     std::vector<TensorPattern*> *patterns) {
-
   patterns->resize(1);
   ConvertPatternStridesLazily(pattern, &((*patterns)[0]));
   int32 num_axes = strides.size();
@@ -403,24 +250,71 @@ void ConvertPatternStrides(
     }
   }
 #ifdef KALDI_PARANOID
-  for (int32 p = 0; p < static_cast<int32>(patterns->size()); p++) {
-    KALDI_PARANOID_ASSERT(IsValidM(*patterns)[p]);
+  {
+    int64 num_elements = NumElements(pattern),
+        num_elements_check = 0;
+    for (int32 p = 0; p < static_cast<int32>(patterns->size()); p++) {
+      KALDI_ASSERT(IsValidM(*patterns)[p]);
+      num_elements_check += NumElements((*patterns)[p]);
+    }
+    KALDI_ASSERT(num_elements == num_elements_check);
   }
 #endif
   return true;
 }
 
-
-// see declaration for documentation.
-void IntersectionComputer::ComputeIntersection(
-    const TensorPattern &pattern1,
-    const TensorPattern &pattern2,
-    int32 identical_raxis,
-    std::vector<TensorPattern> *patterns_out) {
+/**
+   This recursive function is used to compute the intersection between
+   pattern1 and pattern2, which must have identical num_axes and strides,
+   must have normalized strides, and must be valid-1.  The user would call
+   this with identical_raxis == pattern1.num_axes, and the recursion on
+   identical_raxis takes care of the actual implementation.
+
+
+        @param [in] pattern1   The first input pattern.  Must be valid-1 and
+                               have normalized strides.
+        @param [in] pattern2   The second input pattern.  Must be valid-1 and
+                               have the same num_axes and strides as pattern1.
+        @param [in] identical_raxis  Let num_axes be the num_axes of pattern1 or
+                               pattern2 (it's the same).  By passing in
+                               a particular value of identical_raxis, the caller
+                               asserts that for all raxis with
+                               identical_raxis <= raxis < num_axes,
+                               `pattern1.dims[raxis] == pattern2.dims[raxis]`;
+                               and furthermore that the caller is only
+                               interested in the part of the overlap for which
+                               pattern1 and pattern2 have the same index for all
+                               raxis >= identical_raxis (and if there was
+                               another part, it has been handled separately).
+        @param [out] patterns_out  The output patterns; this function will
+                               append to this location a number (possibly zero)
+                               of disjoint valid patterns, each of which is
+                               linear in pattern1 and pattern2, the union of whose
+                               memory-index-sets is identical to the intersection
+                               of pattern1 and pattern2's memory-index-sets.
+  */
+void ComputeIntersectionRecursive(const TensorPattern &pattern1,
+                                  const TensorPattern &pattern2,
+                                  int32 identical_raxis,
+                                  bool keep_all_patterns,
+                                  std::vector<TensorPattern> *patterns_out) {
   if (identical_raxis == 0) {
+    /*
+      The base-case of the recursion; if we reach here, it means pattern1 and
+      pattern2 have identical dims and strides; and if they also have the same
+      offset, all we need to do is append one of them to 'patterns_out'
+      (otherwise this part of the intersection is empty).  This is all part of a
+      process of trying to make the 'offset' identical between the two patterns
+      by discarding some leading indexes on one of the two patterns, and
+      discarding any trailing indexes as needed to make the dim the same.  (See
+      "Index:" in glossary for clarity on its meaning here).
+    */
+
     if (pattern1.offset == pattern2.offset) {
-      patterns_out->push_back(pattern1);
-      RemoveTrivialAxes(&(patterns_out->back()));
+      size_t cur_size = patterns_out->size();
+      patterns_out->resize(cur_size + 1);
+      push_back(pattern1);
+      RemoveTrivialAxes(pattern1, &(patterns_out[cur_size]));
     }
     return;
   }
@@ -429,30 +323,32 @@ void IntersectionComputer::ComputeIntersection(
       stride = pattern1.strides[raxis]; // will be the same in pattern2, and positive.
 
   // By the '?..:' statements below we possibly switch pattern2 and
-  // pattern1, thereby ensuring that pattern2_mod.offset >= pattern1_mod.offset
+  // pattern1, thereby ensuring that pattern2_mod.offset >= pattern1_mod.offset;
+  // this simplifies the later code.
   TensorPattern pattern1_mod(pattern2.offset >= pattern1.offset ? pattern1 : pattern2),
       pattern2_mod(pattern2.offset >= pattern1.offset ? pattern2 : pattern1);
 
 
   // pattern2_mod's offset is larger (or the same), so we may need to discard
-  // some leading indexes of pattern1_mod (on axis 'raxis'), increasing the
-  // offset and reducing the dim, to get the offsets closer to being the same,
-  // and then take the min of the dims on that axis.
+  // some leading indexes of pattern1_mod (on axis 'raxis'), increasing
+  // pattern1_mod's offset and reducing its dim on this raxis, to get the
+  // offsets closer to being the same.
 
-  // 'dim_discarded' below will be rounded down in the division, and we will
+  // 'min_dim1_discarded' below will be rounded down in the division, and we will
   // also need to also consider the value that's one larger than that.  We don't
-  // need to consider any other values of 'dim_discarded' other than these two,
+  // need to consider any other values of 'dim1_discarded' other than these two,
   // because it's possible to prove that if we recurse with the remaining offset
   // being greater than 'stride', we would never be able to get to offset=0
   // without discarding all dims of at least one axis numbered less than raxis.
-  // The proof requires the axis-sorting property.
+  // The proof requires the axis-dominance property (together with normalized
+  // strides).
   int32 offset_diff = pattern2_mod.offset - pattern1_mod.offset,
       min_dim1_discarded = offset_diff / stride,
       max_dim1_discarded = ((offset_diff == min_dim1_discarded * stride) ?
                             min_dim1_discarded : min_dim1_discarded + 1);
 
   // Make a copy of the relevant dims, and pattern1's offset, because the
-  // versions in the patterns may get modified in the loop.
+  // versions in the patterns may get modified in the loop below.
   int32 pattern1_dim = pattern1_mod.dims[raxis],
       pattern2_dim = pattern2_mod.dims[raxis],
       pattern1_offset = pattern1.offset;
@@ -461,53 +357,268 @@ void IntersectionComputer::ComputeIntersection(
     pattern1_mod.offset = pattern1_offset + dim1_discarded * stride;
     int32 new_pattern1_dim = pattern1_dim - dim1_discarded;
     if (new_pattern1_dim <= 0)
-      continue;
+      continue;  // There's no overlap here.
     pattern1_mod.dims[raxis] = new_pattern1_dim;
     // set both dims of pattern1_mod and pattern2_mod to the minimum
     // of the two dims.
     if (pattern2_dim > new_pattern1_dim) {
-      pattern1_mod.dims[raxis] = new_pattern1_dim;
       pattern2_mod.dims[raxis] = new_pattern1_dim;
     } else {
       pattern1_mod.dims[raxis] = pattern2_dim;
       pattern2_mod.dims[raxis] = pattern2_dim;
     }
-    // Recurse.  We would have continued above if we discarded all dims on this
-    // axis.
-    ComputeIntersection(pattern1, pattern2, raxis, patterns_out);
+    // Recurse.
+    ComputeIntersectionRecursive(pattern1, pattern2, raxis,
+                                 keep_all_patterns, patterns_out);
   }
 }
 
 
+// See documentation in header.
+bool ComputeIntersection(const TensorPattern &pattern1_in,
+                         const TensorPattern &pattern2_in,
+                         std::vector<TensorPattern> *intersection,
+                         bool keep_all_patterns) {
+  TensorPattern pattern1(pattern1_in),
+      pattern2(pattern2_in);
+  CanonicalizePattern(&pattern1);
+  CanonicalizePattern(&pattern2);
+  std::vector<int32> strides;
+  FindAllStrides(pattern1, pattern2, &strides);
+  int32 num_axes = strides.size();
+  if (num_axes == 0) {
+    // Some of the code below with num_axes - 1 would crash
+    // in this case, so handle it separately.
+    if (pattern1.offset == pattern2.offset) {
+      intersection->resize(1);
+      (*intersection)[0] = pattern1;
+    } else {
+      intersection->clear();
+    }
+    return true;
+  }
+  std::vector<TensorPattern> patterns1, patterns2;
+  patterns1.reserve(8);
+  patterns2.reserve(8);
+  intersection->clear();
+  if (!ConvertPatternStrides(pattern1, strides, &patterns1) ||
+      !ConvertPatternStrides(pattern2, strides, &patterns2))
+    return false;
 
-bool IntersectionComputer::ComputeIntersection(
-    const TensorPattern &pattern1,
-    const TensorPattern &pattern2,
-    std::vector<TensorPattern> *patterns_out) {
-  // First ensure that pattern1.offset <= pattern2.offset.
-  if (pattern1.offset > pattern2.offset)
-    return ComputeIntersection(pattern2, pattern1, pattern_out);
-
-  int64 extra_offset = pattern2.offset - pattern1.offset;
-  int32 dim_offset[KALDI_MAX_TENSOR_DIM];
-  // What we are doing conceptually here is shifting pattern1 to have the same
-  // offset as pattern2 by saying that on each axis, instead of starting the
-  // index from zero to dim - 1, we start that index from some number less than
-  // zero i.e. we shift those indexes to the left.  The index of the
-  // intersection will still start from zero though, because pattern2's index
-  // still starts from zero.
-  // We are going to express 'extra_offset' as a sum
-
-
-  // pattern1 and pattern2 are required to have the same stride and num_axes.
-  int32 num_axes = pattern1.num_axes;
+  auto iter1 = patterns1.begin(), end1 = patterns1.end();
+  for (; iter1 != end1; ++iter1) {
+    const TensorPattern &sub_pattern1 = *iter1;
+    auto iter2 = patterns2.begin(), end2 = patterns2.end();
+
+    // Below, 'max_mindex1' is not the actual largest mindex in `sub_pattern1`,
+    // but an upper bound on it (in fact, it is strictly greater than it); to
+    // prove this we require the axis-dominance property and the fact that the
+    // strides are normalized (positive and increasing).  This is part of an
+    // optimization to more quickly skip over pairs of patterns that will have
+    // empty intersection.  The product is formed in 64 bits to avoid overflow.
+    int64 min_mindex1 = sub_pattern1.offset,
+        max_mindex1 = min_mindex1 + static_cast<int64>(
+            sub_pattern1.strides[num_axes - 1]) * sub_pattern1.dims[num_axes - 1];
+
+    for (; iter2 != end2; ++iter2) {
+      const TensorPattern &sub_pattern2 = *iter2;
+      int64 min_mindex2 = sub_pattern2.offset,
+          max_mindex2 = min_mindex2 + static_cast<int64>(
+              sub_pattern2.strides[num_axes - 1]) * sub_pattern2.dims[num_axes - 1];
+      if (min_mindex2 >= max_mindex1 || min_mindex1 >= max_mindex2)
+        continue;  //  This is an optimization for efficiency when it's easy to
+                   // see that two Patterns won't overlap.
+
+      // Here, sub_pattern1 and sub_pattern2 are the sub-pieces of pattern1 and
+      // pattern2 that have been converted to share the same list of strides
+      // (That conversion process may end up splitting patterns into several
+      // pieces, even if it was possible, which is not always; hopefully there is
+      // just one piece in each case, but there may be more).  The following
+      // call may add elements to 'intersection'.
+      ComputeIntersectionRecursive(sub_pattern1, sub_pattern2,
+                                   num_axes,
+                                   keep_all_patterns,
+                                   intersection);
+      if (!keep_all_patterns && !intersection->empty())
+        return true;  // "fast mode": one nonempty piece is all we need.
+    }
+  }
+  return true;
+}
+
+bool PatternContains(const TensorPattern &pattern_in,
+                     int64 mindex) {
+  TensorPattern pattern_mod;
+  const TensorPattern *pattern;
+  if (!IsCanonical(pattern_in)) {
+    CanonicalizePattern(pattern_in, &pattern_mod);
+    pattern = &pattern_mod;
+  } else {
+    pattern = &pattern_in;
+  }
+  mindex -= pattern->offset;
+  int32 num_axes = pattern->num_axes;
   for (int32 raxis = num_axes - 1; raxis >= 0; raxis--) {
-    int32 this_stride = pattern1.strides[raxis],
-        this_offset = extra_offset / this_stride;
+    int64 index = mindex / pattern->strides[raxis];  // 64-bit: no truncation.
+    // The following expression returns true if index is outside
+    //  range [ 0, pattern->dims[raxis] - 1 ].
+    if (static_cast<uint64>(index) >= static_cast<uint64>(pattern->dims[raxis]))
+      return false;
+    mindex -= pattern->strides[raxis] * index;
+  }
+  return (mindex == 0);
+}
+
+
 
+bool ToMemoryIndexSet(const TensorPattern &pattern_in,
+                      std::vector<char> *s) {
+  KALDI_PARANOID_ASSERT(pattern_in.IsValid());
+  s->clear();
+  TensorPattern pattern_mod;
+  const TensorPattern *pattern;
+  if (!IsCanonical(pattern_in)) {
+    CanonicalizePattern(pattern_in, &pattern_mod);
+    pattern = &pattern_mod;
+  } else {
+    pattern = &pattern_in;
+  }
+  int32 num_axes = pattern->num_axes;
+  if (num_axes == 0)
+    num_axes = 1;  // this does the right thing, as there will be dim=1,
+                   // stride=0 physically present in the pattern.
+
+  // 'size' is a strict upper bound on the largest memory-index (it includes
+  // the offset, assumed >= 0).  We rely on the axis-dominance property and,
+  // thanks to the canonical form, on the strides being normalized (sorted and
+  // positive).  The max() covers the zero-axis case, where stride * dim == 0.
+  int64 size = pattern->offset + std::max<int64>(1,
+      static_cast<int64>(pattern->strides[num_axes - 1]) *
+      pattern->dims[num_axes - 1]);
+  s->resize(size, static_cast<char>(0));
+  // Visit every index-tuple like an odometer (raxis 0 changes fastest); the
+  // outer loop ends once the inner loop carries out of the last axis.
+  std::vector<int32> index(num_axes, 0);
+  int64 mindex = pattern->offset;
+  for (int32 raxis = 0; raxis < num_axes; ) {
+    (*s)[mindex] = static_cast<char>(1);
+    for (raxis = 0; raxis < num_axes; raxis++) {
+      int64 this_stride = pattern->strides[raxis];
+      mindex += this_stride;
+      if (++index[raxis] < pattern->dims[raxis]) break;  // no carry needed.
+      mindex -= this_stride * pattern->dims[raxis];  // carry: rewind this axis.
+      index[raxis] = 0;
+    }
+  }
+  return true;  // always succeeds.
+}
+
+int64 RandomMemoryIndex(const TensorPattern &pattern) {
+  int64 mindex = pattern.offset;  // accumulate in 64 bits to avoid overflow.
+  for (int32 raxis = 0; raxis < pattern.num_axes; raxis++) {
+    int64 index = RandInt(0, pattern.dims[raxis] - 1);  // index on this axis.
+    mindex += index * pattern.strides[raxis];
   }
+  return mindex;
 }
 
+
+bool PatternsIntersectExhaustive(const TensorPattern &pattern1,
+                                 const TensorPattern &pattern2) {
+  return PatternsIntersectSlow(pattern1, pattern2); }  // was an empty stub.
+
+
+bool PatternsIntersect(const TensorPattern &pattern1,
+                       const TensorPattern &pattern2) {
+  // Decides whether the memory-index-sets of the two patterns intersect.
+  // Cheap tests are tried first; costlier ones only if those are
+  // inconclusive.
+  KALDI_PARANOID_ASSERT(pattern1.IsValid() && pattern2.IsValid());
+
+  // Stage 1: compare the ranges of memory-indexes spanned by each pattern;
+  // if those ranges are disjoint, the patterns cannot intersect.
+  int64 lo1, hi1, lo2, hi2;
+  ComputeMinAndMaxMindex(pattern1, &lo1, &hi1);
+  ComputeMinAndMaxMindex(pattern2, &lo2, &hi2);
+  if (lo2 > hi1 || lo1 > hi2)
+    return false;
+
+  // Stage 2: test whether the pattern that starts earlier contains the
+  // smallest memory-index of the other one.  This is much faster than the
+  // algorithm for computing the pattern intersection.
+  bool first_found = (lo2 >= lo1 ? PatternContains(pattern1, lo2) :
+                      PatternContains(pattern2, lo1));
+  if (first_found)
+    return true;
+
+  // Stage 3: the intersection algorithm, in "fast mode" (keep_all_patterns
+  // == false), used where we just want to see whether the intersection is
+  // empty.
+  {
+    const bool keep_all_patterns = false;
+    std::vector<TensorPattern> intersection;
+    if (ComputeIntersection(pattern1, pattern2, &intersection,
+                            keep_all_patterns))
+      return !intersection.empty();
+  }
+
+  // If we reached here it was not possible to convert both patterns to the
+  // same set of strides.  This is not expected to happen in practice for any
+  // reasonable program, so warn (a limited number of times).
+  static int32 num_warned = 0;
+  const int32 warn_limit = 10;
+  if (num_warned < warn_limit) {
+    num_warned++;
+    KALDI_WARN << "Testing intersection of patterns that cannot be brought "
+        "to common strides.  This will be extremely slow!";
+  }
+
+  // Stage 4: draw a few random memory-indexes from the pattern with fewer
+  // elements and look each one up in the other pattern; this is faster than
+  // the exhaustive search below.
+  const bool first_is_smaller = NumElements(pattern1) < NumElements(pattern2);
+  const TensorPattern &smaller = (first_is_smaller ? pattern1 : pattern2),
+      &larger = (first_is_smaller ? pattern2 : pattern1);
+  for (int32 draws_left = 10; draws_left > 0; draws_left--) {
+    if (PatternContains(larger, RandomMemoryIndex(smaller)))
+      return true;
+  }
+
+  // Stage 5: exhaustive search.  If speed becomes an issue we may find a way
+  // to disable this check, which could be extremely slow for large patterns.
+  return PatternsIntersectSlow(pattern1, pattern2);
+}
+
+bool PatternsIntersectSlow(const TensorPattern &pattern1_in,
+                           const TensorPattern &pattern2_in) {
+  TensorPattern pattern1(pattern1_in),
+      pattern2(pattern2_in);
+  CanonicalizePattern(&pattern1);
+  CanonicalizePattern(&pattern2);
+  // Note: the offsets are the minimum elements, now that the
+  // patterns are canonical.
+  int64 min_offset = std::min(pattern1.offset, pattern2.offset);
+  pattern1.offset -= min_offset;
+  pattern2.offset -= min_offset;
+  int64 max_offset = std::max(pattern1.offset, pattern2.offset);
+  // Explicitly get the memory-index-set of pattern1 and pattern2
+  // as possibly-huge arrays, and see if they intersect.  Obviously
+  // this will be extremely slow.
+  std::vector<char> pattern1_mindexes, pattern2_mindexes;
+  ToMemoryIndexSet(pattern1, &pattern1_mindexes);
+  ToMemoryIndexSet(pattern2, &pattern2_mindexes);
+  // Indexes below max_offset are absent from one of the sets, and indexes at
+  // or beyond the shorter array's size are absent from the other.
+  int64 end = std::min<int64>(pattern1_mindexes.size(),
+                              pattern2_mindexes.size());
+  for (int64 m = max_offset; m < end; m++) {
+    if (pattern1_mindexes[m] && pattern2_mindexes[m])
+      return true;
+  }
+  return false;
+}
+
+
 bool TensorPatternRebaser::Convert(TensorPattern *pattern) {
   if (!needs_conversion_)
     return;  // An optimization to make the common case fast.
diff --git a/src/tensor/tensor-pattern-extra-utils.h b/src/tensor/tensor-pattern-extra-utils.h
index 7c1206c2d02..b94a8c85cdd 100644
--- a/src/tensor/tensor-pattern-extra-utils.h
+++ b/src/tensor/tensor-pattern-extra-utils.h
@@ -17,6 +17,8 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
+#ifndef KALDI_TENSOR_TENSOR_PATTERN_EXTRA_UTILS_H_
+#define KALDI_TENSOR_TENSOR_PATTERN_EXTRA_UTILS_H_ 1
 
 #include "tensor/tensor-common.h"
 #include "tensor/tensor-pattern.h"
@@ -36,9 +38,24 @@ namespace tensor {
    Returns true if there is overlap between pattern1 and pattern2,
    meaning that pattern1's memory-index-set and pattern2's
    memory-index-set have nonempty intersection.
+
+         @param [in] pattern1  First pattern.  Must be valid.
+         @param [in] pattern2  Second pattern.  Must be valid.
+         @return  Returns true if the two patterns' memory-index-sets'
+                  intersection is nonempty.
  */
-bool PatternsOverlap(const TensorPattern &pattern1,
-                     const TensorPattern &pattern2);
+bool PatternsIntersect(const TensorPattern &pattern1,
+                       const TensorPattern &pattern2);
+
+
+/**
+   This is a slow but simple version of PatternsIntersect(), with the same
+   interface.  It should not be called by users as it is slow.  It is exposed
+   here for testing purposes.
+*/
+bool PatternsIntersectSlow(const TensorPattern &pattern1,
+                           const TensorPattern &pattern2);
+
 
 /**
    Returns true if pattern2's memory-index-set is a subset of pattern1's
@@ -76,43 +93,112 @@ bool PatternsEquivalent(const TensorPattern &pattern1,
    documentation of return status).
 
       @param [in] pattern1  The first of the two patterns of which
-                        we want the intersection.
+                        we want the intersection; must be valid.
       @param [in] pattern2  The first of the two patterns of which
-                        we want the intersection.
+                        we want the intersection; must be valid.
       @param [out] intersection  On success, this function outputs
-                       a vector of patterns (in arbitrary order), where the
-                       memory-index-set it (conceptually) outputs is the union
-                       of all the memory-index-sets of the patterns output
-                       (which this function guarantees will be disjoint).
-                       If this vector is empty and this function returns true,
-                       it means the intersection was the empty set.
+                       a possibly-empty vector of patterns (in arbitrary
+                       order), the union of whose memory-index-sets (which
+                       will all be disjoint) equals the intersection of the
+                       memory-index-sets of `pattern1` and `pattern2`.
+                       (However, see `keep_all_patterns`).
+      @param [in]  keep_all_patterns   If this parameter is set to false,
+                       the algorithm will stop as soon as the
+                       `intersection` vector has one element.  This
+                       is used for a fast test whether an intersection
+                       is empty or not.
 
       @return  Returns true if the intersection could be computed, and
                false otherwise.  This function will always return true if,
                when the strides of pattern1 and pattern2 are sorted and
                duplicates removed and listed in increasing order, each
-               stride divides the next on in the list exactly.  (This should
-               cover all the immediately forseeable cases we might need to
-               compute).
+               stride divides the next one in the list exactly; but this is
+               not a necessary condition.   (The necessary condition
+               is that both patterns, when compressed and converted
+               to common strides, are "Regular" (c.f. "Regularity
+               property" in glossary).)
 */
 bool ComputeIntersection(const TensorPattern &pattern1,
                          const TensorPattern &pattern2,
-                         std::vector<TensorPattern> *intersection);
+                         std::vector<TensorPattern> *intersection,
+                         bool keep_all_patterns = true);
+
+
+/**
+   This function returns true if the memory-index-sets of pattern1 and pattern2
+   have nonempty intersection, and false otherwise.  Requires that
+   pattern1 and pattern2 be valid.
+
+      @param [in] pattern1  First pattern to compare; must be valid.
+      @param [in] pattern2  Second pattern to compare; must be valid.
+      @return               Returns true if the memory-index-set of
+                            pattern1 and pattern2 have nonempty intersection.
+ */
+bool PatternsIntersect(const TensorPattern &pattern1,
+                       const TensorPattern &pattern2);
+
+/**
+      @param [in] pattern   The pattern about whose memory-index-set
+                            we are asking.  Must be valid-1, or
+                            return status is undefined.
+      @param [in] mindex    The memory-index we are asking about
+      @return               Return true if the memory-index-set of `pattern`
+                            contains `mindex` (i.e. if there is an
+                            index-tuple i such that `pattern[i] == mindex`;
+                            see "Indexing a pattern" in the glossary).
+*/
+bool PatternContains(const TensorPattern &pattern,
+                     int64 mindex);
+
+/**
+   Compute the minimum and maximum memory-indexes present in
+   a pattern's memory-index-set (i.e. the minimum and maximum
+   indexes into the underlying array).
+
+   This is inlined for speed; see tensor-pattern-extra-utils-inl.h.
+
+      @param [in] pattern  The pattern whose minimum and maximum
+                           memory-index we are computing
+      @param [out] min_mindex  The minimum memory-index in the
+                           memory-index-set of the pattern.  Will
+                           be zero in Patterns with non-negative
+                           strides (e.g. Patterns in canonical form,
+                           or other Patterns with normalized
+                           strides).  Should always be >= 0 in
+                           Patterns created by a valid program.
+      @param [out] max_mindex  The maximum memory-index in the
+                           memory-index-set of the pattern.
+                           Will always be >= min_mindex.
+*/
+inline void ComputeMinAndMaxMindex(const TensorPattern &pattern,
+                                   int64 *min_mindex,
+                                   int64 *max_mindex);
 
 
 /**
    Outputs the memory-index-set corresponding to the pattern 'pattern' to 's'.
    See glossary in tensor-pattern.h for definitions.
 
-
    This is strictly to be used in debugging code, as it is extremely
    inefficient.
 
-      @param [in] pattern  The input pattern
-      @param [out] s   The memory-index-set
+      @param [in] pattern  The input pattern; must be valid
+      @param [out] s   The memory-index-set, represented as a vector
+                       of bool, actually stored as char.  This will be set to a
+                       vector at least as large as the maximum memory-index in
+                       `pattern`, containing 1 for memory-indexes in the set and 0 for
+                       those out of the set.
  */
 bool ToMemoryIndexSet(const TensorPattern &pattern,
-                      std::unordered_set<int64> *s);
+                      std::vector<char> *s);
+
+/**
+   This function returns a memory-index randomly chosen
+   from the memory-index-set of `pattern`.
+     @param [in] pattern   Pattern; must be valid-1.
+     @return  Returns randomly chosen memory-index.
+ */
+int64 RandomMemoryIndex(const TensorPattern &pattern);
 
 
 
@@ -146,19 +232,19 @@ bool PatternTuplesEquivalent(const ArrayRef<const TensorPattern*> patterns1,
    This function returns true if a Pattern is regular (see Regularity property
    in the glossary in tensor-pattern.h) and false otherwise.  'pattern' must
    have all positive strides, the strides must be in increasing order (in the
-   private numbering), and it must be valid-- (see glossary).
+   private numbering), and it must be valid-2 (see glossary).
  */
 bool IsRegular(const TensorPattern &pattern);
 
 
 /**
-   This function returns true if a Pattern is valid- (see definition in
+   This function returns true if a Pattern is valid-1 (see definition in
    glossary); see also TensorPattern::Valid() and IsValidMM().
  */
 bool IsValidM(const TensorPattern &pattern);
 
 /**
-   This function returns true if a Pattern is valid-- (see definition in
+   This function returns true if a Pattern is valid-2 (see definition in
    glossary); see also TensorPattern::Valid() and IsValidM().
  */
 bool IsValidMM(const TensorPattern &pattern);
@@ -184,10 +270,10 @@ bool IsValidMM(const TensorPattern &pattern);
                         `pattern`.
        @param [out] patterns  On success (see documentation of return status)
                         'patterns' will be set to a nonempty list of patterns,
-                        the union of whose memory-index-sets equals
-                        the memory-index-set of `pattern`, all of whose strides are
-                        equal to `strides`, and each of which is valid- and linear
-                        in `pattern` (see "Linear property").
+                        the union of whose memory-index-sets equals the
+                        memory-index-set of `pattern`; all of whose strides are
+                        equal to `strides`; and each of which is valid-1 and
+                        linear in `pattern` (see "Linear property").
 
                         except for property (iv) (search for "Valid
                         Pattern" in tensor-pattern.h): that is, they may have
@@ -195,11 +281,11 @@ bool IsValidMM(const TensorPattern &pattern);
                         of 'strides' dividing the next is a sufficient but not
                         necessary condition for this function to always return
                         true.
-                          On failure, `patterns->empty()` will be true.
+                          On failure, `patterns` will be empty.
 
         @return         Returns true if pattern strides could be converted using
                         our algorithm, false if not.  This algorithm will work
-                        for any 'reasonable' request; it doesn't attempt to
+                        for any 'reasonable' request, but it doesn't attempt to
                         cover the types of cases where, to solve them, we would
                         have to output a number of patterns that couldn't be
                         bounded given the number of axes.
@@ -336,3 +422,8 @@ class TensorPatternRebaser {
 
 }  // namespace tensor
 }  // namespace kaldi
+
+// Include implementation of inline functions.
+#include "tensor/tensor-pattern-extra-utils-inl.h"
+
+#endif  // KALDI_TENSOR_TENSOR_PATTERN_EXTRA_UTILS_H_
diff --git a/src/tensor/tensor-pattern-utils.cc b/src/tensor/tensor-pattern-utils.cc
index 56d567e58c2..95461311e6f 100644
--- a/src/tensor/tensor-pattern-utils.cc
+++ b/src/tensor/tensor-pattern-utils.cc
@@ -230,10 +230,10 @@ static inline void CombineAxes(ArrayRef<TensorPattern*> patterns,
 
    CAUTION: this function does not update the codes of 'patterns'.
  */
-static void RemoveTrivialAxes(bool is_trivial_raxis[KALDI_TENSOR_MAX_DIM],
+static void RemoveTrivialAxes(bool is_trivial_raxis[KALDI_TENSOR_MAX_AXES],
                               ArrayRef<TensorPattern*> patterns) {
   int32 first_trivial_raxis = -1;
-  for (int32 raxis = 0; raxis < KALDI_TENSOR_MAX_DIM; raxis++) {
+  for (int32 raxis = 0; raxis < KALDI_TENSOR_MAX_AXES; raxis++) {
     if (is_trivial_axis[raxis]) {
       first_trivial_raxis = raxis;
       break;
@@ -302,7 +302,7 @@ void CompressPatterns(ArrayRef<TensorPattern*> patterns,
 
   bool exists_trivial_axis = false;
   // The = {} ensures (I believe) that they are all set to 0, meaning false.
-  bool is_trivial_raxis[KALDI_TENSOR_MAX_DIM] = {};
+  bool is_trivial_raxis[KALDI_TENSOR_MAX_AXES] = {};
   for (int32 raxis = 0, mask = 1; raxis < max_num_axes; raxis++, mask <<= 1) {
     if ((combined_code | mask) == 0) {
       is_trivial_raxis[raxis] = true;
@@ -449,7 +449,7 @@ void RemoveTrivialAxes(TensorPattern *pattern) {
       num_axes_out = 0;
   for (int32 raxis = 0; raxis < num_axes; raxis++) {
     int32 this_dim = pattern->dims[raxis];
-    if (this_dim != 0) {
+    if (this_dim != 1) {
       if (num_axes_out != raxis) {
         pattern->dims[num_axes_out] = this_dim;
         pattern->strides[num_axes_out] = pattern->strides[raxis];
@@ -457,16 +457,46 @@ void RemoveTrivialAxes(TensorPattern *pattern) {
     }
   }
   // It is a requirement of struct TensorPattern that dims and
-  // strides for raxis > num_axes be 1 and 0 respectively.
+  // strides for raxis >= num_axes be 1 and 0 respectively.
   for (int32 raxis = num_axes_out; raxis < num_axes; raxis++) {
     pattern->dims[raxis] = 1;
     pattern->strides[raxis] = 0;
   }
   pattern->num_axes = num_axes;
-  // Caution: we are not updating the code.
+  pattern->code = -1;
+}
+
 
+void RemoveTrivialAxes(const TensorPattern &pattern_in,
+                       TensorPattern *pattern_out) {
+  KALDI_PARANOID_ASSERT(pattern_out != &pattern_in);
+  int32 num_axes = pattern_in.num_axes, num_axes_out = 0;
+  for (int32 raxis = 0; raxis < num_axes; raxis++) {
+    int32 this_dim = pattern_in.dims[raxis];
+    if (this_dim != 1) {  // keep only the non-trivial axes, packed together.
+      pattern_out->dims[num_axes_out] = this_dim;
+      pattern_out->strides[num_axes_out] = pattern_in.strides[raxis];
+      num_axes_out++;
+    }
+  }
+  // It is a requirement of struct TensorPattern that dims and
+  // strides for raxis >= num_axes be 1 and 0 respectively.
+  for (int32 raxis = num_axes_out; raxis < KALDI_TENSOR_MAX_AXES; raxis++) {
+    pattern_out->dims[raxis] = 1;
+    pattern_out->strides[raxis] = 0;
+  }
+  pattern_out->num_axes = num_axes_out;
+  pattern_out->offset = pattern_in.offset;  // unchanged by dropping dim=1 axes.
+  pattern_out->code = -1;  // -1 means the code is not known.
 }
 
+int64 NumElements(const TensorPattern &pattern) {
+  // Product of the dims over all axes; 1 if there are no axes.
+  int64 product = 1;
+  for (int32 raxis = pattern.num_axes - 1; raxis >= 0; raxis--)
+    product *= pattern.dims[raxis];
+  return product;
+}
 
 }  // namespace kaldi
 }  // namespace tensor
diff --git a/src/tensor/tensor-pattern-utils.h b/src/tensor/tensor-pattern-utils.h
index af3c7370a23..eaf512faf73 100644
--- a/src/tensor/tensor-pattern-utils.h
+++ b/src/tensor/tensor-pattern-utils.h
@@ -18,6 +18,10 @@
 // limitations under the License.
 
 
+#ifndef KALDI_TENSOR_TENSOR_PATTERN_UTILS_H_
+#define KALDI_TENSOR_TENSOR_PATTERN_UTILS_H_ 1
+
+
 #include "tensor/tensor-common.h"
 #include "tensor/tensor-pattern.h"
 #include "tensor/array-ref.h"
@@ -30,20 +34,47 @@ namespace kaldi {
 namespace tensor {
 
 
-
-enum PatternEnum {
-  kPatternContainsNegativeStride = 2048
-  // e.g.:
-  // bool contains_negative_stride =
-  //     (pattern.code | kPatternContainsNegativeStride) != 0;
-};
-
 // Returns true if the pattern code indicates that the pattern contains a
 // negative stride.
 inline bool ContainsNegativeStride(int32 pattern_code) {
-  return (pattern_code | kPatternContainsNegativeStride) != 0;
+  // 2048 is 1 << 11; 11th bit in code is set if code indicates negative stride.
+  return (pattern_code & 2048) != 0;
+}
+
+/**
+   Returns true if the pattern code indicates that the pattern contains a
+   negative stride.  Caution: will return true if pattern_code was -1, so if you
+   call this on a code on a valid Pattern where the code might be -1, all it
+   means is that the Pattern "might" contain a negative stride.
+
+     @param [in] pattern  The input pattern.  Must be valid;
+                          return status is undefined otherwise.
+     @return         Returns true if either the pattern's code was
+                     -1 (meaning: not known), or if the code
+                     indicates that a negative stride was present.
+*/
+inline bool PattenMightContainNegativeStride(
+    const TensorPattern &pattern) {
+  // 2048 is 1 << 11; that bit is set if the code indicates a negative stride
+  return (pattern.code & 2048) != 0;  // (and is also set when code == -1).
+}
 
+
+/**
+   Returns true if the pattern contains a negative stride.
+   See tensor-pattern-utils-inl.h for implementation.
+
+      @param [in] pattern   Input pattern.  Must be valid;
+                            return status is undefined otherwise.
+                            TODO: if we need this to work for, e.g.
+                            valid-1 or valid-2 patterns, find
+                            the exact conditions.
+      @return     Returns true if the pattern contained at
+                  least one negative stride, false otherwise.
+ */
+inline bool ContainsNegativeStride(const Pattern &pattern);
+
+
 // Returns true if the pattern code indicates that the raxis
 // numbered 'raxis' (the r refers to the backwards numbering used
 // in 'pattern') is 'trivial' (meaning: dim=1, stride=0).
@@ -52,18 +83,29 @@ inline bool AxisIsTrivial(int32 pattern_code, int32 raxis) {
 }
 
 
+
+/**
+   This function copies pattern_in to pattern_out while removing
+   trivial axes (i.e. axes with dim=1), reducing num_axes appropriately.
+
+     @param [in] pattern_in   Input pattern.
+     @param [out] pattern_out Output pattern; may not point to pattern_in.
+                        At exit it will be the same as pattern_in except any
+                        axes with dim=1 will have been removed and the num_axes
+                        reduced.  Will be valid at output if pattern_in was
+                        valid-1 at input.
+*/
+void RemoveTrivialAxes(const TensorPattern &pattern_in,
+                       TensorPattern *pattern_out);
+
+
 /**
    This function removes trivial axes (i.e. axes with dim=1) from 'pattern'.
-   Although in a valid pattern axes with dim=1 must have stride=0
-   and vice versa, this function does not check that property; it simply
-   removes axes with dim=1, reducing num_axes appropriately.
+   This version works in-place.
 
      @param [in,out] pattern   Pattern to be modified.  Any axes with dim=1
                          will be removed and the num_axes reduced.  Will be
-                         valid at output if it was valid at input, or even if
-                         it was valid at input in all but property (iv),
-                         that strides must be zero for axes with dim=1.
-                         CAUTION: the code of 'pattern' is *not* updated.
+                         valid at output if it was valid-1 at input.
  */
 void RemoveTrivialAxes(TensorPattern *pattern);
 
@@ -454,12 +496,20 @@ void CompressOnePattern(TensorPattern *pattern);
 void SortAxes(TensorPattern *pattern);
 
 
-// TODO: document this?
+// TODO: document this.
 inline void CanonicalizePattern(TensorPattern *pattern) {
   CompressOnePattern(pattern);
   SortAxes(pattern);
 }
 
+// Out-of-place CanonicalizePattern(): copies pattern_in to *pattern_out, then
+// canonicalizes the copy in place.  To be replaced by a more efficient version.
+inline void CanonicalizePattern(const TensorPattern &pattern_in,
+                                TensorPattern *pattern_out) {
+  *pattern_out = pattern_in;
+  CanonicalizePattern(pattern_out);
+}
+
 /**
    This pattern checks that 'pattern' is valid and in canonical form (see
    glossary for the meaning).  CanonicalizePattern() will modify a valid pattern
@@ -468,6 +518,15 @@ inline void CanonicalizePattern(TensorPattern *pattern) {
 bool IsCanonical(const TensorPattern &pattern);
 
 
+/**
+   Returns the number of elements in the pattern, computed as the
+   product of the dims.  ('pattern' is expected to either be valid or
+   to at least satisfy the uniqueness property for this to actually give
+   the number of elements, but this is not checked).
+*/
+int64 NumElements(const TensorPattern &pattern);
+
+
 /**
    This version of SortAxes() sorts the axes in 'patterns' (which must be
    nonempty and all have the same number of axes), by ordering them from the
@@ -678,193 +737,11 @@ void HasCStrides(const TensorPattern &pattern);
 bool PatternsOverlap(const TensorPattern &pattern1,
                      const TensorPattern &pattern2);
 
-/**
-   Returns true if pattern2's memory-index-set is a subset of pattern1's
-   memory-index-set.  See glossary in tensor-pattern.h for explanation of
-   memory-index-set.
- */
-bool PatternIncludes(const TensorPattern &pattern1,
-                     const TensorPattern &pattern2);
-
-
-/**
-   Returns true if the two patterns are equivalent in the sense that their
-   memory-index-sets are the same.  See glossary in tensor-pattern.h for
-   explanation.
- */
-bool PatternsEquivalent(const TensorPattern &pattern1,
-                        const TensorPattern &pattern2);
-
-
-/**
-   Outputs the memory-index-set corresponding to the pattern
-   'pattern' to 's'.   See glossary in tensor-pattern.h for
-   definitions.  This is strictly to be used in debugging
-   code, as it is extremely inefficient.
-
-      @param [in] pattern  The input pattern
-      @param [out] s   The memory-index-set
- */
-bool ToMemoryIndexSet(const TensorPattern &pattern,
-                      std::unordered_set<int64> *s);
-
 
+}  // namespace tensor
+}  // namespace kaldi
 
-/**
-   Outputs the memory-index-tuple-set corresponding to the pattern 'pattern' to
-   's' (see tensor-pattern.h for definition).  For storage in 's', each tuple is
-   converted into a single integer by a hashing function that should keep
-   distinct tuples separate as long as the memory-indexes were not huge.  (We
-   may output the actual tuples at some point in the future if they are ever
-   needed).  This function is strictly to be used in debugging code, as it is
-   extremely inefficient.
-
-      @param [in] pattern  The input pattern
-      @param [out] s   The memory-index-set
- */
-bool ToMemoryIndexTupleSet(const ArrayRef<TensorPattern*>  patterns,
-                           std::unordered_set<int64> *s);
-
-
-/**
-   Returns true if the two pattern-tuples are equivalent in the sense
-   that their memory-index-tuple-sets are the same.  See glossary
-   in tensor-pattern.h for explanation.
- */
-bool PatternTuplesEquivalent(const ArrayRef<const TensorPattern*> &patterns1,
-                             const ArrayRef<const TensorPattern*> &patterns2);
-
-
-/**
-   Class TensorPatternRebaser is an object that converts TensorPattern
-   when memory layouts change.  The main use-case is when a base Variable
-   (c.f. variable.h for definition) has a TensorPattern that is not
-   contiguous (see tensor-pattern.h for definition of 'contiguous'), and
-   its gradient Tensor is allocated contiguously.  This class is
-   needed to convert patterns for Variables into patterns for their
-   corresponding gradients.
-
-   We make it an object rather than a function in order to avoid repetition when
-   multiple patterns need to be rebased.
- */
-class TensorPatternRebaser {
-
-  /*
-    Constructor.
-       @param [in] src_pattern  The pattern that we are converting *from*,
-                              e.g. the pattern of a Variable whose gradient
-                              has a different layout from itself.
-       @param [in] dest_pattern  The pattern that we are converting *to*.
-                              Must have the same num_axes and the same dims
-                              as 'src_pattern'.
-
-    Let t be a valid index-tuple for src_pattern/dest_pattern, determined
-    by their 'dims' and 'num_axes'.  Using t to index src_pattern and
-    dest_pattern gives memory-indexes:
-       m_src = src_pattern[t]
-       m_dest = dest_pattern[t]
-    View this object as a function from memory-indexes to memory-indexes
-    f: (m_src -> m_dest), whose domain is the memory-index-set of src_pattern
-    and whose range is the memory-index-set of dest_pattern.
-
-    The purpose of this object is to modify patterns in a way that maps
-    their memory-indexes with the same function f.
-  */
-  TensorPatternRebaser(const TensorPattern &src_pattern,
-                       const TensorPattern &dest_pattern);
-
-
-  /**
-     This function attempts to modify pattern->offset and pattern->strides in a
-     way that does the mapping of memory-indexes m_src -> m_dest that is implied
-     by the src_pattern and dest_pattern passed to the constructor.  That is,
-     for any index-tuple t valid for 'pattern', the memory-index `pattern[t]`
-     evaluated before and after calling this function gets mapped according
-     to the function (m_src -> m_dest) mentioned in our documentation for
-     the constructor.
-
-     @param [in,out]  pattern  The pattern to be rebased.  Must, at entry,
-                          satisfy `PatternIncludes(src_pattern, *pattern)`,
-                          where `src_pattern` was the pattern passed to the
-                          constructor.  On success (i.e. if this function
-                          returns true), the condition
-                          `PatternIncludes(dest_pattern, *pattern)` will
-                          be satisfied.  On failure, the contents of
-                          'pattern' is undefined.
-
-     @return  Returns true if the conversion was possible.
-   */
-  bool Rebase(TensorPattern *pattern);
-
-  private:
-
-  // TODO: remove src_pattern_ and dest_pattern_ once everything
-  // is debugged.  They are copies of the src_pattern and dest_pattern
-  // passed to the constructor.
-  TensorPattern src_pattern_;
-  TensorPattern dest_pattern_;
-
-  // If needs_conversion_ is false, it means the patterns don't need any conversion
-  // at all (this is an optimization).
-  bool needs_conversion_;
-
-  // The 'offset' value of src_pattern_compressed (i.e. the src_pattern passed
-  // to the constructor, which has been jointly compressed and normalized with
-  // dest_pattern (to make all src_strides positive).
-  int64 src_offset_;
-  // The 'offset' value of dest_pattern_compressed
-  int64 dest_offset_;
-
-  // num_axes_ is the number of axes, not in the original src_pattern and
-  // dest_pattern, but after the two patterns have been jointly compressed and
-  // then sorted from the smallest to greatest stride in src_pattern.
-  // src_strides_ are the resulting strides from src_pattern_compressed, and
-  // dest_strides_ are the resulting strides from dest_pattern_compressed.
-
-  // dest_pattern_ are the strides of the thus-modified src_pattern and
-  // dest_pattern.  As an optimization, if src_strides and dest_strides end up
-  // being the same, we set num_axes to zero and skip modifying the strides when
-  // CompressPattern() is called.
-
-  // Note: all of src_strides_[0] .. src_strides_[num_axes_ - 1] will be greater
-  // than zero.  We can guarantee this because src_pattern and dest_pattern as
-  // passed to the constructor had the same dims, so any axes with dim=1 would
-  // have had dim=1 for both src and dest, hence they would have been removed by
-  // CompressPatterns(), hence no strides would be zero after
-  // CompressPatterns(); and CompressPatterns() normalizes the signs of the
-  // strides so the first one (i.e. src_pattern) has positive strides.
-  int32 num_axes_;
-  int32 src_strides_[KALDI_TENSOR_MAX_DIM];
-  int32 dest_strides_[KALDI_TENSOR_MAX_DIM];
-
-
-
-  // The basic algorithm in Convert() is:
-  //
-
-  //
-  //
-  //  First, add offset_ to its offset.
-  //   Then:
-  //     For each nontrivial axis of 'pattern', we are going to modify
-  //     its stride as needed.
-  //     Let that stride be `stride`, and the corresponding dim `dim`.
-  //     Let `pstride = abs(stride)` be the absolute value of the stride
-  //     (we'll modify that, and then restore the sign.
-  //     positive.
-  //
-
-
-
-  // Converts a memory-index from the src to dest pattern.  This is applying,
-  // to a single arbitrary memory-index m_src, the mapping (m_src -> m_dest);
-  // see the comments above for explanation of this notation.
-  // It is required that m >= 0 (otherwise it would not have been inside
-  // the source pattern).
-  int64 ConvertMemoryIndex(int64 m);
-
-};
 
+#include "tensor/tensor-pattern-utils-inl.h"
 
-}  // namespace tensor
-}  // namespace kaldi
+#endif  // KALDI_TENSOR_TENSOR_PATTERN_UTILS_H_
diff --git a/src/tensor/tensor-pattern.h b/src/tensor/tensor-pattern.h
index 906845a0fa0..db1f06a02f6 100644
--- a/src/tensor/tensor-pattern.h
+++ b/src/tensor/tensor-pattern.h
@@ -52,13 +52,13 @@ namespace tensor {
                       that identifies an axis in the public numbering (see "Public numbering").
                       See also: Raxis-index.
 
-    Axis-sorting property: search below for [Valid Pattern], point (vi), for the main
+    axis-dominance property: search below for [Valid Pattern], point (v), for the main
                       definition.
-          [Axis-sorting property of an axis-index]:
+          [axis-dominance property of an axis-index]:
                       There is another sense in which we use the term
-                      'axis-sorting property': for a Pattern whose axes are sorted
+                      'axis-dominance property': for a Pattern whose axes are sorted
                       from least to greatest abs(stride) [in the private numbering],
-                      we say that "the axis-sorting property holds for axis-index i
+                      we say that "the axis-dominance property holds for axis-index i
                       of that Pattern" if:
                                  dim(i) * abs(stride(i)) <= abs(stride(i+1)).
 
@@ -113,6 +113,9 @@ namespace tensor {
                     of Patterns whose dims-vectors were ([4 1 5], [6 1], [5]),
                     the dims-vector of the tuple would be [4 6 5].
 
+    Disjoint Patterns:  When we speak of disjoint Patterns we mean that
+                    their memory-index-sets are disjoint; see memory-index-set.
+
     Extended indexing:  A convention whereby if we have a Tensor with, say,
                       `dims = [5 1]`, we can index that Tensor with an index-tuple
                       that:
@@ -124,19 +127,32 @@ namespace tensor {
                          (in the public numbering) / the right (in the private
                          numbering) with dim=1.  See also: PyTorch-style broadcasting.
 
+    Index:            If this word is used unqualified in the context of a Pattern
+                      or tensor it will generally mean an integer that's part of an
+                      index-tuple, and is being used to index a particular axis of
+                      a Pattern.  For example, on an axis where the Pattern's dimension
+                      is `dim`, a valid index i would be in the range 0 <= i < dim.
+
     Index-tuple:      A tuple of integers used as an index into a Tensor.  Must
                       have at least as many elements as the Tensor's num_axes
                       (see Extended indexing).  Elements of such tuples may
-                      not be negative.
+                      not be negative.  The elements of an index-tuple are in
+                      the same order as the axes, and in some cases it may
+                      be necessary to disambiguate whether we are referring
+                      to the public numbering or the private numbering of the
+                      axes.
 
-    (valid Index-tuple) An index-tuple is *valid for a pattern* if it may be
+    [Valid Index-tuple]: An index-tuple is *valid for a pattern* if it may be
                       used to index that Pattern, allowing extended indexing.
-                      This is true if, after padding the index-tuple with 0's
-                      on the left and padding the Pattern's dims-vector with
-                      1's on the left as needed to make them the same size,
-                      for each axis, if the element of the index-tuple is
-                      i and the element of the dims-vector is d, i >= 0
-                      and either i < d or d == 1.
+                      (see "Extended indexing" for details).
+
+    Indexing a Pattern:  For a pattern `p` and an index-tuple `i` that is valid
+                       for the pattern (see: "Valid Index-tuple"), we write
+                      `p[i] = m` meaning that when indexing a pattern `p`
+                      with index-tuple `i` we get memory-index `m`.
+                      `m` is of course the sum of the pattern's offset plus
+                      the sum over all axis-indexes, of the element of the index-tuple
+                      multiplied by the Pattern's stride for that axis.
 
     Index-tuple-set of a Pattern: The index-tuple-set of a Pattern is the set
                       of valid index-tuples assuming we are not allowing extended
@@ -158,17 +174,15 @@ namespace tensor {
 
     Memory-pointer:   A void* pointer to the start of a memory region.
 
-    Memory-index:     A scalar (int64) index into a memory region viewed as a
+    Memory-index/mindex:  An integer (int64) index into a memory region viewed as a
                       linear array.  For example, for a Tensor of floats, we'd cast
                       the address of the memory-pointer to `float*` and then use
-                      the memory-index as an index into that array.  For a
-                      Pattern p and an index-tuple i that is valid for p, we have
-                      a memory-index m = p[i], which is equal to the
-                      pattern's offset plus the sum over all axes of the product of the
-                      element of the index-tuple times the corresponding axis's
-                      stride.  (Note: this becomes much easier to compute and
-                      explain in the private numbering, because no left-padding
-                      has to be done explicitly).
+                      the memory-index as an index into that array.  In code,
+                      this may be called 'mindex.'  For a Pattern p and an
+                      index-tuple i that is valid for p, we have a memory-index
+                      m = p[i], which is equal to the pattern's offset plus the
+                      sum over all axes of the product of the element of the
+                      index-tuple times the corresponding axis's stride.
 
     Memory-index-tuple:  A tuple of Memory-indexes.  This concept is used in connection
                       with Pattern-tuples.  For a pattern-tuple q = (p1, p2, p3)
@@ -231,7 +245,8 @@ namespace tensor {
     Set-equivalent:   Two Patterns are set-equivalent if their memory-index-sets
                       are identical.
 
-    Trivial axis:     An axis of a Pattern for which dim=1 and stride=0.
+    Trivial axis:     An axis of a Pattern for which dim=1.  Such axes will have
+                      stride=0 if the Pattern is valid.
 
     Memory-index-set of a Pattern:
                       The set of all memory-indexes obtained by indexing
@@ -245,6 +260,11 @@ namespace tensor {
                       index-tuple-set of the Pattern-tuple.  See "memory-index-tuple"
                       and "index-tuple-set of a Pattern-tuple" for more information.
 
+    Normalized strides:  We say that a Pattern has normalized strides if the
+                      strides are all positive and are strictly increasing
+                      in the private numbering (which implies strictly decreasing
+                      in the public numbering).
+
     Linear property:
                       Consider Patterns P and Q with the property that the
                       memory-index-set of P is a subset of the memory-index-set of
@@ -287,7 +307,7 @@ namespace tensor {
 
                       The reader may notice that if we were to restrict
                       k to equal i + 1, then
-                      this would be equivalent to the axis-sorting property
+                      this would be equivalent to the axis-dominance property
                       (property (v)) plus the requirement that the strides be
                       positive and sorted.
 
@@ -306,10 +326,10 @@ namespace tensor {
 
    Uniqueness property:  A property of a Pattern that no two different index-tuples,
                       when used to index the Pattern, generate the same memory-index.
-                      The axis-sorting property is sufficient, but not necessary,
+                      The axis-dominance property is sufficient, but not necessary,
                       to ensure the uniqueness property.  (The uniqueness property
                       is probably not so easy to test for efficiently in the general
-                      case; at least, we have not found a way).
+                      case).
 
     Valid Pattern:
                      A valid Pattern must be as follows.  Think of this as the mathematical definition;
@@ -321,7 +341,7 @@ namespace tensor {
                           (iii) the dims must all be >0.
                           (iv) the strides must be nonzero (but not necessarily positive) for axes with
                                 dim != 1.
-                          (v) the axis-sorting property.   This property is sufficient, but not
+                          (v) the axis-dominance property.   This property is sufficient, but not
                               necessary, to ensure the uniqueness property.  It requires that
                               when the axes are sorted from least to greatest value of abs(stride),
                               for each axis-index 0 <= i < num_axes - 1:
@@ -331,13 +351,15 @@ namespace tensor {
                           (vi) the strides must be zero for axes with dim=1.
 
 
-     Valid- Pattern:
-                      A Pattern is valid- if it satisfies properties (i) through (v) of
-                      a valid Pattern (i.e. it may have nonzero strides for axes with dim=1).
-                      A valid pattern is also valid-.
-     Valid-- Pattern:
-                      A Pattern is valid-- if it satisfies properties (i) through (iv) of
-                      a valid Pattern.  A pattern that is valid or valid- is also valid--.
+     Valid-1 Pattern:
+                      A Pattern is valid-1 (read as: valid minus one) if it
+                      satisfies properties (i) through (v) of a valid Pattern
+                      (i.e. it may have nonzero strides for axes with dim=1).  A
+                      valid pattern is also valid-1.
+     Valid-2 Pattern:
+                      A Pattern is valid-2 (read as valid minus two) if it
+                      satisfies properties (i) through (iv) of a valid Pattern.
+                      A pattern that is valid or valid-1 is also valid-2.
  */
 
 
@@ -357,7 +379,7 @@ namespace tensor {
   also don't allow zero dims (i.e. a Tensor that is initialized must not have
   num_elemnts==0).  If you want an empty Tensor, just use a null pointer.  In
   addition, we require that the stride equal zero for any axis that has dim = 1.
-  There is also the "axis-sorting" property (see its glossary entry for more info).
+  There is also the "axis-dominance" property (see its glossary entry for more info).
 
   Our requirements of a TensorPattern are:
 
@@ -374,7 +396,7 @@ namespace tensor {
 
     offset >= 0
 
-    The axis-sorting property (see property (v) in "Valid Pattern" above)
+    The axis-dominance property (see property (v) in "Valid Pattern" above)
 
   Note: in the public interface of class Tensor, if you ask for Dim(i) it will
   return pattern.dims[pattern.num_axes - i], i.e. the interface uses the public

From 493efff39a97ca8b57059cb6aa244c83874ebf5f Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 24 Apr 2019 12:06:14 -0400
Subject: [PATCH 024/163] [src] Lots more progress, still in flux

---
 src/tensor/array-ref.h                   |  25 ++
 src/tensor/op.h                          | 128 ++++++---
 src/tensor/tensor-common.h               |   5 +-
 src/tensor/tensor-impl.h                 |   9 +-
 src/tensor/tensor-pattern-extra-utils.cc |  18 ++
 src/tensor/tensor-pattern-extra-utils.h  |  17 +-
 src/tensor/tensor-pattern-utils-inl.h    |  46 ++++
 src/tensor/tensor-pattern.h              |  27 +-
 src/tensor/tensor.h                      | 104 +++++++-
 src/tensor/variable-inl.h                |  91 +++++++
 src/tensor/variable.h                    | 323 +++++++++--------------
 11 files changed, 533 insertions(+), 260 deletions(-)
 create mode 100644 src/tensor/tensor-pattern-utils-inl.h
 create mode 100644 src/tensor/variable-inl.h

diff --git a/src/tensor/array-ref.h b/src/tensor/array-ref.h
index 873496f34df..bf6cb79fa72 100644
--- a/src/tensor/array-ref.h
+++ b/src/tensor/array-ref.h
@@ -69,6 +69,10 @@ struct ArrayRef final {
   constexpr ArrayRef(const std::initializer_list<T> &vec):
       data(vec.data()), size(vec.size()) { }
 
+
+  T *begin() { return data; }
+  T *end() { return data + size; }
+
   // We will add iterators later if they are needed.
 };
 
@@ -77,5 +81,26 @@ struct ArrayRef final {
 std::string ArrayAsString(const ArrayRef<int32> a);
 
 
+
+/**
+   This template is a mechanism of keeping a collection of shared pointers, in
+   no particular order, with a mechanism to iterate over the list (typically
+   for purposes of dependency tracking, from Ops to Nodes and vice versa).
+
+ */
+template <class T, int BlockSize>
+class SharedPointerCollection {
+ public:
+  // TODO: the public interface has not been written yet.
+
+ private:
+  // A fixed block of BlockSize pointers, then an owning link to a further block.
+  std::shared_ptr<T> data[BlockSize];
+  std::unique_ptr<SharedPointerCollection<T,BlockSize> > next;
+};
+
+
+
+
 }  // namespace tensor
 }  // namespace kaldi
diff --git a/src/tensor/op.h b/src/tensor/op.h
index ce70c998d1c..fc3510f06f8 100644
--- a/src/tensor/op.h
+++ b/src/tensor/op.h
@@ -47,7 +47,7 @@ class Variable;
 class Op {
  public:
 
-  Op(): n_(GetCount()) { }
+  Op(): tick_(GetTick()) { }
 
   /// InputIteratorBegin() and InputIteratorEnd() form the begin and
   /// end points of a list of Variables that were inputs of this Op
@@ -57,35 +57,95 @@ class Op {
   /// get a cycle in the graph).  These Variables are expected to
   /// still have their graph information (i.e. sub-classes of class Op
   /// class must not call RemoveGraph() on the members of this list).
-  virtual Variable *InputIteratorBegin() = 0;
-  virtual Variable *InputIteratorEnd() = 0;
+  virtual Op *DepIteratorBegin() = 0;
+  virtual Op *DepIteratorEnd() = 0;
 
 
 
-  Op *GetTail() final;  // returns the tail (in a singly linked list of Ops for
-                        // this variable); this list will only have >1 element
-                        // only if in-place operations were done.  (If, later,
-                        // we need the shared_ptr to be returned from here, we
-                        // can change this code to return that; we just return
-                        // the raw pointer for efficiency.)
+  // This number >= 0 is used to determine the order of Ops in a graph; each
+  // time we generate an Op we increment a global counter.  Doing it this way,
+  // rather than via topological sorting, is simpler.
+  int64 GetTimestamp() const final { return tick_; }
 
+  virtual void Backprop();
+};
+
+
+template <class OpImpl>
+class OpPointer {
+
+  std::shared_ptr<OpImpl>
+
+}
+
+
+
+/**
+   This is a special version of base-class Op that is created when
+   any SharedGrad is allocated for a non-leaf Variable.  Its purpose
+   is to ensure that, when we get to this Op in the backprop, we deallocate
+   the data underlying the gradient Tensor (so we don't keep gradient
+   Tensors around for longer than is needed).
+*/
+class DeallocateOp: public Op {
+
+  // This operator has no dependencies as it will be created when a SharedGrad
+  // is first initialized, when no Ops have been done on it.
+  Op *DepIteratorBegin() override { return NULL; }
+  Op *DepIteratorEnd() override { return NULL; }
+
+  void Backprop() override {
+    if (auto s = tensor_to_deallocate_.lock())
+      ZeroDeallocating(s.get());
+  }
+
+ private:
+  // Since we just want to deallocate its underlying data, there is no point
+  // increasing its ref-count; we can just shrug our shoulders if it has
+  // already been deleted.
+  std::weak_ptr<Tensor> tensor_to_deallocate_;
+};
+
+
+/**
+   A slight simplification of class UnaryOp for cases where it's
+   done in-place.
+ */
+class InPlaceUnaryOp: public Op {
+
+};
 
-  // Connect this Op to Variable 'v', which is expected to be an output of this
-  // Op.  What this does is to ensure that 'v' has a TensorGrad object,
-  // and add this Op as the head of the 'ops_' list of v.
-  void ConnectToOutput(Variable *v) final;
 
-  // Checks that tail_ is currently nullptr, and sets tail_ to 'op'.
-  void SetTail(std::shared_ptr<Op> op) final;
+class UnaryOp: public Op {
+
+  //
+  UnaryOp(const Variable &input, const Variable &output) {
+    if
+
+
+
+    if (SameVariable(input, output)) {
+
+    } else {
+    }
+  }
+
+ public:
+
+  std::shared_ptr<Op> op1_;
+  std::shared_ptr<Op> op2_;
+
+
+
+
+}
+
+class GenericOp: public Op {
+
+  // GenericOp is a child of class Op that is intended as a generic base-class
+  // for expressions.
 
-  // This number is used to determine the order of Ops in a graph; each time we
-  // generate an Op we increment a global counter.  Doing it this way, rather
-  // than via topological sorting, avoids the need to think about having to
-  // track implicit dependencies such as when different Ops operate on Variables
-  // that are overlapping views into an underlying Variable.
-  int64 GetNumber() const final { return n_; }
 
-  virtual void Backward();
 
  protected:
   // Constructor, to be used from child classes.  This base-class takes care
@@ -95,10 +155,10 @@ class Op {
   //                   that are inputs to, but not outputs of, i.e. not modified
   //                   by, this Op).
   //  @param [in] output_var  The output Variable of this Op, i.e. the Variable
-  //                   which is modified or set by it.  We will provide another
+  //                   which is modified or set by it.  We may provide another
   //                   constructor taking ArrayRef<Variable> in this position,
-  //                   as and when we need to support Ops that take multiple
-  //                   output Variables.
+  //                   as and when we need to support Ops that operate on
+  //                   multiple output Variables.
   void Op(const ArrayRef<Variable> &input_vars,
           const Variable &output_var);
 
@@ -107,7 +167,6 @@ class Op {
   // that are not also outputs?  Could use that for graph traversal.
 
  private:
-  static int64 counter_{0};
 
   // num_inputs_ is the number of base Variables that are the base Variables of
   // inputs of this Op (but not of outputs).  These are stored in the
@@ -116,11 +175,17 @@ class Op {
   // inputs_ is a pointer to an array of shared_ptr<Variable> of size num_inputs_, which
   // will be be allocated by new [] in the constructor and deleted by delete []
   // in the destructor.
-  // This is a list of the unique Nodes that are the Nodes of inputs (but not outputs)
-  // of this
+
+  // This is a list of the Op-input-nodes (see glossary in tensor.h for explanation).
+  // We don't store the Op-output-nodes here; instead, they refer to this Op in
+  // their op_lists.
+  // (We don't store the Node(s) that is(are) the outputs of the Op here; its own
+  // op_list refers to this Op).
   std::shared_ptr<Node> *inputs_;
 
   int32 num_inputs_;
+
+  // If num_inputs_ is 1, then inputs_ is
   void *inputs_;
 
   int64 n_;  // initialized from the counter when this object is created.
@@ -142,8 +207,8 @@ class AddToOp: public Op {
   // involved.  Obviously alpha and beta are constants,
   // and differentiation w.r.t. them is not supported.
   //
-  // The Op is only constructed if b_.HasGrad() (which it
-  // would normally if a_.HasGrad()).
+  // The Op is only constructed if b_.Tracked() (which it
+  // would normally if a_.Tracked()).
   AddToOp(float alpha, float beta,
           const Variable &a, const Variable &b):
       Op({a}),
@@ -176,7 +241,10 @@ class AddToOp: public Op {
   // (here just a_) for dependency tracking.
   Variable a_;
 
+  std::shared_ptr<Node> a_node_;
+
   std::shared_ptr<Tensor> a_data_;
+  // a_grad_ will be NULL if a was not tracked.
   std::shared_ptr<Tensor> a_grad_;
   std::shared_ptr<Tensor> b_data_;
   std::shared_ptr<Tensor> b_grad_;
diff --git a/src/tensor/tensor-common.h b/src/tensor/tensor-common.h
index d55cdc09e9c..e58d5ef59c6 100644
--- a/src/tensor/tensor-common.h
+++ b/src/tensor/tensor-common.h
@@ -109,8 +109,11 @@ class WithDtypeAs {
   DataType prev_default_;
 };
 
+// Global variable, initialized from zero, that is used in GetTick().
+// This is defined in tensor-common.cc.
+extern int64 g_tick_counter;
 
-
+inline int64 GetTick() { return g_tick_counter++; }
 
 /// Enumeration that says what strides we should choose when allocating
 /// A Tensor.
diff --git a/src/tensor/tensor-impl.h b/src/tensor/tensor-impl.h
index 7b0594f3175..ba5b2cd8304 100644
--- a/src/tensor/tensor-impl.h
+++ b/src/tensor/tensor-impl.h
@@ -46,14 +46,15 @@ struct TensorImpl {
 
   inline int32 NumAxes() { return pattern.num_axes; }
 
-  // Returns the dimension on the supplied axis (using the public axis numbering)
+  // Returns the dimension on the supplied axis (using the *public* axis
+  // numbering).
   //  @param [in] axis  Axis on which dimension is required, with
   //                    -NumAxes() <= axis < NumAxes(); negative axis
   //                    is interpreted as an offset from NumAxes().
   //  @return        Returns the dimension on this axis, a number >= 1.
   inline int32 Dim(int32 axis);
 
-  // Returns the stride on the supplied axis (using the public axis numbering)
+  // Returns the stride on the supplied axis (using the *public* axis numbering)
   //  @param [in] axis  Axis on which stride is required, with
   //                    -NumAxes() <= axis < NumAxes(); negative axis
   //                    is interpreted as an offset from NumAxes().
@@ -63,7 +64,9 @@ struct TensorImpl {
 
 
   // Returns the data pointer corresponding to the element whose index
-  // is all zeros.
+  // is all zeros.  [TODO: maybe have overloads of this for different types.]
+  // CAUTION: this function may allocate the data if it has not yet been
+  // allocated.
   inline void* GetData() const;
 
 
diff --git a/src/tensor/tensor-pattern-extra-utils.cc b/src/tensor/tensor-pattern-extra-utils.cc
index c31794eee22..665e8895ec3 100644
--- a/src/tensor/tensor-pattern-extra-utils.cc
+++ b/src/tensor/tensor-pattern-extra-utils.cc
@@ -670,6 +670,24 @@ int64 TensorPatternRebaser::ConvertMemoryIndex(int64 m) {
 }
 
 
+// Note on implementation: likely the most common case we'll call this
+// is when -DKALDI_PARANOID has been set and we are checking that
+// tensors we are rebasing are strictly inside the source tensor.
+// So in the common case, pattern1 *will* include pattern2.
+int32 PatternIncludes(const TensorPattern &pattern1,
+                      const TensorPattern &pattern2) {
+  // pattern2 is a subset of pattern1 iff their intersection is all of pattern2.
+  std::vector<TensorPattern> intersection;
+  if (!ComputeIntersection(pattern1, pattern2, &intersection))
+    return -1;  // Could not determine whether the patterns intersect.
+  int64 num_elements = 0;  // assumes the returned patterns are disjoint.
+  for (const auto &pattern : intersection)
+    num_elements += NumElements(pattern);
+  if (num_elements == NumElements(pattern2))
+    return 1;  // pattern1 includes pattern2;
+  else
+    return 0;  // pattern1 does not include pattern2
+}
 
 
 }  // namespace kaldi
diff --git a/src/tensor/tensor-pattern-extra-utils.h b/src/tensor/tensor-pattern-extra-utils.h
index b94a8c85cdd..7d44f4ab302 100644
--- a/src/tensor/tensor-pattern-extra-utils.h
+++ b/src/tensor/tensor-pattern-extra-utils.h
@@ -58,12 +58,19 @@ bool PatternsIntersectSlow(const TensorPattern &pattern1,
 
 
 /**
-   Returns true if pattern2's memory-index-set is a subset of pattern1's
-   memory-index-set.  See glossary in tensor-pattern.h for explanation of
-   memory-index-set.
+   Returns information about whether pattern2's memory-index-set is a subset of
+   pattern1's memory-index-set.  See glossary in tensor-pattern.h for
+   explanation of memory-index-set.
+        @param [in] pattern1  First input pattern; must be valid.
+        @param [in] pattern2  Second input pattern; must be valid.
+        @return   Returns:
+            0 if we determined that pattern1 does not include pattern2
+            1 if we determined that pattern1 includes pattern2
+           -1 if we could not compute the intersection (so our
+              algorithm could not determine whether one included the other).
  */
-bool PatternIncludes(const TensorPattern &pattern1,
-                     const TensorPattern &pattern2);
+int32 PatternIncludes(const TensorPattern &pattern1,
+                      const TensorPattern &pattern2);
 
 
 /**
diff --git a/src/tensor/tensor-pattern-utils-inl.h b/src/tensor/tensor-pattern-utils-inl.h
new file mode 100644
index 00000000000..c23a81cc6a6
--- /dev/null
+++ b/src/tensor/tensor-pattern-utils-inl.h
@@ -0,0 +1,46 @@
+// tensor/tensor-pattern-utils-inl.h
+
+//  Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+// Do not include this header directly; it is only to be included by tensor-pattern-utils.h.
+
+#ifndef KALDI_TENSOR_TENSOR_PATTERN_UTILS_INL_H_
+#define KALDI_TENSOR_TENSOR_PATTERN_UTILS_INL_H_ 1
+
+
+namespace kaldi {
+namespace tensor {
+
+// Returns true if any stride is negative; see tensor-pattern-utils.h for docs.
+inline bool ContainsNegativeStride(const Pattern &pattern) {
+  // 2048 is 1 << 11; 11th bit in code is set if code indicates negative stride.
+  if (pattern.code >= 0 && (pattern.code & 2048) != 0)
+    return true;
+  int32 num_axes = pattern.num_axes;
+  for (int32 raxis = 0; raxis < num_axes; raxis++)
+    if (pattern.strides[raxis] < 0)
+      return true;
+  return false;
+}
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+#endif  // KALDI_TENSOR_TENSOR_PATTERN_UTILS_INL_H_
diff --git a/src/tensor/tensor-pattern.h b/src/tensor/tensor-pattern.h
index db1f06a02f6..4d0e57603df 100644
--- a/src/tensor/tensor-pattern.h
+++ b/src/tensor/tensor-pattern.h
@@ -23,9 +23,6 @@
 #include "tensor/tensor-common.h"
 #include <limits>
 
-/**
-   This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
-*/
 
 namespace kaldi {
 namespace tensor {
@@ -33,24 +30,23 @@ namespace tensor {
 
 
 /*
-  GLOSSARY
+  PATTERN GLOSSARY   (note: see also TENSOR GLOSSARY in tensor.h)
 
     Axis:             An axis is one of the (dim, stride) pairs that form part
-                      of a TensorPattern, and we often use the word "axis"
-                      to refer to the index of the axis, as in, for example,
+                      of a TensorPattern.  We will sometimes use the word "axis"
+                      to refer to the integer index of the axis, as in, for example,
                       in a Tensor with dims=[5 6 7], axis 0 has dim=5 and
-                      axis 2 has dim=7.  See also axis-index and raxis-index,
-                      which are more precise terms for the index of the axis
-                      and clearly disambiguate the numbering used (public
-                      numbering, or reversed private numbering).
-                      Caution: some other toolkits use the word 'dimension' where
-                      we use 'axis', but we avoid that usage because it is
-                      ambiguous.
+                      axis 2 has dim=7; but this should more precisely
+                      be called axis-index or raxis-index (see their own
+                      glossary entries; they respectively use the public
+                      numbering, or reversed private numbering).  To describe
+                      the number of axes of a Tensor, we use the term "num-axes" /
+                      "number of axes".
 
     Axis-index:       An axis-index of a Pattern or Tensor (sometimes just "axis" for short,
                       especially in code) is an index in the range [0, num_axes - 1]
                       that identifies an axis in the public numbering (see "Public numbering").
-                      See also: Raxis-index.
+                      For the index in the private numbering, see: Raxis-index.
 
     axis-dominance property: search below for [Valid Pattern], point (vi), for the main
                       definition.
@@ -189,6 +185,9 @@ namespace tensor {
                       and an index-tuple i, we may write q[i] = (p1[i], p2[i] p3[i]),
                       where expressions like p1[i] evaluate to a memory-index.
 
+    Num-axes:         The number of axes that a Tensor has.  This is a number in the
+                      range [0, KALDI_TENSOR_MAX_DIM], i.e. 0 through 6.
+
     Offset:           The memory-index of the element with index-tuple = (all zeros)
                       of a Tensor.  Offsets will always be >= 0 because they are to
                       be used as an index into a memory-region, and negative
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index 96eb8ea68f8..c97d49fac44 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -25,6 +25,97 @@
 #include "tensor/tensor-impl.h"
 #include "tensor/storage.h"
 
+
+/*
+   TENSOR GLOSSARY
+
+    Base Variable:  A Variable that is not a view into another Variable,
+             but has been created directly from a Tensor (or via Detach()).
+             Each Variable has a base Variable; a base Variable's
+             base Variable is itself.  See also: "View Variable".
+
+    Invalidated:  if some data used in backprop needs to have been unchanged since
+              a particular tick (as recorded in an Op), but it has been changed
+              since then, we say that it has been invalidated.  This is an error,
+              but it will only be detected in debug mode.  In effect we store a
+              record of what time (in ticks) data last changed at the
+              individual-element level (e.g. per float), via the ChangeTracker
+              object that is attached to the Storage object.  It's done in a
+              structured way, not via a huge boolean array.  This means that the
+              change-tracking mechanism is not defeated by doing Detach() or by
+              constructing multiple Variables from the same Tensor.
+
+    In-place operation: An operation that modifies a Variable, such as adding
+              to it after it has been created.  This notion is not particularly
+              meaningful in this framework, since in a sense all operations
+              are in-place operations; conceptually, the creation of a Variable
+              is seen as separate from an operation that sets it to some value,
+              and in-place operations are thus not "special".
+
+    Lazy allocation:  We do not allocate memory as soon as a Tensor is created,
+              but wait until an operation is done on it.  This makes it easier
+              to implement backprop with views of Tensors, because we can
+              construct views of Tensors whose memory has not been allocated yet.
+              The code for this happens in class Storage (see storage.h).  We
+              can also repeat this trick: on a base Variable, you can call
+              ZeroDeallocating(), which conceptually zeroes the Variable, but
+              does it by freeing the underlying data.  This enables the autograd
+              graph to be re-used without leaving too many things allocated.
+
+     Leaf Variable:  A leaf Variable is a Variable that you create directly
+             by wrapping a Tensor (or by calling .Detach()).  A leaf
+             Variable is always a base Variable.
+
+     Node:   A node in the autograd graph (Ops correspond-- roughly-- to the
+             edges in that graph).  There is a node for each tracked base variable.
+             [See also: Tracked; Base Variable].
+
+     Op:     (see op.h)  An operation on a Tensor (e.g. addition, multiplication, etc.),
+             including in-place operations.  Each Node in the autograd graph stores
+             a list of Ops that operated on that base Variable or some sub-part of
+             it.  However, if an Op modified two Nodes we need to call its Backprop()
+             only once; after figuring out which Ops need to be done, we call their
+             Backprop() in reverse order of their ticks (see: Tick).
+
+    Op-input-node:  Relative to a particular Op, a Node is an Op-input-node if
+            it is attached to at least one Variable that is an input of that Op,
+            but is not attached to any Variable that is an output of that Op.
+            An Op-input-node may not also be an Op-output-node (they are disjoint
+            sets).
+
+    Op-output-node: Relative to a particular Op, a Node is an Op-output-node
+            if it is attached to any Variable that is an output of that Op
+            (i.e. that is modified by that Op).
+
+
+    Tick:   a tick is the value of a global 64-bit time counter that we increment
+            every time we mutate a Tensor; see GetTick(), and
+            Op::GetTimestamp().  When we create Ops for backpropagation of
+            derivatives, we record the tick at which the Op was created, for
+            purposes of checking for invalidation (see: "Invalidated"), and
+            also of ordering Ops during backprop.
+
+   Tracked:  We say a Variable is tracked if gradient-tracking is
+             enabled for it.  This will be the case if it is
+             a leaf Variable constructed with requires_grad = true,
+             or a non-leaf Variable that has been created or changed
+             by an operation that depended on a tracked Variable.
+             A non-tracked Variable can become tracked but not vice
+             versa.  The granularity of being tracked is at the
+            "base variable" level.
+
+   View Variable:  A View Variable is any variable that is not a base
+            variable.  Such variables will be views of base Variables that have
+            been created from them by some operation such as slicing
+            (e.g. taking row or column ranges).
+
+
+
+
+ */
+
+
+
 namespace kaldi {
 namespace tensor {
 
@@ -286,21 +377,14 @@ class Tensor {
 
 
   /**
-     This constructor takes the 'impl' and 'storage' provided and returns
-     a Tensor containing them.  Intended for special-purpose code such
+     This constructor takes the 'impl' provided and returns
+     a Tensor containing it.  Intended for special-purpose code such
      as when we wrap arrays from external frameworks.
    */
-  Tensor(const TensorImpl &impl, std::shared_ptr<Storage> storage);
+  Tensor(const TensorImpl &impl);
 
  private:
-  // This object contains the num-axes, dims, strides and data pointer, plus
-  // cached properties.
   TensorImpl impl_;
-
-  // The storage region where the data resides storage_->data will equal
-  // impl_.data (we duplicate it in impl_ for convenence and to avoid an extra
-  // pointer dereference).
-  std::shared_ptr<Storage> storage_;
 };
 
 
diff --git a/src/tensor/variable-inl.h b/src/tensor/variable-inl.h
new file mode 100644
index 00000000000..42917014f0f
--- /dev/null
+++ b/src/tensor/variable-inl.h
@@ -0,0 +1,91 @@
+// tensor/variable-inl.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_VARIABLE_INL_H_
+#define KALDI_TENSOR_VARIABLE_INL_H_ 1
+
+// Do not include this file directly.  It is to be included from variable.h.
+
+namespace kaldi {
+namespace tensor {
+
+inline bool VariableImpl::Tracked() const {
+  if (!base_) {
+    return node_ != nullptr;
+  } else {
+    // View Variable: consult the base directly, since it may have become
+    // tracked since we last looked.  We deliberately do not re-cache node_
+    // and grad_ here, because this method is const; GetNode() and GetGrad()
+    // do that caching when they are called.
+    return base_->node_ != nullptr;
+  }
+}
+
+inline const std::shared_ptr<Node>& VariableImpl::GetNode() {
+  if (node_) {
+    return node_;
+  } else if (!base_) {
+    // This is a base Variable and we need to construct the node.
+    node_ = std::make_shared<Node>(*data_);
+    grad_ = node_->grad;
+    return node_;
+  } else {
+    // This is a view Variable
+    if (!base_->node_) {  // make node of base if needed
+      base_->node_ = std::make_shared<Node>(*base_->data_);
+      base_->grad_ = base_->node_->grad;  // (our own node_ is NULL here)
+    }
+    // cache node in view
+    node_ = base_->node_;
+    grad_ = node_->GetGradFor(*data_);  // Cache the grad too.
+    return node_;
+  }
+}
+
+
+inline const std::shared_ptr<Tensor>& VariableImpl::GetGrad() {
+  // The code is almost exactly the same as GetNode() above.  Note:
+  // We assume that either grad_ and node_ are both NULL, or both
+  // non-NULL.
+  if (grad_) {
+    return grad_;
+  } else if (!base_) {
+    // This is a base Variable and we need to construct the node.  (Assume it
+    // is not allocated if grad_ was not allocated).
+    node_ = std::make_shared<Node>(*data_);
+    grad_ = node_->grad;
+    return grad_;
+  } else {
+    // This is a view Variable
+    if (!base_->node_) {  // make node of base if needed
+      base_->node_ = std::make_shared<Node>(*base_->data_);
+      base_->grad_ = base_->node_->grad;  // (our own node_ is NULL here)
+    }
+    // cache node in view
+    node_ = base_->node_;
+    grad_ = node_->GetGradFor(*data_);
+    return grad_;
+  }
+}
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_VARIABLE_INL_H_
diff --git a/src/tensor/variable.h b/src/tensor/variable.h
index 08fb8b75a16..3f749652956 100644
--- a/src/tensor/variable.h
+++ b/src/tensor/variable.h
@@ -17,8 +17,8 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef KALDI_TENSOR_TENSOR_H_
-#define KALDI_TENSOR_TENSOR_H_ 1
+#ifndef KALDI_TENSOR_VARIABLE_H_
+#define KALDI_TENSOR_VARIABLE_H_ 1
 
 #include "tensor/variable.h"
 
@@ -26,201 +26,136 @@ namespace kaldi {
 namespace tensor {
 
 
-/**
 
-   Definitions:
-
-     Tracked:   A variable v is tracked if v.Grad() returns non-NULL.  By "tracked" we mean:
-                we are keeping autograd history.  Being tracked or not is actually a
-                property of the base variable (see "Base variable").
-    Debug mode:  Debug mode is a global bool, accessible via GetDebugMode().  When it is
-                true, we check for invalidated data in the backprop phase.  (This,
-                which performs the same function as version numbering in, say, PyTorch,
-                is quite a slow operation so we only enable it occasionally.
-   Base variable: A base Variable is a Variable that is not a sub-part (e.g. row or
-                column range) of another Variable.  Every Variable has a base Variable;
-                a base Variable is its own base Variable.  A base variable is
-                created when a Variable is initialized from a Tensor or when we
-                call .detach().  The base Variable is the unit at which we make
-                the decision "is this being tracked?".  As soon as a Variable
-                becomes tracked, all Variables sharing the same base Variable
-                become tracked.
-
-      tick:   a tick is the value of a global 64-bit time counter that we increment every
-              time we mutate a Tensor.  When we create Ops for backpropagation of
-              derivatives, we record the tick at which the Op was created.
-    invalidated:  if some data used in backprop needs to have been unchanged since
-              a particular tick (as recorded in an Op), but it has been changed
-              since then, we say that it has been invalided.  This is an error, but
-              it will only be detected in debug mode.  In effect we store a
-              record of what time (in ticks) data last changed at the individual-element
-              level, via the ChangeTracker object that is attached to the Storage
-              object (it's done in a structured way, not via a huge array).
-              This means that the change-tracking mechanism is not defeated by
-              doing .detach() or by constructing multiple Variables from the same
-              Tensor.
 
+// Shared data of a base Variable.  Each tracked base Variable gets one of
+// these; non-base Variables (views into other variables) share the Node of
+// their base Variable.
+class Node {
 
-*/
 
+ public:
+  /**
+     Construct a Node.
+          @param [in] tensor  The data_ of the base Variable to which this node
+                     is to be attached.  The created 'grad' will have the same
+                     dims but different Storage and possibly different strides.
+  */
+  explicit Node(const Tensor &tensor);
 
-void Add(const Variable &a, const Variable &b, Variable *c) {
-  // assumes c already correctly sized.
 
 
-  Add(a.data(), b.data(), &(c->data()));
+  /**
+     This is to be used when setting the grad_ member of view variables.  it
+     constructs a new Tensor with the appropriate pattern for the view, but
+     pointing to the storage of 'grad'.
+          @param [in] tensor  The data_ of the view Variable for which
+                      we are requesting the gradient Tensor.
+          @return     Returns a Tensor that is a view into 'grad', with
+                      the same relationship to it as 'tensor' had to
+                      its underlying Variable.
+  */
+  std::shared_ptr<Tensor> GetGradFor(const Tensor &tensor);
 
-  Variable *a_grad = a->grad(), *b_grad = b->grad(),
-      *c_grad = c->grad();
 
-  auto gradFunc = [a_grad,b_grad,c_grad] () {
-    a_grad->Add(*c_grad);
-    b_grad->Add(*c_grad);
-  }
+  /**
+     Sets the most recent Op held here (latest_op).  This is called whenever
+     an Op is created that changed a Variable attached to this Node.  The
+     Op itself will have a shared_ptr to the previous Op that was attached
+     to this Node.
+   */
+  inline void SetOp(const std::shared_ptr<Op> &op) { latest_op = op; }
 
-  c->SetGradFunc(gradFunc);
-  c->SetDependencies(a, b);
+  // The gradient.  This is set up when the Node is created, but the data in its
+  // Storage object won't necessarily have been allocated (see "Lazy Allocation"
+  // in tensor.h)
+  std::shared_ptr<Tensor> grad;
 
-}
+  // Either NULL, or an object capable of converting patterns from
+  // tensor to gradients (used for views).  Will be NULL in the usual
+  // case where the Tensor for this base Variable has the same strides
+  // and offset as the grad.
+  std::unique_ptr<TensorPatternRebaser> rebaser;
 
+  // latest_op is the most recent of the Ops that modified the base Variable
+  // this is attached to, or any view into it.
 
-/*
-  This is the 'gradient information' that class Variable stores
-  when it is initialized with requires_grad = true (or is a result of
-  an operation on Variables one of which had requires_grad = true).
-  The Variable holds it via a shared_ptr.
-  This does not give you access to the underlying Variable; doing it
-  like this makes reference counting easier (no loops).  The GradFunc
-  will store any pointers to the original Variable that it may have
-  needed.
+  // latest_op (will usually be NULL) is the head of a list of Ops that wrote to
+  // this Node (the most recent first).  In the backward pass we call Backprop()
+  // on each of these Ops in turn.  TODO: make it unique_ptr?
+  std::shared_ptr<Op> latest_op;
 
-  Users will rarely need to interact directly with this struct.
- */
-struct TensorGrad {
-  // The version of the underlying Tensor.  (this number in the TensorGrad
-  // mirrors that in the Variable; it's needed because TensorGrad's
-  // 'inputs' variable refers back to the TensorGrad and does not have
-  // access to the Variable).
-  int32 version;
-
-
-  struct InputInfo {
-    int32 version;  // the version of the input that we used.  Used so we can
-                    // check in the backprop that grad->version == version;
-                    // if not, the user did something we don't allow.
-    std::shared_ptr<TensorGrad> grad;
-  };
-
-  // The gradients corresponding to the input variables, which
-  // we may need to update.  Some subset of these may be nullptr,
-  // corresponding to input Variables for which no gradient
-  // was required.
-  std::vector<InputInfo> inputs;
-
-  // is_view is true only if the Variable underlying this TensorGrad
-  // is the result of an expression like foo.transpose() that creates
-  // a view to another Tensor.  In that case, the variables
-  // 'meta' and 'offset' become relevant, and when asked to create
-  // the 'grad' Variable, we won't allocate it directly but will
-  // instead create a view into inputs[0].grad->data.
-  bool is_view{false};
-
-  // grad_discarded will be set to true in the backprop when we are done
-  // with this->grad and have deallocated it.  If a future user
-  // attempts to reallocate the gradient, this will trigger an
-  // exception.
-  bool grad_discarded{false};
-
-  // This contains the meta-information of the Tensor for which this is the
-  // gradient (its 'data' pointer will be NULL).  Used to set up 'grad' with the
-  // correct dimension and strides when it is needed.
-  TensorMeta meta;
-  // Only if is_view == true, the offset (in elements) of the start of
-  // the Tensor described in 'meta' from the start of the source Tensor.
-  // Used in constructing 'grad'
-  int64 offset;
-
-  // This stores the gradient (if we already have one), or nullptr if not.
-  std::unique_ptr<Variable> data;
-
-  // The tail in a singly linked list of TensorGrads... used in case this
-  // Variable is a sum of several terms that were added together in-place.
-  std::unique_ptr<TensorGrad> tail;
-
-  // You call this function to ensure that the 'grad'
-  void EnsureGradAllocated();
+ private:
+  Node(const Node &other);  // Disallow copy construction
+  Node & operator = (const Node &other);  // Disallow assignment
 };
 
 
-
-struct TensorGradOp {
-  std::vector<std::shared_ptr<TensorGraph> > inputs;
-  std::vector<std::shared_ptr<TensorGrad> > outputs;
-
-
-  std::vector<std::shared_ptr<Variable> > vars_needed;
-
-  std::function<void()> op;
-
-
-  TensorGradOp(std::initializer_list<VariableRef> inputs_grads_needed,
-               std::initializer_list<VariableRef> output_grads_needed,
-               std::initializer_list<VariableRef> variables_needed,
-               std::function<void()> op);
-
-};
-
 /**
-   This contains the graph-related information stored with a Variable.
-   For Variables initialized with requires_grad = true, it's held
-   via shared_ptr as graph_.
+   Implementation class for Variable.  Variable is just a shared_ptr to this.
  */
-struct TensorGraph {
-  // creator_ops contains the op that created (or modified) the Variable that
-  // this TensorGraph is held by.  (If it modified this variable, 'tail' records
-  // any previous operations on it).
-  std::shared_ptr<TensorGradOp> creator_op;
+class VariableImpl {
 
-  std::shared_ptr<TensorGrad> grad;
+  inline const std::shared_ptr<Tensor> &GetData() const { return data_; }
 
-  std::shared_ptr<TensorGraph> tail;
-};
 
+  // Returns true if this Variable is tracked (see "Tracked" in the
+  // glossary in tensor.h).
+  inline bool Tracked() const;
 
-/**
-   GradFunc is the type that is passed into the constructor of Variable by a
-   function implementing some operation on Variables (addition, multiplication,
-   etc.).  It is at the core of the backprop mechanism, so we explain it here
 
- */
-typedef std::function<void(const Variable &grad, const std::vector<Variable> *input_grads)> GradFunc;
+  // Returns the most recent Op that modified this Variable's base Variable.
 
-typedef std::function<void(TensorGrad *grad)> GradHook;
+  inline const std::shared_ptr<Op> &GetOp();
 
+  // Returns the node in the autograd graph, as a shared_ptr; this creates it if
+  // it did not exist (so the Variable, and others sharing the same base
+  // Variable, will become tracked if it was not before).
+  inline const std::shared_ptr<Node> &GetNode();
 
-// This is an enum but will be used as if it were an int32,
-// as a bit pattern.
-// TODO: figure out proper way to do that.
-// It's used as an arg to the constructor of Variable
-class enum VariableInit {
-  CopyData = 1,
-  CopyGrad = 2,
-  CopyGraph = 4
-};
+  // Returns the Tensor corresponding to the gradient; like GetNode, this will
+  // make the Variable tracked if it were not tracked before.
+  inline const std::shared_ptr<Tensor> &GetGrad();
 
+ private:
+  // Creates the node in the autograd graph.  This must be a base Variable
+  // and the node must not already exist (i.e. we require node_ == NULL,
+  // base_ == NULL).
+  void CreateNode();
+
+  // The Tensor that this Variable wraps.  Will always be non-NULL.  (Lazy
+  // allocation may still happen in its Storage object, until we do something
+  // with it).
+  std::shared_ptr<Tensor> data_;
 
-// Shared data of a base Variable.  Each base Variable gets one of these; but
-// non-base Variables (views into other variables) share the Node of their base
-// Variable.
-struct Node {
-  // The gradient.
-  Tensor grad_;
+  // 'node_' is the node in the autograd graph, which is only allocated for
+  // tracked base Variables; otherwise it is NULL.  It is allocated at the time
+  // we realize we need gradient tracking, which might be when we create the
+  // Variable, or later on if an in-place operation on it has as input a tracked
+  // Variable.
+  //
+  // Non-base Variables cache the node of their base Variable, but if their node
+  // is requested and this pointer is NULL and base_ is non-NULL, we need to
+  // look at base_->node_ to re-check whether the base Variable is tracked, in
+  // case it became tracked since we last checked.
+  std::shared_ptr<Node> node_;
+
+  // A pointer to the gradient, or NULL if this Variable is not tracked (Note:
+  // like node_, this can get out of date if the base Variable becomes tracked,
+  // so if base_ != NULL, we need to re-check).  grad_ can then be created
+  // from the information in node_, once it exists; it is cached here
+  // for efficiency.  See "Lazy Allocation" in glossary in tensor.h;
+  // the underlying data may not have been allocated.
+  // grad_ and node_ are always either both NULL or both non-NULL.
+  std::shared_ptr<Tensor> grad_;
+
+
+  // 'base_' is NULL if this is a base Variable (i.e. not a view of another
+  // Variable); otherwise it points to the base Variable.  This also requires
+  // that class Variable store its VariableImpl as a shared_ptr.
+  std::shared_ptr<VariableImpl> base_;
 
 
-  // op_list_ (may be NULL) is the head of a list of Ops
-  // that wrote to this Node (most recent at the head).
-  // TODO: make it unique_ptr?
-  std::shared_ptr<Op> op_list_;
 };
 
 
@@ -233,7 +168,6 @@ class Variable;
 */
 class Variable {
 
-
   /** Constructor from a Tensor.
        @param [in] data  Pointer to the source Tensor.  Will accept a
                       raw Tensor* pointer, in which case it will construct a
@@ -272,6 +206,24 @@ class Variable {
    */
   std::shared_ptr<Variable> GetBaseVariable();
 
+
+  /**
+     Returns the most recent Op that modified the base Variable of this
+     Variable.  This will be called so the dependency can be recorded in other
+     Ops, and also when Backprop() has been called and we want to create the
+     list of Ops to do backprop on.
+   */
+  std::shared_ptr<Op> GetOp();
+
+
+  /**
+     Sets the most recent Op held in the Node underlying this Variable
+     to the Op held in this shared_ptr (which must not be NULL).  This
+     is done whenever we create an Op that modifies a particular
+     Variable.
+  */
+  void SetOp(const std::shared_ptr<Op> &op);
+
   /**
      Constructor that will be used by functions implementing mathematical
      operations on Variables.
@@ -291,41 +243,14 @@ class Variable {
            GradFunc grad_func);
 
 
-  // Returns true
-  bool Tracked() const;
-
+
 
 
  private:
-
-  // data_ is the Tensor underlying this Variable.
-  std::shared_ptr<Tensor> data_;
-
-  // base_ is the base Variable which is non-NULL only if this Variable is a
-  // view of an underlying Variable.  This needs to be tracked even if
-  // we are not yet tracking gradients, because if any Variable with a
-  // particular base becomes tracked, all such Variables do.
-  // If base_ is NULL, then this Variable is its own base variable.
-  std::shared_ptr<Variable> base_;
-
-  // grad_ is a pointer to the struct containing gradient information (for
-  // Variables that require a gradient; else NULL).  It may also be
-  // NULL because someone called this->RemoveGrad().
-  std::shared_ptr<TensorGrad> grad_;
-
-  // ops_ is the first in singly list of Ops for this Variable.  If this
-  // Variable is not its own base variable (i.e. if base_ != NULL), this will be
-  // NULL since the Ops are only stored in the base Variables.
-
-
-  (there will be just one element,
-  // unless in-place operations were done).
-  // Will be NULL if this Variable does not require a gradient or if someone
-  // called this->RemoveGraph().
-  std::shared_ptr<Op> ops_;
+  std::shared_ptr<VariableImpl> impl_;
 };
 
-typedef std::unique_ptr<Storage>
+
 
 
 
@@ -337,4 +262,8 @@ typedef std::unique_ptr<Storage>
 }  // namespace kaldi
 
 
+// Include implementation of inline functions.
+#include "variable-inl.h"
+
+
 #endif  // KALDI_TENSOR_VARIABLE_H_

From 2a852040a857b7253d174799324d4256d4dcb529 Mon Sep 17 00:00:00 2001
From: Hossein Hadian <hn.hadian@gmail.com>
Date: Sun, 28 Apr 2019 19:38:48 +0430
Subject: [PATCH 025/163] [src,egs] Update code related to pdf-class to be
 1-based. (#3278)

---
 egs/swbd/s5c/local/chain/tuning/run_tdnn_2r.sh | 2 +-
 src/bin/cluster-phones.cc                      | 6 +++---
 src/bin/draw-tree.cc                           | 2 +-
 src/hmm/transitions.cc                         | 8 +++-----
 src/hmm/transitions.h                          | 1 +
 src/tree/build-tree-utils.cc                   | 3 ++-
 src/tree/build-tree.cc                         | 5 +++--
 src/tree/context-dep.cc                        | 3 ++-
 8 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2r.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2r.sh
index 4bdc61ef0e5..9282b733946 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2r.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2r.sh
@@ -210,7 +210,7 @@ if [ $stage -le 11 ]; then
   # needed, as in this type of topology we only have a single pdf-class,
-  # numbered zero.
+  # numbered one.
   steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
-      --cluster-phones-opts "--pdf-class-list=0" \
+      --cluster-phones-opts "--pdf-class-list=1" \
       --leftmost-questions-truncate $leftmost_questions_truncate \
       --cmd "$train_cmd" 6000 data/$train_set data/lang_chain_2r $ali_dir $treedir
 fi
diff --git a/src/bin/cluster-phones.cc b/src/bin/cluster-phones.cc
index 1d5b3824252..24627ca3bfc 100644
--- a/src/bin/cluster-phones.cc
+++ b/src/bin/cluster-phones.cc
@@ -49,7 +49,7 @@ int main(int argc, char *argv[]) {
 
     // bool binary = true;
     int32 P = 1, N = 3; // Note: N does not matter.
-    std::string pdf_class_list_str = "1";  // 1 is just the central position of 3.
+    std::string pdf_class_list_str = "2";  // 2 is just the central position of 3.
     std::string mode = "questions";
     int32 num_classes = -1;
 
@@ -57,7 +57,7 @@ int main(int argc, char *argv[]) {
     // po.Register("binary", &binary, "Write output in binary mode");
     po.Register("central-position", &P, "Central position in context window [must match acc-tree-stats]");
     po.Register("context-width", &N, "Does not have any effect-- included for scripting convenience.");
-    po.Register("pdf-class-list", &pdf_class_list_str, "Colon-separated list of HMM positions to consider [Default = 1: just central position for 3-state models].");
+    po.Register("pdf-class-list", &pdf_class_list_str, "Colon-separated list of HMM positions to consider [Default = 2: just central position for 3-state models].");
     po.Register("mode", &mode, "Mode of operation: \"questions\"->sets suitable for decision trees; \"k-means\"->k-means algorithm, output k classes (set num-classes options)\n");
     po.Register("num-classes", &num_classes, "For k-means mode, number of classes.");
 
@@ -86,7 +86,7 @@ int main(int argc, char *argv[]) {
     std::vector<int32> pdf_class_list;
     if (!SplitStringToIntegers(pdf_class_list_str, ":", false, &pdf_class_list)
        || pdf_class_list.empty()) {
-      KALDI_ERR << "Invalid pdf-class-list string [expecting colon-separated list of integers]: " 
+      KALDI_ERR << "Invalid pdf-class-list string [expecting colon-separated list of integers]: "
                  << pdf_class_list_str;
     }
 
diff --git a/src/bin/draw-tree.cc b/src/bin/draw-tree.cc
index d107ab1cfac..f95478e7c52 100644
--- a/src/bin/draw-tree.cc
+++ b/src/bin/draw-tree.cc
@@ -33,7 +33,7 @@ void MakeEvent(std::string &qry, fst::SymbolTable *phone_syms,
     EventValueType value;
     if (key == kPdfClass) {
       value = static_cast<EventValueType>(atoi(valstr.c_str()));
-      if (value < 0) { // not valid pdf-class
+      if (value < 1) { // not valid pdf-class
         KALDI_ERR << "Bad query: invalid pdf-class (" << valstr << ')';
       }
     }
diff --git a/src/hmm/transitions.cc b/src/hmm/transitions.cc
index e802194be52..493a14f62d8 100644
--- a/src/hmm/transitions.cc
+++ b/src/hmm/transitions.cc
@@ -61,12 +61,12 @@ void Transitions::ComputeInfo(const ContextDependencyInterface &ctx_dep) {
     auto const &entry = topo_.TopologyForPhone(phone);  // an FST
     int num_states = entry.NumStates();
 
-    std::vector<StateId> state_to_self_loop_pdf_class(num_states, -1);  // TODO(hhadian): Define and use kNoPdf
+    std::vector<StateId> state_to_self_loop_pdf_class(num_states, kNoPdf);
     for (StateId state = 0; state < num_states; ++state)
       for (fst::ArcIterator<fst::StdVectorFst> aiter(entry, state); !aiter.Done(); aiter.Next()) {
         const fst::StdArc &arc(aiter.Value());
         if (arc.nextstate == state) {
-          KALDI_ASSERT(state_to_self_loop_pdf_class[state] == -1);  //kNoPdf Only 1 self-loop allowed.
+          KALDI_ASSERT(state_to_self_loop_pdf_class[state] == kNoPdf);  // Only 1 self-loop allowed.
           state_to_self_loop_pdf_class[state] = arc.ilabel;
         }
       }
@@ -76,10 +76,8 @@ void Transitions::ComputeInfo(const ContextDependencyInterface &ctx_dep) {
       for (fst::ArcIterator<fst::StdVectorFst> aiter(entry, state);
            !aiter.Done(); aiter.Next()) {
         const fst::StdArc &arc(aiter.Value());
-        int32 forward_pdf_class = arc.ilabel - 1,  // context-dep assumes classes are zero-based.
+        int32 forward_pdf_class = arc.ilabel,
             self_loop_pdf_class = state_to_self_loop_pdf_class[arc.nextstate];
-        if (self_loop_pdf_class != -1)
-          self_loop_pdf_class--;
         auto state_arc_pair = std::make_pair(state, aiter.Position());
         auto pdf_class_pair = std::make_pair(forward_pdf_class, self_loop_pdf_class);
         phone_to_arc_list[pdf_class_pair].push_back(state_arc_pair);
diff --git a/src/hmm/transitions.h b/src/hmm/transitions.h
index 0909189b88b..fa62c03bd21 100644
--- a/src/hmm/transitions.h
+++ b/src/hmm/transitions.h
@@ -32,6 +32,7 @@
 
 namespace kaldi {
 
+static const int kNoPdf = -1;  // Sentinel: "no pdf-class" (e.g. a state with no self-loop).
 
 // The class Transitions handles various integer mappings.
 // It used to be the home for the trainable transitions, but these
diff --git a/src/tree/build-tree-utils.cc b/src/tree/build-tree-utils.cc
index 254d7ec36d8..a1490db51bf 100644
--- a/src/tree/build-tree-utils.cc
+++ b/src/tree/build-tree-utils.cc
@@ -1024,7 +1024,8 @@ EventMap *GetStubMap(int32 P,
         }
       }
       std::map<EventValueType, EventAnswerType> m;
-      for (EventAnswerType p = 0; p < max_len; p++)
+      // Note: pdf-classes are 1-based.
+      for (EventAnswerType p = 1; p <= max_len; p++)
         m[p] = (*num_leaves_out)++;
       return new TableEventMap(kPdfClass,  // split on hmm-position
                                m);
diff --git a/src/tree/build-tree.cc b/src/tree/build-tree.cc
index 534f3352def..0234a607cba 100644
--- a/src/tree/build-tree.cc
+++ b/src/tree/build-tree.cc
@@ -76,7 +76,8 @@ void GenRandStats(int32 dim, int32 num_stats, int32 N, int32 P,
     for (int32 j = 0; j < hmm_length; j++) {
       // create event vector.
       EventType event_vec;
-      event_vec.push_back(std::make_pair(kPdfClass, (EventValueType)j));  // record the position.
+      // Use j+1 in next line because pdf-classes are 1-based.
+      event_vec.push_back(std::make_pair(kPdfClass, (EventValueType)(j + 1)));  // record the position.
       for (size_t pos = 0; pos < (size_t)N; pos++) {
         if (pos == (size_t)(P) || is_ctx_dep[phone_vec[P]])
           event_vec.push_back(std::make_pair((EventKeyType)pos, (EventValueType)phone_vec[pos]));
@@ -652,7 +653,7 @@ void AutomaticallyObtainQuestions(BuildTreeStatsType &stats,
                << "stats disappeared: the size changed from " << stats.size()
                << " to " << retained_stats.size() << ".  You might be using "
                << "a nonstandard topology but forgot to modify the "
-               << "--pdf-class-list option (it defaults to { 1 } which is "
+               << "--pdf-class-list option (it defaults to { 2 } which is "
                << "the central state in a 3-state left-to-right topology)."
                << " E.g. a 1-state HMM topology would require the option "
-               << "--pdf-class-list=0.";
+               << "--pdf-class-list=1.";
diff --git a/src/tree/context-dep.cc b/src/tree/context-dep.cc
index 7562abf29e2..9d224abfbe7 100644
--- a/src/tree/context-dep.cc
+++ b/src/tree/context-dep.cc
@@ -292,7 +292,8 @@ void ContextDependency::GetPdfInfo(
     KALDI_ASSERT(static_cast<size_t>(phone) < num_pdf_classes.size());
     EventAnswerType len = num_pdf_classes[phone];
 
-    for (int32 pos = 0; pos < len; pos++) {
+    // Pdf-classes are 1-based.
+    for (int32 pos = 1; pos <= len; pos++) {
       vec.resize(2);
       vec[0] = std::make_pair(static_cast<EventKeyType>(P_),
                               static_cast<EventValueType>(phone));

From af6a30b18419cc6148fb3981fea6b9400423f267 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Tue, 30 Apr 2019 17:47:08 -0400
Subject: [PATCH 026/163] [src] More drafting on tensor code

---
 src/tensor/change-tracker.h                 |  69 +++--
 src/tensor/config.h                         | 109 +++++++
 src/tensor/op.h                             |  26 ++
 src/tensor/scalar.h                         |  59 ++++
 src/tensor/storage.h                        |  26 +-
 src/tensor/tensor-common.cc                 |  27 ++
 src/tensor/tensor-common.h                  |  24 +-
 src/tensor/tensor-functions.h               | 311 +++++++++++++++-----
 src/tensor/tensor-impl-utils.h              |  96 +++---
 src/tensor/tensor-impl.h                    |  68 +++--
 src/tensor/tensor-pattern-extra-utils-inl.h |  29 --
 src/tensor/tensor-pattern-extra-utils.cc    |  46 +++
 src/tensor/tensor-pattern-extra-utils.h     |  74 ++++-
 src/tensor/tensor-pattern-utils.cc          | 184 ++++++++++--
 src/tensor/tensor-pattern-utils.h           | 136 +++++++--
 src/tensor/tensor-pattern.cc                |   2 +-
 src/tensor/tensor-pattern.h                 | 182 +++++++++---
 src/tensor/tensor-utils.h                   |  60 +++-
 src/tensor/tensor.h                         |  71 ++++-
 src/tensor/variable-inl.h                   |  93 ++++--
 src/tensor/variable.h                       | 187 +++++++-----
 src/util/text-utils.h                       |   2 +-
 22 files changed, 1457 insertions(+), 424 deletions(-)
 create mode 100644 src/tensor/config.h
 create mode 100644 src/tensor/scalar.h
 create mode 100644 src/tensor/tensor-common.cc

diff --git a/src/tensor/change-tracker.h b/src/tensor/change-tracker.h
index 1c0a1af1e63..51a73d5390b 100644
--- a/src/tensor/change-tracker.h
+++ b/src/tensor/change-tracker.h
@@ -32,14 +32,13 @@ namespace tensor {
 
 /**
    class ChangeTracker is something we only use in 'debug mode'.  Its purpose is
-   to keep track of when data was last changed
-
-   to make sure people don't mutate data via in-place operations in a way that
-   will invalidate the backprop.  This is a replacement for the 'version
-   numbering' of Variables used in PyTorch, i.e. it's a different way of solving
-   the same problem.  The mechanism is (I think) more exact than version
-   numbering, and less hassle for the calling code; but since it's slower, we
-   will only activate it occasionally.  c.f. SetDebugMode(), GetDebugMode().
+   to keep track of when data was last changed, to make sure people don't mutate
+   data via in-place operations in a way that will invalidate the backprop.
+   This is a replacement for the 'version numbering' of Variables used in
+   PyTorch, i.e. it's a different way of solving the same problem.  The
+   mechanism is (I think) more exact than version numbering, and less hassle for
+   the calling code; but since it's slower, we will only activate it
+   occasionally.  c.f. SetDebugMode(), GetDebugMode().
 
    When a computation requiring derivatives creates a graph that will (when
    Backprop()'d) require a certain Tensor's data to remain unchanged until
@@ -55,15 +54,10 @@ namespace tensor {
    Attempts to mutate that memory (assuming the code calls Mutate()) will cause a
    crash.  The solution would be to remove the offending in-place operation from
    your code.
-
  */
 class ChangeTracker {
  public:
 
-  // Increments the global counter and returns that value.
-  static uint64 GetTick();
-
-
   /** Constructor.  A Storage object is created for each allocated block of
       memory, and each Storage object has at most one ChangeTracker object.
 
@@ -77,7 +71,7 @@ class ChangeTracker {
   /**
      Record a change to this storage region at the current time (obtained by
      GetTick()).  Just appends it to the vector of changes after canonicalizing
-     the pattern.
+     the pattern.  Inlined since it's only called from Storage::RecordChange().
 
      @param [in] element_size  The size in bytes of the data type being stored
                              here: for example, 4 for float.
@@ -85,21 +79,21 @@ class ChangeTracker {
                             to canonical form (c.f. CanonicalizePattern())
                             before being stored.
    */
-
-  void RecordChange(int32 element_size,
-                    const TensorPattern &pattern);
+  inline void RecordChange(int32 element_size,
+                           const TensorPattern &pattern);
 
 
   /**
      Returns true if any element covered by this pattern has been
-     changed since the time given by 'tick'.
+     changed since the time given by 'tick'.  Inlined since it's only
+     called from Storage::ChangedSince().
 
       @param [in] tick  The time (obtained by GetTick()) since when
                      we want to know about changes
       @param [in] pattern  The pattern that we are checking
    */
-  bool ChangedSince(int64 tick,
-                    const TensorPattern &pattern);
+  inline bool ChangedSince(int64 tick,
+                           const TensorPattern &pattern);
 
  private:
 
@@ -115,19 +109,42 @@ class ChangeTracker {
   // to increase the risk of overflowing int32 storage).
   int64 element_size_;
 
+
   struct ChangeRecord {
     TensorPattern pattern;  // The pattern (offset, dims, strides) that was
                             // changed within this storage region.  This pattern
-                            // will have been reduced to canonical form.
-    int64 tick;             // The time, in ticks (c.f. tick_counter_) at which
-                            // this set of elements was changed.
+                            // will have been reduced to canonical form.  View
+                            // it as a memory-index-set (c.f. glossary in
+                            // tensor-pattern.h).
+
+    int64 tick;             // The time, in ticks (c.f. NextTick()) at which
+                            // this set of memory-indexes was changed.
+
+    // Next in a singly linked list of ChangeRecord.
+    std::unique_ptr<ChangeRecord> tail;
   };
 
 
-  // changes_ is a sequences of changes.
-  std::vector<ChangeRecord> changes_;
+  // Head of a singly linked list of changes.  When RecordChange() is called, we
+  // will add to the head of this (and then de-dupe; see doc for change_map_).
+  // When ChangedSince() is called, we will traverse it element by element until
+  // we get to the tick passed to ChangedSince, and if there is any overlap with
+  // the passed-in pattern, we'll return true.
+  std::unique_ptr<ChangeRecord> changes_;
+
+
+  // This is a map from a pointer to the TensorPattern in ChangeRecord::pattern
+  // (hashing the pattern itself, not the pointer value), to the ChangeRecord
+  // that holds it.  We actually map to the address of the std::unique_ptr
+  // pointing to that ChangeRecord, which might be the address of this->changes_
+  // or ChangeRecord::tail, because we need to be able to write to that to
+  // remove a ChangeRecord from the singly linked list.  This map is used
+  // in de-duping the list of changes, so that if someone provides the
+  // exact same pattern twice, we only keep the most recent tick; this
+  // keeps memory usage under control.
+  std::unordered_map<TensorPattern*, std::unique_ptr<ChangeRecord>*,
+                     TensorPatternPtrHasher, TensorPatternPtrEqual> change_map_;
 
-  static int64 tick_counter_{0};
 
 };
 
diff --git a/src/tensor/config.h b/src/tensor/config.h
new file mode 100644
index 00000000000..0d14c621bba
--- /dev/null
+++ b/src/tensor/config.h
@@ -0,0 +1,109 @@
+// tensor/config.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_CONFIG_H_
+#define KALDI_TENSOR_CONFIG_H_ 1
+
+#include <map>
+#include <string>
+#include "util/text-utils.h"
+
+namespace kaldi {
+namespace tensor {
+
+/**
+   This Config class is used when we want to store configuration information
+   inside Variables (e.g., to set per-parameter learning rates).
+   We'll eventually need mechanisms to read and write this.
+ */
+class Config {
+ public:
+
+  /**
+    This template will be defined only for types
+    `T = {std::string, bool, int32, float }`.
+
+      @param [in] key   The name of the config parameter we are setting,
+                      e.g. "learning-rate".   Must satisfy IsValidName(key),
+                      i.e. starts with `[a-zA-Z]`, and contains only characters
+                      `[a-zA-Z0-9_-]`.
+      @param [in] value  The value to be set, of type string, bool, int32 or
+                      float.  Any previous value (of whatever type) set for this
+                      key will be overwritten.
+  */
+  template<typename T>  void SetValue(const std::string &key,
+                                      const T &value);
+
+
+  /**
+    This template will be defined only for types
+    `T = {std::string, bool, int32, float }`.
+
+      @param [in] key   The name of the config parameter we are querying,
+                      e.g. "learning-rate".   Must satisfy IsValidName(key),
+                      i.e. starts with `[a-zA-Z]`, and contains only characters
+                      `[a-zA-Z0-9_-]`.
+      @param [out] value  The value to be set, of type string, bool, int32 or
+                      float.  If the key was not present in the map, we return
+                      false and don't set `value`.  If the key was present and
+                      the value was of a compatible type, we set `value`.
+                      If the key was present but the value was not of a
+                      compatible type, we die with an error.
+                      As for type compatibility: all types are compatible
+                      with themselves, and the only automatic conversion we
+                      do (so far) is from int to float.  We may add more
+                      conversions later if needed.
+      @return         Returns true if the key was in the map and of a compatible
+                      type, false if the key was not in the map.  (Dies
+                      if the key was present but with an incompatible type).
+  */
+  template<typename T> bool GetValue(const std::string &key,
+                                     T *value);
+
+
+ private:
+
+  enum ValueType { kStringValue, kBoolValue, kIntValue, kFloatValue };
+
+  struct ConfigElement {
+    ValueType value_type;
+    std::string str;
+    union  {
+      int32 i;
+      float f;
+      bool b;
+    } u;
+  };
+
+  // If we later end up storing many configuration values, we could change this
+  // to unordered_map, but in most cases it will only be one or two so that
+  // would be overkill.
+  std::map<std::string, ConfigElement> map_;
+};
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+// Include implementation of inline functions.
+#include "tensor/config-inl.h"
+
+
+#endif  // KALDI_TENSOR_CONFIG_H_
diff --git a/src/tensor/op.h b/src/tensor/op.h
index fc3510f06f8..caa23d57376 100644
--- a/src/tensor/op.h
+++ b/src/tensor/op.h
@@ -68,6 +68,32 @@ class Op {
   int64 GetTimestamp() const final { return tick_; }
 
   virtual void Backprop();
+
+ protected:
+  // The time (`NextTick()`) at which this Op was created.
+  int64 tick_;
+
+
+  inline void RegisterTensorChange(const Tensor &tensor) {
+    if (DebugMode()) {
+    }
+  }
+
+  /*
+    This function is intended to be called from the Backprop() routines
+    of child classes, for example:
+       ` if (DebugMode()) {  CheckTensorTime(*a_);  } `
+    This will die if the memory underlying the Tensor being checked has been
+    modified more recently than tick_.
+  */
+  inline void CheckTensorTime(const Tensor &tensor) {
+    if (DebugMode()) {
+    }
+  }
+
+
+
+
 };
 
 
diff --git a/src/tensor/scalar.h b/src/tensor/scalar.h
new file mode 100644
index 00000000000..dba2c183d9f
--- /dev/null
+++ b/src/tensor/scalar.h
@@ -0,0 +1,59 @@
+// tensor/scalar.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_SCALAR_H_
+#define KALDI_TENSOR_SCALAR_H_ 1
+
+#include "tensor/tensor-common.h"
+#include "tensor/tensor-pattern.h"
+#include "tensor/tensor-impl.h"
+#include "tensor/storage.h"
+
+
+
+namespace kaldi {
+namespace tensor {
+
+
+/**
+   Scalar is how we wrap user-supplied constant scalar values.  Right now this
+   basically wraps a double, but for future extensibility to ints, complex
+   numbers and so on, we make it a class.
+*/
+class Scalar {
+ public:
+  // Construct from a float or double; the value is stored as a double.
+  Scalar(float f): value_(f) { }
+  Scalar(double d): value_(d) { }
+
+  // Conversions back to built-in types (the float one narrows value_).
+  operator float() const { return static_cast<float>(value_); }
+  operator double() const { return value_; }
+  // DataType Dtype() { return dtype_; }
+ private:
+  double value_;
+  // DataType dtype_;
+};
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_SCALAR_H_
diff --git a/src/tensor/storage.h b/src/tensor/storage.h
index c4f88adbbbb..89794813b47 100644
--- a/src/tensor/storage.h
+++ b/src/tensor/storage.h
@@ -27,15 +27,21 @@
 namespace kaldi {
 namespace tensor {
 
-struct StorageExtras;
+struct StorageAux;
 
 // 'Storage' contains a single allocated region (on CPU or GPU, according
 // to 'device').
 class Storage {
  public:
-  // This returns a reference to the object held in this->locker if it is
-  // non-NULL; otherwise it allocates one and returns that.
-  ChangeTracker &GetChangeTracker();
+
+
+  void RecordChange(int32 element_size,
+                    const TensorPattern &pattern);
+
+
+  // This initializes a ChangeTracker object in this->tracker if it
+  // does not already exist, and returns its address.
+  ChangeTracker *GetChangeTracker();
 
   inline bool Allocated() {  return (data != NULL);  }
 
@@ -119,17 +125,17 @@ class Storage {
   Device device;
 
   // contains some extra, less-often-used fields
-  std::unique_ptr<StorageExtras> extras;
+  std::unique_ptr<StorageAux> extras;
 
 };
 
 
 
-// struct StorageExtras contains what (conceptually) are some rarely-needed
-// extra fields of class Storage; we store them separately, holding a
-// possibly-NULL pointer to struct StorageExtras, to reduce the size of struct
+// struct StorageAux contains some rarely-needed extra fields that we didn't
+// want to keep in class Storage; we store them separately, holding a
+// possibly-NULL pointer to struct StorageAux, to reduce the size of struct
 // Storage in the normal case.
-struct StorageExtras {
+struct StorageAux {
   using DeallocatorFunc = std::function<void()>;
 
   // 'tracker' is used in debug mode to detect when data that might be
@@ -141,6 +147,8 @@ struct StorageExtras {
   // If non-NULL, it will be invoked when we want to deallocate the
   // storage object.
   DeallocatorFunc deallocator;
+
+  // TODO: allow for some kind of name here, reflecting where it came from?
 };
 
 
diff --git a/src/tensor/tensor-common.cc b/src/tensor/tensor-common.cc
new file mode 100644
index 00000000000..8bcb4b22093
--- /dev/null
+++ b/src/tensor/tensor-common.cc
@@ -0,0 +1,27 @@
+// tensor/tensor-common.cc
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tensor/tensor-common.h"
+
+namespace kaldi {
+namespace tensor {
+int64 g_tick_counter = 0;  // Global tick counter; see NextTick() / CurrentTick().
+bool debug_mode = false;   // Off by default; see DebugMode() / SetDebugMode().
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/tensor-common.h b/src/tensor/tensor-common.h
index e58d5ef59c6..b0260a9bb80 100644
--- a/src/tensor/tensor-common.h
+++ b/src/tensor/tensor-common.h
@@ -89,6 +89,16 @@ enum DataType {
 };
 
 
+// Returns the size in bytes of a single element of data-type `dtype`.
+inline int32 SizeOf(DataType dtype) {
+  switch (dtype) {
+    case 0: return 4;  // presumably the float dtype -- TODO confirm, use enumerator
+    case 1: return 8;  // presumably the double dtype -- TODO confirm, use enumerator
+    default: KALDI_ERR << "Invalid data-type " << int32(dtype); return 0;
+  }
+}
+
+
 aDataType GetDefaultDtype();
 void SetDefaultDtype(DataType dtype);
 
@@ -113,7 +123,19 @@ class WithDtypeAs {
 // This is defined in tensor-common.cc.
 extern int64 g_tick_counter;
 
-inline int64 GetTick() { return g_tick_counter++; }
+inline int64 NextTick() { return ++g_tick_counter; }
+
+// ? Remove this?  To be used when you don't want to increment
+// the counter.
+inline int64 CurrentTick() { return g_tick_counter; }
+
+
+// debug_mode activates code that checks for invalidated data in the backprop
+// pass; see "Invalidated:" in glossary in tensor.h.
+extern bool debug_mode;
+inline bool DebugMode() { return debug_mode; }
+inline void SetDebugMode(bool b) { debug_mode = b; }
+
 
 /// Enumeration that says what strides we should choose when allocating
 /// A Tensor.
diff --git a/src/tensor/tensor-functions.h b/src/tensor/tensor-functions.h
index 47da508d857..83d18239222 100644
--- a/src/tensor/tensor-functions.h
+++ b/src/tensor/tensor-functions.h
@@ -202,124 +202,273 @@ inline void Select(int32 axis, int32 index, Tensor *t) {
 }
 
 
+/**
+   Scales each element of the Tensor `dest` by the scalar alpha.
+   Equivalent to a special case of CopyScaled() where src and dest
+   are the same.
+*/
+void Scale(Scalar alpha, const Tensor *dest);
 
 
+/**
+   Copy `src` to `dest` with broadcasting and possibly summation depending on
+   the dims.  Equivalent to a special case of AddTo() with `alpha == 1.0` and
+   `beta == 0.0`.
+
+   Formally equivalent to the following; for the notation, the most relevant
+   glossary entries in tensor-pattern.h are "Dereferencing a memory-index" and
+   "Memory-index-tuple-set of a Pattern-tuple".
+       (1)  For each memory-index `m` in `dest`, do: `*m = 0.0`
+       (2)  For each memory-index-tuple `(m_src, m_dest)` in the memory-index-tuple-set
+            `M(src, dest)`, do: `*m_dest += *m_src`.
+
+     @param [in] src     Source Tensor.
+     @param [out] dest   Destination Tensor.  Must satisfy
+                        `BroadcastableAndCompatible(src, *dest) && !Overlap(src, *dest)`
+ */
+void Copy(const Tensor &src, const Tensor *dest);
+
+/**
+   Copy with a scale, `dest := src * alpha`, where the scale is a
+   user-supplied scalar constant.
+   This copying may involve broadcasting and/or summation depending on the dims.
+   Equivalent to a special case of AddTo() with `beta == 0.0`.
+
+   Formally equivalent to the following; for the notation, the most relevant
+   glossary entries in tensor-pattern.h are "Dereferencing a memory-index" and
+   "Memory-index-tuple-set of a Pattern-tuple".
+       (1)  For each memory-index `m` in `dest`, do: `*m = 0.0`
+       (2)  For each memory-index-tuple `(m_src, m_dest)` in the memory-index-tuple-set
+            `M(src, dest)`, do: `*m_dest += alpha * *m_src`.
+
+     @param [in]  alpha   Scale used in the operation
+     @param [in]  src     Source Tensor.
+     @param [out] dest   Destination Tensor.  Must satisfy
+                        `BroadcastableAndCompatible(src, *dest) &&
+                         (!Overlap(src, *dest) || Identical(src, *dest))`
+ */
+void CopyScaled(Scalar alpha, const Tensor &src, const Tensor *dest);
+
+/**
+   Copy with a scale, where the scale is a Tensor that the user asserts has only
+   one element.  (E.g. a previously computed scalar value).
+
+   This copying may involve broadcasting and/or summation depending on the dims.
+   Equivalent to a special case of AddTo() with `beta == 0.0`.
+
+   Formally equivalent to the following; for the notation, the most relevant
+   glossary entries in tensor-pattern.h are "Dereferencing a memory-index" and
+   "Memory-index-tuple-set of a Pattern-tuple".
+       (1)  For each memory-index `m` in `dest`, do: `*m = 0.0`
+       (2)  For each memory-index-tuple `(m_src, m_dest)` in the memory-index-tuple-set
+            `M(src, dest)`, do: `*m_dest += alpha * *m_src`.
+
+     @param [in]  alpha   Scale used in the operation, supplied as a Tensor.
+     @param [in]  src     Source Tensor.
+     @param [out] dest   Destination Tensor.  Must satisfy
+                        `BroadcastableAndCompatible(alpha, src, *dest) &&
+                         (!Overlap(src, *dest) || Identical(src, *dest))`
+
+ */
+void CopyScaled(const Tensor &alpha, const Tensor &src, const Tensor *dest);
 
 /**
    Does
 
-      dest := alpha * src  +  beta * dest
+       dest := alpha * src  +  beta * dest
 
    while supporting broadcasting and summation, as dictated by the shapes
    of src and dest.  If beta == 0, guarantees that NaN's or inf's will
    not be propagated from the original data in 'dest' (so it works with
    uninitialized 'dest' if beta == 0).
 
-   Requires Broadcastable(src, *dest) and Compatible(src, *dest).
-   If src and dest have an integer Dtype, alpha and beta will
-   be cast to integers before the operation.
+   Requires `Broadcastable(src, *dest), Compatible(src, *dest)` and
+   `!Overlap(src, *dest) || Identical(src, *dest)`.  [Note: in the
+   case where `Identical(src, *dest)`, i.e. they are the same Tensor
+   with the same memory, you could also use Scale().]
+
+      @param [in] alpha  Scale on 'src'
+      @param [in] beta   Scale on 'dest'
+      @param [in] src    Source Tensor, to be added to 'dest'
+      @param [in,out] dest  Destination Tensor.  Must satisfy
+                     `BroadcastableAndCompatible(src, *dest) &&
+                     (!Overlap(src, *dest) || Identical(src, *dest))`.
 */
-void Add(float alpha, float beta, const Tensor &src, const Tensor *dest);
+void AddTo(Scalar alpha, Scalar beta, const Tensor &src, const Tensor *dest);
+
 
 /**
-  If possible, modifies the Tensor metadata to have the requested
-  dimensions.
-
-  The semantics are based on those of PyTorch's "view" or NumPy's
-  "reshape", except we try to be more accepting regarding the
-  acceptable striding of the input (see below).
-
-  Consider a Tensor 'a' has "C"-style strides.  Then this function will return
-  Tensor (say, 'b') that interprets the raw data of 'a' as an array with
-  "C"-style strides but with dimensions 'dims'.  (The product of 'dims' must
-  equal src.NumElements()).
-
-  Now consider a Tensor 'a2' that does not have "C"-style strides but
-  has the same elements as 'a' in the sense that a(i,j,k) == a2(i,j,k).
-  Then, *if possible*, this function will return a matrix b2 with
-  the same elements as b, e.g. b2(i,j,k) == b(i,j,k).  Of course, whether
-  this is possible depends on the details of the strides involved.
-
-  This function returns NULL if such a tensor could not be constructed.  In that
-  case,
-
-     @param   [in] dims  The dimensions that we want The tensor to have at
-                       exit; its product must equal t->NumElements().
-     @param   [in,out] t   The Tensor whose metadata is to be changed
-
-     @return  Returns true if it was possible to construct such a view, and
-              false otherwise.  If t->HasCStrides() is true at entry,
-              this function will never return false.  If this function returns
-              false, you will likely want to construct a temporary Tensor from t
-              with the same dimensions but "C"-style strides (see the
-              constructor of Tensor that accepts the 'dims' parameter), and copy
-              the data from t to that new Tensor.  You may then call View() on
-              the temporary Tensor, which is guaranteed to succeed.
-
-     Example:
+   Does
+
+       dest := alpha * src  +  beta * dest
+
+   while supporting broadcasting and summation, as dictated by the shapes
+   of src and dest.  If beta == 0, guarantees that NaN's or inf's will
+   not be propagated from the original data in 'dest' (so it works with
+   uninitialized 'dest' if beta == 0).
+
+   Requires `Broadcastable(src, *dest)`, alpha and beta
+   to have one element each, all args to be Compatible() with each other,
+   `!Overlap(src, *dest) || Identical(src, *dest)`, and for neither alpha
+   nor beta to overlap with src or dest.  [Note: in the
+   case where `Identical(src, *dest)`, i.e. they are the same Tensor
+   with the same memory, you could also use Scale().]
+
+      @param [in] alpha  Scale on 'src', supplied as a Tensor; must
+                         have exactly one element.
+      @param [in] beta   Scale on 'dest', supplied as a one-element Tensor.
+      @param [in] src    Source Tensor, to be added to 'dest'
+      @param [in,out] dest  Destination Tensor.  Must satisfy
+                     `BroadcastableAndCompatible(src, *dest) &&
+                     (!Overlap(src, *dest) || Identical(src, *dest))`.
+*/
+void AddTo(const Tensor &alpha, const Tensor &beta,
+           const Tensor &src, const Tensor *dest);
+
+
+
+
+
+/**
+  If possible, creates a new Tensor that has the requested dimensions,
+  as a 'view' of the provided Tensor; else returns NULL.  (For
+  explanation of the return type, see "Optional Tensor" in glossary
+  in tensor.h.)
+
+  The quick way to describe the semantics is: first, in the case where
+  'src' is laid out as a contiguous "C"-style array (w.r.t. the
+  public axis numbering), return a Tensor that's also a contiguous
+  "C"-style array looking at the same memory, with the provided
+  dims.  Then generalize this concept to when 'src' isn't laid out
+  as a "C"-style array, to preserve the same relationship between
+  the index-tuples that index "src" and the returned Tensor.
+
+  We can describe this more precisely as follows: Consider the index-tuple-set
+  I(src) of the pattern `src`; and let list(I(src)) be that set considered as a
+  list sorted according to the natural ordering (c.f. "Natural order of
+  index-tuples").  Let I(dest) be the index-tuple-set of a Pattern with the
+  provided dimensions `dims`, and let list(I(dest)) be that set considered as an
+  ordered list as above.  Extend the notion of indexing a Pattern
+  (c.f. "Indexing a Pattern") to accept, and return, ordered lists in the
+  obvious way.  Then this function attempts to return a pointer to a TensorImpl
+  sharing the same storage as 'src', having a Pattern with the provided dims
+  `dims` satisfying dest[list(I(dest))] = src[list(I(src))] if such a Pattern
+  exists; and if that is not possible, returns NULL.
+
+
+     @param   [in] src  The source Tensor that we are attempting to
+                        construct a view of
+     @param   [in] dims  The dimensions requested of the destination
+                        Tensor.  Must be list of positive integers of size
+                        not exceeding KALDI_TENSOR_MAX_DIM, whose product
+                        equals NumElements(src).  The order is according
+                        to the public numbering of axes.
+     @return            Returns a `shared_ptr<TensorImpl>` of the constructed
+                        view, or NULL if that was not possible.
+
+# TODO: check that the following is valid.
 <code>
     Tensor a({90}, kFloatDtype, kCpuDevice);
-    Tensor b(a);
-    bool ans = View({9,5,2}, &b);
-    KALDI_ASSERT(ans);
+    Tensor v = View(a, {9,5,2});  // Tensor constructor will crash if
+                                  // View returned NULL
 </code>
  */
-bool View(ArrayRef<int32> dims, Tensor *t);
+std::shared_ptr<TensorImpl> View(const Tensor &src, ArrayRef<int32> dims);
 
 
 /**
-   Attempts to modify a Tensor to contain a new view of its data, in which the
-   axes numbered axis1 and axis1 + 1 are merged.  This is just a special case of
-   View().
-
-   For example, if 't' is a Tensor with dims (3,4,5) and you call
-   MergeAxes(1, &t), this funtion will merge axes 1 and 2 and t will, at
-   exit, have shape (3 20), with elements arranged in 4 blocks of 5
-   elements each (i.e. axis 1 having the higher stride).
-
-       @param [in] axis1  The index of the first of the two axes which
-                          this function will attempt to merge.  Must
-                          be less than t->NumAxes() - 1.
-       @param [out] t     The Tensor to be modified; on success this
-                          will be a Tensor with axes merged as requested,
-                          sharing the data of 'src'.  On failure, it will
-                          not be changed.
-       @return            Returns true on success, false if the axes could
-                          not be merged.  It returns true if and only if
-                        `t->Stride(axis1 + 1)==t->Stride(axis1)*t->Dim(axis1)`
-
-     Example:
+   Attempts to create a Tensor containing a new view of the data in the source
+   Tensor in which the axes numbered
+   (axis1, axis1+1, ... axis1+num_axes_to_merge-1) are merged.  This is
+   a special case of View(), provided for convenience.  For explanation of
+   the return type, search for "Optional Tensor" in tensor.h.
+
+   This attempt will only succeed if
+   `src.Stride(axis1) == src.Stride(axis1 + 1) * src.Dim(axis1 + 1)`, i.e.
+   if the two axes were laid out like a "C"-style array.
+
+   More formally, we can express the relationship as follows.  Suppose this
+   function returns a Tensor called `dest`; and write d = src.Dim(axis1).
+   For an index-tuple i in I(src) [c.f.: "Index-tuple-set of a Pattern" in
+   tensor-pattern.h], split up its indexes as:
+      i = j + k + l
+   where '+' in this context means appending the tuples, and 'k' corresponds
+   to the range of axes (axis1, axis1+1, ... axis1+num_axes_to_merge-1).
+   Let K be the set of such k values encountered from splitting up each
+   i in I(src) this way, and let f be a function from tuples to integers
+   that maps list(K) to a sequence of consecutive integers starting from
+   zero (search for "list:" in tensor-pattern.h for explanation).
+   Let g be a function from tuples to possibly-shorter tuples that
+   maps j + k + l to j + (f(k),) + l, here using Python-like notation to
+   interpret (x,) as a tuple with a single element x and "+" meaning appending.
+   Then this function returns a Tensor sharing the same storage as `src`
+   and with a Pattern such that dest[g(i)] = src[i] for all i in I(src) and
+   I(dest) = g(I(src)).
+
+      @param [in] src  Source Tensor which we are attempting
+                      to construct a view of
+      @param [in] axis1  Axis-index, in the public numbering.
+                      Must satisfy 0 <= axis1 and
+                      axis1 + num_axes_to_merge <= src.NumAxes().
+                      The axes axis1 ... axis1+num_axes_to_merge-1 will be merged.
+      @param [in] num_axes_to_merge   Default: 2.  Must be >= 1;
+                      if 1, the returned Tensor will be the same
+                      as 'src'.
+      @return         Returns a new TensorImpl that can be used to
+                      construct a Tensor with the axes merged
+                      as requested, or NULL if that was not possible.
 <code>
     Tensor a({3,4,5}, kFloatDtype, kCpuDevice);
-    MergeAxes(0, &a);  // a now has dims {12,5}.
+    Tensor b = MergeAxes(a, 0);  // b has dims {12,5}.
 </code>
  */
-bool MergeAxes(int32 axis1, Tensor *t);
+std::shared_ptr<TensorImpl> MergeAxes(const Tensor &src, int32 axis1,
+                                      int32 num_axes_to_merge = 2);
 
 /**
-   Modifies a Tensor by splitting the axis numbered 'axis' into
-   multiple axes as supplied in the 'dims' array.
-   The interpretation will be as for a "C" array; so, for instance,
-   if the dimensions of 'src' were (10,12) and you called
+   Returns a Tensor, sharing the storage of `src`, in which the axis numbered
+   `axis` is split into multiple axes as supplied in the `dims` array.
+   The interpretation will be as for a "C"-style array; so, for instance,
+   if the dimensions of `src` were (10,12) and you called
    `SplitAxis(src, 1, 3, 4)` resulting in a Tensor of dimensions
    (10,3,4), the indexes along the original axis of dimension 12 would be
    interpreted as 3 blocks of size 4.  (This is the normal semantics
-   of things like NumPy's reshape or PyTorch's view.)
-
+   of things like NumPy's reshape or PyTorch's view.)  Note:
+   the strides in the returned Tensor will be negative if the stride
+   of axis `axis` of `src` was negative.
+
+   More formally the relationship is as follows (most readers will want to skip
+   this).  Let `dims` be the vector of dims supplied; let I(dims) be the
+   index-tuple-set of a Pattern with dimensions equal to `dims`; let
+   list(I(dims)) be that set ordered as in the natural ordering (c.f. "Natural
+   order of index-tuples" in tensor-pattern.h), and let f(i) be the function
+   from index-tuple to integers that when applied to list(I(dims)), produces a
+   sequence of consecutive integers starting from zero.  Let g be the
+   function from index-tuples to index-tuples that when applied on an
+   index-tuple i = (j, k, l), produces something like i = (j, k1, k2, k3, l)
+   where the tuple (k1,k2,k3) = f^{-1}(k), where of course f^{-1} is the inverse
+   function of f.  Then this function returns a Tensor `dest` sharing the same
+   storage as `src`, such that dest[g(i)] = src[i] for i in I(src) and
+   I(dest) = g(I(src))
+   (Relevant glossary entries in tensor-pattern.h to understand the notation
+   include "Index-tuple-set of a Pattern" and "Indexing a Pattern").
+
+      @param [in] src   The source Tensor whose axis is to be split
       @param [in] axis  The index of the axis to be split; must
-                       satisfy `0 <= axis < src.Dims().`
-      @param [in] dims  The dimensions desired in the axes to
+                        satisfy `0 <= axis < src.NumAxes()`.
+      @param [in] dims  The dimensions desired in the axes that
                         replace axis 'axis'.  Their product must
-                        equal the value of `t->Dim(axis)` at
-                        entry.
-      param [in,out] t   Tensor whose metadata is to be modified
-   Example:
+                        equal `src.Dim(axis)`.
+      @return           Returns a Tensor whose axis is split as
+                        requested.
+
+  Example:
 <code>
   Tensor a({10,3}, kFloatDtype, kCpuDevice);
-  SplitAxis(0, {2,5}, &a);  // a now has dims {2,5,3}.
+  Tensor b = SplitAxis(a, 0, {2,5});  // b has dims {2,5,3}.
 </code>
 */
-void SplitAxis(int32 axis, ArrayRef<int32> dims, Tensor *t);
+Tensor SplitAxis(const Tensor &src, int32 axis, ArrayRef<int32> dims);
 
 
 
diff --git a/src/tensor/tensor-impl-utils.h b/src/tensor/tensor-impl-utils.h
index 7d5592b1e46..0a7a3f762af 100644
--- a/src/tensor/tensor-impl-utils.h
+++ b/src/tensor/tensor-impl-utils.h
@@ -32,52 +32,31 @@ namespace kaldi {
 namespace tensor {
 
 
-// This function returns true if a and b have the same dtype
-// and device.  See also Broadcastable().
+/**
+  This function returns true if a and b have the same dtype
+  and device.  See also Broadcastable().
+*/
 inline bool Compatible(const TensorImpl &a, const TensorImpl &b);
 
 
-// This function returns true if a and b have the same dtype
-// and device; equivalent to Compatible(a, b) && Compatible(b, c).
+/**
+  This function returns true if a, b and c have the same dtype
+  and device; equivalent to Compatible(a, b) && Compatible(b, c).
+*/
 inline bool Compatible(const TensorImpl &a, const TensorImpl &b,
                        const TensorImpl &c);
 
 
 
-
-//  This function moves the 'data' pointer stored in 't' by adding
-//  a number of elements equal to 'offset'.  It casts it to the
-// type specified in t->dtype so the memory address changes by
-// the right amount.
-inline void AddToPointer(int64 offset, TensorImpl *t) {
-  switch(t->dtype) {
-    case kFloatDtype:
-      t->data = static_cast<void*>(static_cast<float>(t->data) + offset);
-      return;
-    case kDoubleDtype:
-      t->data = static_cast<void*>(static_cast<double>(t->data) + offset);
-      return;
-    default:
-      KALDI_ERR << "Unknown data type";
-  }
-}
-
-
 /**
-   This function allocates the appropriate storage for the Tensor described
-   in 'impl', and sets is 'data' pointer to the allocated memory address.
-   It returns the address a newly allocated Storage object which manages
-   the memory location; you will probably want to construct a
-   std::unique_ptr<Storage> from this so that when it goes out of scope,
-   the memory will be freed.
+   This function creates the appropriate storage object for the Tensor described
+   in 'impl', and sets impl->storage to that value.  Due to lazy allocation (see
+   "Lazy allocation" in glossary in tensor.h) the underlying memory won't be
+   allocated, but the meta-information is set up.
 
       @param [in,out] impl   The TensorImpl object we are allocating for.
-                      Any previous value of impl->data is ignored and
-                      overwritten.
-                      It is required that that the product of dims in
-                      impl->pattern be nonzero (i.e. that the pattern
-                      is initialized to a valid value), and that its
-                      dtype and device values be set.
+                      Any previous value of impl->storage is ignored and
+                      overwritten.  Must satisfy impl->IsValid(false).
       @return         Returns a newly allocated Storage object that
                       manages this memory block.  When this object is deleted,
                       the memory block will be deallocated using a
@@ -87,17 +66,8 @@ inline void AddToPointer(int64 offset, TensorImpl *t) {
 
    See also AllocateTensorDataShared().
  */
-Storage *AllocateTensorData(TensorImpl *impl);
-
+void CreateTensorStorage(TensorImpl *impl);
 
-/**
-   This function is as AllocateTensor(), except that the Storage
-   object returned is allocated via std::make_shared (which involves
-   just one heap allocation, as opposed to two if you constructed
-   the shared_ptr from the Storage* pointer).  See the documentation
-   for AllocateTensor() for more details.
- */
-std::shared_ptr<Storage> AllocateTensorDataShared(TensorImpl *impl);
 
 
 
@@ -157,7 +127,6 @@ inline void Transpose(int32 axis1, int32 axis2, TensorImpl *t) {
 }
 
 
-
 /**
    This is like PyTorch's slice() / narrow() functions.
    It selects a range of dimensions on one of the axes.  It is similar to
@@ -174,7 +143,8 @@ inline void Transpose(int32 axis1, int32 axis2, TensorImpl *t) {
    See also: the other overloaded version of Slice() which accepts the 'step'
    parameter; and Select(), which also reduces the num-axes.
  */
-void Slice(int32 axis, int32 start, int32 end, TensorImpl *t);
+void Slice(int32 axis, int32 start, int32 end, const TensorImpl &src,
+           TensorImpl *dest);
 
 
 /**
@@ -197,22 +167,42 @@ void Slice(int32 axis, int32 start, int32 end, TensorImpl *t);
 
    See the other version of Slice(), and Select().
  */
-void Slice(int32 axis, int32 start, int32 end, int32 step, TensorImpl *t);
+void Slice(int32 axis, int32 start, int32 end, int32 step,
+           const TensorImpl &src, TensorImpl *dest);
 
 
 /**
-   Select one element from an axis of TensorImpl 't', reducing t->NumAxes() by
-   one.
+   Copy metadata from one TensorImpl to another, while modifying it
+   by selecting one index from a specified axis of the TensorImpl `src`, reducing
+   the num_axes by one.
 
        @param [in] axis Axis from which to select an element; require
-                         -t->NumAxes() <= axis < t->NumAxes(), with negative
+                         `-t->NumAxes() <= axis < t->NumAxes()`, with negative
                          axis interpreted as an offset from t->NumAxes().
        @param [in] index  Index in t to select; must be in range
                           [0, t->Dim(axis) - 1].
-       @param [in,out]  t   TensorImpl whose metadata is to be modified.
+       @param [in] src    TensorImpl which is to be copied
+       @param [out] dest  TensorImpl which we are copying to.  It is allowed
+                          to be the same object as 'src'.
+*/
+void Select(int32 axis, int32 index, const TensorImpl &src,
+            TensorImpl *dest);
+
+
+/**
+   If DebugMode() is on, passes the element size (SizeOf(impl.dtype)) and impl.pattern
+   to RecordChange() on the change-tracker of impl's storage; otherwise does nothing.
  */
-void Select(int32 axis, int32 index, TensorImpl *t);
+inline void RegisterTensorChange(const TensorImpl &impl) {
+  if (DebugMode()) {
+    impl.storage->GetChangeTracker()->RecordChange(
+        SizeOf(impl.dtype), impl.pattern);
+  }
+}
 
+inline int64 NumElements(const TensorImpl &a) {  // forwards to NumElements() on a's TensorPattern
+  return NumElements(a.pattern);
+}
 
 
 
diff --git a/src/tensor/tensor-impl.h b/src/tensor/tensor-impl.h
index ba5b2cd8304..cd196ca2608 100644
--- a/src/tensor/tensor-impl.h
+++ b/src/tensor/tensor-impl.h
@@ -26,6 +26,17 @@
 namespace kaldi {
 namespace tensor {
 
+// Metadata for a Tensor.  It's occasionally convenient to have this
+// in a struct (it's the same as a TensorImpl without the 'storage' pointer).
+// The members must stay in sync with the corresponding members of
+// TensorImpl, as we have code that does reinterpret_cast on
+// these types.  (We don't use base-classing as it would make the code
+// harder to read).
+struct TensorMeta {  // member order mirrors the leading members of TensorImpl
+  TensorPattern pattern;
+  DataType dtype;
+  Device device;
+};
 
 /**
    TensorImpl is the core part of a Tensor, without the wrapping code and
@@ -39,10 +50,10 @@ struct TensorImpl {
   TensorPattern pattern;
   DataType dtype;
   Device device;
-  std::shared_ptr<Storage> data;  // 'data' points to a shared Storage object
-                                  // that contains (or eventually will contain,
-                                  // due to lazy allocation) the actual data
-                                  // pointer.
+  std::shared_ptr<Storage> storage;  // 'storage' points to a shared Storage object
+                                     // that contains (or eventually will contain,
+                                     // due to lazy allocation) the actual data
+                                     // pointer.
 
   inline int32 NumAxes() { return pattern.num_axes; }
 
@@ -70,26 +81,43 @@ struct TensorImpl {
   inline void* GetData() const;
 
 
-  // Returns true if this TensorImpl is valid, false otherwise.  It is an
-  // implied requirement of functions operating on TensorImpl's, that all
-  // TensorImpl's that are provided input or input+output arguments to functions
-  // must have IsValid() == true.
-  bool IsValid() { return pattern.IsValid(true) && data != nullptr; }
-};
+  // Returns true if this TensorImpl is valid, false otherwise.  A Tensor is
+  // valid if its TensorPattern is valid, its dtype and device are valid
+  // (e.g. enums in the correct range), and (if check_storage) that the storage
+  // object is non-NULL and the memory range covered by the pattern is within
+  // the num_bytes of the storage.
+  bool IsValid(bool check_storage = true);
+
+  // Views the leading members (pattern, dtype, device) of this object as a TensorMeta.
+  const TensorMeta &Meta() const {
+    return reinterpret_cast<const TensorMeta&>(*this);  // relies on TensorMeta staying in sync with TensorImpl
+  }
+
+  // Constructor that is used when taking the meta-info from one source
+  // but the storage from another.
+  TensorImpl(const TensorMeta &meta,
+             const std::shared_ptr<Storage> &storage);
+
+  // Constructor that copies the meta-info provided; if create_storage
+  // == true it creates the storage object, else leaves it NULL.
+  TensorImpl(const TensorMeta &meta,
+             bool create_storage = true);
+
+
+
+  // Constructor that is used when taking the meta-info from one source
+  // but the storage from another; this version does move-construction
+  // on 'storage'.
+  TensorImpl(const TensorMeta &meta,
+             std::shared_ptr<Storage> &&storage);
+
+  // Default constructor
+  TensorImpl() { }
 
-// Metadata for a Tensor.  It's occasionally convenient to have this
-// in a struct (it's the same as a Tensor without the 'data' pointer.
-// The members must stay in sync with the corresponding members of
-// TensorImpl, as we have code that does reinterpret_cast on
-// these types.  (We don't use base-classing as it would make the code
-// harder to read).
-struct TensorMeta {
-  TensorPattern pattern;
-  DataType dtype;
-  Device device;
 };
 
 
+
 inline int32 TensorImpl::Dim(int32 axis) {
   if (axis < 0) {
     // it will usually be known whether axis < 0 at compile time, since it's
diff --git a/src/tensor/tensor-pattern-extra-utils-inl.h b/src/tensor/tensor-pattern-extra-utils-inl.h
index 78c40d13ea2..e0af4f49b55 100644
--- a/src/tensor/tensor-pattern-extra-utils-inl.h
+++ b/src/tensor/tensor-pattern-extra-utils-inl.h
@@ -28,35 +28,6 @@
 namespace kaldi {
 namespace tensor {
 
-inline void ComputeMinAndMaxMindex(const TensorPattern *pattern,
-                                   int64 *min_mindex,
-                                   int64 *max_mindex) {
-  KALDI_PARANOID_ASSERT(IsValid(pattern));
-  int32 num_axes = pattern.num_axes;
-  if (ContainsNegativeStride(pattern.code)) {
-    // The if-statement above may be read as "if either pattern.code is -1 or it
-    // indicates that `pattern` contains a negative stride.  That is, at this
-    // point we know that `pattern` *might* contain a negative stride.
-    int64 min_mindex_sum = 0, max_mindex_sum = 0;
-    for (int32 raxis = 0; raxis < num_axes; raxis++) {
-      int64 prod (pattern.dims[raxis] - 1) *
-          static_cast<int64>(pattern.strides[raxis]);
-      if (pattern.strides[raxis] > 0) max_mindex_sum += prod;
-      else min_mindex_sum += prod;
-    }
-    *min_mindex = min_mindex_sum;
-    *max_mindex = max_mindex_sum;
-  } else {
-    // This is a faster branch of the code that can assume all strides are
-    // positive.
-    *min_mindex = 0;
-    int64 max_mindex_sum = 0;
-    for (int32 raxis = 0; raxis < num_axes; raxis++)
-      max_mindex_sum += (pattern.dims[raxis] - 1) *
-          static_cast<int64>(pattern.strides[raxis]);
-    *max_mindex = max_mindex_sum;
-  }
-}
 
 
 }  // namespace tensor
diff --git a/src/tensor/tensor-pattern-extra-utils.cc b/src/tensor/tensor-pattern-extra-utils.cc
index 665e8895ec3..74a1ad75c21 100644
--- a/src/tensor/tensor-pattern-extra-utils.cc
+++ b/src/tensor/tensor-pattern-extra-utils.cc
@@ -690,5 +690,51 @@ bool PatternIncludes(const TensorPattern &pattern1,
 }
 
 
+void MakeContiguousAndJustified(const TensorPattern &src,
+                                TensorPattern *dest) {
+  // (Interface documented in tensor-pattern-extra-utils.h.)
+  KALDI_PARANOID_ASSERT(src.IsValid());
+
+  int32 num_axes = src.num_axes;
+  dest->num_axes = num_axes;  // 'dest' has the same num_axes and dims as 'src'.
+  // The sorter object provides an order in which we can visit the axes of 'src'
+  // that is from least to greatest abs(stride).
+  OutOfPlaceAxisSorter sorter(src);
+
+  int64 offset = 0;  // 'offset' will be the offset that ensures 'dest' is
+                     // justified (see glossary in tensor-pattern.h for
+                     // definition).
+  int32 next_abs_stride = 1;
+  for (int32 i = 0; i < num_axes; i++) {
+    int32 raxis = sorter.GetIndex(i);
+    // We are going through the raxis-indexes in increasing order of stride.
+    // We'll set each stride to the product of the preceding dims.
+    int32 this_stride = src.strides[raxis],
+        this_dim = src.dims[raxis];
+    dest->dims[raxis] = this_dim;
+    if (this_stride == 0) {
+      dest->strides[raxis] = 0;
+      // Note: if 'src' is valid, this implies the dim is 1,
+      // so no need to multiply 'next_abs_stride'
+    } else {
+      int32 abs_stride = std::abs(this_stride);
+      KALDI_PARANOID_ASSERT(abs_stride >= next_abs_stride &&
+                            "Input pattern was not valid.");
+      if (this_stride < 0) {
+        offset += static_cast<int64>(next_abs_stride) * (this_dim - 1);
+        dest->strides[raxis] = -next_abs_stride;
+      } else {
+        dest->strides[raxis] = next_abs_stride;
+      }
+      next_abs_stride *= this_dim;
+    }
+  }
+  dest->offset = offset;
+  KALDI_PARANOID_ASSERT(IsContiguousAndJustified(*dest) &&
+                        IsValid(*dest));
+}
+
+
+
 }  // namespace kaldi
 }  // namespace tensor
diff --git a/src/tensor/tensor-pattern-extra-utils.h b/src/tensor/tensor-pattern-extra-utils.h
index 7d44f4ab302..1701eb8d6d4 100644
--- a/src/tensor/tensor-pattern-extra-utils.h
+++ b/src/tensor/tensor-pattern-extra-utils.h
@@ -162,8 +162,6 @@ bool PatternContains(const TensorPattern &pattern,
    a pattern's memory-index-set (i.e. the minimum and maximum
    indexes into the underlying array).
 
-   This is inlined for speed; see tensor-pattern-extra-utils-inl.h.
-
       @param [in] pattern  The pattern whose minimum and maximum
                            memory-index we are computing
       @param [out] min_mindex  The minimum memory-index in the
@@ -177,9 +175,9 @@ bool PatternContains(const TensorPattern &pattern,
                            memory-index-set of the pattern.
                            Will always be >= min_mindex.
 */
-inline void ComputeMinAndMaxMindex(const TensorPattern &pattern,
-                                   int64 *min_mindex,
-                                   int64 *max_mindex);
+void ComputeMinAndMaxMindex(const TensorPattern &pattern,
+                            int64 *min_mindex,
+                            int64 *max_mindex);
 
 
 /**
@@ -301,6 +299,32 @@ bool ConvertPatternStrides(const TensorPattern &pattern,
                            const ArrayRef<int32> strides,
                            std::vector<TensorPattern> *patterns);
 
+/**
+   This function fills in any 'gaps' in the memory-indexes in 'src' and
+   shifts so the lowest memory-index is 0, copying the resulting pattern
+   to 'dest'.  It is used when constructing gradient Tensors for
+   base Variables whose data Tensor is not contiguous and justified.
+
+   The more mathematical description is as follows:
+   Let m be the memory-index-set of `src`, and let f
+   be the function that maps m to the set  \f$ [0, |m|-1] \f$ while
+   preserving the ordering of the elements.  Then the relationship
+   between 'src' and 'dest' is that 'dest' has the same num_axes and
+   dims as 'src', and the strides are such as to satisfy
+   \f$  dest[i] = f(src[i]) \f$,
+   where i is a valid Index-tuple for `src`.  See "Indexing a Pattern"
+   in the glossary in tensor-pattern.h for explanation of this notation.
+
+         @param [in] src  The source pattern.  Must be valid.
+         @param [out] dest  The destination pattern.  Will be identical
+                        to `src` if `IsContiguousAndJustified(src)`, else
+                        will have the relationship explained above.
+                        Will satisfy `IsContiguousAndJustified(*dest)`,
+                        and also `IsValid(*dest)`, assuming `IsValid(src)`.
+ */
+void MakeContiguousAndJustified(const TensorPattern &src,
+                                TensorPattern *dest);
+
 
 /**
    Class TensorPatternRebaser is an object that converts TensorPattern
@@ -426,6 +450,46 @@ class TensorPatternRebaser {
 
 };
 
+/**
+   This object is to be instantiated when you want to know what permutation
+   you'd get if you were to change the ordering of axes so that the abs(stride)
+   were strictly increasing.  (Note: this is not a total order if there are >1
+   axes with stride=0, so the ordering may be somewhat arbitrary).
+
+   See the documentation for its GetIndex() function.
+ */
+class OutOfPlaceAxisSorter {
+ public:
+  // Constructor.  Computes the permutation; `src` itself is not modified.
+  inline OutOfPlaceAxisSorter(const TensorPattern &src) {
+    int32 num_axes = src.num_axes;
+    for (int32 raxis = 0; raxis < num_axes; raxis++)
+      orig_raxis_[raxis] = raxis;
+    std::sort(orig_raxis_, orig_raxis_ + num_axes,
+              // a comparator (less-than) operator implemented as a lambda is
+              // below.  Sort from least to greatest abs(stride), disambiguating
+              // based on dim.  `src` is captured by reference to avoid a copy.
+              [&src] (int32 raxis1, int32 raxis2) {
+                int32 abs_stride1 = std::abs(src.strides[raxis1]),
+                    abs_stride2 = std::abs(src.strides[raxis2]);
+                if (abs_stride1 < abs_stride2) return true;
+                else if (abs_stride1 > abs_stride2) return false;
+                else return (src.dims[raxis1] < src.dims[raxis2]);
+              });
+  }
+  // Returns the 'source' raxis-index for a particular destination
+  // raxis-index, e.g.:  `src_raxis = GetIndex(dest_raxis)`.
+  // Copying as e.g. `dest.strides[dest_raxis] = src.strides[src_raxis]`,
+  // and the same for the dims, would give you a `dest` with axes
+  // sorted from smallest to greatest abs(stride).
+  inline int32 GetIndex(int32 raxis) const { return orig_raxis_[raxis]; }
+
+ private:
+  int32 orig_raxis_[KALDI_TENSOR_MAX_DIM];
+};
+
+
+
 
 }  // namespace tensor
 }  // namespace kaldi
diff --git a/src/tensor/tensor-pattern-utils.cc b/src/tensor/tensor-pattern-utils.cc
index 95461311e6f..015752f25b6 100644
--- a/src/tensor/tensor-pattern-utils.cc
+++ b/src/tensor/tensor-pattern-utils.cc
@@ -54,6 +54,37 @@ int32 ComputePatternCode(const TensorPattern &pattern) {
 }
 
 
+void ComputeMinAndMaxMindex(const TensorPattern &pattern,
+                            int64 *min_mindex,
+                            int64 *max_mindex) {
+  KALDI_PARANOID_ASSERT(IsValid(pattern));
+  int32 num_axes = pattern.num_axes;
+  if (ContainsNegativeStride(pattern.code)) {
+    // The if-statement above may be read as "if either pattern.code is -1 or it
+    // indicates that `pattern` contains a negative stride".  That is, at this
+    // point all we know is that `pattern` *might* contain a negative stride.
+    int64 min_mindex_sum = pattern.offset, max_mindex_sum = pattern.offset;
+    for (int32 raxis = 0; raxis < num_axes; raxis++) {
+      int64 prod = (pattern.dims[raxis] - 1) *
+          static_cast<int64>(pattern.strides[raxis]);
+      if (pattern.strides[raxis] > 0) max_mindex_sum += prod;
+      else min_mindex_sum += prod;
+    }
+    *min_mindex = min_mindex_sum;
+    *max_mindex = max_mindex_sum;
+  } else {
+    // This is a faster branch of the code where we know that all strides are
+    // nonnegative, so the lowest memory-index is just the offset.
+    *min_mindex = pattern.offset;
+    int64 max_mindex_sum = pattern.offset;
+    for (int32 raxis = 0; raxis < num_axes; raxis++)
+      max_mindex_sum += (pattern.dims[raxis] - 1) *
+          static_cast<int64>(pattern.strides[raxis]);
+    *max_mindex = max_mindex_sum;
+  }
+}
+
+
 
 /**
    This utility function used in CompressPatterns() normalizes the signs of the
@@ -380,37 +411,44 @@ void SortAxes(TensorPattern *pattern) {
     case 0: case 1:
       return;
     case 2:
-      if (pattern->strides[0] > pattern->strides[1]) {
+      // Implement this as a special case, avoiding a temporary
+      if (pattern->strides[0] > pattern->strides[1] ||
+          (pattern->strides[0] == pattern->strides[1] &&
+           pattern->dims[0] > pattern->dims[1])) {
         std::swap(pattern->strides[0], pattern->strides[1]);
         std::swap(pattern->dims[0], pattern->dims[1]);
-        pattern->code = -1;
       }
+      pattern->code = -1;
       return;
     default: {
-      // This is bubble sort, which might seem super inefficient, but it avoids
-      // the need to create a temporary of pairs (or implement an appropriate
-      // in-place sort); and since num_axes will rarely be more than about 3,
-      // and never more than 6, I don't think the speed will be a problem.
-      while (true) {
-        bool changed = false;
-        for (int32 i = 0; i < num_axes - 1; i++) {
-          if (pattern->strides[i] > pattern->strides[i + 1]) {
-            std::swap(pattern->strides[i], pattern->strides[i + 1]);
-            std::swap(pattern->dims[i], pattern->dims[i + 1]);
-            changed = true;
-          }
-        }
-        if (changed)
-          pattern->code = -1;
-        else
-          return;
+      std::pair<int32,int32> dims_strides[KALDI_TENSOR_MAX_DIM];
+      for (int32 i = 0; i < num_axes; i++) {
+        dims_strides[i].first = pattern->dims[i];
+        dims_strides[i].second = pattern->strides[i];
       }
+      std::sort(dims_strides, dims_strides + num_axes,
+                // below is a C++11 lambda used as a comparator function, like
+                // the operator a < b.
+                [] (const std::pair<int32,int32> &a,
+                    const std::pair<int32,int32> &b) {
+                  int32 abs_stride_a = std::abs(a.second),
+                      abs_stride_b = std::abs(b.second);
+                  if (abs_stride_a < abs_stride_b) return true;  // a < b; sort on abs(stride) first.
+                  else if (abs_stride_a > abs_stride_b) return false;  // a > b
+                  else return (a.first < b.first);
+                  // sort on dim if strides are the same
+                  // (which should only be for stride=0 for any valid Pattern).
+                });
+      for (int32 i = 0; i < num_axes; i++) {
+        pattern->dims[i] = dims_strides[i].first;
+        pattern->strides[i] = dims_strides[i].second;
+      }
+      pattern->code = -1;
+      return;
     }
   }
 }
 
-}
-
 void Transpose(int32 raxis1, int32 raxis2, TensorPattern *p) {
   if (static_cast<uint32>(raxis1) >= static_cast<uint32>(p->num_axes) ||
       static_cast<uint32>(raxis2) >= static_cast<uint32>(p->num_axes)) {
@@ -498,5 +536,109 @@ int64 NumElements(const TensorPattern &pattern) {
   return ans;
 }
 
+void Select(int32 eaxis, int32 index,
+            const TensorPattern &src, TensorPattern *dest) {
+  KALDI_PARANOID_ASSERT(src.IsValid());
+  int32 num_axes = src.num_axes,
+      raxis = EaxisToRaxis(eaxis, num_axes);
+  if (static_cast<uint32>(raxis) >= static_cast<uint32>(num_axes) ||
+      static_cast<uint32>(index) >= static_cast<uint32>(src.dims[raxis])) {
+    // If raxis is not in the range [0, num_axes - 1] or the index
+    // is not in the range [0, src.dims[raxis] - 1]...
+    KALDI_ERR << "Invalid args to Select(): axis=" << eaxis
+              << " index=" << index << " vs. pattern dims="
+              << DimsAsString(src);
+  }
+  dest->num_axes = num_axes - 1;
+  for (int32 r = 0; r < raxis; r++) {
+    dest->dims[r] = src.dims[r];
+    dest->strides[r] = src.strides[r];
+  }
+  dest->offset = src.offset + static_cast<int64>(index) * src.strides[raxis];
+  for (int32 r = raxis + 1; r < num_axes; r++) {
+    dest->dims[r - 1] = src.dims[r];
+    dest->strides[r - 1] = src.strides[r];
+  }
+  for (int32 r = num_axes - 1; r < KALDI_TENSOR_MAX_DIM; r++) {
+    dest->dims[r] = 1;
+    dest->strides[r] = 0;
+  }
+  dest->code = -1;
+  dest->properties = 0;
+}
+
+void Slice(int32 eaxis, int32 start, int32 end, TensorPattern *pattern) {
+  int32 num_axes = pattern->num_axes,
+      raxis = EaxisToRaxis(eaxis, num_axes);
+  KALDI_PARANOID_ASSERT(pattern->IsValid());
+  if (static_cast<uint32>(raxis) >= static_cast<uint32>(num_axes) ||
+      start < 0 || end <= start ||
+      end > pattern->dims[raxis]) {
+    // If raxis is not in the range [0, num_axes - 1] or start is negative or
+    // (end <= start) or end is greater than pattern->dims[raxis]...
+    KALDI_ERR << "Invalid args to Slice(): axis=" << eaxis
+              << " start=" << start << " end=" << end << " vs. pattern dims="
+              << DimsAsString(*pattern);
+  }
+  // Moving the start of the range shifts the memory-index of element zero.
+  pattern->offset += static_cast<int64>(pattern->strides[raxis]) * start;
+  int32 new_dim = end - start;
+  pattern->dims[raxis] = new_dim;
+  if (new_dim == 1) {
+    // An axis with dim == 1 is given stride 0 (this must be an assignment,
+    // not a comparison).
+    pattern->strides[raxis] = 0;
+    // The dim and the stride of this axis may both just have changed, so a
+    // cached code may be stale.  Mark it as unknown (-1), as SortAxes() and
+    // Select() do, rather than patching individual bits of it here.
+    //
+    // NOTE(review): an incremental update of the code would be slightly
+    // faster, but needs the exact bit layout used by ComputePatternCode()
+    // (which bit corresponds to raxis 0, etc.) -- confirm before restoring.
+    //
+    // (If new_dim != 1, this axis had dim != 1 both before and after and its
+    // stride is unchanged, so the code is left as it was.)
+    pattern->code = -1;
+  }
+  KALDI_PARANOID_ASSERT(pattern->IsValid());
+}
+
+
+void UnsqueezeR(int32 raxis, const TensorPattern &src, TensorPattern *dest) {
+  int32 num_axes_in = src.num_axes;
+  KALDI_ASSERT(num_axes_in < KALDI_TENSOR_MAX_DIM &&
+               static_cast<uint32>(raxis) <= static_cast<uint32>(num_axes_in));
+  KALDI_PARANOID_ASSERT(IsValid(src));
+  if (&src != dest) {
+    dest->offset = src.offset;  // unaffected by inserting a trivial axis.
+    for (int32 r = 0; r < raxis; r++) {
+      dest->dims[r] = src.dims[r];
+      dest->strides[r] = src.strides[r];
+    }
+    for (int32 r = num_axes_in + 1; r < KALDI_TENSOR_MAX_DIM; r++) {
+      dest->dims[r] = 1;
+      dest->strides[r] = 0;
+    }
+  }
+  dest->num_axes = num_axes_in + 1;
+  for (int32 r = num_axes_in; r > raxis; r--) {
+    // shift axes up by one; go in reverse order in case (&src == dest)
+    dest->dims[r] = src.dims[r - 1];
+    dest->strides[r] = src.strides[r - 1];
+  }
+  // The unsqueezed axis.
+  dest->dims[raxis] = 1;
+  dest->strides[raxis] = 0;
+  if (raxis != num_axes_in) {
+    dest->code = -1;
+    dest->properties = 0;
+  } else if (&src != dest) {
+    // code would be unaffected if raxis == num_axes_in.
+    dest->code = src.code;
+    dest->properties = src.properties;
+  }
+  KALDI_PARANOID_ASSERT(dest->IsValid());
+}
+
 }  // namespace kaldi
 }  // namespace tensor
diff --git a/src/tensor/tensor-pattern-utils.h b/src/tensor/tensor-pattern-utils.h
index eaf512faf73..d2643f54f87 100644
--- a/src/tensor/tensor-pattern-utils.h
+++ b/src/tensor/tensor-pattern-utils.h
@@ -41,6 +41,18 @@ inline bool ContainsNegativeStride(int32 pattern_code) {
   return (pattern_code | 2048) != 0;
 }
 
+
+/**
+   This function converts an eaxis-index into an raxis-index, with no error
+   checking (you would normally check afterward that the raxis-index is in the
+   correct range).  Find "Eaxis-index:" and "Raxis-index:" in tensor-pattern.h,
+   but basically an eaxis-index is an axis-index in the public numbering where
+   negative values mean offsets from the end (so eaxis = -1 maps to raxis = 0).
+ */
+inline int32 EaxisToRaxis(int32 eaxis, int32 num_axes) {
+  return (eaxis < 0 ? -1 - eaxis : num_axes - 1 - eaxis);
+}
+
 /**
    Returns true if the pattern code indicates that the pattern contains a
    negative stride.  Caution: will return true if pattern_code was -1, so if you
@@ -53,7 +65,7 @@ inline bool ContainsNegativeStride(int32 pattern_code) {
                      -1 (meaning: not known), or if the code
                      indicates that a negative stride was present.
 */
-inline bool PattenMightContainNegativeStride(
+inline bool PatternMightContainNegativeStride(
     const TensorPattern &pattern) {
   // 2048 is 1 << 11; 11th bit in code is set if code indicates negative stride.
   return (pattern.code | 2048) != 0;
@@ -191,27 +203,20 @@ inline int64 CombineCodes(int32 code1, int32 code2, int32 code3) {
 
 
 /**
-   Modifies 'p' in-place by inserting an axis with (dim=1,stride=0) at the
-   specified position specified in the reversed numbering physically used
-   in the pattern.  Updates p->code.
-
-   Showing just the dims in the pattern (in the order physically present in the
-   dims array), for some examples:
-
-\verbatim
-    UnsqueezeR({3,4}, 0)  -> {1,3,4}
-    UnsqueezeR({3,4}, 1)  -> {3,1,4}
-    UnsqueezeR({3,4}, 2)  -> {3,4,1}
-\endverbatim
+   Copies a TensorPattern from `src` to `dest` while modifying it by inserting
+   an axis with (dim=1,stride=0) at position `raxis` (specified in the
+   private numbering).
 
      @param [in]    raxis   The index at which the extra axis is to appear.
                             We require 0 <= raxis <= p->num_axes.
-     @param [in,out] p      The pattern to which we are adding an axis.
-                            Will have its num_axes increased by 1
-                            at exit, possibly its dims and strides
-                            arrays changed, and its code updated.
+     @param [in]    src    The source pattern.  Must be valid and have
+                           NumAxes() < KALDI_TENSOR_MAX_DIM.
+     @param [out]   dest   The destination pattern.  Is allowed to be the same
+                           object as `src`.  Will be valid at exit if src
+                           was valid at entry (which this function may not
+                           check).
  */
-void UnsqueezeR(int32 raxis, TensorPattern *p);
+void UnsqueezeR(int32 raxis, const TensorPattern &src, TensorPattern *dest);
 
 
 /**
@@ -230,19 +235,16 @@ void UnsqueezeR(int32 raxis, TensorPattern *p);
     Unsqueeze([9,10], -1) -> [9,10,1]
 \endverbatim
 
-     @param [in]    axis   The index at which the extra axis is to appear.
-                           We require -p->num_axes - 1 <= raxis <= p->num_axes
-                           The large allowable range is because negative
-                           axes are permitted, e.g. -1 means insert a new
-                           axis after the last existing axis.
+     @param [in]    eaxis   The axis-index at which the extra axis is to appear,
+                           with negatives allowed (see: "Eaxis-index" in glossary
+                           in tensor-pattern.h).
      @param [in,out] p      The pattern to which we are adding an axis.
                             Will have its num_axes increased by 1
                             at exit, possibly its dims and strides
                             arrays changed, and its code updated.
  */
-inline void Unsqueeze(int32 axis, TensorPattern *p) {
-  if (axis < 0) UnsqueezeR(1 - axis, p);
-  else UnsqueezeR(p->num_axes - axis, p);
+inline void Unsqueeze(int32 eaxis, TensorPattern *p) {
+  UnsqueezeR(EaxisToRaxis(eaxis, p->num_axes + 1), *p, p);  // +1: new num_axes
 }
 
 /**
@@ -532,11 +534,19 @@ int64 NumElements(const TensorPattern &pattern);
    nonempty and all have the same number of axes), by ordering them from the
    most negative stride value in patterns[0] to the most positive stride value
    in patterns[0], using the strides in the other patterns to disambiguate the
-   order only in case of ties, which could only happen if some strides were
-   zero.  I.e. it's a lexical ordero the strides of the patterns.  Note: the
-   most-negative-to-most-positive ordering is in terms of the private, `raxis`
-   numbering; it would be most-positive-to-most-negative in the public
-   numbering.
+   order only in case of ties (which could only happen if some strides were
+   zero), and then the dims in the same order if the strides are all the same
+   (the strides would only be the same if they were zero, if the patterns were
+   valid).  Roughly, it's a lexical order on the (strides, then dims) of the
+   patterns.  Note: the most-negative-to-most-positive ordering is in terms of
+   the private, `raxis` numbering; it would be most-positive-to-most-negative in
+   the public numbering.
+
+   TODO: work out what the ordering should be; should it really be negative-to-
+   positive, or based on abs(stride), and do we need disambiguation with the
+   dims?
+
+   TODO: do we even need this??
 
      @param [in,out]  The patterns whose axes are to be sorted.  All
                     will have their axes subject to the same permutation.
@@ -708,6 +718,45 @@ bool CreateViewPattern(const TensorPattern &pattern_in,
                        ArrayRef<int32> dims,
                        TensorPattern *pattern_out);
 
+
+/**
+   This is like PyTorch's slice() / narrow() functions.
+   It selects a range of dimensions on one of the axes.  It is similar to
+   indexing with a range in Python, like A[10:20].
+
+      @param [in] eaxis  Eaxis-index (see glossary in tensor-pattern.h) on which
+                         to possibly reduce the dimensionality.
+      @param [in] start  Starting index; must be in range [0, t->Dim(eaxis) - 1]
+      @param [in] end    Ending index; must be in the range [start + 1, t->Dim(eaxis)]
+      @param [in,out] pattern  TensorPattern to be modified.  Will be valid at
+                         exit if it was valid at entry.
+
+   See also: the other overloaded version of Slice() which accepts the 'step'
+   parameter; and Select(), which is similar but also reduces the num-axes.
+ */
+void Slice(int32 eaxis, int32 start, int32 end, TensorPattern *pattern);
+
+
+
+/**
+   Copy one Pattern to another while modifying it by selecting one index from
+   a specified axis of `src` (specified in the public numbering, with
+   negatives allowed), reducing the num_axes by one.
+
+       @param [in] eaxis Eaxis-index (see glossary in tensor-pattern.h) on which
+                         to possibly reduce the dimensionality.
+       @param [in] index Index to select; must be in range
+                         [0, dim - 1] where dim is the dim of `src` on that axis.
+       @param [in] src   TensorPattern which is to be copied; must be valid,
+                         but we don't guarantee to check this.
+       @param [out] dest TensorPattern which we are copying to and modifying.
+                         It is allowed to be the same object as 'src'.
+                         Will be valid if src was valid.
+*/
+void Select(int32 eaxis, int32 index,
+            const TensorPattern &src, TensorPattern *dest);
+
+
 /**
    This function returns true if 'pattern' has the same strides
    as 'C' array with the same dimensions would have.  (Note:
@@ -737,6 +786,31 @@ void HasCStrides(const TensorPattern &pattern);
 bool PatternsOverlap(const TensorPattern &pattern1,
                      const TensorPattern &pattern2);
 
+/**
+   Returns true if the memory-index-set of this pattern forms a contiguous
+   range, otherwise false.  (Note: this is not the same as PyTorch's notion of
+   contiguous; see HasCStrides()).  Caution: the interface may later be changed
+   to allow caching of this property in the 'properties' field.
+*/
+bool IsContiguous(const TensorPattern &pattern);
+
+
+/**
+   Returns true if the lowest memory-index of 'pattern' is zero (see
+   "Justified" in the glossary in tensor-pattern.h).
+   See also: ComputeMinAndMaxMindex().
+*/
+bool IsJustified(const TensorPattern &pattern);
+
+
+/**
+   This is the same as IsContiguous(pattern) &&
+   IsJustified(pattern).
+*/
+bool IsContiguousAndJustified(const TensorPattern &pattern);
+
+
+
 
 }  // namespace tensor
 }  // namespace kaldi
diff --git a/src/tensor/tensor-pattern.cc b/src/tensor/tensor-pattern.cc
index c9a02c3bff1..e8ef11c83ea 100644
--- a/src/tensor/tensor-pattern.cc
+++ b/src/tensor/tensor-pattern.cc
@@ -87,7 +87,7 @@ bool TensorPattern::Check(bool check_code) {
 
 // MAY DELETE THIS.  It's not up to date anyway.
 void TensorPatternProperties::UpdateProperties(const TensorPattern &pattern) {
-  KALDI_PARANOID_ASSERT(pattern.Check());
+  KALDI_PARANOID_ASSERT(pattern.IsValid());
   int32 num_axes = pattern.num_axes;
   int64 dim_prod = 1;
   bool c_strides = true;
diff --git a/src/tensor/tensor-pattern.h b/src/tensor/tensor-pattern.h
index 4d0e57603df..971c4ca7a4a 100644
--- a/src/tensor/tensor-pattern.h
+++ b/src/tensor/tensor-pattern.h
@@ -44,9 +44,15 @@ namespace tensor {
                       "number of axes".
 
     Axis-index:       An axis-index of a Pattern or Tensor (sometimes just "axis" for short,
-                      especially in code) is an index in the range [0, num_axes - 1]
-                      that identifies an axis in the public numbering (see "Public numbering").
-                      For the index in the private numbering, see: Raxis-index.
+                      especially in code) is an index that identifies an axis in the
+                      public numbering (see "Public numbering").  A valid axis-index for a Pattern
+                      with `num_axes` axes is in the range [0, num_axes - 1].
+
+                      For an axis-index i, the corresponding raxis-index (c.f. "Raxis-index:"
+                      or "Private numbering:") would be num_axes - 1 - i.
+
+                      See also "Eaxis-index" for where we allow negative axis-indexes
+                      as offsets from the end.
 
     axis-dominance property: search below for [Valid Pattern], point (vi), for the main
                       definition.
@@ -59,18 +65,18 @@ namespace tensor {
                                  dim(i) * abs(stride(i)) <= abs(stride(i+1)).
 
 
-    Broadcasting:     A convention whereby for an operation on Tensors that would
-                      normally be required to have the same dimension, it's
-                      acceptable for, on some axis, one Tensor to have `dim = n`
-                      with `n != 1` and the other to have `dim = 1`.  I.e., two dims can be
-                      different as long as one of them is 1.  Most operations will
-                      take place as if the Tensor with `dim = 1` had been extended
-                      to `dim = n` by making identical copies.  However, if it is
-                      the output Tensor that has `dim = 1`, there would be summation
-                      or possibly some other appropriate reduction instead of making
-                      copies.  This is different from other toolkits (the fact that
-                      we extend the concept of broadcasting to encompass summation).
-                      See also: PyTorch-style broadcasting, extended indexing.
+    Broadcasting:    A convention whereby for an operation on Tensors that would
+                     normally be required to have the same dimension, it's
+                     acceptable for, on some axis, one Tensor to have `dim = n`
+                     with `n != 1` and the other to have `dim = 1`.  I.e., two dims can be
+                     different as long as one of them is 1.  Most operations will
+                     take place as if the Tensor with `dim = 1` had been extended
+                     to `dim = n` by making identical copies.  However, if it is
+                     the output Tensor that has `dim = 1`, there would be summation
+                     or possibly some other appropriate reduction instead of making
+                     copies.  This is different from other toolkits (the fact that
+                     we extend the concept of broadcasting to encompass summation).
+                     See also: PyTorch-style broadcasting, extended indexing.
 
     Broadcastable:   See documentation for function Broadcastable() in pattern-utils.h.
                      Briefly, two Patterns are broadcastable if their dims (padded
@@ -94,6 +100,12 @@ namespace tensor {
                      range of integers (no gaps).  This is different from the PyTorch
                      definition of 'contiguous', which also requires C-style strides.
 
+    Dereferencing a memory-index:
+                     Sometimes in formal explanations of algorithms we will use notation
+                     `*m` meaning, for a memory-index `m`, the location that it points to
+                     in the relevant storage region; we will assume that it is obvious
+                     from the context which storage region.   See also: "Storage region"
+
     Dims-vector of a Pattern: The vector of dimension of a Pattern: e.g. [] for
                     a Pattern with num_axes = 1 or [2 3] for a Pattern with
                     num-axes = 2.  Note: whenever we display dims vectors in
@@ -112,6 +124,15 @@ namespace tensor {
     Disjoint Patterns:  When we speak of disjoint Patterns we mean that
                     their memory-index-sets are disjoint; see memory-index-set.
 
+    Eaxis-index:      We use the term Eaxis-index (meaning: extended axis-index), or,
+                      in code, eaxis_index, to mean an axis-index in the public
+                      numbering (c.f.: Axis-index) but where negative values are
+                      allowed, as in Python.  Negative values are interpreted as
+                      offsets from the num_axes of the Pattern in question, so for
+                      instance -1 would correspond to num_axes - 1.
+                      Valid eaxis-indexes would be in the range [-num_axes, num_axes - 1].
+                      See also: Axis-index, Raxis-index.
+
     Extended indexing:  A convention whereby if we have a Tensor with, say,
                       `dims = [5 1]`, we can index that Tensor with an index-tuple
                       that:
@@ -150,18 +171,24 @@ namespace tensor {
                       the sum over all axis-indexes, of the element of the index-tuple
                       multiplied by the Pattern's stride for that axis.
 
-    Index-tuple-set of a Pattern: The index-tuple-set of a Pattern is the set
-                      of valid index-tuples assuming we are not allowing extended
+    Index-tuple-set of a Pattern: The index-tuple-set I(p) of a Pattern p is the
+                      set of valid index-tuples assuming we are not allowing extended
                       indexing.  For example, for a Tensor with `dims = [2]`, the
                       set of valid index-tuples would be `{ (0), (1) }`; for
                       a Tensor with `dims = [2 2]` the set of valid index-tuples
                       is `{ (0,0), (0,1), (1,0), (1,1) }`.
 
-    Index-tuple-set of a Pattern-tuple:  The index-tuple-set of a Pattern-tuple is
-                      the index-tuple-set that you would obtain for a Pattern whose
-                      dims equal the dims-vector of that Pattern-tuple.
-                      See "dims-vector of a Pattern-tuple" for explanation of what
-                      that is.
+    Index-tuple-set of a Pattern-tuple:  The index-tuple-set I(P, Q) of a Pattern-tuple
+                      (P, Q) is the index-tuple-set that you would obtain for a
+                      Pattern whose dims equal the dims-vector of that
+                      Pattern-tuple.  See "dims-vector of a Pattern-tuple" for
+                      explanation of what that is.  View I(P, Q) as simply
+                      shorthand for I((P, Q)).
+
+    Justified:        We say that a Pattern is justified if the least (i.e. most
+                      negative) memory-index in its memory-index-set is zero.  For
+                      Patterns with nonnegative strides, this is equivalent to
+                      its offset being zero.
 
     Memory region:    A region of memory that will have been allocated with malloc()
                       or some equivalent (or obtained from some memory-management
@@ -170,11 +197,12 @@ namespace tensor {
 
     Memory-pointer:   A void* pointer to the start of a memory region.
 
-    Memory-index/mindex:  An integer (int64) index into a memory region viewed as a
-                      linear array.  For example, for a Tensor of floats, we'd cast
-                      the address of the memory-pointer to `float*` and then use
-                      the memory-index as an index into that array.  In code,
-                      this may be called 'mindex.'  For a Pattern p and an
+    Memory-index: (abbr: mindex)
+                      An integer (int64) index into a memory region viewed as a
+                      linear array.  For example, for a Tensor of floats, we'd
+                      cast the address of the memory-pointer to `float*` and
+                      then use the memory-index as an index into that array.  In
+                      code, this may be called 'mindex.'  For a Pattern p and an
                       index-tuple i that is valid for p, we have a memory-index
                       m = p[i], which is equal to the pattern's offset plus the
                       sum over all axes of the product of the element of the
@@ -185,8 +213,24 @@ namespace tensor {
                       and an index-tuple i, we may write q[i] = (p1[i], p2[i] p3[i]),
                       where expressions like p1[i] evaluate to a memory-index.
 
-    Num-axes:         The number of axes that a Tensor has.  This is a number in the
-                      range [0, KALDI_TENSOR_MAX_DIM], i.e. 0 through 6.
+    Natural order of index-tuples: Suppose we have a set of index-tuples, all with
+                    the same num-axes / length of tuple.  What we call the
+                    "natural order" (this is just a convenient name, it does not
+                    imply any objective naturalness) is a total order on
+                    index-tuples that corresponds to interpreting the
+                    index-tuples as indexes into a "C"-style array (in the
+                    public numbering of axes) or a Fortran-style one (in the
+                    private one) and comparing the memory addresses.  In
+                    the public numbering this order is the same as lexical
+                    order, e.g. ([0 0], [0 1], [1 0], [1 1]); in the private
+                    numbering it is lexical order but starting from the right,
+                    not the left.
+       [list:]      Given a set S of index-tuples, we will sometimes write
+                    list(S) to mean a list of index-tuples with the same
+                    elements as S, ordered in the natural order.
+
+    Num-axes:        The number of axes that a Tensor has.  This is a number in the
+                     range [0, KALDI_TENSOR_MAX_DIM], i.e. 0 through 6.
 
     Offset:           The memory-index of the element with index-tuple = (all zeros)
                       of a Tensor.  Offsets will always be >= 0 because they are to
@@ -205,9 +249,9 @@ namespace tensor {
                       (in most circumstances) satisfy.
 
 
-    Pattern-tuple:    A pattern-tuple of a tuple of Patterns, say:  (pattern1, pattern2);
-                      we require the patterns in the tuple to be broadcastable, meaning,
-                      for example: Broadcastable(pattern1, pattern2).
+    Pattern-tuple:    A pattern-tuple is a tuple of Patterns, say:  (P, Q),
+                      where the patterns in the tuple are broadcastable, meaning,
+                      for example: Broadcastable(P, Q).
 
 
     An object of type TensorPattern, representing the dims, strides
@@ -248,21 +292,27 @@ namespace tensor {
                       stride=0 if the Pattern is valid.
 
     Memory-index-set of a Pattern:
-                      The set of all memory-indexes obtained by indexing
+                      The memory-index-set M(p) of a Pattern p is
+                      the set of all memory-indexes obtained by indexing
                       the pattern with all index-tuples in the index-tuple-set
-                      of the Pattern.  The size of this set is the same as the
-                      size of the index-tuple-set (by the uniqueness property).
+                      I(p) of the Pattern.  By extending the notion of indexing
+                      a Pattern (c.f. "Indexing a Pattern") to take set
+                      arguments, this could be written as M(p) = p[I(p)].  Note:
+                      by the uniqueness property, we always have |M(p)| = |I(p)|
+                      for a valid Pattern, i.e. the sizes of the sets are the
+                      same.
 
     Memory-index-tuple-set of a Pattern-tuple:
-                      The set of all memory-index-tuples obtained by indexing
-                      the Patterns in the tuple with all members of the
+                      The set of all memory-index-tuples M(P, Q) obtained by indexing
+                      the Patterns in the tuple (P, Q) with all members of the
                       index-tuple-set of the Pattern-tuple.  See "memory-index-tuple"
                       and "index-tuple-set of a Pattern-tuple" for more information.
+                      View the notation M(P, Q) as shorthand for M((P, Q)).
 
     Normalized strides:  We say that a Pattern has normalized strides if the
                       strides are all positive and are strictly increasing
                       in the private numbering (which implies strictly decreasing
-                      in the public numbering).
+                      in the public numbering).  TODO: remove this?
 
     Linear property:
                       Consider Patterns P and Q with the property that the
@@ -310,6 +360,12 @@ namespace tensor {
                       (property (v)) plus the requirement that the strides be
                       positive and sorted.
 
+    Storage region:   A Tensor, in addition to a Pattern, has a storage region
+                      that can be thought of as a pointer (say, to float) which
+                      we index with a memory-index: say, s[m], if s is the
+                      pointer and m is the memory-index.  See storage.h.
+                      See also "Dereferencing a memory-index".
+
     Stride:           A stride is the distance, in elements, between successive
                       elements of a Tensor along a particular dimension.
                       For example, a Tensor with one axis having dim=3 and
@@ -410,13 +466,20 @@ struct TensorPattern {
                                         // by 'raxis' (reversed axis)
   int32 strides[KALDI_TENSOR_MAX_DIM];  // the strides in reversed order,
                                         // indexed by 'raxis' (reversed axis)
+  int64 offset;  // Offset of the element with all-zero indexes
+                 // from the start of the originally allocated memory
+                 // region
+
   int32 code;  // pattern code; see ComputePatternCode() in tensor-pattern-utils.h
                // for details.  If this is negative then it means it has not been
                // computed.  In a valid TensorPattern the code will always be either
                // negative or up-to-date.
-  int64 offset;  // Offset of the element with all-zero indexes
-                 // from the start of the originally allocated memory
-                 // region
+
+  int32 properties;  // More occasionally-needed properties.  This is similar to
+                     // OpenFst's notion of properties, where we compute them
+                     // only on demand.  In a valid TensorPattern the properties
+                     // will always be accurate, but see "Accurate properties"
+                     // in glossary above for definition (it can be zero).
 
   // Returns true if the TensorPattern is valid.  This includes all the
   // mathematical conditions on a valid Pattern (search above for "Valid
@@ -432,6 +495,16 @@ struct TensorPattern {
   // does not need to be compared because, if not -1, it is a function of the
   // dims and strides).
   bool operator < (const TensorPattern &other) const;
+
+
+  // Equality operator on TensorPattern.  Compares the num_axes, offset, and
+  // dims and strides indexed [0... num_axes-1].  (In patterns that satisfy IsValid(),
+  // the remaining dims and strides would be 1 and 0 respectively, so checking
+  // them is pointless).
+  bool operator == (const TensorPattern &other) const;
+
+  // Assignment operator (copies all members).  Non-const; returns *this.
+  TensorPattern &operator = (const TensorPattern &other);
 };
 
 
@@ -488,6 +561,33 @@ struct TensorPatternProperties {
 
 
 
+/**
+   Returns a hash value for hashing TensorPattern.  Depends on num_axes,
+   offset, and dims and strides indexed [0... num_axes-1].  pattern does
+   not have to be valid.
+ */
+size_t GetHash(const TensorPattern &pattern);
+
+// C++ hashing object for TensorPattern; forwards to GetHash().
+struct TensorPatternHasher {
+  size_t operator () (const TensorPattern &p) const { return GetHash(p); }
+};
+
+// C++ hashing object for TensorPattern*; requires the pointer
+// be non-NULL and to point to a TensorPattern.
+struct TensorPatternPtrHasher {
+  size_t operator () (const TensorPattern *p) const { return GetHash(*p); }
+};
+
+// Equality functor for TensorPattern*; both pointers must be non-NULL.
+struct TensorPatternPtrEqual {
+  bool operator () (const TensorPattern *a, const TensorPattern *b) const {
+    return *a == *b;
+  }
+};
+
+
+
 }  // namespace tensor
 }  // namespace kaldi
 
diff --git a/src/tensor/tensor-utils.h b/src/tensor/tensor-utils.h
index 9e8547b7158..1c3eef43165 100644
--- a/src/tensor/tensor-utils.h
+++ b/src/tensor/tensor-utils.h
@@ -28,6 +28,41 @@
 namespace kaldi {
 namespace tensor {
 
+
+/**
+  This function returns true if a and b have the same dtype
+  and device.  See also Broadcastable().
+*/
+inline bool Compatible(const Tensor &a, const Tensor &b) {
+  return Compatible(*a.impl_, *b.impl_);
+}
+
+/**
+  This function returns true if a and b have the same dtype
+  and device and are broadcastable; equivalent to
+  `Broadcastable(a, b) && Compatible(a, b)`.
+*/
+inline bool BroadcastableAndCompatible(const Tensor &a, const Tensor &b) {
+  return Compatible(*a.impl_, *b.impl_) &&
+      Broadcastable(*a.impl_, *b.impl_);
+}
+
+// NOTE(review): placeholder body? It tests Compatible(), not memory overlap.
+inline bool Overlap(const Tensor &a, const Tensor &b) {
+  return Compatible(*a.impl_, *b.impl_);
+}
+
+
+/**
+  This function returns true if a, b and c have the same dtype
+  and device; equivalent to Compatible(a, b) && Compatible(b, c).
+*/
+inline bool Compatible(const Tensor &a, const Tensor &b,
+                       const Tensor &c) {
+  return Compatible(*a.impl_, *b.impl_, *c.impl_);
+}
+
+
 /**  This function returns true if the dimensions of tensor patterns
      a and b are broadcastable in the PyTorch sense.  What this means
      for tensors with the same num-axes is that dims for axis i
@@ -36,7 +71,7 @@ namespace tensor {
      padding with leading (dim=1)'s; for
      instance, dims=[2,8,3] and dims=[8,1] would be broadcastable because
      the [8,1] would be interpreted as [1,8,1].  (The examples above
-     are in the public ordering, not the reversed ordering.)
+     are in the public ordering, not the reversed private ordering.)
 
      If 'b_non_reducing' is true, then we do not allow any dim of
      b to be 1 where the corresponding dim of a was not 1.
@@ -94,7 +129,30 @@ inline bool SameDim(const Tensor &a, const Tensor &b,
   return SameDim(a.impl_.pattern, b.impl_.pattern);
 }
 
+inline void CheckUnchangedSince(int64 tick, const Tensor &a) {
+  // TODO.  Access its storage and check not changed since then.
+}
+
+
+/**
+   This is to be called from any routine that writes to the memory underlying a
+   Tensor; in debug mode it registers that the Tensor has been changed, which
+   will later be used to check that the preconditions of the autograd framework
+   (in terms of in-place operations) are satisfied.
+ */
+inline void RegisterTensorChange(const Tensor &a) {
+  RegisterTensorChange(*a.impl_);
+}
+
 
+/**
+   Returns the number of elements in the Tensor, which equals the product of its
+   dimensions, i.e. the product from `axis = 0 ... a.NumAxes() - 1`, of
+   `a.Dim(axis)`.
+ */
+inline int64 NumElements(const Tensor &a) {
+  return NumElements(*a.impl_);
+}
 
 
 }  // namespace tensor
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index c97d49fac44..3a2ae4f65b0 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -87,6 +87,11 @@
             if it is attached to any Variable that is an output of that Op
             (i.e. that is modified by that Op).
 
+    Optional Tensor:  In situations where we might have a Tensor and might
+            not, we use a raw std::shared_ptr<TensorImpl>.  (A Tensor wraps
+            a std::shared_ptr<TensorImpl> that is known not to be NULL).
+            Note: we don't allow a Tensor to have zero dim, so we can't
+            use that representation when the Tensor isn't really there.
 
     Tick:   a tick is the value of a global 64-bit time counter that we increment
             every time we mutate a Tensor; see GetTick(), and
@@ -119,6 +124,7 @@
 namespace kaldi {
 namespace tensor {
 
+
 /**
    A Tensor is a multi-dimensional array (up to 5 dimensions) of types such as
    float or double (and eventually ints).  Multiple Tensors may point to data
@@ -130,13 +136,13 @@ namespace tensor {
 
    Most of the operations that you would do on a Tensor (like addition,
    multiplication and so on) are declared out-of-line in tensor-functions.h.
+
+
  */
 class Tensor {
  public:
 
-  inline bool Initialized() { return storage_->data_ != NULL; }
-
-  /// Return the number of axes (a number in {0,1,2,3,4}).  In mathematical
+  /// Return the number of axes (a number in {0,1,2,3,4,5,6}).  In mathematical
   // contexts, this is sometimes known as the rank of the tensor, or sometimes
   // even its dimension, but these terms are ambiguous so we avoid them, and use
   // the terms 'number of axes' or 'axis' throughout.
@@ -146,9 +152,9 @@ class Tensor {
   // the last axis is KALDI_TENSOR_MAX_DIM - 1.
   inline int32 NumAxes() const { return impl_.pattern.num_axes; }
 
-  const TensorImpl &Impl() { return impl_; }
+  const TensorImpl &Impl() const { return *impl_; }
 
-  const TensorMeta &Meta() { return reinterpret_cast<TensorMeta&>(impl_); }
+  const TensorMeta &Meta() const { return reinterpret_cast<const TensorMeta&>(*impl_); }
 
   // Return reference to the struct containing the dimension and
   // stride info.
@@ -156,7 +162,7 @@ class Tensor {
 
   // Return a vector containing dimensions of the tensor; equivalent to
   // .shape in PyTorch.  Dims().size() will equal NumAxes().
-  // This cannot return a const reference because the
+  // This cannot return some kind of reference because the
   // dims are stored internally in reversed order.
   std::vector<int32> Dims() const;
 
@@ -164,7 +170,6 @@ class Tensor {
   // Strides().size() will equal NumAxes().
   std::vector<int32> Strides() const;
 
-
   // Returns the dimension on the supplied axis
   //  @param [in] axis  Axis on which dimension is required, with
   //                    -NumAxes() <= axis < NumAxes(); negative axis
@@ -187,6 +192,7 @@ class Tensor {
   // Returns true if the data forms a contiguous block in memory.
   // (not the same as 'contiguous()' in PyTorch, which also requires
   // that the strides be 'C'-style; for that, see HasCStrides().
+  // TODO: see if this needs to be cached.
   bool IsContiguous() const;
 
   // Returns true if the strides for this array are what you would
@@ -376,19 +382,60 @@ class Tensor {
   Tensor(TensorMeta &meta, InitializePolicy p);
 
 
+  // Move constructor.  'other' is a named rvalue reference (an lvalue), so
+  // std::move is required to actually move (not copy) the shared_ptr.
+  Tensor(Tensor &&other): impl_(std::move(other.impl_)) { }
+
   /**
-     This constructor takes the 'impl' provided and returns
-     a Tensor containing it.  Intended for special-purpose code such
-     as when we wrap arrays from external frameworks.
+     Constructor from TensorImpl.  Will often be used by framework code; not
+     intended for use by users.
    */
-  Tensor(const TensorImpl &impl);
+  Tensor(const std::shared_ptr<const TensorImpl> &impl);
+
+  /**
+     Move-constructor version of constructor from TensorImpl.  Will often be
+     used by framework code; not intended for use by users.  Takes a non-const
+     rvalue reference and std::move()s it, so the shared_ptr is really moved.
+  */
+  Tensor(std::shared_ptr<const TensorImpl> &&impl): impl_(std::move(impl)) { }
+
 
  private:
-  TensorImpl impl_;
+
+  // It might seem odd that we contain a shared_ptr to *const* TensorImpl.
+  // What is const here is the meta-information, not the underlying data
+  // (e.g. the floats).  The reason for this decision is mostly so that class
+  // Variable can store Tensors and shared_ptr's to TensorImpl and not
+  // worry about the meta-information pointed to by those pointers being
+  // unexpectedly changed.  The idea is, whenever you need to change this
+  // meta-info, you reallocate;  things that need to manipulate meta-info
+  // and don't want to reallocate can work directly with TensorImpl which
+  // is a lower-level, less safe interface intended for the developers of
+  // this toolkit.
+  //
+  // Note: the difference between a Tensor and a simple std::shared_ptr<const
+  // TensorImpl> is that in the Tensor the pointer is guaranteed to be non-NULL.
+  // We use the shared_ptr where it could be NULL, e.g. in Variables for the
+  // grad (since it might not have been set up).
+  std::shared_ptr<const TensorImpl> impl_;
 };
 
 
 
+/**
+   This is to be used when you know that 'impl' is non-NULL and you want to
+   treat it as a Tensor.  You should view the type `std::shared_ptr<const
+   TensorImpl>` as "might be Tensor, might be NULL".
+*/
+inline Tensor &AsTensor(std::shared_ptr<const TensorImpl> &impl) {
+  return reinterpret_cast<Tensor&>(impl);
+}
+
+
+
+
+
+
 
 }  // namespace tensor
 }  // namespace kaldi
diff --git a/src/tensor/variable-inl.h b/src/tensor/variable-inl.h
index 42917014f0f..3c312033dd8 100644
--- a/src/tensor/variable-inl.h
+++ b/src/tensor/variable-inl.h
@@ -26,47 +26,88 @@ namespace kaldi {
 namespace tensor {
 
 bool VariableImpl::Tracked() const {
-  if (!base_) {
-    return node_ != nullptr;
-  } else if (base_->node_ != nullptr) {
-    node_ = base_->node_;  // Re-cache it, and the corresponding grad.
-    grad_ = node_->GetGradFor(data_);  // Cache the grad too.
+  if (grad_) {
     return true;
-  } else {
+  } else if (!base_) {
+    return false;  // This is a base Variable with no grad -> not tracked.
+  } else if (base_->grad_ == nullptr) {
     return false;
+  } else {
+    // We need to obtain and cache the Tensor corresponding to this
+    // sub-part of the grad.  (See "Lazy allocation" in glossary in tensor.h
+    // for why this won't allocate much memory).
+    grad_ = base_->GetGradForView(data_);
+    return true;
   }
 }
 
-const std::shared_ptr<Node>& VariableImpl::GetNode() {
-  if (node_) {
-    return node_;
-  } else if (!base_) {
-    // This is a base Variable and we need to construct the node.
-    node_ = std::make_shared<Node>(data_);
-    grad_ = node_->grad;
-    return node_;
+Tensor VariableImpl::GetGradForView(const Tensor &data) {
+  // Check that this is a tracked base Variable.
+  KALDI_PARANOID_ASSERT(base_ == nullptr && grad_ != nullptr);
+  std::shared_ptr<TensorImpl> ans(new TensorImpl(data.Meta(),
+                                                 grad_->storage));
+  if (!rebase_grad_) {
+    // The grad will have exactly the same offset, dims and strides as the data.
+    // This is the normal case, which we encounter when the Variable was
+    // constructed from a Tensor that is justified and contiguous (see glossary
+    // in tensor-pattern.h for meanings).
+    return Tensor(ans);
   } else {
-    // This is a view Variable
-    if (!base_->node_) {  // make node of base if needed
-      base_->node_ = std::make_shared<Node>(base->data_);
-      base_->grad_ = node_->grad;
+    if (!aux_)
+      aux_.reset(new VariableImplAux);
+    if (!aux_->rebaser)
+      aux_->rebaser.reset(new TensorPatternRebaser(data_.Meta().pattern,
+                                                   grad_->pattern));
+    const TensorPatternRebaser &rebaser = *(aux_->rebaser);
+    if (!rebaser.Rebase(&(ans->pattern))) {
+      // die.
+      KALDI_ERR << "Rebasing failed.  Likely you are using views "
+          "in a very strange way.";
     }
-    // cache node in view
-    node_ = base_->node_;
-    grad_ = node_->GetGradFor(data_);  // Cache the grad too.
-    return node_;
+    KALDI_PARANOID_ASSERT(ans->IsValid());
+    return Tensor(ans);
   }
 }
 
 
 const std::shared_ptr<Tensor>& VariableImpl::GetGrad() {
-  // The code is almost exactly the same as GetNode() above.  Note:
-  // We assume that either grad_ and node_ are both NULL, or both
-  // non-NULL.
   if (grad_) {
     return grad_;
   } else if (!base_) {
-    // This is a base Variable and we need to construct the node.  (Assume it
+    CreateGrad();
+    return grad_;
+  } else {
+    if (!base_->grad_)
+      base_->CreateGrad();
+    grad_ = base_->GetGradForView(data_);
+    return grad_;
+  }
+}
+
+
+void VariableImpl::CreateGrad() {
+  if (ContiguousAndStartsFromZero(data_->Impl())) {
+    // the following creates a new TensorImpl with its own new
+    // Storage object with the meta-info provided; it will just
+    // mirror data_.
+    grad_ = new TensorImpl(data_.Meta(), true);
+    rebase_grad_ = false;
+  } else {
+    // Don't allocate the storage yet; we need to fix the pattern to fill in any
+    // gaps and move the offset to zero.
+    grad_ = new TensorImpl();
+    // grad_->pattern will be as the pattern of data_, but with any
+    // gaps filled in and the smallest mindex equal to zero.
+    MakeContiguousAndJustified(data_.Meta().pattern,
+                               &(grad_->pattern));
+    rebase_grad_ = true;
+  }
+
+
+    // This is a base Variable and we need to construct the grad.
+    //
+
+    // node.  (Assume it
     // is not allocated if grad_ was not allocated).
     node_ = std::make_shared<Node>(data_);
     grad_ = node_->grad;
diff --git a/src/tensor/variable.h b/src/tensor/variable.h
index 3f749652956..f8994456caa 100644
--- a/src/tensor/variable.h
+++ b/src/tensor/variable.h
@@ -59,12 +59,12 @@ class Node {
 
 
   /**
-     Sets the most recent Op held here (latest_op).  This is called whenever
+     Sets the most recent Op held here (op_).  This is called whenever
      an Op is created that changed a Variable attached to this Node.  The
-     Op itself will have a shared_ptr to the previous Op that was attached
+     Op itself should have a shared_ptr to the previous Op that was attached
      to this Node.
    */
-  inline void SetOp(const std::shared_ptr<Op> &op) { latest_op = op; }
+  inline void SetOp(const std::shared_ptr<Op> &op) { op_ = op; }
 
   // The gradient.  This is set up when the Node is created, but the data in its
   // Storage object won't necessarily have been allocated (see "Lazy Allocation"
@@ -92,70 +92,112 @@ class Node {
 
 
 /**
-   Implementation class for Variable.  Variable is just a shared_ptr to this.
+   This is an overflow from class VariableImpl of various rarely-used fields; we
+   instantiate it only when they are used.  This avoids bulking up the
+   implementation of VariableImpl with them.
+
+
  */
-class VariableImpl {
+struct VariableImplAux {
+
+  // rebaser is always NULL for view Variables.  For tracked base
+  // Variables where data_ and grad_ have different offset and/or
+  // strides, it is an object capable of converting patterns from
+  // tensors to gradients (used when constructing views).
+  std::unique_ptr<TensorPatternRebaser> rebaser;
+
+  // config is NULL if no config values have been stored; otherwise,
+  // a pointer to class Config.
+  std::unique_ptr<Config> config;
 
-  inline const std::shared_ptr<Tensor> &GetData() const { return data_; }
 
+};
+
+
+/**
+   Implementation class for Variable.  Variable just holds a shared_ptr to this.
+ */
+class VariableImpl {
+ public:
+
+  inline const Tensor &GetData() const { return data_; }
 
   // Returns true if this Variable is tracked (see "Tracked" in the
   // glossary in tensor.h).
   inline bool Tracked() const;
 
+  // Returns the most recent Op in the autograd graph (will return the same
+  // value for all Variables sharing the same base Variable).  Will be
+  // NULL if this Variable was not tracked.
+  inline const std::shared_ptr<const Op> &GetOp() const;
+
+  // Returns the Tensor corresponding to the gradient; this will make the
+  // Variable (and any other Variable sharing the same base Variable) tracked if
+  // it was not tracked before (see "Tracked" in glossary in tensor.h)
+  inline const Tensor& GetGrad();
+
 
-  // Returns the most recent Op in the autograd
+  // Returns the Tensor corresponding to the gradient if this variable is
+  // tracked; else returns NULL; Differs from GetGrad() in its behavior for
+  // non-tracked Variables.
+  inline const std::shared_ptr<const TensorImpl>& GetGradIfTracked();
 
-  inline const std::shared_ptr<Op> &GetOp();
 
-  // Returns the node in the autograd graph, as a shared_ptr; this creates it if
-  // it did not exist (so the Variable, and others sharing the same base
-  // Variable, will become tracked if it was not before).
-  inline const std::shared_ptr<Node> &GetNode();
+  // Sets the most recent Op for the base Variable of this Variable;
+  // this is called by Ops to register themselves with Variables that
+  // they modify.
+  inline void SetOp(std::shared_ptr<const Op> &op);
 
-  // Returns the Tensor corresponding to the gradient; like GetNode, this will
-  // make the Variable tracked if it were not tracked before.
-  inline const std::shared_ptr<Tensor> &GetGrad();
+
+  // This function must only be called on tracked base Variables (see glossary
+  // in tensor.h; it requires grad_ != NULL and base_ == NULL).  It gets the
+  // grad Tensor corresponding to the data in 'data', which is assumed to
+  // be a view into this->data_.  This grad Tensor will be a view into
+  // this->grad_.  This function is called from view Variables when setting
+  // up their grad_ variables.
+  inline Tensor GetGradForView(const Tensor &data);
 
  private:
-  // Creates the node in the autograd graph.  This must be a base Variable
-  // and the node must not already exist (i.e. we require node_ == NULL,
-  // base_ == NULL).
-  void CreateNode();
-
-  // The Tensor that this Variable wraps.  Will always be non-NULL.  (Lazy
-  // allocation may still happen in its Storage object, until we do something
-  // with it).
-  std::shared_ptr<Tensor> data_;
-
-  // 'node_' is the node in the autograd graph, which is only allocated for
-  // tracked base Variables; otherwise it is NULL.  It is allocated at the time
-  // we realize we need gradient tracking, which might be when we create the
-  // Variable, or later on if an in-place operation on it has as input a tracked
-  // Variable.
-  //
-  // Non-base Variables cache the node of their base Variable, but if their node
-  // is requested and this pointer is NULL and base_ is non-NULL, we need to
-  // look at base_->node_ to re-check whether the base Variable is tracked, in
-  // case it became tracked since we last checked.
-  std::shared_ptr<Node> node_;
-
-  // A pointer to the gradient, or NULL if this Variable is not tracked (Note:
-  // like node_, this can get out of date if the base Variable becomes tracked,
-  // so if base_ != NULL, we need to re-check).  grad_ can then be created
-  // from the information in node_, once it exists; it is cached here
-  // for efficiency.  See "Lazy Allocation" in glossary in tensor.h;
-  // the underlying data may not have been allocated.
-  // grad_ and node_ are always either both NULL or both non-NULL.
-  std::shared_ptr<Tensor> grad_;
 
+  // This function, which must only be called on a non-tracked base Variable,
+  // creates the 'grad_' tensor.
+  void CreateGrad();
+
+  // The Tensor that this Variable wraps.  (Note: it just holds a non-NULL
+  // shared_ptr<const TensorImpl>.  The const is to ensure the meta-info
+  // isn't changed unexpectedly).
+  Tensor data_;
+
+  // The gradient corresponding to `data_`, or NULL if this is:
+  //  (a) a base Variable that is not tracked, or
+  //  (b) a view Variable that is either not tracked, or we have
+  //      not yet cached the gradient.  (we might need to follow
+  //      the base_ pointer to get the gradient).
+  // Note: the data underlying this gradient is not necessarily allocated; see
+  // "Lazy Allocation" in the glossary in tensor.h.
+  // The type differs from Tensor only because it might be NULL.
+  std::shared_ptr<const TensorImpl> grad_;
 
   // 'base_' is NULL if this is a base Variable (i.e. not a view of another
-  // Variable); otherwise it points to the base Variable.  This also requires
-  // that class Variable store its VariableImpl as a shared_ptr.
+  // Variable); otherwise it points to the base Variable.
   std::shared_ptr<VariableImpl> base_;
 
-
+  // op_ is always NULL for view Variables.  For tracked base Variables,
+  // it is the most recent Op that modified this Variable.  (The autograd
+  // graph is solely between Ops; op_ is our entry point to that
+  // graph and is also used in its construction).
+  std::shared_ptr<const Op> op_;
+
+  // For tracked base Variables, this will be set to true if the pattern of
+  // grad_ is different from the pattern of data_ (because data_ was not
+  // contiguous and justified), and false otherwise.  If this is true, we need
+  // to rebase any views of this variable.  For non-tracked or non-base
+  // Variables, its value is undefined.
+  bool rebase_grad_;
+
+  // aux_ is basically a collection of less-often-used fields of class VariableImpl;
+  // it helps keep the main class uncluttered.
+  std::unique_ptr<VariableImplAux> aux_;
 };
 
 
@@ -163,48 +205,50 @@ class Variable;
 
 
 /**
-   class Variable is somewhat like class Tensor but augmented with autograd
-   machinery.
+   class Variable is like class Tensor but augmented with autograd machinery.
 */
 class Variable {
 
   /** Constructor from a Tensor.
-       @param [in] data  Pointer to the source Tensor.  Will accept a
-                      raw Tensor* pointer, in which case it will construct a
-                      shared_ptr.  (??)
+       @param [in] data  The source Tensor.  (This Variable will copy it; this
+                      is to avoid errors if you change the original Tensor).
+
        @param [in] requires_grad    If requires_grad argument is true,
                 the gradient w.r.t. this Variable will be computed if and when
                 you call Backward() on a Variable that depends on it.
                 The same as requires_grad in PyTorch.
   */
-  Variable(const std::shared_ptr<Tensor> &data, bool requires_grad);
+  Variable(const Tensor &data, bool requires_grad);
 
 
 
   /**  Returns shared pointer to the Tensor storing the data. */
-  std::shared_ptr<Tensor> Data();
+  const Tensor &Data() const;
+
+
+  Tensor &Data();
 
 
   /**  Returns pointer to the Tensor storing the derivative w.r.t.  this
        data.  Obtaining this Tensor won't allocate the memory, thanks to lazy
-       initialization.  It is an error to call this if this Variable is
-       not tracked (search for "Tracked:" above for definition).
-       See also GradDataIfPresent().
+       initialization.  Calling this will make this Variable tracked.
   */
-  std::shared_ptr<Tensor> GradData();
+  Tensor &GradData();
 
-  /**  Returns pointer to the Tensor storing the derivative w.r.t.  this
-       data, or NULL if not present..  Obtaining this Tensor won't allocate the
-       memory, thanks to lazy initialization.  See also GradData().
+
+  /**  Returns pointer to the Tensor storing the derivative w.r.t.  this data if
+       this Variable is already tracked, or NULL if not.  This is for framework
+       use, not for users.  Note: shared_ptr<TensorImpl> means "maybe a Tensor,
+       maybe NULL".
   */
-  std::shared_ptr<Tensor> GradDataIfPresent();
+  std::shared_ptr<const TensorImpl> GradDataIfPresent();
 
 
   /**
      Returns pointer to the base Variable (which may or may not be
      identical to 'this'.
    */
-  std::shared_ptr<Variable> GetBaseVariable();
+  Variable GetBaseVariable();
 
 
   /**
@@ -243,10 +287,21 @@ class Variable {
            GradFunc grad_func);
 
 
-p
 
 
  private:
+  // You may ask: Variable is just a shared_ptr<VariableImpl>, so why not just
+  // get rid of it, rename VariableImpl to Variable, and give people the choice
+  // of what memory management approach to use?  The issue is, we *require* the
+  // use of shared_ptr because the `base_` pointer in VariableImpl is also a
+  // shared_ptr to VariableImpl.  Forcing the users to always supply a
+  // shared_ptr<Variable> seems like a bad pattern, so we use this `impl_`
+  // approach where the shared_ptr is hidden.  This is similar to class Tensor,
+  // although the VariableImpl is not const because (for instance) we may
+  // need to make it tracked if it isn't currently.
+  //
+  // The difference between a Variable and std::shared_ptr<VariableImpl> is that
+  // the latter may be NULL, but a Variable never has a NULL impl_.
   std::shared_ptr<VariableImpl> impl_;
 };
 
@@ -263,7 +318,7 @@ p
 
 
 // Include implementation of inline functions.
-#include "variable-inl.h"
+#include "tensor/variable-inl.h"
 
 
 #endif  // KALDI_TENSOR_VARIABLE_H_
diff --git a/src/util/text-utils.h b/src/util/text-utils.h
index 02f4bf483fc..d2a13c391f4 100644
--- a/src/util/text-utils.h
+++ b/src/util/text-utils.h
@@ -272,7 +272,7 @@ void ParseConfigLines(const std::vector<std::string> &lines,
 
 
 /// Returns true if 'name' would be a valid name for a component or node in a
-/// nnet3Nnet.  This is a nonempty string beginning with A-Za-z_, and containing only
+/// nnet3::Nnet.  This is a nonempty string beginning with A-Za-z_, and containing only
 /// '-', '_', '.', A-Z, a-z, or 0-9.
 bool IsValidName(const std::string &name);
 

From 3647d2d4040bde3ab3a23ddd8b1614973c536cd4 Mon Sep 17 00:00:00 2001
From: Daniel Galvez <dt.galvez@gmail.com>
Date: Tue, 16 Apr 2019 12:33:21 -0400
Subject: [PATCH 027/163] [kaldi10] WIP hmm-utils.cc

---
 src/fstext/fstext-utils-inl.h |  21 +--
 src/fstext/fstext-utils.h     |   9 +-
 src/hmm/hmm-utils.cc          | 340 ++++++++++++++--------------------
 src/hmm/hmm-utils.h           |  16 +-
 src/hmm/topology.cc           |   7 +-
 src/hmm/topology.h            |   4 +-
 src/hmm/transitions.cc        |   2 +-
 src/hmm/transitions.h         |  32 +++-
 8 files changed, 192 insertions(+), 239 deletions(-)

diff --git a/src/fstext/fstext-utils-inl.h b/src/fstext/fstext-utils-inl.h
index 49594b11b60..f6283d9794e 100644
--- a/src/fstext/fstext-utils-inl.h
+++ b/src/fstext/fstext-utils-inl.h
@@ -524,27 +524,22 @@ bool FollowingInputSymbolsAreSameClass(bool end_is_epsilon, const Fst<Arc> &fst,
   return true;
 }
 
+// TODO(galv): Confirm that start_is_epsilon is no longer necessary
+// now that we no longer allow epsilon transitions.
 template<class Arc>
-void MakePrecedingInputSymbolsSame(bool start_is_epsilon, MutableFst<Arc> *fst) {
+void MakePrecedingInputSymbolsSame(MutableFst<Arc> *fst) {
   IdentityFunction<typename Arc::Label> f;
-  MakePrecedingInputSymbolsSameClass(start_is_epsilon, fst, f);
+  MakePrecedingInputSymbolsSameClass(fst, f);
 }
 
 template<class Arc, class F>
-void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst<Arc> *fst, const F &f) {
+void MakePrecedingInputSymbolsSameClass(MutableFst<Arc> *fst, const F &f) {
   typedef typename F::Result ClassType;
   typedef typename Arc::StateId StateId;
   typedef typename Arc::Weight Weight;
   vector<ClassType> classes;
   ClassType noClass = f(kNoLabel);
   ClassType epsClass = f(0);
-  if (start_is_epsilon) {  // treat having-start-state as epsilon in-transition.
-    StateId start_state = fst->Start();
-    if (start_state < 0 || start_state == kNoStateId) // empty FST.
-      return;
-    classes.resize(start_state+1, noClass);
-    classes[start_state] = epsClass;
-  }
 
   // Find bad states (states with multiple input-symbols into them).
   std::set<StateId> bad_states;  // states that we need to change.
@@ -666,12 +661,13 @@ void MakeFollowingInputSymbolsSameClass(bool end_is_epsilon, MutableFst<Arc> *fs
 
 
 template<class Arc>
-VectorFst<Arc>* MakeLoopFst(const vector<const ExpandedFst<Arc> *> &fsts) {
+std::unique_ptr<VectorFst<Arc>>
+MakeLoopFst(const vector<std::unique_ptr<const ExpandedFst<Arc>>> &fsts) {
   typedef typename Arc::Weight Weight;
   typedef typename Arc::StateId StateId;
   typedef typename Arc::Label Label;
 
-  VectorFst<Arc> *ans = new VectorFst<Arc>;
+  std::unique_ptr<VectorFst<Arc>> ans(new VectorFst<Arc>);
   StateId loop_state = ans->AddState();  // = 0.
   ans->SetStart(loop_state);
   ans->SetFinal(loop_state, Weight::One());
@@ -681,6 +677,7 @@ VectorFst<Arc>* MakeLoopFst(const vector<const ExpandedFst<Arc> *> &fsts) {
   unordered_map<const ExpandedFst<Arc> *, Arc> cache;
 
   for (Label i = 0; i < static_cast<Label>(fsts.size()); i++) {
+    // TODO(galv): I feel like this won't work with my unique_ptr usage. Call .get()?
     const ExpandedFst<Arc> *fst = fsts[i];
     if (fst == NULL) continue;
     { // optimization with cache: helpful if some members of "fsts" may
diff --git a/src/fstext/fstext-utils.h b/src/fstext/fstext-utils.h
index 7b3f098a564..2b94ca8c456 100644
--- a/src/fstext/fstext-utils.h
+++ b/src/fstext/fstext-utils.h
@@ -24,6 +24,7 @@
 #define KALDI_FSTEXT_FSTEXT_UTILS_H_
 #include <algorithm>
 #include <map>
+#include <memory>
 #include <set>
 #include <vector>
 #include <fst/fstlib.h>
@@ -252,15 +253,13 @@ bool FollowingInputSymbolsAreSameClass(bool end_is_epsilon, const Fst<Arc> &fst,
 /// that have differing input symbols going in, and inserting, for each of
 /// the preceding arcs with non-epsilon input symbol, a new dummy state that
 /// has an epsilon link to the fst state.
-/// If "start_is_epsilon", ensure that start-state can have only epsilon-links
-/// into it.
 template<class Arc>
-void MakePrecedingInputSymbolsSame(bool start_is_epsilon, MutableFst<Arc> *fst);
+void MakePrecedingInputSymbolsSame(MutableFst<Arc> *fst);
 
 
 /// As MakePrecedingInputSymbolsSame, but takes a functor object that maps labels to classes.
 template<class Arc, class F>
-void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst<Arc> *fst, const F &f);
+void MakePrecedingInputSymbolsSameClass(MutableFst<Arc> *fst, const F &f);
 
 
 /// MakeFollowingInputSymbolsSame ensures that all arcs exiting any given fst
@@ -304,7 +303,7 @@ void MakeFollowingInputSymbolsSameClass(bool end_is_epsilon, MutableFst<Arc> *fs
 /// less well optimized and would have a lot of final-states.
 
 template<class Arc>
-VectorFst<Arc>* MakeLoopFst(const vector<const ExpandedFst<Arc> *> &fsts);
+std::unique_ptr<VectorFst<Arc>> MakeLoopFst(const vector<std::unique_ptr<const ExpandedFst<Arc>>> &fsts);
 
 
 /// ApplyProbabilityScale is applicable to FSTs in the log or tropical semiring.
diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc
index 6e1bd483580..ebd5429afee 100644
--- a/src/hmm/hmm-utils.cc
+++ b/src/hmm/hmm-utils.cc
@@ -18,6 +18,7 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
+#include <memory>
 #include <vector>
 
 #include "hmm/hmm-utils.h"
@@ -27,16 +28,12 @@
 
 namespace kaldi {
 
-
-
-fst::VectorFst<fst::StdArc> *GetHmmAsFsa(
+std::shared_ptr<fst::ExpandedFst<fst::StdArc>> GetHmmAsFsa(
     std::vector<int32> phone_window,
     const ContextDependencyInterface &ctx_dep,
     const Transitions &trans_model,
     const HTransducerConfig &config,
     HmmCacheType *cache) {
-  using namespace fst;
-
   if (static_cast<int32>(phone_window.size()) != ctx_dep.ContextWidth())
     KALDI_ERR << "Context size mismatch, ilabel-info [from context FST is "
               << phone_window.size() << ", context-dependency object "
@@ -49,7 +46,6 @@ fst::VectorFst<fst::StdArc> *GetHmmAsFsa(
           "a code error.";
 
   const Topology &topo = trans_model.GetTopo();
-  const Topology::TopologyEntry &entry  = topo.TopologyForPhone(phone);
 
   // vector of the pdfs, indexed by pdf-class (pdf-classes must start from zero
   // and be contiguous).
@@ -70,6 +66,7 @@ fst::VectorFst<fst::StdArc> *GetHmmAsFsa(
           " that general nature.";
     }
   }
+
   std::pair<int32, std::vector<int32> > cache_index(phone, pdfs);
   if (cache != NULL) {
     HmmCacheType::iterator iter = cache->find(cache_index);
@@ -77,81 +74,44 @@ fst::VectorFst<fst::StdArc> *GetHmmAsFsa(
       return iter->second;
   }
 
-  VectorFst<StdArc> *ans = new VectorFst<StdArc>;
-
-  typedef StdArc Arc;
-  typedef Arc::Weight Weight;
-  typedef Arc::StateId StateId;
-  typedef Arc::Label Label;
-
-  std::vector<StateId> state_ids;
-  for (size_t i = 0; i < entry.size(); i++)
-    state_ids.push_back(ans->AddState());
-  KALDI_ASSERT(state_ids.size() != 0);  // Or empty topology entry.
-  ans->SetStart(state_ids[0]);
-  StateId final = state_ids.back();
-  ans->SetFinal(final, Weight::One());
-
-  for (int32 hmm_state = 0;
-       hmm_state < static_cast<int32>(entry.size());
-       hmm_state++) {
-    int32 forward_pdf_class = entry[hmm_state].forward_pdf_class, forward_pdf;
-    int32 self_loop_pdf_class = entry[hmm_state].self_loop_pdf_class, self_loop_pdf;
-    if (forward_pdf_class == kNoPdf) {  // nonemitting state.
-      forward_pdf = kNoPdf;
-      self_loop_pdf = kNoPdf;
-    } else {
-      KALDI_ASSERT(forward_pdf_class < static_cast<int32>(pdfs.size()));
-      KALDI_ASSERT(self_loop_pdf_class < static_cast<int32>(pdfs.size()));
-      forward_pdf = pdfs[forward_pdf_class];
-      self_loop_pdf = pdfs[self_loop_pdf_class];
+  using Arc = fst::StdArc;
+  using MyEditFst = fst::EditFst<Arc>;
+  using StateId = Arc::StateId;
+
+  const fst::StdVectorFst &entry = topo.TopologyForPhone(phone);
+  std::shared_ptr<MyEditFst> loopless_entry = std::make_shared<MyEditFst>(entry);
+
+  for (fst::StateIterator<MyEditFst> siter(*loopless_entry);
+       !siter.Done(); siter.Next()) {
+    StateId state = siter.Value();
+    std::vector<Arc> non_self_loops;
+    for (fst::ArcIterator<MyEditFst> aiter(*loopless_entry, state);
+         !aiter.Done(); aiter.Next()) {
+      const Arc& arc = aiter.Value();
+      if (arc.nextstate != state) {
+        non_self_loops.push_back(arc);
+      }
     }
-    int32 trans_idx;
-    for (trans_idx = 0;
-        trans_idx < static_cast<int32>(entry[hmm_state].transitions.size());
-        trans_idx++) {
-      BaseFloat log_prob;
-      Label label;
-      int32 dest_state = entry[hmm_state].transitions[trans_idx].first;
-      bool is_self_loop = (dest_state == hmm_state);
-      if (is_self_loop)
-        continue; // We will add self-loops in at a later stage of processing,
-      // not in this function.
-      if (forward_pdf_class == kNoPdf) {
-        // no pdf, hence non-estimated probability.
-        // [would not happen with normal topology] .  There is no transition-state
-        // involved in this case.
-        log_prob = Log(entry[hmm_state].transitions[trans_idx].second);
-        label = 0;
-      } else {  // normal probability.
-        int32 trans_state =
-            trans_model.TupleToTransitionState(phone, hmm_state, forward_pdf, self_loop_pdf);
-        int32 trans_id =
-            trans_model.PairToTransitionId(trans_state, trans_idx);
-        log_prob = trans_model.GetTransitionLogProbIgnoringSelfLoops(trans_id);
-        // log_prob is a negative number (or zero)...
-        label = trans_id;
+    if (non_self_loops.size() != loopless_entry->NumArcs(state)) {
+      loopless_entry->DeleteArcs(state);
+      for (const Arc& arc: non_self_loops) {
+        loopless_entry->AddArc(state, arc);
       }
-      // Will add probability-scale later (we may want to push first).
-      ans->AddArc(state_ids[hmm_state],
-                  Arc(label, label, Weight(-log_prob), state_ids[dest_state]));
     }
   }
 
-  fst::RemoveEpsLocal(ans);  // this is safe and will not blow up.
-
   // Now apply probability scale.
   // We waited till after the possible weight-pushing steps,
   // because weight-pushing needs "real" weights in order to work.
-  ApplyProbabilityScale(config.transition_scale, ans);
+  ApplyProbabilityScale(config.transition_scale, loopless_entry.get());
   if (cache != NULL)
-    (*cache)[cache_index] = ans;
-  return ans;
+    (*cache)[cache_index] = loopless_entry;
+  return loopless_entry;
 }
 
 
 
-fst::VectorFst<fst::StdArc>*
+const fst::VectorFst<fst::StdArc>&
 GetHmmAsFsaSimple(std::vector<int32> phone_window,
                   const ContextDependencyInterface &ctx_dep,
                   const Transitions &trans_model,
@@ -168,67 +128,7 @@ GetHmmAsFsaSimple(std::vector<int32> phone_window,
   KALDI_ASSERT(phone != 0);
 
   const Topology &topo = trans_model.GetTopo();
-  const Topology::TopologyEntry &entry  = topo.TopologyForPhone(phone);
-
-  VectorFst<StdArc> *ans = new VectorFst<StdArc>;
-
-  // Create a mini-FST with a superfinal state [in case we have emitting
-  // final-states, which we usually will.]
-  typedef StdArc Arc;
-  typedef Arc::Weight Weight;
-  typedef Arc::StateId StateId;
-  typedef Arc::Label Label;
-
-  std::vector<StateId> state_ids;
-  for (size_t i = 0; i < entry.size(); i++)
-    state_ids.push_back(ans->AddState());
-  KALDI_ASSERT(state_ids.size() > 1);  // Or invalid topology entry.
-  ans->SetStart(state_ids[0]);
-  StateId final = state_ids.back();
-  ans->SetFinal(final, Weight::One());
-
-  for (int32 hmm_state = 0;
-       hmm_state < static_cast<int32>(entry.size());
-       hmm_state++) {
-    int32 forward_pdf_class = entry[hmm_state].forward_pdf_class, forward_pdf;
-    int32 self_loop_pdf_class = entry[hmm_state].self_loop_pdf_class, self_loop_pdf;
-    if (forward_pdf_class == kNoPdf) {   // nonemitting state; not generally used.
-      forward_pdf = kNoPdf;
-      self_loop_pdf = kNoPdf;
-    } else {
-      bool ans = ctx_dep.Compute(phone_window, forward_pdf_class, &forward_pdf);
-      KALDI_ASSERT(ans && "Context-dependency computation failed.");
-      ans = ctx_dep.Compute(phone_window, self_loop_pdf_class, &self_loop_pdf);
-      KALDI_ASSERT(ans && "Context-dependency computation failed.");
-    }
-    int32 trans_idx;
-    for (trans_idx = 0;
-        trans_idx < static_cast<int32>(entry[hmm_state].transitions.size());
-        trans_idx++) {
-      BaseFloat log_prob;
-      Label label;
-      int32 dest_state = entry[hmm_state].transitions[trans_idx].first;
-      if (forward_pdf_class == kNoPdf) {
-        // no pdf, hence non-estimated probability.  very unusual case.  [would
-        // not happen with normal topology] .  There is no transition-state
-        // involved in this case.
-        KALDI_ASSERT(hmm_state != dest_state);
-        log_prob = Log(entry[hmm_state].transitions[trans_idx].second);
-        label = 0;
-      } else {  // normal probability.
-        int32 trans_state =
-            trans_model.TupleToTransitionState(phone, hmm_state, forward_pdf, self_loop_pdf);
-        int32 trans_id =
-            trans_model.PairToTransitionId(trans_state, trans_idx);
-        log_prob = prob_scale * trans_model.GetTransitionLogProb(trans_id);
-        // log_prob is a negative number (or zero)...
-        label = trans_id;
-      }
-      ans->AddArc(state_ids[hmm_state],
-                  Arc(label, label, Weight(-log_prob), state_ids[dest_state]));
-    }
-  }
-  return ans;
+  return topo.TopologyForPhone(phone);
 }
 
 
@@ -236,10 +136,11 @@ GetHmmAsFsaSimple(std::vector<int32> phone_window,
 /// This utility function, used in GetHTransducer(), creates an FSA (finite
 /// state acceptor, i.e. an FST with ilabels equal to olabels) with a single
 /// successful path, with a single label on it.
-static inline fst::VectorFst<fst::StdArc> *MakeTrivialAcceptor(int32 label) {
+static inline std::unique_ptr<fst::VectorFst<fst::StdArc>>
+MakeTrivialAcceptor(int32 label) {
   typedef fst::StdArc Arc;
   typedef Arc::Weight Weight;
-  fst::VectorFst<Arc> *ans = new fst::VectorFst<Arc>;
+  std::unique_ptr<fst::VectorFst<Arc>> ans(new fst::VectorFst<Arc>);
   ans->AddState();
   ans->AddState();
   ans->SetStart(0);
@@ -251,11 +152,12 @@ static inline fst::VectorFst<fst::StdArc> *MakeTrivialAcceptor(int32 label) {
 
 
 // The H transducer has a separate outgoing arc for each of the symbols in ilabel_info.
-fst::VectorFst<fst::StdArc> *GetHTransducer(const std::vector<std::vector<int32> > &ilabel_info,
-                                            const ContextDependencyInterface &ctx_dep,
-                                            const Transitions &trans_model,
-                                            const HTransducerConfig &config,
-                                            std::vector<int32> *disambig_syms_left) {
+std::unique_ptr<fst::VectorFst<fst::StdArc>>
+GetHTransducer(const std::vector<std::vector<int32> > &ilabel_info,
+               const ContextDependencyInterface &ctx_dep,
+               const Transitions &trans_model,
+               const HTransducerConfig &config,
+               std::vector<int32> *disambig_syms_left) {
   KALDI_ASSERT(ilabel_info.size() >= 1 && ilabel_info[0].size() == 0);  // make sure that eps == eps.
   HmmCacheType cache;
   // "cache" is an optimization that prevents GetHmmAsFsa repeating work
@@ -266,7 +168,7 @@ fst::VectorFst<fst::StdArc> *GetHTransducer(const std::vector<std::vector<int32>
   typedef Arc::StateId StateId;
   typedef Arc::Label Label;
 
-  std::vector<const ExpandedFst<Arc>* > fsts(ilabel_info.size(), NULL);
+  std::vector<std::unique_ptr<const ExpandedFst<Arc>>> fsts(ilabel_info.size(), NULL);
   std::vector<int32> phones = trans_model.GetPhones();
 
   KALDI_ASSERT(disambig_syms_left != 0);
@@ -315,19 +217,17 @@ fst::VectorFst<fst::StdArc> *GetHTransducer(const std::vector<std::vector<int32>
     } else {  // Real phone-in-context.
       std::vector<int32> phone_window = ilabel_info[j];
 
-      VectorFst<Arc> *fst = GetHmmAsFsa(phone_window,
-                                        ctx_dep,
-                                        trans_model,
-                                        config,
-                                        &cache);
-      fsts[j] = fst;
+      std::shared_ptr<ExpandedFst<Arc>> fst = GetHmmAsFsa(phone_window,
+                                                          ctx_dep,
+                                                          trans_model,
+                                                          config,
+                                                          &cache);
+      std::unique_ptr<ExpandedFst<Arc>> u_fst(fst->Copy());
+      fsts[j] = std::move(u_fst);
     }
   }
 
-  VectorFst<Arc> *ans = MakeLoopFst(fsts);
-  SortAndUniq(&fsts); // remove duplicate pointers, which we will have
-  // in general, since we used the cache.
-  DeletePointers(&fsts);
+  std::unique_ptr<VectorFst<Arc>> ans = MakeLoopFst(fsts);
   return ans;
 }
 
@@ -404,16 +304,17 @@ void GetIlabelMapping (const std::vector<std::vector<int32> > &ilabel_info_old,
 
 
 
-fst::VectorFst<fst::StdArc> *GetPdfToTransitionIdTransducer(const Transitions &trans_model) {
+std::unique_ptr<fst::VectorFst<fst::StdArc>>
+GetPdfToTransitionIdTransducer(const Transitions &trans_model) {
   using namespace fst;
-  VectorFst<StdArc> *ans = new VectorFst<StdArc>;
+  std::unique_ptr<VectorFst<StdArc>> ans(new VectorFst<StdArc>);
   typedef VectorFst<StdArc>::Weight Weight;
   typedef StdArc Arc;
   ans->AddState();
   ans->SetStart(0);
   ans->SetFinal(0, Weight::One());
   for (int32 tid = 1; tid <= trans_model.NumTransitionIds(); tid++) {
-    int32 pdf = trans_model.TransitionIdToPdf(tid);
+    int32 pdf = trans_model.TransitionIdToPdfFast(tid);
     ans->AddArc(0, Arc(pdf+1, tid, Weight::One(), 0));  // note the offset of 1 on the pdfs.
     // it's because 0 is a valid pdf.
   }
@@ -437,26 +338,52 @@ class TidToTstateMapper {
   // with values over 100000/kNontermBigNumber) to zero.
   // Its point is to provide an equivalence class on labels that's relevant to what
   // the self-loop will be on the following (or preceding) state.
+
+  // TransitionState no longer exists. It's basically a
+  // TransitionIdInfo without the arc_index field.
+
   TidToTstateMapper(const Transitions &trans_model,
                     const std::vector<int32> &disambig_syms,
                     bool check_no_self_loops):
       trans_model_(trans_model),
       disambig_syms_(disambig_syms),
       check_no_self_loops_(check_no_self_loops) { }
-  typedef int32 Result;
-  int32 operator() (int32 label) const {
-    if (label == static_cast<int32>(fst::kNoLabel)) return -1;  // -1 -> -1
-    else if (label >= 1 && label <= trans_model_.NumTransitionIds()) {
-      if (check_no_self_loops_ && trans_model_.IsSelfLoop(label))
+
+  typedef Transitions::TransitionIdInfo Result;
+  static const Result& NoLabelClass() {
+    // Take advantage of the fact that phone must be greater than or
+    // equal to 1 to create a TransitionIdInfo which in practice will
+    // never be created normally.
+
+    // Use -1 for all other fields so we can easily see when debugging
+    // whether we are using one of these invalid TransitionIdInfo
+    // classes.
+    static auto *no_label =
+      new Transitions::TransitionIdInfo{.phone = -1, .topo_state = -1, .arc_index = -1,
+                                        .pdf_id = -1, .self_loop_pdf_id = -1};
+    return *no_label;
+  }
+
+  static const Result& ZeroClass() {
+    static auto *zero_label =
+      new Transitions::TransitionIdInfo{.phone = 0, .topo_state = -1, .arc_index = -1,
+                                        .pdf_id = -1, .self_loop_pdf_id = -1};
+    return *zero_label;
+  }
+
+  const Result& operator() (int32 tid) const {
+    if (tid == static_cast<int32>(fst::kNoLabel)) return NoLabelClass();  // -1 -> -1
+    else if (tid >= 1 && tid <= trans_model_.NumTransitionIds()) {
+      if (check_no_self_loops_ && trans_model_.InfoForTransitionId(tid).is_self_loop)
         KALDI_ERR << "AddSelfLoops: graph already has self-loops.";
-      return trans_model_.TransitionIdToTransitionState(label);
+      return trans_model_.InfoForTransitionId(tid);
     } else {  // 0 or (presumably) disambiguation symbol.  Map to zero
       int32 big_number = fst::kNontermBigNumber;  // 1000000
-      if (label != 0 && label < big_number)
+      if (tid != 0 && tid < big_number)
         KALDI_ASSERT(std::binary_search(disambig_syms_.begin(),
                                         disambig_syms_.end(),
-                                        label));  // or invalid label
-      return 0;
+                                        tid));  // or invalid tid
+      return ZeroClass();
     }
   }
 
@@ -480,18 +407,17 @@ static void AddSelfLoopsReorder(const Transitions &trans_model,
   typedef Arc::StateId StateId;
   typedef Arc::Weight Weight;
 
+  typedef TidToTstateMapper::Result Class;
+
   TidToTstateMapper f(trans_model, disambig_syms, check_no_self_loops);
   // Duplicate states as necessary so that each state will require at most one
   // self-loop to be added to it.  Approximately this means that if a
   // state has multiple different symbols on arcs entering it, it will be
   // duplicated, with one copy per incoming symbol.
-  MakePrecedingInputSymbolsSameClass(true, fst, f);
-
-  int32 kNoTransState = f(kNoLabel);
-  KALDI_ASSERT(kNoTransState == -1);
+  MakePrecedingInputSymbolsSameClass(fst, f);
 
   // use the following to keep track of the transition-state for each state.
-  std::vector<int32> state_in(fst->NumStates(), kNoTransState);
+  std::vector<Class> state_in(fst->NumStates(), f.NoLabelClass());
 
   // This first loop just works out the label into each state,
   // and converts the transitions in the graph from transition-states
@@ -505,8 +431,8 @@ static void AddSelfLoopsReorder(const Transitions &trans_model,
          !aiter.Done();
          aiter.Next()) {
       Arc arc = aiter.Value();
-      int32 trans_state = f(arc.ilabel);
-      if (state_in[arc.nextstate] == kNoTransState)
+      Class trans_state = f(arc.ilabel);
+      if (state_in[arc.nextstate] == f.NoLabelClass())
         state_in[arc.nextstate] = trans_state;
       else {
         KALDI_ASSERT(state_in[arc.nextstate] == trans_state);
@@ -515,7 +441,8 @@ static void AddSelfLoopsReorder(const Transitions &trans_model,
     }
   }
 
-  KALDI_ASSERT(state_in[fst->Start()] == kNoStateId || state_in[fst->Start()] == 0);
+  KALDI_ASSERT(state_in[fst->Start()] == f.NoLabelClass() ||
+               state_in[fst->Start()] == f.ZeroClass());
   // or MakePrecedingInputSymbolsSame failed.
 
   // The next loop looks at each graph state, adds the self-loop [if needed] and
@@ -525,11 +452,15 @@ static void AddSelfLoopsReorder(const Transitions &trans_model,
   // with the corresponding labels on them by this probability).
 
   for (StateId s = 0; s < static_cast<StateId>(state_in.size()); s++) {
-    if (state_in[s] > 0) {  // defined, and not eps or a disambiguation symbol or a
-                            // nonterminal-related sybol for grammar decoding...
-      int32 trans_state = static_cast<int32>(state_in[s]);
+    if (state_in[s] != f.NoLabelClass() && state_in[s] != f.ZeroClass()) {
+      // defined, and not eps or a disambiguation symbol or a
+      // nonterminal-related symbol for grammar decoding...
+      const Class& trans_state = state_in[s];
       // First multiply all probabilities by "forward" probability.
-      BaseFloat log_prob = trans_model.GetNonSelfLoopLogProb(trans_state);
+      
+      // WARNING: This is no longer the forward probability that this code was originally using!
+      // It is difficult to get the self-loop probability just given the TransitionIdInfo.
+      BaseFloat log_prob = trans_state.transition_cost;
       fst->SetFinal(s, Times(fst->Final(s), Weight(-log_prob*self_loop_scale)));
       for (MutableArcIterator<MutableFst<Arc> > aiter(fst, s);
           !aiter.Done();
@@ -539,7 +470,10 @@ static void AddSelfLoopsReorder(const Transitions &trans_model,
         aiter.SetValue(arc);
       }
       // Now add self-loop, if needed.
-      int32 trans_id = trans_model.SelfLoopOf(trans_state);
+      int32 trans_id = trans_state.self_loop_transition_id;
+      // TODO: This is adding an arc for the current state into
+      // itself, right? How are we supposed to get the current state's
+      // self-loop? Ugh. And make sure I don't repeat it...
       if (trans_id != 0) {  // has self-loop.
         BaseFloat log_prob = trans_model.GetTransitionLogProb(trans_id);
         fst->AddArc(s, Arc(trans_id, 0, Weight(-log_prob*self_loop_scale), s));
@@ -592,8 +526,9 @@ static void AddSelfLoopsNoReorder(
       // a transition-state;  add self-loop, if it has one.
       int32 trans_id = trans_model.SelfLoopOf(my_trans_state);
       if (trans_id != 0) {  // has self-loop.
-        BaseFloat log_prob = trans_model.GetTransitionLogProb(trans_id);
-        fst->AddArc(s, Arc(trans_id, 0, Weight(-log_prob*self_loop_scale), s));
+        BaseFloat neg_log_prob = \
+          trans_model.InfoForTransitionId(trans_id).transition_cost;
+        fst->AddArc(s, Arc(trans_id, 0, Weight(neg_log_prob*self_loop_scale), s));
       }
     }
   }
@@ -625,8 +560,8 @@ void AddSelfLoops(const Transitions &trans_model,
 static bool IsReordered(const Transitions &trans_model,
                         const std::vector<int32> &alignment) {
   for (size_t i = 0; i + 1 < alignment.size(); i++) {
-    int32 tstate1 = trans_model.TransitionIdToTransitionState(alignment[i]),
-        tstate2 = trans_model.TransitionIdToTransitionState(alignment[i+1]);
+    const TransitionIdInfo& tstate1 = trans_model.InfoForTransitionId(alignment[i]),
+      tstate2 = trans_model.InfoForTransitionId(alignment[i+1]);
     if (tstate1 != tstate2) {
       bool is_loop_1 = trans_model.IsSelfLoop(alignment[i]),
           is_loop_2 = trans_model.IsSelfLoop(alignment[i+1]);
@@ -668,13 +603,13 @@ static bool SplitToPhonesInternal(const Transitions &trans_model,
   bool was_ok = true;
   for (size_t i = 0; i < alignment.size(); i++) {
     int32 trans_id = alignment[i];
-    if (trans_model.IsFinal(trans_id)) {  // is final-prob
+    if (trans_model.InfoForTransitionId(trans_id).is_final) {
       if (!reordered) end_points.push_back(i+1);
       else {  // reordered.
         while (i+1 < alignment.size() &&
               trans_model.IsSelfLoop(alignment[i+1])) {
-          KALDI_ASSERT(trans_model.TransitionIdToTransitionState(alignment[i]) ==
-                 trans_model.TransitionIdToTransitionState(alignment[i+1]));
+          KALDI_ASSERT(trans_model.InfoForTransitionId(alignment[i]) ==
+                       trans_model.InfoForTransitionId(alignment[i+1]));
           i++;
         }
         end_points.push_back(i+1);
@@ -685,14 +620,11 @@ static bool SplitToPhonesInternal(const Transitions &trans_model,
       was_ok = false;
       end_points.push_back(i+1);
     } else {
-      int32 this_state = trans_model.TransitionIdToTransitionState(alignment[i]),
-          next_state = trans_model.TransitionIdToTransitionState(alignment[i+1]);
-      if (this_state == next_state) continue;  // optimization.
-      int32 this_phone = trans_model.TransitionStateToPhone(this_state),
-          next_phone = trans_model.TransitionStateToPhone(next_state);
+      int32 this_phone = trans_model.InfoForTransitionId(this_state).phone,
+          next_phone = trans_model.InfoForTransitionId(next_state).phone;
       if (this_phone != next_phone) {
         // The phone changed, but this is an error-- we should have detected this via the
-        // IsFinal check.
+        // is_final check.
         was_ok = false;
         end_points.push_back(i+1);
       }
@@ -702,16 +634,12 @@ static bool SplitToPhonesInternal(const Transitions &trans_model,
   size_t cur_point = 0;
   for (size_t i = 0; i < end_points.size(); i++) {
     split_output->push_back(std::vector<int32>());
-    // The next if-statement checks if the initial trans-id at the current end
-    // point is the initial-state of the current phone if that initial-state
-    // is emitting (a cursory check that the alignment is plausible).
-    int32 trans_state =
-      trans_model.TransitionIdToTransitionState(alignment[cur_point]);
-    int32 phone = trans_model.TransitionStateToPhone(trans_state);
-    int32 forward_pdf_class = trans_model.GetTopo().TopologyForPhone(phone)[0].forward_pdf_class;
-    if (forward_pdf_class != kNoPdf)  // initial-state of the current phone is emitting
-      if (trans_model.TransitionStateToHmmState(trans_state) != 0)
-        was_ok = false;
+    // The next if-statement checks if the initial trans-id at the
+    // current end point is the initial-state of the current phone (a
+    // cursory check that the alignment is plausible).
+    int32 topo_state = trans_model.InfoForTransitionId(InfoForTransitionId).topo_state;
+    if (topo_state != 0)
+      was_ok = false;
     for (size_t j = cur_point; j < end_points[i]; j++)
       split_output->back().push_back(alignment[j]);
     cur_point = end_points[i];
@@ -1019,12 +947,16 @@ bool ConvertAlignment(const Transitions &old_trans_model,
                       bool new_is_reordered,
                       const std::vector<int32> *phone_map,
                       std::vector<int32> *new_alignment) {
-  if (!repeat_frames || subsample_factor == 1) {
+  if (subsample_factor == 1) {
+    if (repeat_frames) {
+      KALDI_WARN << "repeat_frames being set to true has no effect when "
+        "subsample_factor=1 (its default value)";
+    }
     return ConvertAlignmentInternal(old_trans_model,
                                     new_trans_model,
                                     new_ctx_dep,
                                     old_alignment,
-                                    subsample_factor - 1,
+                                    subsample_factor - 1, // == 0
                                     subsample_factor,
                                     new_is_reordered,
                                     phone_map,
@@ -1032,14 +964,18 @@ bool ConvertAlignment(const Transitions &old_trans_model,
    // The value "subsample_factor - 1" for conversion_shift above ensures the
    // alignments have the same length as the output of 'subsample-feats'
   } else {
+    // Here subsample_factor != 1.  When subsample_factor == 1, repeat_frames has
+    // no effect and the branch above is taken regardless of its value.
     std::vector<std::vector<int32> > shifted_alignments(subsample_factor);
+    // We create alignments for every shift from subsample_factor - 1
+    // down to 0, inclusive.
     for (int32 conversion_shift = subsample_factor - 1;
          conversion_shift >= 0; conversion_shift--) {
       if (!ConvertAlignmentInternal(old_trans_model,
                                     new_trans_model,
                                     new_ctx_dep,
                                     old_alignment,
-                                    conversion_shift,
+                                    conversion_shift, // conversion_shift
                                     subsample_factor,
                                     new_is_reordered,
                                     phone_map,
@@ -1211,9 +1147,9 @@ void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep,
   typedef fst::StdArc Arc;
   int32 length = alignment->size();
   BaseFloat prob_scale = 0.0;
-  fst::VectorFst<Arc> *fst = GetHmmAsFsaSimple(phone_window, ctx_dep,
-                                               trans_model, prob_scale);
-  fst::RmEpsilon(fst);
+  fst::VectorFst<Arc> fst = GetHmmAsFsaSimple(phone_window, ctx_dep,
+                                              trans_model, prob_scale);
+  fst::RmEpsilon(&fst);
 
   fst::VectorFst<Arc> length_constraint_fst;
   {  // set up length_constraint_fst.
@@ -1236,7 +1172,7 @@ void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep,
     length_constraint_fst.SetFinal(cur_state, fst::TropicalWeight::One());
   }
   fst::VectorFst<Arc> composed_fst;
-  fst::Compose(*fst, length_constraint_fst, &composed_fst);
+  fst::Compose(fst, length_constraint_fst, &composed_fst);
   fst::VectorFst<Arc> single_path_fst;
   {  // randomly generate a single path.
     fst::UniformArcSelector<Arc> selector;
diff --git a/src/hmm/hmm-utils.h b/src/hmm/hmm-utils.h
index 9cefa557bb3..588810cbd69 100644
--- a/src/hmm/hmm-utils.h
+++ b/src/hmm/hmm-utils.h
@@ -20,6 +20,8 @@
 #ifndef KALDI_HMM_HMM_UTILS_H_
 #define KALDI_HMM_HMM_UTILS_H_
 
+#include <memory>
+
 #include "hmm/topology.h"
 #include "hmm/transitions.h"
 #include "lat/kaldi-lattice.h"
@@ -66,7 +68,7 @@ struct HmmCacheHash {
 /// HmmCacheType is a map from (central-phone, sequence of pdf-ids) to FST, used
 /// as cache in GetHmmAsFsa, as an optimization.
 typedef unordered_map<std::pair<int32, std::vector<int32> >,
-                      fst::VectorFst<fst::StdArc>*,
+                      std::shared_ptr<fst::ExpandedFst<fst::StdArc>>,
                       HmmCacheHash> HmmCacheType;
 
 
@@ -76,6 +78,7 @@ typedef unordered_map<std::pair<int32, std::vector<int32> >,
 /// "Fst".  This acceptor does not include self-loops; you have to call
 /// AddSelfLoops() for that.  (We do that at a later graph compilation phase,
 /// for efficiency).  The labels on the FSA correspond to transition-ids.
+/// TODO: we now already have self-loops here; check whether that is a problem.
 ///
 /// as the symbols.
 /// For documentation in context, see \ref hmm_graph_get_hmm_as_fst
@@ -88,9 +91,9 @@ typedef unordered_map<std::pair<int32, std::vector<int32> >,
 ///   @param config Configuration object, see \ref HTransducerConfig.
 ///   @param cache Object used as a lookaside buffer to save computation;
 ///       if it finds that the object it needs is already there, it will
-///       just return a pointer value from "cache"-- not that this means
+///       just return a pointer value from "cache"-- note that this means
 ///       you have to be careful not to delete things twice.
-fst::VectorFst<fst::StdArc> *GetHmmAsFsa(
+std::shared_ptr<fst::ExpandedFst<fst::StdArc>> GetHmmAsFsa(
     std::vector<int32> context_window,
     const ContextDependencyInterface &ctx_dep,
     const Transitions &trans_model,
@@ -101,7 +104,7 @@ fst::VectorFst<fst::StdArc> *GetHmmAsFsa(
 /// Included mainly as a form of documentation, not used in any other code
 /// currently.  Creates the acceptor FST with self-loops, and with fewer
 /// options.
-fst::VectorFst<fst::StdArc>*
+const fst::StdVectorFst&
 GetHmmAsFsaSimple(std::vector<int32> context_window,
                   const ContextDependencyInterface &ctx_dep,
                   const Transitions &trans_model,
@@ -123,7 +126,7 @@ GetHmmAsFsaSimple(std::vector<int32> context_window,
   * the input of the transducer (i.e. same symbol type as whatever is on the
   * input of the transducer
   */
-fst::VectorFst<fst::StdArc>*
+std::unique_ptr<fst::VectorFst<fst::StdArc>>
 GetHTransducer(const std::vector<std::vector<int32> > &ilabel_info,
                const ContextDependencyInterface &ctx_dep,
                const Transitions &trans_model,
@@ -186,6 +189,7 @@ void AddSelfLoops(const Transitions &trans_model,
                   const std::vector<int32> &disambig_syms,  // used as a check only.
                   BaseFloat self_loop_scale,
                   bool reorder,
+                  // Use arcfilter.h for this.
                   bool check_no_self_loops,
                   fst::VectorFst<fst::StdArc> *fst);
 
@@ -224,7 +228,7 @@ void AddTransitionProbs(const Transitions &trans_model,
 
 /// Returns a transducer from pdfs plus one (input) to  transition-ids (output).
 /// Currenly of use only for testing.
-fst::VectorFst<fst::StdArc>*
+std::unique_ptr<fst::VectorFst<fst::StdArc>>
 GetPdfToTransitionIdTransducer(const Transitions &trans_model);
 
 /// Converts all transition-ids in the FST to pdfs plus one.
diff --git a/src/hmm/topology.cc b/src/hmm/topology.cc
index df11b098361..9e4f1b86771 100644
--- a/src/hmm/topology.cc
+++ b/src/hmm/topology.cc
@@ -155,6 +155,9 @@ void Topology::Check() {
     KALDI_ERR << "Entry with no corresponding phones.";
 
   for (auto const& entry: entries_) {
+    if (!fst::Verify(entry)) {
+      KALDI_ERR << "Ill-formed FST provided.";
+    }
     if (entry.NumStates() <= 1)
       KALDI_ERR << "Cannot only have one state (must have a "
                 << "final state and a start state).";
@@ -174,8 +177,8 @@ void Topology::Check() {
           KALDI_ERR << "The topology must be an acceptor but ilabel != olabel.";
         if (arc.ilabel == 0)
           KALDI_ERR << "Epsilon arcs (pdf-class 0) are not allowed.";
-        if (state != entry.Start() && arc.nextstate == entry.Start())
-          KALDI_ERR << "Start state cannot have any inward transitions.";
+        if (arc.nextstate == entry.Start())
+          KALDI_ERR << "Start state may not have any inward transitions.";
         seen_pdf_classes.push_back(arc.ilabel);
         outward_prob_sum += exp(-arc.weight.Value());
       }
diff --git a/src/hmm/topology.h b/src/hmm/topology.h
index e99b0db7c18..680329e3c24 100644
--- a/src/hmm/topology.h
+++ b/src/hmm/topology.h
@@ -55,9 +55,7 @@ namespace kaldi {
        different arcs with the same pdf-class are allowed.  (We avoid 0
        because it is "special" in OpenFST, it is used for epsilon).
      - The start state must be state 0 and there must be no
-       transitions entering it except (possibly) a self-loop (although
-       a self-loop on state 0 is not advised for decoding-graph-size
-       reasons)
+       transitions entering it.
      - The start state must not be final.
      - No phone (in the <ForPhones>...</ForPhones> block) may have the value 0.
 
diff --git a/src/hmm/transitions.cc b/src/hmm/transitions.cc
index 493a14f62d8..42986d27f66 100644
--- a/src/hmm/transitions.cc
+++ b/src/hmm/transitions.cc
@@ -29,7 +29,7 @@
 
 namespace kaldi {
 
-bool Transitions::operator == (const Transitions &other) {
+bool Transitions::operator == (const Transitions &other) const {
   return topo_ == other.topo_ && info_ == other.info_ &&
       num_pdfs_ == other.num_pdfs_;
 }
diff --git a/src/hmm/transitions.h b/src/hmm/transitions.h
index fa62c03bd21..2f19818744c 100644
--- a/src/hmm/transitions.h
+++ b/src/hmm/transitions.h
@@ -69,6 +69,7 @@ static const int kNoPdf = -1;
 //                   indexes pdf's, either forward or self-loop).  Zero-based.
 //                   In DNN-based systems this would be the column index of
 //                   the neural net output.
+//                   Here, it's "this state". Presumably the source?
 // (*)self-loop-pdf-id:  The pdf-id associated with the self-loop of this state,
 //                   if there is one (we do not allow >1), or -1 if there is no
 //                   self-loop.  This will be the same as 'pdf-id' if this transition
@@ -77,7 +78,7 @@ static const int kNoPdf = -1;
 //                   why it's necessary is that we initially create the graph
 //                   without self-loops (for efficiency) and we need to be able
 //                   to look up the corresponding self-loop transition-id to
-//                   add self-loops to the graph.
+//                   add self-loops to the graph. Duh! That makes complete sense!
 //
 //   transition-id:  The numbers that we put on the decoding-graph arcs.
 //                   Each transition-id is associated with a 4-tuple
@@ -158,8 +159,22 @@ class Transitions {
     }
     // TODO.  operator == can compare all members. Also compare derived members?
     bool operator == (const TransitionIdInfo &other) const {
-      return phone == other.phone && topo_state == other.topo_state &&
-          pdf_id == other.pdf_id && self_loop_pdf_id == other.self_loop_pdf_id;
+      if (phone == other.phone && topo_state == other.topo_state &&
+          pdf_id == other.pdf_id) {
+        // This assertion is no longer true. Two states can have
+        // different arc_index fields. This equality operator is just
+        // bizarre. Should a TransitionIdInfo really be the same as
+        // another one if they don't have the same arc_index? I don't
+        // think so...  Should probably make a TransitionState class
+        // exposing a different operator== based on this class.
+        KALDI_ASSERT(self_loop_pdf_id == other.self_loop_pdf_id);
+        return true;
+      } else {
+        return false;
+      }
+    }
+    bool operator != (const TransitionIdInfo &other) const {
+      return !(*this == other);
     }
   };
 
@@ -195,12 +210,13 @@ class Transitions {
              const Vector<double> *occs = NULL);
 
   /// returns true if this is identical to 'other'
-  bool operator == (const Transitions &other);
+  bool operator == (const Transitions &other) const;
 
  private:
 
-  // Called from constructor.  initializes info_ (at least, the first 5
-  // fields); you then have to call ComputeDerived() to initalize teh rest.
+  // Called from constructor.  initializes info_ (at least, the first
+  // 5 fields); the implementation then has to call ComputeDerived()
+  // to initialize the rest.
   void ComputeInfo(const ContextDependencyInterface &ctx_dep);
 
   void ComputeDerived();  // Called from constructor and Read function.
@@ -211,8 +227,8 @@ class Transitions {
   Topology topo_;
 
   /// Information about transition-ids, indexed by transition-id.
-  /// the tuples are in sorted order which allows us to do the reverse mapping from
-  /// tuple to transition id.
+  /// the tuples are in lexicographic sorted order which allows us to do the
+  /// reverse mapping from tuple to transition id.
   std::vector<TransitionIdInfo> info_;
 
 

From f0e32f953e3622eacfd07ada30e964aa37960676 Mon Sep 17 00:00:00 2001
From: Daniel Galvez <dt.galvez@gmail.com>
Date: Wed, 17 Apr 2019 13:28:57 -0400
Subject: [PATCH 028/163] Clean up based on feedback.

---
 src/bin/align-compiled-mapped.cc      |  13 --
 src/hmm/hmm-utils.cc                  | 206 +++++---------------------
 src/hmm/hmm-utils.h                   |  37 +----
 src/latbin/lattice-add-trans-probs.cc |  92 ------------
 4 files changed, 40 insertions(+), 308 deletions(-)
 delete mode 100644 src/latbin/lattice-add-trans-probs.cc

diff --git a/src/bin/align-compiled-mapped.cc b/src/bin/align-compiled-mapped.cc
index ab7425c1a32..a47231f7b5a 100644
--- a/src/bin/align-compiled-mapped.cc
+++ b/src/bin/align-compiled-mapped.cc
@@ -50,17 +50,11 @@ int main(int argc, char *argv[]) {
     AlignConfig align_config;
     bool binary = true;
     BaseFloat acoustic_scale = 1.0;
-    BaseFloat transition_scale = 1.0;
-    BaseFloat self_loop_scale = 1.0;
 
     align_config.Register(&po);
     po.Register("binary", &binary, "Write output in binary mode");
-    po.Register("transition-scale", &transition_scale,
-                "Transition-probability scale [relative to acoustics]");
     po.Register("acoustic-scale", &acoustic_scale,
                 "Scaling factor for acoustic likelihoods");
-    po.Register("self-loop-scale", &self_loop_scale,
-                "Scale of self-loop versus non-self-loop log probs [relative to acoustics]");
     po.Read(argc, argv);
 
     if (po.NumArgs() < 4 || po.NumArgs() > 5) {
@@ -110,13 +104,6 @@ int main(int argc, char *argv[]) {
         continue;
       }
 
-      {  // Add transition-probs to the FST.
-        std::vector<int32> disambig_syms;  // empty.
-        AddTransitionProbs(trans_model, disambig_syms,
-                           transition_scale, self_loop_scale,
-                           &decode_fst);
-      }
-
       DecodableMatrixScaledMapped decodable(trans_model, loglikes, acoustic_scale);
 
       AlignUtteranceWrapper(align_config, utt,
diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc
index ebd5429afee..b1d1aa46d45 100644
--- a/src/hmm/hmm-utils.cc
+++ b/src/hmm/hmm-utils.cc
@@ -394,13 +394,12 @@ class TidToTstateMapper {
 };
 
 // This is the code that expands an FST from transition-states to
-// transition-ids, in the case where reorder == true, i.e. the non-optional
-// transition is before the self-loop.
-static void AddSelfLoopsReorder(const Transitions &trans_model,
-                                const std::vector<int32> &disambig_syms,
-                                BaseFloat self_loop_scale,
-                                bool check_no_self_loops,
-                                fst::VectorFst<fst::StdArc> *fst) {
+// transition-ids, in the case where the non-optional transition is before
+// the self-loop.
+static void AddSelfLoopsInternal(const Transitions &trans_model,
+                                 const std::vector<int32> &disambig_syms,
+                                 bool check_no_self_loops,
+                                 fst::VectorFst<fst::StdArc> *fst) {
   using namespace fst;
   typedef StdArc Arc;
   typedef Arc::Label Label;
@@ -461,12 +460,12 @@ static void AddSelfLoopsReorder(const Transitions &trans_model,
       // WARNING: This is no longer the forward probability that this code was originally using!
       // It is difficult to get the self-loop probability just given the 
       BaseFloat log_prob = trans_state.transition_cost;
-      fst->SetFinal(s, Times(fst->Final(s), Weight(-log_prob*self_loop_scale)));
+      fst->SetFinal(s, Times(fst->Final(s), Weight(-log_prob)));
       for (MutableArcIterator<MutableFst<Arc> > aiter(fst, s);
           !aiter.Done();
           aiter.Next()) {
         Arc arc = aiter.Value();
-        arc.weight = Times(arc.weight, Weight(-log_prob*self_loop_scale));
+        arc.weight = Times(arc.weight, Weight(-log_prob));
         aiter.SetValue(arc);
       }
       // Now add self-loop, if needed.
@@ -475,60 +474,10 @@ static void AddSelfLoopsReorder(const Transitions &trans_model,
       // itself, right? How are we supposed to get the current state's
       // self-loop? Ugh. And make sure I don't repeat it...
       if (trans_id != 0) {  // has self-loop.
-        BaseFloat log_prob = trans_model.GetTransitionLogProb(trans_id);
-        fst->AddArc(s, Arc(trans_id, 0, Weight(-log_prob*self_loop_scale), s));
-      }
-    }
-  }
-}
-
-
-// this is the code that expands an FST from transition-states to
-// transition-ids, in the case where reorder == false, i.e. non-optional
-// transition is after the self-loop.
-static void AddSelfLoopsNoReorder(
-    const Transitions &trans_model,
-    const std::vector<int32> &disambig_syms,
-    BaseFloat self_loop_scale,
-    bool check_no_self_loops,
-    fst::VectorFst<fst::StdArc> *fst) {
-  using namespace fst;
-  typedef StdArc Arc;
-  typedef Arc::Label Label;
-  typedef Arc::StateId StateId;
-  typedef Arc::Weight Weight;
-
-  // Duplicate states as necessary so that each state has at most one self-loop
-  // on it.
-  TidToTstateMapper f(trans_model, disambig_syms, check_no_self_loops);
-  MakeFollowingInputSymbolsSameClass(true, fst, f);
-
-  StateId num_states = fst->NumStates();
-  for (StateId s = 0; s < num_states; s++) {
-    int32 my_trans_state = f(kNoLabel);
-    KALDI_ASSERT(my_trans_state == -1);
-    for (MutableArcIterator<VectorFst<Arc> > aiter(fst, s);
-         !aiter.Done();
-         aiter.Next()) {
-      Arc arc = aiter.Value();
-      if (my_trans_state == -1) my_trans_state = f(arc.ilabel);
-      else KALDI_ASSERT(my_trans_state == f(arc.ilabel));  // or MakeFollowingInputSymbolsSameClass failed.
-      if (my_trans_state > 0) {  // transition-id; multiply weight...
-        BaseFloat log_prob = trans_model.GetNonSelfLoopLogProb(my_trans_state);
-        arc.weight = Times(arc.weight, Weight(-log_prob*self_loop_scale));
-        aiter.SetValue(arc);
-      }
-    }
-    if (fst->Final(s) != Weight::Zero()) {
-      KALDI_ASSERT(my_trans_state == kNoLabel || my_trans_state == 0);  // or MakeFollowingInputSymbolsSameClass failed.
-    }
-    if (my_trans_state != kNoLabel && my_trans_state != 0) {
-      // a transition-state;  add self-loop, if it has one.
-      int32 trans_id = trans_model.SelfLoopOf(my_trans_state);
-      if (trans_id != 0) {  // has self-loop.
-        BaseFloat neg_log_prob = \
-          trans_model.InfoForTransitionId(trans_id).transition_cost;
-        fst->AddArc(s, Arc(trans_id, 0, Weight(neg_log_prob*self_loop_scale), s));
+        auto&& trans_info = trans_model.InfoForTransitionId(trans_id).transition_cost;
+        StateId next_state = TODO;
+        BaseFloat log_prob = trans_model.InfoForTransitionId(trans_id).transition_cost;
+        fst->AddArc(next_state, Arc(trans_id, 0, Weight(-log_prob), next_state));
       }
     }
   }
@@ -536,51 +485,10 @@ static void AddSelfLoopsNoReorder(
 
 void AddSelfLoops(const Transitions &trans_model,
                   const std::vector<int32> &disambig_syms,
-                  BaseFloat self_loop_scale,
-                  bool reorder,
                   bool check_no_self_loops,
                   fst::VectorFst<fst::StdArc> *fst) {
   KALDI_ASSERT(fst->Start() != fst::kNoStateId);
-  if (reorder)
-    AddSelfLoopsReorder(trans_model, disambig_syms, self_loop_scale,
-                        check_no_self_loops, fst);
-  else
-    AddSelfLoopsNoReorder(trans_model, disambig_syms, self_loop_scale,
-                          check_no_self_loops, fst);
-}
-
-// IsReordered returns true if the transitions were possibly reordered.  This reordering
-// can happen in AddSelfLoops, if the "reorder" option was true.
-// This makes the out-transition occur before the self-loop transition.
-// The function returns false (no reordering) if there is not enough information in
-// the alignment to tell (i.e. no self-loop were taken), and in this case the calling
-// code doesn't care what the answer is.
-// The "alignment" vector contains a sequence of TransitionIds.
-
-static bool IsReordered(const Transitions &trans_model,
-                        const std::vector<int32> &alignment) {
-  for (size_t i = 0; i + 1 < alignment.size(); i++) {
-    const TransitionIdInfo& tstate1 = trans_model.InfoForTransitionId(alignment[i]),
-      tstate2 = trans_model.InfoForTransitionId(alignment[i+1]);
-    if (tstate1 != tstate2) {
-      bool is_loop_1 = trans_model.IsSelfLoop(alignment[i]),
-          is_loop_2 = trans_model.IsSelfLoop(alignment[i+1]);
-      KALDI_ASSERT(!(is_loop_1 && is_loop_2));  // Invalid.
-      if (is_loop_1) return true;  // Reordered. self-loop is last.
-      if (is_loop_2) return false;  // Not reordered.  self-loop is first.
-    }
-  }
-
-  // Just one trans-state in whole sequence.
-  if (alignment.empty()) return false;
-  else {
-    bool is_loop_front = trans_model.IsSelfLoop(alignment.front()),
-        is_loop_back = trans_model.IsSelfLoop(alignment.back());
-    if (is_loop_front) return false;  // Not reordered.  Self-loop is first.
-    if (is_loop_back) return true;  // Reordered.  Self-loop is last.
-    return false;  // We really don't know in this case but calling code should
-    // not care.
-  }
+  AddSelfLoopsInternal(trans_model, disambig_syms, check_no_self_loops, fst);
 }
 
 // SplitToPhonesInternal takes as input the "alignment" vector containing
@@ -593,7 +501,6 @@ static bool IsReordered(const Transitions &trans_model,
 
 static bool SplitToPhonesInternal(const Transitions &trans_model,
                                   const std::vector<int32> &alignment,
-                                  bool reordered,
                                   std::vector<std::vector<int32> > *split_output) {
   if (alignment.empty()) return true;  // nothing to split.
   std::vector<size_t> end_points;  // points at which phones end [in an
@@ -604,25 +511,24 @@ static bool SplitToPhonesInternal(const Transitions &trans_model,
   for (size_t i = 0; i < alignment.size(); i++) {
     int32 trans_id = alignment[i];
     if (trans_model.InfoForTransitionId(trans_id).is_final) {
-      if (!reordered) end_points.push_back(i+1);
-      else {  // reordered.
-        while (i+1 < alignment.size() &&
-              trans_model.IsSelfLoop(alignment[i+1])) {
-          KALDI_ASSERT(trans_model.InfoForTransitionId(alignment[i]) ==
-                       trans_model.InfoForTransitionId(alignment[i+1]));
-          i++;
-        }
-        end_points.push_back(i+1);
+      while (i+1 < alignment.size() &&
+             trans_model.InfoForTransitionId(alignment[i+1]).is_self_loop) {
+        KALDI_ASSERT(trans_model.InfoForTransitionId(alignment[i]) ==
+                     trans_model.InfoForTransitionId(alignment[i+1]));
+        i++;
       }
+      end_points.push_back(i+1);
     } else if (i+1 == alignment.size()) {
       // need to have an end-point at the actual end.
       // but this is an error- should have been detected already.
       was_ok = false;
       end_points.push_back(i+1);
     } else {
-      int32 this_phone = trans_model.InfoForTransitionId(this_state).phone,
-          next_phone = trans_model.InfoForTransitionId(next_state).phone;
-      if (this_phone != next_phone) {
+      int32 this_phone = trans_model.InfoForTransitionId(trans_id).phone;
+      int32 next_trans_id = alignment[i+1];
+      int32 next_phone = trans_model.InfoForTransitionId(next_trans_id).phone;
+
+      if (this_phone != next_phone){
         // The phone changed, but this is an error-- we should have detected this via the
         // is_final check.
         was_ok = false;
@@ -632,17 +538,17 @@ static bool SplitToPhonesInternal(const Transitions &trans_model,
   }
 
   size_t cur_point = 0;
-  for (size_t i = 0; i < end_points.size(); i++) {
+  for (int32 end_point: end_points) {
     split_output->push_back(std::vector<int32>());
     // The next if-statement checks if the initial trans-id at the
     // current end point is the initial-state of the current phone (a
     // cursory check that the alignment is plausible).
-    int32 topo_state = trans_model.InfoForTransitionId(InfoForTransitionId).topo_state;
+    int32 topo_state = trans_model.InfoForTransitionId(end_point).topo_state;
     if (topo_state != 0)
       was_ok = false;
-    for (size_t j = cur_point; j < end_points[i]; j++)
+    for (size_t j = cur_point; j < end_point; j++)
       split_output->back().push_back(alignment[j]);
-    cur_point = end_points[i];
+    cur_point = end_point;
   }
   return was_ok;
 }
@@ -654,9 +560,7 @@ bool SplitToPhones(const Transitions &trans_model,
   KALDI_ASSERT(split_alignment != NULL);
   split_alignment->clear();
 
-  bool is_reordered = IsReordered(trans_model, alignment);
-  return SplitToPhonesInternal(trans_model, alignment,
-                               is_reordered, split_alignment);
+  return SplitToPhonesInternal(trans_model, alignment, split_alignment);
 }
 
 
@@ -861,7 +765,7 @@ static bool ConvertAlignmentInternal(const Transitions &old_trans_model,
                       const std::vector<int32> *phone_map,
                       std::vector<int32> *new_alignment) {
   KALDI_ASSERT(0 <= conversion_shift && conversion_shift < subsample_factor);
-  bool old_is_reordered = IsReordered(old_trans_model, old_alignment);
+  bool old_is_reordered = true;
   KALDI_ASSERT(new_alignment != NULL);
   new_alignment->clear();
   new_alignment->reserve(old_alignment.size());
@@ -997,34 +901,9 @@ bool ConvertAlignment(const Transitions &old_trans_model,
   return true;
 }
 
-// Returns the scaled, but not negated, log-prob, with the given scaling factors.
-static BaseFloat GetScaledTransitionLogProb(const Transitions &trans_model,
-                                            int32 trans_id,
-                                            BaseFloat transition_scale,
-                                            BaseFloat self_loop_scale) {
-  if (transition_scale == self_loop_scale) {
-    return trans_model.GetTransitionLogProb(trans_id) * transition_scale;
-  } else {
-    if (trans_model.IsSelfLoop(trans_id)) {
-      return self_loop_scale * trans_model.GetTransitionLogProb(trans_id);
-    } else {
-      int32 trans_state = trans_model.TransitionIdToTransitionState(trans_id);
-      return self_loop_scale * trans_model.GetNonSelfLoopLogProb(trans_state)
-          + transition_scale * trans_model.GetTransitionLogProbIgnoringSelfLoops(trans_id);
-      // This could be simplified to
-      // (self_loop_scale - transition_scale) * trans_model.GetNonSelfLoopLogProb(trans_state)
-      // + trans_model.GetTransitionLogProb(trans_id);
-      // this simplifies if self_loop_scale == 0.0
-    }
-  }
-}
-
-
-
 void AddTransitionProbs(const Transitions &trans_model,
                         const std::vector<int32> &disambig_syms,  // may be empty
                         BaseFloat transition_scale,
-                        BaseFloat self_loop_scale,
                         fst::VectorFst<fst::StdArc> *fst) {
   using namespace fst;
   KALDI_ASSERT(IsSortedAndUniq(disambig_syms));
@@ -1038,16 +917,13 @@ void AddTransitionProbs(const Transitions &trans_model,
       StdArc arc = aiter.Value();
       StdArc::Label l = arc.ilabel;
       if (l >= 1 && l <= num_tids) {  // a transition-id.
-        BaseFloat scaled_log_prob = GetScaledTransitionLogProb(trans_model,
-                                                               l,
-                                                               transition_scale,
-                                                               self_loop_scale);
+        BaseFloat scaled_log_prob =
+          trans_model.InfoForTransitionId(l).transition_cost * transition_scale;
         arc.weight = Times(arc.weight, TropicalWeight(-scaled_log_prob));
-      } else if (l != 0) {
-        if (!std::binary_search(disambig_syms.begin(), disambig_syms.end(),
-                               arc.ilabel))
-          KALDI_ERR << "AddTransitionProbs: invalid symbol " << arc.ilabel
-                    << " on graph input side.";
+      } else if (l != 0 && !std::binary_search(disambig_syms.begin(),
+                                               disambig_syms.end(),l)) {
+        KALDI_ERR << "AddTransitionProbs: invalid symbol " << arc.ilabel
+                  << " on graph input side.";
       }
       aiter.SetValue(arc);
     }
@@ -1056,7 +932,6 @@ void AddTransitionProbs(const Transitions &trans_model,
 
 void AddTransitionProbs(const Transitions &trans_model,
                         BaseFloat transition_scale,
-                        BaseFloat self_loop_scale,
                         Lattice *lat) {
   using namespace fst;
   int num_tids = trans_model.NumTransitionIds();
@@ -1069,10 +944,8 @@ void AddTransitionProbs(const Transitions &trans_model,
       LatticeArc arc = aiter.Value();
       LatticeArc::Label l = arc.ilabel;
       if (l >= 1 && l <= num_tids) {  // a transition-id.
-        BaseFloat scaled_log_prob = GetScaledTransitionLogProb(trans_model,
-                                                               l,
-                                                               transition_scale,
-                                                               self_loop_scale);
+        BaseFloat scaled_log_prob =
+          trans_model.InfoForTransitionId(l).transition_cost * transition_scale;
         // cost is negated log prob.
         arc.weight.SetValue1(arc.weight.Value1() - scaled_log_prob);
       } else if (l != 0) {
@@ -1156,7 +1029,7 @@ void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep,
     std::vector<int32> symbols;
     bool include_epsilon = false;
     // note: 'fst' is an acceptor so ilabels == olabels.
-    GetInputSymbols(*fst, include_epsilon, &symbols);
+    GetInputSymbols(fst, include_epsilon, &symbols);
     int32 cur_state = length_constraint_fst.AddState();
     length_constraint_fst.SetStart(cur_state);
     for (int32 i = 0; i < length; i++) {
@@ -1190,7 +1063,6 @@ void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep,
       single_path_fst, &symbol_sequence, NULL, NULL);
   KALDI_ASSERT(ans && symbol_sequence.size() == length);
   symbol_sequence.swap(*alignment);
-  delete fst;
 }
 
 void ChangeReorderingOfAlignment(const Transitions &trans_model,
diff --git a/src/hmm/hmm-utils.h b/src/hmm/hmm-utils.h
index 588810cbd69..73cce66eee3 100644
--- a/src/hmm/hmm-utils.h
+++ b/src/hmm/hmm-utils.h
@@ -170,8 +170,6 @@ void GetIlabelMapping(const std::vector<std::vector<int32> > &ilabel_info_old,
   * @param trans_model [in] Transition model
   * @param disambig_syms [in] Sorted, uniq list of disambiguation symbols, required
   *       if the graph contains disambiguation symbols but only needed for sanity checks.
-  * @param self_loop_scale [in] Transition-probability scale for self-loops; c.f.
-  *                    \ref hmm_scale
   * @param reorder [in] If true, reorders the transitions (see \ref hmm_reorder).
   *                     You'll normally want this to be true.
   * @param check_no_self_loops [in]  If true, it will check that there are no
@@ -187,47 +185,14 @@ void GetIlabelMapping(const std::vector<std::vector<int32> > &ilabel_info_old,
   */
 void AddSelfLoops(const Transitions &trans_model,
                   const std::vector<int32> &disambig_syms,  // used as a check only.
-                  BaseFloat self_loop_scale,
                   bool reorder,
                   // Use arcfilter.h for this.
                   bool check_no_self_loops,
                   fst::VectorFst<fst::StdArc> *fst);
 
-/**
-  * Adds transition-probs, with the supplied
-  * scales (see \ref hmm_scale), to the graph.
-  * Useful if you want to create a graph without transition probs, then possibly
-  * train the model (including the transition probs) but keep the graph fixed,
-  * and add back in the transition probs.  It assumes the fst has transition-ids
-  * on it.  It is not an error if the FST has no states (nothing will be done).
-  * @param trans_model [in] The transition model
-  * @param disambig_syms [in] A list of disambiguation symbols, required if the
-  *                       graph has disambiguation symbols on its input but only
-  *                       used for checks.
-  * @param transition_scale [in] A scale on transition-probabilities apart from
-  *                      those involving self-loops; see \ref hmm_scale.
-  * @param self_loop_scale [in] A scale on self-loop transition probabilities;
-  *                      see \ref hmm_scale.
-  * @param  fst [in, out] The FST to be modified.
-  */
-void AddTransitionProbs(const Transitions &trans_model,
-                        const std::vector<int32> &disambig_syms,
-                        BaseFloat transition_scale,
-                        BaseFloat self_loop_scale,
-                        fst::VectorFst<fst::StdArc> *fst);
-
-/**
-   This is as AddSelfLoops(), but operates on a Lattice, where
-   it affects the graph part of the weight (the first element
-   of the pair). */
-void AddTransitionProbs(const Transitions &trans_model,
-                        BaseFloat transition_scale,
-                        BaseFloat self_loop_scale,
-                        Lattice *lat);
-
 
 /// Returns a transducer from pdfs plus one (input) to  transition-ids (output).
-/// Currenly of use only for testing.
+/// Currently of use only for testing.
 std::unique_ptr<fst::VectorFst<fst::StdArc>>
 GetPdfToTransitionIdTransducer(const Transitions &trans_model);
 
diff --git a/src/latbin/lattice-add-trans-probs.cc b/src/latbin/lattice-add-trans-probs.cc
deleted file mode 100644
index 7f764756930..00000000000
--- a/src/latbin/lattice-add-trans-probs.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-// latbin/lattice-add-trans-probs.cc
-
-// Copyright 2009-2011  Microsoft Corporation
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "fstext/fstext-lib.h"
-#include "lat/kaldi-lattice.h"
-#include "lat/lattice-functions.h"
-#include "hmm/transitions.h"
-#include "hmm/hmm-utils.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-    using fst::SymbolTable;
-    using fst::VectorFst;
-    using fst::StdArc;
-
-    const char *usage =
-        "Add transition probabilities into graph part of lattice scores,\n"
-        "controlled by options --transition-scale and --self-loop-scale, which\n"
-        "for compatibility with the original graph, would normally be set to the same\n"
-        "values used in graph compilatoin\n"
-        "\n"
-        "Usage: lattice-add-trans-probs [options] model lattice-rspecifier lattice-wspecifier\n"
-        " e.g.: lattice-add-trans-probs --transition-scale=1.0 --self-loop-scale=0.1 1.mdl ark:in.lats ark:out.lats\n";
-
-    ParseOptions po(usage);
-
-    BaseFloat transition_scale = 1.0, self_loop_scale = 1.0;
-
-    po.Register("transition-scale", &transition_scale,
-                "Scale for transition probabilities (excluding self-loops)");
-    po.Register("self-loop-scale", &self_loop_scale,
-                "Probability scale for self-loop vs. non-self-loop "
-                "probability mass.");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string
-        model_rxfilename = po.GetArg(1),
-        lats_rspecifier = po.GetArg(2),
-        lats_wspecifier = po.GetArg(3);
-
-    int32 n_done = 0;
-
-    Transitions trans_model;
-
-    ReadKaldiObject(model_rxfilename, &trans_model);
-
-    SequentialLatticeReader lattice_reader(lats_rspecifier); // read as
-    // regular lattice.
-    CompactLatticeWriter clat_writer(lats_wspecifier); // write as compact.
-    for (; !lattice_reader.Done(); lattice_reader.Next(), n_done++) {
-      Lattice lat(lattice_reader.Value());
-      AddTransitionProbs(trans_model, transition_scale, self_loop_scale, &lat);
-      CompactLattice clat;
-      ConvertLattice(lat, &clat);
-      clat_writer.Write(lattice_reader.Key(), clat);
-      n_done++;
-    }
-    KALDI_LOG << "Done adding transition probabilities to " << n_done << " lattices.";
-    return (n_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}

From a8b558059487186035be9bd5b90a69240f80b901 Mon Sep 17 00:00:00 2001
From: Daniel Galvez <dt.galvez@gmail.com>
Date: Thu, 18 Apr 2019 02:53:54 -0400
Subject: [PATCH 029/163] hmm-utils.cc: Everything compiles except for
 AddSelfLoops.

---
 src/fstext/fstext-utils-inl.h |   2 +-
 src/hmm/hmm-utils.cc          | 126 +++++++++++++---------------------
 src/hmm/hmm-utils.h           |  23 ++-----
 src/hmm/transitions.cc        |   9 +++
 src/hmm/transitions.h         |   2 +
 src/hmm/tree-accu.cc          |   3 +-
 6 files changed, 63 insertions(+), 102 deletions(-)

diff --git a/src/fstext/fstext-utils-inl.h b/src/fstext/fstext-utils-inl.h
index f6283d9794e..8eadf3dfc48 100644
--- a/src/fstext/fstext-utils-inl.h
+++ b/src/fstext/fstext-utils-inl.h
@@ -678,7 +678,7 @@ MakeLoopFst(const vector<std::unique_ptr<const ExpandedFst<Arc>>> &fsts) {
 
   for (Label i = 0; i < static_cast<Label>(fsts.size()); i++) {
     // TODO(galv): I feel like this won't work with my unique_ptr usage. Call .get()?
-    const ExpandedFst<Arc> *fst = fsts[i];
+    const ExpandedFst<Arc> *fst = fsts[i].get();
     if (fst == NULL) continue;
     { // optimization with cache: helpful if some members of "fsts" may
       // contain the same pointer value (e.g. in GetHTransducer).
diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc
index b1d1aa46d45..337f590503f 100644
--- a/src/hmm/hmm-utils.cc
+++ b/src/hmm/hmm-utils.cc
@@ -2,6 +2,7 @@
 
 // Copyright 2009-2011  Microsoft Corporation
 //                2018  Johns Hopkins University (author: Daniel Povey)
+//                2019  Daniel Galvez
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -85,25 +86,28 @@ std::shared_ptr<fst::ExpandedFst<fst::StdArc>> GetHmmAsFsa(
        !siter.Done(); siter.Next()) {
     StateId state = siter.Value();
     std::vector<Arc> non_self_loops;
+    BaseFloat non_self_loop_prob = 1.0;
     for (fst::ArcIterator<MyEditFst> aiter(*loopless_entry, state);
          !aiter.Done(); aiter.Next()) {
       const Arc& arc = aiter.Value();
       if (arc.nextstate != state) {
         non_self_loops.push_back(arc);
+      } else {
+        non_self_loop_prob -= exp(-arc.weight.Value());
       }
     }
+    KALDI_ASSERT(non_self_loop_prob >= BaseFloat(0));
     if (non_self_loops.size() != loopless_entry->NumArcs(state)) {
       loopless_entry->DeleteArcs(state);
-      for (const Arc& arc: non_self_loops) {
+      for (Arc& arc: non_self_loops) {
+        // Renormalize the remaining arcs to have an outgoing weight
+        // of 1.0, so we maintain stochasticity
+        arc.weight = Arc::Weight(-log(exp(-arc.weight.Value()) / non_self_loop_prob));
         loopless_entry->AddArc(state, arc);
       }
     }
   }
 
-  // Now apply probability scale.
-  // We waited till after the possible weight-pushing steps,
-  // because weight-pushing needs "real" weights in order to work.
-  ApplyProbabilityScale(config.transition_scale, loopless_entry.get());
   if (cache != NULL)
     (*cache)[cache_index] = loopless_entry;
   return loopless_entry;
@@ -168,7 +172,14 @@ GetHTransducer(const std::vector<std::vector<int32> > &ilabel_info,
   typedef Arc::StateId StateId;
   typedef Arc::Label Label;
 
-  std::vector<std::unique_ptr<const ExpandedFst<Arc>>> fsts(ilabel_info.size(), NULL);
+  // I would prefer to do this:
+  // std::vector<std::unique_ptr<const ExpandedFst<Arc>>> fsts(ilabel_info.size(), std::unique_ptr(nullptr));
+  // But the second arg of constructor (2) at https://en.cppreference.com/w/cpp/container/vector/vector
+  // must be able to be turned into a const-reference, which std::unique_ptr cannot be.
+  std::vector<std::unique_ptr<const ExpandedFst<Arc>>> fsts;
+  for(std::size_t i = 0; i < ilabel_info.size(); ++i) {
+    fsts.emplace_back(std::unique_ptr<const ExpandedFst<Arc>>(nullptr));
+  }
   std::vector<int32> phones = trans_model.GetPhones();
 
   KALDI_ASSERT(disambig_syms_left != 0);
@@ -415,7 +426,10 @@ static void AddSelfLoopsInternal(const Transitions &trans_model,
   // duplicated, with one copy per incoming symbol.
   MakePrecedingInputSymbolsSameClass(fst, f);
 
-  // use the following to keep track of the transition-state for each state.
+  // use the following to keep track of the transition-state incoming
+  // into each state. This works because each state now has only one
+  // transition state coming into (because of
+  // MakePrecedingInputSymbolsSameClass).
   std::vector<Class> state_in(fst->NumStates(), f.NoLabelClass());
 
   // This first loop just works out the label into each state,
@@ -429,17 +443,18 @@ static void AddSelfLoopsInternal(const Transitions &trans_model,
     for (MutableArcIterator<VectorFst<Arc> > aiter(fst, s);
          !aiter.Done();
          aiter.Next()) {
-      Arc arc = aiter.Value();
+      const Arc& arc = aiter.Value();
       Class trans_state = f(arc.ilabel);
-      if (state_in[arc.nextstate] == f.NoLabelClass())
+      if (state_in[arc.nextstate] == f.NoLabelClass()) {
         state_in[arc.nextstate] = trans_state;
-      else {
+      } else {
         KALDI_ASSERT(state_in[arc.nextstate] == trans_state);
         // or probably an error in MakePrecedingInputSymbolsSame.
       }
     }
   }
 
+  // state_in maps each state in the fst to its TransitionState
   KALDI_ASSERT(state_in[fst->Start()] == f.NoLabelClass() ||
                state_in[fst->Start()] == f.ZeroClass());
   // or MakePrecedingInputSymbolsSame failed.
@@ -459,6 +474,7 @@ static void AddSelfLoopsInternal(const Transitions &trans_model,
       
       // WARNING: This is no longer the forward probability that this code was originally using!
       // It is difficult to get the self-loop probability just given the 
+      trans_model.InfoForTransitionId(trans_state.self_loop_pdf_id).transition_cost;
       BaseFloat log_prob = trans_state.transition_cost;
       fst->SetFinal(s, Times(fst->Final(s), Weight(-log_prob)));
       for (MutableArcIterator<MutableFst<Arc> > aiter(fst, s);
@@ -577,9 +593,8 @@ static inline void ConvertAlignmentForPhone(
     const ContextDependencyInterface &new_ctx_dep,
     const std::vector<int32> &old_phone_alignment,
     const std::vector<int32> &new_phone_window,
-    bool old_is_reordered,
-    bool new_is_reordered,
     std::vector<int32> *new_phone_alignment) {
+  KALDI_ASSERT(!old_phone_alignment.empty());
   int32 alignment_size = old_phone_alignment.size();
   static bool warned_topology = false;
   int32 P = new_ctx_dep.CentralPosition(),
@@ -589,14 +604,16 @@ static inline void ConvertAlignmentForPhone(
   const Topology &old_topo = old_trans_model.GetTopo(),
       &new_topo = new_trans_model.GetTopo();
 
-  bool topology_mismatch = !(old_topo.TopologyForPhone(old_central_phone) ==
-                             new_topo.TopologyForPhone(new_central_phone));
-  if (topology_mismatch) {
-    if (!warned_topology) {
-      warned_topology = true;
-      KALDI_WARN << "Topology mismatch detected; automatically converting. "
-                 << "Won't warn again.";
-    }
+  // TODO(galv): Do we need the transition costs to be the same? Right
+  // now, I am assuming that we do, but it is unclear to me that we
+  // really need this.
+  bool topology_mismatch = !fst::Equal(old_topo.TopologyForPhone(old_central_phone),
+                                       new_topo.TopologyForPhone(new_central_phone),
+                                       0.0);
+  if (topology_mismatch && !warned_topology) {
+    warned_topology = true;
+    KALDI_WARN << "Topology mismatch detected; automatically converting. "
+               << "Won't warn again.";
   }
   bool length_mismatch =
       (new_phone_alignment->size() != old_phone_alignment.size());
@@ -605,13 +622,9 @@ static inline void ConvertAlignmentForPhone(
     // old alignment.
     GetRandomAlignmentForPhone(new_ctx_dep, new_trans_model,
                                new_phone_window, new_phone_alignment);
-    if (new_is_reordered)
-      ChangeReorderingOfAlignment(new_trans_model, new_phone_alignment);
     return;
   }
 
-  KALDI_ASSERT(!old_phone_alignment.empty());
-
   int32 new_num_pdf_classes = new_topo.NumPdfClasses(new_central_phone);
   std::vector<int32> pdf_ids(new_num_pdf_classes);  // Indexed by pdf-class
   for (int32 pdf_class = 0; pdf_class < new_num_pdf_classes; pdf_class++) {
@@ -627,26 +640,18 @@ static inline void ConvertAlignmentForPhone(
   // the topologies and lengths match -> we can directly transfer
   // the alignment.
   for (int32 j = 0; j < alignment_size; j++) {
-    int32 old_tid = old_phone_alignment[j],
-        old_tstate = old_trans_model.TransitionIdToTransitionState(old_tid);
-    int32 forward_pdf_class =
-        old_trans_model.TransitionStateToForwardPdfClass(old_tstate),
-        self_loop_pdf_class =
-        old_trans_model.TransitionStateToSelfLoopPdfClass(old_tstate);
-    int32 hmm_state = old_trans_model.TransitionIdToHmmState(old_tid);
-    int32 trans_idx = old_trans_model.TransitionIdToTransitionIndex(old_tid);
-    int32 new_forward_pdf = pdf_ids[forward_pdf_class];
-    int32 new_self_loop_pdf = pdf_ids[self_loop_pdf_class];
-    int32 new_trans_state =
-        new_trans_model.TupleToTransitionState(new_central_phone, hmm_state,
-                                               new_forward_pdf, new_self_loop_pdf);
+    int32 old_tid = old_phone_alignment[j];
+    auto&& info = old_trans_model.InfoForTransitionId(old_tid);
+    int32 old_forward_pdf_class = old_trans_model.PdfClassForTid(old_tid);
+    int32 old_self_loop_pdf_class = old_trans_model.PdfClassForTid(info.self_loop_pdf_id);
+    int32 new_forward_pdf_id = pdf_ids[old_forward_pdf_class];
+    int32 new_self_loop_pdf_id = pdf_ids[old_self_loop_pdf_class];
     int32 new_tid =
-        new_trans_model.PairToTransitionId(new_trans_state, trans_idx);
+      new_trans_model.TupleToTransitionId(new_central_phone, info.topo_state,
+                                          info.arc_index, new_forward_pdf_id,
+                                          new_self_loop_pdf_id);
     (*new_phone_alignment)[j] = new_tid;
   }
-
-  if (new_is_reordered != old_is_reordered)
-    ChangeReorderingOfAlignment(new_trans_model, new_phone_alignment);
 }
 
 
@@ -761,11 +766,9 @@ static bool ConvertAlignmentInternal(const Transitions &old_trans_model,
                       const std::vector<int32> &old_alignment,
                       int32 conversion_shift,
                       int32 subsample_factor,
-                      bool new_is_reordered,
                       const std::vector<int32> *phone_map,
                       std::vector<int32> *new_alignment) {
   KALDI_ASSERT(0 <= conversion_shift && conversion_shift < subsample_factor);
-  bool old_is_reordered = true;
   KALDI_ASSERT(new_alignment != NULL);
   new_alignment->clear();
   new_alignment->reserve(old_alignment.size());
@@ -830,7 +833,6 @@ static bool ConvertAlignmentInternal(const Transitions &old_trans_model,
 
       ConvertAlignmentForPhone(old_trans_model, new_trans_model, new_ctx_dep,
                                old_alignment_for_phone, new_phone_window,
-                               old_is_reordered, new_is_reordered,
                                &new_alignment_for_phone);
       new_alignment->insert(new_alignment->end(),
                             new_alignment_for_phone.begin(),
@@ -848,7 +850,6 @@ bool ConvertAlignment(const Transitions &old_trans_model,
                       const std::vector<int32> &old_alignment,
                       int32 subsample_factor,
                       bool repeat_frames,
-                      bool new_is_reordered,
                       const std::vector<int32> *phone_map,
                       std::vector<int32> *new_alignment) {
   if (subsample_factor == 1) {
@@ -862,7 +863,6 @@ bool ConvertAlignment(const Transitions &old_trans_model,
                                     old_alignment,
                                     subsample_factor - 1, // == 0
                                     subsample_factor,
-                                    new_is_reordered,
                                     phone_map,
                                     new_alignment);
    // The value "subsample_factor - 1" for conversion_shift above ensures the
@@ -881,7 +881,6 @@ bool ConvertAlignment(const Transitions &old_trans_model,
                                     old_alignment,
                                     conversion_shift, // conversion_shift
                                     subsample_factor,
-                                    new_is_reordered,
                                     phone_map,
                                     &shifted_alignments[conversion_shift]))
         return false;
@@ -1065,37 +1064,4 @@ void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep,
   symbol_sequence.swap(*alignment);
 }
 
-void ChangeReorderingOfAlignment(const Transitions &trans_model,
-                                 std::vector<int32> *alignment) {
-  int32 start_pos = 0, size = alignment->size();
-  while (start_pos != size) {
-    int32 start_tid = (*alignment)[start_pos];
-    int32 cur_tstate = trans_model.TransitionIdToTransitionState(start_tid);
-    bool start_is_self_loop = trans_model.IsSelfLoop(start_tid) ? 0 : 1;
-    int32 end_pos = start_pos + 1;
-    // If the first instance of this transition-state was a self-loop, then eat
-    // only non-self-loops of this state; if it was a non-self-loop, then eat
-    // only self-loops of this state.  Imposing this condition on self-loops
-    // would only actually matter in the rare circumstances that phones can
-    // have length 1.
-    while (end_pos != size &&
-           trans_model.TransitionIdToTransitionState((*alignment)[end_pos]) ==
-           cur_tstate) {
-      bool this_is_self_loop = trans_model.IsSelfLoop((*alignment)[end_pos]);
-      if (!this_is_self_loop) {
-        if (start_is_self_loop) {
-          break;  // stop before including this transition-id.
-        } else {
-          end_pos++;
-          break;  // stop after including this transition-id.
-        }
-      }
-      end_pos++;
-    }
-    std::swap((*alignment)[start_pos], (*alignment)[end_pos - 1]);
-    start_pos = end_pos;
-  }
-}
-
-
 } // namespace kaldi
diff --git a/src/hmm/hmm-utils.h b/src/hmm/hmm-utils.h
index 73cce66eee3..fc704a88f91 100644
--- a/src/hmm/hmm-utils.h
+++ b/src/hmm/hmm-utils.h
@@ -1,6 +1,7 @@
 // hmm/hmm-utils.h
 
 // Copyright 2009-2011  Microsoft Corporation
+//                2019  Daniel Galvez
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -36,19 +37,12 @@ namespace kaldi {
 /// Configuration class for the GetHTransducer() function; see
 /// \ref hmm_graph_config for context.
 struct HTransducerConfig {
-  /// Transition log-prob scale, see \ref hmm_scale.
-  /// Note this doesn't apply to self-loops; GetHTransducer() does
-  /// not include self-loops.
-  BaseFloat transition_scale;
   int32 nonterm_phones_offset;
 
   HTransducerConfig():
-      transition_scale(1.0),
       nonterm_phones_offset(-1) { }
 
   void Register (OptionsItf *opts) {
-    opts->Register("transition-scale", &transition_scale,
-                   "Scale of transition probs (relative to LM)");
     opts->Register("nonterm-phones-offset", &nonterm_phones_offset,
                    "The integer id of #nonterm_bos in phones.txt, if present. "
                    "Only needs to be set if you are doing grammar decoding, "
@@ -181,12 +175,13 @@ void GetIlabelMapping(const std::vector<std::vector<int32> > &ilabel_info_old,
   *                      chain examples.  WARNING: this was added in 2018;
   *                      if you get a compilation error, add this as 'true',
   *                      which emulates the behavior of older code.
-  * @param  fst [in, out] The FST to be modified.
+  * @param  fst [in, out] The FST to be modified. This should normally be HCLG
+  *                       or any other FST with transition ids as its input
+  *                       labels.
   */
 void AddSelfLoops(const Transitions &trans_model,
                   const std::vector<int32> &disambig_syms,  // used as a check only.
                   bool reorder,
-                  // Use arcfilter.h for this.
                   bool check_no_self_loops,
                   fst::VectorFst<fst::StdArc> *fst);
 
@@ -246,9 +241,6 @@ bool SplitToPhones(const Transitions &trans_model,
                                 'subsample_factor' separately generated
                                 alignments, to keep the phone boundaries
                                 the same as the input where possible.]
-   @param reorder [in]          True if you want the pdf-ids on the new alignment to
-                                be 'reordered'. (vs. the way they appear in
-                                the Topology object)
    @param phone_map [in]        If non-NULL, map from old to new phones.
    @param new_alignment [out]   The converted alignment.
 */
@@ -259,7 +251,6 @@ bool ConvertAlignment(const Transitions &old_trans_model,
                       const std::vector<int32> &old_alignment,
                       int32 subsample_factor,  // 1 in the normal case -> no subsampling.
                       bool repeat_frames,
-                      bool reorder,
                       const std::vector<int32> *phone_map,  // may be NULL
                       std::vector<int32> *new_alignment);
 
@@ -292,12 +283,6 @@ void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep,
                                 const std::vector<int32> &phone_window,
                                 std::vector<int32> *alignment);
 
-/*
-  If the alignment was non-reordered makes it reordered, and vice versa.
-*/
-void ChangeReorderingOfAlignment(const Transitions &trans_model,
-                                 std::vector<int32> *alignment);
-
 /// @} end "addtogroup hmm_group"
 
 } // end namespace kaldi
diff --git a/src/hmm/transitions.cc b/src/hmm/transitions.cc
index 42986d27f66..3cc8fb9d449 100644
--- a/src/hmm/transitions.cc
+++ b/src/hmm/transitions.cc
@@ -261,6 +261,15 @@ void Transitions::Print(std::ostream &os,
   }
 }
 
+int32 Transitions::PdfClassForTid(int32 tid) const {
+  // The pdf-class is the ilabel of the topology arc that 'tid' refers to.
+  const auto &tid_info = InfoForTransitionId(tid);
+  const auto &topo_fst = GetTopo().TopologyForPhone(tid_info.phone);
+  fst::ArcIterator<fst::VectorFst<fst::StdArc> > arcs(topo_fst, tid_info.topo_state);
+  arcs.Seek(tid_info.arc_index);
+  return arcs.Value().ilabel;
+}
+
 bool GetPdfsForPhones(const Transitions &trans_model,
                       const std::vector<int32> &phones,
                       std::vector<int32> *pdfs) {
diff --git a/src/hmm/transitions.h b/src/hmm/transitions.h
index 2f19818744c..ab88e75a7c3 100644
--- a/src/hmm/transitions.h
+++ b/src/hmm/transitions.h
@@ -209,6 +209,8 @@ class Transitions {
              const std::vector<std::string> &phone_names,
              const Vector<double> *occs = NULL);
 
+  int32 PdfClassForTid(int32 tid) const;
+
   /// returns true if this is identical to 'other'
   bool operator == (const Transitions &other) const;
 
diff --git a/src/hmm/tree-accu.cc b/src/hmm/tree-accu.cc
index 2a3d99fc5c5..18a613b8a5c 100644
--- a/src/hmm/tree-accu.cc
+++ b/src/hmm/tree-accu.cc
@@ -84,8 +84,7 @@ void AccumulateTreeStats(const Transitions &trans_model,
       for (int32 j = 0; j < static_cast<int32>(split_alignment[i+info.central_position].size());j++) {
         // for central phone of this window...
         EventType evec_more(evec);
-        int32 pdf_class = trans_model.TransitionIdToPdfClass(
-            split_alignment[i+info.central_position][j]);
+        int32 pdf_class = trans_model.PdfClassForTid(split_alignment[i+info.central_position][j]);
         // pdf_class will normally be 0, 1 or 2 for 3-state HMM.
         std::pair<EventKeyType, EventValueType> pr(kPdfClass, pdf_class);
         evec_more.push_back(pr);

From af91c73591ec4a032dd48e2af271e059eac08a2e Mon Sep 17 00:00:00 2001
From: Daniel Galvez <dt.galvez@gmail.com>
Date: Thu, 18 Apr 2019 12:50:42 -0400
Subject: [PATCH 030/163] hmm-utils.cc compiles, except for a bizarre problem
 with the copy-assignment operator.

---
 src/chain/chain-supervision.cc |  19 +++--
 src/fstext/fstext-utils-inl.h  |   1 -
 src/hmm/hmm-utils.cc           | 125 +++++++++++++++++++++++----------
 src/hmm/transitions.h          |  11 ++-
 4 files changed, 103 insertions(+), 53 deletions(-)

diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc
index c702b4b1114..ab20d1ad908 100644
--- a/src/chain/chain-supervision.cc
+++ b/src/chain/chain-supervision.cc
@@ -240,7 +240,7 @@ bool TimeEnforcerFst::GetArc(StateId s, Label ilabel, fst::StdArc* oarc) {
     oarc->ilabel = ilabel;
     if (convert_to_pdfs_) {
       // the olabel will be a pdf-id plus one, not a transition-id.
-      int32 pdf_id = trans_model_.TransitionIdToPdf(ilabel);
+      int32 pdf_id = trans_model_.TransitionIdToPdfFast(ilabel);
       oarc->olabel = pdf_id + 1;
     } else {
       oarc->olabel = ilabel;
@@ -276,7 +276,7 @@ bool TrainingGraphToSupervisionE2e(
       }
       KALDI_ASSERT(arc.ilabel != 0);
       StdArc arc2(arc);
-      arc2.ilabel = arc2.olabel = trans_model.TransitionIdToPdf(arc.ilabel) + 1;
+      arc2.ilabel = arc2.olabel = trans_model.TransitionIdToPdfFast(arc.ilabel) + 1;
       aiter.SetValue(arc2);
     }
   }
@@ -337,11 +337,11 @@ bool ProtoSupervisionToSupervision(
   // when we compose with the denominator graph.
   h_cfg.transition_scale = 0.0;
 
-  VectorFst<StdArc> *h_fst = GetHTransducer(inv_cfst.IlabelInfo(),
-                                            ctx_dep,
-                                            trans_model,
-                                            h_cfg,
-                                            &disambig_syms_h);
+  std::unique_ptr<VectorFst<StdArc>> h_fst = GetHTransducer(inv_cfst.IlabelInfo(),
+                                                            ctx_dep,
+                                                            trans_model,
+                                                            h_cfg,
+                                                            &disambig_syms_h);
   KALDI_ASSERT(disambig_syms_h.empty());
 
   VectorFst<StdArc> transition_id_fst;
@@ -352,10 +352,7 @@ bool ProtoSupervisionToSupervision(
   // when we compose with the denominator graph.
   BaseFloat self_loop_scale = 0.0;
 
-  // You should always set reorder to true; for the current chain-model
-  // topologies, it will affect results if you are inconsistent about this.
-  bool reorder = true,
-      check_no_self_loops = true;
+  bool check_no_self_loops = true;
   // add self-loops to the FST with transition-ids as its labels.
   AddSelfLoops(trans_model, disambig_syms_h, self_loop_scale, reorder,
                check_no_self_loops, &transition_id_fst);
diff --git a/src/fstext/fstext-utils-inl.h b/src/fstext/fstext-utils-inl.h
index 8eadf3dfc48..681096d0cbc 100644
--- a/src/fstext/fstext-utils-inl.h
+++ b/src/fstext/fstext-utils-inl.h
@@ -539,7 +539,6 @@ void MakePrecedingInputSymbolsSameClass(MutableFst<Arc> *fst, const F &f) {
   typedef typename Arc::Weight Weight;
   vector<ClassType> classes;
   ClassType noClass = f(kNoLabel);
-  ClassType epsClass = f(0);
 
   // Find bad states (states with multiple input-symbols into them).
   std::set<StateId> bad_states;  // states that we need to change.
diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc
index 337f590503f..bf67c13412e 100644
--- a/src/hmm/hmm-utils.cc
+++ b/src/hmm/hmm-utils.cc
@@ -332,7 +332,42 @@ GetPdfToTransitionIdTransducer(const Transitions &trans_model) {
   return ans;
 }
 
+// The transition-state that a transition-id belongs to, i.e. the (phone,
+// topology state, forward pdf-id) part of its TransitionIdInfo.  Used as the
+// per-state label class (TidToTstateMapper::Result) when adding self-loops.
+struct TransitionState {
+  // Copies 'info', so the object stays valid independently of the
+  // TransitionIdInfo it was built from.
+  TransitionState(const Transitions::TransitionIdInfo& info):
+    info(info) { }
+
+  TransitionState& operator=(const TransitionState& other) = default;
 
+  // Equality looks only at phone, topo_state and pdf_id; the other fields
+  // of the TransitionIdInfo are ignored.
+  bool operator==(const TransitionState& other) const {
+    return info.phone == other.info.phone &&
+      info.topo_state == other.info.topo_state &&
+      info.pdf_id == other.info.pdf_id;
+  }
+
+  bool operator!=(const TransitionState& other) const {
+    return !(*this == other);
+  }
+
+  // Ordering is delegated to the comparison of the full TransitionIdInfo.
+  // NOTE(review): that may look at more fields than operator== does, so
+  // two states that compare equal could still be ordered; confirm before
+  // using TransitionState as a key in an ordered container.
+  bool operator<(const TransitionState& other) const {
+    return info < other.info;
+  }
+
+  // Held by value, not by const reference: a reference member cannot be
+  // reseated, and AddSelfLoopsInternal assigns TransitionState objects
+  // into a std::vector<TransitionState>.
+  Transitions::TransitionIdInfo info;
+};
 
 class TidToTstateMapper {
 public:
@@ -358,9 +393,12 @@ class TidToTstateMapper {
                     bool check_no_self_loops):
       trans_model_(trans_model),
       disambig_syms_(disambig_syms),
-      check_no_self_loops_(check_no_self_loops) { }
+      check_no_self_loops_(check_no_self_loops) {
+    KALDI_ASSERT((*this)(fst::kNoLabel) == NoLabelClass());
+    KALDI_ASSERT((*this)(0) == ZeroClass());
+}
 
-  typedef Transitions::TransitionIdInfo Result;
+  typedef TransitionState Result;
   static const Result& NoLabelClass() {
     // Take advantage of the fact that phone must be greater than or
     // equal to 1 to create a TransitionIdInfo which in practice will
@@ -370,24 +408,27 @@ class TidToTstateMapper {
     // whether we are using one of these invalid TransitionIdInfo
     // classes.
     static auto *no_label =
-      new Transitions::TransitionIdInfo{.phone = -1, .topo_state = -1, .arc_index = -1,
-                                        .pdf_id = -1, .self_loop_pdf_id = -1};
-    return *no_label;
+      new Transitions::TransitionIdInfo{.phone = -1, .topo_state = -1,
+                                        .arc_index = -1, .pdf_id = -1,
+                                        .self_loop_pdf_id = -1};
+    static auto *no_label_state = new TransitionState(*no_label);
+    return *no_label_state;
   }
 
   static const Result& ZeroClass() {
     static auto *zero_label =
       new Transitions::TransitionIdInfo{.phone = 0, .topo_state = -1, .arc_index = -1,
                                         .pdf_id = -1, .self_loop_pdf_id = -1};
-    return *zero_label;
+    static auto *zero_label_state = new TransitionState(*zero_label);
+    return *zero_label_state;
   }
 
-  const Result& operator() (int32 tid) const {
+  Result operator() (int32 tid) const {
     if (tid == static_cast<int32>(fst::kNoLabel)) return NoLabelClass();  // -1 -> -1
     else if (tid >= 1 && tid <= trans_model_.NumTransitionIds()) {
       if (check_no_self_loops_ && trans_model_.InfoForTransitionId(tid).is_self_loop)
         KALDI_ERR << "AddSelfLoops: graph already has self-loops.";
-      return trans_model_.InfoForTransitionId(tid);
+      return TransitionState(trans_model_.InfoForTransitionId(tid));
     } else {  // 0 or (presumably) disambiguation symbol.  Map to zero
       int32 big_number = fst::kNontermBigNumber;  // 1000000
       if (tid != 0 && tid < big_number)
@@ -404,6 +445,21 @@ class TidToTstateMapper {
   bool check_no_self_loops_;
 };
 
+// Returns true if the probabilities on the arcs leaving state s sum to
+// (approximately) 1.0.  Arc weights are costs, i.e. negated log-probs.
+template<typename FST>
+static bool StateIsStochastic(const FST &fst, typename FST::StateId s) {
+  using namespace fst;
+  // Accumulate in the probability domain: Plus() on the tropical weights
+  // of StdArc takes the minimum cost rather than summing probabilities.
+  double total_prob = 0.0;
+  for (ArcIterator<FST> aiter(fst, s); !aiter.Done(); aiter.Next()) {
+    total_prob += exp(-aiter.Value().weight.Value());
+  }
+  // NOTE(review): the final-prob of s is not counted; confirm intended.
+  return kaldi::ApproxEqual(total_prob, 1.0);
+}
+
 // This is the code that expands an FST from transition-states to
 // transition-ids, in the case where the non-optional transition is before
 // the self-loop.
@@ -417,8 +473,6 @@ static void AddSelfLoopsInternal(const Transitions &trans_model,
   typedef Arc::StateId StateId;
   typedef Arc::Weight Weight;
 
-  typedef TidToTstateMapper::Result Class;
-
   TidToTstateMapper f(trans_model, disambig_syms, check_no_self_loops);
   // Duplicate states as necessary so that each state will require at most one
   // self-loop to be added to it.  Approximately this means that if a
@@ -430,11 +484,12 @@ static void AddSelfLoopsInternal(const Transitions &trans_model,
   // into each state. This works because each state now has only one
   // transition state coming into (because of
   // MakePrecedingInputSymbolsSameClass).
-  std::vector<Class> state_in(fst->NumStates(), f.NoLabelClass());
+  std::vector<TransitionState> state_in(fst->NumStates(), f.NoLabelClass());
 
   // This first loop just works out the label into each state,
   // and converts the transitions in the graph from transition-states
   // to transition-ids.
+  // state_in maps each state in the fst to its TransitionState
 
   for (StateIterator<VectorFst<Arc> > siter(*fst);
        !siter.Done();
@@ -444,7 +499,7 @@ static void AddSelfLoopsInternal(const Transitions &trans_model,
          !aiter.Done();
          aiter.Next()) {
       const Arc& arc = aiter.Value();
-      Class trans_state = f(arc.ilabel);
+      TransitionState trans_state = f(arc.ilabel);
       if (state_in[arc.nextstate] == f.NoLabelClass()) {
         state_in[arc.nextstate] = trans_state;
       } else {
@@ -454,10 +509,8 @@ static void AddSelfLoopsInternal(const Transitions &trans_model,
     }
   }
 
-  // state_in maps each state in the fst to its TransitionState
-  KALDI_ASSERT(state_in[fst->Start()] == f.NoLabelClass() ||
-               state_in[fst->Start()] == f.ZeroClass());
-  // or MakePrecedingInputSymbolsSame failed.
+  // The start state should have no incoming arcs (invariant of Topology)
+  KALDI_ASSERT(state_in[fst->Start()] == f.ZeroClass());
 
   // The next loop looks at each graph state, adds the self-loop [if needed] and
   // multiples all the out-transitions' probs (and final-prob) by the
@@ -466,36 +519,32 @@ static void AddSelfLoopsInternal(const Transitions &trans_model,
   // with the corresponding labels on them by this probability).
 
   for (StateId s = 0; s < static_cast<StateId>(state_in.size()); s++) {
-    if (state_in[s] != f.NoLabelClass() && state_in[s] != f.ZeroClass()) {
+    const TransitionState& trans_state = state_in[s];
+    if (trans_state != f.NoLabelClass() && trans_state != f.ZeroClass() &&
+        trans_state.info.self_loop_pdf_id != -1) {
       // defined, and not eps or a disambiguation symbol or a
-      // nonterminal-related sybol for grammar decoding...
-      const Class& trans_state = state_in[s];
-      // First multiply all probabilities by "forward" probability.
-      
-      // WARNING: This is no longer the forward probability that this code was originally using!
-      // It is difficult to get the self-loop probability just given the 
-      trans_model.InfoForTransitionId(trans_state.self_loop_pdf_id).transition_cost;
-      BaseFloat log_prob = trans_state.transition_cost;
-      fst->SetFinal(s, Times(fst->Final(s), Weight(-log_prob)));
+      // nonterminal-related symbol for grammar decoding, and has a
+      // self-loop which needs to be added, while keeping the state stochastic.
+      int32 self_loop_tid = trans_state.info.self_loop_transition_id;
+      KALDI_ASSERT(self_loop_tid != 0 &&
+                   "Can't have a self_loop_pdf_id without a self_loop_transition_id");
+      // 1) Multiply all probabilities by "forward" probability.
+      BaseFloat self_loop_log_prob =
+        -trans_model.InfoForTransitionId(self_loop_tid).transition_cost;
+      BaseFloat log_forward_prob = log(1.0 - exp(self_loop_log_prob));
+      fst->SetFinal(s, Times(fst->Final(s), Weight(-log_forward_prob)));
       for (MutableArcIterator<MutableFst<Arc> > aiter(fst, s);
           !aiter.Done();
           aiter.Next()) {
         Arc arc = aiter.Value();
-        arc.weight = Times(arc.weight, Weight(-log_prob));
+        arc.weight = Times(arc.weight, Weight(-log_forward_prob));
         aiter.SetValue(arc);
       }
-      // Now add self-loop, if needed.
-      int32 trans_id = trans_state.self_loop_transition_id;
-      // TODO: This is adding an arc for the current state into
-      // itself, right? How are we supposed to get the current state's
-      // self-loop? Ugh. And make sure I don't repeat it...
-      if (trans_id != 0) {  // has self-loop.
-        auto&& trans_info = trans_model.InfoForTransitionId(trans_id).transition_cost;
-        StateId next_state = TODO;
-        BaseFloat log_prob = trans_model.InfoForTransitionId(trans_id).transition_cost;
-        fst->AddArc(next_state, Arc(trans_id, 0, Weight(-log_prob), next_state));
-      }
+      // 2) Add self-loop
+      fst->AddArc(s, Arc(self_loop_tid, 0, Weight(-self_loop_log_prob), s));
+
     }
+    KALDI_PARANOID_ASSERT(StateIsStochastic(fst, s));
   }
 }
 
diff --git a/src/hmm/transitions.h b/src/hmm/transitions.h
index ab88e75a7c3..0bbd45dc238 100644
--- a/src/hmm/transitions.h
+++ b/src/hmm/transitions.h
@@ -159,6 +159,9 @@ class Transitions {
     }
     // TODO.  operator == can compare all members. Also compare derived members?
     bool operator == (const TransitionIdInfo &other) const {
+      // I don't think this is being used right now. For now, just abort
+      // whenever it is used, so I can see where it is used.
+      KALDI_ASSERT(false);
       if (phone == other.phone && topo_state == other.topo_state &&
           pdf_id == other.pdf_id) {
         // This assertion is no longer true. Two states can have
@@ -173,9 +176,11 @@ class Transitions {
         return false;
       }
     }
-    bool operator != (const TransitionIdInfo &other) const {
-      return !(*this == other);
-    }
+
+    // TransitionIdInfo& operator=(const TransitionIdInfo& other) {
+    //   is_final = other.is_final;
+    //   return *this;
+    // }
   };
 
 

From 25ebce5b180d80adc66eefb9b8fab7987079c623 Mon Sep 17 00:00:00 2001
From: Daniel Galvez <dt.galvez@gmail.com>
Date: Thu, 2 May 2019 02:04:45 -0400
Subject: [PATCH 031/163] Successfully compile all binaries other than
 cuda-gpu-available.cc

It has linking errors because some templates are lacking
definitions. Handle it later.
---
 src/bin/add-self-loops.cc                     |  13 +-
 src/bin/ali-to-pdf.cc                         |   2 +-
 src/bin/align-equal.cc                        |   2 +-
 src/bin/am-info.cc                            |   2 -
 src/bin/build-pfile-from-ali.cc               |   2 +-
 src/bin/compile-graph.cc                      |  15 +-
 src/bin/convert-ali.cc                        |   5 -
 src/bin/cuda-gpu-available.cc                 |   2 +
 src/bin/hmm-info.cc                           |   2 -
 src/bin/make-h-transducer.cc                  |  12 +-
 src/bin/make-pdf-to-tid-transducer.cc         |   4 +-
 src/bin/post-to-tacc.cc                       |   2 +-
 src/bin/show-alignments.cc                    |   3 +-
 src/chain/chain-den-graph.cc                  |  16 +-
 src/chain/chain-supervision.cc                |  13 +-
 src/decoder/training-graph-compiler.cc        |  24 +-
 src/decoder/training-graph-compiler.h         |   8 +-
 src/gmmbin/gmm-acc-stats-ali.cc               |   6 +-
 src/gmmbin/gmm-acc-stats-twofeats.cc          |  10 -
 src/gmmbin/gmm-acc-stats.cc                   |  10 -
 src/gmmbin/gmm-acc-stats2.cc                  |  10 +-
 src/gmmbin/gmm-align-compiled.cc              |  13 -
 src/gmmbin/gmm-est-map.cc                     |  10 -
 src/gmmbin/gmm-est.cc                         |  10 -
 src/gmmbin/gmm-fmpe-acc-stats.cc              | 155 -------
 src/gmmbin/gmm-info.cc                        |   2 -
 src/hmm/hmm-utils.cc                          |  22 +-
 src/hmm/hmm-utils.h                           |   9 +-
 src/hmm/transitions.cc                        |   5 +
 src/hmm/transitions.h                         |   2 +
 src/latbin/lattice-arc-post.cc                |   2 +-
 src/latbin/lattice-rescore-mapped.cc          |   2 +-
 src/nnet3/discriminative-supervision.cc       |   2 +-
 src/nnet3/discriminative-training.cc          |   4 +-
 src/nnet3bin/nnet3-align-compiled.cc          |  14 -
 src/online2/Makefile                          |   2 +-
 src/online2/online-endpoint.h                 |   1 -
 src/online2/online-feature-pipeline.cc        |  16 +-
 src/online2/online-feature-pipeline.h         |  12 +-
 src/online2/online-gmm-decodable.cc           |   2 +-
 src/online2/online-gmm-decoding.cc            |   6 +-
 src/online2/online-gmm-decoding.h             |   2 +-
 src/online2/online-nnet2-decoding-threaded.h  | 434 ------------------
 src/online2/online-nnet2-decoding.h           | 131 ------
 src/online2/online-nnet2-feature-pipeline.h   |  18 +-
 src/online2/online2-feature-pipeline.cc       |  15 +-
 src/online2bin/Makefile                       |   5 +-
 .../online2-tcp-nnet3-decode-faster.cc        |   4 +-
 src/online2bin/online2-wav-dump-features.cc   |   1 -
 .../online2-wav-nnet2-am-compute.cc           | 200 --------
 .../online2-wav-nnet2-latgen-faster.cc        | 293 ------------
 .../online2-wav-nnet2-latgen-threaded.cc      | 312 -------------
 52 files changed, 104 insertions(+), 1765 deletions(-)
 delete mode 100644 src/gmmbin/gmm-fmpe-acc-stats.cc
 delete mode 100644 src/online2/online-nnet2-decoding-threaded.h
 delete mode 100644 src/online2/online-nnet2-decoding.h
 delete mode 100644 src/online2bin/online2-wav-nnet2-am-compute.cc
 delete mode 100644 src/online2bin/online2-wav-nnet2-latgen-faster.cc
 delete mode 100644 src/online2bin/online2-wav-nnet2-latgen-threaded.cc

diff --git a/src/bin/add-self-loops.cc b/src/bin/add-self-loops.cc
index 562b0977a69..601d8d587f3 100644
--- a/src/bin/add-self-loops.cc
+++ b/src/bin/add-self-loops.cc
@@ -46,20 +46,14 @@ int main(int argc, char *argv[]) {
         "is recommended as the decoding will in that case be faster.\n"
         "Usage:   add-self-loops [options] transition-gmm/acoustic-model [fst-in] [fst-out]\n"
         "e.g.: \n"
-        " add-self-loops --self-loop-scale=0.1 1.mdl HCLGa.fst HCLG.fst\n"
-        "or:  add-self-loops --self-loop-scale=0.1 1.mdl <HCLGa.fst >HCLG.fst\n";
+        " add-self-loops 1.mdl HCLGa.fst HCLG.fst\n"
+        "or:  add-self-loops 1.mdl <HCLGa.fst >HCLG.fst\n";
 
-    BaseFloat self_loop_scale = 1.0;
-    bool reorder = true;
     std::string disambig_in_filename;
 
     ParseOptions po(usage);
-    po.Register("self-loop-scale", &self_loop_scale,
-                "Scale for self-loop probabilities relative to LM.");
     po.Register("disambig-syms", &disambig_in_filename,
                 "List of disambiguation symbols on input of fst-in [input file]");
-    po.Register("reorder", &reorder,
-                "If true, reorder symbols for more decoding efficiency");
     po.Read(argc, argv);
 
     if (po.NumArgs() < 1 || po.NumArgs() > 3) {
@@ -97,13 +91,14 @@ int main(int argc, char *argv[]) {
     if (!fst)
       KALDI_ERR << "add-self-loops: error reading input FST.";
 
+    BaseFloat self_loop_scale = 1.0;
     bool check_no_self_loops = true;
 
     // The work gets done here.
     AddSelfLoops(trans_model,
                  disambig_syms_in,
                  self_loop_scale,
-                 reorder, check_no_self_loops, fst);
+                 check_no_self_loops, fst);
 
     if (! fst->Write(fst_out_filename) )
       KALDI_ERR << "add-self-loops: error writing FST to "
diff --git a/src/bin/ali-to-pdf.cc b/src/bin/ali-to-pdf.cc
index 1706f5aa371..3c978ca62f0 100644
--- a/src/bin/ali-to-pdf.cc
+++ b/src/bin/ali-to-pdf.cc
@@ -60,7 +60,7 @@ int main(int argc, char *argv[]) {
       std::vector<int32> alignment = reader.Value();
 
       for (size_t i = 0; i < alignment.size(); i++)
-        alignment[i] = trans_model.TransitionIdToPdf(alignment[i]);
+        alignment[i] = trans_model.TransitionIdToPdfFast(alignment[i]);
 
       writer.Write(key, alignment);
       num_done++;
diff --git a/src/bin/align-equal.cc b/src/bin/align-equal.cc
index 671c515f33e..80caff00168 100644
--- a/src/bin/align-equal.cc
+++ b/src/bin/align-equal.cc
@@ -71,7 +71,7 @@ int main(int argc, char *argv[]) {
     // need VectorFst because we will change it by adding subseq symbol.
     VectorFst<StdArc> *lex_fst = fst::ReadFstKaldi(lex_in_filename);
 
-    TrainingGraphCompilerOptions gc_opts(1.0, true);  // true -> Dan style graph.
+    TrainingGraphCompilerOptions gc_opts;
 
     std::vector<int32> disambig_syms;
     if (disambig_rxfilename != "")
diff --git a/src/bin/am-info.cc b/src/bin/am-info.cc
index dd59047c35c..f2516c436f8 100644
--- a/src/bin/am-info.cc
+++ b/src/bin/am-info.cc
@@ -56,8 +56,6 @@ int main(int argc, char *argv[]) {
     std::cout << "number of pdfs " << trans_model.NumPdfs() << '\n';
     std::cout << "number of transition-ids " << trans_model.NumTransitionIds()
               << '\n';
-    std::cout << "number of transition-states "
-              << trans_model.NumTransitionStates() << '\n';
   } catch(const std::exception &e) {
     std::cerr << e.what() << '\n';
     return -1;
diff --git a/src/bin/build-pfile-from-ali.cc b/src/bin/build-pfile-from-ali.cc
index fb82fe27eaa..e1967c77d8c 100644
--- a/src/bin/build-pfile-from-ali.cc
+++ b/src/bin/build-pfile-from-ali.cc
@@ -115,7 +115,7 @@ int main(int argc, char *argv[]) {
           }
           // Output the class label
           ss << " ";
-          ss << trans_model.TransitionIdToPdf(alignment[i]);
+          ss << trans_model.TransitionIdToPdfFast(alignment[i]);
 
           ko.Stream() << ss.str().c_str();
           ko.Stream() << "\n";
diff --git a/src/bin/compile-graph.cc b/src/bin/compile-graph.cc
index 2dae81fa702..c9600462427 100644
--- a/src/bin/compile-graph.cc
+++ b/src/bin/compile-graph.cc
@@ -145,16 +145,15 @@ int main(int argc, char *argv[]) {
     h_cfg.nonterm_phones_offset = nonterm_phones_offset;
     std::vector<int32> disambig_syms_h; // disambiguation symbols on
                                         // input side of H.
-    VectorFst<StdArc> *h_fst = GetHTransducer(ilabels,
-                                              ctx_dep,
-                                              trans_model,
-                                              h_cfg,
-                                              &disambig_syms_h);
+    std::unique_ptr<VectorFst<StdArc>> h_fst = GetHTransducer(ilabels,
+                                                              ctx_dep,
+                                                              trans_model,
+                                                              h_cfg,
+                                                              &disambig_syms_h);
 
     VectorFst<StdArc> hclg_fst;  // transition-id to word.
     TableCompose(*h_fst, clg_fst, &hclg_fst);
     clg_fst.DeleteStates();
-    delete h_fst;
 
     KALDI_ASSERT(hclg_fst.Start() != fst::kNoStateId);
 
@@ -170,12 +169,10 @@ int main(int argc, char *argv[]) {
     MinimizeEncoded(&hclg_fst);
 
     std::vector<int32> disambig;
-    bool check_no_self_loops = true,
-        reorder = true;
+    bool check_no_self_loops = true;
     AddSelfLoops(trans_model,
                  disambig,
                  self_loop_scale,
-                 reorder,
                  check_no_self_loops,
                  &hclg_fst);
 
diff --git a/src/bin/convert-ali.cc b/src/bin/convert-ali.cc
index 7daeb40ca53..d245d93a0f8 100644
--- a/src/bin/convert-ali.cc
+++ b/src/bin/convert-ali.cc
@@ -38,7 +38,6 @@ int main(int argc, char *argv[]) {
         " convert-ali old/final.mdl new/0.mdl new/tree ark:old/ali.1 ark:new/ali.1\n";
 
     int32 frame_subsampling_factor = 1;
-    bool reorder = true;
     bool repeat_frames = false;
 
     std::string phone_map_rxfilename;
@@ -46,9 +45,6 @@ int main(int argc, char *argv[]) {
     po.Register("phone-map", &phone_map_rxfilename,
                 "File name containing old->new phone mapping (each line is: "
                 "old-integer-id new-integer-id)");
-    po.Register("reorder", &reorder,
-                "True if you want the converted alignments to be 'reordered' "
-                "versus the way they appear in the Topology object");
     po.Register("repeat-frames", &repeat_frames,
                 "Only relevant when frame-subsampling-factor != 1.  If true, "
                 "repeat frames of alignment by 'frame-subsampling-factor' "
@@ -105,7 +101,6 @@ int main(int argc, char *argv[]) {
                            old_alignment,
                            frame_subsampling_factor,
                            repeat_frames,
-                           reorder,
                            (phone_map_rxfilename != "" ? &phone_map : NULL),
                            &new_alignment)) {
         alignment_writer.Write(key, new_alignment);
diff --git a/src/bin/cuda-gpu-available.cc b/src/bin/cuda-gpu-available.cc
index 69637d3601a..923ed8280cf 100644
--- a/src/bin/cuda-gpu-available.cc
+++ b/src/bin/cuda-gpu-available.cc
@@ -50,6 +50,8 @@ int main(int argc, char *argv[]) try {
         "exit-code: 0 = success, 1 = compiled without GPU support, -1 = error\n"
         "\n"
         "Usage:  cuda-gpu-available\n";
+  // Remove unused variable warning
+  (void) usage;
 
   char hostname[100] = "UNKNOWN-HOSTNAME";
 #if !defined(_MSC_VER) && !defined(__CYGWIN__)
diff --git a/src/bin/hmm-info.cc b/src/bin/hmm-info.cc
index 30d6f999c8e..6daa0bc6385 100644
--- a/src/bin/hmm-info.cc
+++ b/src/bin/hmm-info.cc
@@ -54,8 +54,6 @@ int main(int argc, char *argv[]) {
     std::cout << "number of pdfs " << trans_model.NumPdfs() << '\n';
     std::cout << "number of transition-ids " << trans_model.NumTransitionIds()
               << '\n';
-    std::cout << "number of transition-states "
-              << trans_model.NumTransitionStates() << '\n';
   } catch(const std::exception &e) {
     std::cerr << e.what() << '\n';
     return -1;
diff --git a/src/bin/make-h-transducer.cc b/src/bin/make-h-transducer.cc
index 777cab0f94d..e3a66a99536 100644
--- a/src/bin/make-h-transducer.cc
+++ b/src/bin/make-h-transducer.cc
@@ -77,11 +77,12 @@ int main(int argc, char *argv[]) {
     std::vector<int32> disambig_syms_out;
 
     // The work gets done here.
-    fst::VectorFst<fst::StdArc> *H = GetHTransducer (ilabel_info,
-                                                     ctx_dep,
-                                                     trans_model,
-                                                     hcfg,
-                                                     &disambig_syms_out);
+    std::unique_ptr<fst::VectorFst<fst::StdArc>> 
+      H = GetHTransducer (ilabel_info,
+                          ctx_dep,
+                          trans_model,
+                          hcfg,
+                          &disambig_syms_out);
 #if _MSC_VER
     if (fst_out_filename == "")
       _setmode(_fileno(stdout),  _O_BINARY);
@@ -101,7 +102,6 @@ int main(int argc, char *argv[]) {
                  << (fst_out_filename == "" ?
                      "standard output" : fst_out_filename);
 
-    delete H;
     return 0;
   } catch(const std::exception &e) {
     std::cerr << e.what();
diff --git a/src/bin/make-pdf-to-tid-transducer.cc b/src/bin/make-pdf-to-tid-transducer.cc
index ad9c627e558..b4ed45192e6 100644
--- a/src/bin/make-pdf-to-tid-transducer.cc
+++ b/src/bin/make-pdf-to-tid-transducer.cc
@@ -50,7 +50,8 @@ int main(int argc, char *argv[]) {
     Transitions trans_model;
     ReadKaldiObject(trans_model_filename, &trans_model);
 
-    fst::VectorFst<fst::StdArc> *fst = GetPdfToTransitionIdTransducer(trans_model);
+    std::unique_ptr<fst::VectorFst<fst::StdArc>> fst =
+      GetPdfToTransitionIdTransducer(trans_model);
 
 #if _MSC_VER
     if (fst_out_filename == "")
@@ -60,7 +61,6 @@ int main(int argc, char *argv[]) {
     if (!fst->Write(fst_out_filename))
       KALDI_ERR << "Error writing fst to "
                 << (fst_out_filename == "" ? "standard output" : fst_out_filename);
-    delete fst;
   } catch(const std::exception &e) {
     std::cerr << e.what();
     return -1;
diff --git a/src/bin/post-to-tacc.cc b/src/bin/post-to-tacc.cc
index 7867e9f5697..842356f8ffb 100644
--- a/src/bin/post-to-tacc.cc
+++ b/src/bin/post-to-tacc.cc
@@ -90,7 +90,7 @@ int main(int argc, char *argv[]) {
       int32 num_pdf_ids = trans_model.NumPdfs();
       Vector<double> pdf_accs(num_pdf_ids);
       for (int32 i = 1; i < num_transition_ids; i++) {
-        int32 pid = trans_model.TransitionIdToPdf(i);
+        int32 pid = trans_model.TransitionIdToPdfFast(i);
         pdf_accs(pid) += transition_accs(i);
       }
       Vector<BaseFloat> pdf_accs_float(pdf_accs);
diff --git a/src/bin/show-alignments.cc b/src/bin/show-alignments.cc
index beadf1b590c..f8c79d2d79b 100644
--- a/src/bin/show-alignments.cc
+++ b/src/bin/show-alignments.cc
@@ -80,8 +80,7 @@ int main(int argc, char *argv[]) {
         split_str[i] = ss.str();
 
         int32 tid = split[i][0],
-            tstate = trans_model.TransitionIdToTransitionState(tid),
-            phone = trans_model.TransitionStateToPhone(tstate);
+            phone = trans_model.InfoForTransitionId(tid).phone;
         split_str_phones[i] =
             phones_symtab->Find(phone) + " ";
         std::string space;
diff --git a/src/chain/chain-den-graph.cc b/src/chain/chain-den-graph.cc
index 920ade49348..e8db0bbe5a5 100644
--- a/src/chain/chain-den-graph.cc
+++ b/src/chain/chain-den-graph.cc
@@ -171,7 +171,7 @@ void MapFstToPdfIdsPlusOne(const Transitions &trans_model,
       fst::StdArc arc = aiter.Value();
       KALDI_ASSERT(arc.ilabel == arc.olabel);
       if (arc.ilabel > 0) {
-        arc.ilabel = trans_model.TransitionIdToPdf(arc.ilabel) + 1;
+        arc.ilabel = trans_model.TransitionIdToPdfFast(arc.ilabel) + 1;
         arc.olabel = arc.ilabel;
         aiter.SetValue(arc);
       }
@@ -342,24 +342,22 @@ void CreateDenominatorFst(const ContextDependency &ctx_dep,
   // we'll use the same value in test time.  Consistency is the key here.
   h_config.transition_scale = 1.0;
 
-  StdVectorFst *h_fst = GetHTransducer(inv_cfst.IlabelInfo(),
-                                       ctx_dep,
-                                       trans_model,
-                                       h_config,
-                                       &disambig_syms_h);
+  std::unique_ptr<StdVectorFst> h_fst = GetHTransducer(inv_cfst.IlabelInfo(),
+                                                       ctx_dep,
+                                                       trans_model,
+                                                       h_config,
+                                                       &disambig_syms_h);
   KALDI_ASSERT(disambig_syms_h.empty());
   StdVectorFst transition_id_fst;
   TableCompose(*h_fst, context_dep_lm, &transition_id_fst);
-  delete h_fst;
 
   BaseFloat self_loop_scale = 1.0;  // We have to be careful to use the same
                                     // value in test time.
   // 'reorder' must always be set to true for chain models.
-  bool reorder = true;
   bool check_no_self_loops = true;
 
   // add self-loops to the FST with transition-ids as its labels.
-  AddSelfLoops(trans_model, disambig_syms_h, self_loop_scale, reorder,
+  AddSelfLoops(trans_model, disambig_syms_h, self_loop_scale,
                check_no_self_loops, &transition_id_fst);
   // at this point transition_id_fst will have transition-ids as its ilabels and
   // context-dependent phones (indexes into IlabelInfo()) as its olabels.
diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc
index ab20d1ad908..717fb1f27a8 100644
--- a/src/chain/chain-supervision.cc
+++ b/src/chain/chain-supervision.cc
@@ -346,7 +346,6 @@ bool ProtoSupervisionToSupervision(
 
   VectorFst<StdArc> transition_id_fst;
   TableCompose(*h_fst, context_dep_fst, &transition_id_fst);
-  delete h_fst;
 
   // We don't want to add any transition probabilities as they will be added
   // when we compose with the denominator graph.
@@ -354,7 +353,7 @@ bool ProtoSupervisionToSupervision(
 
   bool check_no_self_loops = true;
   // add self-loops to the FST with transition-ids as its labels.
-  AddSelfLoops(trans_model, disambig_syms_h, self_loop_scale, reorder,
+  AddSelfLoops(trans_model, disambig_syms_h, self_loop_scale,
                check_no_self_loops, &transition_id_fst);
 
   // at this point transition_id_fst will have transition-ids as its ilabels and
@@ -997,7 +996,7 @@ bool ConvertSupervisionToUnconstrained(
     }
     for (int32 i = 0; i < supervision->frames_per_sequence; i++) {
       supervision->alignment_pdfs[i] =
-          trans_mdl.TransitionIdToPdf(supervision->alignment_pdfs[i]);
+          trans_mdl.TransitionIdToPdfFast(supervision->alignment_pdfs[i]);
     }
   }
 
@@ -1024,7 +1023,7 @@ bool ConvertSupervisionToUnconstrained(
         // because these graphs are always built with reorder == true; if it was
         // built with reorder == false, we'd have to treat the last, not first,
         // frame specially.)
-        if (trans_mdl.IsSelfLoop(transition_id) && s != start_state)
+        if (trans_mdl.InfoForTransitionId(transition_id).is_self_loop && s != start_state)
           arc.ilabel = 0;
         aiter.SetValue(arc);
       }
@@ -1063,15 +1062,13 @@ bool ConvertSupervisionToUnconstrained(
     // normalization FST for that.  (note: all transition probabilities are just
     // 0.5 anyway, for the typical chain topology).
     BaseFloat self_loop_scale = 0.0;
-    // 'reorder' must always be true for chain models.
-    bool reorder = true;
     // The FST we're about to call AddSelfLoops() on will have self-loops, on
     // the first frame, so disable the check that the FST was originally
     // self-loop-free.
     bool check_no_self_loops = false;
     supervision->e2e_fsts.resize(1);
     AddSelfLoops(trans_mdl, disambig_syms, self_loop_scale,
-                 reorder, check_no_self_loops, &(supervision->e2e_fsts[0]));
+                 check_no_self_loops, &(supervision->e2e_fsts[0]));
   }
 
   { // Convert transition-ids to pdf-ids+1 on the FST labels,
@@ -1086,7 +1083,7 @@ bool ConvertSupervisionToUnconstrained(
         // AddSelfLoops() works (it calls MakePrecedingInputSymbolsSame(), which
         // adds epsilons).  zero olabels.
         if (arc.ilabel != 0) {
-          int32 pdf_id_plus_one = trans_mdl.TransitionIdToPdf(arc.ilabel) + 1;
+          int32 pdf_id_plus_one = trans_mdl.TransitionIdToPdfFast(arc.ilabel) + 1;
           arc.ilabel = pdf_id_plus_one;
           arc.olabel = pdf_id_plus_one;
           aiter.SetValue(arc);
diff --git a/src/decoder/training-graph-compiler.cc b/src/decoder/training-graph-compiler.cc
index db1a75f7a25..865552047a4 100644
--- a/src/decoder/training-graph-compiler.cc
+++ b/src/decoder/training-graph-compiler.cc
@@ -102,11 +102,11 @@ bool TrainingGraphCompiler::CompileGraph(const fst::VectorFst<fst::StdArc> &word
 
   std::vector<int32> disambig_syms_h; // disambiguation symbols on
   // input side of H.
-  VectorFst<StdArc> *H = GetHTransducer(inv_cfst.IlabelInfo(),
-                                        ctx_dep_,
-                                        trans_model_,
-                                        h_cfg,
-                                        &disambig_syms_h);
+  std::unique_ptr<VectorFst<StdArc>> H = GetHTransducer(inv_cfst.IlabelInfo(),
+                                                        ctx_dep_,
+                                                        trans_model_,
+                                                        h_cfg,
+                                                        &disambig_syms_h);
 
   VectorFst<StdArc> &trans2word_fst = *out_fst;  // transition-id to word.
   TableCompose(*H, ctx2word_fst, &trans2word_fst);
@@ -133,11 +133,9 @@ bool TrainingGraphCompiler::CompileGraph(const fst::VectorFst<fst::StdArc> &word
   AddSelfLoops(trans_model_,
                disambig,
                opts_.self_loop_scale,
-               opts_.reorder,
                check_no_self_loops,
                &trans2word_fst);
 
-  delete H;
   return true;
 }
 
@@ -198,11 +196,11 @@ bool TrainingGraphCompiler::CompileGraphs(
   h_cfg.transition_scale = opts_.transition_scale;
 
   std::vector<int32> disambig_syms_h;
-  VectorFst<StdArc> *H = GetHTransducer(inv_cfst.IlabelInfo(),
-                                        ctx_dep_,
-                                        trans_model_,
-                                        h_cfg,
-                                        &disambig_syms_h);
+  std::unique_ptr<VectorFst<StdArc>> H = GetHTransducer(inv_cfst.IlabelInfo(),
+                                                        ctx_dep_,
+                                                        trans_model_,
+                                                        h_cfg,
+                                                        &disambig_syms_h);
 
   for (size_t i = 0; i < out_fsts->size(); i++) {
     VectorFst<StdArc> &ctx2word_fst = *((*out_fsts)[i]);
@@ -225,7 +223,6 @@ bool TrainingGraphCompiler::CompileGraphs(
     AddSelfLoops(trans_model_,
                  disambig,
                  opts_.self_loop_scale,
-                 opts_.reorder,
                  check_no_self_loops,
                  &trans2word_fst);
 
@@ -234,7 +231,6 @@ bool TrainingGraphCompiler::CompileGraphs(
     *((*out_fsts)[i]) = trans2word_fst;
   }
 
-  delete H;
   return true;
 }
 
diff --git a/src/decoder/training-graph-compiler.h b/src/decoder/training-graph-compiler.h
index 600844b8b8a..89ef72020ca 100644
--- a/src/decoder/training-graph-compiler.h
+++ b/src/decoder/training-graph-compiler.h
@@ -34,22 +34,18 @@ struct TrainingGraphCompilerOptions {
   BaseFloat transition_scale;
   BaseFloat self_loop_scale;
   bool rm_eps;
-  bool reorder;  // (Dan-style graphs)
 
   explicit TrainingGraphCompilerOptions(BaseFloat transition_scale = 1.0,
-                                        BaseFloat self_loop_scale = 1.0,
-                                        bool b = true) :
+                                        BaseFloat self_loop_scale = 1.0) :
       transition_scale(transition_scale),
       self_loop_scale(self_loop_scale),
-      rm_eps(false),
-      reorder(b) { }
+      rm_eps(false) { }
 
   void Register(OptionsItf *opts) {
     opts->Register("transition-scale", &transition_scale, "Scale of transition "
                    "probabilities (excluding self-loops)");
     opts->Register("self-loop-scale", &self_loop_scale, "Scale of self-loop vs. "
                    "non-self-loop probability mass ");
-    opts->Register("reorder", &reorder, "Reorder transition ids for greater decoding efficiency.");
     opts->Register("rm-eps", &rm_eps,  "Remove [most] epsilons before minimization (only applicable "
                    "if disambig symbols present)");
   }
diff --git a/src/gmmbin/gmm-acc-stats-ali.cc b/src/gmmbin/gmm-acc-stats-ali.cc
index b20212b4771..baee5f8b814 100644
--- a/src/gmmbin/gmm-acc-stats-ali.cc
+++ b/src/gmmbin/gmm-acc-stats-ali.cc
@@ -61,8 +61,6 @@ int main(int argc, char *argv[]) {
       am_gmm.Read(ki.Stream(), binary);
     }
 
-    Vector<double> transition_accs;
-    trans_model.InitStats(&transition_accs);
     AccumAmDiagGmm gmm_accs;
     gmm_accs.Init(am_gmm, kGmmAll);
 
@@ -94,8 +92,7 @@ int main(int argc, char *argv[]) {
 
         for (size_t i = 0; i < alignment.size(); i++) {
           int32 tid = alignment[i],  // transition identifier.
-              pdf_id = trans_model.TransitionIdToPdf(tid);
-          trans_model.Accumulate(1.0, tid, &transition_accs);
+              pdf_id = trans_model.TransitionIdToPdfFast(tid);
           tot_like_this_file += gmm_accs.AccumulateForGmm(am_gmm, mat.Row(i),
                                                           pdf_id, 1.0);
         }
@@ -117,7 +114,6 @@ int main(int argc, char *argv[]) {
 
     {
       Output ko(accs_wxfilename, binary);
-      transition_accs.Write(ko.Stream(), binary);
       gmm_accs.Write(ko.Stream(), binary);
     }
     KALDI_LOG << "Written accs.";
diff --git a/src/gmmbin/gmm-acc-stats-twofeats.cc b/src/gmmbin/gmm-acc-stats-twofeats.cc
index 3bae910233b..7b3cadfdb9b 100644
--- a/src/gmmbin/gmm-acc-stats-twofeats.cc
+++ b/src/gmmbin/gmm-acc-stats-twofeats.cc
@@ -67,8 +67,6 @@ int main(int argc, char *argv[]) {
       am_gmm.Read(ki.Stream(), binary);
     }
 
-    Vector<double> transition_accs;
-    trans_model.InitStats(&transition_accs);
     int32 new_dim = 0;
     AccumAmDiagGmm gmm_accs;
     // will initialize once we know new_dim.
@@ -129,13 +127,6 @@ int main(int argc, char *argv[]) {
                                                   weight);
             tot_weight_this_file += weight;
           }
-
-          // Accumulates for transitions.
-          for (size_t j = 0; j < posterior[i].size(); j++) {
-            int32 tid = posterior[i][j].first;
-            BaseFloat weight = posterior[i][j].second;
-            trans_model.Accumulate(weight, tid, &transition_accs);
-          }
         }
         KALDI_LOG << "Average like for this file is "
                   << (tot_like_this_file/tot_weight_this_file) << " over "
@@ -157,7 +148,6 @@ int main(int argc, char *argv[]) {
 
     {
       Output ko(accs_wxfilename, binary);
-      transition_accs.Write(ko.Stream(), binary);
       gmm_accs.Write(ko.Stream(), binary);
     }
     KALDI_LOG << "Written accs.";
diff --git a/src/gmmbin/gmm-acc-stats.cc b/src/gmmbin/gmm-acc-stats.cc
index beeee8ec758..76a3528d4f4 100644
--- a/src/gmmbin/gmm-acc-stats.cc
+++ b/src/gmmbin/gmm-acc-stats.cc
@@ -67,8 +67,6 @@ int main(int argc, char *argv[]) {
       am_gmm.Read(ki.Stream(), binary);
     }
 
-    Vector<double> transition_accs;
-    trans_model.InitStats(&transition_accs);
     AccumAmDiagGmm gmm_accs;
     gmm_accs.Init(am_gmm, StringToGmmFlags(update_flags_str));
 
@@ -110,13 +108,6 @@ int main(int argc, char *argv[]) {
                 * weight;
             tot_weight += weight;
           }
-
-          // Accumulates for transitions.
-          for (size_t j = 0; j < posterior[i].size(); j++) {
-            int32 tid = posterior[i][j].first;
-            BaseFloat weight = posterior[i][j].second;
-            trans_model.Accumulate(weight, tid, &transition_accs);
-          }
         }
         if (num_done % 50 == 0) {
           KALDI_LOG << "Processed " << num_done << " utterances; for utterance "
@@ -136,7 +127,6 @@ int main(int argc, char *argv[]) {
 
     {
       Output ko(accs_wxfilename, binary);
-      transition_accs.Write(ko.Stream(), binary);
       gmm_accs.Write(ko.Stream(), binary);
     }
     KALDI_LOG << "Written accs.";
diff --git a/src/gmmbin/gmm-acc-stats2.cc b/src/gmmbin/gmm-acc-stats2.cc
index 30f3ff80e10..15e97d07b73 100644
--- a/src/gmmbin/gmm-acc-stats2.cc
+++ b/src/gmmbin/gmm-acc-stats2.cc
@@ -70,9 +70,6 @@ int main(int argc, char *argv[]) {
       am_gmm.Read(ki.Stream(), binary);
     }
     
-    Vector<double> num_trans_accs, den_trans_accs;
-    trans_model.InitStats(&num_trans_accs);
-    trans_model.InitStats(&den_trans_accs);
     AccumAmDiagGmm num_gmm_accs, den_gmm_accs;
     num_gmm_accs.Init(am_gmm, StringToGmmFlags(update_flags_str));
     den_gmm_accs.Init(am_gmm, StringToGmmFlags(update_flags_str));
@@ -110,11 +107,8 @@ int main(int argc, char *argv[]) {
         for (size_t i = 0; i < posterior.size(); i++) {
           for (size_t j = 0; j < posterior[i].size(); j++) {
             int32 tid = posterior[i][j].first,
-                pdf_id = trans_model.TransitionIdToPdf(tid);
+                pdf_id = trans_model.TransitionIdToPdfFast(tid);
             BaseFloat weight = posterior[i][j].second;
-            trans_model.Accumulate(fabs(weight), tid,
-                                   (weight > 0.0 ?
-                                    &num_trans_accs : &den_trans_accs));
             tot_like_this_file +=
                 (weight > 0.0 ? &num_gmm_accs : &den_gmm_accs) ->
                 AccumulateForGmm(am_gmm, mat.Row(i), pdf_id, fabs(weight)) * weight;
@@ -136,12 +130,10 @@ int main(int argc, char *argv[]) {
 
     {
       Output ko(num_accs_wxfilename, binary);
-      num_trans_accs.Write(ko.Stream(), binary);
       num_gmm_accs.Write(ko.Stream(), binary);
     }
     {
       Output ko(den_accs_wxfilename, binary);
-      den_trans_accs.Write(ko.Stream(), binary);
       den_gmm_accs.Write(ko.Stream(), binary);
     }
     KALDI_LOG << "Written accs.";
diff --git a/src/gmmbin/gmm-align-compiled.cc b/src/gmmbin/gmm-align-compiled.cc
index 02beb372b60..f8b5a11d504 100644
--- a/src/gmmbin/gmm-align-compiled.cc
+++ b/src/gmmbin/gmm-align-compiled.cc
@@ -50,17 +50,11 @@ int main(int argc, char *argv[]) {
     ParseOptions po(usage);
     AlignConfig align_config;
     BaseFloat acoustic_scale = 1.0;
-    BaseFloat transition_scale = 1.0;
-    BaseFloat self_loop_scale = 1.0;
     std::string per_frame_acwt_wspecifier;
 
     align_config.Register(&po);
-    po.Register("transition-scale", &transition_scale,
-                "Transition-probability scale [relative to acoustics]");
     po.Register("acoustic-scale", &acoustic_scale,
                 "Scaling factor for acoustic likelihoods");
-    po.Register("self-loop-scale", &self_loop_scale,
-                "Scale of self-loop versus non-self-loop log probs [relative to acoustics]");
     po.Register("write-per-frame-acoustic-loglikes", &per_frame_acwt_wspecifier,
                 "Wspecifier for table of vectors containing the acoustic log-likelihoods "
                 "per frame for each utterance. E.g. ark:foo/per_frame_logprobs.1.ark");
@@ -114,13 +108,6 @@ int main(int argc, char *argv[]) {
           continue;
         }
 
-        {  // Add transition-probs to the FST.
-          std::vector<int32> disambig_syms;  // empty.
-          AddTransitionProbs(trans_model, disambig_syms,
-                             transition_scale, self_loop_scale,
-                             &decode_fst);
-        }
-
         DecodableAmDiagGmmScaled gmm_decodable(am_gmm, trans_model, features,
                                                acoustic_scale);
 
diff --git a/src/gmmbin/gmm-est-map.cc b/src/gmmbin/gmm-est-map.cc
index eb2b44d5961..6cbb864fcf7 100644
--- a/src/gmmbin/gmm-est-map.cc
+++ b/src/gmmbin/gmm-est-map.cc
@@ -36,7 +36,6 @@ int main(int argc, char *argv[]) {
         "e.g.: gmm-est-map 1.mdl 1.acc 2.mdl\n";
 
     bool binary_write = true;
-    MapTransitionUpdateConfig tcfg;
     MapDiagGmmOptions gmm_opts;
     std::string update_flags_str = "mvwt";
     std::string occs_out_filename;
@@ -47,7 +46,6 @@ int main(int argc, char *argv[]) {
                 "update: subset of mvwt.");
     po.Register("write-occs", &occs_out_filename, "File to write state "
                 "occupancies to.");
-    tcfg.Register(&po);
     gmm_opts.Register(&po);
 
     po.Read(argc, argv);
@@ -82,14 +80,6 @@ int main(int argc, char *argv[]) {
       gmm_accs.Read(ki.Stream(), binary, true);  // true == add; doesn't matter here.
     }
 
-    if (update_flags & kGmmTransitions) {  // Update transition model.
-      BaseFloat objf_impr, count;
-      trans_model.MapUpdate(transition_accs, tcfg, &objf_impr, &count);
-      KALDI_LOG << "Transition model update: Overall " << (objf_impr/count)
-                << " log-like improvement per frame over " << (count)
-                << " frames.";
-    }
-
     {  // Update GMMs.
       BaseFloat objf_impr, count;
       BaseFloat tot_like = gmm_accs.TotLogLike(),
diff --git a/src/gmmbin/gmm-est.cc b/src/gmmbin/gmm-est.cc
index 545bbc054ef..023a9f324b0 100644
--- a/src/gmmbin/gmm-est.cc
+++ b/src/gmmbin/gmm-est.cc
@@ -35,7 +35,6 @@ int main(int argc, char *argv[]) {
         "e.g.: gmm-est 1.mdl 1.acc 2.mdl\n";
 
     bool binary_write = true;
-    MleTransitionUpdateConfig tcfg;
     MleDiagGmmOptions gmm_opts;
     int32 mixup = 0;
     int32 mixdown = 0;
@@ -61,7 +60,6 @@ int main(int argc, char *argv[]) {
                 "means by standard deviation times this factor.");
     po.Register("write-occs", &occs_out_filename, "File to write pdf "
                 "occupation counts to.");
-    tcfg.Register(&po);
     gmm_opts.Register(&po);
 
     po.Read(argc, argv);
@@ -96,14 +94,6 @@ int main(int argc, char *argv[]) {
       gmm_accs.Read(ki.Stream(), binary, true);  // true == add; doesn't matter here.
     }
 
-    if (update_flags & kGmmTransitions) {  // Update transition model.
-      BaseFloat objf_impr, count;
-      trans_model.MleUpdate(transition_accs, tcfg, &objf_impr, &count);
-      KALDI_LOG << "Transition model update: Overall " << (objf_impr/count)
-                << " log-like improvement per frame over " << (count)
-                << " frames.";
-    }
-
     {  // Update GMMs.
       BaseFloat objf_impr, count;
       BaseFloat tot_like = gmm_accs.TotLogLike(),
diff --git a/src/gmmbin/gmm-fmpe-acc-stats.cc b/src/gmmbin/gmm-fmpe-acc-stats.cc
deleted file mode 100644
index 17cba7dc489..00000000000
--- a/src/gmmbin/gmm-fmpe-acc-stats.cc
+++ /dev/null
@@ -1,155 +0,0 @@
-// gmmbin/gmm-fmpe-acc-stats.cc
-
-// Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "gmm/am-diag-gmm.h"
-#include "hmm/transitions.h"
-#include "transform/fmpe.h"
-
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  using kaldi::int32;
-  try {
-    const char *usage =
-        "Accumulate stats for fMPE training, using GMM model.  Note: this could\n"
-        "be done using gmm-get-feat-deriv and fmpe-acc-stats (but you'd be computing\n"
-        "the features twice).  Features input should be pre-fMPE features.\n"
-        "\n"
-        "Usage:  gmm-fmpe-acc-stats [options] <model-in> <fmpe-in> <feature-rspecifier> "
-        "<gselect-rspecifier> <posteriors-rspecifier> <fmpe-stats-out>\n"
-        "e.g.: \n"
-        " gmm-fmpe-acc-stats --model-derivative 1.accs 1.mdl 1.fmpe \"$feats\" ark:1.gselect ark:1.post 1.fmpe_stats\n";
-        
-    ParseOptions po(usage);
-    bool binary = true;
-    std::string model_derivative_rxfilename;
-    po.Register("binary", &binary, "If true, write stats in binary mode.");
-    po.Register("model-derivative", &model_derivative_rxfilename,
-                "GMM-accs file containing model derivative [note: contains no transition stats].  Used for indirect differential.  Warning: this will only work correctly in the case of MMI/BMMI objective function, with non-canceled stats.");
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 6) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_rxfilename = po.GetArg(1),
-        fmpe_rxfilename = po.GetArg(2),
-        feature_rspecifier = po.GetArg(3),
-        gselect_rspecifier = po.GetArg(4),
-        posteriors_rspecifier = po.GetArg(5),
-        stats_wxfilename = po.GetArg(6);
-    
-    AmDiagGmm am_gmm;
-    Transitions trans_model;
-    {
-      bool binary;
-      Input ki(model_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_gmm.Read(ki.Stream(), binary);
-    }
-
-    Fmpe fmpe;
-    ReadKaldiObject(fmpe_rxfilename, &fmpe);
-
-
-    bool have_indirect = (model_derivative_rxfilename != "");
-    AccumAmDiagGmm model_derivative;
-    if (have_indirect)
-      ReadKaldiObject(model_derivative_rxfilename, &model_derivative);
-    
-    FmpeStats fmpe_stats(fmpe);
-    
-    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
-    RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);
-
-    BaseFloat tot_like = 0.0; // tot like weighted by posterior.
-    int32 num_frames = 0;
-    int32 num_done = 0, num_err = 0;
-    
-    for (; !feature_reader.Done(); feature_reader.Next()) {
-      std::string key = feature_reader.Key();
-      if (!posteriors_reader.HasKey(key)) {
-        num_err++;
-        KALDI_WARN << "No posteriors for utterance " << key;
-        continue;
-      } 
-      const Matrix<BaseFloat> &feat_in = feature_reader.Value();
-      const Posterior &posterior = posteriors_reader.Value(key);
-
-      if (static_cast<int32>(posterior.size()) != feat_in.NumRows()) {
-        KALDI_WARN << "Posterior vector has wrong size " <<
-            (posterior.size()) << " vs. "<< (feat_in.NumRows());
-        num_err++;
-        continue;
-      }
-
-      if (!gselect_reader.HasKey(key)) {
-        KALDI_WARN << "No gselect information for key " << key;
-        num_err++;
-        continue;
-      }
-      const std::vector<std::vector<int32> > &gselect =
-          gselect_reader.Value(key);
-      if (static_cast<int32>(gselect.size()) != feat_in.NumRows()) {
-        KALDI_WARN << "gselect information has wrong size";
-        num_err++;
-        continue;
-      }
-      
-      num_done++;
-      Matrix<BaseFloat> fmpe_feat(feat_in.NumRows(), feat_in.NumCols());
-      fmpe.ComputeFeatures(feat_in, gselect, &fmpe_feat);
-      fmpe_feat.AddMat(1.0, feat_in);
-      
-      Matrix<BaseFloat> direct_deriv, indirect_deriv;
-
-      tot_like += ComputeAmGmmFeatureDeriv(am_gmm, trans_model, posterior,
-                                           fmpe_feat, &direct_deriv,
-                                           (have_indirect ? &model_derivative : NULL),
-                                           (have_indirect ? &indirect_deriv : NULL));
-      num_frames += feat_in.NumRows();
-
-      fmpe.AccStats(feat_in, gselect, direct_deriv,
-                    (have_indirect ? &indirect_deriv : NULL), &fmpe_stats);
-      
-      if (num_done % 100 == 0)
-        KALDI_LOG << "Processed " << num_done << " utterances.";
-    }
-
-    KALDI_LOG << "Done " << num_done << " files, " << num_err
-              << " with errors.";
-    KALDI_LOG << "Overall weighted acoustic likelihood per frame is "
-              << (tot_like/num_frames) << " over " << num_frames << " frames.";
-
-    Output ko(stats_wxfilename, binary);
-    fmpe_stats.Write(ko.Stream(), binary);
-    
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/gmmbin/gmm-info.cc b/src/gmmbin/gmm-info.cc
index f1c436cd57e..689c68150ec 100644
--- a/src/gmmbin/gmm-info.cc
+++ b/src/gmmbin/gmm-info.cc
@@ -58,8 +58,6 @@ int main(int argc, char *argv[]) {
     std::cout << "number of pdfs " << trans_model.NumPdfs() << '\n';
     std::cout << "number of transition-ids " << trans_model.NumTransitionIds()
               << '\n';
-    std::cout << "number of transition-states "
-              << trans_model.NumTransitionStates() << '\n';
     std::cout << "feature dimension " << am_gmm.Dim() << '\n';
     std::cout << "number of gaussians " << am_gmm.NumGauss() << '\n';
     return 0;
diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc
index bf67c13412e..b7729c3b62e 100644
--- a/src/hmm/hmm-utils.cc
+++ b/src/hmm/hmm-utils.cc
@@ -108,6 +108,7 @@ std::shared_ptr<fst::ExpandedFst<fst::StdArc>> GetHmmAsFsa(
     }
   }
 
+  ApplyProbabilityScale(config.transition_scale, loopless_entry.get());
   if (cache != NULL)
     (*cache)[cache_index] = loopless_entry;
   return loopless_entry;
@@ -460,13 +461,12 @@ static bool StateIsStochastic(FST fst, typename FST::StateId s) {
   return ApproxEqual(total_prob.Value(), Weight::One());
 }
 
-// This is the code that expands an FST from transition-states to
-// transition-ids, in the case where the non-optional transition is before
-// the self-loop.
-static void AddSelfLoopsInternal(const Transitions &trans_model,
-                                 const std::vector<int32> &disambig_syms,
-                                 bool check_no_self_loops,
-                                 fst::VectorFst<fst::StdArc> *fst) {
+void AddSelfLoops(const Transitions &trans_model,
+                  const std::vector<int32> &disambig_syms,
+                  BaseFloat self_loop_scale,
+                  bool check_no_self_loops,
+                  fst::VectorFst<fst::StdArc> *fst) {
+  KALDI_ASSERT(fst->Start() != fst::kNoStateId);
   using namespace fst;
   typedef StdArc Arc;
   typedef Arc::Label Label;
@@ -548,14 +548,6 @@ static void AddSelfLoopsInternal(const Transitions &trans_model,
   }
 }
 
-void AddSelfLoops(const Transitions &trans_model,
-                  const std::vector<int32> &disambig_syms,
-                  bool check_no_self_loops,
-                  fst::VectorFst<fst::StdArc> *fst) {
-  KALDI_ASSERT(fst->Start() != fst::kNoStateId);
-  AddSelfLoopsInternal(trans_model, disambig_syms, check_no_self_loops, fst);
-}
-
 // SplitToPhonesInternal takes as input the "alignment" vector containing
 // a sequence of transition-ids, and appends a single vector to
 // "split_output" for each instance of a phone that occurs in the
diff --git a/src/hmm/hmm-utils.h b/src/hmm/hmm-utils.h
index fc704a88f91..a0a23cae5b6 100644
--- a/src/hmm/hmm-utils.h
+++ b/src/hmm/hmm-utils.h
@@ -37,12 +37,19 @@ namespace kaldi {
 /// Configuration class for the GetHTransducer() function; see
 /// \ref hmm_graph_config for context.
 struct HTransducerConfig {
+  /// Transition log-prob scale, see \ref hmm_scale.
+  /// Note this doesn't apply to self-loops; GetHTransducer() does
+  /// not include self-loops.
+  BaseFloat transition_scale;
   int32 nonterm_phones_offset;
 
   HTransducerConfig():
+      transition_scale(1.0),
       nonterm_phones_offset(-1) { }
 
   void Register (OptionsItf *opts) {
+    opts->Register("transition-scale", &transition_scale,
+                   "Scale of transition probs (relative to LM)");
     opts->Register("nonterm-phones-offset", &nonterm_phones_offset,
                    "The integer id of #nonterm_bos in phones.txt, if present. "
                    "Only needs to be set if you are doing grammar decoding, "
@@ -181,7 +188,7 @@ void GetIlabelMapping(const std::vector<std::vector<int32> > &ilabel_info_old,
   */
 void AddSelfLoops(const Transitions &trans_model,
                   const std::vector<int32> &disambig_syms,  // used as a check only.
-                  bool reorder,
+                  BaseFloat self_loop_scale,
                   bool check_no_self_loops,
                   fst::VectorFst<fst::StdArc> *fst);
 
diff --git a/src/hmm/transitions.cc b/src/hmm/transitions.cc
index 3cc8fb9d449..602f96485f9 100644
--- a/src/hmm/transitions.cc
+++ b/src/hmm/transitions.cc
@@ -34,6 +34,11 @@ bool Transitions::operator == (const Transitions &other) const {
       num_pdfs_ == other.num_pdfs_;
 }
 
+bool Transitions::Compatible(const Transitions& other) const {
+  KALDI_ERR << "Transitions::Compatible() is not implemented yet.";
+  return false;  // Not reached (KALDI_ERR throws); keeps compilers quiet.
+}
+
 void Transitions::ComputeInfo(const ContextDependencyInterface &ctx_dep) {
   using StateId = typename fst::StdFst::StateId;
   const std::vector<int32> &phones = topo_.GetPhones();
diff --git a/src/hmm/transitions.h b/src/hmm/transitions.h
index 0bbd45dc238..89af42904fb 100644
--- a/src/hmm/transitions.h
+++ b/src/hmm/transitions.h
@@ -219,6 +219,8 @@ class Transitions {
   /// returns true if this is identical to 'other'
   bool operator == (const Transitions &other) const;
 
+  bool Compatible(const Transitions& other) const;
+
  private:
 
   // Called from constructor.  initializes info_ (at least, the first
diff --git a/src/latbin/lattice-arc-post.cc b/src/latbin/lattice-arc-post.cc
index 57a761b4157..c8000d3dbba 100644
--- a/src/latbin/lattice-arc-post.cc
+++ b/src/latbin/lattice-arc-post.cc
@@ -82,7 +82,7 @@ class ArcPosteriorComputer {
           const std::vector<int32> &ali = arc.weight.String();
           bool first_phone = true;
           for (int32 frame = 0; frame < num_frames; frame++) {
-            if (trans_model_->IsFinal(ali[frame])) {
+            if (trans_model_->InfoForTransitionId(ali[frame]).is_final) {
               if (first_phone) first_phone = false;
               else os << ' ';
               os << trans_model_->InfoForTransitionId(ali[frame]).phone;
diff --git a/src/latbin/lattice-rescore-mapped.cc b/src/latbin/lattice-rescore-mapped.cc
index d0ce5c64526..ccea04b23e0 100644
--- a/src/latbin/lattice-rescore-mapped.cc
+++ b/src/latbin/lattice-rescore-mapped.cc
@@ -55,7 +55,7 @@ void LatticeAcousticRescore(const Transitions &trans_model,
         LatticeArc arc = aiter.Value();
         int32 trans_id = arc.ilabel;
         if (trans_id != 0) {  // Non-epsilon input label on arc
-          int32 pdf_id = trans_model.TransitionIdToPdf(trans_id);
+          int32 pdf_id = trans_model.TransitionIdToPdfFast(trans_id);
           if (pdf_id > log_likes.NumCols())
             KALDI_ERR << "Pdf-id " << pdf_id << " is out of the range of "
                       << "input log-likelihoods " << log_likes.NumCols()
diff --git a/src/nnet3/discriminative-supervision.cc b/src/nnet3/discriminative-supervision.cc
index 1097a4c472a..94294c4fbce 100644
--- a/src/nnet3/discriminative-supervision.cc
+++ b/src/nnet3/discriminative-supervision.cc
@@ -182,7 +182,7 @@ void DiscriminativeSupervisionSplitter::CollapseTransitionIds(
       KALDI_ASSERT(t >= 0 && t < num_frames);
       Arc arc = aiter.Value();
       KALDI_ASSERT(arc.ilabel != 0 && arc.ilabel == arc.olabel);
-      int32 pdf = tmodel_.TransitionIdToPdf(arc.ilabel);
+      int32 pdf = tmodel_.TransitionIdToPdfFast(arc.ilabel);
       if (pdf_to_tid[t].count(pdf) != 0) {
         arc.ilabel = arc.olabel = pdf_to_tid[t][pdf];
         aiter.SetValue(arc);
diff --git a/src/nnet3/discriminative-training.cc b/src/nnet3/discriminative-training.cc
index a4c967c6622..aa242f13c19 100644
--- a/src/nnet3/discriminative-training.cc
+++ b/src/nnet3/discriminative-training.cc
@@ -234,7 +234,7 @@ void DiscriminativeComputation::LookupNnetOutput(
     for (fst::ArcIterator<Lattice> aiter(den_lat_, s); !aiter.Done(); aiter.Next()) {
       const Arc &arc = aiter.Value();
       if (arc.ilabel != 0) { // input-side has transition-ids, output-side empty
-        int32 tid = arc.ilabel, pdf_id = tmodel_.TransitionIdToPdf(tid);
+        int32 tid = arc.ilabel, pdf_id = tmodel_.TransitionIdToPdfFast(tid);
         // The ordering of the indexes is similar to that in chain models
         requested_indexes->push_back(MakePair(idx * supervision_.num_sequences + seq, pdf_id));
       }
@@ -247,7 +247,7 @@ void DiscriminativeComputation::LookupNnetOutput(
       int32 seq = t / supervision_.frames_per_sequence,
             idx = t % supervision_.frames_per_sequence;
       int32 tid = supervision_.num_ali[t],
-                  pdf_id = tmodel_.TransitionIdToPdf(tid);
+                  pdf_id = tmodel_.TransitionIdToPdfFast(tid);
       KALDI_ASSERT(pdf_id >= 0 && pdf_id < num_pdfs);
       requested_indexes->push_back(MakePair(idx * supervision_.num_sequences + seq, pdf_id));
     }
diff --git a/src/nnet3bin/nnet3-align-compiled.cc b/src/nnet3bin/nnet3-align-compiled.cc
index 09d1fcc4407..a1089b48e30 100644
--- a/src/nnet3bin/nnet3-align-compiled.cc
+++ b/src/nnet3bin/nnet3-align-compiled.cc
@@ -55,8 +55,6 @@ int main(int argc, char *argv[]) {
     AlignConfig align_config;
     NnetSimpleComputationOptions decodable_opts;
     std::string use_gpu = "yes";
-    BaseFloat transition_scale = 1.0;
-    BaseFloat self_loop_scale = 1.0;
     std::string per_frame_acwt_wspecifier;
 
     std::string ivector_rspecifier,
@@ -68,11 +66,6 @@ int main(int argc, char *argv[]) {
 
     po.Register("use-gpu", &use_gpu,
                 "yes|no|optional|wait, only has effect if compiled with CUDA");
-    po.Register("transition-scale", &transition_scale,
-                "Transition-probability scale [relative to acoustics]");
-    po.Register("self-loop-scale", &self_loop_scale,
-                "Scale of self-loop versus non-self-loop "
-                "log probs [relative to acoustics]");
     po.Register("write-per-frame-acoustic-loglikes", &per_frame_acwt_wspecifier,
                 "Wspecifier for table of vectors containing the acoustic log-likelihoods "
                 "per frame for each utterance. E.g. ark:foo/per_frame_logprobs.1.ark");
@@ -176,13 +169,6 @@ int main(int argc, char *argv[]) {
           }
         }
 
-        {  // Add transition-probs to the FST.
-          std::vector<int32> disambig_syms;  // empty.
-          AddTransitionProbs(trans_model, disambig_syms,
-                             transition_scale, self_loop_scale,
-                             &decode_fst);
-        }
-
         DecodableAmNnetSimple nnet_decodable(
             decodable_opts, trans_model, am_nnet,
             features, ivector, online_ivectors,
diff --git a/src/online2/Makefile b/src/online2/Makefile
index 8b975e2ce43..4507f6252dc 100644
--- a/src/online2/Makefile
+++ b/src/online2/Makefile
@@ -13,7 +13,7 @@ OBJFILES = online-gmm-decodable.o online-feature-pipeline.o online-ivector-featu
 LIBNAME = kaldi-online2
 
 ADDLIBS = ../ivector/kaldi-ivector.a ../nnet3/kaldi-nnet3.a \
-          ../chain/kaldi-chain.a ../nnet2/kaldi-nnet2.a \
+          ../chain/kaldi-chain.a \
           ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \
           ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
diff --git a/src/online2/online-endpoint.h b/src/online2/online-endpoint.h
index 5f80403bbdc..91aa4b54781 100644
--- a/src/online2/online-endpoint.h
+++ b/src/online2/online-endpoint.h
@@ -30,7 +30,6 @@
 #include "base/kaldi-error.h"
 #include "feat/feature-functions.h"
 #include "feat/feature-mfcc.h"
-#include "feat/feature-plp.h"
 #include "itf/online-feature-itf.h"
 #include "lat/kaldi-lattice.h"
 #include "hmm/transitions.h"
diff --git a/src/online2/online-feature-pipeline.cc b/src/online2/online-feature-pipeline.cc
index 3cd3a9daaa4..f96c1a40507 100644
--- a/src/online2/online-feature-pipeline.cc
+++ b/src/online2/online-feature-pipeline.cc
@@ -25,12 +25,13 @@ namespace kaldi {
 
 OnlineFeaturePipelineConfig::OnlineFeaturePipelineConfig(
     const OnlineFeaturePipelineCommandLineConfig &config) {
-  if (config.feature_type == "mfcc" || config.feature_type == "plp" ||
+  if (config.feature_type == "mfcc" ||
       config.feature_type == "fbank") {
     feature_type = config.feature_type;
   } else {
     KALDI_ERR << "Invalid feature type: " << config.feature_type << ". "
-              << "Supported feature types: mfcc, plp.";
+              << "Supported feature types: mfcc, fbank.";
+    // TODO(galv): Verify fbank support; FrameShiftInSeconds() still rejects it.
   }
 
   if (config.mfcc_config != "") {
@@ -40,13 +41,6 @@ OnlineFeaturePipelineConfig::OnlineFeaturePipelineConfig(
                  << "since feature type is set to " << feature_type << ".";
   }  // else use the defaults.
 
-  if (config.plp_config != "") {
-    ReadConfigFromFile(config.plp_config, &plp_opts);
-    if (feature_type != "plp")
-      KALDI_WARN << "--plp-config option has no effect "
-                 << "since feature type is set to " << feature_type << ".";
-  }  // else use the defaults.
-
   if (config.fbank_config != "") {
     ReadConfigFromFile(config.fbank_config, &fbank_opts);
     if (feature_type != "fbank")
@@ -159,8 +153,6 @@ void OnlineFeaturePipeline::GetCmvnState(OnlineCmvnState *cmvn_state) {
 void OnlineFeaturePipeline::Init() {
   if (config_.feature_type == "mfcc") {
     base_feature_ = new OnlineMfcc(config_.mfcc_opts);
-  } else if (config_.feature_type == "plp") {
-    base_feature_ = new OnlinePlp(config_.plp_opts);
   } else if (config_.feature_type == "fbank") {
     base_feature_ = new OnlineFbank(config_.fbank_opts);
   } else {
@@ -286,8 +278,6 @@ void OnlineFeaturePipeline::InputFinished() {
 BaseFloat OnlineFeaturePipelineConfig::FrameShiftInSeconds() const {
   if (feature_type == "mfcc") {
     return mfcc_opts.frame_opts.frame_shift_ms / 1000.0f;
-  } else if (feature_type == "plp") {
-    return plp_opts.frame_opts.frame_shift_ms / 1000.0f;
   } else {
     KALDI_ERR << "Unknown feature type " << feature_type;
     return 0.0;
diff --git a/src/online2/online-feature-pipeline.h b/src/online2/online-feature-pipeline.h
index fab1be3cb27..e5d135c2697 100644
--- a/src/online2/online-feature-pipeline.h
+++ b/src/online2/online-feature-pipeline.h
@@ -49,7 +49,6 @@ namespace kaldi {
 struct OnlineFeaturePipelineCommandLineConfig {
   std::string feature_type;
   std::string mfcc_config;
-  std::string plp_config;
   std::string fbank_config;
   bool add_pitch;
   std::string pitch_config;
@@ -68,15 +67,13 @@ struct OnlineFeaturePipelineCommandLineConfig {
 
   void Register(OptionsItf *opts) {
     opts->Register("feature-type", &feature_type,
-                   "Base feature type [mfcc, plp, fbank]");
+                   "Base feature type [mfcc, fbank]");
     opts->Register("mfcc-config", &mfcc_config, "Configuration file for "
                    "MFCC features (e.g. conf/mfcc.conf)");
-    opts->Register("plp-config", &plp_config, "Configuration file for "
-                   "PLP features (e.g. conf/plp.conf)");
     opts->Register("fbank-config", &fbank_config, "Configuration file for "
                    "filterbank features (e.g. conf/fbank.conf)");
     opts->Register("add-pitch", &add_pitch, "Append pitch features to raw "
-                   "MFCC/PLP features.");
+                   "MFCC features.");
     opts->Register("pitch-config", &pitch_config, "Configuration file for "
                    "pitch features (e.g. conf/pitch.conf)");
     opts->Register("pitch-process-config", &pitch_process_config,
@@ -119,11 +116,10 @@ struct OnlineFeaturePipelineConfig {
 
   BaseFloat FrameShiftInSeconds() const;
 
-  std::string feature_type;  // "mfcc" or "plp" or "fbank"
+  std::string feature_type;  // "mfcc" or "fbank"
 
   MfccOptions mfcc_opts;  // options for MFCC computation,
                           // if feature_type == "mfcc"
-  PlpOptions plp_opts;  // Options for PLP computation, if feature_type == "plp"
   FbankOptions fbank_opts;  // Options for filterbank computation, if
                             // feature_type == "fbank"
 
@@ -226,7 +222,7 @@ class OnlineFeaturePipeline: public OnlineFeatureInterface {
   Matrix<BaseFloat> lda_mat_;  // LDA matrix, if supplied.
   Matrix<BaseFloat> global_cmvn_stats_;  // Global CMVN stats.
 
+  OnlineBaseFeature *base_feature_;        // MFCC or fbank
+  OnlineBaseFeature *base_feature_;        // MFCC
   OnlinePitchFeature *pitch_;              // Raw pitch
   OnlineProcessPitch *pitch_feature_;  // Processed pitch
   OnlineFeatureInterface *feature_;        // CMVN (+ processed pitch)
diff --git a/src/online2/online-gmm-decodable.cc b/src/online2/online-gmm-decodable.cc
index 33a273e8a56..0b62b3d2845 100644
--- a/src/online2/online-gmm-decodable.cc
+++ b/src/online2/online-gmm-decodable.cc
@@ -45,7 +45,7 @@ void DecodableDiagGmmScaledOnline::CacheFrame(int32 frame) {
 BaseFloat DecodableDiagGmmScaledOnline::LogLikelihood(int32 frame, int32 index) {
   if (frame != cur_frame_)
     CacheFrame(frame);
-  int32 pdf_id = trans_model_.TransitionIdToPdf(index);
+  int32 pdf_id = trans_model_.TransitionIdToPdfFast(index);
   if (cache_[pdf_id].first == frame)
     return cache_[pdf_id].second;
   BaseFloat ans = ac_model_.LogLikelihood(pdf_id, cur_feats_) * ac_scale_;
diff --git a/src/online2/online-gmm-decoding.cc b/src/online2/online-gmm-decoding.cc
index 656b2730bdf..4addc8cd969 100644
--- a/src/online2/online-gmm-decoding.cc
+++ b/src/online2/online-gmm-decoding.cc
@@ -169,7 +169,7 @@ bool SingleUtteranceGmmDecoder::GetGaussianPosteriors(bool end_of_utterance,
                 << " frames.";
 
   ConstIntegerSet<int32> silence_set(silence_phones_);  // faster lookup
-  const Transitions &trans_model = models_.GetTransitionModel();
+  const Transitions &trans_model = models_.GetTransitions();
   WeightSilencePost(trans_model, silence_set,
                     config_.silence_weight, &post);  
   
@@ -309,7 +309,7 @@ SingleUtteranceGmmDecoder::~SingleUtteranceGmmDecoder() {
 
 bool SingleUtteranceGmmDecoder::EndpointDetected(
     const OnlineEndpointConfig &config) {
-  const Transitions &tmodel = models_.GetTransitionModel();
+  const Transitions &tmodel = models_.GetTransitions();
   return kaldi::EndpointDetected(config, tmodel,
                                  feature_pipeline_->FrameShiftInSeconds(),
                                  decoder_);
@@ -386,7 +386,7 @@ OnlineGmmDecodingModels::OnlineGmmDecodingModels(
 }
 
 
-const Transitions &OnlineGmmDecodingModels::GetTransitionModel() const {
+const Transitions &OnlineGmmDecodingModels::GetTransitions() const {
   return tmodel_;
 }
 
diff --git a/src/online2/online-gmm-decoding.h b/src/online2/online-gmm-decoding.h
index f5cb725cfc1..c6b492e5e80 100644
--- a/src/online2/online-gmm-decoding.h
+++ b/src/online2/online-gmm-decoding.h
@@ -167,7 +167,7 @@ class OnlineGmmDecodingModels {
  public:
   OnlineGmmDecodingModels(const OnlineGmmDecodingConfig &config);
 
-  const Transitions &GetTransitionModel() const;
+  const Transitions &GetTransitions() const;
 
   const AmDiagGmm &GetOnlineAlignmentModel() const;
 
diff --git a/src/online2/online-nnet2-decoding-threaded.h b/src/online2/online-nnet2-decoding-threaded.h
deleted file mode 100644
index 3ca62e5ea3e..00000000000
--- a/src/online2/online-nnet2-decoding-threaded.h
+++ /dev/null
@@ -1,434 +0,0 @@
-// online2/online-nnet2-decoding-threaded.h
-
-// Copyright 2014-2015  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_ONLINE2_ONLINE_NNET2_DECODING_THREADED_H_
-#define KALDI_ONLINE2_ONLINE_NNET2_DECODING_THREADED_H_
-
-#include <string>
-#include <vector>
-#include <deque>
-#include <mutex>
-#include <thread>
-
-#include "matrix/matrix-lib.h"
-#include "util/common-utils.h"
-#include "base/kaldi-error.h"
-#include "decoder/decodable-matrix.h"
-#include "nnet2/am-nnet.h"
-#include "online2/online-nnet2-feature-pipeline.h"
-#include "online2/online-endpoint.h"
-#include "decoder/lattice-faster-online-decoder.h"
-#include "hmm/transitions.h"
-#include "util/kaldi-semaphore.h"
-
-namespace kaldi {
-/// @addtogroup  onlinedecoding OnlineDecoding
-/// @{
-
-
-/**
-   class ThreadSynchronizer acts to guard an arbitrary type of buffer between a
-   producing and a consuming thread (note: it's all symmetric between the two
-   thread types).  It has a similar interface to a mutex, except that instead of
-   just Lock and Unlock, it has Lock, UnlockSuccess and UnlockFailure, and each
-   function takes an argument kProducer or kConsumer to identify whether the
-   producing or consuming thread is waiting.
-
-   The basic concept is that you lock the object; and if you discover the you're
-   blocked because you're either trying to read an empty buffer or trying to
-   write to a full buffer, you unlock with UnlockFailure; and this will cause
-   your next call to Lock to block until the *other* thread has called Lock and
-   then UnlockSuccess.  However, if at that point the other thread calls Lock
-   and then UnlockFailure, it is an error because you can't have both producing
-   and consuming threads claiming that the buffer is full/empty.  If you lock
-   the object and were successful you call UnlockSuccess; and you call
-   UnlockSuccess even if, for your own reasons, you ended up not changing the
-   state of the buffer.
-*/
-class ThreadSynchronizer {
- public:
-  ThreadSynchronizer();
-
-  // Most calls to this class should provide the thread-type of the caller,
-  // producing or consuming.  Actually the behavior of this class is symmetric
-  // between the two types of thread.
-  enum ThreadType { kProducer, kConsumer };
-
-  // All functions returning bool will return true normally, and false if
-  // SetAbort() was set; if they return false, you should probably call SetAbort()
-  // on any other ThreadSynchronizer classes you are using and then return from
-  // the thread.
-
-  // call this to lock the object being guarded.
-  bool Lock(ThreadType t);
-
-  // Call this to unlock the object being guarded, if you don't want the next call to
-  // Lock to stall.
-  bool UnlockSuccess(ThreadType t);
-
-  // Call this if you want the next call to Lock() to stall until the other
-  // (producer/consumer) thread has locked and then unlocked the mutex.  Note
-  // that, if the other thread then calls Lock and then UnlockFailure, this will
-  // generate a printed warning (and if repeated too many times, an exception).
-  bool UnlockFailure(ThreadType t);
-
-  // Sets abort_ flag so future calls will return false, and future calls to
-  // Lock() won't lock the mutex but will immediately return false.
-  void SetAbort();
-
-  ~ThreadSynchronizer();
-
- private:
-  bool abort_;
-  bool producer_waiting_;  // true if producer is/will be waiting on semaphore
-  bool consumer_waiting_;  // true if consumer is/will be waiting on semaphore
-  std::mutex mutex_;  // Locks the buffer object.
-  ThreadType held_by_;  // Record of which thread is holding the mutex (if
-                        // held); else undefined.  Used for validation of input.
-  Semaphore producer_semaphore_;  // The producer thread waits on this semaphore
-  Semaphore consumer_semaphore_;  // The consumer thread waits on this semaphore
-  int32 num_errors_;  // Rumber of times the threads alternated doing Lock() and
-                      // UnlockFailure().  This should not happen at all; but
-                      // it's more user-friendly to simply warn a few times; and then
-                      // only after a number of errors, to fail.
-  KALDI_DISALLOW_COPY_AND_ASSIGN(ThreadSynchronizer);
-};
-
-
-
-
-// This is the configuration class for SingleUtteranceNnet2DecoderThreaded.  The
-// actual command line program requires other configs that it creates
-// separately, and which are not included here: namely,
-// OnlineNnet2FeaturePipelineConfig and OnlineEndpointConfig.
-struct OnlineNnet2DecodingThreadedConfig {
-
-  LatticeFasterDecoderConfig decoder_opts;
-
-  BaseFloat acoustic_scale;
-
-  int32 max_buffered_features;  // maximum frames of features we allow to be
-                                // held in the feature buffer before we block
-                                // the feature-processing thread.
-
-  int32 feature_batch_size;  // maximum number of frames at a time that we decode
-                             // before unlocking the mutex.  The only real cost
-                             // here is a mutex lock/unlock, so it's OK to make
-                             // this fairly small.
-  int32 max_loglikes_copy;   // maximum unused frames of log-likelihoods we will
-                             // copy from the decodable object back into another
-                             // matrix to be supplied to the decodable object.
-                             // make this too large-> will block the
-                             // decoder-search thread while copying; too small
-                             // -> the nnet-evaluation thread may get blocked
-                             // for too long while waiting for the decodable
-                             // thread to be ready.
-  int32 nnet_batch_size;    // batch size (number of frames) we evaluate in the
-                            // neural net, if this many is available.  To take
-                            // best advantage of BLAS, you may want to set this
-                            // fairly large, e.g. 32 or 64 frames.  It probably
-                            // makes sense to tune this a bit.
-  int32 decode_batch_size;  // maximum number of frames at a time that we decode
-                            // before unlocking the mutex.  The only real cost
-                            // here is a mutex lock/unlock, so it's OK to make
-                            // this fairly small.
-
-  OnlineNnet2DecodingThreadedConfig() {
-    acoustic_scale = 0.1;
-    max_buffered_features = 100;
-    feature_batch_size = 2;
-    nnet_batch_size = 32;
-    max_loglikes_copy = 20;
-    decode_batch_size = 2;
-  }
-
-  void Check();
-
-  void Register(OptionsItf *opts) {
-    decoder_opts.Register(opts);
-    opts->Register("acoustic-scale", &acoustic_scale, "Scale used on acoustics "
-                   "when decoding");
-    opts->Register("max-buffered-features", &max_buffered_features, "Obscure "
-                   "setting, affects multi-threaded decoding.");
-    opts->Register("feature-batch-size", &max_buffered_features, "Obscure "
-                   "setting, affects multi-threaded decoding.");
-    opts->Register("nnet-batch-size", &nnet_batch_size, "Maximum batch size "
-                   "(in frames) used when evaluating neural net likelihoods");
-    opts->Register("max-loglikes-copy", &max_loglikes_copy,  "Obscure "
-                   "setting, affects multi-threaded decoding.");
-    opts->Register("decode-batch-sie", &decode_batch_size, "Obscure "
-                   "setting, affects multi-threaded decoding.");
-  }
-};
-
-/**
-   You will instantiate this class when you want to decode a single
-   utterance using the online-decoding setup for neural nets.  Each time this
-   class is created, it creates three background threads, and the feature
-   extraction, neural net evaluation, and search aspects of decoding all
-   happen in different threads.
-   Note: we assume that all calls to its public interface happen from a single
-   thread.
-*/
-class SingleUtteranceNnet2DecoderThreaded {
- public:
-  // Constructor.  Unlike SingleUtteranceNnet2Decoder, we create the
-  // feature_pipeline object inside this class, since access to it needs to be
-  // controlled by a mutex and this class knows how to handle that.  The
-  // feature_info and adaptation_state arguments are used to initialize the
-  // (locally owned) feature pipeline.
-  SingleUtteranceNnet2DecoderThreaded(
-      const OnlineNnet2DecodingThreadedConfig &config,
-      const Transitions &tmodel,
-      const nnet2::AmNnet &am_nnet,
-      const fst::Fst<fst::StdArc> &fst,
-      const OnlineNnet2FeaturePipelineInfo &feature_info,
-      const OnlineIvectorExtractorAdaptationState &adaptation_state);
-
-
-
-  /// You call this to provide this class with more waveform to decode.  This
-  /// call is, for all practical purposes, non-blocking.
-  void AcceptWaveform(BaseFloat samp_freq,
-                      const VectorBase<BaseFloat> &wave_part);
-
-  /// Returns the number of pieces of waveform that are still waiting to be
-  /// processed.  This may be useful for calling code to judge whether to supply
-  /// more waveform or to wait.
-  int32 NumWaveformPiecesPending();
-
-  /// You call this to inform the class that no more waveform will be provided;
-  /// this allows it to flush out the last few frames of features, and is
-  /// necessary if you want to call Wait() to wait until all decoding is done.
-  /// After calling InputFinished() you cannot call AcceptWaveform any more.
-  void InputFinished();
-
-  /// You can call this if you don't want the decoding to proceed further with
-  /// this utterance.  It just won't do any more processing, but you can still
-  /// use the lattice from the decoding that it's already done.  Note: it may
-  /// still continue decoding up to decode_batch_size (default: 2) frames of
-  /// data before the decoding thread exits.  You can call Wait() after calling
-  /// this, if you want to wait for that.
-  void TerminateDecoding();
-
-  /// This call will block until all the data has been decoded; it must only be
-  /// called after either InputFinished() has been called or TerminateDecoding() has
-  /// been called; otherwise, to call it is an error.
-  void Wait();
-
-  /// Finalizes the decoding. Cleans up and prunes remaining tokens, so the final
-  /// lattice is faster to obtain.  May not be called unless either InputFinished()
-  /// or TerminateDecoding() has been called.  If InputFinished() was called, it
-  /// calls Wait() to ensure that the decoding has finished (it's not an error
-  /// if you already called Wait()).
-  void FinalizeDecoding();
-
-  /// Returns *approximately* (ignoring end effects), the number of frames of
-  /// data that we expect given the amount of data that the pipeline has
-  /// received via AcceptWaveform().  (ignores small end effects).  This might
-  /// be useful in application code to compare with NumFramesDecoded() and gauge
-  /// how much latency there is.
-  int32 NumFramesReceivedApprox() const;
-
-  /// Returns the number of frames currently decoded.  Caution: don't rely on
-  /// the lattice having exactly this number if you get it after this call, as
-  /// it may increase after this-- unless you've already called either
-  /// TerminateDecoding() or InputFinished(), followed by Wait().
-  int32 NumFramesDecoded() const;
-
-  /// Gets the lattice.  The output lattice has any acoustic scaling in it
-  /// (which will typically be desirable in an online-decoding context); if you
-  /// want an un-scaled lattice, scale it using ScaleLattice() with the inverse
-  /// of the acoustic weight.  "end_of_utterance" will be true if you want the
-  /// final-probs to be included.  If this is at the end of the utterance,
-  /// you might want to first call FinalizeDecoding() first; this will make this
-  /// call return faster.
-  /// If no frames have been decoded yet, it will set clat to a lattice with
-  /// a single state that is final and with unit weight (no cost or alignment).
-  /// The output to final_relative_cost (if non-NULL) is a number >= 0 that's
-  /// closer to 0 if a final-state was close to the best-likelihood state
-  /// active on the last frame, at the time we obtained the lattice.
-  void GetLattice(bool end_of_utterance,
-                  CompactLattice *clat,
-                  BaseFloat *final_relative_cost) const;
-
-  /// Outputs an FST corresponding to the single best path through the current
-  /// lattice. If "use_final_probs" is true AND we reached the final-state of
-  /// the graph then it will include those as final-probs, else it will treat
-  /// all final-probs as one.
-  /// If no frames have been decoded yet, it will set best_path to a lattice with
-  /// a single state that is final and with unit weight (no cost).
-  /// The output to final_relative_cost (if non-NULL) is a number >= 0 that's
-  /// closer to 0 if a final-state were close to the best-likelihood state
-  /// active on the last frame, at the time we got the best path.
-  void GetBestPath(bool end_of_utterance,
-                   Lattice *best_path,
-                   BaseFloat *final_relative_cost) const;
-
-  /// This function calls EndpointDetected from online-endpoint.h,
-  /// with the required arguments.
-  bool EndpointDetected(const OnlineEndpointConfig &config);
-
-  /// Outputs the adaptation state of the feature pipeline to "adaptation_state".  This
-  /// mostly stores stats for iVector estimation, and will generally be called at the
-  /// end of an utterance, assuming it's a scenario where each speaker is seen for
-  /// more than one utterance.
-  /// You may only call this function after either calling TerminateDecoding() or
-  /// InputFinished, and then Wait().  Otherwise it is an error.
-  void GetAdaptationState(OnlineIvectorExtractorAdaptationState *adaptation_state);
-
-  /// Gets the remaining, un-decoded part of the waveform and returns the sample
-  /// rate.  May only be called after Wait(), and it only makes sense to call
-  /// this if you called TerminateDecoding() before Wait().  The idea is that
-  /// you can then provide this un-decoded piece of waveform to another decoder.
-  BaseFloat GetRemainingWaveform(Vector<BaseFloat> *waveform_out) const;
-
-  ~SingleUtteranceNnet2DecoderThreaded();
- private:
-
-  // This function will instruct all threads to abort operation as soon as they
-  // can safely do so, by calling SetAbort() in the threads
-  void AbortAllThreads(bool error);
-
-  // This function waits for all the threads that have been spawned. It is
-  // called in the destructor and Wait(). If called twice it is not an error.
-  void WaitForAllThreads();
-
-
-
-  // this function runs the thread that does the feature extraction and
-  // neural-net evaluation. In case of failure, calls
-  // me->AbortAllThreads(true).
-  static void RunNnetEvaluation(SingleUtteranceNnet2DecoderThreaded *me);
-  // member-function version of RunNnetEvaluation, called by RunNnetEvaluation.
-  bool RunNnetEvaluationInternal();
-  // the following function is called inside RunNnetEvaluationInternal(); it
-  // takes the log and subtracts the prior.
-  void ProcessLoglikes(const CuVector<BaseFloat> &log_inv_prior,
-                       CuMatrixBase<BaseFloat> *loglikes);
-  // called from RunNnetEvaluationInternal().  Returns true in the normal case,
-  // false on error; if it returns false, then we expect that the calling thread
-  // will terminate.  This assumes the caller has already
-  // locked feature_pipeline_mutex_.
-  bool FeatureComputation(int32 num_frames_output);
-
-
-  // this function runs the thread that does the neural-net evaluation.
-  // In case of failure, calls me->AbortAllThreads(true).
-  static void RunDecoderSearch(SingleUtteranceNnet2DecoderThreaded *me);
-  // member-function version of RunDecoderSearch, called by RunDecoderSearch.
-  bool RunDecoderSearchInternal();
-
-
-  // Member variables:
-
-  OnlineNnet2DecodingThreadedConfig config_;
-
-  const nnet2::AmNnet &am_nnet_;
-
-  const Transitions &tmodel_;
-
-
-  // sampling_rate_ is set the first time AcceptWaveform is called.
-  BaseFloat sampling_rate_;
-  // A record of how many samples have been provided so
-  // far via calls to AcceptWaveform.
-  int64 num_samples_received_;
-
-  // The next two variables are written to by AcceptWaveform from the main
-  // thread, and read by the feature-processing thread; they are guarded by
-  // waveform_synchronizer_.  There is no bound on the buffer size here.
-  // Later-arriving data is appended to the vector.  When InputFinished() is
-  // called from the main thread, the main thread sets input_finished_ = true.
-  // sampling_rate_ is only needed for checking that it matches the config.
-  bool input_finished_;
-  std::deque< Vector<BaseFloat>* > input_waveform_;
-
-
-  ThreadSynchronizer waveform_synchronizer_;
-
-  // feature_pipeline_ is accessed by the nnet-evaluation thread, by the main
-  // thread if GetAdaptionState() is called, and by the decoding thread via
-  // ComputeCurrentTraceback() if online silence weighting is being used.  It is
-  // guarded by feature_pipeline_mutex_.
-  OnlineNnet2FeaturePipeline feature_pipeline_;
-  std::mutex feature_pipeline_mutex_;
-
-  // The next two variables are required only for implementation of the function
-  // GetRemainingWaveform().  After we take waveform from the input_waveform_
-  // queue to be processed into features, we put them onto this deque.  Then we
-  // discard from this queue any that we can discard because we have already
-  // decoded those frames (see num_frames_decoded_), and we increment
-  // num_samples_discarded_ by the corresponding number of samples.
-  std::deque< Vector<BaseFloat>* > processed_waveform_;
-  int64 num_samples_discarded_;
-
-  // This object is used to control the (optional) downweighting of silence in iVector estimation,
-  // which is based on the decoder traceback.
-  OnlineSilenceWeighting silence_weighting_;
-  std::mutex silence_weighting_mutex_;
-
-
-  // this Decodable object just stores a matrix of scaled log-likelihoods
-  // obtained by the nnet-evaluation thread.  It is produced by the
-  // nnet-evaluation thread and consumed by the decoder-search thread.  The
-  // decoding thread sets num_frames_decoded_ so the nnet-evaluation thread
-  // knows which frames of log-likelihoods it can discard.  Both of these
-  // variables are guarded by decodable_synchronizer_.  Note:
-  // the num_frames_decoded_ may be less than the current number of frames
-  // the decoder has decoded; the decoder thread sets this variable when it
-  // locks this mutex.
-  DecodableMatrixMappedOffset decodable_;
-  int32 num_frames_decoded_;
-  ThreadSynchronizer decodable_synchronizer_;
-
-  // the decoder_ object contains everything related to the graph search.
-  LatticeFasterOnlineDecoder decoder_;
-  // decoder_mutex_ guards the decoder_ object.  It is usually held by the decoding
-  // thread (where it is released and re-obtained on each frame), but is obtained
-  // by the main (parent) thread if you call functions like NumFramesDecoded(),
-  // GetLattice() and GetBestPath().
-  mutable std::mutex decoder_mutex_;  // declared as mutable because we mutate
-                                      // this mutex in const methods
-
-  // This contains the thread pointers for the nnet-evaluation and
-  // decoder-search threads respectively (or NULL if they have been joined in
-  // Wait()).
-  std::thread threads_[2];
-
-  // This is set to true if AbortAllThreads was called for any reason, including
-  // if someone called TerminateDecoding().
-  bool abort_;
-
-  // This is set to true if any kind of unexpected error is encountered,
-  // including if exceptions are raised in any of the threads.  Will normally
-  // be a coding error, malloc failure-- something we should never encounter.
-  bool error_;
-
-};
-
-
-/// @} End of "addtogroup onlinedecoding"
-
-}  // namespace kaldi
-
-
-
-#endif  // KALDI_ONLINE2_ONLINE_NNET2_DECODING_THREADED_H_
diff --git a/src/online2/online-nnet2-decoding.h b/src/online2/online-nnet2-decoding.h
deleted file mode 100644
index b185b8b69f8..00000000000
--- a/src/online2/online-nnet2-decoding.h
+++ /dev/null
@@ -1,131 +0,0 @@
-// online2/online-nnet2-decoding.h
-
-// Copyright 2014  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_ONLINE2_ONLINE_NNET2_DECODING_H_
-#define KALDI_ONLINE2_ONLINE_NNET2_DECODING_H_
-
-#include <string>
-#include <vector>
-#include <deque>
-
-#include "matrix/matrix-lib.h"
-#include "util/common-utils.h"
-#include "base/kaldi-error.h"
-#include "nnet2/online-nnet2-decodable.h"
-#include "itf/online-feature-itf.h"
-#include "online2/online-endpoint.h"
-#include "decoder/lattice-faster-online-decoder.h"
-#include "hmm/transitions.h"
-#include "hmm/posterior.h"
-
-namespace kaldi {
-/// @addtogroup  onlinedecoding OnlineDecoding
-/// @{
-
-
-
-
-
-// This configuration class contains the configuration classes needed to create
-// the class SingleUtteranceNnet2Decoder.  The actual command line program
-// requires other configs that it creates separately, and which are not included
-// here: namely, OnlineNnet2FeaturePipelineConfig and OnlineEndpointConfig.
-struct OnlineNnet2DecodingConfig {
-  
-  LatticeFasterDecoderConfig decoder_opts;
-  nnet2::DecodableNnet2OnlineOptions decodable_opts;
-  
-  OnlineNnet2DecodingConfig() {  decodable_opts.acoustic_scale = 0.1; }
-  
-  void Register(OptionsItf *opts) {
-    decoder_opts.Register(opts);
-    decodable_opts.Register(opts);
-  }
-};
-
-/**
-   You will instantiate this class when you want to decode a single
-   utterance using the online-decoding setup for neural nets.
-*/
-class SingleUtteranceNnet2Decoder {
- public:
-  // Constructor.  The feature_pipeline_ pointer is not owned in this
-  // class, it's owned externally.
-  SingleUtteranceNnet2Decoder(const OnlineNnet2DecodingConfig &config,
-                              const Transitions &tmodel,
-                              const nnet2::AmNnet &model,
-                              const fst::Fst<fst::StdArc> &fst,
-                              OnlineFeatureInterface *feature_pipeline);
-  
-  /// advance the decoding as far as we can.
-  void AdvanceDecoding();
-
-  /// Finalizes the decoding. Cleans up and prunes remaining tokens, so the
-  /// GetLattice() call will return faster.  You must not call this before
-  /// calling (TerminateDecoding() or InputIsFinished()) and then Wait().
-  void FinalizeDecoding();
-
-  int32 NumFramesDecoded() const;
-  
-  /// Gets the lattice.  The output lattice has any acoustic scaling in it
-  /// (which will typically be desirable in an online-decoding context); if you
-  /// want an un-scaled lattice, scale it using ScaleLattice() with the inverse
-  /// of the acoustic weight.  "end_of_utterance" will be true if you want the
-  /// final-probs to be included.
-  void GetLattice(bool end_of_utterance,
-                  CompactLattice *clat) const;
-  
-  /// Outputs an FST corresponding to the single best path through the current
-  /// lattice. If "use_final_probs" is true AND we reached the final-state of
-  /// the graph then it will include those as final-probs, else it will treat
-  /// all final-probs as one.
-  void GetBestPath(bool end_of_utterance,
-                   Lattice *best_path) const;
-
-
-  /// This function calls EndpointDetected from online-endpoint.h,
-  /// with the required arguments.
-  bool EndpointDetected(const OnlineEndpointConfig &config);
-
-  const LatticeFasterOnlineDecoder &Decoder() const { return decoder_; }
-  
-  ~SingleUtteranceNnet2Decoder() { }
- private:
-
-  OnlineNnet2DecodingConfig config_;
-
-  OnlineFeatureInterface *feature_pipeline_;
-
-  const Transitions &tmodel_;
-  
-  nnet2::DecodableNnet2Online decodable_;
-  
-  LatticeFasterOnlineDecoder decoder_;
-  
-};
-
-  
-/// @} End of "addtogroup onlinedecoding"
-
-}  // namespace kaldi
-
-
-
-#endif  // KALDI_ONLINE2_ONLINE_NNET2_DECODING_H_
diff --git a/src/online2/online-nnet2-feature-pipeline.h b/src/online2/online-nnet2-feature-pipeline.h
index 2e3fbf7bd78..30de9b6c70e 100644
--- a/src/online2/online-nnet2-feature-pipeline.h
+++ b/src/online2/online-nnet2-feature-pipeline.h
@@ -41,7 +41,7 @@ namespace kaldi {
 /// \ref online-feature-pipeline.h, specialized for use in neural network
 /// decoding with iVectors.  Our recipe is that we extract iVectors that will
 /// be used as an additional input to the neural network, in addition to
-/// a window of several frames of spliced raw features (MFCC, PLP or filterbanks).
+/// a window of several frames of spliced raw features (MFCC or filterbanks).
 /// The iVectors are extracted on top of a (splice+LDA+MLLT) feature pipeline,
 /// with the added complication that the GMM posteriors used for the iVector
 /// extraction are obtained with a version of the features that has online
@@ -62,9 +62,8 @@ namespace kaldi {
 /// Instead of taking the options for the parts of the feature pipeline
 /// directly, it reads in the names of configuration classes.
 struct OnlineNnet2FeaturePipelineConfig {
-  std::string feature_type;  // "plp" or "mfcc" or "fbank"
+  std::string feature_type;  // "mfcc" or "fbank"
   std::string mfcc_config;
-  std::string plp_config;
   std::string fbank_config;
 
   // Note: if we do add pitch, it will not be added to the features we give to
@@ -94,15 +93,13 @@ struct OnlineNnet2FeaturePipelineConfig {
 
   void Register(OptionsItf *opts) {
     opts->Register("feature-type", &feature_type,
-                   "Base feature type [mfcc, plp, fbank]");
+                   "Base feature type [mfcc, fbank]");
     opts->Register("mfcc-config", &mfcc_config, "Configuration file for "
                    "MFCC features (e.g. conf/mfcc.conf)");
-    opts->Register("plp-config", &plp_config, "Configuration file for "
-                   "PLP features (e.g. conf/plp.conf)");
     opts->Register("fbank-config", &fbank_config, "Configuration file for "
                    "filterbank features (e.g. conf/fbank.conf)");
     opts->Register("add-pitch", &add_pitch, "Append pitch features to raw "
-                   "MFCC/PLP/filterbank features [but not for iVector extraction]");
+                   "MFCC/filterbank features [but not for iVector extraction]");
     opts->Register("online-pitch-config", &online_pitch_config, "Configuration "
                    "file for online pitch features, if --add-pitch=true (e.g. "
                    "conf/online_pitch.conf)");
@@ -131,11 +128,10 @@ struct OnlineNnet2FeaturePipelineInfo {
 
   BaseFloat FrameShiftInSeconds() const;
 
-  std::string feature_type;  // "mfcc" or "plp" or "fbank"
+  std::string feature_type;  // "mfcc" or "fbank"
 
   MfccOptions mfcc_opts;  // options for MFCC computation,
                           // if feature_type == "mfcc"
-  PlpOptions plp_opts;  // Options for PLP computation, if feature_type == "plp"
   FbankOptions fbank_opts;  // Options for filterbank computation, if
                             // feature_type == "fbank"
 
@@ -167,7 +163,7 @@ struct OnlineNnet2FeaturePipelineInfo {
 /// OnlineNnet2FeaturePipeline is a class that's responsible for putting
 /// together the various parts of the feature-processing pipeline for neural
 /// networks, in an online setting.  The recipe here does not include fMLLR;
-/// instead, it assumes we're giving raw features such as MFCC or PLP or
+/// instead, it assumes we're giving raw features such as MFCC or
 /// filterbank (with no CMVN) to the neural network, and optionally augmenting
 /// these with an iVector that describes the speaker characteristics.  The
 /// iVector is extracted using class OnlineIvectorFeature (see that class for
@@ -269,7 +265,7 @@ class OnlineNnet2FeaturePipeline: public OnlineFeatureInterface {
 
   const OnlineNnet2FeaturePipelineInfo &info_;
 
-  OnlineBaseFeature *base_feature_;        // MFCC/PLP/filterbank
+  OnlineBaseFeature *base_feature_;        // MFCC/filterbank
 
   OnlinePitchFeature *pitch_;              // Raw pitch, if used
   OnlineProcessPitch *pitch_feature_;  // Processed pitch, if pitch used.
diff --git a/src/online2/online2-feature-pipeline.cc b/src/online2/online2-feature-pipeline.cc
index c495c9fc8ef..78a8ae7bb6a 100644
--- a/src/online2/online2-feature-pipeline.cc
+++ b/src/online2/online2-feature-pipeline.cc
@@ -25,12 +25,12 @@ namespace kaldi {
 OnlineNnet2FeaturePipelineInfo::OnlineNnet2FeaturePipelineInfo(
     const OnlineNnet2FeaturePipelineConfig &config):
     silence_weighting_config(config.silence_weighting_config) {
-  if (config.feature_type == "mfcc" || config.feature_type == "plp" ||
+  if (config.feature_type == "mfcc" ||
       config.feature_type == "fbank") {
     feature_type = config.feature_type;
   } else {
     KALDI_ERR << "Invalid feature type: " << config.feature_type << ". "
-              << "Supported feature types: mfcc, plp.";
+              << "Supported feature types: mfcc, fbank.";
   }
 
   if (config.mfcc_config != "") {
@@ -40,13 +40,6 @@ OnlineNnet2FeaturePipelineInfo::OnlineNnet2FeaturePipelineInfo(
                  << "since feature type is set to " << feature_type << ".";
   }  // else use the defaults.
 
-  if (config.plp_config != "") {
-    ReadConfigFromFile(config.plp_config, &plp_opts);
-    if (feature_type != "plp")
-      KALDI_WARN << "--plp-config option has no effect "
-                 << "since feature type is set to " << feature_type << ".";
-  }  // else use the defaults.
-
   if (config.fbank_config != "") {
     ReadConfigFromFile(config.fbank_config, &fbank_opts);
     if (feature_type != "fbank")
@@ -81,8 +74,6 @@ OnlineNnet2FeaturePipeline::OnlineNnet2FeaturePipeline(
     info_(info) {
   if (info_.feature_type == "mfcc") {
     base_feature_ = new OnlineMfcc(info_.mfcc_opts);
-  } else if (info_.feature_type == "plp") {
-    base_feature_ = new OnlinePlp(info_.plp_opts);
   } else if (info_.feature_type == "fbank") {
     base_feature_ = new OnlineFbank(info_.fbank_opts);
   } else {
@@ -194,8 +185,6 @@ BaseFloat OnlineNnet2FeaturePipelineInfo::FrameShiftInSeconds() const {
     return mfcc_opts.frame_opts.frame_shift_ms / 1000.0f;
   } else if (feature_type == "fbank") {
     return fbank_opts.frame_opts.frame_shift_ms / 1000.0f;
-  } else if (feature_type == "plp") {
-    return plp_opts.frame_opts.frame_shift_ms / 1000.0f;
   } else {
     KALDI_ERR << "Unknown feature type " << feature_type;
     return 0.0;
diff --git a/src/online2bin/Makefile b/src/online2bin/Makefile
index 28c135eb950..024ab652320 100644
--- a/src/online2bin/Makefile
+++ b/src/online2bin/Makefile
@@ -8,9 +8,8 @@ LDLIBS += $(CUDA_LDLIBS)
 
 BINFILES = online2-wav-gmm-latgen-faster apply-cmvn-online \
      extend-wav-with-silence compress-uncompress-speex \
-     online2-wav-nnet2-latgen-faster ivector-extract-online2 \
+     ivector-extract-online2 \
      online2-wav-dump-features ivector-randomize \
-     online2-wav-nnet2-am-compute  online2-wav-nnet2-latgen-threaded \
      online2-wav-nnet3-latgen-faster online2-wav-nnet3-latgen-grammar \
      online2-tcp-nnet3-decode-faster
 
@@ -19,7 +18,7 @@ OBJFILES =
 TESTFILES =
 
 ADDLIBS = ../online2/kaldi-online2.a ../ivector/kaldi-ivector.a \
-          ../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a ../nnet2/kaldi-nnet2.a \
+          ../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a \
           ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \
           ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
           ../feat/kaldi-feat.a ../transform/kaldi-transform.a \
diff --git a/src/online2bin/online2-tcp-nnet3-decode-faster.cc b/src/online2bin/online2-tcp-nnet3-decode-faster.cc
index 46e9cbc05be..7e99bc9a840 100644
--- a/src/online2bin/online2-tcp-nnet3-decode-faster.cc
+++ b/src/online2bin/online2-tcp-nnet3-decode-faster.cc
@@ -166,7 +166,7 @@ int main(int argc, char *argv[]) {
 
     KALDI_VLOG(1) << "Loading AM...";
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     nnet3::AmNnetSimple am_nnet;
     {
       bool binary;
@@ -439,4 +439,4 @@ void TcpServer::Disconnect() {
     client_desc_ = -1;
   }
 }
-}  // namespace kaldi
\ No newline at end of file
+}  // namespace kaldi
diff --git a/src/online2bin/online2-wav-dump-features.cc b/src/online2bin/online2-wav-dump-features.cc
index 6ddd3bf83e5..137d9523a4e 100644
--- a/src/online2bin/online2-wav-dump-features.cc
+++ b/src/online2bin/online2-wav-dump-features.cc
@@ -18,7 +18,6 @@
 // limitations under the License.
 
 #include "feat/wave-reader.h"
-#include "online2/online-nnet2-decoding.h"
 #include "online2/online-nnet2-feature-pipeline.h"
 #include "online2/onlinebin-util.h"
 #include "online2/online-timing.h"
diff --git a/src/online2bin/online2-wav-nnet2-am-compute.cc b/src/online2bin/online2-wav-nnet2-am-compute.cc
deleted file mode 100644
index b41351b4d35..00000000000
--- a/src/online2bin/online2-wav-nnet2-am-compute.cc
+++ /dev/null
@@ -1,200 +0,0 @@
-// online2bin/online2-wav-nnet2-am-compute.cc
-
-// Copyright 2014  Johns Hopkins University (author: Daniel Povey)
-//           2014  David Snyder
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "feat/wave-reader.h"
-#include "online2/online-nnet2-decoding.h"
-#include "online2/online-nnet2-feature-pipeline.h"
-#include "online2/onlinebin-util.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace kaldi::nnet2;
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-    
-    const char *usage =
-        "Simulates the online neural net computation for each file of input\n" 
-        "features, and outputs as a matrix the result, with optional\n"
-        "iVector-based speaker adaptation. Note: some configuration values\n"
-        "and inputs are set via config files whose filenames are passed as\n"
-        "options.  Used mostly for debugging.\n"
-        "Note: if you want it to apply a log (e.g. for log-likelihoods), use\n"
-        "--apply-log=true.\n"
-        "\n"
-        "Usage:  online2-wav-nnet2-am-compute [options] <nnet-in>\n"
-        "<spk2utt-rspecifier> <wav-rspecifier> <feature-or-loglikes-wspecifier>\n"
-        "The spk2utt-rspecifier can just be <utterance-id> <utterance-id> if\n"
-        "you want to compute utterance by utterance.\n";
-    
-    BaseFloat chunk_length_secs = 0.05;
-    bool apply_log = false;
-    bool pad_input = true;
-    bool online = true;
-
-    // feature_config includes configuration for the iVector adaptation,
-    // as well as the basic features.
-    OnlineNnet2FeaturePipelineConfig feature_config;  
-    ParseOptions po(usage);
-    po.Register("apply-log", &apply_log, "Apply a log to the result of the computation "
-                "before outputting.");
-    po.Register("pad-input", &pad_input, "If true, duplicate the first and last frames "
-                "of input features as required for temporal context, to prevent #frames "
-                "of output being less than those of input.");
-    po.Register("chunk-length", &chunk_length_secs,
-                "Length of chunk size in seconds, that we process.");
-    po.Register("online", &online,
-                "You can set this to false to disable online iVector estimation "
-                "and have all the data for each utterance used, even at "
-                "utterance start.  This is useful where you just want the best "
-                "results and don't care about online operation.  Setting this to "
-                "false has the same effect as setting "
-                "--use-most-recent-ivector=true and --greedy-ivector-extractor=true "
-                "in the file given to --ivector-extraction-config, and "
-                "--chunk-length=-1.");
-    
-    feature_config.Register(&po);
-    po.Read(argc, argv);
-    if (po.NumArgs() != 4) {
-      po.PrintUsage();
-      return 1;
-    }
-    
-    std::string nnet2_rxfilename = po.GetArg(1),
-        spk2utt_rspecifier = po.GetArg(2),
-        wav_rspecifier = po.GetArg(3),
-        features_or_loglikes_wspecifier = po.GetArg(4);
-    
-    OnlineNnet2FeaturePipelineInfo feature_info(feature_config);
-    if (!online) {
-      feature_info.ivector_extractor_info.use_most_recent_ivector = true;
-      feature_info.ivector_extractor_info.greedy_ivector_extractor = true;
-      chunk_length_secs = -1.0;
-    }
-
-    Transitions trans_model;
-    AmNnet am_nnet;
-    {
-      bool binary;
-      Input ki(nnet2_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_nnet.Read(ki.Stream(), binary);
-    }
-    Nnet &nnet = am_nnet.GetNnet();
-    
-    int64 num_done = 0, num_frames = 0;
-    SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
-    RandomAccessTableReader<WaveHolder> wav_reader(wav_rspecifier);
-    BaseFloatCuMatrixWriter writer(features_or_loglikes_wspecifier);
-    
-    for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
-      std::string spk = spk2utt_reader.Key();
-      const std::vector<std::string> &uttlist = spk2utt_reader.Value();
-      OnlineIvectorExtractorAdaptationState adaptation_state(
-          feature_info.ivector_extractor_info);
-      for (size_t i = 0; i < uttlist.size(); i++) {
-        std::string utt = uttlist[i];
-        if (!wav_reader.HasKey(utt)) {
-          KALDI_WARN << "Did not find audio for utterance " << utt;
-          continue;
-        }
-        const WaveData &wave_data = wav_reader.Value(utt);
-        // get the data for channel zero (if the signal is not mono, we only
-        // take the first channel).
-        SubVector<BaseFloat> data(wave_data.Data(), 0);
-
-        OnlineNnet2FeaturePipeline feature_pipeline(feature_info);
-        feature_pipeline.SetAdaptationState(adaptation_state);
-        
-        BaseFloat samp_freq = wave_data.SampFreq();
-        int32 chunk_length;
-        if (chunk_length_secs > 0) {
-          chunk_length = int32(samp_freq * chunk_length_secs);
-          if (chunk_length == 0) chunk_length = 1;
-        } else {
-          chunk_length = std::numeric_limits<int32>::max();
-        }
-        
-        int32 samp_offset = 0;
-        while (samp_offset < data.Dim()) {
-          int32 samp_remaining = data.Dim() - samp_offset;
-          int32 num_samp = chunk_length < samp_remaining ? chunk_length
-                                                         : samp_remaining;
-          
-          SubVector<BaseFloat> wave_part(data, samp_offset, num_samp);
-          feature_pipeline.AcceptWaveform(samp_freq, wave_part);
-          
-          samp_offset += num_samp;
-          if (samp_offset == data.Dim()) {
-            // no more input. flush out last frames
-            feature_pipeline.InputFinished();
-          }
-        }
-        
-        int32 feats_num_frames = feature_pipeline.NumFramesReady(),
-              feats_dim = feature_pipeline.Dim();
-        Matrix<BaseFloat> feats(feats_num_frames, feats_dim);
-
-        for (int32 i = 0; i < feats_num_frames; i++) {
-          SubVector<BaseFloat> frame_vector(feats, i);
-          feature_pipeline.GetFrame(i, &frame_vector);
-        }
-
-        // In an application you might avoid updating the adaptation state if
-        // you felt the utterance had low confidence.  See lat/confidence.h
-        feature_pipeline.GetAdaptationState(&adaptation_state);
-
-        int32 output_frames = feats.NumRows(), 
-              output_dim = nnet.OutputDim();
-        CuMatrix<BaseFloat> output(output_frames, output_dim),
-                            feats_cu(feats);
-
-        if (!pad_input)
-          output_frames -= nnet.LeftContext() + nnet.RightContext();
-        if (output_frames <= 0) {
-          KALDI_WARN << "Skipping utterance " << utt << " because output "
-                     << "would be empty.";
-          continue;
-        }
-        
-        NnetComputation(nnet, feats_cu, pad_input, &output);
-
-        if (apply_log) {
-          output.ApplyFloor(1.0e-20);
-          output.ApplyLog();
-        }
-
-        writer.Write(utt, output);
-        num_frames += feats.NumRows();
-        num_done++;
-
-        KALDI_LOG << "Processed data for utterance " << utt;
-      }
-    }
-
-    KALDI_LOG << "Processed " << num_done << " feature files, "
-              << num_frames << " frames of input were processed.";
-
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception& e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-} // main()
diff --git a/src/online2bin/online2-wav-nnet2-latgen-faster.cc b/src/online2bin/online2-wav-nnet2-latgen-faster.cc
deleted file mode 100644
index 18f0064adea..00000000000
--- a/src/online2bin/online2-wav-nnet2-latgen-faster.cc
+++ /dev/null
@@ -1,293 +0,0 @@
-// online2bin/online2-wav-nnet2-latgen-faster.cc
-
-// Copyright 2014  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "feat/wave-reader.h"
-#include "online2/online-nnet2-decoding.h"
-#include "online2/online-nnet2-feature-pipeline.h"
-#include "online2/onlinebin-util.h"
-#include "online2/online-timing.h"
-#include "online2/online-endpoint.h"
-#include "fstext/fstext-lib.h"
-#include "lat/lattice-functions.h"
-#include "util/kaldi-thread.h"
-
-namespace kaldi {
-
-void GetDiagnosticsAndPrintOutput(const std::string &utt,
-                                  const fst::SymbolTable *word_syms,
-                                  const CompactLattice &clat,
-                                  int64 *tot_num_frames,
-                                  double *tot_like) {
-  if (clat.NumStates() == 0) {
-    KALDI_WARN << "Empty lattice.";
-    return;
-  }
-  CompactLattice best_path_clat;
-  CompactLatticeShortestPath(clat, &best_path_clat);
-
-  Lattice best_path_lat;
-  ConvertLattice(best_path_clat, &best_path_lat);
-
-  double likelihood;
-  LatticeWeight weight;
-  int32 num_frames;
-  std::vector<int32> alignment;
-  std::vector<int32> words;
-  GetLinearSymbolSequence(best_path_lat, &alignment, &words, &weight);
-  num_frames = alignment.size();
-  likelihood = -(weight.Value1() + weight.Value2());
-  *tot_num_frames += num_frames;
-  *tot_like += likelihood;
-  KALDI_VLOG(2) << "Likelihood per frame for utterance " << utt << " is "
-                << (likelihood / num_frames) << " over " << num_frames
-                << " frames.";
-
-  if (word_syms != NULL) {
-    std::cerr << utt << ' ';
-    for (size_t i = 0; i < words.size(); i++) {
-      std::string s = word_syms->Find(words[i]);
-      if (s == "")
-        KALDI_ERR << "Word-id " << words[i] << " not in symbol table.";
-      std::cerr << s << ' ';
-    }
-    std::cerr << std::endl;
-  }
-}
-
-}
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace fst;
-
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-
-    const char *usage =
-        "Reads in wav file(s) and simulates online decoding with neural nets\n"
-        "(nnet2 setup), with optional iVector-based speaker adaptation and\n"
-        "optional endpointing.  Note: some configuration values and inputs are\n"
-        "set via config files whose filenames are passed as options\n"
-        "\n"
-        "Usage: online2-wav-nnet2-latgen-faster [options] <nnet2-in> <fst-in> "
-        "<spk2utt-rspecifier> <wav-rspecifier> <lattice-wspecifier>\n"
-        "The spk2utt-rspecifier can just be <utterance-id> <utterance-id> if\n"
-        "you want to decode utterance by utterance.\n"
-        "See egs/rm/s5/local/run_online_decoding_nnet2.sh for example\n"
-        "See also online2-wav-nnet2-latgen-threaded\n";
-
-    ParseOptions po(usage);
-
-    std::string word_syms_rxfilename;
-
-    OnlineEndpointConfig endpoint_config;
-
-    // feature_config includes configuration for the iVector adaptation,
-    // as well as the basic features.
-    OnlineNnet2FeaturePipelineConfig feature_config;
-    OnlineNnet2DecodingConfig nnet2_decoding_config;
-
-    BaseFloat chunk_length_secs = 0.05;
-    bool do_endpointing = false;
-    bool online = true;
-
-    po.Register("chunk-length", &chunk_length_secs,
-                "Length of chunk size in seconds, that we process.  Set to <= 0 "
-                "to use all input in one chunk.");
-    po.Register("word-symbol-table", &word_syms_rxfilename,
-                "Symbol table for words [for debug output]");
-    po.Register("do-endpointing", &do_endpointing,
-                "If true, apply endpoint detection");
-    po.Register("online", &online,
-                "You can set this to false to disable online iVector estimation "
-                "and have all the data for each utterance used, even at "
-                "utterance start.  This is useful where you just want the best "
-                "results and don't care about online operation.  Setting this to "
-                "false has the same effect as setting "
-                "--use-most-recent-ivector=true and --greedy-ivector-extractor=true "
-                "in the file given to --ivector-extraction-config, and "
-                "--chunk-length=-1.");
-    po.Register("num-threads-startup", &g_num_threads,
-                "Number of threads used when initializing iVector extractor.");
-
-    feature_config.Register(&po);
-    nnet2_decoding_config.Register(&po);
-    endpoint_config.Register(&po);
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 5) {
-      po.PrintUsage();
-      return 1;
-    }
-
-    std::string nnet2_rxfilename = po.GetArg(1),
-        fst_rxfilename = po.GetArg(2),
-        spk2utt_rspecifier = po.GetArg(3),
-        wav_rspecifier = po.GetArg(4),
-        clat_wspecifier = po.GetArg(5);
-
-    OnlineNnet2FeaturePipelineInfo feature_info(feature_config);
-
-    if (!online) {
-      feature_info.ivector_extractor_info.use_most_recent_ivector = true;
-      feature_info.ivector_extractor_info.greedy_ivector_extractor = true;
-      chunk_length_secs = -1.0;
-    }
-
-    Transitions trans_model;
-    nnet2::AmNnet nnet;
-    {
-      bool binary;
-      Input ki(nnet2_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      nnet.Read(ki.Stream(), binary);
-    }
-
-    fst::Fst<fst::StdArc> *decode_fst = ReadFstKaldiGeneric(fst_rxfilename);
-
-    fst::SymbolTable *word_syms = NULL;
-    if (word_syms_rxfilename != "")
-      if (!(word_syms = fst::SymbolTable::ReadText(word_syms_rxfilename)))
-        KALDI_ERR << "Could not read symbol table from file "
-                  << word_syms_rxfilename;
-
-    int32 num_done = 0, num_err = 0;
-    double tot_like = 0.0;
-    int64 num_frames = 0;
-
-    SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
-    RandomAccessTableReader<WaveHolder> wav_reader(wav_rspecifier);
-    CompactLatticeWriter clat_writer(clat_wspecifier);
-
-    OnlineTimingStats timing_stats;
-
-    for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
-      std::string spk = spk2utt_reader.Key();
-      const std::vector<std::string> &uttlist = spk2utt_reader.Value();
-      OnlineIvectorExtractorAdaptationState adaptation_state(
-          feature_info.ivector_extractor_info);
-      for (size_t i = 0; i < uttlist.size(); i++) {
-        std::string utt = uttlist[i];
-        if (!wav_reader.HasKey(utt)) {
-          KALDI_WARN << "Did not find audio for utterance " << utt;
-          num_err++;
-          continue;
-        }
-        const WaveData &wave_data = wav_reader.Value(utt);
-        // get the data for channel zero (if the signal is not mono, we only
-        // take the first channel).
-        SubVector<BaseFloat> data(wave_data.Data(), 0);
-
-        OnlineNnet2FeaturePipeline feature_pipeline(feature_info);
-        feature_pipeline.SetAdaptationState(adaptation_state);
-
-        OnlineSilenceWeighting silence_weighting(
-            trans_model,
-            feature_info.silence_weighting_config);
-
-        SingleUtteranceNnet2Decoder decoder(nnet2_decoding_config,
-                                            trans_model,
-                                            nnet,
-                                            *decode_fst,
-                                            &feature_pipeline);
-        OnlineTimer decoding_timer(utt);
-
-        BaseFloat samp_freq = wave_data.SampFreq();
-        int32 chunk_length;
-        if (chunk_length_secs > 0) {
-          chunk_length = int32(samp_freq * chunk_length_secs);
-          if (chunk_length == 0) chunk_length = 1;
-        } else {
-          chunk_length = std::numeric_limits<int32>::max();
-        }
-
-        int32 samp_offset = 0;
-        std::vector<std::pair<int32, BaseFloat> > delta_weights;
-
-        while (samp_offset < data.Dim()) {
-          int32 samp_remaining = data.Dim() - samp_offset;
-          int32 num_samp = chunk_length < samp_remaining ? chunk_length
-                                                         : samp_remaining;
-
-          SubVector<BaseFloat> wave_part(data, samp_offset, num_samp);
-          feature_pipeline.AcceptWaveform(samp_freq, wave_part);
-
-          samp_offset += num_samp;
-          decoding_timer.WaitUntil(samp_offset / samp_freq);
-          if (samp_offset == data.Dim()) {
-            // no more input. flush out last frames
-            feature_pipeline.InputFinished();
-          }
-
-          if (silence_weighting.Active() &&
-              feature_pipeline.IvectorFeature() != NULL) {
-            silence_weighting.ComputeCurrentTraceback(decoder.Decoder());
-            silence_weighting.GetDeltaWeights(
-                feature_pipeline.IvectorFeature()->NumFramesReady(),
-                &delta_weights);
-            feature_pipeline.IvectorFeature()->UpdateFrameWeights(
-                delta_weights);
-          }
-
-          decoder.AdvanceDecoding();
-
-          if (do_endpointing && decoder.EndpointDetected(endpoint_config))
-            break;
-        }
-        decoder.FinalizeDecoding();
-
-        CompactLattice clat;
-        bool end_of_utterance = true;
-        decoder.GetLattice(end_of_utterance, &clat);
-
-        GetDiagnosticsAndPrintOutput(utt, word_syms, clat,
-                                     &num_frames, &tot_like);
-
-        decoding_timer.OutputStats(&timing_stats);
-
-        // In an application you might avoid updating the adaptation state if
-        // you felt the utterance had low confidence.  See lat/confidence.h
-        feature_pipeline.GetAdaptationState(&adaptation_state);
-
-        // we want to output the lattice with un-scaled acoustics.
-        BaseFloat inv_acoustic_scale =
-            1.0 / nnet2_decoding_config.decodable_opts.acoustic_scale;
-        ScaleLattice(AcousticLatticeScale(inv_acoustic_scale), &clat);
-
-        clat_writer.Write(utt, clat);
-        KALDI_LOG << "Decoded utterance " << utt;
-        num_done++;
-      }
-    }
-    timing_stats.Print(online);
-
-    KALDI_LOG << "Decoded " << num_done << " utterances, "
-              << num_err << " with errors.";
-    KALDI_LOG << "Overall likelihood per frame was " << (tot_like / num_frames)
-              << " per frame over " << num_frames << " frames.";
-    delete decode_fst;
-    delete word_syms; // will delete if non-NULL.
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception& e) {
-    std::cerr << e.what();
-    return -1;
-  }
-} // main()
diff --git a/src/online2bin/online2-wav-nnet2-latgen-threaded.cc b/src/online2bin/online2-wav-nnet2-latgen-threaded.cc
deleted file mode 100644
index a61d2a8ba6f..00000000000
--- a/src/online2bin/online2-wav-nnet2-latgen-threaded.cc
+++ /dev/null
@@ -1,312 +0,0 @@
-// online2bin/online2-wav-nnet2-latgen-threaded.cc
-
-// Copyright 2014-2015  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "feat/wave-reader.h"
-#include "online2/online-nnet2-decoding-threaded.h"
-#include "online2/online-nnet2-feature-pipeline.h"
-#include "online2/onlinebin-util.h"
-#include "online2/online-timing.h"
-#include "online2/online-endpoint.h"
-#include "fstext/fstext-lib.h"
-#include "lat/lattice-functions.h"
-#include "util/kaldi-thread.h"
-
-namespace kaldi {
-
-void GetDiagnosticsAndPrintOutput(const std::string &utt,
-                                  const fst::SymbolTable *word_syms,
-                                  const CompactLattice &clat,
-                                  int64 *tot_num_frames,
-                                  double *tot_like) {
-  if (clat.NumStates() == 0) {
-    KALDI_WARN << "Empty lattice.";
-    return;
-  }
-  CompactLattice best_path_clat;
-  CompactLatticeShortestPath(clat, &best_path_clat);
-  
-  Lattice best_path_lat;
-  ConvertLattice(best_path_clat, &best_path_lat);
-  
-  double likelihood;
-  LatticeWeight weight;
-  int32 num_frames;
-  std::vector<int32> alignment;
-  std::vector<int32> words;
-  GetLinearSymbolSequence(best_path_lat, &alignment, &words, &weight);
-  num_frames = alignment.size();
-  likelihood = -(weight.Value1() + weight.Value2());
-  *tot_num_frames += num_frames;
-  *tot_like += likelihood;
-  KALDI_VLOG(2) << "Likelihood per frame for utterance " << utt << " is "
-                << (likelihood / num_frames) << " over " << num_frames
-                << " frames.";
-             
-  if (word_syms != NULL) {
-    std::cerr << utt << ' ';
-    for (size_t i = 0; i < words.size(); i++) {
-      std::string s = word_syms->Find(words[i]);
-      if (s == "")
-        KALDI_ERR << "Word-id " << words[i] << " not in symbol table.";
-      std::cerr << s << ' ';
-    }
-    std::cerr << std::endl;
-  }
-}
-
-}
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    using namespace fst;
-    
-    typedef kaldi::int32 int32;
-    typedef kaldi::int64 int64;
-    
-    const char *usage =
-        "Reads in wav file(s) and simulates online decoding with neural nets\n"
-        "(nnet2 setup), with optional iVector-based speaker adaptation and\n"
-        "optional endpointing.  This version uses multiple threads for decoding.\n"
-        "Note: some configuration values and inputs are set via config files\n"
-        "whose filenames are passed as options\n"
-        "\n"
-        "Usage: online2-wav-nnet2-latgen-threaded [options] <nnet2-in> <fst-in> "
-        "<spk2utt-rspecifier> <wav-rspecifier> <lattice-wspecifier>\n"
-        "The spk2utt-rspecifier can just be <utterance-id> <utterance-id> if\n"
-        "you want to decode utterance by utterance.\n"
-        "See egs/rm/s5/local/run_online_decoding_nnet2.sh for example\n"
-        "See also online2-wav-nnet2-latgen-faster\n";
-    
-    ParseOptions po(usage);
-    
-    std::string word_syms_rxfilename;
-    
-    OnlineEndpointConfig endpoint_config;
-
-    // feature_config includes configuration for the iVector adaptation,
-    // as well as the basic features.
-    OnlineNnet2FeaturePipelineConfig feature_config;  
-    OnlineNnet2DecodingThreadedConfig nnet2_decoding_config;
-    
-    BaseFloat chunk_length_secs = 0.05;
-    bool do_endpointing = false;
-    bool modify_ivector_config = false;
-    bool simulate_realtime_decoding = true;
-    
-    po.Register("chunk-length", &chunk_length_secs,
-                "Length of chunk size in seconds, that we provide each time to the "
-                "decoder.  The actual chunk sizes it processes for various stages "
-                "of decoding are dynamically determinated, and unrelated to this");
-    po.Register("word-symbol-table", &word_syms_rxfilename,
-                "Symbol table for words [for debug output]");
-    po.Register("do-endpointing", &do_endpointing,
-                "If true, apply endpoint detection");
-    po.Register("modify-ivector-config", &modify_ivector_config,
-                "If true, modifies the iVector configuration from the config files "
-                "by setting --use-most-recent-ivector=true and --greedy-ivector-extractor=true. "
-                "This will give the best possible results, but the results may become dependent "
-                "on the speed of your machine (slower machine -> better results).  Compare "
-                "to the --online option in online2-wav-nnet2-latgen-faster");
-    po.Register("simulate-realtime-decoding", &simulate_realtime_decoding,
-                "If true, simulate real-time decoding scenario by providing the "
-                "data incrementally, calling sleep() until each piece is ready. "
-                "If false, don't sleep (so it will be faster).");
-    po.Register("num-threads-startup", &g_num_threads,
-                "Number of threads used when initializing iVector extractor.  ");
-    
-    feature_config.Register(&po);
-    nnet2_decoding_config.Register(&po);
-    endpoint_config.Register(&po);
-    
-    po.Read(argc, argv);
-    
-    if (po.NumArgs() != 5) {
-      po.PrintUsage();
-      return 1;
-    }
-    
-    std::string nnet2_rxfilename = po.GetArg(1),
-        fst_rxfilename = po.GetArg(2),
-        spk2utt_rspecifier = po.GetArg(3),
-        wav_rspecifier = po.GetArg(4),
-        clat_wspecifier = po.GetArg(5);
-    
-    OnlineNnet2FeaturePipelineInfo feature_info(feature_config);
-
-    if (modify_ivector_config) {
-      feature_info.ivector_extractor_info.use_most_recent_ivector = true;
-      feature_info.ivector_extractor_info.greedy_ivector_extractor = true;
-    }
-    
-    Transitions trans_model;
-    nnet2::AmNnet am_nnet;
-    {
-      bool binary;
-      Input ki(nnet2_rxfilename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_nnet.Read(ki.Stream(), binary);
-    }
-    
-    fst::Fst<fst::StdArc> *decode_fst = ReadFstKaldiGeneric(fst_rxfilename);
-    
-    fst::SymbolTable *word_syms = NULL;
-    if (word_syms_rxfilename != "")
-      if (!(word_syms = fst::SymbolTable::ReadText(word_syms_rxfilename)))
-        KALDI_ERR << "Could not read symbol table from file "
-                  << word_syms_rxfilename;
-    
-    int32 num_done = 0, num_err = 0;
-    double tot_like = 0.0;
-    int64 num_frames = 0;
-    Timer global_timer;
-    
-    SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
-    RandomAccessTableReader<WaveHolder> wav_reader(wav_rspecifier);
-    CompactLatticeWriter clat_writer(clat_wspecifier);
-    
-    OnlineTimingStats timing_stats;
-    
-    for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
-      std::string spk = spk2utt_reader.Key();
-      const std::vector<std::string> &uttlist = spk2utt_reader.Value();
-      OnlineIvectorExtractorAdaptationState adaptation_state(
-          feature_info.ivector_extractor_info);
-      for (size_t i = 0; i < uttlist.size(); i++) {
-        std::string utt = uttlist[i];
-        if (!wav_reader.HasKey(utt)) {
-          KALDI_WARN << "Did not find audio for utterance " << utt;
-          num_err++;
-          continue;
-        }
-        const WaveData &wave_data = wav_reader.Value(utt);
-        // get the data for channel zero (if the signal is not mono, we only
-        // take the first channel).
-        SubVector<BaseFloat> data(wave_data.Data(), 0);
-
-        
-        SingleUtteranceNnet2DecoderThreaded decoder(
-            nnet2_decoding_config, trans_model, am_nnet,
-            *decode_fst, feature_info, adaptation_state);
-        
-        OnlineTimer decoding_timer(utt);
-        
-        BaseFloat samp_freq = wave_data.SampFreq();
-        int32 chunk_length;
-        KALDI_ASSERT(chunk_length_secs > 0);
-        chunk_length = int32(samp_freq * chunk_length_secs);
-        if (chunk_length == 0) chunk_length = 1;
-        
-        int32 samp_offset = 0;
-        while (samp_offset < data.Dim()) {
-          int32 samp_remaining = data.Dim() - samp_offset;
-          int32 num_samp = chunk_length < samp_remaining ? chunk_length
-                                                         : samp_remaining;
-          
-          SubVector<BaseFloat> wave_part(data, samp_offset, num_samp);
-
-          // The endpointing code won't work if we let the waveform be given to
-          // the decoder all at once, because we'll exit this while loop, and
-          // the endpointing happens inside this while loop.  The next statement
-          // is intended to prevent this from happening.
-          while (do_endpointing &&
-                 decoder.NumWaveformPiecesPending() * chunk_length_secs > 2.0)
-            Sleep(0.5f);
-          
-          decoder.AcceptWaveform(samp_freq, wave_part);
-          
-          samp_offset += num_samp;
-
-          if (simulate_realtime_decoding) {
-            // Note: the next call may actually call sleep().
-            decoding_timer.SleepUntil(samp_offset / samp_freq);
-          }
-          if (samp_offset == data.Dim()) {
-            // no more input. flush out last frames
-            decoder.InputFinished();
-          }
-          
-          if (do_endpointing && decoder.EndpointDetected(endpoint_config)) {
-            decoder.TerminateDecoding();
-            break;
-          }
-        }
-        Timer timer;
-        decoder.Wait();
-        if (simulate_realtime_decoding) {
-          KALDI_VLOG(1) << "Waited " << timer.Elapsed() << " seconds for decoder to "
-                        << "finish after giving it last chunk.";
-        }
-        decoder.FinalizeDecoding();
-
-        CompactLattice clat;
-        bool end_of_utterance = true;
-        decoder.GetLattice(end_of_utterance, &clat, NULL);
-        
-        GetDiagnosticsAndPrintOutput(utt, word_syms, clat,
-                                     &num_frames, &tot_like);
-        
-        decoding_timer.OutputStats(&timing_stats);
-        
-        // In an application you might avoid updating the adaptation state if
-        // you felt the utterance had low confidence.  See lat/confidence.h
-        decoder.GetAdaptationState(&adaptation_state);
-        
-        // we want to output the lattice with un-scaled acoustics.
-        BaseFloat inv_acoustic_scale =
-            1.0 / nnet2_decoding_config.acoustic_scale;
-        ScaleLattice(AcousticLatticeScale(inv_acoustic_scale), &clat);
-
-        if (simulate_realtime_decoding) {        
-          KALDI_VLOG(1) << "Adding the various end-of-utterance tasks took the "
-                        << "total latency to " << timer.Elapsed() << " seconds.";
-        }
-        clat_writer.Write(utt, clat);
-        KALDI_LOG << "Decoded utterance " << utt;
-
-
-        
-        num_done++;
-      }
-    }
-    bool online = true;
-            
-    if (simulate_realtime_decoding) {
-      timing_stats.Print(online);
-    } else {
-      BaseFloat frame_shift = 0.01;
-      BaseFloat real_time_factor =
-          global_timer.Elapsed() / (frame_shift * num_frames);
-      if (num_frames > 0)
-        KALDI_LOG << "Real-time factor was " << real_time_factor
-                  << " assuming frame shift of " << frame_shift;
-    }
-    
-    KALDI_LOG << "Decoded " << num_done << " utterances, "
-              << num_err << " with errors.";
-    KALDI_LOG << "Overall likelihood per frame was " << (tot_like / num_frames)
-              << " per frame over " << num_frames << " frames.";
-    delete decode_fst;
-    delete word_syms; // will delete if non-NULL.
-    return (num_done != 0 ? 0 : 1);
-  } catch(const std::exception& e) {
-    std::cerr << e.what();
-    return -1;
-  }
-} // main()

From c0b6042518a85ecf41ade0b503f82fedcbd741a9 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Thu, 2 May 2019 22:01:28 -0400
Subject: [PATCH 032/163] [src] Further progress

---
 src/tensor/array-ref.h                   |   2 +
 src/tensor/change-tracker.h              | 156 ---------
 src/tensor/memory-checker.h              | 420 +++++++++++++++++++++++
 src/tensor/storage.h                     |  18 +-
 src/tensor/tensor-common.h               |  55 ++-
 src/tensor/tensor-functions.h            |   1 -
 src/tensor/tensor-impl-utils.h           |  20 +-
 src/tensor/tensor-impl.cc                |  72 ++++
 src/tensor/tensor-impl.h                 | 120 +++++--
 src/tensor/tensor-pattern-extra-utils.cc |  88 ++++-
 src/tensor/tensor-pattern-extra-utils.h  | 113 +++++-
 src/tensor/tensor-pattern-utils.h        |  75 ++--
 src/tensor/tensor-pattern.h              |  51 ++-
 src/tensor/tensor-utils.h                |   5 +-
 src/tensor/tensor.cc                     |  30 ++
 src/tensor/tensor.h                      |  96 ++----
 src/tensor/variable-functions.h          |  96 ++++++
 src/tensor/variable-inplace.h            | 119 +++++++
 src/tensor/variable.cc                   |   2 +-
 src/tensor/variable.h                    |  25 ++
 20 files changed, 1225 insertions(+), 339 deletions(-)
 delete mode 100644 src/tensor/change-tracker.h
 create mode 100644 src/tensor/memory-checker.h
 create mode 100644 src/tensor/tensor-impl.cc
 create mode 100644 src/tensor/tensor.cc
 create mode 100644 src/tensor/variable-functions.h
 create mode 100644 src/tensor/variable-inplace.h

diff --git a/src/tensor/array-ref.h b/src/tensor/array-ref.h
index bf6cb79fa72..e6de149e756 100644
--- a/src/tensor/array-ref.h
+++ b/src/tensor/array-ref.h
@@ -73,6 +73,8 @@ struct ArrayRef final {
   T *begin() { return data; }
   T *end() { return data + size; }
 
+
+
   // We will add iterators later if they are needed.
 };
 
diff --git a/src/tensor/change-tracker.h b/src/tensor/change-tracker.h
deleted file mode 100644
index 51a73d5390b..00000000000
--- a/src/tensor/change-tracker.h
+++ /dev/null
@@ -1,156 +0,0 @@
-// tensor/change-tracker.h
-
-// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_TENSOR_CHANGE_TRACKER_H_
-#define KALDI_TENSOR_CHANGE_TRACKER_H_ 1
-
-#include <functional>
-#include "tensor/tensor-common.h"
-#include "tensor/tensor-pattern.h"
-
-
-namespace kaldi {
-namespace tensor {
-
-
-
-/**
-   class ChangeTracker is something we only use in 'debug mode'.  Its purpose is
-   to keep track of when data was last changed, to make sure people don't mutate
-   data via in-place operations in a way that will invalidate the backprop.
-   This is a replacement for the 'version numbering' of Variables used in
-   PyTorch, i.e. it's a different way of solving the same problem.  The
-   mechanism is (I think) more exact than version numbering, and less hassle for
-   the calling code; but since it's slower, we will only activate it
-   occasionally.  c.f. SetDebugMode(), GetDebugMode().
-
-   When a computation requiring derivatives creates a graph that will (when
-   Backprop()'d) require a certain Tensor's data to remain unchanged until
-   the backprop is done, we put a lock on the relevant memory region.
-   This is done by LockPattern().  Conceptually the locking is done at the
-   byte level, but without explicitly creating a byte-level map; it's
-   done by detecting overlap of Patterns and will be reasonably efficient
-   unless the user is creating a large number of different views of the same
-   memory region.
-
-   The same piece of memory may be locked multiple times.  This is not a write
-   lock, it is a lock that prevents modification of that memory location.
-   Attempts to mutate that memory (assuming the code calls Mutate()) will cause a
-   crash.  The solution would be to remove the offending in-place operation from
-   your code.
- */
-class ChangeTracker {
- public:
-
-  /** Constructor.  A Storage object is created for each allocated block of
-      memory, and each Storage object has at most one ChangeTracker object.
-
-      @param [in] num_bytes  The number of bytes allocated in this block.
-                           Only needed for checking, to make sure that
-                           the patterns do not overstep this bound.
-   */
-  ChangeTracker(size_t num_bytes);
-
-
-  /**
-     Record a change to this storage region at the current time (obtained by
-     GetTick()).  Just appends it to the vector of changes after canonicalizing
-     the pattern.  Inlined since it's only called from Storage::ChangedSince().
-
-     @param [in] element_size  The size in bytes of the data type being stored
-                             here: for example, 4 for float.
-     @param [in] pattern    The pattern being changed.  It will be reduced
-                            to canonical form (c.f. CanonicalizePattern())
-                            before being stored.
-   */
-  inline void RecordChange(int32 element_size,
-                           const TensorPattern &pattern);
-
-
-  /**
-     Returns true if any element covered by this pattern has been
-     changed since the time given by 'tick'.  Inlined since it's only
-     called from Storage::ChangedSince().
-
-      @param [in] tick  The time (obtained by GetTick()) since when
-                     we want to know about changes
-      @param [in] pattern  The pattern that we are checking
-   */
-  inline bool ChangedSince(int64 tick,
-                           const TensorPattern &pattern);
-
- private:
-
-  // number of bytes in this storage region (or possibly just a very big number,
-  // if the size of the region was not known).
-  int64 num_bytes_;
-
-  // The size of elements in this storage region (e.g. 4 for float).  If for
-  // some region the same region was accessed with multiple different element
-  // sizes, this will be their lowest common denominator and all patterns
-  // will have their strides and offsets scaled appropriately.
-  // (We don't just store patterns in terms of bytes because we don't want
-  // to increase the risk of overflowing int32 storage).
-  int64 element_size_;
-
-
-  struct ChangeRecord {
-    TensorPattern pattern;  // The pattern (offset, dims, strides) that was
-                            // changed within this storage region.  This pattern
-                            // will have been reduced to canonical form.  View
-                            // it as a memory-index-set (c.f. glossary in
-                            // pattern.h).
-
-    int64 tick;             // The time, in ticks (c.f. NextTick()) at which
-                            // this set of memory-indexes was changed.
-
-    // Next in a singly linked list of ChangeRecord.
-    std::unique_ptr<ChangeRecord> tail;
-  };
-
-
-  // Head of a singly linked list of changes.  When RecordChange() is called, we
-  // will add to the head of this (and then de-dupe; see doc for change_map)).
-  // When ChangedSince() is called, we will traverse it element by element until
-  // we get to the tick passed to ChangedSince, and if there is any overlap with
-  // the passed-in pattern, we'll return true.
-  std::unique_ptr<ChangeRecord> changes_;
-
-
-  // This is a map from a pointer to the TensorPattern in ChangeRecord::pattern
-  // (hashing the pattern itself, not the pointer value), to the ChangeRecord
-  // that holds it.  We actually map to the address of the std::unique_ptr
-  // pointing to that ChangeRecord, which might be the address of this->changes_
-  // or ChangeRecord::tail, because we need to be able to write to that to
-  // remove a ChangeRecord from the singly linked list.  This map is used
-  // in de-duping the list of changes, so that if someone provides the
-  // exact same pattern twice, we only keep the most recent tick; this
-  // keeps memory usage under control.
-  std::unordered_map<TensorPattern*, std::unique_ptr<ChangeRecord>*,
-                     TensorPatternPtrHasher, TensorPatternPtrEqual> change_map_;
-
-
-};
-
-
-
-}  // namespace tensor
-}  // namespace kaldi
-
-#endif  // KALDI_TENSOR_CHANGE_TRACKER_H_
diff --git a/src/tensor/memory-checker.h b/src/tensor/memory-checker.h
new file mode 100644
index 00000000000..8c3952de04a
--- /dev/null
+++ b/src/tensor/memory-checker.h
@@ -0,0 +1,420 @@
+// tensor/memory-checker.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_MEMORY_CHECKER_H_
+#define KALDI_TENSOR_MEMORY_CHECKER_H_ 1
+
+#include <functional>
+#include "tensor/tensor-common.h"
+#include "tensor/tensor-pattern.h"
+
+
+namespace kaldi {
+namespace tensor {
+
+
+
+/**
+   class ChangeTracker is something we only use in 'debug mode'.  Its purpose is
+   to keep track of when data was last changed, to make sure people don't mutate
+   data via in-place operations in a way that will invalidate the backprop.
+   This is a replacement for the 'version numbering' of Variables used in
+   PyTorch, i.e. it's a different way of solving the same problem.  The
+   mechanism is (I think) more exact than version numbering, and less hassle for
+   the calling code; but since it's slower, we will only activate it
+   occasionally.  c.f. SetDebugMode(), GetDebugMode().
+
+   During the forward pass, when an Op records, as members, certain Tensors that
+   will be needed during the backprop pass, it also records the time in
+   ticks (c.f. GetTick()) at which the forward pass happened.  Then in
+   the backward pass, during debug mode we want to check that the memory
+   underlying those Tensors has not been changed since that recorded tick.
+   (If people use in-place operations in an unsupported way, this might
+   have happened).
+
+   This class provides a mechanism to do that.  It's actually quite an
+   interesting mathematical problem as it involves detecting overlap between
+   patterns in memory and we want to do it efficiently, not using a huge array.
+   Note: in debug mode, any time a memory region underlying a tracked Variable's
+   data is written to (whether or not that write actually went through a tracked
+   Variable or even a regular Tensor), we record the change (see member function
+   RecordChange()).
+*/
+class ChangeTracker {
+ public:
+
+  /** Constructor.  Note: a Storage object is created for each allocated block
+      of memory, and each Storage object has at most one ChangeTracker object.
+
+      @param [in] num_bytes  The number of bytes allocated in this block.
+                           Only needed for checking, to make sure that
+                           the patterns do not overstep this bound.
+   */
+  ChangeTracker(size_t num_bytes);
+
+
+  /**
+     Record a change to this storage region at the current time (obtained by
+     GetTick()).  Just appends it to the vector of changes after canonicalizing
+     the pattern.  Inlined since it's only called from Storage::ChangedSince().
+
+     @param [in] element_size  The size in bytes of the data type being stored
+                             here: for example, 4 for float.
+     @param [in] pattern    The pattern being changed.  It will be reduced
+                            to canonical form (c.f. CanonicalizePattern())
+                            before being stored.
+   */
+  inline void RecordChange(int32 element_size,
+                           const TensorPattern &pattern);
+
+
+  /**
+     Returns true if any element covered by this pattern has been
+     changed since the time given by 'tick'.  Inlined since it's only
+     called from Storage::ChangedSince().
+
+      @param [in] tick  The time (obtained by GetTick()) since when
+                     we want to know about changes
+      @param [in] pattern  The pattern that we are checking
+   */
+  inline bool ChangedSince(int64 tick,
+                           const TensorPattern &pattern);
+
+ private:
+
+  // number of bytes in this storage region (or possibly just a very big number,
+  // if the size of the region was not known).
+  int64 num_bytes_;
+
+  // The size of elements in this storage region (e.g. 4 for float).  If for
+  // some region the same region was accessed with multiple different element
+  // sizes, this will be their lowest common denominator and all patterns
+  // will have their strides and offsets scaled appropriately.
+  // (We don't just store patterns in terms of bytes because we don't want
+  // to increase the risk of overflowing int32 storage).
+  int32 element_size_;
+
+
+  struct ChangeRecord {
+    TensorPattern pattern;  // The pattern (offset, dims, strides) that was
+                            // changed within this storage region.  This pattern
+                            // will have been reduced to canonical form.  View
+                            // it as a memory-index-set (c.f. glossary in
+                            // pattern.h).
+
+    int64 tick;             // The time, in ticks (c.f. NextTick()) at which
+                            // this set of memory-indexes was changed.
+
+    // Next in a singly linked list of ChangeRecord.
+    std::unique_ptr<ChangeRecord> tail;
+  };
+
+
+  // Head of a singly linked list of changes.  When RecordChange() is called, we
+  // will add to the head of this (and then de-dupe; see doc for change_map_).
+  // When ChangedSince() is called, we will traverse it element by element until
+  // we get to the tick passed to ChangedSince, and if there is any overlap with
+  // the passed-in pattern, we'll return true.
+  std::unique_ptr<ChangeRecord> changes_;
+
+
+  // This is a map from a pointer to the TensorPattern in ChangeRecord::pattern
+  // (hashing the pattern itself, not the pointer value), to the ChangeRecord
+  // that holds it.  We actually map to the address of the std::unique_ptr
+  // pointing to that ChangeRecord, which might be the address of this->changes_
+  // or ChangeRecord::tail, because we need to be able to write to that to
+  // remove a ChangeRecord from the singly linked list.  This map is used
+  // in de-duping the list of changes, so that if someone provides the
+  // exact same pattern twice, we only keep the most recent tick; this
+  // keeps memory usage under control.
+  std::unordered_map<TensorPattern*, std::unique_ptr<ChangeRecord>*,
+                     TensorPatternPtrHasher, TensorPatternPtrEqual> change_map_;
+};
+
+
+
+// This class is a common base-class for UninitializedDataChecker and
+// InvalidatedDataChecker.
+class DataCheckerBase {
+ protected:
+  DataCheckerBase(int64 num_bytes);
+
+  /**
+     This function records an event (i.e. that this memory area is being written to,
+     or is now no longer valid, depending on the child class).
+     It may insert something into map_, if an event with this pattern hasn't
+     been recorded before.
+
+       @param [in] element_size  The size, in bytes, of the element that this
+                          array contains (e.g. 4 for float, 8 for double).
+                          Currently expected to be the same for all invocations
+                          (we can later extend this code to handle changes).
+       @param [in] pattern   The pattern which we are recording as an event
+                          (e.g. saying that its memory-index-set has been
+                          written to, or has been invalidated).  Its memory-index-set
+                          must be within [0, k-1] where k = num_bytes_ / element_size.
+   */
+  void RecordEvent(int32 element_size,
+                   const TensorPattern &pattern);
+
+  /**
+     This function is intended to return true if the memory-index-set of
+     the provided Pattern is fully covered by the Patterns passed to
+     previous invocations of RecordEvent.
+
+     Because it sometimes (for efficiency) uses a randomized algorithm,
+     it may not always detect less-than-complete coverage.  That is, there
+     may be situations where `pattern` is not fully covered and it returns
+     true anyway; but if it returns false, then `pattern` is definitely
+     not covered by all the patterns passed to RecordEvent().
+
+     The algorithm is:
+
+       - If we can find a pattern identical to `pattern` in
+         `map_`, return true (this is a common special case).
+       - If `map_` contains exactly one pattern:
+         See whether the memory-index-set of `pattern` is
+         a subset of the memory-index-set of that one pattern,
+         and return true if so; else false.
+       - Otherwise: choose a number of random memory-indexes from
+         `pattern`, and for each one, see whether they are covered
+         by any of the stored patterns.  If any such memory-index
+         is not so covered, return false; else return true.  (Note:
+         this last `true` may be inaccurate, meaning we fail to
+         detect a problem we should have detected.)
+
+      @param [in] element_size  The size, in bytes, of the element that this
+                      array contains (e.g. 4 for float, 8 for double).
+                      Currently required to be the same as the element_size
+                      provided to any invocations of RecordEvent(); we may
+                      relax that assumption in future.
+      @param [in] pattern   The pattern we are checking. Its memory-index-set
+                     must be within [0, k-1] where k = num_bytes_ / element_size.
+      @return   True if `pattern` was fully covered by patterns recorded in
+                RecordEvent() or if our randomized algorithm failed to detect
+                the less-than-complete coverage.  False otherwise.
+   */
+  bool FullyCovered(int32 element_size,
+                    const TensorPattern &pattern);
+
+  /**
+     This function is intended to return true if the memory-index-set of
+     `pattern` has nonempty intersection with the memory-index-set of at least
+     one of the Patterns provided to RecordEvent().
+
+     Because it is a randomized algorithm, it may sometimes return false
+     when an exact version would have returned true, but not vice versa.
+
+     The algorithm is:
+
+       - If we can find a pattern identical to `pattern` in `map_`, return true
+         (this is a common special case).
+       - Otherwise:
+          - For some or all of the TensorPatterns provided to `RecordEvent()`:
+            - If `pattern` has nonempty intersection with that pattern:
+               return true
+          - return false
+   */
+  bool PartlyCovered(int32 element_size,
+                     const TensorPattern &pattern);
+
+ private:
+
+  // number of bytes in this storage region (or possibly just a very big number,
+  // if the size of the region was not known).
+  int64 num_bytes_;
+
+  // The size of elements in this storage region (e.g. 4 for float).  If for
+  // some region the same region was accessed with multiple different element
+  // sizes, this will be their lowest common denominator and all patterns
+  // will have their strides and offsets scaled appropriately.
+  // (We don't just store patterns in terms of bytes because we don't want
+  // to increase the risk of overflowing int32 storage).
+  int32 element_size_;
+
+
+  // `map_` can actually be thought of as a set of TensorPatterns, but it's
+  // actually stored as a map from TensorPattern* to the std::unique_ptr holding
+  // that same TensorPattern.  This may seem an odd thing to do; it's just
+  // a convenient way to manage the memory.  Thanks to TensorPatternPtrHasher,
+  // we can avoid storing duplicate records for the same Pattern.
+  std::unordered_map<TensorPattern*, std::unique_ptr<TensorPattern>,
+                     TensorPatternPtrHasher, TensorPatternPtrEqual> map_;
+
+
+  // This is another way of storing the TensorPatterns that have been recorded,
+  // ordered by NumElements(); this enables us to check the larger patterns
+  // first, which may be more efficient.
+  std::multimap<int64, TensorPattern*> by_size_;
+};
+
+/**
+   The purpose of this class is to check for use of uninitialized data.  It will
+   only be used when debug mode is enabled.
+
+   There are situations when initializing the memory of a Tensor/Variable (say,
+   to zero) would be wasteful because we know that we're going to eventually
+   write to all of it.  But doing this is risky because we might end up using
+   values in uninitialized memory if we're not careful.  This class detects that
+   situation, but only when we are in debug mode; see SetDebugMode(), GetDebugMode().
+ */
+class UninitializedDataChecker: public DataCheckerBase {
+ public:
+
+  /** Constructor.  Note: a Storage object is created for each allocated block
+      of memory, and each Storage object has at most one
+      UninitializedDataChecker object.
+
+      @param [in] num_bytes  The number of bytes allocated in this block.
+                          Only needed for checking, to make sure that
+                          the patterns do not overstep this bound.
+   */
+  UninitializedDataChecker(size_t num_bytes):
+      DataCheckerBase(num_bytes) { }
+
+
+
+  /**
+     This function records that this memory area is being written to.
+
+        @param [in] element_size  The size of the element stored in the
+                  Tensor, e.g. 4 for float, 8 for double.
+        @param [in] pattern  The pattern which is being written; this
+                  function records the write.
+   */
+  inline void RecordWrite(int32 element_size,
+                          const TensorPattern &pattern) {
+    RecordEvent(element_size, pattern);
+  }
+
+  /**
+     This function is called when this memory area is being read from.
+     It will (usually) crash if an element of this memory area has not been
+     written to.  The algorithm is randomized so a problem won't be
+     detected in all cases.
+        @param [in] element_size  The size of the element stored in the
+                  Tensor, e.g. 4 for float, 8 for double.
+        @param [in] pattern  The pattern which is being read.  If it
+                  is not fully covered by the Patterns passed to
+                  RecordWrite, this call will (usually) crash.
+   */
+  void RecordRead(int32 element_size,
+                  const TensorPattern &pattern);
+};
+
+
+/**
+   The purpose of this class is to check for use of invalidated data.  It will
+   only be used when debug mode is enabled.
+
+   This is a checking mechanism that helps us to fairly safely avoid certain
+   unnecessary operations on parts of Variables in the backprop phase.  (If the
+   check fails, user-level code will have to be changed).  It's best illustrated
+   with an example.  Let A and B both be Variables representing 2x2 matrices
+   that have been freshly created with uninitialized data.  Suppose we do:
+
+      (1) Initialize A's data to something requiring derivative tracking
+      (2) Copy A to B
+      (3) Copy A to B again
+      (4) Do something that depends on the value of B
+
+   In the backprop, when doing the backprop of operation (3), after propagating
+   the derivative back to A we'd need to zero out the first row of B's
+   derivative matrix, to reflect the fact that its value before operation (3)
+   doesn't affect the outcome; otherwise after the backprop of (2) we would have
+   twice the value we should really have for the derivative w.r.t. A.  So
+   naively, any time we do the backprop for an operation that writes to a
+   variable that was already tracked at the time we did that operation, we would
+   have to zero out that part of the derivative matrix afterwards.  But much of
+   the time we wouldn't have previously written to that part of memory, so such
+   zeroing would be wasteful.  (Note: we can't just rely on checking whether or
+   not this base Variable has previously had an operation done on it; the hard
+   case is where there are multiple Variables that are sub-parts of the same
+   base Variable).
+
+   The way we handle this is: we assume by default that any time we do an
+   operation that sets a Variable but does not depend on its previously existing
+   value, the memory underlying it was not previously written to in an operation
+   that required derivative-tracking.  But if that is not the case (i.e.
+   if you do something that does require overwriting previously-written data
+   that required derivative tracking, like the above), you can inform the
+   framework by doing
+     DoSomethingWith(a, b, &c.Overwrite());
+   instead of
+     DoSomethingWith(a, b, &c);
+   (here a, b and c are Variables; and let's suppose this operation
+   DoSomethingWith() ignores the previous value of `c`).
+
+   The purpose of this class is to detect cases where someone should have
+   invoked Overwrite() because tracked data was overwritten, but failed to
+   do so.
+
+   See also the comment for the overwrite_ member of class VariableImpl, and
+   the Untouched() member of Variable.
+ */
+class InvalidatedDataChecker: public DataCheckerBase {
+ public:
+
+  /** Constructor.  Note: a Storage object is created for each allocated block
+      of memory, and each Storage object has at most one InvalidatedDataChecker
+      object.
+
+      @param [in] num_bytes  The number of bytes allocated in this block.
+                         Only needed for checking, to make sure that
+                         the patterns do not overstep this bound.
+   */
+  InvalidatedDataChecker(size_t num_bytes):
+      DataCheckerBase(num_bytes) { }
+
+
+  /**
+     This function records that this memory area is being invalidated.  Normally
+     this object will be attached to the Tensor for a derivative, and will be
+     called when we do the backprop for an Op that should ideally have zeroed
+     out this part of the matrix, but we didn't do that because we believe this
+     memory region won't be read from in future.
+   */
+  inline void RecordInvalidation(int32 element_size,
+                                 const TensorPattern &pattern) {
+    RecordEvent(element_size, pattern);
+  }
+
+
+  /**
+     This function is called when this memory area is being read from.  It will
+     (usually, since the algorithm is randomized) crash if `pattern` has
+     nonempty overlap with a pattern passed to RecordInvalidation().
+
+        @param [in] element_size  The size of the element stored in the
+                  Tensor, e.g. 4 for float, 8 for double.
+        @param [in] pattern  The pattern which is being read.  If it
+                  overlaps with an invalidated Pattern, this will
+                  (usually) crash.
+  */
+  void RecordRead(int32 element_size,
+                  const TensorPattern &pattern);
+
+
+};
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+#endif  // KALDI_TENSOR_MEMORY_CHECKER_H_
diff --git a/src/tensor/storage.h b/src/tensor/storage.h
index 89794813b47..48505adea69 100644
--- a/src/tensor/storage.h
+++ b/src/tensor/storage.h
@@ -22,6 +22,7 @@
 
 #include <functional>
 #include "tensor/tensor-common.h"
+#include "tensor/memory-checker.h"
 
 
 namespace kaldi {
@@ -138,9 +139,20 @@ class Storage {
 struct StorageAux {
   using DeallocatorFunc = std::function<void()>;
 
-  // 'tracker' is used in debug mode to detect when data that might be
-  // required in the backprop phase is invalidated.
-  std::unique_ptr<ChangeTracker> tracker;
+  // 'change_tracker' is used in debug mode to detect when data that might be
+  // required in the backprop phase has changed before we read it.
+  std::unique_ptr<ChangeTracker> change_tracker;
+
+  // 'uninitialized_checker' is used in debug mode to detect reads of
+  // data that has been allocated but has never been written to (i.e.
+  // reads of uninitialized memory).
+  std::unique_ptr<UninitializedDataChecker> uninitialized_checker;
+
+  // 'invalidated_checker' is used in debug mode to detect when parts of
+  // derivatives that have been invalidated are read; read the
+  // comment for that class, in memory-checker.h, for complete
+  // info.
+  std::unique_ptr<InvalidatedDataChecker> invalidated_checker;
 
   // 'deallocator' is to be used with external toolkits, for example, to
   // decrease the refcount.  In normal cases it will be nullptr.
diff --git a/src/tensor/tensor-common.h b/src/tensor/tensor-common.h
index b0260a9bb80..806756acb90 100644
--- a/src/tensor/tensor-common.h
+++ b/src/tensor/tensor-common.h
@@ -84,8 +84,11 @@ class WithDeviceAs {
 enum DataType {
   // We will of course later extend this with many more types, including
   // integer types and half-precision floats.
-  kFloatDtype = 0,
-  kDoubleDtype = 1
+  kDefaultDtype = 0,
+  // kDefaultDtype means the type used when not specified; it's user definable
+  // via SetDefaultDtype.
+  kFloatDtype = 1,
+  kDoubleDtype = 2,
 };
 
 
@@ -99,7 +102,7 @@ inline int32 SizeOf(DataType dtype) {
 }
 
 
-aDataType GetDefaultDtype();
+DataType GetDefaultDtype();
 void SetDefaultDtype(DataType dtype);
 
 class WithDtypeAs {
@@ -119,6 +122,33 @@ class WithDtypeAs {
   DataType prev_default_;
 };
 
+
+
+// struct TensorOptions is used as an arg for some constructors
+// when creating Tensors and Variables; it allows flexibility
+// in specifying the device and/or dtype.  See the examples
+// shown where constructors of Tensor or Variable are declared.
+struct TensorOptions {
+  DataType dtype;
+  Device device;
+
+  TensorOptions(): dtype(GetDefaultDtype()),  // neither given: both defaults
+                   device(GetDefaultDevice()) { }
+  TensorOptions(DataType dtype):              // dtype only; default device
+      dtype(dtype), device(GetDefaultDevice()) { }
+  TensorOptions(Device device):               // device only; default dtype
+      dtype(GetDefaultDtype()), device(device) { }
+  TensorOptions(DeviceType device_type):      // device type only; default dtype
+      dtype(GetDefaultDtype()), device(device_type) { }
+  TensorOptions(DataType dtype, Device device):
+      dtype(dtype), device(device) { }
+  TensorOptions(DataType dtype, DeviceType device_type):  // distinct overload
+      dtype(dtype), device(device_type) { }
+  TensorOptions(const TensorOptions &other):
+      dtype(other.dtype), device(other.device) { }
+};
+
+
 // Global variable, initialized from zero, that is used in GetTick().
 // This is defined in tensor-common.cc.
 extern int64 g_tick_counter;
@@ -140,15 +170,16 @@ inline void SetDebugMode(bool b) { debug_mode = b; }
 /// Enumeration that says what strides we should choose when allocating
 /// A Tensor.
 enum StridePolicy {
-  kCopyStrideOrder,  // means: copy the size-ordering of the strides from the
-                     // source Tensor (they will all be positive even of some of
-                     // the source Tensor's strides were negative).
-  kCstrides      // means: strides for dimensions that are != 1 are ordered from
-                 // greatest to smallest as in a "C" array.  Per our policy,
-                 // any dimension that is 1 will have a zero stride.
-
-  // We may later add options for Fortran-style striding and for the sign of the
-  // source Tensor's strides, as well as their order, to be copied.
+  kKeepStrideOrder,  // Means: keep the size-ordering of the strides from the
+                     // source Tensor (but the chosen strides will all be
+                     // positive even if some of the source Tensor's strides
+                     // were negative).
+  kNormalized,   // Means: strides for dimensions that are != 1 are ordered from
+                 // greatest to smallest as in a "C" array in the public
+                 // numbering, or smallest to greatest in the private numbering.
+                 // Per our policy, any dimension that is 1 will be given a zero stride.
+                 // C.f. "Normalized strides" in tensor-pattern.h
+  kCopyStrides   // Means: use the exact strides provided.
 };
 
 /// Enumeration that says whether to zero a freshly initialized Tensor.
diff --git a/src/tensor/tensor-functions.h b/src/tensor/tensor-functions.h
index 83d18239222..57953fcb7ed 100644
--- a/src/tensor/tensor-functions.h
+++ b/src/tensor/tensor-functions.h
@@ -73,7 +73,6 @@ inline void Transpose(int32 axis1, int32 axis2, Tensor *t) {
 void Copy(const Tensor &src, const Tensor *dest);
 
 
-
 /**
    Template used to implement unary functions such as Log, Relu, and
    so on (this avoids boilerplate).
diff --git a/src/tensor/tensor-impl-utils.h b/src/tensor/tensor-impl-utils.h
index 0a7a3f762af..39482280ba4 100644
--- a/src/tensor/tensor-impl-utils.h
+++ b/src/tensor/tensor-impl-utils.h
@@ -25,8 +25,9 @@
 
 
 /**
-   This header contains basic linear-algebra and copying types of operations
-   on TensorImpl objects.  See also tensor-impl-nonlinearly
+   This header contains mostly functions for usage by other code in the
+   framework, that operate on Tensors; see tensor-functions.h for more
+   user-facing functions.
 */
 namespace kaldi {
 namespace tensor {
@@ -48,6 +49,21 @@ inline bool Compatible(const TensorImpl &a, const TensorImpl &b,
 
 
 
+/**
+  This function returns true if the patterns of a and b are broadcastable.
+  See similar function in tensor-pattern-utils.h for more information.
+*/
+inline bool Broadcastable(const TensorImpl &a, const TensorImpl &b,
+                          bool b_non_reducing = false);
+
+/**
+  This function returns true if the patterns of a, b and c are broadcastable.
+  See similar function in tensor-pattern-utils.h for more information.
+*/
+inline bool Broadcastable(const TensorImpl &a, const TensorImpl &b,
+                          const TensorImpl &c, bool c_non_reducing = false);
+
+
 /**
    This function creates the appropriate storage object for the Tensor described
    in 'impl', and sets impl->storage to that value.  Due to lazy allocation (see
diff --git a/src/tensor/tensor-impl.cc b/src/tensor/tensor-impl.cc
new file mode 100644
index 00000000000..1d7cbbbb8af
--- /dev/null
+++ b/src/tensor/tensor-impl.cc
@@ -0,0 +1,72 @@
+// tensor/tensor-impl.cc
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include <utility>
+#include "tensor/tensor-impl.h"
+
+namespace kaldi {
+namespace tensor {
+
+TensorImpl::TensorImpl(const TensorMeta &meta,
+                       StridePolicy sp):
+    dtype(meta.dtype),
+    device(meta.device) {
+  switch (sp) {  // pick the strides of 'pattern'; dims come from meta.pattern
+    case kKeepStrideOrder:
+      MakeCompactNonnegativeAndJustified(meta.pattern, &pattern);
+      break;
+    case kNormalized:
+      MakeCompactNormalizedAndJustified(meta.pattern, &pattern);
+      break;
+    case kCopyStrides:  // exact copy of the source strides, then re-justified
+      pattern = meta.pattern;
+      MakeJustified(&pattern);
+      break;
+    default:  // would be code error.
+      KALDI_ERR << "Stride policy out of range";
+  }
+  CreateTensorStorage(this);  // this constructor always gets its own storage
+  KALDI_PARANOID_ASSERT(this->IsValid());
+}
+
+TensorImpl::TensorImpl(const TensorMeta &meta,
+                       const std::shared_ptr<Storage> &storage):
+    pattern(meta.pattern),
+    dtype(meta.dtype),
+    device(meta.device),
+    storage(storage) {
+  KALDI_PARANOID_ASSERT(this->IsValid());
+}
+
+
+TensorImpl::TensorImpl(const TensorMeta &meta,
+                       std::shared_ptr<Storage> &&storage):
+    // 'storage' is a named rvalue reference, hence an lvalue in here: it has
+    // to go through std::move() for the member to be move-constructed.
+    pattern(meta.pattern),
+    dtype(meta.dtype),
+    device(meta.device),
+    storage(std::move(storage)) {
+  KALDI_PARANOID_ASSERT(this->IsValid());
+}
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/tensor-impl.h b/src/tensor/tensor-impl.h
index cd196ca2608..7f12b7dace3 100644
--- a/src/tensor/tensor-impl.h
+++ b/src/tensor/tensor-impl.h
@@ -57,18 +57,20 @@ struct TensorImpl {
 
   inline int32 NumAxes() { return pattern.num_axes; }
 
-  // Returns the dimension on the supplied axis (using the *public* axis
-  //                    numbering)
-  //  @param [in] axis  Axis on which dimension is required, with
-  //                    -NumAxes() <= axis < NumAxes(); negative axis
-  //                    is interpreted as an offset from NumAxes().
+  // Returns the dimension on the supplied axis, using the public axis
+  // numbering, with negative index interpreted as an offset from the end.
+  //
+  //  @param [in] eaxis  Eaxis-index (see definition in tensor-pattern.h)
+  //                    Require -NumAxes() <= eaxis < NumAxes().
   //  @return        Returns the dimension on this axis, a number >= 1.
-  inline int32 Dim(int32 axis);
-
-  // Returns the stride on the supplied axis (using the *public* axis numbering)
-  //  @param [in] axis  Axis on which stride is required, with
-  //                    -NumAxes() <= axis < NumAxes(); negative axis
-  //                    is interpreted as an offset from NumAxes().
+  inline int32 Dim(int32 eaxis);
+
+  // Returns the stride (== distance between successive elements) on the
+  // supplied axis, using the public axis numbering, with negative index
+  // interpreted as an offset from the end.
+  //
+  //  @param [in] eaxis  Eaxis-index (see definition in tensor-pattern.h)
+  //                    Require -NumAxes() <= eaxis < NumAxes().
   //  @return          Returns the stride on this axis, which will be 0 if
   //                   Dim(axis) == 1, and otherwise nonzero.
   inline int32 Stride(int32 axis);
@@ -81,11 +83,17 @@ struct TensorImpl {
   inline void* GetData() const;
 
 
-  // Returns true if this TensorImpl is valid, false otherwise.  A Tensor is
-  // valid if its TensorPattern is valid, its dtype and device are valid
-  // (e.g. enums in the correct range), and (if check_storage) that the storage
-  // object is non-NULL and the memory range covered by the pattern is within
-  // the num_bytes of the storage.
+  /**
+    Returns true if this TensorImpl is valid, false otherwise.
+
+       @param [in] check_storage   You can set this to false to disable
+                     checks related to the `storage` element (that
+                     it's non-NULL and covers the memory range used
+                     by the pattern).
+       @return   Return true if the TensorImpl is valid (requires
+                pattern.Valid(), plus checks on dtype and device,
+                plus checks on the storage object if check_storage == true).
+  */
   bool IsValid(bool check_storage = true);
 
 
@@ -93,23 +101,66 @@ struct TensorImpl {
     return reinterpret_cast<const TensorMeta&>(*this);
   }
 
-  // Constructor that is used when taking the meta-info from one source
-  // but the storage from another.
+  // Note: a copy constructor for TensorImpl might not be needed as we store
+  // shared_ptrs to it and just reuse the same object.
+
+  // Constructor that is used when copying the meta-info from one source
+  // but the storage from another; this version copy-constructs 'storage'
+  // (see the '&&' overload for the move-constructing version).
   TensorImpl(const TensorMeta &meta,
              const std::shared_ptr<Storage> &storage);
 
+  // Constructor that is used when copying the meta-info from one source
+  // but the storage from another; this version does move-construction
+  // on 'storage'.
+  TensorImpl(const TensorMeta &meta,
+             std::shared_ptr<Storage> &&storage);
+
   // Constructor that copies the meta-info provided; if create_storage
   // == true it creates the storage reason, else leaves it NULL.
   TensorImpl(const TensorMeta &meta,
              bool create_storage = true);
 
-
-
-  // Constructor that is used when taking the meta-info from one source
-  // but the storage from another; this version does move-construction
-  // on 'storage'.
+  /**
+     Initializes a TensorImpl with the provided dimensions, creating a new
+     storage object for it.  The strides will be as for a "C" array; see
+     "Default strides:" in tensor-pattern.h.
+
+        @param [in] dims  The dimensions for each axis (in the public
+                       numbering).  All elements must be nonnegative,
+                       and we require `0 <= dims.size() < KALDI_TENSOR_MAX_DIM`.
+        @param [in] opts  Options class to set device and dtype;
+                          see examples below
+<code>
+   TensorImpl *t = new TensorImpl({10,20}),
+       *u = new TensorImpl({9}, {kGpuDevice}),
+       *v = new TensorImpl({9}, {kDoubleDtype, kGpuDevice});
+</code>
+  */
+  TensorImpl(ArrayRef<int32> dims,
+             TensorOptions opts = TensorOptions());
+
+  /**
+    This constructor initializes a TensorImpl with dtype, device and dims taken
+    from an existing TensorImpl, but a new storage object, and strides
+    determined by the StridePolicy provided.
+
+       @param [in] meta  Meta-info of another TensorImpl; the num_axes,
+                        dims, dtype and device will be taken from here
+                        and the strides may be inspected, depending
+                        on `sp`.
+       @param [in] sp   Stride policy (briefly as follows; see more by
+                      declaration of StridePolicy in tensor-common.h).
+                      kKeepStrideOrder -> use the same order of abs(stride) as
+                                          in 'meta'
+                      kNormalized -> use normalized strides (see definition
+                       in tensor-pattern.h); basically, the normal order we'd use
+                       for a new Tensor.
+                      kCopyStrides -> use the exact strides from the source
+                       pattern.
+  */
   TensorImpl(const TensorMeta &meta,
-             std::shared_ptr<Storage> &&storage);
+             StridePolicy sp);
 
   // Default constructor
   TensorImpl() { }
@@ -117,20 +168,13 @@ struct TensorImpl {
 };
 
 
-
-inline int32 TensorImpl::Dim(int32 axis) {
-  if (axis < 0) {
-    // it will usually be known whether axis < 0 at compile time, since it's
-    // inlined.
-    KALDI_ASSERT(axis >= -pattern.num_axes);
-    // num_axes - 1 - (axis + num_axes) = - 1 - axis
-    int32 raxis = -1 - axis;
-    return pattern.dims[raxis];
-  } else {
-    KALDI_ASSERT(axis < pattern.num_axes);
-    int32 raxis = pattern.num_axes - 1 - axis;
-    return pattern.dims[raxis];
-  }
+inline int32 TensorImpl::Dim(int32 eaxis) {
+  int32 raxis = EaxisToRaxis(eaxis);  // index into pattern.dims (private order)
+  if (raxis < 0 || raxis >= pattern.num_axes)  // reject out-of-range either way
+    KALDI_ERR << "Invalid axis given to Dim(): "
+              << eaxis << ", num_axes = "
+              << pattern.num_axes;
+  return pattern.dims[raxis];  // must index by the converted axis
 }
 
 
diff --git a/src/tensor/tensor-pattern-extra-utils.cc b/src/tensor/tensor-pattern-extra-utils.cc
index 74a1ad75c21..0b0407fdb33 100644
--- a/src/tensor/tensor-pattern-extra-utils.cc
+++ b/src/tensor/tensor-pattern-extra-utils.cc
@@ -690,11 +690,9 @@ bool PatternIncludes(const TensorPattern &pattern1,
 }
 
 
-void MakeContiguousAndJustified(const TensorPattern &src,
-                                TensorPattern *dest) {
+void MakeCompactAndJustified(const TensorPattern &src,
+                             TensorPattern *dest) {
   KALDI_PARANOID_ASSERT(src.IsValid());
-
-
   int32 num_axes = src.num_axes;
 
   // The sorter object provides an order in which we can visit the axes of 'src'
@@ -702,8 +700,7 @@ void MakeContiguousAndJustified(const TensorPattern &src,
   OutOfPlaceAxisSorter sorter(src);
 
   int64 offset = 0;  // 'offset' will be the offset that ensures 'dest' is
-                     // justified (see glossary in tensor-pattern.h for
-                     // definition).
+                     // justified (means lowest memory-index is 0).
   int32 next_abs_stride = 1;
   for (int32 i = 0; i < num_axes; i++) {
     int32 raxis = sorter.GetIndex(i);
@@ -722,19 +719,90 @@ void MakeContiguousAndJustified(const TensorPattern &src,
                             "Input pattern was not valid.");
       if (this_stride < 0) {
         offset += next_stride * (this_dim - 1);
-        dest->strides[raxis] = -next_stride;
+        dest->strides[raxis] = -next_abs_stride;
       } else {
-        dest->strides[raxis] = next_stride;
+        dest->strides[raxis] = next_abs_stride;
       }
       next_abs_stride *= this_dim;
     }
   }
+  SetUnusedDimsAndStrides(num_axes, dest);
+  dest->num_axes = num_axes;
   dest->offset = offset;
-  KALDI_PARANOID_ASSERT(IsContiguousAndJustified(*dest) &&
-                        IsValid(*dest));
+  SetDefaultCodeAndProperties(dest);
+
+  KALDI_PARANOID_ASSERT(IsCompactAndJustified(*dest) &&
+                        IsValid(*dest) && SameDims(src, *dest));
+}
+
+
+void MakeCompactNonnegativeAndJustified(const TensorPattern &src,
+                                        TensorPattern *dest) {
+  KALDI_PARANOID_ASSERT(src.IsValid());
+  int32 num_axes = src.num_axes;
+
+  // The sorter object provides an order in which we can visit the axes of 'src'
+  // that is from least to greatest abs(stride).
+  OutOfPlaceAxisSorter sorter(src);
+
+  int32 next_stride = 1;  // stride to give to the next axis with nonzero stride
+  for (int32 i = 0; i < num_axes; i++) {
+    int32 raxis = sorter.GetIndex(i);
+    // We are going through the raxis-indexes in increasing order of stride.
+    // We'll set each stride to the product of the preceding dims.
+    int32 this_stride = src.strides[raxis],
+        this_dim = src.dims[raxis];
+    dest->dims[raxis] = this_dim;
+    if (this_stride == 0) {
+      dest->strides[raxis] = 0;
+      // Note: if 'src' is valid, this implies the dim is 1,
+      // so no need to multiply 'next_stride'
+    } else {
+      dest->strides[raxis] = next_stride;
+      next_stride *= this_dim;  // advance the running product of dims
+    }
+  }
+  SetUnusedDimsAndStrides(num_axes, dest);
+  dest->num_axes = num_axes;
+  dest->offset = 0;  // all strides are >= 0, so offset 0 makes it justified
+  SetDefaultCodeAndProperties(dest);
+  KALDI_PARANOID_ASSERT(IsCompactAndJustified(*dest) &&
+                        HasNonnegativeStrides(*dest) &&
+                        IsValid(*dest) && SameDims(src, *dest));
+}
+
+
+
+void MakeCompactNormalizedAndJustified(const TensorPattern &src,
+                                       TensorPattern *dest) {
+  KALDI_PARANOID_ASSERT(src.IsValid());
+  const int32 num_axes = src.num_axes;
+
+  // Running product of the dims of the strided axes visited so far; this is
+  // the stride that the next strided axis receives (private numbering).
+  int32 stride_so_far = 1;
+  for (int32 r = 0; r < num_axes; ++r) {
+    const int32 dim = src.dims[r];
+    const bool has_stride = (src.strides[r] != 0);
+    dest->dims[r] = dim;
+    // A zero stride in a valid source implies dim == 1, so such an axis
+    // neither receives a stride nor advances the running product.
+    dest->strides[r] = (has_stride ? stride_so_far : 0);
+    if (has_stride)
+      stride_so_far *= dim;
+  }
+  SetUnusedDimsAndStrides(num_axes, dest);
+  dest->num_axes = num_axes;
+  dest->offset = 0;
+  SetDefaultCodeAndProperties(dest);
+  KALDI_PARANOID_ASSERT(IsCompactAndJustified(*dest) &&
+                        HasNormalizedStrides(*dest) &&
+                        IsValid(*dest) && SameDims(src, *dest));
+}
 
 
 
+
+
 }  // namespace kaldi
 }  // namespace tensor
diff --git a/src/tensor/tensor-pattern-extra-utils.h b/src/tensor/tensor-pattern-extra-utils.h
index 1701eb8d6d4..b784dd6ee31 100644
--- a/src/tensor/tensor-pattern-extra-utils.h
+++ b/src/tensor/tensor-pattern-extra-utils.h
@@ -72,6 +72,28 @@ bool PatternsIntersectSlow(const TensorPattern &pattern1,
 int32 PatternIncludes(const TensorPattern &pattern1,
                       const TensorPattern &pattern2);
 
+/**
+   Inline helper: resets every axis beyond the ones in use, i.e. each raxis
+   with num_axes <= raxis < KALDI_TENSOR_MAX_DIM, to stride=0 and dim=1.
+ */
+inline void SetUnusedDimsAndStrides(int32 num_axes,
+                                    TensorPattern *dest) {
+#pragma unroll(2)
+  for (int32 unused = num_axes; unused < KALDI_TENSOR_MAX_DIM; ++unused) {
+    dest->strides[unused] = 0;
+    dest->dims[unused] = 1;
+  }
+}
+
+/**
+   Inline helper that sets dest->properties = 0 and dest->code = -1;
+   handy in functions that create or modify patterns.
+ */
+inline void SetDefaultCodeAndProperties(TensorPattern *dest) {
+  dest->properties = 0;
+  dest->code = -1;
+}
+
 
 /**
    Returns true if the two patterns are equivalent in the sense that their
@@ -157,6 +179,21 @@ bool PatternsIntersect(const TensorPattern &pattern1,
 bool PatternContains(const TensorPattern &pattern,
                      int64 mindex);
 
+
+/**
+   Returns true if the memory-index-set of pattern p is a subset
+   of the memory-index-set of pattern q.
+
+      @param [in] p   First pattern; must be valid.
+      @param [in] q   Second pattern; must be valid.
+      @return   Returns true if memory-index-set of p is a subset of
+                the memory-index-set of q (see tensor-pattern.h for the
+                definition of memory-index-set).
+ */
+bool PatternIsSubsetOf(const TensorPattern &p,
+                       const TensorPattern &q);
+
+
 /**
    Compute the minimum and maximum memory-indexs present in
    a pattern's memory-index-set (i.e. the minimum and maximum
@@ -233,6 +270,17 @@ bool ToMemoryIndexTupleSet(const ArrayRef<TensorPattern*>  patterns,
 bool PatternTuplesEquivalent(const ArrayRef<const TensorPattern*> patterns1,
                              const ArrayRef<const TensorPattern*> patterns2);
 
+/**
+   Returns true if TensorPattern p is linear in TensorPattern q.  (Note:
+   this is a rather technical property, see tensor-pattern.h for definition).
+
+      @param [in] p  The first pattern.  Must be valid
+      @param [in] q  The second pattern.  Must be valid and must satisfy
+                     `PatternIsSubsetOf(p, q);`
+ */
+bool IsLinearIn(const TensorPattern &p,
+                const TensorPattern &q);
+
 /**
    This function returns true if a Pattern is regular (see Regularity property
    in the glossary in tensor-pattern.h) and false otherwise.  'pattern' must
@@ -244,15 +292,15 @@ bool IsRegular(const TensorPattern &pattern);
 
 /**
    This function returns true if a Pattern is valid-1 (see definition in
-   glossary); see also TensorPattern::Valid() and IsValidMM().
+   glossary); see also TensorPattern::Valid() and IsValid2().
  */
-bool IsValidM(const TensorPattern &pattern);
+bool IsValid1(const TensorPattern &pattern);
 
 /**
    This function returns true if a Pattern is valid-2 (see definition in
-   glossary); see also TensorPattern::Valid() and IsValidM().
+   glossary); see also TensorPattern::Valid() and IsValid1().
  */
-bool IsValidMM(const TensorPattern &pattern);
+bool IsValid2(const TensorPattern &pattern);
 
 
 /**
@@ -317,13 +365,62 @@ bool ConvertPatternStrides(const TensorPattern &pattern,
 
          @param [in] src  The source pattern.  Must be valid.
          @param [out] dest  The destination pattern.  Will be identical
-                        to `src` if `ContiguousAndJustified(src)`, else
+                        to `src` if `CompactAndJustified(src)`, else
                         will have the relationship explained above.
-                        Will satisfy `ContiguousAndJustified(*dest)`,
+                        Will satisfy `CompactAndJustified(*dest)`,
                         and also `IsValid(*dest)`, assuming `IsValid(src)`.
  */
-void MakeContiguousAndJustified(const TensorPattern &src,
-                                TensorPattern *dest);
+void MakeCompactAndJustified(const TensorPattern &src,
+                             TensorPattern *dest);
+
+
+/**
+   This function possibly modifies the offset of the pattern `p`
+   so that it will be justified (meaning: lowest-numbered
+   memory-index equals zero).
+
+     @param [in,out] p    A Pattern, must be valid at entry
+                         (`p->IsValid()`).  At exit, will be
+                         valid and also justified (`IsJustified(p)`).
+ */
+void MakeJustified(TensorPattern *p);
+
+
+/**
+   This function copies the TensorPattern 'src' to 'dest', preserving the
+   num_axes and dims while possibly modifying the strides and offset.  The
+   strides of 'dest' will be normalized (i.e. nonnegative with positive strides
+   strictly increasing in the private axis-numbering), the pattern will be
+   compact (no gaps) and the offset will be set to zero (making the pattern
+   justified, since strides are nonnegative).
+
+       @param [in] src  The source pattern.  Must be valid.
+       @param [out] dest  The destination pattern.  Will share
+                      num_axes and dims with src, but the strides
+                      will be normalized, the pattern will be compact
+                      (no gaps between memory-indexes) and offset will be 0.
+ */
+void MakeCompactNormalizedAndJustified(const TensorPattern &src,
+                                       TensorPattern *dest);
+
+
+/**
+   This function copies the TensorPattern 'src' to 'dest', preserving the
+   num_axes and dims while possibly modifying the strides and offset.  The
+   strides of 'dest' will be nonnegative but the ordering from least to greatest
+   of the nonzero strides will be the same as the ordering of the absolute
+   values of the strides in 'src'.  The output pattern will be compact (no gaps)
+   and justified (meaning offset == 0, since the strides will be nonnegative).
+
+       @param [in] src  The source pattern.  Must be valid.
+       @param [out] dest  The destination pattern.  Will share
+                  num_axes and dims with src, but the strides and
+                  offset may be different.
+*/
+void MakeCompactNonnegativeAndJustified(const TensorPattern &src,
+                                        TensorPattern *dest);
+
+
 
 
 /**
diff --git a/src/tensor/tensor-pattern-utils.h b/src/tensor/tensor-pattern-utils.h
index d2643f54f87..3df482d99a3 100644
--- a/src/tensor/tensor-pattern-utils.h
+++ b/src/tensor/tensor-pattern-utils.h
@@ -384,9 +384,9 @@ inline void Squeeze(int32 axis, TensorPattern *p) {
      after padding their dims on the left with ones to make them
      have the same num-axes, corresponding dimensions are either
      identical or 1).  The previous sentence is written in terms
-     of the public numbering; in the private numbering it just means
+     of the public numbering; in the private numbering it just means:
      for each index `raxis` into the dims vector,
-     either `a.dims[raxis] == b.dims[raxis]`, or one of them si 1.
+     either `a.dims[raxis] == b.dims[raxis]`, or one of them is 1.
 
        @param [in] a  The pattern of the first Tensor
        @param [in] b  The pattern of the second Tensor
@@ -424,28 +424,48 @@ bool Broadcastable(const TensorPattern &a, const TensorPattern &b,
 
 
 /**
-   Returns true if the 'dims' vectors of a and b are the same.
-   Does not require the number of axes to be the same, so effectively
-   it's testing that the dims are the same after padding on the left
-   with dim=1 (here referring to the public, non-reversed numbering
-   of the dims).
+   Returns true if the dims-vectors of a and b are the same after padding as for
+   broadcasting.  See definition of "Dims-vector of a Pattern" in
+   tensor-pattern.h, and the entry for "PyTorch-style broadcasting".  What this
+   means in terms of the physical storage of the patterns is that a->dims and
+   b->dims contain the same elements, without requiring the num_axes to be the
+   same.
 
    This is a stronger condition than Broadcastable(a, b).
- */
-bool SameDim(const TensorPattern &a, const TensorPattern &b);
+         @param [in] a  The first pattern.  Must be valid.
+         @param [in] b  The second pattern.  Must be valid.
+         @return      Return true if the dims-vectors of
+                      a and b are the same after padding as for broadcasting.
+   See also the 3-arg version of SamePaddedDims(), and SameDims().
+*/
+bool SamePaddedDims(const TensorPattern &a, const TensorPattern &b);
 
 
 /**
-   Returns true if the 'dims' vectors of a, b and c are all the same.
-   Does not require the number of axes to be the same, so effectively
-   it's testing that the dims are the same after padding on the left
-   with dim=1 (here referring to the public, non-reversed numbering
-   of the dims).
+   Returns true if the 'dims' vectors of a, b and c are all the same
+   after padding with 1's on the left (in the public numbering) to
+   make the dims the same.  Equivalent to
+   SamePaddedDims(a, b) && SamePaddedDims(b, c).
 
    This is a stronger condition than Broadcastable(a, b, c).
  */
-bool SameDim(const TensorPattern &a, const TensorPattern &b,
-             const TensorPattern &c);
+bool SamePaddedDims(const TensorPattern &a, const TensorPattern &b,
+                    const TensorPattern &c);
+
+/**
+   Return true if the two provided patterns have the same dims-vectors
+   (meaning, effectively the same num_axes and the same dim for each
+   axis; see "Dims-vector" in tensor-pattern.h).
+
+      @param [in] a  The first pattern.  Must be valid.
+      @param [in] b  The second pattern.  Must be valid.
+      @return        Returns true if a.num_axes == b.num_axes and
+                     the elements of their 'dims' members are the same.
+   See also: SamePaddedDims().
+*/
+bool SameDims(const TensorPattern &a, const TensorPattern &b);
+
+
 
 
 /**
@@ -786,13 +806,15 @@ void HasCStrides(const TensorPattern &pattern);
 bool PatternsOverlap(const TensorPattern &pattern1,
                      const TensorPattern &pattern2);
 
+
+
 /**
    Returns true if the memory-index-set of this pattern forms a contiguous
    range, otherwise false.  (Note: this is not the same as PyTorch's notion of
    contiguous; see HasCStrides()).  Caution: the interface may later be changed
    to allow caching of this property in the 'properties' field.
 */
-bool IsContiguous(const TensorPattern &pattern);
+bool IsCompact(const TensorPattern &pattern);
 
 
 /**
@@ -804,10 +826,23 @@ bool IsJustified(const TensorPattern &pattern);
 
 
 /**
-   This is the same is IsContiguous(pattern) &&
-   StartsFromZero(pattern).
+   This is the same as IsCompact(pattern) &&
+   IsJustified(pattern).
+*/
+bool IsCompactAndJustified(const TensorPattern &pattern);
+
+/**
+   Returns true if 'pattern' has normalized strides as defined
+   in tensor-pattern.h (i.e.: strides are nonnegative and
+   the nonzero ones are in strictly increasing order in the
+   private numbering / decreasing in the public).
+*/
+bool HasNormalizedStrides(const TensorPattern &pattern);
+
+/**
+   Returns true if all the strides in 'pattern' are nonnegative.
 */
-bool IsContiguousAndJustified(const TensorPattern &pattern);
+bool HasNonnegativeStrides(const TensorPattern &pattern);
 
 
 
diff --git a/src/tensor/tensor-pattern.h b/src/tensor/tensor-pattern.h
index 971c4ca7a4a..efd28388ba2 100644
--- a/src/tensor/tensor-pattern.h
+++ b/src/tensor/tensor-pattern.h
@@ -96,9 +96,22 @@ namespace tensor {
                      stride, which is consistent with "C" strides).  See
                      CanonicalizePattern().
 
-    Contiguous:      A Pattern is contiguous if its memory-index-set forms a contiguous
-                     range of integers (no gaps).  This is different from the PyTorch
-                     definition of 'contiguous', which also requires C-style strides.
+    Compact:         A Pattern is compact if its memory-index-set forms a contiguous
+                     range of integers (no gaps).  (We don't call this "contiguous"
+                     because PyTorch uses the same word with a different meaning).
+
+    Default strides:  The default strides for a pattern with provided dimensions are:
+                     of course, zero for any axis with dim=1; and otherwise (describing
+                     it in the public numbering of axes), each axis's stride is
+                     the product of the later-numbered axes' dims.  It corresponds
+                     to the strides of a "C" array.
+                     This is the policy that we will use when constructing new
+                     Tensors if only the dims are provided, which is why we call these
+                     the default strides.
+                     A Pattern having default strides is equivalent to its having
+                     normalized strides and also being compact.
+
+                     See also: Normalized strides, Compact.
 
     Dereferencing a memory-index:
                      Sometimes in formal explanations of algorithms we will use notation
@@ -124,14 +137,15 @@ namespace tensor {
     Disjoint Patterns:  When we speak of disjoint Patterns we mean that
                     their memory-index-sets are disjoint; see memory-index-set.
 
-    Eaxis-index:      We use the term Eaxis-index (meaning: extended axis-index), or,
-                      in code, eaxis_index, to mean an axis-index in the public
-                      numbering (c.f.: Axis-index) but where negative values are
-                      allowed, as in Python.  Negative values are interpreted as
-                      offsets from the num_axes of the Pattern in question, so for
-                      instance -1 would correspond to num_axes - 1.
-                      Valid eaxis-indexes would be in the range [-num_axes, num_axes - 1].
-                      See also: Axis-index, Raxis-index.
+    Eaxis-index / extended axis-index:
+                      We use the term Eaxis-index, or in code, eaxis_index, to
+                      mean an axis-index in the public numbering (c.f.:
+                      Axis-index) but where negative values are allowed, as in
+                      Python.  Negative values are interpreted as offsets from
+                      the num_axes of the Pattern in question, so for instance
+                      -1 would correspond to num_axes - 1.  Valid eaxis-indexes
+                      would be in the range [-num_axes, num_axes - 1].  See
+                      also: Axis-index, Raxis-index.
 
     Extended indexing:  A convention whereby if we have a Tensor with, say,
                       `dims = [5 1]`, we can index that Tensor with an index-tuple
@@ -276,6 +290,9 @@ namespace tensor {
                       PyTorch, if an operation is done on two Tensors with
                       dims=[5 6] and dims=[6], the second one would be interpreted
                       as having dims=[1 6].  That is: we pad with 1's on the left.
+                      Note: whenever we refer to broadcasting we include this feature;
+                      this glossary entry exists just to explain it, not to claim
+                      that we have two different versions of broadcasting.
 
     Raxis-index:      We use the term "raxis-index", often just "raxis" for short,
                       to mean the index of an axis in the reversed, private numbering.
@@ -310,12 +327,16 @@ namespace tensor {
                       View the notation M(P, Q) as shorthand for M((P, Q)).
 
     Normalized strides:  We say that a Pattern has normalized strides if the
-                      strides are all positive and are strictly increasing
-                      in the private numbering (which implies strictly decreasing
-                      in the public numbering).  TODO: remove this?
+                      strides are all nonnegative and the nonzero strides
+                      are in strictly increasing order in the private numbering
+                      (hence strictly decreasing in the public numbering).
+
+                      See also: Default strides (which is a stronger property).
 
     Linear property:
-                      Consider Patterns P and Q with the property that the
+                      This is a slightly technical property used in certain
+                      proofs involving patterns.
+                      Consider patterns P and Q with the property that the
                       memory-index-set of P is a subset of the memory-index-set of
                       Q.  If i is an index-tuple, let P(i) be the map from
                       i to a memory-index, and let
diff --git a/src/tensor/tensor-utils.h b/src/tensor/tensor-utils.h
index 1c3eef43165..c2a6e001075 100644
--- a/src/tensor/tensor-utils.h
+++ b/src/tensor/tensor-utils.h
@@ -42,9 +42,10 @@ inline bool Compatible(const Tensor &a, const Tensor &b) {
   and device and are broadcastable; equivalent to
   `Broadcastable(a, b) && Compatible(a, b)`.
 */
-inline bool BroadcastableAndCompatible(const Tensor &a, const Tensor &b) {
+inline bool BroadcastableAndCompatible(const Tensor &a, const Tensor &b,
+                                       bool b_non_reducing = false) {
   return Compatible(*a.impl_, *b.impl_) &&
-      Broadcastable(*a.impl_, *b.impl_);
+      Broadcastable(*a.impl_, *b.impl_, b_non_reducing);
 }
 
 
diff --git a/src/tensor/tensor.cc b/src/tensor/tensor.cc
new file mode 100644
index 00000000000..6720b5808c7
--- /dev/null
+++ b/src/tensor/tensor.cc
@@ -0,0 +1,30 @@
+// tensor/tensor.cc
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tensor/tensor.h"
+
+
+namespace kaldi {
+namespace tensor {
+
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index 3a2ae4f65b0..0c878a64a02 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -109,6 +109,10 @@
              versa.  The granularity of being tracked is at the
             "base variable" level.
 
+   Underlying / memory underlying: For a Tensor or Variable a, the "memory
+             underlying a" means the part of computer memory, accessible through
+             the storage object, that is covered by the pattern of a.
+
    View Variable:  A View Variable is any variable that is not a base
             variable.  Such variables will be views of base Variables that have
             been created from them by some operation such as slicing
@@ -274,7 +278,7 @@ class Tensor {
   /**
      Construct a new Tensor with freshly allocated underlying data with
      the data type, device and dimensions the same as `other`.  The strides
-     will be the same order as 'other' if sp == kCopyStrides.
+     will be the same order as 'other' if sp == kCopyStrideOrder.
 
        @param [in]  meta  The metadata we are copying the dims, device,
                        dtype and possibly strides from
@@ -289,83 +293,33 @@ class Tensor {
        @param [in]  ip   The data initialization policy
   */
   Tensor(const Meta &meta,
-         StridePolicy sp);
-
-
-  /** Construct a Tensor with freshly allocated data.
-       @param [in] dims    The dimensions of the tensor (zero to 5
-                    positive integers).
-       @param [in] dtype   The data type to use
-       @param [in] device  The device to put the data on
-
-       Example:  `Tensor a({3,4,5}, kDoubleDtype, kCpuDevice);`
-   */
-  Tensor(ArrayRef<int32> dims, DataType dtype, Device device);
-
-  /** Construct a Tensor with freshly allocated data, and device ==
-      `GetDefaultDevice().`.
-
-       @param [in] dims    The dimensions of the tensor (zero to 5
-                    positive integers).
-       @param [in] dtype   The data type to use
-
-       Example:  `Tensor a({3,4,5}, kDoubleDtype);`
-   */
-  Tensor(ArrayRef<int32> dims, DataType dtype);
-
-  /** Construct a Tensor with freshly allocated data, data type ==
-      `GetDefaultDtype()`,
-
-       @param [in] dims    The dimensions of the tensor (zero to 5
-                    positive integers).
-       @param [in] device  The device to put the data on
-
-       Example:  `Tensor a({3,4,5}, kCpuDevice);`
-   */
-  Tensor(ArrayRef<int32> dims, Device device);
-
-
-  /** Construct a Tensor with freshly allocated data, data type ==
-      `GetDefaultDtype()`, and device == GetDefaultDevice().
-
-       @param [in] dims    The dimensions of the tensor (zero to 5
-                    positive integers).
-       @param [in] device  The device to put the data on
-
-       Example:  `Tensor a({3,4,5}, kCpuDevice);`
-   */
-  Tensor(ArrayRef<int32> dims);
+         StridePolicy sp): impl_(new TensorImpl(meta, sp)) { }
+
+
+  /** Construct a Tensor with freshly allocated, uninitialized data.
+
+       @param [in] dims    The dimensions of the tensor, up to
+                     KALDI_TENSOR_MAX_DIM positive integers.
+       @param [in] opts    Options regarding data-type and device;
+                           see examples below.
+    Example (note: the braces are braced-initializer-lists)
+<code>
+   Tensor a({3,4});
+   Tensor b({}, kDoubleDtype);
+   Tensor c({5,6,7}, kCpuDevice);
+   Tensor d({1,2}, {kDoubleDtype, kCpuDevice});
+</code>
+  */
+  inline Tensor(ArrayRef<int32> dims,
+                TensorOptions opts = TensorOptions()):
+      impl_(new TensorImpl(dims, opts)) { }
 
 
 
-  /**
-     Construct a Tensor with the dimensions and strides provided.  This differs
-     from the constructor taking `ArrayRef<int32> dims` in that it will use
-     the strides in `pattern` (except that if the data in `pattern` is not
-     contiguous, it will make it contiguous by filling in any gaps).  This means
-     that, for example, if you use this constructor on a 2-dimensional Tensor
-     that has been transposed and thus has a column-major layout, the resulting
-     Tensor will also have a column-major layout.
-
-       @param [in] pattern  The dimension and stride information that
-                  this tensor should match (although we will fill gaps
-                  to make it contiguous)
-       @param [in] dtype   The data type to use
-       @param [in] device  The device to put the data on
-       @param [in] set_zero   If true, set the data to zero.  If false,
-                        the contents will be undefined.
-
-  */
-  Tensor(TensorPattern &pattern, DataType dtype, Device device,
-         InitializePolicy p);
-
   /**
      Construct a Tensor from the metadata in 'meta'.  Requires
      that meta.pattern be contiguous (meaning: literally contiguous,
      not the PyTorch meaning which is a stronger condition).
-     ??Possibly we could make it similar to the constructor above
-       and have it just make it contiguous if it was not.??
-
 
        @param [in] meta  Struct containing the metadata specifying
                      the Tensor's pattern, data-type and device
diff --git a/src/tensor/variable-functions.h b/src/tensor/variable-functions.h
new file mode 100644
index 00000000000..61776f6f4c9
--- /dev/null
+++ b/src/tensor/variable-functions.h
@@ -0,0 +1,96 @@
+// tensor/variable-functions.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_VARIABLE_FUNCTIONS_H_
+#define KALDI_VARIABLE_FUNCTIONS_H_ 1
+
+#include "tensor/tensor.h"
+
+namespace kaldi {
+namespace tensor {
+
+// This file contains functions operating on Variables, mostly functions that
+// return other Variables.
+
+
+/**
+   Return a Variable with all-zero values, with the specified dimensions
+
+       @param [in] dims   Dimensions (in public ordering) of the requested
+                      Tensor.  Must all be positive, with the length of
+                      the list not exceeding KALDI_TENSOR_MAX_DIM = 6
+
+  An example is below.
+<code>
+   Variable scalar = Zeros({});
+   Variable a = Zeros({3,4}, {kDoubleDtype});
+   Variable b = Zeros({1,100}, {kDoubleDtype, kGpuDevice});
+</code>
+  Note on C++: reading the code above may require getting used to C++
+  braced-initializer-lists.  The {3,4} is interpreted as a
+  std::initializer_list<int32> passed to the constructor of ArrayRef; the
+  {kDoubleDtype} is an arg to the constructor of TensorOptions.
+ */
+inline Variable Zeros(ArrayRef<int32> dims,
+                      TensorOptions opts = TensorOptions());
+
+
+Variable Ones(ArrayRef<int32> dims);
+
+
+/**
+   Return a Variable with dimensions 'dims' (presumably uniform random values; TODO: document).
+ */
+Variable RandUniform(ArrayRef<int32> dims);
+
+/**
+   Sum all axes of a Variable and returns a Variable with one element and no
+   axes.
+
+       @param [in]  v   Variable to be summed.
+       @return          The summation; will equal the sum over all
+                        axes of v; will have zero axes, and the same
+                        device and dtype of 'v'.
+Example:
+<code>
+   Variable v = RandUniform({3,4,5});
+   Variable w = Sum(v);
+</code>
+   See also the version of Sum() for which you can specify axes.
+ */
+Variable Sum(const Variable &v);
+
+/**
+   Sum specified axes of a Variable.  The returned Variable will have
+   that many fewer axes.
+
+       @param [in] v      Variable to be summed
+       @param [in] eaxes  The axes to sum over, as eaxis-indexes (see tensor-pattern.h)
+ */
+Variable Sum(const Variable &v, ArrayRef<int32> eaxes);
+
+
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_VARIABLE_FUNCTIONS_H_
diff --git a/src/tensor/variable-inplace.h b/src/tensor/variable-inplace.h
new file mode 100644
index 00000000000..8c6dd219d29
--- /dev/null
+++ b/src/tensor/variable-inplace.h
@@ -0,0 +1,119 @@
+// tensor/variable-inplace.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_VARIABLE_INPLACE_H_
+#define KALDI_VARIABLE_INPLACE_H_ 1
+
+#include "tensor/tensor.h"
+
+namespace kaldi {
+namespace tensor {
+
+// This file contains functions doing various in-place operations on Variables.
+// These functions will usually be called from brief inline member functions
+// within class Variable that just forward the call here.  We do it this way
+// (rather than making the implementation of these functions be
+// member-functions) to keep the code of class Variable relatively concise.
+
+
+
+/**
+   Set all elements of Variable v to scalar value 'a'.
+
+    @param [in] a  Scalar value; can be constructed from
+                   float or double.
+    @param [in,out] v  Variable to set all the values of
+*/
+void Set(Scalar a, Variable *v);
+
+/**
+   Set all elements of Variable v to zero
+      @param [in,out] v   Variable to modify
+ */
+void SetZero(Variable *v);
+
+
+
+
+
+/**
+   Return a Variable with all-zero values, with the specified dimensions
+
+       @param [in] dims   Dimensions (in public ordering) of the requested
+                      Tensor.  Must all be positive, with the length of
+                      the list not exceeding KALDI_TENSOR_MAX_DIM = 6
+
+  An example is below.
+<code>
+   Variable scalar = Zeros({});
+   Variable a = Zeros({3,4}, {kDoubleDtype});
+   Variable b = Zeros({1,100}, {kDoubleDtype, kGpuDevice});
+</code>
+  Note on C++: reading the code above may require getting used to C++
+  braced-initializer-lists.  The {3,4} is interpreted as a
+  std::initializer_list<int32> passed to the constructor of ArrayRef; the
+  {kDoubleDtype} is an arg to the constructor of TensorOptions.
+ */
+inline Variable Zeros(ArrayRef<int32> dims,
+                      TensorOptions opts = TensorOptions());
+
+
+Variable Ones(ArrayRef<int32> dims);
+
+
+/**
+   Return a Tensor with
+ */
+Variable RandUniform(ArrayRef<int32> dims);
+
+/**
+   Sum all axes of a Variable and returns a Variable with one element and no
+   axes.
+
+       @param [in]  v   Variable to be summed.
+       @return          The summation; will equal the sum over all
+                        axes of v; will have zero axes, and the same
+                        device and dtype of 'v'.
+Example:
+<code>
+   Variable v = Rand({3,4,5});
+   Variable w = v.Sum();
+</code>
+   See also the version of Sum() for which you can specify axes.
+ */
+Variable Sum(const Variable &v);
+
+/**
+   Sum specified axes of a Variable.  The returned Variable will have
+   that many fewer axes.
+
+       @param [in] v      Variable to be summed
+       @param [in] eaxes  The axes to sum over, as eaxis-indexes (see tensor-pattern.h)
+ */
+Variable Sum(const Variable &v, ArrayRef<int32> eaxes);
+
+
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_VARIABLE_INPLACE_H_
diff --git a/src/tensor/variable.cc b/src/tensor/variable.cc
index b3ef347a076..37b09de6bb2 100644
--- a/src/tensor/variable.cc
+++ b/src/tensor/variable.cc
@@ -1,4 +1,4 @@
-// variable/variable.cc
+// tensor/variable.cc
 
 // Copyright      2019  Johns Hopkins University (author: Daniel Povey)
 
diff --git a/src/tensor/variable.h b/src/tensor/variable.h
index f8994456caa..357c1881aca 100644
--- a/src/tensor/variable.h
+++ b/src/tensor/variable.h
@@ -195,6 +195,31 @@ class VariableImpl {
   // Variables, its value is undefined.
   bool rebase_grad_;
 
+  // overwrite_ is part of a mechanism that avoids unnecessary zeroing of
+  // parts of derivatives during the backprop phase.  By default we
+  // assume that if we write to a Variable in a way that doesn't
+  // depend on the previous value (e.g. we set it, rather than
+  // add to it or multiply in-place), then the previous memory underlying
+  // that Variable has not previously participated in any operations
+  // requiring derivatives.
+  //
+  // If you are about
+  // to modify a Variable c that *has* previously participated in
+  // operations requiring derivatives, then, instead of, say:
+  //  DoSomethingWith(a, b, &c);
+  // (and let's suppose this operation ignores the previous value of `c`),
+  // you could do:
+  //  DoSomethingWith(a, b, &c.Overwrite());
+  // whereby you assert that the memory underlying this variable may have
+  // previously participated in operations requiring derivative tracking
+  // (and hence we need to do an extra zeroing after the backprop).
+  // The call to Overwrite() sets the `overwrite_` bool, and then
+  // the DoSomethingWith() call should unset it.
+  //
+  // Look at the comment for class InvalidatedDataChecker in change-tracker.h
+  // for more information.
+  bool overwrite_;
+
   // aux_ is basically a collection of less-often-used fields of class VariableImpl;
   // it helps keep the main class uncluttered.
   std::unique_ptr<VariableImplAux> aux_;

From 24a85c9fb655bfdcda7060d4d0298ac25e0bd162 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sat, 4 May 2019 13:20:48 -0400
Subject: [PATCH 033/163] [src] Further progress

---
 src/tensor/storage.h                          |  50 +-
 src/tensor/tensor-common.h                    |  87 ----
 src/tensor/tensor-impl-utils.h                |   6 +
 src/tensor/tensor-impl.h                      |  18 +-
 src/tensor/tensor-pattern-extra-utils.cc      | 459 +++++++++++++++++-
 src/tensor/tensor-pattern-extra-utils.h       |  69 ++-
 src/tensor/tensor-pattern-utils.h             |  11 +
 src/tensor/tensor-pattern.h                   |  67 ++-
 .../{tensor-common.cc => tensor-settings.cc}  |   4 +-
 src/tensor/tensor-settings.h                  | 184 +++++++
 src/tensor/tensor-utils.h                     |   8 +
 src/tensor/tensor.h                           |   9 +-
 src/tensor/variable-functions.h               |  32 ++
 src/tensor/variable.h                         |  37 +-
 14 files changed, 868 insertions(+), 173 deletions(-)
 rename src/tensor/{tensor-common.cc => tensor-settings.cc} (92%)
 create mode 100644 src/tensor/tensor-settings.h

diff --git a/src/tensor/storage.h b/src/tensor/storage.h
index 48505adea69..5980670b74c 100644
--- a/src/tensor/storage.h
+++ b/src/tensor/storage.h
@@ -30,8 +30,8 @@ namespace tensor {
 
 struct StorageAux;
 
-// 'Storage' contains a single allocated region (on CPU or GPU, according
-// to 'device').
+// 'Storage' contains a single allocated region (on CPU or GPU, according to
+// 'device').
 class Storage {
  public:
 
@@ -46,17 +46,31 @@ class Storage {
 
   inline bool Allocated() {  return (data != NULL);  }
 
-  // TODO: we may need a mechanism to automatically zero data when it is
-  // allocated, we have to figure out the right level to do this at.
+
+  // Returns the raw data pointer.
   inline void *Data() {
     if (data) {
       return data;
     } else {
       Allocate();
+      if (zero_upon_allocation_)
+        Zero();
       return data;
     }
   }
 
+  /**
+     This is called from TensorImpl when we call AllowUndefined() on it.
+     It gives the framework a free pass to not do zero-upon-allocation
+     on the part of memory underlying this particular TensorImpl.  It
+     will also cause data_ to be allocated if it was not already allocated.
+  */
+  inline void AllowUndefined(const TensorImpl &impl) {
+    if (data_ == nullptr && zero_upon_allocation_) {
+      Allocate();
+      ZeroEverythingElse(impl);
+    }
+  }
 
   /**
      Creates a Storage object for device 'device' with size 'num_bytes'.
@@ -81,7 +95,6 @@ class Storage {
           @param [in] deallocator A std::function, which, if not nullptr,
                               will be invoked in
    */
-
   Storage(Device device,
           void *data,
           size_t num_bytes,
@@ -94,6 +107,18 @@ class Storage {
   // will never be necessary to call this.
   bool IsAllocated();
 
+
+  /**
+     The user can call this as a low-cost mechanism to (conceptually) zero the
+     data in a storage region.  Rather than physically zeroing the data, it
+     records the intention to zero it as soon as it is allocated (see "Lazy
+     allocation" in tensor.h).  Later on, when the data is allocated, it may
+     actually not have to be zeroed if AllowUndefined() is called.
+  */
+  inline void ZeroUponAllocation() { zero_upon_allocation_ = true; }
+
+
+
   // Deallocates the data.  This is user-callable because our autograd mechanism
   // deletes the underlying data of gradients that are no longer needed, while
   // keeping around the metadata in cases where it is instructed to retain the
@@ -112,12 +137,25 @@ class Storage {
   // Allocate the data.  It is an error to call this if data_ != NULL.
   void Allocate();
 
+  // Zero all the data held here, which is required to have already been
+  // allocated.
+  void Zero();
+
+  // Zero all the data held here *except* possibly the memory region
+  // underlying `impl` (although if it's more convenient, this function is
+  // allowed to zero it; at exit its contents will be undefined).
+  // data_ is required to have already been allocated.
+  void ZeroEverythingElse(const TensorImpl &impl);
+
+
   // 'data_' is either 'nullptr' or the actual data pointer.  Due to lazy allocation,
   // the 'data' pointer will remain NULL until it is actually needed.  Lazy
   // allocation makes it much easier to set up the autograd graph without
   // allocating the memory for the gradients.
   void *data_;
 
+  bool zero_upon_allocation_;
+
   // num_bytes is the number of bytes in the region we have allocated
   // (or are going to allocate).
   size_t num_bytes;
@@ -152,7 +190,7 @@ struct StorageAux {
   // derivatives that have been invalidated are read; read the
   // comment for that class, in memory-checker.h, for complete
   // info.
-  dstd::unique_ptr<InvalidatedDataChecker> invalidated_checker;
+  std::unique_ptr<InvalidatedDataChecker> invalidated_checker;
 
   // 'deallocator' is to be used with external toolkits, for example, to
   // decrease the refcount.  In normal cases it will be nullptr.
diff --git a/src/tensor/tensor-common.h b/src/tensor/tensor-common.h
index 806756acb90..703046645fb 100644
--- a/src/tensor/tensor-common.h
+++ b/src/tensor/tensor-common.h
@@ -59,28 +59,6 @@ struct Device {
 };
 
 
-Device GetDefaultDevice();
-void SetDefaultDevice(Device device);
-
-class WithDeviceAs {
-  // Example:
-  // {
-  //   WithDeviceAs(kCudaDevice);
-  //   // code in this block uses this default.
-  // }
- public:
-  inline WithDeviceAs(Device device):
-      prev_default_(GetDefaultDevice()) {
-    SetDefaultDevice(device);
-  }
-  ~WithDeviceAs() { SetDefaultDevice(prev_default_); }
-
- private:
-  Device prev_default_;
-};
-
-
-
 enum DataType {
   // We will of course later extend this with many more types, including
   // integer types and half-precision floats.
@@ -102,71 +80,6 @@ inline int32 SizeOf(DataType dtype) {
 }
 
 
-DataType GetDefaultDtype();
-void SetDefaultDtype(DataType dtype);
-
-class WithDtypeAs {
-  // Example:
-  // {
-  //   WithDtypeAs(kDoubleDtype);
-  //   // code in this block uses this default.
-  // }
- public:
-  inline WithDtypeAs(DataType dtype):
-      prev_default_(GetDefaultDtype()) {
-    SetDefaultDtype(dtype);
-  }
-  ~WithDtypeAs() { SetDefaultDtype(prev_default_); }
-
- private:
-  DataType prev_default_;
-};
-
-
-
-// struct TensorOptions is used as an arg for some constructors
-// when creating Tensors and Variables; it allows flexibility
-// in specifying the device and/or dtype.  See the examples
-// shown where constructors of Tensor or Variable are declared.
-struct TensorOptions {
-  DataType dtype;
-  Device device;
-
-  TensorOptions(): dtype(GetDefaultDtype()),
-                   device(GetDefaultDevice()) { }
-  TensorOptions(DataType dtype):
-      dtype(dtype), device(GetDefaultDevice()) { }
-  TensorOptions(Device device):
-      dtype(GetDefaultDtype()), device(device) { }
-  TensorOptions(DeviceType device_type):
-      dtype(GetDefaultDtype()), device(device_type) { }
-  TensorOptions(DataType dtype, Device device):
-      dtype(dtype), device(device) { }
-  TensorOptions(DataType dtype, Device device_type):
-      dtype(dtype), device(device_type) { }
-  TensorOptions(const TensorOptions &other):
-      dtype(other.dtype), device(other.device) { }
-};
-
-
-// Global variable, initialized from zero, that is used in GetTick().
-// This is defined in tensor-common.cc.
-extern int64 g_tick_counter;
-
-inline int64 NextTick() { return ++g_tick_counter; }
-
-// ? Remove this?  To be used when you don't want to increment
-// the counter.
-inline int64 CurrentTick() { return g_tick_counter; }
-
-
-// debug_mode activates code that checks for invalidated data in the backprop
-// pass; see "Invalidated:" in glossary in tensor.h.
-extern bool debug_mode;
-inline bool DebugMode() { return debug_mode; }
-inline void SetDebugMode(bool b) { debug_mode = b; }
-
-
 /// Enumeration that says what strides we should choose when allocating
 /// A Tensor.
 enum StridePolicy {
diff --git a/src/tensor/tensor-impl-utils.h b/src/tensor/tensor-impl-utils.h
index 39482280ba4..c1da57ecabc 100644
--- a/src/tensor/tensor-impl-utils.h
+++ b/src/tensor/tensor-impl-utils.h
@@ -85,6 +85,12 @@ inline bool Broadcastable(const TensorImpl &a, const TensorImpl &b,
 void CreateTensorStorage(TensorImpl *impl);
 
 
+/**
+   Returns true if the provided TensorImpl covers the whole of the
+   allocated storage region, i.e. if every byte of the storage region
+   is accessible through `impl`.
+ */
+bool IsWhole(const TensorImpl &impl);
 
 
 /**
diff --git a/src/tensor/tensor-impl.h b/src/tensor/tensor-impl.h
index 7f12b7dace3..7d55bf84378 100644
--- a/src/tensor/tensor-impl.h
+++ b/src/tensor/tensor-impl.h
@@ -83,6 +83,8 @@ struct TensorImpl {
   inline void* GetData() const;
 
 
+
+
   /**
     Returns true if this TensorImpl is valid, false otherwise.
 
@@ -94,8 +96,22 @@ struct TensorImpl {
                 pattern.Valid(), plus checks on dtype and device,
                 plus checks on the storage object if check_storage == true.
   */
-  bool IsValid(bool check_storage = true);
+  bool IsValid(bool check_storage = true) const;
+
 
+  /**
+     This is to be called by users if they are about to do an operation on this
+     Tensor which writes to its underlying memory but does not read from it.
+     It gives the framework a free pass to not zero the part of memory covered
+     by this Tensor, even if it was instructed to zero the entire storage
+     region upon allocation.  Note: calling this will cause the storage region
+     to be allocated if it was not already allocated, so only call this
+     if you are about to actually use the data for something.
+
+     This function is const, like most operations on TensorImpl, because it doesn't
+     change the metadata, only (possibly) the Storage object.
+  */
+  inline void AllowUndefined() const { storage->AllowUndefined(*this); }
 
   const TensorMeta &Meta() const {
     return reinterpret_cast<const TensorMeta&>(*this);
diff --git a/src/tensor/tensor-pattern-extra-utils.cc b/src/tensor/tensor-pattern-extra-utils.cc
index 0b0407fdb33..33f577dd2b3 100644
--- a/src/tensor/tensor-pattern-extra-utils.cc
+++ b/src/tensor/tensor-pattern-extra-utils.cc
@@ -82,8 +82,8 @@ bool IsRegular(const TensorPattern &pattern) {
    This function, called by ConvertPatternStrides(), is not declared in the
    header.  It converts a pattern in canonical form to a Pattern whose strides
    are equal to the provided 'strides' vector, which is valid-2,
-   satisfies the uniqueness property, and has normalized (i.e.
-   positive and increasing) strides.
+   and has normalized (i.e. positive and increasing) strides.
+
 
        @param [in] pattern_in  The input pattern; must be valid and
                                in canonical form.
@@ -98,8 +98,7 @@ bool IsRegular(const TensorPattern &pattern) {
                                equal that of pattern_in; its strides will be
                                equal to 'strides' (including the order, when
                                numbered in the private numbering); it will
-                               be valid-2 and satisfy the uniqueness property;
-                               and it will be linear in pattern_in.
+                               be valid-2, and it will be linear in pattern_in.
 */
 static void ConvertPatternStridesLazily(
     const TensorPattern &pattern_in,
@@ -263,6 +262,170 @@ bool ConvertPatternStrides(
   return true;
 }
 
+
+/**
+   FindOffsetsRecursive() is a utility function that is used in the implementation
+   of FindOffsets().
+
+
+   Suppose we are computing the intersection of the memory-index-sets of pattern1
+   and pattern2, where pattern1 and pattern2 are valid-1 and have identical
+   num_axes and strides.
+
+   For each memory-index m that is in both pattern1 and pattern2, there must be
+   index-tuples i1 and i2 such that pattern1[i1] = pattern2[i2] = m.  We can
+   write this as: pattern1[i + o] = pattern2[i], where o is an offset that's
+   also a tuple (like an index-tuple, but with possibly negative elements).  This
+   function can be thought of as a recursive search for all values of the
+   offset 'o' for which at least one such index-tuple i exists (where
+   i + o is valid for pattern1 and i is valid for pattern2).
+
+
+   =====
+
+   Explanation of the algorithm
+
+   The algorithm for computing the list of potential offsets o is recursive,
+   starting from the last-numbered raxis, which will have the highest
+   stride since the strides are normalized.
+
+   Let s be the vector of strides of the patterns (pattern1 and pattern2
+   have identical strides).  From the equation
+      pattern1[i + o] = pattern2[i]                     (1)
+   (see "Indexing a Pattern" in tensor-pattern.h to understand the notation),
+   we have:
+      pattern1.offset + s . (i + o)  == pattern2.offset + s . i
+   where a `.` with space around it means dot product.
+
+   Simplifying:
+      s . o = pattern2.offset - pattern1.offset.         (2)
+
+   For each raxis r, there are limits on the value of o[r]; these are imposed by
+   the dimensions of the two Tensors.  In Equation (1), for the indexes into the
+   patterns to be valid, i[r] + o[r] must be in [0 .. pattern1.dims[r] - 1]
+   and i[r] must be in [0 .. pattern2.dims[r] - 1].  For at least one such i[r] to
+   exist, we require
+       -pattern2.dims[r] < o[r] < pattern1.dims[r]        (3)
+
+   There is a further limitation on the elements of o that we can obtain using
+   the properties above plus the axis-dominance property.  It's easiest to
+   explain this as a special case for the last axis r = num_axes - 1.
+   Define:
+       l(r) =   \sum_{q < r} s[q] * o[q],
+   so l(r) is the sum of the elements in s . o that come from raxes
+   less than r.  We can use the axis-dominance lemma (see tensor-pattern.h)
+   and the limitation on o[r] proved in the previous paragraph to prove that:
+       -s[r] <  l(r) <  s[r].
+   For the last axis r = num_axes - 1, for Equation (1) to hold, we must have
+      l(r) =  pattern2.offset - pattern1.offset - s[r] * o[r],   (4)
+   so we have the inequality
+     -s[r] <  pattern2.offset - pattern1.offset - s[r] * o[r]  <  s[r]
+   which means we need only consider offsets o[r] where the absolute value of
+   the "remainder" is less than s[r]; there will be at most two.  For
+   axes r < num_axes - 1, if the offsets for higher-numbered r are already known,
+   we just need to subtract the appropriate higher-axis terms from the r.h.s. of
+   (4).  The recursive implementation that finds the possible offset vectors is
+   pretty obvious intuitively so we won't try to explain further.
+
+          @param [in] pattern1  First pattern.  Its offset is
+                       ignored, only the num_axes, dims and strides are read.
+          @param [in] pattern2  Second pattern.  Its offset is ignored,
+                       only the num_axes, dims and strides are read.
+          @param [in] known_offsets     (Note: semantically this is an input;
+                       it is temporarily changed inside the function and
+                       then restored to its previous state).
+                       It is the list of already-known offsets (i.e. the
+                       elements of some members o) but in the public numbering,
+                       so that element 0 corresponds to raxis = num_axes - 1.
+                       This is convenient because this function starts at
+                       the highest-numbered raxis.
+          @param [in] offset_difference   At the top-level call this will
+                       be pattern2.offset - pattern1.offset; deeper in the
+                       call stack this will be the remaining offset after
+                       subtracting the products of stride[r] * o[r] for
+                       higher-numbered r.
+          @param [in] keep_all_offsets   Bool that says whether the user
+                       is interested in all the offsets.  If true we'll
+                       output all valid offsets; if false we may stop
+                       after one.
+          @param [out] offsets_out  A list of offset vectors to be output
+                       (should be empty when called by the user; it will
+                       be appended to).  Each element of (*offsets_out)
+                       will be a vector o, in the private numbering.
+*/
+void FindOffsetsRecursive(const TensorPattern &pattern1,
+                          const TensorPattern &pattern2,
+                          std::vector<int32> *known_offsets,
+                          int64 offset_difference,
+                          bool keep_all_offsets,
+                          std::vector<std::vector<int32> > *offsets_out) {
+  int32 num_axes = pattern1.num_axes,  // will equal pattern2.num_axes
+      raxis = num_axes - 1 - static_cast<int32>(known_offsets->size()),
+      stride = pattern1.strides[raxis],  // will equal pattern2.strides[raxis]
+      dim1 = pattern1.dims[raxis],
+      dim2 = pattern2.dims[raxis];
+  int32 this_offset = offset_difference / stride,
+      remaining_difference = offset_difference - (stride * this_offset);
+  // Note: abs(remaining_difference) will be less than stride.
+
+  if (raxis == 0) {
+    if (remaining_difference == 0 && this_offset > -dim2 && this_offset < dim1) {
+      // Eq. (3) holds on raxis 0 too.  The offset vector is `this_offset` then
+      // the reverse of `known_offsets` (public numbering -> private numbering).
+      offsets_out->resize(offsets_out->size() + 1);  // append an empty vector.
+      offsets_out->back().push_back(this_offset);
+      offsets_out->back().insert(offsets_out->back().end(),
+                                 known_offsets->rbegin(),
+                                 known_offsets->rend());
+      // for i == [0, 0.. ], checking that pattern1[i + o] == pattern2[i].
+      KALDI_PARANOID_ASSERT(IndexPattern(pattern1, offsets_out->back(), false)
+                            == pattern2.offset);
+    }
+    return;
+  } else {
+    known_offsets->push_back(this_offset);
+    if (this_offset > -pattern2.dims[raxis] &&
+        this_offset < pattern1.dims[raxis]) {
+      // if eq. (3) is satisfied..
+      FindOffsetsRecursive(pattern1, pattern2, known_offsets,
+                           remaining_difference, keep_all_offsets,
+                           offsets_out);
+    }
+    if (remaining_difference == 0) {
+      known_offsets->pop_back();
+      return;  // exact division: there is no second candidate on this raxis.
+    }
+    int32 offset_change = (remaining_difference > 0 ? 1 : -1);  // away from zero
+    this_offset += offset_change;
+    remaining_difference -= stride * offset_change;  // flips sign, abs < stride
+    known_offsets->back() = this_offset;
+    if (this_offset > -pattern2.dims[raxis] &&
+        this_offset < pattern1.dims[raxis]) {
+      FindOffsetsRecursive(pattern1, pattern2, known_offsets,
+                           remaining_difference, keep_all_offsets,
+                           offsets_out);
+    }
+    known_offsets->pop_back();
+    return;
+  }
+}
+
+
+
+inline void FindOffsets(const TensorPattern &pattern1,
+                        const TensorPattern &pattern2,
+                        bool keep_all_offsets,
+                        std::vector<std::vector<int32> > *offsets_out) {
+  offsets_out->clear();
+  std::vector<int32> known_offsets;
+  // The callee takes (offset_difference, keep_all_offsets) in that order, and
+  // expects offset_difference == pattern2.offset - pattern1.offset at top level.
+  FindOffsetsRecursive(pattern1, pattern2, &known_offsets,
+                       pattern2.offset - pattern1.offset, keep_all_offsets,
+                       offsets_out);
+}
+
+
 /**
    This recursive function is used to compute the intersection between
    pattern1 and pattern2, which must have identical num_axes and strides,
@@ -286,6 +449,10 @@ bool ConvertPatternStrides(
                                pattern1 and pattern2 have the same index for all
                                raxis >= identical_raxis (and if there was
                                another part, it has been handled separately).
+        @param [in] keep_all_patterns  True if the user actually wants all of
+                               the patterns (as opposed to just caring whether
+                               any exist).  If false, this function may return
+                               early after processing one or more patterns.
         @param [out] patterns_out  The output patterns; this function will
                                append to this location a number (possibly zero)
                                of disjoint valid patterns, each of which is
@@ -313,7 +480,6 @@ void ComputeIntersectionRecursive(const TensorPattern &pattern1,
     if (pattern1.offset == pattern2.offset) {
       size_t cur_size = patterns_out->size();
       patterns_out->resize(cur_size + 1);
-      push_back(pattern1);
       RemoveTrivialAxes(pattern1, &(patterns_out[cur_size]));
     }
     return;
@@ -370,6 +536,9 @@ void ComputeIntersectionRecursive(const TensorPattern &pattern1,
     // Recurse.
     ComputeIntersectionRecursive(pattern1, pattern2, raxis,
                                  keep_all_patterns, patterns_out);
+    if (!keep_all_patterns && !patterns_out->empty())
+      return;  // An optimization if we just want to test if intersection is
+               // nonempty.
   }
 }
 
@@ -389,6 +558,8 @@ bool ComputeIntersection(const TensorPattern &pattern1_in,
   if (num_axes == 0) {
     // Some of the code below with num_axes - 1 would crash
     // in this case, so handle it separately.
+    // Note: for 1-element patterns, if their offsets are
+    // different, they don't intersect.
     if (pattern1.offset == pattern2.offset) {
       intersection->resize(1);
       (*intersection)[0] = pattern1;
@@ -410,31 +581,28 @@ bool ComputeIntersection(const TensorPattern &pattern1_in,
     Pattern &sub_pattern1 = *iter1;
     auto iter2 = patterns2.begin(), end2 = patterns2.end();
 
-    // Below, 'max_mindex1' is not the actual largest mindex in `sub_pattern1`,
+    // Below, 'end_mindex1' is not the actual largest mindex in `sub_pattern1`,
     // but an upper bound on it (in fact, it is strictly greater than it); to
     // prove this we require the axis-dominance property and the fact that the
     // strides are normalized (positive and increasing).  This is part of an
     // optimization to more quickly skip over pairs of patterns that will have
     // empty intersection.
-    int64 min_mindex1 = sub_pattern1.mindex,
-        max_mindex1 = min_mindex1 +
+    int64 begin_mindex1 = sub_pattern1.mindex,
+        end_mindex1 = begin_mindex1 +
         sub_pattern1.strides[num_axes - 1] * sub_pattern1.dims[num_axes - 1];
 
     for (; iter2 != end2; ++iter2) {
       Pattern &sub_pattern2 = *iter2;
       int64 min_mindex2 = sub_pattern2.mindex,
-          max_mindex2 = min_mindex2 +
+          end_mindex2 = min_mindex2 +
           sub_pattern2.strides[num_axes - 1] * sub_pattern2.dims[num_axes - 1];
-      if (min_mindex2 >= max_mindex1 || min_mindex1 >= max_mindex2)
+      if (min_mindex2 >= end_mindex1 || begin_mindex1 >= end_mindex2)
         continue;  //  This is an optimization for efficiency when it's easy to
                    // see that two Patterns won't overlap.
 
       // Here, sub_pattern1 and sub_pattern2 are the sub-pieces of pattern1 and
-      // pattern2 that have been converted to share the same list of strides
-      // (That conversion process may end up splitting patterns into several
-      // pieces, even if it was possible, which is not always; hopefuly there is
-      // just one piece in each case, but there may be more).  The following
-      // call may add elements to 'intersection'.
+      // pattern2 that have been converted to share the same list of strides.
+      // The following call may add elements to 'intersection'.
       ComputeIntersectionRecursive(sub_pattern1, sub_pattern2,
                                    num_axes,
                                    keep_all_patterns,
@@ -488,14 +656,14 @@ bool ToMemoryIndexSet(const TensorPattern &pattern_in,
     num_axes = 1;  // this does the right thing, as there will be dim=1,
                    // stride=0 physically present in the pattern.
 
-  // 'max_mindex' is actually a strict upper bound on the maximum possible
+  // 'end_mindex' is actually a strict upper bound on the maximum possible
   // memory-index, i.e. it is more than the largest possible memory-index.  We
   // rely on the axis-dominance property and also, thanks to the canonical form,
   // the fact that the strides are normalized (sorted and positive).
-  int64 max_mindex = pattern->strides[num_axes - 1] *
+  int64 end_mindex = pattern->strides[num_axes - 1] *
       pattern->dims[num_axes - 1];
   s->clear();
-  s->resize(max_mindex, static_cast<char>(0));
+  s->resize(end_mindex, static_cast<char>(0));
 
   auto recursively_set_elements = [pattern] (int32 raxis, int64 mindex) {
     int32 this_stride = pattern->strides[raxis],
@@ -589,6 +757,261 @@ bool PatternsIntersect(const TensorPattern &pattern1,
   return PatternsIntersectSlow(pattern1, pattern2);
 }
 
+
+
+/**
+   Offsets, and computing intersection of TensorPatterns.
+
+   Suppose we are computing the intersection of the memory-index-sets of pattern1
+   and pattern2.
+
+   For each memory-index m that is in both pattern1 and pattern2, there must be
+   index-tuples i1 and i2 such that pattern1[i1] = pattern2[i2] = m.  We can
+   write this as: pattern1[i] = pattern2[i + o], where o is an offset that's
+   also a tuple (like an index-tuple, but with possibly negative elements).  This
+   function can be thought of as a recursive search for all values of the
+   offset 'o' for which at least one such index m exists.  For each such offset
+   'o' we might end up with a TensorPattern;  and the union of all of these
+   patterns is the intersection of pattern1 and pattern2.
+
+   The algorithm for computing the list of potential offsets o is recursive,
+   starting from the last-numbered raxis, which will have the highest
+   stride since the strides are normalized.
+
+   Let the vector of strides of the patterns (they're the same) be s.
+   From pattern1[i] = pattern2[i + o], we have:
+     pattern1.offset + s . i  == pattern2.offset + s . (i + o)
+   where a `.` with space around it means dot product.
+
+   Simplifying:
+      s . o = pattern1.offset - pattern2.offset.         (1)
+
+   For each raxis r, there are limits on the value of o[r]; these are imposed by
+   the dimensions of the two Tensors.  In the equation pattern1[i] = pattern2[i
+   + o], for the indexes into the patterns to be valid, i[r] must be in
+   [0 .. pattern1.dims[r] - 1] and i[r] + o[r] must be in [0 .. pattern2.dims[r] - 1].
+   For such an i[r] to exist, o[r] must be in the range [-(pattern1.dims[r] - 1)
+   .. pattern2.dims[r] - 1].
+
+   There is a further limitation on the elements of o that we can obtain
+   using the properties above plus the axis-dominance property.  It's easiest
+   to explain this if we let r be num_axes - 1, and define:
+       l(r) =   \sum_{q < r} s[q] * o[q].
+   Here, l(r) represents the sum of the elements in s . o that come from raxes
+   lower than r.  We can use the axis-dominance lemma (see tensor-pattern.h)
+   and the limitation on o[r] proved in the previous paragraph to prove that:
+       -s[r] <  l(r) <  s[r].
+   For the last axis r = num_axes - 1, for the equation (1) to hold, we
+   must have  l(r) =  pattern1.offset - pattern2.offset - s[r] * o[r],
+   so we have the inequality
+     -s[r] <  pattern1.offset - pattern2.offset - s[r] * o[r]  <  s[r]
+   which means we need only consider offsets o[r] where the absolute value of
+   the "remainder" is less than s[r]; there will be at most two.  For an raxis r
+   < num_axes - 1, if the offsets for higher-numbered r are already known we
+   just subtract the appropriate terms from the remainder too.  The recursive
+   implementation that finds the possible offset vectors is pretty obvious
+   intuitively.
+*/
+
+/**
+   This recursive function is used to compute the set-wise difference pattern1 -
+   pattern2 where the two patterns must have identical num_axes and strides,
+   must have normalized strides, and must be valid-1.  The user would call this
+   with identical_raxis == pattern1.num_axes, and the recursion on
+   identical_raxis takes care of the actual implementation.
+
+   Notes on how this works and the math behind it:
+
+
+
+
+
+
+
+Since
+  pattern1 and pattern2 have the same strides, there will be in many cases
+  multiple such pairs of index-tuples (i1, i2) with the same difference
+
+
+        @param [in] pattern1   The first input pattern.  Must be valid-1 and
+                               have normalized strides.
+        @param [in] pattern2   The second input pattern.  Must be valid-1 and
+                               have the same num_axes and strides as pattern1.
+        @param [in] identical_raxis  Let num_axes be the num_axes of pattern1 or
+                               pattern2 (it's the same).  By passing in
+                               a particular value of identical_raxis, the caller
+                               asserts that for all raxis with
+                               identical_raxis <= raxis < num_axes,
+                               `pattern1.dim[raxis] == pattern2.dim[raxis]`;
+                               and furthermore that the caller is only
+                               interested in the part of the overlap for which
+                               pattern1 and pattern2 have the same index for all
+                               raxis >= identical_raxis (and if there was
+                               another part, it has been handled separately).
+        @param [out] patterns_out  The output patterns; this function will
+                               append to this location a number (possibly zero)
+                               of disjoint valid patterns, each of which is
+                               linear in pattern1 and pattern2, the union of whose
+                               memory-index-sets is identical to the difference
+                               of pattern1 and pattern2's memory-index-sets.
+*/
+void ComputeDifferenceRecursive(const TensorPattern &pattern1,
+                                const TensorPattern &pattern2,
+                                int32 identical_raxis,
+                                std::vector<TensorPattern> *patterns_out) {
+  if (identical_raxis == 0) {
+    /*
+      The base-case of the recursion; if we reach here, it means pattern1 and
+      pattern2 have identical dims and strides.  If they have different
+      offsets, that means they are disjoint and so pattern1 itself is
+      the difference; if the offset is the same, they are the same set
+      and so we don't need to output anything. */
+    if (pattern1.offset != pattern2.offset) {
+      size_t cur_size = patterns_out->size();
+      patterns_out->resize(cur_size + 1);
+      RemoveTrivialAxes(pattern1, &((*patterns_out)[cur_size]));
+    }
+    return;
+  }
+  // we'll be modifying the dims and strides on axis 'raxis'.
+  int32 raxis = identical_raxis - 1,
+      stride = pattern1.strides[raxis]; // will be the same in pattern2, and positive.
+  // NOTE(review): pattern1_mod, pattern2_mod and keep_all_patterns are used below
+  // but not declared here; the rest looks unfinished (it calls the intersection).
+  // pattern2_mod's offset is larger (or the same), so we may need to discard
+  // some leading indexes of pattern1_mod (on axis 'raxis'), increasing
+  // pattern1_mod's offset and reducing its dim on this raxis, to get the
+  // offsets closer to being the same.
+
+  // 'min_dim1_discarded' below will be rounded down in the division, and we will
+  // also need to also consider the value that's one larger than that.  We don't
+  // need to consider any other values of 'dim1_discarded' other than these two,
+  // because it's possible to prove that if we recurse with the remaining offset
+  // being greater than 'stride', we would never be able to get to offset=0
+  // without discarding all dims of at least one axis numbered less than raxis.
+  // The proof requires the axis-dominance property (together with normalized
+  // strides).
+  int32 offset_diff = pattern2_mod.offset - pattern1_mod.offset,
+      min_dim1_discarded = offset_diff / stride,
+      max_dim1_discarded = ((offset_diff == min_dim1_discarded * stride) ?
+                            min_dim1_discarded : min_dim1_discarded + 1);
+
+  // Make a copy of the relevant dims, and pattern1's offset, because the
+  // versions in the patterns may get modified in the loop below.
+  int32 pattern1_dim = pattern1_mod.dims[raxis],
+      pattern2_dim = pattern2_mod.dims[raxis],
+      pattern1_offset = pattern1.offset;
+  for (int32 dim1_discarded = min_dim1_discarded;
+       dim1_discarded <= max_dim1_discarded; dim1_discarded++) {
+    pattern1_mod.offset = pattern1_offset + dim1_discarded * stride;
+    int32 new_pattern1_dim = pattern1_dim - dim1_discarded;
+    if (new_pattern1_dim <= 0)
+      continue;  // There's no overlap here.
+    pattern1_mod.dims[raxis] = new_pattern1_dim;
+    // set both dims of pattern1_mod and pattern2_mod to the minimum
+    // of the two dims.
+    if (pattern2_dim > new_pattern1_dim) {
+      pattern2_mod.dims[raxis] = new_pattern1_dim;
+    } else {
+      pattern1_mod.dims[raxis] = pattern2_dim;
+      pattern2_mod.dims[raxis] = pattern2_dim;
+    }
+    // Recurse.
+    ComputeIntersectionRecursive(pattern1, pattern2, raxis,
+                                 keep_all_patterns, patterns_out);
+  }
+}
+
+
+// See documentation in header.
+bool ComputeDifference(const TensorPattern &pattern1_in,
+                       const TensorPattern &pattern2_in,
+                       std::vector<TensorPattern> *difference) {
+  TensorPattern pattern1(pattern1_in),
+      pattern2(pattern2_in);
+  CanonicalizePattern(&pattern1);
+  CanonicalizePattern(&pattern2);
+  std::vector<int32> strides;
+  FindAllStrides(pattern1, pattern2, &strides);
+  int32 num_axes = strides.size();
+  if (num_axes == 0) {
+    // Some of the code below with num_axes - 1 would crash
+    // in this case, so handle it separately.
+    // Note: 1-element patterns with different offsets don't intersect,
+    // so in that case the difference is pattern1 itself.
+    if (pattern1.offset != pattern2.offset) {
+      difference->resize(1);
+      (*difference)[0] = pattern1;
+    } else {
+      difference->clear();
+    }
+    return true;
+  }
+  std::vector<TensorPattern> patterns1, patterns2;
+  patterns1.reserve(8);
+  patterns2.reserve(8);
+  difference->clear();
+  if (!ConvertPatternStrides(pattern1, strides, &patterns1) ||
+      !ConvertPatternStrides(pattern2, strides, &patterns2))
+    return false;
+
+
+  // The algorithm is: first initialize `cur_difference` to
+  // `patterns1`.  Then,
+  // For each member p2 of `patterns2`
+  //   For each member p of cur_difference
+  //      Compute (p - p2), appending the result (as zero or more
+  //      patterns) to next_difference.
+  //   set cur_difference = next_difference and clear next_difference.
+  // Result is in cur_difference.
+  std::vector<TensorPattern> cur_difference, next_difference;
+  cur_difference.swap(patterns1);
+
+  for (auto iter2 = patterns2.begin(); iter2 != patterns2.end(); ++iter2) {
+    const Pattern &sub_pattern2 = *iter2;
+    // Below, 'end_mindex2' is not the actual largest mindex in `sub_pattern2`,
+    // but an upper bound on it (in fact, it is strictly greater than it); to
+    // prove this we require the axis-dominance property and the fact that the
+    // strides are normalized (positive and increasing).  This is part of an
+    // optimization to more quickly skip over pairs of patterns that will have
+    // empty intersection.
+    int64 begin_mindex2 = sub_pattern2.offset,
+        end_mindex2 = begin_mindex2 +
+        sub_pattern2.strides[num_axes - 1] * sub_pattern2.dims[num_axes - 1];
+
+    for (auto iter = cur_difference.begin(); iter != cur_difference.end();
+         ++iter){
+      const Pattern &sub_pattern1 = *iter;
+      // as before, end_mindex1 is strictly greater than the actual largest
+      // mindex.
+      int64 begin_mindex1 = sub_pattern1.offset,
+          end_mindex1 = begin_mindex1 +
+          sub_pattern1.strides[num_axes - 1] * sub_pattern1.dims[num_axes - 1];
+
+      if (begin_mindex2 >= end_mindex1 || begin_mindex1 >= end_mindex2) {
+        //  This is an optimization for efficiency when it's easy to
+        // see that two Patterns won't overlap.  In this case
+        // we don't subtract anything from sub_pattern1.
+        next_difference.push_back(sub_pattern1);
+        continue;
+      }
+
+      // Here, sub_pattern1 and sub_pattern2 are the sub-pieces of pattern1 and
+      // pattern2 that have been converted to share the same list of strides.
+      // The following call may add elements to 'next_difference'.
+      ComputeDifferenceRecursive(sub_pattern1, sub_pattern2,
+                                 num_axes,
+                                 &next_difference);
+    }
+    cur_difference.swap(next_difference);
+    next_difference.clear();
+  }
+  // output to the user-supplied vector `difference`.
+  difference->swap(cur_difference);
+  return true;
+}
+
+
 bool PatternsIntersectSlow(const TensorPattern &pattern1_in,
                            const TensorPattern &pattern2_in) {
   TensorPattern pattern1(pattern1_in),
diff --git a/src/tensor/tensor-pattern-extra-utils.h b/src/tensor/tensor-pattern-extra-utils.h
index b784dd6ee31..7bad15522f2 100644
--- a/src/tensor/tensor-pattern-extra-utils.h
+++ b/src/tensor/tensor-pattern-extra-utils.h
@@ -57,6 +57,46 @@ bool PatternsIntersectSlow(const TensorPattern &pattern1,
                            const TensorPattern &pattern2);
 
 
+/**
+   If i is an index-tuple (in the private numbering) valid for `pattern`,
+   returns the memory-index
+     `m = pattern[i] = pattern.offset + \sum_r i[r] * pattern.strides[r]`.
+   If `check_valid == true` this will crash for i not in the index-tuple-set
+   of `pattern`; if false it will just return the above expression computed
+   for i and not check.
+ */
+int64 IndexPattern(const TensorPattern &pattern,
+                   const std::vector<int32> &i,
+                   bool check_valid = true);
+
+
+/**
+   FindOffsets() is a utility function used in computing pattern intersections
+   and set differences.  We will be using the notation described in "Indexing a
+   Pattern" in tensor-pattern.h.  Let pattern1 and pattern2 be patterns satisfying
+   SameStrides(pattern1, pattern2).  Let n be the num-axes of the patterns.
+   Let Offsets(pattern1, pattern2) be the set of n-tuples o such that there
+   exists an i with pattern1[i + o] = pattern2[i], with of course i + o in the
+   index-tuple-set of pattern1 and i in the index-tuple-set of pattern2.
+   This function outputs the set of such offsets o.
+
+       @param [in] pattern1   First input pattern.  Must be valid-1.
+       @param [in] pattern2   Second input pattern.  Must be valid-1.
+       @param [in] find_all_offsets  True if the user wants all of the
+                          offsets.  If false, this function may save
+                          computation by stopping after one or more
+                          offsets.  (Useful in testing if patterns intersect).
+       @param [out] offsets   The offsets will be written to here in
+                         arbitrary order.  Each offset will be a vector with
+                         size() equal to the num_axes of the patterns; the
+                         elements may be positive or negative.
+ */
+bool FindOffsets(const TensorPattern &pattern1,
+                 const TensorPattern &pattern2,
+                 bool find_all_offsets,
+                 std::vector<std::vector<int32> > *offsets);
+
+
 /**
    Returns information about whether pattern2's memory-index-set is a subset of
    pattern1's memory-index-set.  See glossary in tensor-pattern.h for
@@ -118,8 +158,8 @@ bool PatternsEquivalent(const TensorPattern &pattern1,
    outputs a vector of patterns rather than a single pattern, because this
    intersection may be empty or may not be expressible as a single pattern but
    only as a union of patterns (i.e. a union of the patterns this function
-   outputs).  This function may fail to compute the intersection (see
-   documentation of return status).
+   outputs).  This function may fail to compute the intersection in certain
+   very pathological cases (see documentation of return status).
 
       @param [in] pattern1  The first of the two patterns of which
                         we want the intersection; must be valid.
@@ -153,6 +193,20 @@ bool ComputeIntersection(const TensorPattern &pattern1,
                          bool keep_all_patterns = true);
 
 
+/**
+   This function tries to compute the set-wise difference pattern1 - pattern2:
+   viewed as memory-index-sets, it is trying to compute the set of
+   memory-indexes in pattern1 but not in pattern2.  This is computed as a list
+   of TensorPatterns.  This function may fail to compute the set difference in
+   certain very pathological cases (see documentation of return status).
+*/
+bool ComputeDifference(const TensorPattern &pattern1,
+                       const TensorPattern &pattern2,
+                       std::vector<TensorPattern> *difference);
+
+
+
+
 /**
    This function returns true if the memory-index-sets of pattern1 and pattern2
    have nonempty intersection, and false otherwise.  Requires that
@@ -217,6 +271,7 @@ void ComputeMinAndMaxMindex(const TensorPattern &pattern,
                             int64 *max_mindex);
 
 
+
 /**
    Outputs the memory-index-set corresponding to the pattern 'pattern' to 's'.
    See glossary in tensor-pattern.h for definitions.
@@ -246,20 +301,16 @@ int64 RandomMemoryIndex(const TensorPattern &pattern);
 
 /**
    Outputs the memory-index-tuple-set corresponding to the pattern 'pattern' to
-   's' (see tensor-pattern.h for definition).  For storage in 's', each tuple is
-   converted into a single integer by a hashing function that should keep
-   distinct tuples separate as long as the memory-indexes were not huge.  (We
-   may output the actual tuples at some point in the future if they are ever
-   needed).
+   's' (see tensor-pattern.h for definition).
 
    This function is strictly to be used in debugging code, as it is
    extremely inefficient.
 
       @param [in] pattern  The input pattern
-      @param [out] s   The memory-index-set
+      @param [out] s   The memory-index-tuple-set
  */
 bool ToMemoryIndexTupleSet(const ArrayRef<TensorPattern*>  patterns,
-                           std::unordered_set<int64> *s);
+                           std::unordered_set<std::vector<int32>, VectorHasher> *s);
 
 
 /**
diff --git a/src/tensor/tensor-pattern-utils.h b/src/tensor/tensor-pattern-utils.h
index 3df482d99a3..1725e746b40 100644
--- a/src/tensor/tensor-pattern-utils.h
+++ b/src/tensor/tensor-pattern-utils.h
@@ -465,6 +465,17 @@ bool SamePaddedDims(const TensorPattern &a, const TensorPattern &b,
 */
 bool SameDims(const TensorPattern &a, const TensorPattern &b);
 
+/**
+   Returns true if patterns a and b have the same num_axes and strides.
+   (i.e. the strides, viewed as a vector of dim num_axes, are identical).
+
+      @param [in] a  The first pattern.  Must be valid-2
+      @param [in] b  The second pattern.  Must be valid-2
+      @return        Returns true if a.num_axes == b.num_axes and
+                     the elements of their 'strides' members are the same.
+ */
+bool SameStrides(const TensorPattern &a,
+                 const TensorPattern &b);
 
 
 
diff --git a/src/tensor/tensor-pattern.h b/src/tensor/tensor-pattern.h
index efd28388ba2..54156e5d01b 100644
--- a/src/tensor/tensor-pattern.h
+++ b/src/tensor/tensor-pattern.h
@@ -45,8 +45,9 @@ namespace tensor {
 
     Axis-index:       An axis-index of a Pattern or Tensor (sometimes just "axis" for short,
                       especially in code) is an index that identifies an axis in the
-                      public (see "Public numbering").  A valid axis-index for a Pattern
-                      with `num_axes` axes is in the range [0, num_axes - 1].
+                      public numbering (see "Public numbering").  A valid
+                      axis-index for a Pattern with `num_axes` axes is in the
+                      range [0, num_axes - 1].
 
                       For an axis-index i, the corresponding raxis-index (c.f. "Raxis-index:"
                       or "Private numbering:") would be num_axes - 1 - i.
@@ -54,15 +55,23 @@ namespace tensor {
                       See also "Eaxis-index" for where we allow negative axis-indexes
                       as offsets from the end.
 
-    axis-dominance property: search below for [Valid Pattern], point (vi), for the main
+    axis-dominance property: search below for [Valid Pattern], point (v), for the main
                       definition.
           [axis-dominance property of an axis-index]:
                       There is another sense in which we use the term
                       'axis-dominance property': for a Pattern whose axes are sorted
                       from least to greatest abs(stride) [in the private numbering],
-                      we say that "the axis-dominance property holds for axis-index i
+                      we say that "the axis-dominance property holds for axis-index r
                       of that Pattern" if:
-                                 dim(i) * abs(stride(i)) <= abs(stride(i+1)).
+                                 dim(r) * abs(stride(r)) <= abs(stride(r+1)).
+          [axis-dominance lemma]
+                      The axis-dominance lemma, which we won't prove here
+                      as it's pretty obvious, is something you would need
+                      when showing that axis-dominance implies uniqueness.  It
+                      states that, given the axis-dominance property, for
+                      any 0 <= r < num_axes,
+                          (\sum_{q < r} (dim(q) - 1) * abs(stride(q)))  <  abs(stride(r)).
+
 
 
     Broadcasting:    A convention whereby for an operation on Tensors that would
@@ -76,25 +85,26 @@ namespace tensor {
                      or possibly some other appropriate reduction instead of making
                      copies.  This is different from other toolkits (the fact that
                      we extend the concept of broadcasting to encompass summation).
-                     See also: PyTorch-style broadcasting, extended indexing.
+                     See also: Broadcastable (which has a more precise definition);
+                     PyTorch-style broadcasting, extended indexing.
 
     Broadcastable:   See documentation for function Broadcastable() in pattern-utils.h.
-                     Briefly, two Patterns are broadcastable if their dims (padded
-                     as necessary on the left by 1's to make them the same size)
+                     Explaining it in terms of the public numbering: two
+                     Patterns are broadcastable if their dims (padded as
+                     necessary on the left by 1's to make them the same size)
                      are, for each axis, either the same or one of them is 1.
-                     So for example, comparing ([ 3 4 ], [4]), we first
-                     pad on the left to get ([3 4], [1 4]); then we say they
-                     are broadcastable because 4 == 4 and in the remaining axis,
-                     one of the dimensions is 1.
+                     So for example, comparing ([ 3 4 ], [4]), we first pad on
+                     the left to get ([3 4], [1 4]); then we say they are
+                     broadcastable because 4 == 4 and in the remaining axis, one
+                     of the dimensions is 1.
 
     Canonical form:  A TensorPattern is in canonical form if all pairs of axes that
                      could be combined (without affecting its memory-index-set)
-                     have been combined, where there are no trivial axes, all
-                     strides are positive, and the axes are sorted in increasing
-                     order of stride.  (Note: this is in the private numbering;
-                     in the public numbering this means decreasing order of
-                     stride, which is consistent with "C" strides).  See
-                     CanonicalizePattern().
+                     have been combined; where there are no trivial axes; all
+                     strides are positive; and the axes are sorted in an order
+                     of stride that's increasing in the private numbering /
+                     decreasing in the public numbering.
+                     See CanonicalizePattern().
 
     Compact:         A Pattern is compact if its memory-index-set forms a contiguous
                      range of integers (no gaps).  (We don't call this "contiguous"
@@ -420,8 +430,9 @@ namespace tensor {
                           (v) the axis-dominance property.   This property is sufficient, but not
                               necessary, to ensure the uniqueness property.  It requires that
                               when the axes are sorted from least to greatest value of abs(stride),
-                              for each axis-index 0 <= i < num_axes - 1:
-                                    dim(i) * abs(stride(i)) <= abs(stride(i+1)).
+                              for each axis-index 0 <= r < num_axes - 1 (using the private numbering
+                              of axis-indexes),
+                                    dim(r) * abs(stride(r)) <= abs(stride(r+1)).
                               (Note: this property doesn't require that the axes be sorted that
                               way; if you need that, search for "Canonical form").
                           (vi) the strides must be zero for axes with dim=1.
@@ -430,11 +441,21 @@ namespace tensor {
      Valid-1 Pattern:
                       A Pattern is valid-1 (read as: valid minus one) if it
                       satisfies properties (i) through (v) of a valid Pattern
-                      (i.e. it may have nonzero strides for axes with dim=1).  A
-                      valid pattern is also valid-1.
+                      (i.e. it may have nonzero strides for axes with dim=1, but
+                      must otherwise be valid).  A valid pattern is also valid-1.
+
      Valid-2 Pattern:
                       A Pattern is valid-2 (read as valid minus two) if it
-                      satisfies properties (i) through (iv) of a valid Pattern.
+                      satisfies properties (i) through (iv) of a valid Pattern
+                      and satisfies the uniqueness property.  That is, it must
+                      be a valid Pattern, except:
+                      it may have nonzero strides for axes with dim=1, since
+                      we don't require property (vi); and it does not have to
+                      satisfy the axis-dominance property (property (v)).
+                      However, it must still satisfy the uniqueness property
+                      (see its glossary entry); we don't normally explicitly
+                      require the uniqueness property because it is implied by
+                      the axis-dominance property.
                       A pattern that is valid or valid-1 is also valid-2.
  */
 
diff --git a/src/tensor/tensor-common.cc b/src/tensor/tensor-settings.cc
similarity index 92%
rename from src/tensor/tensor-common.cc
rename to src/tensor/tensor-settings.cc
index 8bcb4b22093..d67ed7bb388 100644
--- a/src/tensor/tensor-common.cc
+++ b/src/tensor/tensor-settings.cc
@@ -1,4 +1,4 @@
-// tensor/tensor-common.cc
+// tensor/tensor-settings.cc
 
 // Copyright      2019  Johns Hopkins University (author: Daniel Povey)
 
@@ -17,7 +17,7 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#include "tensor/tensor-common.h"
+#include "tensor/tensor-settings.h"
 
 
 namespace kaldi {
diff --git a/src/tensor/tensor-settings.h b/src/tensor/tensor-settings.h
new file mode 100644
index 00000000000..02bf02cd96c
--- /dev/null
+++ b/src/tensor/tensor-settings.h
@@ -0,0 +1,184 @@
+// tensor/tensor-settings.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_TENSOR_SETTINGS_H_
+#define KALDI_TENSOR_TENSOR_SETTINGS_H_ 1
+
+#include <cstdint>
+#include <vector>
+#include <string>
+#include "tensor/tensor-common.h"
+
+
+/**
+   This file contains certain mechanisms to set settings about default
+   data types and devices within scopes, some related things like
+   an equivalent of PyTorch's .no_grad().  Also the `Tick()` mechanism
+   is here.
+*/
+
+namespace kaldi {
+namespace tensor {
+
+
+Device GetDefaultDevice();
+void SetDefaultDevice(Device device);
+
+// Mechanism to set the default device within a scope by constructing a variable
+// that exists only within that scope.
+class WithDeviceAs {
+ public:
+  // Example:
+  // {
+  //   WithDeviceAs _(kCudaDevice);
+  //   // code in this block uses this default.  the variable
+  //   // name is _ because we don't need to access it.
+  // }
+  inline WithDeviceAs(DeviceType device_type):
+      prev_default_(GetDefaultDevice()) {
+    SetDefaultDevice(Device(device_type));
+  }
+  inline WithDeviceAs(Device device):
+      prev_default_(GetDefaultDevice()) {
+    SetDefaultDevice(device);
+  }
+  ~WithDeviceAs() { SetDefaultDevice(prev_default_); }
+
+ private:
+  Device prev_default_;
+};
+
+
+
+DataType GetDefaultDtype();
+void SetDefaultDtype(DataType dtype);
+
+class WithDtypeAs {
+ public:
+  // Example:
+  // {
+  //   WithDtypeAs _(kDoubleDtype);
+  //   // code in this block uses this default.  the variable
+  //   // name is _ because we don't need to access it.
+  // }
+  inline WithDtypeAs(DataType dtype):
+      prev_default_(GetDefaultDtype()) {
+    SetDefaultDtype(dtype);
+  }
+  ~WithDtypeAs() { SetDefaultDtype(prev_default_); }
+
+ private:
+  DataType prev_default_;
+};
+
+
+
+// struct TensorOptions is used as an arg for some constructors
+// when creating Tensors and Variables; it allows flexibility
+// in specifying the device and/or dtype.  See the examples
+// shown where constructors of Tensor or Variable are declared.
+struct TensorOptions {
+  DataType dtype;
+  Device device;
+
+  // Any field that is not supplied explicitly is taken from the current
+  // default (GetDefaultDtype() / GetDefaultDevice()) at construction time.
+  TensorOptions(): dtype(GetDefaultDtype()), device(GetDefaultDevice()) { }
+  TensorOptions(DataType dtype): dtype(dtype), device(GetDefaultDevice()) { }
+  TensorOptions(Device device): dtype(GetDefaultDtype()), device(device) { }
+  TensorOptions(DeviceType device_type):
+      dtype(GetDefaultDtype()), device(device_type) { }
+  TensorOptions(DataType dtype, Device device):
+      dtype(dtype), device(device) { }
+  // Takes a DeviceType, so it does not redeclare the (DataType, Device) ctor.
+  TensorOptions(DataType dtype, DeviceType device_type):
+      dtype(dtype), device(device_type) { }
+  TensorOptions(const TensorOptions &other):
+      dtype(other.dtype), device(other.device) { }
+};
+
+
+// Global variable, initialized to zero, that is used in NextTick().
+// This is defined in tensor-settings.cc.
+extern int64 g_tick_counter;
+inline int64 NextTick() { return ++g_tick_counter; }
+
+
+// debug_mode activates code that checks for invalidated data in the backprop
+// pass; see "Invalidated:" in glossary in tensor.h.
+// Don't access this variable directly; use DebugMode() and SetDebugMode().
+extern thread_local bool debug_mode;
+inline bool DebugMode() { return debug_mode; }
+inline void SetDebugMode(bool b) { debug_mode = b; }
+
+class WithDebugModeAs {
+ public:
+  // Example:
+  // {
+  //   WithDebugModeAs _(true);
+  //   // code in this block uses debug mode.
+  //   // variable name is _ because we won't use it.
+  // }
+  inline WithDebugModeAs(bool b):
+      prev_default_(DebugMode()) {
+    SetDebugMode(b);
+  }
+  ~WithDebugModeAs() { SetDebugMode(prev_default_); }
+
+ private:
+  bool prev_default_;
+};
+
+
+
+// allow_grad means that gradient tracking is allowed; allow_grad = true
+// is the normal case, and means that if gradient tracking is required
+// (e.g. if the user created a Variable with requires_grad = true, and we do
+// operations that depend on it), then we'll track gradients.
+// It is our way to implement an equivalent of PyTorch's `with torch.no_grad()`.
+// Do not access this variable directly; use AllowGrad() and SetAllowGrad().
+extern thread_local bool allow_grad;
+inline bool AllowGrad() { return allow_grad; }
+inline void SetAllowGrad(bool b) { allow_grad = b; }
+
+
+class WithNoGrad {
+ public:
+  // Example:
+  // {
+  //   WithNoGrad _;
+  //   // code in this block has gradient tracking disabled.
+  //   // variable name is _ because we won't use it.
+  //
+  // }
+  inline WithNoGrad():
+      prev_default_(AllowGrad()) {
+    SetAllowGrad(false);
+  }
+  ~WithNoGrad() { SetAllowGrad(prev_default_); }
+ private:
+  bool prev_default_;
+};
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_TENSOR_SETTINGS_H_
diff --git a/src/tensor/tensor-utils.h b/src/tensor/tensor-utils.h
index c2a6e001075..5e810a7343f 100644
--- a/src/tensor/tensor-utils.h
+++ b/src/tensor/tensor-utils.h
@@ -54,6 +54,14 @@ inline bool Overlap(const Tensor &a, const Tensor &b) {
 }
 
 
+/**
+   Returns true if the Tensor t covers its entire allocated storage region,
+   meaning every byte of the storage region is accessible through t.
+*/
+inline bool IsWhole(const Tensor &t) {
+  return IsWhole(*t.impl_);
+}
+
 /*
   This function returns true if a, b and c have the same dtype
   and device; equivalent to Compatible(a, b) && Compatible(b, c).
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index 0c878a64a02..c66c8dfa57f 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -118,7 +118,14 @@
             been created from them by some operation such as slicing
             (e.g. taking row or column ranges).
 
-
+    Whole Tensor:  A whole Tensor is a Tensor through which one can
+            access every byte of the storage region underlying it.
+            W.r.t. the notation in tensor-pattern.h (and using words
+            that describe Patterns to describe Tensors having those patterns),
+            this is equivalent to saying that the Tensor is compact and
+            justified, and the size of its memory-index-set times the
+            bytes per element for its data-type equals the number of bytes
+            allocated in the storage region.
 
 
  */
diff --git a/src/tensor/variable-functions.h b/src/tensor/variable-functions.h
index 61776f6f4c9..f775aa32531 100644
--- a/src/tensor/variable-functions.h
+++ b/src/tensor/variable-functions.h
@@ -29,6 +29,31 @@ namespace tensor {
 // return other Variables.
 
 
+
+/**
+   Return a Variable wrapping a newly-allocated Tensor with undefined
+   values, with the specified dimensions.
+
+       @param [in] dims   Dimensions (in public ordering) of the requested
+                      Tensor.  Must all be positive, with the length of
+                      the list not exceeding KALDI_TENSOR_MAX_DIM = 6
+
+  An example is below.
+<code>
+   Variable scalar = Undefined({});
+   Variable a = Undefined({3,4}, {kDoubleDtype});
+   Variable b = Undefined({1,100}, {kDoubleDtype, kGpuDevice});
+</code>
+  Note on C++: reading the code above may require getting used to C++
+  braced-initializer-lists.  The {3,4} is interpreted as a
+  std::initializer_list<int32> passed to the constructor of ArrayRef; the
+  {kDoubleDtype} is an arg to the constructor of TensorOptions.
+ */
+Variable Undefined(ArrayRef<int32> dims,
+                   TensorOptions opts = TensorOptions());
+
+
+
 /**
    Return a Variable with all-zero values, with the specified dimensions
 
@@ -87,6 +112,13 @@ Variable Sum(const Variable &v, ArrayRef<int32> eaxes);
 
 
 
+/**
+   Return a Variable that shares the same underlying Tensor as `v` but is
+   separate in terms of the autograd graph.  The returned Variable
+   will be a base Variable (TODO: add cross-reference to its definition).
+ */
+Variable Detach(const Variable &v);
+
 
 
 }  // namespace tensor
diff --git a/src/tensor/variable.h b/src/tensor/variable.h
index 357c1881aca..bc9bfbbf717 100644
--- a/src/tensor/variable.h
+++ b/src/tensor/variable.h
@@ -236,22 +236,27 @@ class Variable {
 
   /** Constructor from a Tensor.
        @param [in] data  The source Tensor.  (This Variable will copy it; this
-                      is to avoid errors if you change the original Tensor).
+                 is to avoid errors if you change the original Tensor).
 
-       @param [in] requires_grad    If requires_grad argument is true,
+       @param [in] tracked    If `tracked` is true,
                 the gradient w.r.t. this Variable will be computed if and when
                 you call Backward() on a Variable that depends on it.
                 The same as requires_grad in PyTorch.
   */
-  Variable(const Tensor &data, bool requires_grad);
+  Variable(const Tensor &data, bool tracked);
 
 
 
-  /**  Returns shared pointer to the Tensor storing the data. */
-  const Tensor &Data() const;
+  /**
+     Returns true if this Variable is tracked (meaning: gradient tracking is
+     happening), see glossary in tensor.h for definition.
+  */
+  bool Tracked() const;
+
 
 
-  Tensor &Data();
+  /**  Returns ref to the Tensor storing the data. */
+  Tensor &Data() const;
 
 
   /**  Returns pointer to the Tensor storing the derivative w.r.t.  this
@@ -293,26 +298,6 @@ class Variable {
   */
   void SetOp(const std::shared_ptr<Op> &op);
 
-  /**
-     Constructor that will be used by functions implementing mathematical
-     operations on Variables.
-
-
-     @param [in] data    Data to be stored in the Variable
-     @param [in] inputs  A vector containing Variables which this Variable
-                         depends on (for backpropagation purposes; will
-                         be stored in the TensorGrad object).
-     @param [in]
-
-     a vector specifying inputs for this Variable
-   * @param[in] gradFunc function specifying how to calculate gradient of the
-   * input Variables
-   */
-  Variable(std::shared_ptr<Tensor> &data, std::vector<Variable> inputs,
-           GradFunc grad_func);
-
-
-
 
  private:
   // You may ask: Variable is just a shared_ptr<VariableImpl>, so why not just

From 3b2fee1cccef763326c53d0b3cce28885a8ddea1 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sat, 4 May 2019 19:45:11 -0400
Subject: [PATCH 034/163] [src] further tensor progress

---
 src/tensor/memory-checker.h              |  44 +-
 src/tensor/storage.h                     |   2 +-
 src/tensor/tensor-impl-utils.h           |   2 +-
 src/tensor/tensor-impl.h                 |   4 +-
 src/tensor/tensor-pattern-extra-utils.cc | 577 +++++++++++++++++------
 src/tensor/tensor-pattern-extra-utils.h  | 194 ++++----
 src/tensor/tensor-pattern-utils-test.cc  |   4 +-
 src/tensor/tensor-pattern-utils.cc       |  48 +-
 src/tensor/tensor-pattern-utils.h        | 142 +++---
 src/tensor/tensor-pattern.cc             |   4 +-
 src/tensor/tensor-pattern.h              | 119 ++---
 src/tensor/tensor.h                      |   2 +-
 src/tensor/variable-inl.h                |   4 +-
 src/tensor/variable.h                    |   4 +-
 14 files changed, 731 insertions(+), 419 deletions(-)

diff --git a/src/tensor/memory-checker.h b/src/tensor/memory-checker.h
index 8c3952de04a..600a988f6a2 100644
--- a/src/tensor/memory-checker.h
+++ b/src/tensor/memory-checker.h
@@ -81,7 +81,7 @@ class ChangeTracker {
                             before being stored.
    */
   inline void RecordChange(int32 element_size,
-                           const TensorPattern &pattern);
+                           const Pattern &pattern);
 
 
   /**
@@ -94,7 +94,7 @@ class ChangeTracker {
       @param [in] pattern  The pattern that we are checking
    */
   inline bool ChangedSince(int64 tick,
-                           const TensorPattern &pattern);
+                           const Pattern &pattern);
 
  private:
 
@@ -112,7 +112,7 @@ class ChangeTracker {
 
 
   struct ChangeRecord {
-    TensorPattern pattern;  // The pattern (offset, dims, strides) that was
+    Pattern pattern;  // The pattern (offset, dims, strides) that was
                             // changed within this storage region.  This pattern
                             // will have been reduced to canonical form.  View
                             // it as a memory-index-set (c.f. glossary in
@@ -134,7 +134,7 @@ class ChangeTracker {
   std::unique_ptr<ChangeRecord> changes_;
 
 
-  // This is a map from a pointer to the TensorPattern in ChangeRecord::pattern
+  // This is a map from a pointer to the Pattern in ChangeRecord::pattern
   // (hashing the pattern itself, not the pointer value), to the ChangeRecord
   // that holds it.  We actually map to the address of the std::unique_ptr
   // pointing to that ChangeRecord, which might be the address of this->changes_
@@ -143,8 +143,8 @@ class ChangeTracker {
   // in de-duping the list of changes, so that if someone provides the
   // exact same pattern twice, we only keep the most recent tick; this
   // keeps memory usage under control.
-  std::unordered_map<TensorPattern*, std::unique_ptr<ChangeRecord>*,
-                     TensorPatternPtrHasher, TensorPatternPtrEqual> change_map_;
+  std::unordered_map<Pattern*, std::unique_ptr<ChangeRecord>*,
+                     PatternPtrHasher, PatternPtrEqual> change_map_;
 };
 
 
@@ -171,7 +171,7 @@ class DataCheckerBase {
                           must be within [0, k-1] where k = num_bytes_ / element_size.
    */
 a  void RecordEvent(int32 element_size,
-                   const TensorPattern &pattern);
+                   const Pattern &pattern);
 
   /**
      This function is intended to return true if the memory-index-set of
@@ -211,7 +211,7 @@ a  void RecordEvent(int32 element_size,
                 the less-than-complete coverage.  False otherwise.
    */
   bool FullyCovered(int32 element_size,
-                    const TensorPattern &pattern);
+                    const Pattern &pattern);
 
   /**
      This function is intended to return true if the memory-index-set of
@@ -226,13 +226,13 @@ a  void RecordEvent(int32 element_size,
        - If we can find a pattern identical to `pattern` in `map_`, return true
          (this is a common special case).
        - Otherwise:
-          - For some or all of the TensorPatterns provided to `RecordEvent()`:
+          - For some or all of the Patterns provided to `RecordEvent()`:
             - If `pattern` has nonempty intersection with that pattern:
                return true
           - return false
    */
   bool PartlyCovered(int32 element_size,
-                     const TensorPattern &pattern);
+                     const Pattern &pattern);
 
  private:
 
@@ -249,19 +249,19 @@ a  void RecordEvent(int32 element_size,
   int32 element_size_;
 
 
-  // `map` can actually be thought of as a set of TensorPatterns, but it's
-  // actually stored as a map from TensorPattern* to the std::unique_ptr holding
-  // that same TensorPattern.  This may seem an odd thing to do; it's just
-  // a convenient way to manage the memory.  Thanks to TensorPatternPtrHasher,
+  // `map` can actually be thought of as a set of Patterns, but it's
+  // actually stored as a map from Pattern* to the std::unique_ptr holding
+  // that same Pattern.  This may seem an odd thing to do; it's just
+  // a convenient way to manage the memory.  Thanks to PatternPtrHasher,
   // we can avoid storing duplicate records for the same Pattern.
-  std::unordered_map<TensorPattern*, std::unique_ptr<TensorPattern*>,
-                     TensorPatternPtrHasher, TensorPatternPtrEqual> map_;
+  std::unordered_map<Pattern*, std::unique_ptr<Pattern*>,
+                     PatternPtrHasher, PatternPtrEqual> map_;
 
 
-  // This is another way of storing the TensorPatterns that have been recorded,
+  // This is another way of storing the Patterns that have been recorded,
   // ordered by NumElements(); this enables us to check the larger patterns
   // first, which may be more efficient.
-  std::multimap<int64, TensorPattern*> by_size_;
+  std::multimap<int64, Pattern*> by_size_;
 };
 
 /**
@@ -299,7 +299,7 @@ class UninitializedDataChecker: public DataCheckerBase {
                   function records the write.
    */
   inline void RecordWrite(int32 element_size,
-                   const TensorPattern &pattern) {
+                   const Pattern &pattern) {
     RecordEvent(element_size, pattern);
   }
 
@@ -315,7 +315,7 @@ class UninitializedDataChecker: public DataCheckerBase {
                   RecordWrite, this call will (usually) crash.
    */
   void RecordRead(int32 element_size,
-                  const TensorPattern &pattern);
+                  const Pattern &pattern);
 };
 
 
@@ -391,7 +391,7 @@ class InvalidatedDataChecker: public DataCheckerBase {
      memory region won't be read from in future.
    */
   inline void RecordInvalidation(int32 element_size,
-                                 const TensorPattern &pattern) {
+                                 const Pattern &pattern) {
     RecordEvent(element_size, pattern);
   }
 
@@ -408,7 +408,7 @@ class InvalidatedDataChecker: public DataCheckerBase {
                   (usually) crash.
   */
   void RecordRead(int32 element_size,
-                  const TensorPattern &pattern);
+                  const Pattern &pattern);
 
 
 };
diff --git a/src/tensor/storage.h b/src/tensor/storage.h
index 5980670b74c..bec164d82ff 100644
--- a/src/tensor/storage.h
+++ b/src/tensor/storage.h
@@ -37,7 +37,7 @@ class Storage {
 
 
   void RecordChange(int32 element_size,
-                    const TensorPattern &pattern);
+                    const Pattern &pattern);
 
 
   // This initializes a ChangeTracker object in this->tracker if it
diff --git a/src/tensor/tensor-impl-utils.h b/src/tensor/tensor-impl-utils.h
index c1da57ecabc..c4d82590b1f 100644
--- a/src/tensor/tensor-impl-utils.h
+++ b/src/tensor/tensor-impl-utils.h
@@ -118,7 +118,7 @@ inline void Unsqueeze(TensorImpl *t, int32 axis) {
    Modifies 't' in-place by removing an axis with (dim=1,stride=0) from the
    specified position.  It is an error if 't' did not initially contain
    such an axis.  This function updates the code.  See also the same-named
-   function that operates on TensorPattern.
+   function that operates on Pattern.
 
    Showing just the dims in the tensor for an example:
 
diff --git a/src/tensor/tensor-impl.h b/src/tensor/tensor-impl.h
index 7d55bf84378..6eaa8b6a98b 100644
--- a/src/tensor/tensor-impl.h
+++ b/src/tensor/tensor-impl.h
@@ -33,7 +33,7 @@ namespace tensor {
 // these types.  (We don't use base-classing as it would make the code
 // harder to read).
 struct TensorMeta {
-  TensorPattern pattern;
+  Pattern pattern;
   DataType dtype;
   Device device;
 };
@@ -47,7 +47,7 @@ struct TensorMeta {
    internals, and not for users of this library.
 */
 struct TensorImpl {
-  TensorPattern pattern;
+  Pattern pattern;
   DataType dtype;
   Device device;
   std::shared_ptr<Storage> storage;  // 'storage' points to a shared Storage object
diff --git a/src/tensor/tensor-pattern-extra-utils.cc b/src/tensor/tensor-pattern-extra-utils.cc
index 33f577dd2b3..49ee4ce5dbc 100644
--- a/src/tensor/tensor-pattern-extra-utils.cc
+++ b/src/tensor/tensor-pattern-extra-utils.cc
@@ -39,8 +39,8 @@ namespace tensor {
                             to here.  There will be no repeats.
 */
 static void FindAllStrides(
-    const TensorPattern &pattern1,
-    const TensorPattern &pattern2,
+    const Pattern &pattern1,
+    const Pattern &pattern2,
     std::vector<int32> *strides) {
   KALDI_PARANOID_ASSERT(IsCanonical(pattern1) && IsCanonical(pattern2));
   strides->clear();
@@ -55,7 +55,7 @@ static void FindAllStrides(
 
 
 // See declaration in header.
-bool IsRegular(const TensorPattern &pattern) {
+bool IsRegular(const Pattern &pattern) {
   int32 num_axes = pattern.num_axes;
 
   for (int32 i = 0; i + 1 < num_axes; i++) {
@@ -101,9 +101,9 @@ bool IsRegular(const TensorPattern &pattern) {
                                be valid-2, and it will be linear in pattern_in.
 */
 static void ConvertPatternStridesLazily(
-    const TensorPattern &pattern_in,
+    const Pattern &pattern_in,
     const std::vector<int32> &strides,
-    TensorPattern* pattern_out) {
+    Pattern* pattern_out) {
   KALDI_PARANOID_ASSERT(IsCanonical(pattern_in));
   int32 num_axes_in = pattern_in.num_axes,
       num_axes_out = strides.size();
@@ -141,7 +141,7 @@ static void ConvertPatternStridesLazily(
       `pattern->strides[raxis+1] >= pattern->strides[raxis] * pattern->dims[raxis]`.
 
    This function expects that the pattern will also satisfy that property for
-   all axis-indexes `0 <= i < raxis`, and will be valid--.  This function will
+   all axis-indexes `0 <= i < raxis`, and will be valid-2.  This function will
    always succeed if the pattern is regular (see IsRegular(), and "Regularity
    property" in the glossary).
 
@@ -173,8 +173,8 @@ static void ConvertPatternStridesLazily(
 static bool EnsureAxisSortingPropertyHolds(
     int32 raxis,
     int32 pattern_index,
-    std::vector<TensorPattern> *patterns) {
-  TensorPattern *pattern = (*patterns)[pattern_index];
+    std::vector<Pattern> *patterns) {
+  Pattern *pattern = (*patterns)[pattern_index];
   // We use 'i' as the internal name for 'raxis', because we want to mirror the
   // notation used for the regularity property in the glossary, and in the
   // function IsRegular() that checks for it.  There is an index k with `i < k
@@ -217,10 +217,10 @@ static bool EnsureAxisSortingPropertyHolds(
     if (remainder != 0) {
       patterns->resize(patterns->size() + 1);
       pattern = (*patterns)[i];  // in case it was reallocated.
-      TensorPattern *remainder_pattern = &(patterns->back());
+      Pattern *remainder_pattern = &(patterns->back());
       *remainder_pattern = *pattern;
       remainder_pattern->dims[i] = remainder;
-      remainder_pattern->offset += j_stride * j_dim;
+      remainder_pattern->offset += int64(j_stride) * j_dim;
     }
 
     pattern->dims[j] = j_dim;
@@ -233,9 +233,9 @@ static bool EnsureAxisSortingPropertyHolds(
 
 // see declaration in header for documentation.
 bool ConvertPatternStrides(
-    const TensorPattern &pattern,
+    const Pattern &pattern,
     const ArrayRef<int32> &strides,
-    std::vector<TensorPattern*> *patterns) {
+    std::vector<Pattern*> *patterns) {
   patterns->resize(1);
   ConvertPatternStridesLazily(pattern, &((*patterns)[0]));
   int32 num_axes = strides.size();
@@ -264,86 +264,88 @@ bool ConvertPatternStrides(
 
 
 /**
-   FindOffsetsRecursive() is a utility function that is used in the implementation
-   of FindOffsets().
-
-
-   Suppose we are computing the intersection of the memory-index-sets of pattern1
-   and pattern2, where pattern1 and pattern2 are valid-1 and have identical
-   num_axes and strides.
-
-   For each memory-index m that is in both pattern1 and pattern2, there must be
-   index-tuples i1 and i2 such that pattern1[i1] = pattern2[i2] = m.  We can
-   write this as: pattern1[i] = pattern2[i + o], where o is an offset that's
-   also a tuple (like an index-tuple, but with possibly negative elements).  This
-   function can be thought of as a recursive search for all values of the
-   offset 'o' for which at least one such index i exists, where
-   i is valid for pattern1 and i + o is valid for pattern2).
-
-
-   =====
-
-   Explanation of the algorithm
+   FindOffsetsRecursive() is a utility function that is used in the
+   implementation of FindOffsets().  See the documentation of FindOffsets(*) in
+   tensor-pattern-extra-utils.h for context.
+   Briefly: we are finding the set of offsets o such that there exists i
+   with pattern1[i + o] = pattern2[i].
 
    The algorithm for computing the list of potential offsets o is recursive,
    starting from the last-numbered raxis, which will have the highest
    stride since the strides are normalized.
 
-   Let s be the the vector of strides of the patterns (pattern1 and pattern2
-   have identical strides).  From the equation
+   Let s be the vector of strides of the patterns, in the private numbering
+   (pattern1 and pattern2 have identical strides).  Expanding the equation
+
       pattern1[i + o] = pattern2[i]                     (1)
+
    (see "Indexing a Pattern" in tensor-pattern.h to understand the notation),
-   we have:
-      pattern1.offset + s . (i + o)  == pattern2.offset + s . i
+   we get:
+
+      pattern1.offset + s . (i + o)  ==  pattern2.offset + s . i
+
    where a `.` with space around it means dot product.
 
    Simplifying:
       s . o = pattern2.offset - pattern1.offset.         (2)
 
-   For each raxis r, there are limits on the value of o[r]; these are imposed by
-   the dimensions of the two Tensors.  In Equation (1), for the indexes into the
-   patterns to be valid, i[r] + o[r] must be in [0 .. pattern1.dims[r] - 1]
-   and i[r] must be in [0 .. pattern2.dims[r] - 1],  For at least one such i[r] to
-   exist, we require
-       -pattern2.dims[r] < o[r] < pattern1.dims[r]        (3)
+   which we can expand as follows (using latex notation),
+
+   \sum_{r=0}^{num_axes - 1}  s[r] o[r] = pattern2.offset - pattern1.offset.   (3)
+
+   For each raxis r, there are limits on the possible values of o[r], which are
+   imposed by the dimensions of the two Tensors.  In Equation (1), for the
+   indexes into the patterns to be valid, i[r] + o[r] must be in
+   [0 .. pattern1.dims[r] - 1] and i[r] must be in [0 .. pattern2.dims[r] - 1].  For
+   at least one such i[r] to exist, we require
 
+       -pattern2.dims[r] < o[r] < pattern1.dims[r]        (4)
+
+   (a formal derivation is kind of tedious but straightforward).
    There is a further limitation on the elements of o that we can obtain using
-   the properties above plus the axis-dominance property.  It's easiest to
-   explain this as a special case for the last axis r = num_axes - 1.
-   Define:
-       l(r) =   \sum_{q < r} s[q] * o[q],
-   so l(r) is the sum of the elements in s . o that come from raxes
-   less than r.  We can use the axis-dominance lemma (see tensor-pattern.h)
-   and the limitation on o[r] proved in the previous paragraph to prove that:
-       -s[r] <  l(r) <  s[r].
-   For the last axis r = num_axes - 1, for Equation (1) to hold, we must have
-      l(r) =  pattern2.offset - pattern1.offset - s[r] * o[r],   (4)
-   so we have the inequality
-     -s[r] <  pattern2.offset - pattern1.offset - s[r] * o[r]  <  s[r]
-   which means we need only consider offsets o[r] where the absolute value of
-   the "remainder" is less than s[r]; there will be at most two.  For
-   axes r < num_axes - 1, if the offsets for higher-numbered r are already known,
-   we just need to subtract the appropriate higher-axis terms from the r.h.s. of
-   (4).  The recursive implementation that finds the possible offset vectors is
-   pretty obvious intuitively so we won't try to explain further.
-
-          @param [in] pattern1  First pattern.  Its offset is
-                       ignored, only the num_axes, dims and strides are read.
-          @param [in] pattern2  Second pattern.  Its offset is ignored,
-                       only the num_axes, dims and stride are read.
+   the properties above plus the axis-dominance property.  Our algorithm for
+   finding the list of possible offsets o is recursive starting from the
+   last-numbered raxis, and we derive it below.
+
+   Suppose for some raxis r, we are trying to find the possible values for o[r],
+   and we have been provided the values of o[q] for q > r.  Define
+
+     remainder    = pattern2.offset - pattern1.offset
+                    - \sum_{q=r+1}^{num_axes-1}  o[q] s[q]
+
+   And define
+     lower_sum =   \sum_{q=0}^{r-1} s[q] * o[q],
+
+   We can use the axis-dominance lemma (see tensor-pattern.h) and the limitation
+   on o[r] from (4) to prove that:
+         -s[r] <  lower_sum <  s[r].                 (5)
+   (the axis-dominance lemma is relevant here because o[r] behaves just like an
+   index into a pattern, except it can be negative as well as positive).
+   For (3) to hold, we must have:
+       lower_sum = remainder - o[r] s[r]            (6)
+   and expanding lower_sum in (5) using (6), we have:
+     -s[r] <  remainder - s[r] * o[r]  <  s[r]       (7)
+   (notice: in the recursion o[r] is the only unknown in this equation).  There
+   will be either one or two values of o[r] satisfying (7), and Eq. (4) may
+   eliminate one or both of those.
+
+          @param [in] pattern1  First pattern; must be valid-1
+          @param [in] pattern2  Second pattern; must be valid-1 and satisfy
+                          SameStrides(pattern1, pattern2).
           @param [in] known_offsets     (Note: semantically this is an input;
                        it is temporarily changed inside the function and
                        then restored to its previous state).
                        It is the list of already-known offsets (i.e. the
                        elements of some members o) but in the public numbering,
                        so that element 0 corresponds to raxis = num_axes - 1.
-                       This is convenient because this function starts at
+                       This is convenient because the algorithm starts from
                        the highest-numbered raxis.
-          @param [in] offset_difference   At the top-level call this will
-                       be pattern2.offset - pattern1.offset; deeper in the
-                       call stack this will be the remaining offset after
-                       subtracting the products of stride[r] * o[r] for
-                       higher-numbered r.
+          @param [in] remainder    This is defined as pattern2.offset - pattern1.offset
+                         - \f$ \sum_{q=r+1}^{num_axes-1}  o[q] s[q] \f$,
+                         where you can work out the raxis r we are immediately
+                         processing as r = pattern1.num_axes - 1 - known_offsets->size().
+                         The higher-numbered elements o[q], q > r, are available
+                         through the recursion (they are stored in known_offsets).
           @param [in] keep_all_offsets   Bool that says whether the user
                        is interested in all the offsets.  If true we'll
                        output all valid offsets; if false we may stop
@@ -353,10 +355,10 @@ bool ConvertPatternStrides(
                        be appended to).  Each element of (*offsets_out)
                        will be a vector o, in the private numbering.
 */
-void FindOffsetsRecursive(const TensorPattern &pattern1,
-                          const TensorPattern &pattern2,
+void FindOffsetsRecursive(const Pattern &pattern1,
+                          const Pattern &pattern2,
                           std::vector<int32> *known_offsets,
-                          int64 offset_difference,
+                          int64 remainder,
                           bool keep_all_offsets,
                           std::vector<std::vector<int32> > *offsets_out) {
   int32 num_axes = pattern1.num_axes,  // will equal pattern2.num_axes
@@ -364,45 +366,63 @@ void FindOffsetsRecursive(const TensorPattern &pattern1,
       stride = pattern1.strides[raxis],  // will equal pattern2.strides[raxis]
       dim1 = pattern1.dims[raxis],
       dim2 = pattern2.dims[raxis];
-  int32 this_offset = offset_difference / stride,
-      remaining_difference = offset_difference - (stride * this_offset);
-  // Note: abs(remaining_difference) will be less than stride.
+  // 'this_offset' is one of the possible solutions for o[r]; abs(next_remainder)
+  // will be less than stride.  The product is formed in 64 bits to avoid overflow.
+  int32 this_offset = remainder / stride;
+  int64 next_remainder = remainder - int64(stride) * this_offset;
 
   if (raxis == 0) {
-    if (remaining_difference == 0) {
-      // The offset vector is `this_offset` and then the reverse of `known_offsets`
-      // (known_offsets is in the public numbering; we want the private).
+    if (next_remainder == 0 && this_offset > -dim2 && this_offset < dim1) {
+      // Eq. (3) and eq. (4) hold.  The offset vector we're about to append to
+      // offsets_out is `this_offset` followed by the reverse of `known_offsets`
+      // (since known_offsets is in the public numbering; we want the private).
-      offsets_out->resize(offsets_out->size() + 0);
+      offsets_out->resize(offsets_out->size() + 1);
       offsets_out->back().push_back(this_offset);
       offsets_out->back().insert(offsets_out->back().end(),
                                  known_offsets->rbegin(),
                                  known_offsets->rend());
-      // for i == [0, 0.. ], checking that pattern1[i + o] == pattern2[i].
-      KALDI_PARANOID_ASSERT(IndexPattern(pattern1, offsets_out->back(), false)
-                            == pattern1.offset);
+#ifdef KALDI_PARANOID
+      {  // Check these really are valid.  TODO: remove this eventually.
+        std::vector<int32> i1(num_axes), i2(num_axes);
+        const std::vector<int32> &o = offsets_out->back();
+        for (int32 r = 0; r < num_axes; r++) {
+          if (o[r] > 0)
+            i1[r] = o[r];
+          else
+            i2[r] = -o[r];
+        }
+        // this i1 and i2 satisfy i1 = i2 + o, so i2 is the i in the
+        // equation pattern1[i + o] == pattern2[i].
+        KALDI_PARANOID_ASSERT(IndexPattern(pattern1, i1) ==
+                              IndexPattern(pattern2, i2));
+      }
+#endif
     }
     return;
   } else {
     known_offsets->push_back(this_offset);
     if (this_offset > -pattern2.dims[raxis] &&
         this_offset < pattern1.dims[raxis]) {
-      // if eq. (3) is satisfied..
+      // if eq. (4) is satisfied..
       FindOffsetsRecursive(pattern1, pattern2, known_offsets,
-                           remaining_difference, keep_all_offsets,
+                           next_remainder, keep_all_offsets,
                            offsets_out);
     }
-    if (remaining_difference == 0) {
+    if (next_remainder == 0 ||
+        (!keep_all_offsets && !offsets_out->empty())) {
+      // if next_remainder == 0 there would be only one solution to (7)
       known_offsets->pop_back();
       return;
     }
-    int32 offset_change = (remaining_difference > 0 ? -1 : 1);
+    int32 offset_change = (next_remainder > 0 ? 1 : -1);  // 2nd solution of (7)
     this_offset += offset_change;
-    remaining_difference -= stride * offset_change;
+    next_remainder -= stride * offset_change;
     known_offsets->back() = this_offset;
     if (this_offset > -pattern2.dims[raxis] &&
         this_offset < pattern1.dims[raxis]) {
+      // if eq. (4) is satisfied..
       FindOffsetsRecursive(pattern1, pattern2, known_offsets,
-                           remaining_difference, keep_all_offsets,
+                           next_remainder, keep_all_offsets,
                            offsets_out);
     }
     known_offsets->pop_back();
@@ -411,21 +431,275 @@ void FindOffsetsRecursive(const TensorPattern &pattern1,
 }
 
 
-
-inline void FindOffsets(const TensorPattern &pattern1,
-                        const TensorPattern &pattern2,
-                        bool keep_all_offsets,
-                        std::vector<std::vector<int32> > *offsets_out) {
+// Declared in header, see documentation there.
+void FindOffsets(const Pattern &pattern1,
+                 const Pattern &pattern2,
+                 bool keep_all_offsets,
+                 std::vector<std::vector<int32> > *offsets_out) {
+  KALDI_PARANOID_ASSERT(IsValid1(pattern1) && IsValid1(pattern2) &&
+                        HasNormalizedPositiveStrides(pattern1) &&
+                        SameStrides(pattern1, pattern2));
   offsets_out->clear();
   std::vector<int32> known_offsets;
   FindOffsetsRecursive(pattern1, pattern2,
                        &known_offsets,
-                       keep_all_offsets,
-                       pattern1.offset - pattern2.offset,
+                       pattern2.offset - pattern1.offset,  // initial 'remainder'
+                       keep_all_offsets,
                        offsets_out);
 }
 
 
+/*
+
+ A hyperrectangle (here expressed in terms of integers) is a Cartesian product
+ of integer intervals, here expressed as (begin, end) pairs so that the
+ integers in that interval are [ begin .. end - 1].  The vector must be
+ nonempty for us to consider this a valid hyperrectangle; and for each
+ interval we require end > begin.
+
+ [set view of hyperrectangles]
+
+ A hyperrectangle can be used to represent a set of integer tuples.
+ For a hyperrectangle h, let set(h) represent all the index-tuples i
+ with h.size() members such that, for each raxis 0 <= r < h.size(),
+      h[r].first <= i[r] < h[r].second.
+*/
+typedef std::vector<std::pair<int32, int32> > Hyperrectangle;
+
+// Returns true iff 'a' is nonempty and each interval satisfies begin < end.
+bool IsValidHyperrectangle(const Hyperrectangle &a) {
+  for (auto iter = a.begin(); iter != a.end(); ++iter)
+    if (iter->first >= iter->second) return false;  // empty interval.
+  return !a.empty();
+}
+
+// Returns true if two hyperrectangles, as defined above, intersect, i.e. if
+// their intervals overlap on every axis.  We require a.size() == b.size() and
+// a and b to be valid hyperrectangles.
+bool HyperrectanglesIntersect(const Hyperrectangle &a,
+                              const Hyperrectangle &b) {
+  KALDI_PARANOID_ASSERT(a.size() == b.size() &&
+                        IsValidHyperrectangle(a) && IsValidHyperrectangle(b));
+  auto iter_a = a.begin(), iter_b = b.begin(), end_a = a.end();
+  // Half-open intervals are disjoint if one ends at or before the other begins.
+  for (; iter_a != end_a; ++iter_a, ++iter_b)
+    if (iter_a->second <= iter_b->first || iter_b->second <= iter_a->first)
+      return false;  // disjoint on this axis, so no tuple is in both sets.
+  return true;
+}
+
+/**
+   If called with i == 0, this recursive function computes the set-wise
+   difference of hyperrectangles a - b (viewed as sets of tuples of
+   ints, obviously).
+
+      @param [in] a  A valid hyperrectangle
+      @param [in] b  A valid hyperrectangle, must satisfy a.size() == b.size()
+      @param [in] i  An index in the range [0 .. a.size() - 1] (view this
+                     as an axis-index).  The caller asserts that for each index
+                     0 <= j < i, a's interval is contained in b's interval; that
+                     is, a[j].first >= b[j].first and a[j].second <=
+                     b[j].second.
+      @param [out] difference  The hyperrectangles whose union is a - b are
+                     appended to this vector (existing elements are kept).
+*/
+static void SubtractHyperrectangles(const Hyperrectangle &a,
+                                    const Hyperrectangle &b,
+                                    size_t i,
+                                    std::vector<Hyperrectangle> *difference) {
+  size_t size = a.size();
+  KALDI_PARANOID_ASSERT(i == 0 ||
+                        (a[i-1].first >= b[i-1].first &&
+                         a[i-1].second <= b[i-1].second));
+  KALDI_PARANOID_ASSERT(i != 0 ||
+                        (IsValidHyperrectangle(a) &&
+                         IsValidHyperrectangle(b)));
+  KALDI_PARANOID_ASSERT(b.size() == size && i < size);
+
+  int32 a_start = a[i].first, a_end = a[i].second,
+      b_start = b[i].first, b_end = b[i].second;
+
+  if (b_start < a_end && b_end > a_start) {
+    // If a's and b's intervals overlap at all....
+    // 'piece' is a scratch copy of a whose interval on axis i we narrow; we
+    // copy rather than const_cast, since writing to a const object is undefined.
+    Hyperrectangle piece(a);
+    if (a_start < b_start) {
+      // Append to `difference` the portion of a's interval that doesn't
+      // intersect with b's interval and that is before b starts.
+      piece[i].second = b_start;
+      difference->push_back(piece);
+      piece[i].second = a_end;  // restore the state.
+    }
+    if (a_end > b_end) {
+      // Append to `difference` the portion of a's interval that doesn't
+      // intersect with b's interval and that is after b ends.
+      piece[i].first = b_end;
+      difference->push_back(piece);
+      piece[i].first = a_start;  // restore the state.
+    }
+    // If this is not the last axis, handle the part that overlaps.  (If this is
+    // the last axis, we don't need to do anything with it, because the
+    // overlapping part won't appear in the difference a - b).
+    if (i + 1 < size) {
+      // Restrict axis i to the intersection of the two intervals and recurse.
+      piece[i].first = std::max<int32>(a_start, b_start);
+      piece[i].second = std::min<int32>(a_end, b_end);
+      SubtractHyperrectangles(piece, b, i + 1, difference);
+    }
+  } else {
+    // These intervals don't overlap, so the difference is just a (which, for
+    // i > 0, the callers have already narrowed to b's extent on axes < i).
+    difference->push_back(a);
+  }
+}
+
+/** Converts an offset o into the hyperrectangle of pattern1 indexes it covers.
+       @param [in] pattern1   First input pattern.  Must be valid-1 and
+                        normalized+ (i.e. HasNormalizedPositiveStrides(pattern1)).
+       @param [in] pattern2   Second input pattern.  Must be valid-1 and
+                        satisfy SameStrides(pattern1, pattern2).
+       @param [in] offset  An offset as described in the documentation for
+                        FindOffsets(): a tuple o such that there exists
+                        i with pattern1[i + o] = pattern2[i].  Its size
+                        must equal the num_axes of pattern1 and pattern2.
+       @param [out] hyperrectangle  This will be set to a hyperrectangle
+                        with hyperrectangle.size() == offset.size(),
+                        which represents the set S of index-tuples which we
+                        could use to index pattern1, satisfying pattern1[S] =
+                        pattern2[S - o].  The two elements of the pair on each
+                        axis thus correspond to (begin, end) indexes into
+                        pattern1 with end one past the end.
+                        See "[set view of hyperrectangles]" for explanation.
+*/
+static void OffsetToHyperrectangle(
+    const Pattern &pattern1,
+    const Pattern &pattern2,
+    const std::vector<int32> &offset,
+    Hyperrectangle *hyperrectangle) {
+  KALDI_PARANOID_ASSERT(IsValid1(pattern1) && IsValid1(pattern2) &&
+                        SameStrides(pattern1, pattern2) &&
+                        int32(offset.size()) == pattern1.num_axes);
+  int32 num_axes = pattern1.num_axes;
+  hyperrectangle->resize(num_axes);
+  for (int32 raxis = 0; raxis < num_axes; raxis++) {
+    int32 o = offset[raxis];
+    // Caution: interval_start and interval_end aren't the range
+    // of possible elements of i in the equation; they represent
+    // i + o.
+    int32 interval_start = std::max<int32>(o, 0),
+        interval_end = std::min<int32>(pattern1.dims[raxis],
+                                       o + pattern2.dims[raxis]);
+    // An empty interval would mean 'offset' is not a valid offset.
+    KALDI_ASSERT(interval_end > interval_start);
+    (*hyperrectangle)[raxis].first = interval_start;
+    (*hyperrectangle)[raxis].second = interval_end;
+  }
+}
+
+
+/**
+   Given a pattern `src` and a hyperrectangle h, output a pattern `dest` that
+   represents `src` indexed with all the index-tuples i in set(h).  See
+   [set view of hyperrectangles] to understand the notation.
+
+          @param [in] src     Source pattern.  Must be valid-1.
+          @param [in] h       A hyperrectangle.  Every i in set(h) must be
+                              in the index-tuple-set of src.
+          @param [out] dest   Destination pattern.  Its memory-index-set
+                              equals src[set(h)].  Will have same strides
+                              as src, and will be valid-1.
+ */
+static void HyperrectangleToPattern(const Pattern &src,
+                                    const Hyperrectangle &h,
+                                    Pattern *dest) {
+  KALDI_PARANOID_ASSERT(IsValid1(src) && IsValidHyperrectangle(h) &&
+                        int32(h.size()) == src.num_axes);
+  int32 num_axes = src.num_axes;
+  int64 offset = src.offset;
+  dest->num_axes = num_axes;
+  for (int32 r = 0; r < num_axes; r++) {
+    int32 src_dim = src.dims[r], stride = src.strides[r],
+        begin = h[r].first, end = h[r].second;
+    KALDI_PARANOID_ASSERT(begin >= 0 && end <= src_dim);  // h must be inside src
+    dest->dims[r] = end - begin;
+    dest->strides[r] = stride;
+    offset += int64(begin) * stride;  // skip the first 'begin' indexes on axis r
+  }
+  SetUnusedDimsAndStrides(num_axes, dest);
+  dest->num_axes = num_axes;
+  dest->offset = offset;
+  SetDefaultCodeAndProperties(dest);
+  KALDI_PARANOID_ASSERT(IsValid1(*dest));
+}
+
+/**
+   Given patterns pattern1 and pattern2 that are valid-1 and share
+   the same strides, and an offset o such that there
+   exists at least one index i with pattern1[i + o] = pattern2[i]
+   (c.f. "Indexing a Pattern" in the glossary in tensor-pattern.h),
+   outputs a Pattern representing the part of the intersection
+   of the memory-index-sets of pattern1 and pattern2 that has
+   offset o.
+
+      @param [in] pattern1   First input pattern.  Must be valid-1.
+      @param [in] pattern2   Second input pattern.  Must be valid-1
+                             and satisfy SameStrides(pattern1, pattern2).
+      @param [in] o        Offset vector.  There must exist at least
+                           one index-tuple i such that
+                           pattern1[i + o] = pattern2[i].
+      @param [out] dest     Destination pattern with this part of the
+                            intersection of pattern1 and pattern2.
+                            Will be valid-1 at exit, and have the
+                            same strides as the input patterns.
+ */
+static void OffsetToPattern(const Pattern &pattern1,
+                            const Pattern &pattern2,
+                            const std::vector<int32> &o,
+                            Pattern *dest) {
+  KALDI_PARANOID_ASSERT(IsValid1(pattern1) && IsValid1(pattern2) &&
+                        SameStrides(pattern1, pattern2));
+  int32 num_axes = pattern1.num_axes;
+  int64 offset = pattern1.offset;
+  dest->num_axes = num_axes;
+  for (int32 r = 0; r < num_axes; r++) {
+    int32 stride = pattern1.strides[r],  // equals pattern2.strides[r].
+        this_offset = o[r];  // named so as not to shadow the int64 'offset'.
+    dest->strides[r] = stride;
+    if (this_offset >= 0) {
+      // The first index into pattern1 would be this_offset, the first
+      // index into pattern2 would be 0.  The dimension is the minimum of
+      // (pattern1.dim - this_offset, pattern2.dim).
+      offset += int64(this_offset) * stride;
+      dest->dims[r] = std::min<int32>(pattern1.dims[r] - this_offset,
+                                      pattern2.dims[r]);
+    } else {
+      // The first index into pattern1 would be 0, the first index
+      // into pattern2 would be -this_offset.  The dimension is the minimum
+      // of (pattern1.dim, pattern2.dim + this_offset).
+      dest->dims[r] = std::min<int32>(pattern1.dims[r],
+                                      pattern2.dims[r] + this_offset);
+    }
+  }
+  SetUnusedDimsAndStrides(num_axes, dest);
+  dest->num_axes = num_axes;
+  dest->offset = offset;
+  SetDefaultCodeAndProperties(dest);
+  KALDI_PARANOID_ASSERT(IsValid1(*dest));
+
+#ifdef KALDI_PARANOID
+  {  // TODO: remove this check when debugged.
+    Hyperrectangle h;
+    OffsetToHyperrectangle(pattern1, pattern2, o, &h);
+    Pattern p;
+    HyperrectangleToPattern(pattern1, h, &p);
+    KALDI_ASSERT(p == *dest);
+  }
+#endif
+}
+
+
+
 /**
    This recursive function is used to compute the intersection between
    pattern1 and pattern2, which must have identical num_axes and strides,
@@ -460,11 +734,11 @@ inline void FindOffsets(const TensorPattern &pattern1,
                                memory-index-sets is identical to the intersection
                                of pattern1 and pattern2's memory-index-sets.
   */
-void ComputeIntersectionRecursive(const TensorPattern &pattern1,
-                                  const TensorPattern &pattern2,
+void ComputeIntersectionRecursive(const Pattern &pattern1,
+                                  const Pattern &pattern2,
                                   int32 identical_raxis,
                                   bool keep_all_patterns,
-                                  std::vector<TensorPattern> *patterns_out) {
+                                  std::vector<Pattern> *patterns_out) {
   if (identical_raxis == 0) {
     /*
       The base-case of the recursion; if we reach here, it means pattern1 and
@@ -491,7 +765,7 @@ void ComputeIntersectionRecursive(const TensorPattern &pattern1,
   // By the '?..:' statements below we possibly switch pattern2 and
   // pattern1, thereby ensuring that pattern2_mod.offset >= pattern1_mod.offset;
   // this simplifies the later code.
-  TensorPattern pattern1_mod(pattern2.offset >= pattern1.offset ? pattern1 : pattern2),
+  Pattern pattern1_mod(pattern2.offset >= pattern1.offset ? pattern1 : pattern2),
       pattern2_mod(pattern2.offset >= pattern1.offset ? pattern2 : pattern1);
 
 
@@ -544,11 +818,11 @@ void ComputeIntersectionRecursive(const TensorPattern &pattern1,
 
 
 // See documentation in header.
-bool ComputeIntersection(const TensorPattern &pattern1_in,
-                         const TensorPattern &pattern2_in,
-                         std::vector<TensorPattern> *intersection,
-                         bool keep_all_patterns) {
-  TensorPattern pattern1(pattern1_in),
+bool ComputeIntersection(const Pattern &pattern1_in,
+                         const Pattern &pattern2_in,
+                         bool keep_all_patterns,
+                         std::vector<Pattern> *intersection) {
+  Pattern pattern1(pattern1_in),
       pattern2(pattern2_in);
   CanonicalizePattern(&pattern1);
   CanonicalizePattern(&pattern2);
@@ -568,7 +842,7 @@ bool ComputeIntersection(const TensorPattern &pattern1_in,
     }
     return true;
   }
-  std::vector<TensorPattern> patterns1, patterns2;
+  std::vector<Pattern> patterns1, patterns2;
   patterns1.reserve(8);
   patterns2.reserve(8);
   intersection->clear();
@@ -596,17 +870,22 @@ bool ComputeIntersection(const TensorPattern &pattern1_in,
       int64 min_mindex2 = sub_pattern2.mindex,
           end_mindex2 = min_mindex2 +
           sub_pattern2.strides[num_axes - 1] * sub_pattern2.dims[num_axes - 1];
+#if 0
       if (min_mindex2 >= end_mindex1 || begin_mindex1 >= end_mindex2)
         continue;  //  This is an optimization for efficiency when it's easy to
-                   // see that two Patterns won't overlap.
+                   // see that two Patterns won't overlap.  Will enable it
+                   // when the rest of the code is debugged.
+#endif
+
+      std::vector<std::vector<int32> > offsets;
+      FindOffsets(sub_pattern1, sub_pattern2, keep_all_patterns, &offsets);
+      // The offsets refer to the sub-patterns (which share strides), so use those.
+      for (auto oiter = offsets.begin(); oiter != offsets.end(); ++oiter) {
+        intersection->resize(intersection->size() + 1);
+        OffsetToPattern(sub_pattern1, sub_pattern2, *oiter,
+                        &intersection->back());
+      }
 
-      // Here, sub_pattern1 and sub_pattern2 are the sub-pieces of pattern1 and
-      // pattern2 that have been converted to share the same list of strides The
-      // following call may add elements to 'intersection'.
-      ComputeIntersectionRecursive(sub_pattern1, sub_pattern2,
-                                   num_axes,
-                                   keep_all_patterns,
-                                   intersection);
-      if (!keep_all_patterns && !intersection.empty())
+      if (!keep_all_patterns && !intersection->empty())
         return true;
     }
@@ -614,9 +893,9 @@ bool ComputeIntersection(const TensorPattern &pattern1_in,
   return true;
 }
 
-bool PatternContains(const TensorPattern &pattern_in,
+bool PatternContains(const Pattern &pattern_in,
                      int64 mindex) {
-  TensorPattern pattern_mod;
+  Pattern pattern_mod;
   const Pattern *pattern;
   if (!IsCanonical(pattern_in)) {
     CanonicalizePattern(pattern_in, &pattern_mod);
@@ -639,11 +918,11 @@ bool PatternContains(const TensorPattern &pattern_in,
 
 
 
-bool ToMemoryIndexSet(const TensorPattern &pattern_in,
+bool ToMemoryIndexSet(const Pattern &pattern_in,
                       std::vector<char> *s) {
   KALDI_PARANOID_ASSERT(pattern.IsValid());
   s->clear();
-  TensorPattern pattern_mod;
+  Pattern pattern_mod;
   const Pattern *pattern;
   if (!IsCanonical(pattern_in)) {
     CanonicalizePattern(pattern_in, &pattern_mod);
@@ -681,7 +960,7 @@ bool ToMemoryIndexSet(const TensorPattern &pattern_in,
   recursively_set_elements(num_axes - 1, pattern->offset);
 }
 
-int64 RandomMemoryIndex(const TensorPattern &pattern) {
+int64 RandomMemoryIndex(const Pattern &pattern) {
   int32 num_axes = pattern.num_axes;
   int64 mindex = pattern.offset;
   for (int32 raxis = 0; raxis < num_axes; raxis++) {
@@ -691,13 +970,13 @@ int64 RandomMemoryIndex(const TensorPattern &pattern) {
 }
 
 
-bool PatternsIntersectExhaustive(const TensorPattern &pattern1,
-                                 const TensorPattern &pattern2) {
+bool PatternsIntersectExhaustive(const Pattern &pattern1,
+                                 const Pattern &pattern2) {
 }
 
 
-bool PatternsIntersect(const TensorPattern &pattern1,
-                       const TensorPattern &pattern2) {
+bool PatternsIntersect(const Pattern &pattern1,
+                       const Pattern &pattern2) {
   KALDI_PARANOID_ASSERT(pattern1.IsValid() && pattern2.IsValid());
   int64 min_mindex1, max_mindex1,
       min_mindex2, max_mindex2;
@@ -722,7 +1001,7 @@ bool PatternsIntersect(const TensorPattern &pattern1,
                                    // "fast mode", used where we just want to
                                    // see whether the intersection is empty.
 
-  std::vector<TensorPattern> intersection;
+  std::vector<Pattern> intersection;
   if (ComputeIntersection(pattern1, pattern2, &intersection,
                           keep_all_patterns)) {
     return (!intersection.empty());
@@ -760,7 +1039,7 @@ bool PatternsIntersect(const TensorPattern &pattern1,
 
 
 /**
-   Offsets, and computing intersection of TensorPatterns.
+   Offsets, and computing intersection of Patterns.
 
    Suppose we are computing the intersetion of the memory-index-sets of pattern1
    and pattern2.
@@ -771,7 +1050,7 @@ bool PatternsIntersect(const TensorPattern &pattern1,
    also a tuple (like an index-tuple, but with possibly negative elements).  This
    function can be thought of as a recursive search for all values of the
    offset 'o' for which at least one such index m exists.  For each such offset
-   'o' we might end up with a TensorPattern;  and the union of all of these
+   'o' we might end up with a Pattern;  and the union of all of these
    patterns is the intersection of pattern1 and pattern2.
 
    The algorithm for computing the list of potential offsets o is recursive,
@@ -855,10 +1134,10 @@ Since
                                memory-index-sets is identical to the difference
                                of pattern1 and pattern2's memory-index-sets.
 */
-void ComputeDifferenceRecursive(const TensorPattern &pattern1,
-                                const TensorPattern &pattern2,
+void ComputeDifferenceRecursive(const Pattern &pattern1,
+                                const Pattern &pattern2,
                                 int32 identical_raxis,
-                                std::vector<TensorPattern> *patterns_out) {
+                                std::vector<Pattern> *patterns_out) {
   if (identical_raxis == 0) {
     /*
       The base-case of the recursion; if we reach here, it means pattern1 and
@@ -924,10 +1203,10 @@ void ComputeDifferenceRecursive(const TensorPattern &pattern1,
 
 
 // See documentation in header.
-bool ComputeDifference(const TensorPattern &pattern1,
-                       const TensorPattern &pattern2,
-                       std::vector<TensorPattern> *difference) {
-  TensorPattern pattern1(pattern1_in),
+bool ComputeDifference(const Pattern &pattern1,
+                       const Pattern &pattern2,
+                       std::vector<Pattern> *difference) {
+  Pattern pattern1(pattern1_in),
       pattern2(pattern2_in);
   CanonicalizePattern(&pattern1);
   CanonicalizePattern(&pattern2);
@@ -947,7 +1226,7 @@ bool ComputeDifference(const TensorPattern &pattern1,
     }
     return true;
   }
-  std::vector<TensorPattern> patterns1, patterns2;
+  std::vector<Pattern> patterns1, patterns2;
   patterns1.reserve(8);
   patterns2.reserve(8);
   intersection->clear();
@@ -964,7 +1243,7 @@ bool ComputeDifference(const TensorPattern &pattern1,
   //      patterns) to next_difference.
   //   set cur_difference = next_difference and clear next_difference.
   // Result is in cur_difference.
-  std::vector<TensorPattern> cur_difference, next_difference;
+  std::vector<Pattern> cur_difference, next_difference;
   cur_difference.swap(patterns1);
 
   for (auto iter2 = patterns2.begin(); iter2 != patterns2.end(); ++iter2) {
@@ -1012,9 +1291,9 @@ bool ComputeDifference(const TensorPattern &pattern1,
 }
 
 
-bool PatternsIntersectSlow(const TensorPattern &pattern1_in,
-                           const TensorPattern &pattern2_in) {
-  TensorPattern pattern1(pattern1_in),
+bool PatternsIntersectSlow(const Pattern &pattern1_in,
+                           const Pattern &pattern2_in) {
+  Pattern pattern1(pattern1_in),
       pattern2(pattern2_in);
   Canonicalize(&pattern1);
   Canonicalize(&pattern2);
@@ -1042,7 +1321,7 @@ bool PatternsIntersectSlow(const TensorPattern &pattern1_in,
 }
 
 
-bool TensorPatternRebaser::Convert(TensorPattern *pattern) {
+bool PatternRebaser::Convert(Pattern *pattern) {
   if (!needs_conversion_)
     return;  // An optimization to make the common case fast.
 
@@ -1067,7 +1346,7 @@ bool TensorPatternRebaser::Convert(TensorPattern *pattern) {
 }
 
 
-int64 TensorPatternRebaser::ConvertMemoryIndex(int64 m) {
+int64 PatternRebaser::ConvertMemoryIndex(int64 m) {
   int32 num_axes = num_axes_;
   int64 ans = dest_offset_;
   m -= src_offset_;
@@ -1097,10 +1376,10 @@ int64 TensorPatternRebaser::ConvertMemoryIndex(int64 m) {
 // is when -DKALDI_PARANOID has been set and we are checking that
 // tensors we are rebasing are strictly inside the source tensor.
 // So in the common case, pattern1 *will* include pattern2.
-bool PatternIncludes(const TensorPattern &pattern1,
-                     const TensorPattern &pattern2) {
+bool PatternIncludes(const Pattern &pattern1,
+                     const Pattern &pattern2) {
 
-  std::vector<TensorPattern> intersection;
+  std::vector<Pattern> intersection;
   if (!ComputeIntersection(pattern1, pattern2, &intersection))
     return -1;  // Could not determine whether the patterns intersect.
   int64 num_elements = 0;
@@ -1113,8 +1392,8 @@ bool PatternIncludes(const TensorPattern &pattern1,
 }
 
 
-void MakeCompactAndJustified(const TensorPattern &src,
-                             TensorPattern *dest) {
+void MakeCompactAndJustified(const Pattern &src,
+                             Pattern *dest) {
   KALDI_PARANOID_ASSERT(src.IsValid());
   int32 num_axes = src.num_axes;
 
@@ -1141,7 +1420,7 @@ void MakeCompactAndJustified(const TensorPattern &src,
       KALDI_PARANOID_ASSERT(abs_stride >= next_abs_stride &&
                             "Input pattern was not valid.");
       if (this_stride < 0) {
-        offset += next_stride * (this_dim - 1);
+        offset += int64(next_stride) * (this_dim - 1);
         dest->strides[raxis] = -next_abs_stride;
       } else {
         dest->strides[raxis] = next_abs_stride;
@@ -1159,8 +1438,8 @@ void MakeCompactAndJustified(const TensorPattern &src,
 }
 
 
-void MakeCompactNonnegativeAndJustified(const TensorPattern &src,
-                                        TensorPattern *dest) {
+void MakeCompactNonnegativeAndJustified(const Pattern &src,
+                                        Pattern *dest) {
   KALDI_PARANOID_ASSERT(src.IsValid());
   int32 num_axes = src.num_axes;
 
@@ -1196,8 +1475,8 @@ void MakeCompactNonnegativeAndJustified(const TensorPattern &src,
 
 
 
-void MakeCompactNormalizedAndJustified(const TensorPattern &src,
-                                       TensorPattern *dest) {
+void MakeCompactNormalizedAndJustified(const Pattern &src,
+                                       Pattern *dest) {
   KALDI_PARANOID_ASSERT(src.IsValid());
   int32 num_axes = src.num_axes;
 
diff --git a/src/tensor/tensor-pattern-extra-utils.h b/src/tensor/tensor-pattern-extra-utils.h
index 7bad15522f2..3058051629d 100644
--- a/src/tensor/tensor-pattern-extra-utils.h
+++ b/src/tensor/tensor-pattern-extra-utils.h
@@ -44,8 +44,8 @@ namespace tensor {
          @return  Return if the two patterns' memory-index-sets'
                   intersection is nonempty.
  */
-bool PatternsIntersect(const TensorPattern &pattern1,
-                       const TensorPattern &pattern2);
+bool PatternsIntersect(const Pattern &pattern1,
+                       const Pattern &pattern2);
 
 
 /**
@@ -53,8 +53,8 @@ bool PatternsIntersect(const TensorPattern &pattern1,
    interface.  it should not be called by users as it is slow.  It is exposed
    here for testing purposes.
 */
-bool PatternsIntersectSlow(const TensorPattern &pattern1,
-                           const TensorPattern &pattern2);
+bool PatternsIntersectSlow(const Pattern &pattern1,
+                           const Pattern &pattern2);
 
 
 /**
@@ -65,7 +65,7 @@ bool PatternsIntersectSlow(const TensorPattern &pattern1,
    of `pattern`; if false it will just return the above expression computedn
    for i and not check.
  */
-int64 IndexPattern(const TensorPattern &pattern,
+int64 IndexPattern(const Pattern &pattern,
                    const std::vector<int32> &i,
                    bool check_valid = true);
 
@@ -78,25 +78,32 @@ int64 IndexPattern(const TensorPattern &pattern,
    Let Offsets(pattern1, pattern2) be the set of n-tuples o such that there
    exists an i with pattern1[i + o] = pattern2[i], with of course i + o in the
    index-tuple-set of pattern1 and i in the index-tuple-set of pattern2.
-   This function outputs the set of such offsets o.
+   This function outputs the set of such offsets o.  The algorithm is a little
+   complicated so we describe it with the implementation.
 
-       @param [in] pattern1   First input pattern.  Must be valid-1.
-       @param [in] pattern2   Second input pattern.  Must be valid-1.
+       @param [in] pattern1   First input pattern.  Must be valid-1 and
+                        normalized+ (i.e. HasNormalizedPositiveStrides(pattern1)).
+       @param [in] pattern2   Second input pattern.  Must be valid-1 and
+                        satisfy SameStrides(pattern1, pattern2).
        @param [in] find_all_offsets  True if the user wants all of the
-                          offsets.  If false, this function may save
-                          computation by stopping after one or more
-                          offsets.  (Useful in testing if patterns intersect).
+                        offsets.  If false, this function may save computation
+                        by stopping after one or more offsets.  (Useful in
+                        testing whether patterns intersect).
        @param [out] offsets   The offsets will be written to here in
                          arbitrary order.  Each offset will be a vector with
                          size() equal to the num_axes of the patterns; the
                          elements may be positive or negative.
+
+   See also (not all of these are declared in headers): OffsetToPattern(),
+   OffsetToHyperrectangle().
  */
-bool FindOffsets(const TensorPattern &pattern1,
-                 const TensorPattern &pattern2,
+bool FindOffsets(const Pattern &pattern1,
+                 const Pattern &pattern2,
                  bool find_all_offsets,
                  std::vector<std::vector<int32> > *offsets);
 
 
+
 /**
    Returns information about whether pattern2's memory-index-set is a subset of
    pattern1's memory-index-set.  See glossary in tensor-pattern.h for
@@ -109,15 +116,15 @@ bool FindOffsets(const TensorPattern &pattern1,
            -1 if we could not compute the intersection (so our
               algorithm could not determine whether one included the other).
  */
-int32 PatternIncludes(const TensorPattern &pattern1,
-                      const TensorPattern &pattern2);
+int32 PatternIncludes(const Pattern &pattern1,
+                      const Pattern &pattern2);
 
 /**
    Inline function that sets dim=1, stride=0 for all axes with
    num_axes <= raxis < KALDI_TENSOR_MAX_DIM.  Often useful.
  */
 inline void SetUnusedDimsAndStrides(int32 num_axes,
-                                    TensorPattern *dest) {
+                                    Pattern *dest) {
 #pragma unroll(2)
   for (int32 raxis = num_axes; raxis < KALDI_TENSOR_MAX_DIM; raxis++) {
     dest->dims[raxis] = 1;
@@ -129,7 +136,7 @@ inline void SetUnusedDimsAndStrides(int32 num_axes,
    Inline function that sets dest->code = -1 and dest->properties = 0;
    often saves coding in functions that create or modify patterns.
  */
-inline void SetDefaultCodeAndProperties(TensorPattern *dest) {
+inline void SetDefaultCodeAndProperties(Pattern *dest) {
   dest->code = -1;
   dest->properties = 0;
 }
@@ -148,8 +155,8 @@ inline void SetDefaultCodeAndProperties(TensorPattern *dest) {
        @return  Returns true if the patterns are equivalent, otherwise
                 false.
  */
-bool PatternsEquivalent(const TensorPattern &pattern1,
-                        const TensorPattern &pattern2);
+bool PatternsEquivalent(const Pattern &pattern1,
+                        const Pattern &pattern2);
 
 
 /**
@@ -165,18 +172,17 @@ bool PatternsEquivalent(const TensorPattern &pattern1,
                         we want the intersection; must be valid.
       @param [in] pattern2  The first of the two patterns of which
                         we want the intersection; must be valid.
+      @param [in]  keep_all_patterns   If this parameter is false,
+                       the algorithm will stop as soon as the
+                       `intersection` vector has one element.  This
+                       is used for a fast test whether an intersection
+                       is empty or not.
       @param [out] intersection  On success, this function outputs
                        a possibly-empty vector of patterns (in arbitrary
                        order), the union of whose memory-index-sets (which
-                       will all be disjoint) equals the intersection fo the
+                       will all be disjoint) equals the intersection of the
                        memory-index-sets of `pattern1` and `pattern2`.
                        (However, see `keep_all_patterns`).
-      @param [in]  keep_all_patterns   If this parameter is set to false,
-                       the algorithm will stop as soon as the
-                       `intersection` vector has one element.  This
-                       is used for a fast test whether an intersection
-                       is empty or ont.
-
       @return  Returns true if the intersection could be computed, and
                false otherwise.  This function will always return true if,
                when the strides of pattern1 and pattern2 are sorted and
@@ -187,22 +193,44 @@ bool PatternsEquivalent(const TensorPattern &pattern1,
                to common strides, are "Regular" (c.f. "Regularity
                property" in glossary).
 */
-bool ComputeIntersection(const TensorPattern &pattern1,
-                         const TensorPattern &pattern2,
-                         std::vector<TensorPattern> *intersection,
-                         bool keep_all_patterns = true);
+bool ComputeIntersection(const Pattern &pattern1,
+                         const Pattern &pattern2,
+                         bool keep_all_patterns,
+                         std::vector<Pattern> *intersection);
+
 
 
 /**
-   This function tries to compute the set-wise difference pattern1 - pattern2:
-   viewed as memory-index-sets, it is trying to compute the set of
+   This function tries to compute the set-wise difference pattern1 - pattern2.
+   Viewed as memory-index-sets, it is trying to compute the set of
    memory-indexes in pattern1 but not in pattern2.  This is computed as a list
-   of TensorPatterns.  This function may fail to compute the set difference in
+   of Patterns.  This function may fail to compute the set difference in
    certain very pathological cases (see documentation of return status).
+
+      @param [in] pattern1  The pattern we are subtracting from;
+                       if it does not intersect with pattern2, the
+                       result will be identical to pattern1.
+                       Must be valid.
+      @param [in] pattern2  The pattern we are subtracting; must be valid.
+      @param [out] difference  On success, this function outputs
+                     a possibly-empty vector of patterns (in arbitrary
+                     order), the union of whose memory-index-sets (which
+                     will all be disjoint) equals the set-wise difference
+                     M(pattern1) - M(pattern2) of the memory-index-sets of
+                     `pattern1` and `pattern2`.
+      @return  Returns true if the difference could be computed, and
+               false otherwise.  This function will always return true if,
+               when the strides of pattern1 and pattern2 are sorted and
+               duplicates removed and listed in increasing order, each
+               stride divides the next one in the list exactly; but this is
+               not a necessary condition.   (The necessary condition
+               is that both patterns, when compressed and converted
+               to common strides, are "Regular" (c.f. "Regularity
+               property" in glossary)).
 */
-bool ComputeDifference(const TensorPattern &pattern1,
-                       const TensorPattern &pattern2,
-                       std::vector<TensorPattern> *difference);
+bool ComputeDifference(const Pattern &pattern1,
+                       const Pattern &pattern2,
+                       std::vector<Pattern> *difference);
 
 
 
@@ -217,8 +245,8 @@ bool ComputeDifference(const TensorPattern &pattern1,
       @return               Returns true if the memory-index-set of
                             pattern1 and pattern2 have nonempty intersection.
  */
-bool PatternsIntersect(const TensorPattern &pattern1,
-                       const TensorPattern &pattern2);
+bool PatternsIntersect(const Pattern &pattern1,
+                       const Pattern &pattern2);
 
 /**
       @param [in] pattern   The pattern about whose memory-index-set
@@ -230,7 +258,7 @@ bool PatternsIntersect(const TensorPattern &pattern1,
                             index-tuple i such that `pattern[i] == mindex`;
                             see "Indexing a pattern" in the glossary.
 */
-bool PatternContains(const TensorPattern &pattern,
+bool PatternContains(const Pattern &pattern,
                      int64 mindex);
 
 
@@ -244,29 +272,27 @@ bool PatternContains(const TensorPattern &pattern,
                 the memory-index-set of q (see tensor-pattern.h for definition;
                 of memory-index-set).
  */
-bool PatternIsSubsetOf(const TensorPattern &p,
-                       const TensorPattern &q);
+bool PatternIsSubsetOf(const Pattern &p,
+                       const Pattern &q);
 
 
 /**
-   Compute the minimum and maximum memory-indexs present in
+   Compute the minimum and maximum memory-indexes present in
    a pattern's memory-index-set (i.e. the minimum and maximum
    indexes into the underlying array).
 
       @param [in] pattern  The pattern whose minimum and maximum
                            memory-index we are computing
       @param [out] min_mindex  The minimum memory-index in the
-                           memory-index-set of the pattern.  Will
-                           be zero in Patterns with non-negative
-                           strides (e.g. Patterns in canonical form,
-                           or other Patterns with normalized
-                           strides).  Should always be >= 0 in
-                           Patterns created by a valid program.
+                           memory-index-set of the pattern.  Will be zero in
+                           Patterns with non-negative strides.  Will always be
+                           >= 0 in Patterns created by a program that's
+                           doing something that makes sense.
       @param [out] max_mindex  The maximum memory-index in the
-                           memory-index-set of the pattern.
-                           Will always be >= min_mindex.
+                           memory-index-set of the pattern.  Will always be >=
+                           min_mindex.
 */
-void ComputeMinAndMaxMindex(const TensorPattern &pattern,
+void ComputeMinAndMaxMindex(const Pattern &pattern,
                             int64 *min_mindex,
                             int64 *max_mindex);
 
@@ -286,7 +312,7 @@ void ComputeMinAndMaxMindex(const TensorPattern &pattern,
                        `pattern`, containing 1 for memory-indexse in the set and 0 for
                        those out of the set.
  */
-bool ToMemoryIndexSet(const TensorPattern &pattern,
+bool ToMemoryIndexSet(const Pattern &pattern,
                       std::vector<char> *s);
 
 /**
@@ -295,7 +321,7 @@ bool ToMemoryIndexSet(const TensorPattern &pattern,
      @param [in] pattern   Pattern; must be valid-1.
      @return  Returns randomly chosen memory-index.
  */
-int64 RandomMemoryIndex(const TensorPattern &pattern);
+int64 RandomMemoryIndex(const Pattern &pattern);
 
 
 
@@ -309,7 +335,7 @@ int64 RandomMemoryIndex(const TensorPattern &pattern);
       @param [in] pattern  The input pattern
       @param [out] s   The memory-index-tuple-set
  */
-bool ToMemoryIndexTupleSet(const ArrayRef<TensorPattern*>  patterns,
+bool ToMemoryIndexTupleSet(const ArrayRef<Pattern*>  patterns,
                            std::unordered_set<std::vector<int32>, VectorHasher> *s);
 
 
@@ -318,19 +344,19 @@ bool ToMemoryIndexTupleSet(const ArrayRef<TensorPattern*>  patterns,
    that their memory-index-tuple-sets are the same.  See glossary
    in tensor-pattern.h for explanation.
  */
-bool PatternTuplesEquivalent(const ArrayRef<const TensorPattern*> patterns1,
-                             const ArrayRef<const TensorPattern*> patterns2);
+bool PatternTuplesEquivalent(const ArrayRef<const Pattern*> patterns1,
+                             const ArrayRef<const Pattern*> patterns2);
 
 /**
-   Returns true if TensorPattern p is linear in TensorPattern q.  (Note:
+   Returns true if Pattern p is linear in Pattern q.  (Note:
    this is a rather technical property, see tensor-pattern.h for definition).
 
       @param [in] p  The first pattern.  Must be valid
       @param [in] q  The second pattern.  Must be valid and must satisfy
                      `PatternIsSubsetOf(p, q);`
  */
-bool IsLinearIn(const TensorPattern &p,
-                const TensorPattern &q);
+bool IsLinearIn(const Pattern &p,
+                const Pattern &q);
 
 /**
    This function returns true if a Pattern is regular (see Regularity property
@@ -338,20 +364,20 @@ bool IsLinearIn(const TensorPattern &p,
    have all positive strides, the strides must be in increasing order (in the
    private numbering), and it must be valid-2 (see glossary).
  */
-bool IsRegular(const TensorPattern &pattern);
+bool IsRegular(const Pattern &pattern);
 
 
 /**
    This function returns true if a Pattern is valid-1 (see definition in
-   glossary); see also TensorPattern::Valid() and IsValid2().
+   glossary); see also Pattern::Valid() and IsValid2().
  */
-bool IsValid1(const TensorPattern &pattern);
+bool IsValid1(const Pattern &pattern);
 
 /**
    This function returns true if a Pattern is valid-2 (see definition in
-   glossary); see also TensorPattern::Valid() and IsValid1().
+   glossary); see also Pattern::Valid() and IsValid1().
  */
-bool IsValid2(const TensorPattern &pattern);
+bool IsValid2(const Pattern &pattern);
 
 
 /**
@@ -394,9 +420,9 @@ bool IsValid2(const TensorPattern &pattern);
                         have to output a number of patterns that couldn't be
                         bounded given the number of axes.
   */
-bool ConvertPatternStrides(const TensorPattern &pattern,
+bool ConvertPatternStrides(const Pattern &pattern,
                            const ArrayRef<int32> strides,
-                           std::vector<TensorPattern> *patterns);
+                           std::vector<Pattern> *patterns);
 
 /**
    This function fills in any 'gaps' in the memory-indexes in 'src' and
@@ -421,8 +447,8 @@ bool ConvertPatternStrides(const TensorPattern &pattern,
                         Will satisfy `CompactAndJustified(*dest)`,
                         and also `IsValid(*dest)`, assuming `IsValid(src)`.
  */
-void MakeCompactAndJustified(const TensorPattern &src,
-                             TensorPattern *dest);
+void MakeCompactAndJustified(const Pattern &src,
+                             Pattern *dest);
 
 
 /**
@@ -434,11 +460,11 @@ void MakeCompactAndJustified(const TensorPattern &src,
                          (`p->IsValid()`).  At exit, will be
                          valid and also justified (`IsJustified(p)`).
  */
-void MakeJustified(TensorPattern *p);
+void MakeJustified(Pattern *p);
 
 
 /**
-   This function copies the TensorPattern 'src' from 'dest', preserving the
+   This function copies the Pattern 'src' to 'dest', preserving the
    num_axes and dims while possibly modifying the strides and offset.  The
    strides of 'dest' will be normalized (i.e. nonnegative with positive strides
    strictly increasing in the private axis-numbering), the pattern will be
@@ -451,12 +477,12 @@ void MakeJustified(TensorPattern *p);
                       will be normalized, the pattern will be compact
                       (no gaps between memory-indexes) and offset will be 0.
  */
-void MakeCompactNormalizedAndJustified(const TensorPattern &src,
-                                       TensorPattern *dest);
+void MakeCompactNormalizedAndJustified(const Pattern &src,
+                                       Pattern *dest);
 
 
 /**
-   This function copies the TensorPattern 'src' from 'dest', preserving the
+   This function copies the Pattern 'src' to 'dest', preserving the
    num_axes and dims while possibly modifying the strides and offset.  The
    strides of 'dest' will be nonnegative but the ordering from least to greatest
    of the nonzero strides will be the same as the ordering of the absolute
@@ -468,16 +494,16 @@ void MakeCompactNormalizedAndJustified(const TensorPattern &src,
                   num_axes and dims with src, but the strides and
                   offset may be different.
 */
-void MakeCompactNonnegativeAndJustified(const TensorPattern &src,
-                                        TensorPattern *dest);
+void MakeCompactNonnegativeAndJustified(const Pattern &src,
+                                        Pattern *dest);
 
 
 
 
 /**
-   Class TensorPatternRebaser is an object that converts TensorPattern
+   Class PatternRebaser is an object that converts Patterns
    when memory layouts change.  The main use-case is when a base Variable
-   (c.f. variable.h for definition) has a TensorPattern that is not
+   (c.f. variable.h for definition) has a Pattern that is not
    contiguous (see tensor-pattern.h for definition of 'contiguous'), and
    its gradient Tensor is allocated contiguously.  This class is
    needed to convert patterns for Variables into patterns for their
@@ -486,7 +512,7 @@ void MakeCompactNonnegativeAndJustified(const TensorPattern &src,
    We make it an object rather than a function in order to avoid repetition when
    multiple patterns need to be rebased.
  */
-class TensorPatternRebaser {
+class PatternRebaser {
 
   /*
     Constructor.
@@ -509,8 +535,8 @@ class TensorPatternRebaser {
     The purpose of this object is to modify patterns in a way that maps
     their memory-indexes with the same function.
   */
-  TensorPatternRebaser(const TensorPattern &src_pattern,
-                       const TensorPattern &dest_pattern);
+  PatternRebaser(const Pattern &src_pattern,
+                       const Pattern &dest_pattern);
 
 
   /**
@@ -533,15 +559,15 @@ class TensorPatternRebaser {
 
      @return  Returns true if the conversion was possible.
    */
-  bool Rebase(TensorPattern *pattern);
+  bool Rebase(Pattern *pattern);
 
   private:
 
   // TODO: remove src_pattern_ and dest_pattern_ once everything
   // is debugged.  They are copies of the src_pattern and dest_pattern
   // passed to the constructor.
-  TensorPattern src_pattern_;
-  TensorPattern dest_pattern_;
+  Pattern src_pattern_;
+  Pattern dest_pattern_;
 
   // If needs_conversion_ is false, it means the patterns don't need any conversion
   // at all (this is an optimization).
@@ -609,7 +635,7 @@ class TensorPatternRebaser {
 class OutOfPlaceAxisSorter {
  public:
   // Constructor.
-  inline OutOfPlaceAxisSorter(const TensorPattern &src) {
+  inline OutOfPlaceAxisSorter(const Pattern &src) {
     int32 num_axes = src.num_axes;
     for (int32 raxis = 0; raxis < src.num_axes; raxis++)
       orig_raxis_[raxis] = raxis;
diff --git a/src/tensor/tensor-pattern-utils-test.cc b/src/tensor/tensor-pattern-utils-test.cc
index 2c003174a2b..881c2c01fb7 100644
--- a/src/tensor/tensor-pattern-utils-test.cc
+++ b/src/tensor/tensor-pattern-utils-test.cc
@@ -26,7 +26,7 @@ namespace kaldi {
 namespace tensor {
 
 // We may later move this function to somewhere more permanent.
-void GenerateRandomPattern(TensorPattern *pattern) {
+void GenerateRandomPattern(Pattern *pattern) {
 
   int32 num_axes = RandInt(0, KALDI_TENSOR_MAX_DIM);
 
@@ -68,7 +68,7 @@ void GenerateRandomPattern(TensorPattern *pattern) {
 
 
 void UnitTestGenRandomPattern() {
-  TensorPattern p;
+  Pattern p;
   for (int32 i = 0; i < 100; i++) {
     GenerateRandomPattern(&p);
   }
diff --git a/src/tensor/tensor-pattern-utils.cc b/src/tensor/tensor-pattern-utils.cc
index 015752f25b6..d0931765db7 100644
--- a/src/tensor/tensor-pattern-utils.cc
+++ b/src/tensor/tensor-pattern-utils.cc
@@ -22,7 +22,7 @@
 namespace kaldi {
 namespace tensor {
 
-int32 ComputePatternCode(const TensorPattern &pattern) {
+int32 ComputePatternCode(const Pattern &pattern) {
   int32 ans = 0;
 
   int32 n = 0;
@@ -54,7 +54,7 @@ int32 ComputePatternCode(const TensorPattern &pattern) {
 }
 
 
-void ComputeMinAndMaxMindex(const TensorPattern *pattern,
+void ComputeMinAndMaxMindex(const Pattern *pattern,
                             int64 *min_mindex,
                             int64 *max_mindex) {
   KALDI_PARANOID_ASSERT(IsValid(pattern));
@@ -125,7 +125,7 @@ void ComputeMinAndMaxMindex(const TensorPattern *pattern,
    If this were moved to a header we would have to make it update the pattern
    code.
  */
-static inline bool NormalizeSigns(ArrayRef<TensorPattern*> patterns,
+static inline bool NormalizeSigns(ArrayRef<Pattern*> patterns,
                                   int32 max_num_axes,
                                   int64 *data_offsets) {
   bool changed = false;
@@ -183,7 +183,7 @@ static inline bool NormalizeSigns(ArrayRef<TensorPattern*> patterns,
 
    (We also require that the new dimension must not overflow an int32.)
  */
-static inline bool Combinable(const TensorPattern &p,
+static inline bool Combinable(const Pattern &p,
                               int32 raxis1, int32 raxis2) {
   return pattern.strides[raxis2] == p.strides[raxis1] * p.dims[raxis1] &&
       static_cast<int64>(p.dims[raxis1])*static_cast<int64>(p.dims[raxis2]) <
@@ -194,7 +194,7 @@ static inline bool Combinable(const TensorPattern &p,
 // Returns true iff the axis 'axis' has zero stride (and hence dim=1)
 // for all the supplied patterns.  An axis like this can be removed without
 // affecting the result.
-static inline bool AxisIsTrivial(ArrayRef<TensorPattern> patterns,
+static inline bool AxisIsTrivial(ArrayRef<Pattern> patterns,
                                  int32 raxis) {
   for (size_t p = 0; p < patterns.size; p++)
     if (patterns[p].strides[raxis] != 0)
@@ -208,7 +208,7 @@ static inline bool AxisIsTrivial(ArrayRef<TensorPattern> patterns,
 // of that trivial axis).  axis1 is the one with the smaller stride, and is the
 // one whose stride we keep in the combined axis; that is the asymmetry
 // between axis1 and axis2.
-static inline void CombineAxes(ArrayRef<TensorPattern*> patterns,
+static inline void CombineAxes(ArrayRef<Pattern*> patterns,
                                int32 raxis1, int32 raxis2) {
   size_t num_patterns = patterns.size;
 #ifdef KALDI_PARANOID
@@ -223,7 +223,7 @@ static inline void CombineAxes(ArrayRef<TensorPattern*> patterns,
     // the chance of having to move dims/strides around when removing
     // trivial axes later on.
     for (size_t p = 0; p < num_patterns; p++) {
-      TensorPattern *pattern = patterns[p];
+      Pattern *pattern = patterns[p];
       pattern->dims[raxis2] *= pattern->dims[raxis1];
       pattern->strides[raxis2] *= pattern->strides[raxis1];
       pattern->dims[raxis1] = 1;
@@ -232,7 +232,7 @@ static inline void CombineAxes(ArrayRef<TensorPattern*> patterns,
   } else {
     // keep raxis1, remove raxis2.
     for (size_t p = 0; p < num_patterns; p++) {
-      TensorPattern *pattern = patterns[p];
+      Pattern *pattern = patterns[p];
       pattern->dims[raxis1] *= pattern->dims[raxis1];
       pattern->dims[raxis2] = 1;
       pattern->strides[raxis2] = 0;
@@ -262,7 +262,7 @@ static inline void CombineAxes(ArrayRef<TensorPattern*> patterns,
    CAUTION: this function does not update the codes of 'patterns'.
  */
 static void RemoveTrivialAxes(bool is_trivial_raxis[KALDI_TENSOR_MAX_AXES],
-                              ArrayRef<TensorPattern*> patterns) {
+                              ArrayRef<Pattern*> patterns) {
   int32 first_trivial_raxis = -1;
   for (int32 raxis = 0; raxis < KALDI_TENSOR_MAX_AXES; raxis++) {
     if (is_trivial_axis[raxis]) {
@@ -273,7 +273,7 @@ static void RemoveTrivialAxes(bool is_trivial_raxis[KALDI_TENSOR_MAX_AXES],
   KALDI_PARANOID_ASSERT(first_trivial_raxis >= 0);
 
   for (size_t p = 0; p < patterns.size; p++) {
-    TensorPattern *pattern = patterns[p];
+    Pattern *pattern = patterns[p];
     // Keep the axes right-justified.  We work from the right to the left.
 
     // We do the loop over axes inside the loop over p for memory locality.
@@ -301,7 +301,7 @@ static void RemoveTrivialAxes(bool is_trivial_raxis[KALDI_TENSOR_MAX_AXES],
   }
 }
 
-void CompressPatterns(ArrayRef<TensorPattern*> patterns,
+void CompressPatterns(ArrayRef<Pattern*> patterns,
                       int64_t *data_offsets) {
   size_t num_patterns = patterns.size;
 #ifdef KALDI_PARANOID
@@ -398,14 +398,14 @@ void CompressPatterns(ArrayRef<TensorPattern*> patterns,
 }
 
 
-void CompressOnePattern(TensorPattern *pattern,
+void CompressOnePattern(Pattern *pattern,
                         int64 *data_offset) {
   // We may at some point implement this specially; doing this would be more efficient.
   CompressPatterns({pattern}, data_offset);
 }
 
 
-void SortAxes(TensorPattern *pattern) {
+void SortAxes(Pattern *pattern) {
   int32 num_axes = pattern->num_axes;
   switch(num_axes) {
     case 0: case 1:
@@ -449,7 +449,7 @@ void SortAxes(TensorPattern *pattern) {
   }
 }
 
-void Transpose(int32 raxis1, int32 raxis2, TensorPattern *p) {
+void Transpose(int32 raxis1, int32 raxis2, Pattern *p) {
   if (static_cast<uint32>(raxis1) >= static_cast<uint32>(p->num_axes) ||
       static_cast<uint32>(raxis2) >= static_cast<uint32>(p->num_axes)) {
     KALDI_ERR << "Invalid axes to transpose: raxis1="
@@ -461,7 +461,7 @@ void Transpose(int32 raxis1, int32 raxis2, TensorPattern *p) {
   p->code = -1;
 }
 
-void Transpose(int32 axis1, int32 axis2, TensorPattern *p) {
+void Transpose(int32 axis1, int32 axis2, Pattern *p) {
   int32 num_axes = p->num_axes;
   // interpret negative axes as offsets from num_axes.
 
@@ -482,7 +482,7 @@ void Transpose(int32 axis1, int32 axis2, TensorPattern *p) {
 
 
 
-void RemoveTrivialAxes(TensorPattern *pattern) {
+void RemoveTrivialAxes(Pattern *pattern) {
   int32 num_axes = pattern->num_axes,
       num_axes_out = 0;
   for (int32 raxis = 0; raxis < num_axes; raxis++) {
@@ -494,7 +494,7 @@ void RemoveTrivialAxes(TensorPattern *pattern) {
       }
     }
   }
-  // It is a requirement of struct TensorPattern that dims and
+  // It is a requirement of struct Pattern that dims and
   // strides for raxis >= num_axes be 1 and 0 respectively.
   for (int32 raxis = num_axes_out; raxis < num_axes; raxis++) {
     pattern->dims[raxis] = 1;
@@ -505,8 +505,8 @@ void RemoveTrivialAxes(TensorPattern *pattern) {
 }
 
 
-void RemoveTrivialAxes(const TensorPattern &pattern_in,
-                       TensorPattern *pattern_out) {
+void RemoveTrivialAxes(const Pattern &pattern_in,
+                       Pattern *pattern_out) {
   KALDI_PARANOID_ASSERT(pattern_out != &pattern_in);
   int32 num_axes = pattern->num_axes,
       num_axes_out = 0;
@@ -517,7 +517,7 @@ void RemoveTrivialAxes(const TensorPattern &pattern_in,
       pattern_out->axes[num_axes_out] = pattern_in.strides[raxis];
     }
   }
-  // It is a requirement of struct TensorPattern that dims and
+  // It is a requirement of struct Pattern that dims and
   // strides for raxis >= num_axes be 1 and 0 respectively.
   for (int32 raxis = num_axes_out;
        raxis < KALDI_TENSOR_MAX_AXES; raxis++) {
@@ -528,7 +528,7 @@ void RemoveTrivialAxes(const TensorPattern &pattern_in,
   pattern_out->code = -1;
 }
 
-int64 NumElements(const TensorPattern &pattern) {
+int64 NumElements(const Pattern &pattern) {
   int32 num_axes = pattern.num_axes;
   int64 ans = 1;
   for (int32 raxis = 0; raxis < num_axes; raxis++)
@@ -537,7 +537,7 @@ int64 NumElements(const TensorPattern &pattern) {
 }
 
 void Select(int32 eaxis, int32 index,
-            const TensorPattern &src, TensorPattern *dest) {
+            const Pattern &src, Pattern *dest) {
   KALDI_PARANOID_ASSERT(src.IsValid());
   int32 num_axes = src.num_axes,
       raxis = EaxisToRaxis(eaxis);
@@ -567,7 +567,7 @@ void Select(int32 eaxis, int32 index,
   dest->properties = 0;
 }
 
-void Slice(int32 axis, int32 start, int32 end, TensorPattern *pattern) {
+void Slice(int32 axis, int32 start, int32 end, Pattern *pattern) {
   int32 num_axes = pattern->num_axes,
       raxis = EaxisToRaxis(eaxis);
   KALDI_PARANOID_ASSERT(pattern->IsValid());
@@ -604,7 +604,7 @@ void Slice(int32 axis, int32 start, int32 end, TensorPattern *pattern) {
 }
 
 
-void UnsqueezeR(int32 raxis, const TensorPattern &src, TensorPattern *dest) {
+void UnsqueezeR(int32 raxis, const Pattern &src, Pattern *dest) {
   int32 num_axes_in = src.num_axes;
   KALDI_ASSERT(static_cast<uint32>(raxis) <= num_axes_in &&
                num_axes_in < KALDI_TENSOR_MAX_DIM);
diff --git a/src/tensor/tensor-pattern-utils.h b/src/tensor/tensor-pattern-utils.h
index 1725e746b40..95529320566 100644
--- a/src/tensor/tensor-pattern-utils.h
+++ b/src/tensor/tensor-pattern-utils.h
@@ -66,7 +66,7 @@ inline int32 EaxisToRaxis(int32 eaxis, int32 num_axes) {
                      indicates that a negative stride was present.
 */
 inline bool PatternMightContainNegativeStride(
-    const TensorPattern &pattern) {
+    const Pattern &pattern) {
   // 2048 is 1 << 11; 11th bit in code is set if code indicates negative stride.
   return (pattern.code | 2048) != 0;
 }
@@ -107,8 +107,8 @@ inline bool AxisIsTrivial(int32 pattern_code, int32 raxis) {
                         reduced.  Will be valid at output if pattern_in was
                         valid-1 at input.
 */
-void RemoveTrivialAxes(const TensorPattern &pattern_in,
-                       TensorPattern *pattern_out);
+void RemoveTrivialAxes(const Pattern &pattern_in,
+                       Pattern *pattern_out);
 
 
 /**
@@ -119,7 +119,7 @@ void RemoveTrivialAxes(const TensorPattern &pattern_in,
                          will be removed and the num_axes reduced.  Will be
                          valid at output if it was valid-1 at input.
  */
-void RemoveTrivialAxes(TensorPattern *pattern);
+void RemoveTrivialAxes(Pattern *pattern);
 
 
 /**
@@ -188,7 +188,7 @@ void RemoveTrivialAxes(TensorPattern *pattern);
 
     ...
  */
-int32 ComputePatternCode(const TensorPattern &pattern);
+int32 ComputePatternCode(const Pattern &pattern);
 
 
 inline int32 CombineCodes(int32 code1, int32 code2) {
@@ -203,7 +203,7 @@ inline int64 CombineCodes(int32 code1, int32 code2, int32 code3) {
 
 
 /**
-   Copies a TensorPattern from `src` to `dest` while modifying it by inserting
+   Copies a Pattern from `src` to `dest` while modifying it by inserting
    an axis with (dim=1,stride=0) at position `raxis` (specified in the
    private numbering).
 
@@ -216,7 +216,7 @@ inline int64 CombineCodes(int32 code1, int32 code2, int32 code3) {
                            was valid at entry (which this function may not
                            check).
  */
-void UnsqueezeR(int32 raxis, const TensorPattern &src, TensorPattern *dest);
+void UnsqueezeR(int32 raxis, const Pattern &src, Pattern *dest);
 
 
 /**
@@ -243,7 +243,7 @@ void UnsqueezeR(int32 raxis, const TensorPattern &src, TensorPattern *dest);
                             at exit, possibly its dims and strides
                             arrays changed, and its code updated.
  */
-inline void Unsqueeze(int32 eaxis, TensorPattern *p) {
+inline void Unsqueeze(int32 eaxis, Pattern *p) {
   UnsqueezeR(EaxisToRaxis(eaxis, p->num_axes));
 }
 
@@ -276,7 +276,7 @@ inline void Unsqueeze(int32 eaxis, TensorPattern *p) {
                             at exit, possibly its dims and strides
                             arrays changed, and its 'code' updated.
 */
-void SqueezeR(int32 raxis, TensorPattern *p);
+void SqueezeR(int32 raxis, Pattern *p);
 
 
 /**
@@ -307,7 +307,7 @@ void SqueezeR(int32 raxis, TensorPattern *p);
                             at exit, possibly its dims and strides
                             arrays changed, and its 'code' updated.
  */
-inline void Squeeze(int32 axis, TensorPattern *p) {
+inline void Squeeze(int32 axis, Pattern *p) {
   if (axis < 0) SqueezeR(1 - axis, p);
   else SqueezeR(p->num_axes - 1 - axis, p);
 }
@@ -315,20 +315,20 @@ inline void Squeeze(int32 axis, TensorPattern *p) {
 
 
 /** Transpose the two specified axes (specified in the private/reversed
-    numbering) of a TensorPattern.
+    numbering) of a Pattern.
 
     @param [in] raxis1  First axis to be transposed; must be in range
                         `[0, p->num_axes - 1]`
     @param [in] raxis2  Second axis to be transposed; must be in range
                         `[0, p->num_axes - 1]`
                         If identical to axis1, nothing will be done.
-    @param [in,out] p  TensorPattern whose axes are to be transposed.
+    @param [in,out] p  Pattern whose axes are to be transposed.
  */
-void TransposeR(int32 raxis1, int32 raxis2, TensorPattern *p);
+void TransposeR(int32 raxis1, int32 raxis2, Pattern *p);
 
 
 /** Transpose the two specified axes (specified in the private/reversed
-    numbering) of a TensorPattern.
+    numbering) of a Pattern.
 
     @param [in] axis1  First axis to be transposed; must be in range
                        `[-p->num_axes, p->num_axes - 1]`,
@@ -339,10 +339,10 @@ void TransposeR(int32 raxis1, int32 raxis2, TensorPattern *p);
     @param [in] axis2  Second axis to be transposed; must be in range
                        `[-p->num_axes, t->num_axes - 1]`.
                        If identical to axis1, nothing will be done.
-    @param [in,out] p  TensorPattern whose axes are to be transposed.
+    @param [in,out] p  Pattern whose axes are to be transposed.
                        p->code is updated.
  */
-void TransposeR(int32 raxis1, int32 raxis2, TensorPattern *p);
+void TransposeR(int32 raxis1, int32 raxis2, Pattern *p);
 
 
 
@@ -374,7 +374,7 @@ void TransposeR(int32 raxis1, int32 raxis2, TensorPattern *p);
                             at exit, possibly its dims and strides
                             arrays changed, and its 'code' updated.
 */
-inline void Squeeze(int32 axis, TensorPattern *p) {
+inline void Squeeze(int32 axis, Pattern *p) {
   if (axis < 0) SqueezeR(1 - axis, p);
   else SqueezeR(p->num_axes - 1 - axis, p);
 }
@@ -396,7 +396,7 @@ inline void Squeeze(int32 axis, TensorPattern *p) {
                 an additional constraint that `a.dims[i] <= b.dims[i]` if
                 `b_non_reducing == true`.
  */
-bool Broadcastable(const TensorPattern &a, const TensorPattern &b,
+bool Broadcastable(const Pattern &a, const Pattern &b,
                    bool b_non_reducing = false);
 
 
@@ -418,8 +418,8 @@ bool Broadcastable(const TensorPattern &a, const TensorPattern &b,
                 `c_non_reducing == true`).
 
  */
-bool Broadcastable(const TensorPattern &a, const TensorPattern &b,
-                   const TensorPattern &c, bool c_non_reducing = false);
+bool Broadcastable(const Pattern &a, const Pattern &b,
+                   const Pattern &c, bool c_non_reducing = false);
 
 
 
@@ -438,7 +438,7 @@ bool Broadcastable(const TensorPattern &a, const TensorPattern &b,
                       a and b are the same after padding as for broadcasting.
    See also the 3-arg version of SamePaddedDims(), and SameDims().
 */
-bool SamePaddedDims(const TensorPattern &a, const TensorPattern &b);
+bool SamePaddedDims(const Pattern &a, const Pattern &b);
 
 
 /**
@@ -449,8 +449,8 @@ bool SamePaddedDims(const TensorPattern &a, const TensorPattern &b);
 
    This is a stronger condition than Broadcastable(a, b, c).
  */
-bool SamePaddedDims(const TensorPattern &a, const TensorPattern &b,
-                    const TensorPattern &c);
+bool SamePaddedDims(const Pattern &a, const Pattern &b,
+                    const Pattern &c);
 
 /**
    Return true if the two provided patterns have the same dims-vectors
@@ -463,7 +463,7 @@ bool SamePaddedDims(const TensorPattern &a, const TensorPattern &b,
                      the elements of their 'dims' members are the same.
    See also: SamePaddedDims().
 */
-bool SameDims(const TensorPattern &a, const TensorPattern &b);
+bool SameDims(const Pattern &a, const Pattern &b);
 
 /**
    Returns true if pattern1 and pattern2 have the same num_axes and strides.
@@ -474,13 +474,13 @@ bool SameDims(const TensorPattern &a, const TensorPattern &b);
       @return        Returns true if a.num_axes == b.num_axes and
                      the elements of their 'strides' members are the same.
  */
-bool SameStrides(const TensorPattern &a,
-                 const TensorPattern &b);
+bool SameStrides(const Pattern &a,
+                 const Pattern &b);
 
 
 
 /**
-   Compresses a TensorPattern by removing or combining as many axes as possible.
+   Compresses a Pattern by removing or combining as many axes as possible.
    This version is suitable for operations that do not rely on any kind of
    structure, such as zeroing or nonlinearities; the only equivalence maintained
    is equivalence of the set of memory locations covered (the memory-index-set).
@@ -490,7 +490,7 @@ bool SameStrides(const TensorPattern &a,
 
       @param [in,out]  pattern   The pattern to be compressed
 
-   Examples are below, where we write a TensorPattern as
+   Examples are below, where we write a Pattern as
 
    `{{dim1,dim2,..}, {stride1,stride2,..} [,offset] }`
 
@@ -509,7 +509,7 @@ bool SameStrides(const TensorPattern &a,
    {{2,3,4},{100,4,1}}        {{2,12},{100,1}}
 \endverbatim
  */
-void CompressOnePattern(TensorPattern *pattern);
+void CompressOnePattern(Pattern *pattern);
 
 
 
@@ -526,19 +526,19 @@ void CompressOnePattern(TensorPattern *pattern);
                    from most negative to most positive stride (in the
                    physical ordering).
  */
-void SortAxes(TensorPattern *pattern);
+void SortAxes(Pattern *pattern);
 
 
 // TODO: document this.
-inline void CanonicalizePattern(TensorPattern *pattern) {
+inline void CanonicalizePattern(Pattern *pattern) {
   CompressOnePattern(pattern);
   SortAxes(pattern);
 }
 
 // TODO: document this.  This will later be replaced with
 // a more efficient version.
-inline void CanonicalizePattern(contst TensorPattern &pattern_in,
-                                TensorPattern *pattern_out) {
+inline void CanonicalizePattern(const Pattern &pattern_in,
+                                Pattern *pattern_out) {
   *pattern_out = pattern_in;
   CanonicalizePattern(pattern_out);
 }
@@ -548,7 +548,7 @@ inline void CanonicalizePattern(contst TensorPattern &pattern_in,
    glossary for the meaning).  CanonicalizePattern() will modify a valid pattern
    to put it in canonical form.
  */
-bool IsCanonical(const TensorPattern &pattern);
+bool IsCanonical(const Pattern &pattern);
 
 
 /**
@@ -557,7 +557,7 @@ bool IsCanonical(const TensorPattern &pattern);
    to at least satisfy the uniqueness property for this to actually give
    the number of elements, but this is not checked).
 */
-int64 NumElements(const TensorPattern &pattern);
+int64 NumElements(const Pattern &pattern);
 
 
 /**
@@ -585,7 +585,7 @@ int64 NumElements(const TensorPattern &pattern);
                     but using the strides of later numbered patterns in
                     case of ties.
  */
-void SortAxes(ArrayRef<TensorPattern*> patterns);
+void SortAxes(ArrayRef<Pattern*> patterns);
 
 /**
   Multiplies all strides and the offset in 'pattern' by 'scale', which must be >
@@ -594,13 +594,13 @@ void SortAxes(ArrayRef<TensorPattern*> patterns);
   This function is used in the memory-locking code if the same storage location
   is accessed using different dtypes (which is unlikely).
  */
-void ScaleStridesAndOffset(int32 scale, TensorPattern *pattern);
+void ScaleStridesAndOffset(int32 scale, Pattern *pattern);
 
 
 
-/// Hashing object, used when we need an unordered_map containing TensorPattern.
+/// Hashing object, used when we need an unordered_map containing Pattern.
 class PatternHasher {
-  size_t operator () (const TensorPattern &pattern) const;
+  size_t operator () (const Pattern &pattern) const;
 };
 
 
@@ -609,14 +609,14 @@ class PatternHasher {
   are exactly two patterns to be jointly compressed.  See documentation of
   CompressPatterns() for explanation.
  */
-void CompressTwoPatterns(TensorPattern *a,
-                         TensorPattern *b);
+void CompressTwoPatterns(Pattern *a,
+                         Pattern *b);
 
 
 /**
-   Compresses one or more TensorPattern by removing or combining as many axes as
+   Compresses one or more Patterns by removing or combining as many axes as
    possible.  See the documentation for CompressOnePattern() to understand the
-   basic concept of compressing a single TensorPattern to a pattern with possibly
+   basic concept of compressing a single Pattern to a pattern with possibly
    fewer axes (and maybe with negative strides converted to positive),
    which covers the same set of memory locations as the original Tensor.
 
@@ -638,7 +638,7 @@ void CompressTwoPatterns(TensorPattern *a,
       @return  Returns true if it made any change to the patterns,
                false if they were unchanged.
 
- Examples are below, where we write a TensorPattern as
+ Examples are below, where we write a Pattern as
  `{{dim1,dim2,..}, {stride1,stride2,..}}`.
 
 \verbatim
@@ -651,10 +651,10 @@ void CompressTwoPatterns(TensorPattern *a,
  {{3,4},{4,1}}        {{1,1},{0,0}}      {{12},{1}}           {{1},{0}}    # combine
 \endverbatim
  */
-bool CompressPatterns(ArrayRef<TensorPattern*> patterns);
+bool CompressPatterns(ArrayRef<Pattern*> patterns);
 
 /**
-   Compresses a TensorPattern by removing or combining as many axes as possible,
+   Compresses a Pattern by removing or combining as many axes as possible,
    while preserving the memory-index-set of the pattern (see glossary for
    explanation), and also while respecting certain invariances that are relevant
    when constructing 'views' ('view' is PyTorch terminology; the NumPy
@@ -684,7 +684,7 @@ bool CompressPatterns(ArrayRef<TensorPattern*> patterns);
    The output pattern 'dest' is what you get if you keep applying the
    rules above until no further change is made.
 
-   Examples are below, where we write a TensorPattern as
+   Examples are below, where we write a Pattern as
   `   {{dim1,dim2,..}, {stride1,stride2,..}}`.
 \verbatim
    Input pattern             Output pattern
@@ -695,12 +695,12 @@ bool CompressPatterns(ArrayRef<TensorPattern*> patterns);
    {2,3,4},{100,-4,-1}        {{2,12},{100,-1}}
 \endverbatim
  */
-void CompressPatternC(TensorPattern *p);
+void CompressPatternC(Pattern *p);
 
 
 
 /**
-   Creates a TensorPattern corresponding to a requested 'view' of the matrix.
+   Creates a Pattern corresponding to a requested 'view' of the matrix.
    ('view' is PyTorch terminology; the NumPy equivalent is 'reshape').
 
    The PyTorch/NumPy semantics are (I believe) as follows: Firstly, a view
@@ -745,9 +745,9 @@ void CompressPatternC(TensorPattern *p);
                      returns true).
 
  */
-bool CreateViewPattern(const TensorPattern &pattern_in,
+bool CreateViewPattern(const Pattern &pattern_in,
                        ArrayRef<int32> dims,
-                       TensorPattern *pattern_out);
+                       Pattern *pattern_out);
 
 
 /**
@@ -759,13 +759,13 @@ bool CreateViewPattern(const TensorPattern &pattern_in,
                          to possibly reduce the dimensionality.
       @param [in] start  Starting index; must be in range [0, t->Dim(eaxis) - 1]
       @param [in] end    Ending index; must be in the range [start + 1, t->Dim(eaxis)]
-      @param [in,out] pattern  TensorPattern to be modified.  Will be valid at
+      @param [in,out] pattern  Pattern to be modified.  Will be valid at
                          exit if it was valid at entry.
 
    See also: the other overloaded version of Slice() which accepts the 'step'
    parameter; and Select(), which is similar but also reduces the num-axes.
  */
-void Slice(int32 eaxis, int32 start, int32 end, TensorPattern *pattern);
+void Slice(int32 eaxis, int32 start, int32 end, Pattern *pattern);
 
 
 
@@ -778,14 +778,14 @@ void Slice(int32 eaxis, int32 start, int32 end, TensorPattern *pattern);
                          to possibly reduce the dimensionality.
        @param [in] index Index to select; must be in range
                          [0, t->Dim(eaxis) - 1].
-       @param [in,out] src   TensorPattern which is to be copied; must be valid,
+       @param [in] src   Pattern which is to be copied; must be valid,
                          but we don't guarantee to check this.
-       @param [out] dest TensorPattern which we are copying to and modifying.
+       @param [out] dest Pattern which we are copying to and modifying.
                          It is allowed to be the same object as 'src'.
                          Will be valid if src was valid.
 */
 void Select(int32 eaxis, int32 index,
-            const TensorPattern &src, TensorPattern *dest);
+            const Pattern &src, Pattern *dest);
 
 
 /**
@@ -807,15 +807,15 @@ void Select(int32 eaxis, int32 index,
               false otherwise.   (See note above about axes
               with dim=1).
 */
-void HasCStrides(const TensorPattern &pattern);
+bool HasCStrides(const Pattern &pattern);
 
 /**
    Returns true if there is overlap between pattern1 and pattern2,
    meaning that pattern1's memory-index-set and pattern2's
    memory-index-set have nonempty intersection.
  */
-bool PatternsOverlap(const TensorPattern &pattern1,
-                     const TensorPattern &pattern2);
+bool PatternsOverlap(const Pattern &pattern1,
+                     const Pattern &pattern2);
 
 
 
@@ -825,7 +825,7 @@ bool PatternsOverlap(const TensorPattern &pattern1,
    contiguous; see HasCStrides()).  Caution: the interface may later be changed
    to allow caching of this property in the 'properties' field.
 */
-bool IsCompact(const TensorPattern &pattern);
+bool IsCompact(const Pattern &pattern);
 
 
 /**
@@ -833,27 +833,33 @@ bool IsCompact(const TensorPattern &pattern);
    "Justified" in glossary in pattern.h.
    (see also: ComputeMinAndMaxMindex()).
 */
-bool IsJustified(const TensorPattern &pattern);
+bool IsJustified(const Pattern &pattern);
 
 
 /**
    This is the same is IsCompact(pattern) &&
    IsJustified(pattern).
 */
-bool IsCompactAndJustified(const TensorPattern &pattern);
+bool IsCompactAndJustified(const Pattern &pattern);
 
 /**
-   Returns true if 'pattern' has normalized strides as defined
-   in tensor-pattern.h (i.e.: strides are nonnegative and
-   the nonzero ones are in strictly increasing order in the
-   private numbering / decreasing in the public).
+   Returns true if 'pattern' has normalized strides as defined in
+   tensor-pattern.h (i.e.: strides are nonnegative and the nonzero ones are in
+   strictly increasing order in the private numbering / decreasing in the
+   public).
 */
-bool HasNormalizedStrides(const TensorPattern &pattern);
+bool HasNormalizedStrides(const Pattern &pattern);
+
+/**
+   Returns true if the strides in 'pattern' are all positive and are in strictly
+   increasing order in the private numbering / decreasing in the public.
+ */
+bool HasNormalizedPostiveStrides(const Pattern &pattern);
 
 /**
    Returns true if all the stides in 'pattern' are nonnegative.
 */
-bool HasNonnegativeStrides(const TensorPattern &pattern);
+bool HasNonnegativeStrides(const Pattern &pattern);
 
 
 
diff --git a/src/tensor/tensor-pattern.cc b/src/tensor/tensor-pattern.cc
index e8ef11c83ea..489965dd418 100644
--- a/src/tensor/tensor-pattern.cc
+++ b/src/tensor/tensor-pattern.cc
@@ -24,7 +24,7 @@
 namespace kaldi {
 namespace tensor {
 
-bool TensorPattern::Check(bool check_code) {
+bool Pattern::Check(bool check_code) {
   if (num_axes < 0 || num_axes > KALDI_TENSOR_MAX_DIM)
     return false;
 
@@ -86,7 +86,7 @@ bool TensorPattern::Check(bool check_code) {
 
 
 // MAY DELETE THIS.  It's not up to date anyway.
-void TensorPatternProperties::UpdateProperties(const TensorPattern &pattern) {
+void PatternProperties::UpdateProperties(const Pattern &pattern) {
   KALDI_PARANOID_ASSERT(pattern.IsValid());
   int32 num_axes = pattern.num_axes;
   int64 dim_prod = 1;
diff --git a/src/tensor/tensor-pattern.h b/src/tensor/tensor-pattern.h
index 54156e5d01b..39ecd344f50 100644
--- a/src/tensor/tensor-pattern.h
+++ b/src/tensor/tensor-pattern.h
@@ -33,7 +33,7 @@ namespace tensor {
   PATTERN GLOSSARY   (note: see also TENSOR GLOSSARY in tensor.h)
 
     Axis:             An axis is one of the (dim, stride) pairs that form part
-                      of a TensorPattern.  We will sometimes use the word "axis"
+                      of a Pattern.  We will sometimes use the word "axis"
                       to refer to the integer index of the axis, as in, for example,
                       in a Tensor with dims=[5 6 7], axis 0 has dim=5 and
                       axis 2 has dim=7; but this should more precisely
@@ -68,12 +68,11 @@ namespace tensor {
                       The axis-dominance lemma, of which we won't provide a proof
                       of here as it's pretty obvious, is something you would need
                       when showing that axis-dominance implies uniqueness.  It
-                      states that, given the axis-dominance property, for
+                      states that, for a pattern which is valid-1, for
                       any 0 <= r < num_axes,
                           (\sum_{q < r} (dim(q) - 1) * stride(q))  <  stride(r).
 
 
-
     Broadcasting:    A convention whereby for an operation on Tensors that would
                      normally be required to have the same dimension, it's
                      acceptable for, on some axis, one Tensor to have `dim = n`
@@ -98,7 +97,7 @@ namespace tensor {
                      broadcastable because 4 == 4 and in the remaining axis, one
                      of the dimensions is 1.
 
-    Canonical form:  A TensorPattern is in canonical form if all pairs of axes that
+    Canonical form:  A Pattern is in canonical form if all pairs of axes that
                      could be combined (without affecting its memory-index-set)
                      have been combined; where there are no trivial axes; all
                      strides are positive; and the axes are sorted in an order
@@ -262,7 +261,7 @@ namespace tensor {
                       index would be outside that region.
 
     Pattern:          An object representing the dims, strides and offset of a Tensor.
-                      (see struct TensorPattern).  The Pattern has
+                      (see struct Pattern).  The Pattern has
                       an 'offset' which is the memory-index of the element of the Tensor
                       whose index-tuple is all zeros; the Pattern also
                       has a number of axes, `0 <= num_axes < KALDI_TENSOR_MAX_AXES`,
@@ -278,7 +277,7 @@ namespace tensor {
                       for example: Broadcastable(P, Q).
 
 
-    An object of type TensorPattern, representing the dims, strides
+    An object of type Pattern, representing the dims, strides
                       and offset of a Tensor.
 
     Public numbering: The numbering of axes used in the public interface of class
@@ -287,7 +286,7 @@ namespace tensor {
                       in the public numbering, e.g. dims=[3 4].
                       See also: axis-index
 
-    Private numbering:  The reversed numbering of axes in struct TensorPattern.
+    Private numbering:  The reversed numbering of axes in struct Pattern.
                       For an axis numbered `axis` in the public numbering, its
                       reversed axis index is `raxis = num_axes - 1 - axis`.
                       This reversal makes PyTorch-style broadcasting easier.
@@ -340,9 +339,13 @@ namespace tensor {
                       strides are all nonnegative and the nonzero strides
                       are in strictly increasing order in the private numbering
                       (hence strictly decreasing in the public numbering).
-
                       See also: Default strides (which is a stronger property).
 
+    Normalized+ strides:  Normalized+ strides are strides that are normalized
+                      but also positive.  I.e. the strides are positive, and
+                      increasing in the private numbering / decreasing in
+                      the public.
+
     Linear property:
                       This is a slightly technical property used in certain
                       proofs involving patterns.
@@ -362,34 +365,32 @@ namespace tensor {
                       that is if P is linear in Q and Q is linear in R, then
                       P is linear in R.
 
-    Regularity property:   This is a property of Patterns that is relevant when reducing
-                      Patterns to a common set of strides.
+    Regularity property:   This is a property of Patterns that is relevant when
+                      reducing Patterns to a common set of strides.  It can
+                      be thought of as a relaxed version of the axis-dominance
+                      property.
 
                       We formulate the regularity property to only apply for
-                      Patterns which are valid-- and which have positive strides in increasing order; these
-                      the stipulation on having postive, sorted strides
-                      is for convenience, since we happen to need it only for
-                      that case and it's easier to formulate in that case.
-
-                      For the regularity property to apply, a Pattern must also
-                      be valid-- (see its own glossary entry).
+                      Patterns which are valid-2 and which have positive strides
+                      in increasing order.  The stipulation on having postive,
+                      sorted strides is just for convenience, since we happen to
+                      need this property only in that case and it's easier to
+                      formulate in that case.
 
                       A Pattern is regular if, in addition to satisfying the
                       properties mentioned above, for each axis-index
-                      0 <= i < num_axes - 1,
+                        0 <= i < num_axes - 1,
                       there is an integer k with i < k <= num_axes, such that:
-                        (i) Either k == num_axes, or dim(i) * stride(i) <= stride(k),
-                      and
-                        (ii) For all j with i < j < k, stride(i) divides stride(j)
+                        (i) For all j with i < j < k, stride(i) divides stride(j)
                             exactly and dim(j) = 1.
-                      [Note: the condition that dim(j) == 1 will anyway be true if
-                      the Pattern has the uniqueness property.]
+                        (ii) Either k == num_axes, or dim(i) * stride(i) <= stride(k).
+
 
-                      The reader may notice that if we were to restrict
-                      k to equal i + 1, then
-                      this would be equivalent to the axis-dominance property
-                      (property (v)) plus the requirement that the strides be
-                      positive and sorted.
+                      The reader may notice that if we were to restrict k to
+                      equal i + 1, then this would be equivalent to the
+                      axis-dominance property (property (v)) plus the
+                      requirement that the strides be positive and sorted (which
+                      we only added for convenience).
 
     Storage region:   A Tensor, in addition to a Pattern, has a storage region
                       that can be though of as a pointer (say, to float) which
@@ -419,7 +420,7 @@ namespace tensor {
 
     Valid Pattern:
                      A valid Pattern must be as follows.  Think of this as the mathematical definition;
-                     see the declaration of struct TensorPattern for additional details about how
+                     see the declaration of struct Pattern for additional details about how
                      it is stored.
 
                           (i) The num_axes must satisfy 0 <= num_axes < KALDI_TENSOR_MAX_DIM
@@ -463,7 +464,7 @@ namespace tensor {
 /*
   This struct stores the dimension and strides of a Tensor.
 
-  Below we describe the the properties that a TensorPattern is required to have.
+  Below we describe the properties that a Pattern is required to have.
   Most of them are described in the glossary in the entry for "Valid Pattern",
   but there are a couple more that have to do with the specifics of how we
   store things in this struct.
@@ -478,7 +479,7 @@ namespace tensor {
   addition, we require that the stride equal zero for any axis that has dim = 1.
   There is also the "axis-dominance" property (see its glossary entry for more info).
 
-  Our requirements of a TensorPattern are:
+  Our requirements of a Pattern are:
 
     0 <= num_axes <= KALDI_TENSOR_MAX_DIM.
 
@@ -502,7 +503,7 @@ namespace tensor {
   PyTorch-style broadcasting where in an operation on Tensors of dims,
   say, (3,4) and (4), the (4) is interpreted as (1,4).
 */
-struct TensorPattern {
+struct Pattern {
   int32 num_axes;
   int32 dims[KALDI_TENSOR_MAX_DIM];     // the dims in reversed order, indexed
                                         // by 'raxis' (reversed axis)
@@ -514,62 +515,62 @@ struct TensorPattern {
 
   int32 code;  // pattern code; see ComputePatternCode() in tensor-pattern-utils.h
                // for details.  If this is negative then it means it has not been
-               // computed.  In a valid TensorPattern the code will always be either
+               // computed.  In a valid Pattern the code will always be either
                // negative or up-to-date.
 
   int32 properties;  // More occasionally-needed properties.  This is similar to
                      // OpenFst's notion of properties, where we compute them
-                     // only on demand.  In a valid TensorPattern the properties
+                     // only on demand.  In a valid Pattern the properties
                      // will always be accurate, but see "Accurate properties"
                      // in glossary above for definition (it can be zero).
 
-  // Returns true if the TensorPattern is valid.  This includes all the
+  // Returns true if the Pattern is valid.  This includes all the
   // mathematical conditions on a valid Pattern (search above for "Valid
-  // Pattern"), plus extra conditions related to struct TensorPattern,
+  // Pattern"), plus extra conditions related to struct Pattern,
   // namely: dims and strides with index >= num_axes should be
   // 1 and 0 respectively; and the code should either be -1 or or
   // be the same as ComputePatternCode() returns on this pattern.
   // See also IsCanonical() in tensor-pattern-utils.h.
   bool IsValid();
 
-  // This comparator induces a total ordering on valid TensorPatterns.  It is a
+  // This comparator induces a total ordering on valid Patterns.  It is a
   // lexical comparison on the offset, num_axes, dims and strides.  (The code
   // does not need to be compared because, if not -1, it is a function of the
   // dims and strides).
-  bool operator < (const TensorPattern &other) const;
+  bool operator < (const Pattern &other) const;
 
 
-  // Equality operator on TensorPattern.  Compares the num_axes, offset, and
+  // Equality operator on Pattern.  Compares the num_axes, offset, and
   // dims and strides indexed [0... num_axes-1].  (In patterns that satisfy IsValid(),
   // the remaining dims and strides would be 1 and 0 respectively, so checking
   // the is pointless).
-  bool operator == (const TensorPattern &other) const;
+  bool operator == (const Pattern &other) const;
 
   // Assignment operator (copies all members).
-  bool operator = (const TensorPattern &other) const;
+  bool operator = (const Pattern &other) const;
 };
 
 
 /// Returns a string representing a Pattern, of the form:
 /// "offset=a dims=[b c d] strides=[e f g]"; this is for debugging
 /// purposes.
-std::string PatternAsString(const TensorPattern &pattern);
+std::string PatternAsString(const Pattern &pattern);
 
 /// Returns a string representing the dims of a Pattern, something like
 /// "[10 20 100]"
-std::string DimsAsString(const TensorPattern &pattern);
+std::string DimsAsString(const Pattern &pattern);
 
 /// Returns a string representing the strides of a Pattern, something like
 /// "[1 10 200]"
-std::string StridesAsString(const TensorPattern &pattern);
+std::string StridesAsString(const Pattern &pattern);
 
 
 
 // We may later get rid of this struct and just have functions to get
 // these properties.
-struct TensorPatternProperties {
+struct PatternProperties {
   // Below are cached properties that are derived from the underlying data in
-  // struct TensorPattern.
+  // struct Pattern.
 
   // The number of elements in the Tensor, which equals the product
   // of dims[0] .. dims[num_axes - 1].  Will always be >0.
@@ -598,32 +599,32 @@ struct TensorPatternProperties {
 
   // Sets the members of *this to be the properties of pattern 'pattern'.
   // Ignores the previously existing values of *this.
-  void UpdateProperties(const TensorPattern &pattern);
+  void UpdateProperties(const Pattern &pattern);
 };
 
 
 
 /**
-   Returns a hash value for hashing TensorPattern.  Depends on num_axes,
+   Returns a hash value for hashing Pattern.  Depends on num_axes,
    offset, and dims and strides indexed [0... num_axes-1].  pattern does
    not have to be valid.
  */
-size_t GetHash(const TensorPattern &pattern);
+size_t GetHash(const Pattern &pattern);
 
-// C++ hashing object for TensorPattern
-struct TensorPatternHasher {
-  size_t operator (const TensorPattern &pattern) { return GetHash(pattern); }
+// C++ hashing object for Pattern; the hash value is whatever GetHash() returns.
+struct PatternHasher {
+  size_t operator () (const Pattern &pattern) const { return GetHash(pattern); }
 };
 
-// C++ hashing object for TensorPattern*; requires the pointer
-// be non-NULL and to point to a TensorPattern.
-struct TensorPatternPtrHasher {
-  size_t operator (TensorPattern *pattern) { return GetHash(*pattern); }
+// C++ hashing object for Pattern*; requires that the pointer be non-NULL and
+// point to a Pattern.  Hashes the pointed-to Pattern (via GetHash()), not the address.
+struct PatternPtrHasher {
+  size_t operator () (Pattern *pattern) const { return GetHash(*pattern); }
 };
 
-struct TensorPatternPtrEqual {
-  size_t operator (TensorPattern *pattern1,
-                   TensorPattern *pattern2) {
-    *pattern1 == *pattern2;
+struct PatternPtrEqual {  // equality of the pointed-to Patterns; ptrs must be non-NULL
+  bool operator () (Pattern *pattern1,
+                    Pattern *pattern2) const {
+    return *pattern1 == *pattern2;
   }
 };
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index c66c8dfa57f..336d0c0fca1 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -169,7 +169,7 @@ class Tensor {
 
   // Return reference to the struct containing the dimension and
   // stride info.
-  const TensorPattern &Pattern() const { return impl_.pattern; }
+  const Pattern &Pattern() const { return impl_.pattern; }
 
   // Return a vector containing dimensions of the tensor; equivalent to
   // .shape in PyTorch.  Dims().size() will equal NumAxes().
diff --git a/src/tensor/variable-inl.h b/src/tensor/variable-inl.h
index 3c312033dd8..8179ee38971 100644
--- a/src/tensor/variable-inl.h
+++ b/src/tensor/variable-inl.h
@@ -56,9 +56,9 @@ Tensor VariableImpl::GetGradForView(const Tensor &data) {
     if (!aux_)
       aux_ = new VariableImplAux;
     if (!aux_->rebaser)
-      aux_->rebaser = new TensorPatternRebaser(pattern_,
+      aux_->rebaser = new PatternRebaser(pattern_,
                                                grad_->pattern_);
-    const TensorPatternRebaser &rebaser = *(aux_->rebaser);
+    const PatternRebaser &rebaser = *(aux_->rebaser);
     if (!rebaser->Rebase(&(ans->pattern))) {
       // die.
       KALDI_ERR << "Rebasing failed.  Likely you are using views "
diff --git a/src/tensor/variable.h b/src/tensor/variable.h
index bc9bfbbf717..709bce8afb4 100644
--- a/src/tensor/variable.h
+++ b/src/tensor/variable.h
@@ -75,7 +75,7 @@ class Node {
   // tensor to gradients (used for views).  Will be NULL in the usual
   // case where the Tensor for this base Variable has the same strides
   // and offset as the grad.
-  std::unique_ptr<TensorPatternRebaser> rebaser;
+  std::unique_ptr<PatternRebaser> rebaser;
 
   // latest_op is the most recent of the Ops that modified the base Variable
   // this is attached to, or any view into it.
@@ -104,7 +104,7 @@ struct VariableImplAux {
   // Variables where data_ and grad_ have different offset and/or
   // strides, it is an object capable of converting patterns from
   // tensors to gradients (used when constructing views).
-  std::unique_ptr<TensorPatternRebaser> rebaser;
+  std::unique_ptr<PatternRebaser> rebaser;
 
   // config_ is NULL if no config values have been stored; otherwise,
   // a pointer to class Config.

From a751cf413077d0b6cc962ddb21999a01bc6c62e9 Mon Sep 17 00:00:00 2001
From: Daniel Povey <povey@fb.com>
Date: Sun, 5 May 2019 00:11:00 -0400
Subject: [PATCH 035/163] [src] Tensor progress; rename some files

[src] Tensor progress
---
 src/tensor/memory-checker.h                   |   2 +-
 ...-utils-inl.h => pattern-extra-utils-inl.h} |   4 +-
 ...-extra-utils.cc => pattern-extra-utils.cc} | 486 +++++-------------
 ...rn-extra-utils.h => pattern-extra-utils.h} |  65 ++-
 ...attern-utils-inl.h => pattern-utils-inl.h} |   6 +-
 ...rn-utils-test.cc => pattern-utils-test.cc} |   6 +-
 ...nsor-pattern-utils.cc => pattern-utils.cc} |   4 +-
 ...tensor-pattern-utils.h => pattern-utils.h} |  24 +-
 src/tensor/{tensor-pattern.cc => pattern.cc}  |   4 +-
 src/tensor/{tensor-pattern.h => pattern.h}    |   8 +-
 src/tensor/scalar.h                           |   2 +-
 src/tensor/tensor-common.h                    |   2 +-
 src/tensor/tensor-functions.h                 |  14 +-
 src/tensor/tensor-impl-linear.cc              |   2 +-
 src/tensor/tensor-impl-utils.h                |   6 +-
 src/tensor/tensor-impl.h                      |  10 +-
 src/tensor/tensor-utils.h                     |   2 +-
 src/tensor/tensor.h                           |   4 +-
 src/tensor/variable-inl.h                     |   2 +-
 19 files changed, 220 insertions(+), 433 deletions(-)
 rename src/tensor/{tensor-pattern-extra-utils-inl.h => pattern-extra-utils-inl.h} (88%)
 rename src/tensor/{tensor-pattern-extra-utils.cc => pattern-extra-utils.cc} (72%)
 rename src/tensor/{tensor-pattern-extra-utils.h => pattern-extra-utils.h} (93%)
 rename src/tensor/{tensor-pattern-utils-inl.h => pattern-utils-inl.h} (92%)
 rename src/tensor/{tensor-pattern-utils-test.cc => pattern-utils-test.cc} (95%)
 rename src/tensor/{tensor-pattern-utils.cc => pattern-utils.cc} (99%)
 rename src/tensor/{tensor-pattern-utils.h => pattern-utils.h} (97%)
 rename src/tensor/{tensor-pattern.cc => pattern.cc} (98%)
 rename src/tensor/{tensor-pattern.h => pattern.h} (99%)

diff --git a/src/tensor/memory-checker.h b/src/tensor/memory-checker.h
index 600a988f6a2..1dc84f06306 100644
--- a/src/tensor/memory-checker.h
+++ b/src/tensor/memory-checker.h
@@ -22,7 +22,7 @@
 
 #include <functional>
 #include "tensor/tensor-common.h"
-#include "tensor/tensor-pattern.h"
+#include "tensor/pattern.h"
 
 
 namespace kaldi {
diff --git a/src/tensor/tensor-pattern-extra-utils-inl.h b/src/tensor/pattern-extra-utils-inl.h
similarity index 88%
rename from src/tensor/tensor-pattern-extra-utils-inl.h
rename to src/tensor/pattern-extra-utils-inl.h
index e0af4f49b55..2fbd08b6eed 100644
--- a/src/tensor/tensor-pattern-extra-utils-inl.h
+++ b/src/tensor/pattern-extra-utils-inl.h
@@ -1,4 +1,4 @@
-// tensor/tensor-pattern-extra-utils-inl.h
+// tensor/pattern-extra-utils-inl.h
 
 //  Copyright      2019  Johns Hopkins University (author: Daniel Povey)
 
@@ -20,7 +20,7 @@
 #ifndef KALDI_TENSOR_TENSOR_PATTERN_EXTRA_UTILS_INL_H_
 #define KALDI_TENSOR_TENSOR_PATTERN_EXTRA_UTILS_INL_H_ 1
 
-// This file is only to be included by tensor-pattern-extra-utils.h; do not include it
+// This file is only to be included by pattern-extra-utils.h; do not include it
 // directly.
 
 
diff --git a/src/tensor/tensor-pattern-extra-utils.cc b/src/tensor/pattern-extra-utils.cc
similarity index 72%
rename from src/tensor/tensor-pattern-extra-utils.cc
rename to src/tensor/pattern-extra-utils.cc
index 49ee4ce5dbc..6ebe7fcb987 100644
--- a/src/tensor/tensor-pattern-extra-utils.cc
+++ b/src/tensor/pattern-extra-utils.cc
@@ -1,4 +1,4 @@
-// tensor/tensor-pattern-extra-utils.cc
+// tensor/pattern-extra-utils.cc
 
 // Copyright      2019  Johns Hopkins University (author: Daniel Povey)
 
@@ -18,7 +18,7 @@
 // limitations under the License.
 
 
-#include "tensor/tensor-pattern-extra-utils.h"
+#include "tensor/pattern-extra-utils.h"
 
 namespace kaldi {
 namespace tensor {
@@ -266,7 +266,7 @@ bool ConvertPatternStrides(
 /**
    FindOffsetsRecursive() is a utility function that is used in the
    implementation of FindOffsets().  See the documentation of FindOffsets(*) in
-   tensor-pattern-extra-utils.h for context.
+   pattern-extra-utils.h for context.
    Briefly: we are finding the set of offsets o such that there exists i
    with pattern1[i + o] = pattern2[i].
 
@@ -279,7 +279,7 @@ bool ConvertPatternStrides(
 
       pattern1[i + o] = pattern2[i]                     (1)
 
-   (see "Indexing a Pattern" in tensor-pattern.h to understand the notation),
+   (see "Indexing a Pattern" in pattern.h to understand the notation),
    we get:
 
       pattern1.offset + s . (i + o)  ==  pattern2.offset + s . i
@@ -316,7 +316,7 @@ bool ConvertPatternStrides(
    And define
      lower_sum =   \sum_{q=0}^{r-1} s[q] * o[q],
 
-   We can use the axis-dominance lemma (see tensor-pattern.h) and the limitation
+   We can use the axis-dominance lemma (see pattern.h) and the limitation
    on o[r] from (4) to prove that:
          -s[r] <  lower_sum <  s[r].                 (5)
    (the axis-dominance lemma is relevant here because o[r] behaves just like an
@@ -473,6 +473,18 @@ bool IsValidHyperrectangle(const Hyperrectangle &a) {
       return false;
 }
 
+std::vector<int32> RandomIndexFromHyperrectangle(const Hyperrectangle &a) {
+  // Returns a random index-tuple drawn from the set represented by the
+  // hyperrectangle a: element r is drawn by RandInt(a[r].first, a[r].second - 1)
+  // (the "- 1" is because `second` is one past the last valid index).
+  std::vector<int32> ans(a.size());
+  auto ans_iter = ans.begin(), ans_end = ans.end();
+  auto a_iter = a.begin();
+  for (; ans_iter != ans_end; ++ans_iter, ++a_iter)
+    *ans_iter = RandInt(a_iter->first, a_iter->second - 1);
+  return ans;
+}
+
 // Returns true if two hyperrectangles, as defined above,
 // intersect.  We require a.size() == b.size() and a and
 // to be valid hyperrectangles.
@@ -490,16 +502,22 @@ bool HyperrectanglesIntersect(const Hyperrectangle &a,
 
 /**
    If called with i == 0, this recursive function computes the set-wise
-   difference of hyperrectangles a - b (viewed as sets of tuples of
-   ints, obviously).
+   difference of hyperrectangles a - b (viewed as sets of tuples of ints,
+   obviously); it appends that difference, expressed as zero or more
+   hyperrectangles, to the vector `difference`.  See definition of
+   typedef Hyperrectangle for more explanation.
 
       @param [in] a  A valid hyperrectangle
       @param [in] b  A valid hyperrectangle, must satisfy a.size() == b.size()
-      @param [in] i  An index in the range [0 .. a.size() - 1] (view this
+      @param [in] i  The user will call this recursive function with i == 0.
+                     It is an index in the range [0 .. a.size() - 1] (view this
                      as an axis-index).  The caller asserts that for each index
                      0 <= j < i, a's interval is contained in b's interval; that
                      is, a[j].first >= b[j].first and a[j].second <=
                      b[j].second.
+      @param [out] difference   Zero or more hyperrectangles will be
+                     *appended* to `difference`.  Their union will equal
+                     the set-wise difference a - b.
 */
 static void SubtractHyperrectangles(const Hyperrectangle &a,
                                     const Hyperrectangle &b,
@@ -517,7 +535,7 @@ static void SubtractHyperrectangles(const Hyperrectangle &a,
   Hyperrectangle &b_non_const = const_cast<Hyperrectangle&> b;
 
   int32 a_start = a[i].first, a_end = a[i].second,
-      b_start = b[i].first, b_end = b[i].second;
+    b_start = b[i].first, b_end = b[i].second;
 
   if (b_start < a_end && b_end > a_start) {
     // If a's and b's intervals overlap at all....
@@ -569,7 +587,7 @@ static void SubtractHyperrectangles(const Hyperrectangle &a,
                         could use to index pattern1, satisfying pattern1[S] =
                         pattern2[S - o].  The two elements of the pair on each
                         axis thus correspond to (begin, end) indexes into
-                        pattern1 with end one past the end.
+                        pattern1 where "end" is one past the last valid index.
                         See "[set view of hyperrectangles]" for explanation.
 */
 static void OffsetToHyperrectangle(
@@ -594,7 +612,15 @@ static void OffsetToHyperrectangle(
       (*hyperrectangle)[raxis].first = interval_start;
       (*hyperrectangle)[raxis].second = interval_end;
   }
+#if 1
+  {  // testing code, will remove eventually: checks pattern1[S] == pattern2[S - o]
+    std::vector<int32> index1 = RandomIndexFromHyperrectangle(*hyperrectangle),
+      index2(index1.size());
+    for (size_t i = 0; i < index1.size(); i++)
+      index2[i] = index1[i] - offset[i];
+    KALDI_ASSERT(IndexPattern(pattern1, index1) == IndexPattern(pattern2, index2));
   }
+#endif
 }
 
 
@@ -633,11 +659,28 @@ static void HyperrectangleToPattern(const Pattern &src,
   KALDI_PARANOID_ASSERT(IsValid1(*dest));
 }
 
+/**
+   Sets *h to the hyperrectangle representing the full index-tuple-set of
+   the Pattern `src`: a vector of size src.num_axes whose r'th element is
+   the pair (0, src.dims[r]).
+ */
+static void GetFullHyperrectangleOfPattern(const Pattern &src,
+                                           Hyperrectangle *h) {
+  Hyperrectangle &rect = *h;
+  rect.resize(src.num_axes);
+  for (int32 r = 0; r < src.num_axes; ++r) {
+    rect[r].first = 0;             // interval begin on raxis r
+    rect[r].second = src.dims[r];  // interval end (one past the last index)
+  }
+}
+
+
+
 /**
    Given patterns pattern1 and pattern2 that are valid-1 and share
    the same strides, and an offset o such that there
    exists at least one index i with pattern1[i + o] = pattern2[i]
-   (c.f. "Indexing a Pattern" in the glossary in tensor-pattern.h),
+   (c.f. "Indexing a Pattern" in the glossary in pattern.h),
    outputs a Pattern representing the part of the intersection
    of the memory-index-sets of pattern1 and pattern2 that has
    offset o.
@@ -700,152 +743,30 @@ static void OffsetToPattern(const Pattern &pattern1,
 
 
 
-/**
-   This recursive function is used to compute the intersection between
-   pattern1 and pattern2, which must have identical num_axes and strides,
-   must have normalized strides, and must be valid-1.  The user would call
-   this with identical_raxis == pattern1.num_axes, and the recursion on
-   identical_raxis takes care of the actual implementation.
-
-
-        @param [in] pattern1   The first input pattern.  Must be valid-1 and
-                               have normalized strides.
-        @param [in] pattern2   The second input pattern.  Must be valid-1 and
-                               have the same num_axes and strides as pattern1.
-        @param [in] identical_raxis  Let num_axes be the num_axes of pattern1 or
-                               pattern2 (it's the same).  By passing in
-                               a particular value of identical_raxis, the caller
-                               asserts that for all raxis with
-                               identical_raxis <= raxis < num_axes,
-                               `pattern1.dim[raxis] == pattern2.dim[raxis]`;
-                               and furthermore that the caller is only
-                               interested in the part of the overlap for which
-                               pattern1 and pattern2 have the same index for all
-                               raxis >= identical_raxis (and if there was
-                               another part, it has been handled separately).
-        @param [in] keep_all_patterns  True if the user actually wants all of
-                               the patterns (as opposed to just caring whether
-                               any exist).  If false, this function may return
-                               early after processing on or more patterns.
-        @param [out] patterns_out  The output patterns; this function will
-                               append to this location a number (possibly zero)
-                               of disjoint valid patterns, each of which is
-                               linear in pattern1 and pattern2, the union of whose
-                               memory-index-sets is identical to the intersection
-                               of pattern1 and pattern2's memory-index-sets.
-  */
-void ComputeIntersectionRecursive(const Pattern &pattern1,
-                                  const Pattern &pattern2,
-                                  int32 identical_raxis,
-                                  bool keep_all_patterns,
-                                  std::vector<Pattern> *patterns_out) {
-  if (identical_raxis == 0) {
-    /*
-      The base-case of the recursion; if we reach here, it means pattern1 and
-      pattern2 have identical dims and strides; and if they also have the same
-      offset, all we need to do is append one of them to 'patterns_out'
-      (otherwise this part of the intersection is empty).  This is all part of a
-      process of trying to make the 'offset' identical between the two patterns
-      by discarding some leading indexes on one of the two patterns, and
-      discarding any trailing indexes as needed to make the dim the same.  (See
-      "Index:" in glossary for clarity on its meaning here).
-    */
-
-    if (pattern1.offset == pattern2.offset) {
-      size_t cur_size = patterns_out->size();
-      patterns_out->resize(cur_size + 1);
-      RemoveTrivialAxes(pattern1, &(patterns_out[cur_size]));
-    }
-    return;
-  }
-  // we'll be modifying the dims and strides on axis 'raxis'.
-  int32 raxis = identical_raxis - 1,
-      stride = pattern1.strides[raxis]; // will be the same in pattern2, and positive.
-
-  // By the '?..:' statements below we possibly switch pattern2 and
-  // pattern1, thereby ensuring that pattern2_mod.offset >= pattern1_mod.offset;
-  // this simplifies the later code.
-  Pattern pattern1_mod(pattern2.offset >= pattern1.offset ? pattern1 : pattern2),
-      pattern2_mod(pattern2.offset >= pattern1.offset ? pattern2 : pattern1);
-
-
-  // pattern2_mod's offset is larger (or the same), so we may need to discard
-  // some leading indexes of pattern1_mod (on axis 'raxis'), increasing
-  // pattern1_mod's offset and reducing its dim on this raxis, to get the
-  // offsets closer to being the same.
-
-  // 'min_dim1_discarded' below will be rounded down in the division, and we will
-  // also need to also consider the value that's one larger than that.  We don't
-  // need to consider any other values of 'dim1_discarded' other than these two,
-  // because it's possible to prove that if we recurse with the remaining offset
-  // being greater than 'stride', we would never be able to get to offset=0
-  // without discarding all dims of at least one axis numbered less than raxis.
-  // The proof requires the axis-dominance property (together with normalized
-  // strides).
-  int32 offset_diff = pattern2_mod.offset - pattern1_mod.offset,
-      min_dim1_discarded = offset_diff / stride,
-      max_dim1_discarded = ((offset_diff == min_dim1_discarded * stride) ?
-                            min_dim1_discarded : min_dim1_discarded + 1);
-
-  // Make a copy of the relevant dims, and pattern1's offset, because the
-  // versions in the patterns may get modified in the loop below.
-  int32 pattern1_dim = pattern1_mod.dims[raxis],
-      pattern2_dim = pattern2_mod.dims[raxis],
-      pattern1_offset = pattern1.offset;
-  for (int32 dim1_discarded = min_dim1_discarded;
-       dim1_discarded <= max_dim1_discarded; dim1_discarded++) {
-    pattern1_mod.offset = pattern1_offset + dim1_discarded * stride;
-    int32 new_pattern1_dim = pattern1_dim - dim1_discarded;
-    if (new_pattern1_dim <= 0)
-      continue;  // There's no overlap here.
-    pattern1_mod.dims[raxis] = new_pattern1_dim;
-    // set both dims of pattern1_mod and pattern2_mod to the minimum
-    // of the two dims.
-    if (pattern2_dim > new_pattern1_dim) {
-      pattern2_mod.dims[raxis] = new_pattern1_dim;
-    } else {
-      pattern1_mod.dims[raxis] = pattern2_dim;
-      pattern2_mod.dims[raxis] = pattern2_dim;
-    }
-    // Recurse.
-    ComputeIntersectionRecursive(pattern1, pattern2, raxis,
-                                 keep_all_patterns, patterns_out);
-    if (!keep_all_patterns && !patterns_out->empty())
-      return;  // An optimization if we just want to test if intersection is
-               // nonempty.
-  }
-}
-
-
 // See documentation in header.
 bool ComputeIntersection(const Pattern &pattern1_in,
                          const Pattern &pattern2_in,
                          bool keep_all_patterns,
                          std::vector<Pattern> *intersection) {
+  intersection->clear();
   Pattern pattern1(pattern1_in),
-      pattern2(pattern2_in);
+    pattern2(pattern2_in);
   CanonicalizePattern(&pattern1);
   CanonicalizePattern(&pattern2);
   std::vector<int32> strides;
   FindAllStrides(pattern1, pattern2, &strides);
   int32 num_axes = strides.size();
   if (num_axes == 0) {
-    // Some of the code below with num_axes - 1 would crash
-    // in this case, so handle it separately.
-    // Note: for 1-element patterns, if their offsets are
+    // Some of the code below with num_axes - 1 would crash in this case, so
+    // handle it separately.  Note: for 1-element patterns, if their offsets are
     // different, they don't intersect.
-    if (pattern1.offset == pattern2.offset) {
-      intersection->resize(1);
-      (*intersection)[0] = pattern1;
-    } else {
-      intersection->clear();
-    }
+    if (pattern1.offset == pattern2.offset)
+      intersection->push_back(pattern1);
     return true;
   }
   std::vector<Pattern> patterns1, patterns2;
   patterns1.reserve(8);
   patterns2.reserve(8);
-  intersection->clear();
   if (!ConvertPatternStrides(pattern1, strides, &patterns1) ||
       !ConvertPatternStrides(pattern2, strides, &patterns2))
     return false;
@@ -856,11 +777,11 @@ bool ComputeIntersection(const Pattern &pattern1_in,
     auto iter2 = patterns2.begin(), end2 = patterns2.end();
 
     // Below, 'end_mindex1' is not the actual largest mindex in `sub_pattern1`,
-    // but an upper bound on it (in fact, it is strictly greater than it); to
-    // prove this we require the axis-dominance property and the fact that the
-    // strides are normalized (positive and increasing).  This is part of an
-    // optimization to more quickly skip over pairs of patterns that will have
-    // empty intersection.
+    // but an upper bound on it (in fact, it is greater than the last element in
+    // it); to prove this we require the axis-dominance property and the fact
+    // that the strides are normalized (positive and increasing).  This is part
+    // of an optimization to more quickly skip over pairs of patterns that will
+    // have empty intersection.
     int64 begin_mindex1 = sub_pattern1.mindex,
         end_mindex1 = begin_mindex1 +
         sub_pattern1.strides[num_axes - 1] * sub_pattern1.dims[num_axes - 1];
@@ -1037,175 +958,11 @@ bool PatternsIntersect(const Pattern &pattern1,
 }
 
 
-
-/**
-   Offsets, and computing intersection of Patterns.
-
-   Suppose we are computing the intersetion of the memory-index-sets of pattern1
-   and pattern2.
-
-   For each memory-index m that is in both pattern1 and pattern2, there must be
-   index-tuples i1 and i2 such that pattern1[i1] = pattern2[i2] = m.  We can
-   write this as: pattern1[i] = pattern2[i + o], where o is an offset that's
-   also a tuple (like an index-tuple, but with possibly negative elements).  This
-   function can be thought of as a recursive search for all values of the
-   offset 'o' for which at least one such index m exists.  For each such offset
-   'o' we might end up with a Pattern;  and the union of all of these
-   patterns is the intersection of pattern1 and pattern2.
-
-   The algorithm for computing the list of potential offsets o is recursive,
-   starting from the last-numbered raxis, which will have the highest
-   stride since the strides are normalized.
-
-   Let the vector of strides of the patterns (they're the same) be s.
-   from pattern1[i] = pattern2[i + o], we have:
-     pattern1.offset + s . i  == pattern2.offset + s . (i + o)
-   where a `.` with space around it means dot product.
-
-   Simplifying:
-      s . o = pattern1.offset - pattern2.offset.         (1)
-
-   For each raxis r, there are limits on the value of o[r]; these are imposed by
-   the dimensions of the two Tensors.  In the equation pattern1[i] = pattern2[i
-   + o], for the indexes into the patterns to be valid, i[r] must be in
-   [0 .. pattern1.dims[r] - 1] and i[r] + o[r] must be in [0 .. pattern2.dims[r] - 1].
-   For such an i[r] to exist, o[r] must be in the range [-(pattern1.dims[r] - 1)
-   .. pattern2.dims(r) - 1].
-
-   There is a further limitation on the elements of o that we can obtain
-   using the properties above plus the axis-dominance property.  It's easiest
-   to explain this if we let r be num_axes - 1, and define:
-       l(r) =   \sum_{q < r} s[q] * o[q].
-   Here, l(r) represents the sum of the elements in s . o that come from raxes
-   lower than r.  We can use the axis-dominance lemma (see tensor-pattern.h)
-   and the limitation on o[r] proved in the previous paragraph to prove that:
-       -s[r] <  l(r) <  s[r].
-   For the last axis r = num_axes - 1, for the equation (1) to hold, we
-   must have  l(r) =  pattern1.offset - pattern2.offset - s[r] * o[r],
-   so we have the inequality
-     -s[r] <  pattern1.offset - pattern2.offset - s[r] * o[r]  <  s[r]
-   which means we need only consider offsets o[r] where the absolute value of
-   the "remainder" is less than s[r]; there will be at most two.  For an raxis r
-   < num_axes - 1, if the offsets for higher-numbered r are already known we
-   just subtract the appropriate terms from the remainder too.  The recursive
-   implementation that finds the possible offset vectors is pretty obvious
-   intuitively.
-*/
-
-/**
-   This recursive function is used to compute the set-wise difference pattern1 -
-   pattern2 where the two patterns must have identical num_axes and strides,
-   must have normalized strides, and must be valid-1.  The user would call this
-   with identical_raxis == pattern1.num_axes, and the recursion on
-   identical_raxis takes care of the actual implementation.
-
-   Notes on how this works and the math behind it:
-
-
-
-
-
-
-
-Since
-  pattern1 and pattern2 have the same strides, there will be in many cases
-  multiple such pairs of index-tuples (i1, i2) with the same difference
-
-
-        @param [in] pattern1   The first input pattern.  Must be valid-1 and
-                               have normalized strides.
-        @param [in] pattern2   The second input pattern.  Must be valid-1 and
-                               have the same num_axes and strides as pattern1.
-        @param [in] identical_raxis  Let num_axes be the num_axes of pattern1 or
-                               pattern2 (it's the same).  By passing in
-                               a particular value of identical_raxis, the caller
-                               asserts that for all raxis with
-                               identical_raxis <= raxis < num_axes,
-                               `pattern1.dim[raxis] == pattern2.dim[raxis]`;
-                               and furthermore that the caller is only
-                               interested in the part of the overlap for which
-                               pattern1 and pattern2 have the same index for all
-                               raxis >= identical_raxis (and if there was
-                               another part, it has been handled separately).
-        @param [out] patterns_out  The output patterns; this function will
-                               append to this location a number (possibly zero)
-                               of disjoint valid patterns, each of which is
-                               linear in pattern1 and pattern2, the union of whose
-                               memory-index-sets is identical to the difference
-                               of pattern1 and pattern2's memory-index-sets.
-*/
-void ComputeDifferenceRecursive(const Pattern &pattern1,
-                                const Pattern &pattern2,
-                                int32 identical_raxis,
-                                std::vector<Pattern> *patterns_out) {
-  if (identical_raxis == 0) {
-    /*
-      The base-case of the recursion; if we reach here, it means pattern1 and
-      pattern2 have identical dims and strides.  If they have different
-      offsets, that means they are disjoint and so pattern1 itself is
-      the difference; if the offset is the same, they are the same set
-      and so we don't need to output anything. */
-    if (pattern1.offset != pattern2.offset) {
-      size_t cur_size = patterns_out->size();
-      patterns_out->resize(cur_size + 1);
-      RemoveTrivialAxes(pattern1, &(patterns_out[cur_size]));
-    }
-    return;
-  }
-  // we'll be modifying the dims and strides on axis 'raxis'.
-  int32 raxis = identical_raxis - 1,
-      stride = pattern1.strides[raxis]; // will be the same in pattern2, and positive.
-
-
-  // pattern2_mod's offset is larger (or the same), so we may need to discard
-  // some leading indexes of pattern1_mod (on axis 'raxis'), increasing
-  // pattern1_mod's offset and reducing its dim on this raxis, to get the
-  // offsets closer to being the same.
-
-  // 'min_dim1_discarded' below will be rounded down in the division, and we will
-  // also need to also consider the value that's one larger than that.  We don't
-  // need to consider any other values of 'dim1_discarded' other than these two,
-  // because it's possible to prove that if we recurse with the remaining offset
-  // being greater than 'stride', we would never be able to get to offset=0
-  // without discarding all dims of at least one axis numbered less than raxis.
-  // The proof requires the axis-dominance property (together with normalized
-  // strides).
-  int32 offset_diff = pattern2_mod.offset - pattern1_mod.offset,
-      min_dim1_discarded = offset_diff / stride,
-      max_dim1_discarded = ((offset_diff == min_dim1_discarded * stride) ?
-                            min_dim1_discarded : min_dim1_discarded + 1);
-
-  // Make a copy of the relevant dims, and pattern1's offset, because the
-  // versions in the patterns may get modified in the loop below.
-  int32 pattern1_dim = pattern1_mod.dims[raxis],
-      pattern2_dim = pattern2_mod.dims[raxis],
-      pattern1_offset = pattern1.offset;
-  for (int32 dim1_discarded = min_dim1_discarded;
-       dim1_discarded <= max_dim1_discarded; dim1_discarded++) {
-    pattern1_mod.offset = pattern1_offset + dim1_discarded * stride;
-    int32 new_pattern1_dim = pattern1_dim - dim1_discarded;
-    if (new_pattern1_dim <= 0)
-      continue;  // There's no overlap here.
-    pattern1_mod.dims[raxis] = new_pattern1_dim;
-    // set both dims of pattern1_mod and pattern2_mod to the minimum
-    // of the two dims.
-    if (pattern2_dim > new_pattern1_dim) {
-      pattern2_mod.dims[raxis] = new_pattern1_dim;
-    } else {
-      pattern1_mod.dims[raxis] = pattern2_dim;
-      pattern2_mod.dims[raxis] = pattern2_dim;
-    }
-    // Recurse.
-    ComputeIntersectionRecursive(pattern1, pattern2, raxis,
-                                 keep_all_patterns, patterns_out);
-  }
-}
-
-
 // See documentation in header.
 bool ComputeDifference(const Pattern &pattern1,
                        const Pattern &pattern2,
                        std::vector<Pattern> *difference) {
+  difference->clear();
   Pattern pattern1(pattern1_in),
       pattern2(pattern2_in);
   CanonicalizePattern(&pattern1);
@@ -1214,73 +971,99 @@ bool ComputeDifference(const Pattern &pattern1,
   FindAllStrides(pattern1, pattern2, &strides);
   int32 num_axes = strides.size();
   if (num_axes == 0) {
-    // Some of the code below with num_axes - 1 would crash
-    // in this case, so handle it separately.
-    // Note: for 1-element patterns, if their offsets are
+    // Some of the code below with num_axes - 1 would crash in this case, so
+    // handle it separately.  Note: for 1-element patterns, if their offsets are
     // different, they don't intersect.
-    if (pattern1.offset != pattern2.offset) {
-      intersection->resize(1);
-      (*intersection)[0] = pattern1;
-    } else {
-      intersection->clear();
-    }
+    if (pattern1.offset != pattern2.offset) 
+      difference->push_back(pattern1);
     return true;
   }
   std::vector<Pattern> patterns1, patterns2;
   patterns1.reserve(8);
   patterns2.reserve(8);
-  intersection->clear();
   if (!ConvertPatternStrides(pattern1, strides, &patterns1) ||
       !ConvertPatternStrides(pattern2, strides, &patterns2))
     return false;
 
-
-  // The algorithm is: first initialize `cur_difference` to
-  // pattern1.  Then,
+  // The algorithm is iterative where the iteration is over
+  // `patterns2`.
+  //
+  // First we initialize `cur_difference` to
+  // patterns1.  Then
   // For each member p2 of `patterns2`
-  //   For each member p of cur_difference
-  //      Compute (p - p2), appending the result (as zero or more
+  //   For each member p1 of cur_difference
+  //      Compute (p1 - p2), appending the result (as zero or more
   //      patterns) to next_difference.
-  //   set cur_difference = next_difference and clear next_difference.
+  //   Set cur_difference = next_difference and clear next_difference.
   // Result is in cur_difference.
   std::vector<Pattern> cur_difference, next_difference;
   cur_difference.swap(patterns1);
 
   for (auto iter2 = patterns2.begin(); iter2 != patterns2.end(); ++iter2) {
     const Pattern &sub_pattern2 = *iter2;
-    // Below, 'end_mindex1' is not the actual largest mindex in `sub_pattern1`,
-    // but an upper bound on it (in fact, it is strictly greater than it); to
-    // prove this we require the axis-dominance property and the fact that the
-    // strides are normalized (positive and increasing).  This is part of an
-    // optimization to more quickly skip over pairs of patterns that will have
-    // empty intersection.
+    // Below, 'end_mindex2' is not the actual largest mindex in `sub_pattern2`,
+    // but an upper bound on it (in fact, it is greater than the last element in
+    // it); to prove this we require the axis-dominance property and the fact
+    // that the strides are normalized (positive and increasing).  This is part
+    // of an optimization to more quickly process pairs of patterns that will
+    // have empty intersection, so won't interact.
     int64 begin_mindex2 = sub_pattern2.offset,
-        end_mindex2 = begin_mindex2 +
-        sub_pattern2.strides[num_axes - 1] * sub_pattern2.dims[num_axes - 1];
-
-    for (auto iter = cur_difference.begin(); iter != cur_difference.end();
-         ++iter){
+      end_mindex2 = begin_mindex2 +
+      sub_pattern2.strides[num_axes - 1] * sub_pattern2.dims[num_axes - 1];
+    
+    for (auto iter = cur_difference.begin(); iter != cur_difference.end(); ++iter){
       const Pattern &sub_pattern1 = *iter;
       // as before, end_mindex1 is strictly greater than the actual largest
       // mindex.
       int64 begin_mindex1 = sub_pattern1.offset,
           end_mindex1 = begin_mindex1 +
-          sub_pattern1.strides[num_axes - 1] * sub_pattern1.dims[num_axes - 1];
+        sub_pattern1.strides[num_axes - 1] * sub_pattern1.dims[num_axes - 1];
 
       if (begin_mindex2 >= end_mindex1 || begin_mindex1 >= end_mindex2) {
-        //  This is an optimization for efficiency when it's easy to
-        // see that two Patterns won't overlap.  In this case
-        // we don't subtract anything from sub_pattern1.
+        // The two Patterns don't intersect, so the set difference is
+        // just sub_pattern1.
         next_difference.push_back(sub_pattern1);
         continue;
       }
 
-      // Here, sub_pattern1 and sub_pattern2 are the sub-pieces of pattern1 and
-      // pattern2 that have been converted to share the same list of strides The
-      // following call may add elements to 'difference'.
-      ComputeDifferenceRecursive(sub_pattern1, sub_pattern2,
-                                 num_axes,
-                                 &next_difference);
+      std::vector<Hyperrectangle> cur_rects(1);
+      // Get a hyperrectangle that represents all index-tuples into
+      // sub_pattern1.
+      GetFullHyperrectangleOfPattern(sub_pattern1, &cur_rects.back());
+
+      // Each member of `offsets` represents one part of the intersection
+      // between sub_pattern1 and sub_pattern2.  Each of these will be converted
+      // to a hyperrectangle representing the set of indexes it covers within
+      // sub_pattern1, and that hyperrectangle will be subtracted from the
+      // hyperrectangle representing all the indexes in sub_pattern1 (or,
+      // if offsets.size() > 1, from whatever hyperrectangles we have
+      // after subtracting previous things).
+      std::vector<std::vector<int32> > offsets;
+      FindOffsets(sub_pattern1, sub_pattern2, true, &offsets);
+
+      std::vector<Hyperrectangle> next_rects;
+      for (const std::vector<int32> &offset: offsets) {
+        Hyperrectangle h;
+        OffsetToHyperrectangle(sub_pattern1, sub_pattern2, offset, &h);
+        // h represents a set of indexes into sub_pattern1, which cover one part
+        // of the intersection between sub_pattern1 and sub_pattern2.
+
+        // We need to subtract h from each hyperrectangle in cur_rects;
+        // the results are appended to next_rects.
+        for (const Hyperrectangle &rect: cur_rects)
+          SubtractHyperrectangles(rect, h, 0, &next_rects);
+
+        cur_rects.swap(next_rects);
+        next_rects.clear();
+      }
+      for (auto hiter = cur_rects.begin();
+           hiter != cur_rects.end(); ++hiter) {
+        // *hiter represents one piece of the difference sub_pattern1 -
+        // sub_pattern2, expressed as indexes into sub_pattern1.  We turn
+        // it back into a pattern and append it to 'next_difference'.
+        next_difference.resize(next_difference.size() + 1);
+        HyperrectangleToPattern(sub_pattern1, *hiter, &next_difference.back());
+      }
     }
     cur_difference.swap(next_difference);
     next_difference.clear;
@@ -1290,6 +1073,17 @@ bool ComputeDifference(const Pattern &pattern1,
   return true;
 }
 
+bool PatternIsSubsetOf(const Pattern &p, const Pattern &q) {
+  // p is a subset of q iff their intersection has as many elements as p.
+  // NOTE(review): this assumes the output patterns are disjoint -- confirm.
+  std::vector<Pattern> intersection;
+  ComputeIntersection(p, q, true, &intersection);
+  int64 total_size = 0;
+  for (Pattern &r : intersection)
+    total_size += NumElements(r);
+  return (total_size == NumElements(p));
+}
+
 
 bool PatternsIntersectSlow(const Pattern &pattern1_in,
                            const Pattern &pattern2_in) {
diff --git a/src/tensor/tensor-pattern-extra-utils.h b/src/tensor/pattern-extra-utils.h
similarity index 93%
rename from src/tensor/tensor-pattern-extra-utils.h
rename to src/tensor/pattern-extra-utils.h
index 3058051629d..bf724706aae 100644
--- a/src/tensor/tensor-pattern-extra-utils.h
+++ b/src/tensor/pattern-extra-utils.h
@@ -1,4 +1,4 @@
-// tensor/tensor-pattern-extra-utils.h
+// tensor/pattern-extra-utils.h
 
 //  Copyright      2019  Johns Hopkins University (author: Daniel Povey)
 
@@ -21,7 +21,7 @@
 #define KALDI_TENSOR_TENSOR_PATTERN_EXTRA_UTILS_H_ 1
 
 #include "tensor/tensor-common.h"
-#include "tensor/tensor-pattern.h"
+#include "tensor/pattern.h"
 #include "tensor/array-ref.h"
 
 
@@ -73,7 +73,7 @@ int64 IndexPattern(const Pattern &pattern,
 /**
    FindOffsets() is a utility function used in computing pattern intersections
    and set differences.  We will be using the notation described "Indexing a
-   Pattern" in tensor-pattern.h.  Let pattern1 and pattern2 be patterns satisfying
+   Pattern" in pattern.h.  Let pattern1 and pattern2 be patterns satisfying
    SameStrides(pattern1, pattern2).  Let n be the num-axes of the patterns.
    Let Offsets(pattern1, pattern2) be the set of n-tuples o such that there
    exists an i with pattern1[i + o] = pattern2[i], with of course i + o in the
@@ -106,7 +106,7 @@ bool FindOffsets(const Pattern &pattern1,
 
 /**
    Returns information about whether pattern2's memory-index-set is a subset of
-   pattern1's memory-index-set.  See glossary in tensor-pattern.h for
+   pattern1's memory-index-set.  See glossary in pattern.h for
    explanation of memory-index-set.
         @param [in] pattern1  First input pattern; must be valid.
         @param [in] pattern2  First input pattern; must be valid.
@@ -144,7 +144,7 @@ inline void SetDefaultCodeAndProperties(Pattern *dest) {
 
 /**
    Returns true if the two patterns are equivalent in the sense that their
-   memory-index-sets are the same.  See glossary in tensor-pattern.h for
+   memory-index-sets are the same.  See glossary in pattern.h for
    explanation.
 
    This function works by reducing both patterns to canonical form
@@ -263,13 +263,15 @@ bool PatternContains(const Pattern &pattern,
 
 
 /**
-   Returns true if the memory-index-set of pattern p is a subset
-   of the memory-index-set of pattern q.
-
+   Returns true if the memory-index-set of pattern p is a subset of the
+   memory-index-set of pattern q.  Note: the algorithm is not super trivial or
+   fast (although the time taken doesn't grow with the dims or strides, only
+   with the number of axes).
+   
       @param [in] p   First pattern; must be valid.
       @param [in] q   Second pattern; must be valid.
       @return   Returns true if memory-index-set of p is a subset of
-                the memory-index-set of q (see tensor-pattern.h for definition;
+                the memory-index-set of q (see pattern.h for definition
                 of memory-index-set).
  */
 bool PatternIsSubsetOf(const Pattern &p,
@@ -300,7 +302,7 @@ void ComputeMinAndMaxMindex(const Pattern &pattern,
 
 /**
    Outputs the memory-index-set corresponding to the pattern 'pattern' to 's'.
-   See glossary in tensor-pattern.h for definitions.
+   See glossary in pattern.h for definitions.
 
    This is strictly to be used in debugging code, as it is extremely
    inefficient.
@@ -327,7 +329,7 @@ int64 RandomMemoryIndex(const Pattern &pattern);
 
 /**
    Outputs the memory-index-tuple-set corresponding to the pattern 'pattern' to
-   's' (see tensor-pattern.h for definition).
+   's' (see pattern.h for definition).
 
    This function is strictly to be used in debugging code, as it is
    extremely inefficient.
@@ -342,14 +344,14 @@ bool ToMemoryIndexTupleSet(const ArrayRef<Pattern*>  patterns,
 /**
    Returns true if the two pattern-tuples are equivalent in the sense
    that their memory-index-tuple-sets are the same.  See glossary
-   in tensor-pattern.h for explanation.
+   in pattern.h for explanation.
  */
 bool PatternTuplesEquivalent(const ArrayRef<const Pattern*> patterns1,
                              const ArrayRef<const Pattern*> patterns2);
 
 /**
    Returns true if Pattern p is linear in Pattern q.  (Note:
-   this is a rather technical property, see tensor-pattern.h for definition).
+   this is a rather technical property, see pattern.h for definition).
 
       @param [in] p  The first pattern.  Must be valid
       @param [in] q  The second pattern.  Must be valid and must satisfy
@@ -360,7 +362,7 @@ bool IsLinearIn(const Pattern &p,
 
 /**
    This function returns true if a Pattern is regular (see Regularity property
-   in the glossary in tensor-pattern.h) and false otherwise.  'pattern' must
+   in the glossary in pattern.h) and false otherwise.  'pattern' must
    have all positive strides, the strides must be in increasing order (in the
    private numbering), and it must be valid-2 (see glossary).
  */
@@ -383,13 +385,12 @@ bool IsValid2(const Pattern &pattern);
 /**
    This function attempts to convert a pattern 'pattern' in canonical form
    (c.f. "Canonical form" in glossary, and CanonicalizePattern()) to a list of
-   Patterns (see documentation of `patterns` below for note on their possible
-   non-validity), whose strides (in the private numbering) are equal to the
+   valid-1 Patterns whose strides (in the private numbering) are equal to the
    provided 'strides' vector, the union of whose memory-index-sets (which will
    all be disjoint) is equal to the memory-index-set of the input Pattern, and
    which are all linear in `pattern` (c.f. documentation of "Linear Property).
 
-   This function is not guaranteed to always succeed (return true) but it will
+   This function is not guaranteed to always succeed (return true), but it will
    always succeed when people are doing "reasonable" things with Tensors.  It
    will always succeed if each element in 'strides' divides the next element
    exactly, although this is not a necessary condition for success.
@@ -399,26 +400,18 @@ bool IsValid2(const Pattern &pattern);
                         smallest to greatest; it must contain all strides in
                         `pattern`.
        @param [out] patterns  On success (see documentation of return status)
-                        'patterns' will be set to a nonempty list of patterns,
-                        the union of whose memory-index-sets equals the
-                        memory-index-set of `pattern`; all of whose strides are
-                        equal to `strides`; and each of which is valid-1 and
-                        linear in `pattern` (see "Linear property").
-
-                        except for property (iv) (search for "Valid
-                        Pattern" in tensor-pattern.h): that is, they may have
-                        nonzero strides for axes with dim == 1.  Each elements
-                        of 'strides' dividing the next is a sufficient but not
-                        necessary condition for this function to always return
-                        true.
-                          On failure, `patterns->empty()` will be empty.
-
-        @return         Returns true if pattern strides could be converted using
+                        'patterns' will be set to a nonempty list of valid-1
+                        patterns, the union of whose memory-index-sets equals
+                        the memory-index-set of `pattern`; all of whose strides
+                        are equal to `strides`; and each of which is linear in
+                        `pattern` (see "Linear property").
+                           On failure, 'patterns' will be empty.
+       @return          Returns true if pattern strides could be converted using
                         our algorithm, false if not.  This algorithm will work
                         for any 'reasonable' request, but it doesn't attempt to
                         cover the types of cases where, to solve them, we would
                         have to output a number of patterns that couldn't be
-                        bounded given the number of axes.
+                        bounded given only the number of axes.
   */
 bool ConvertPatternStrides(const Pattern &pattern,
                            const ArrayRef<int32> strides,
@@ -438,7 +431,7 @@ bool ConvertPatternStrides(const Pattern &pattern,
    dims and 'src', and the strides are such as to satisfy
    \f$  dest[i] = f(src[i]) \f$,
    where i is a valid Index-tuple for `src`.  See "Indexing a Pattern"
-   in the glossary in tensor-pattern.h for explanation of this notation.
+   in the glossary in pattern.h for explanation of this notation.
 
          @param [in] src  The source pattern.  Must be valid.
          @param [out] dest  The destination pattern.  Will be identical
@@ -504,7 +497,7 @@ void MakeCompactNonnegativeAndJustified(const Pattern &src,
    Class PatternRebaser is an object that converts Pattern
    when memory layouts change.  The main use-case is when a base Variable
    (c.f. variable.h for definition) has a Pattern that is not
-   contiguous (see tensor-pattern.h for definition of 'contiguous'), and
+   contiguous (see pattern.h for definition of 'contiguous'), and
    its gradient Tensor is allocated contiguously.  This class is
    needed to convert patterns for Variables into patterns for their
    corresponding gradients.
@@ -669,6 +662,6 @@ class OutOfPlaceAxisSorter {
 }  // namespace kaldi
 
 // Include implementation of inline functions.
-#include "tensor/tensor-pattern-extra-utils-inl.h"
+#include "tensor/pattern-extra-utils-inl.h"
 
 #endif  // KALDI_TENSOR_TENSOR_PATTERN_EXTRA_UTILS_H_
diff --git a/src/tensor/tensor-pattern-utils-inl.h b/src/tensor/pattern-utils-inl.h
similarity index 92%
rename from src/tensor/tensor-pattern-utils-inl.h
rename to src/tensor/pattern-utils-inl.h
index c23a81cc6a6..1eea287f97b 100644
--- a/src/tensor/tensor-pattern-utils-inl.h
+++ b/src/tensor/pattern-utils-inl.h
@@ -1,4 +1,4 @@
-// tensor/tensor-pattern-utils-inl.h
+// tensor/pattern-utils-inl.h
 
 //  Copyright      2019  Johns Hopkins University (author: Daniel Povey)
 
@@ -18,7 +18,7 @@
 // limitations under the License.
 
 
-// Do not include this header directly; it is only to be included by tensor-pattern-utils.h.
+// Do not include this header directly; it is only to be included by pattern-utils.h.
 
 #ifndef KALDI_TENSOR_TENSOR_PATTERN_UTILS_INL_H_
 #define KALDI_TENSOR_TENSOR_PATTERN_UTILS_INL_H_ 1
@@ -27,7 +27,7 @@
 namespace kaldi {
 namespace tensor {
 
-// See tensor-pattern-utils.h for documentation.
+// See pattern-utils.h for documentation.
 inline bool ContainsNegativeStride(const Pattern &pattern) {
   // 2048 is 1 << 11; 11th bit in code is set if code indicates negative stride.
   if (pattern.code >= 0 && (pattern.code | 2048) != 0)
diff --git a/src/tensor/tensor-pattern-utils-test.cc b/src/tensor/pattern-utils-test.cc
similarity index 95%
rename from src/tensor/tensor-pattern-utils-test.cc
rename to src/tensor/pattern-utils-test.cc
index 881c2c01fb7..4e0b1f3481f 100644
--- a/src/tensor/tensor-pattern-utils-test.cc
+++ b/src/tensor/pattern-utils-test.cc
@@ -1,4 +1,4 @@
-// util/tensor-pattern-utils-test.cc
+// util/pattern-utils-test.cc
 
 // Copyright 2009-2011  Microsoft Corporation
 
@@ -17,8 +17,8 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#include "tensor/tensor-pattern.h"
-#include "tensor/tensor-pattern-utils.h"
+#include "tensor/pattern.h"
+#include "tensor/pattern-utils.h"
 #include "base/kaldi-math.h"
 
 
diff --git a/src/tensor/tensor-pattern-utils.cc b/src/tensor/pattern-utils.cc
similarity index 99%
rename from src/tensor/tensor-pattern-utils.cc
rename to src/tensor/pattern-utils.cc
index d0931765db7..35d62f3ad9e 100644
--- a/src/tensor/tensor-pattern-utils.cc
+++ b/src/tensor/pattern-utils.cc
@@ -1,4 +1,4 @@
-// tensor/tensor-pattern-utils.cc
+// tensor/pattern-utils.cc
 
 // Copyright      2019  Johns Hopkins University (author: Daniel Povey)
 
@@ -17,7 +17,7 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#include "tensor/tensor-pattern-utils.h"
+#include "tensor/pattern-utils.h"
 
 namespace kaldi {
 namespace tensor {
diff --git a/src/tensor/tensor-pattern-utils.h b/src/tensor/pattern-utils.h
similarity index 97%
rename from src/tensor/tensor-pattern-utils.h
rename to src/tensor/pattern-utils.h
index 95529320566..d539d9a5e70 100644
--- a/src/tensor/tensor-pattern-utils.h
+++ b/src/tensor/pattern-utils.h
@@ -1,4 +1,4 @@
-// tensor/tensor-pattern-utils.h
+// tensor/pattern-utils.h
 
 //  Copyright      2019  Johns Hopkins University (author: Daniel Povey)
 
@@ -23,11 +23,11 @@
 
 
 #include "tensor/tensor-common.h"
-#include "tensor/tensor-pattern.h"
+#include "tensor/pattern.h"
 #include "tensor/array-ref.h"
 
 // This header includes various functions operating on Patterns.
-// See also tensor-pattern-extra-utils.h which contains the
+// See also pattern-extra-utils.h which contains the
 // more obscure and less user-facing functions.
 
 namespace kaldi {
@@ -45,7 +45,7 @@ inline bool ContainsNegativeStride(int32 pattern_code) {
 /**
    This function converts an eaxis-index into an raxis-index, with no error
    checking (you would normally check afterward that the raxis-index is in the
-   correct range).  Find "Eaxis-index:" and "Raxis-index:" in tensor-pattern.h,
+   correct range).  Find "Eaxis-index:" and "Raxis-index:" in pattern.h,
    but basically and eaxis-index is an axis-index in the public numbering where
    we allow negative values to mean offsets from the end.
  */
@@ -74,7 +74,7 @@ inline bool PatternMightContainNegativeStride(
 
 /**
    Returns true if the pattern contains a negative stride.
-   See tensor-pattern-utils-inl.h for implementation.
+   See pattern-utils-inl.h for implementation.
 
       @param [in] pattern   Input pattern.  Must be valid;
                             return status is undefined otherwise.
@@ -237,7 +237,7 @@ void UnsqueezeR(int32 raxis, const Pattern &src, Pattern *dest);
 
      @param [in]    eaxis   The axis-index at which the extra axis is to appear,
                            with negatives allowed (see: "Eaxis-index" in glossary
-                           in tensor-pattern.h).
+                           in pattern.h).
      @param [in,out] p      The pattern to which we are adding an axis.
                             Will have its num_axes increased by 1
                             at exit, possibly its dims and strides
@@ -426,7 +426,7 @@ bool Broadcastable(const Pattern &a, const Pattern &b,
 /**
    Returns true if the dims-vectors of a and b are the same after padding as for
    broadcasting.  See definition of "Dims-vector of a Pattern" in
-   tensor-pattern.h, and the entry for "PyTorch-style broadcasting".  What this
+   pattern.h, and the entry for "PyTorch-style broadcasting".  What this
    means in terms of the physical storage of the patterns is that a->dims and
    b->dims contain the same elements, without requiring the num_axes to be the
    same.
@@ -455,7 +455,7 @@ bool SamePaddedDims(const Pattern &a, const Pattern &b,
 /**
    Return true if the two provided patterns have the same dims-vectors
    (meaning, effectively the same num_axes and the same dim for each
-   axis; see "Dims-vector" in tensor-pattern.h).
+   axis; see "Dims-vector" in pattern.h).
 
       @param [in] a  The first pattern.  Must be valid.
       @param [in] b  The second pattern.  Must be valid.
@@ -755,7 +755,7 @@ bool CreateViewPattern(const Pattern &pattern_in,
    It selects a range of dimensions on one of the axes.  It is similar to
    indexing with a range in Python, like A[10:20].
 
-      @param [in] eaxis  Eaxis-index (see glossary in tensor-pattern.h) on which
+      @param [in] eaxis  Eaxis-index (see glossary in pattern.h) on which
                          to possibly reduce the dimensionality.
       @param [in] start  Starting index; must be in range [0, t->Dim(eaxis) - 1]
       @param [in] end    Ending index; must be in the range [start + 1, t->Dim(eaxis)]
@@ -774,7 +774,7 @@ void Slice(int32 eaxis, int32 start, int32 end, Pattern *pattern);
    a specified axis (specified in the public numbering), of a TensorImpl `t`,
    reducing the num_axes by one.
 
-       @param [in] eaxis Eaxis-index (see glossary in tensor-pattern.h) on which
+       @param [in] eaxis Eaxis-index (see glossary in pattern.h) on which
                          to possibly reduce the dimensionality.
        @param [in] index Index to select; must be in range
                          [0, t->Dim(eaxis) - 1].
@@ -844,7 +844,7 @@ bool IsCompactAndJustified(const Pattern &pattern);
 
 /**
    Returns true if 'pattern' has normalized strides as defined in
-   tensor-pattern.h (i.e.: strides are nonnegative and the nonzero ones are in
+   pattern.h (i.e.: strides are nonnegative and the nonzero ones are in
    strictly increasing order in the private numbering / decreasing in the
    public).
 */
@@ -868,6 +868,6 @@ bool HasNonnegativeStrides(const Pattern &pattern);
 }  // namespace kaldi
 
 
-#include "tensor/tensor-pattern-utils-inl.h"
+#include "tensor/pattern-utils-inl.h"
 
 #endif KALDI_TENSOR_TENSOR_PATTERN_UTILS_H_
diff --git a/src/tensor/tensor-pattern.cc b/src/tensor/pattern.cc
similarity index 98%
rename from src/tensor/tensor-pattern.cc
rename to src/tensor/pattern.cc
index 489965dd418..f4e0c237a45 100644
--- a/src/tensor/tensor-pattern.cc
+++ b/src/tensor/pattern.cc
@@ -1,4 +1,4 @@
-// tensor/tensor-pattern.cc
+// tensor/pattern.cc
 
 // Copyright      2019  Johns Hopkins University (author: Daniel Povey)
 
@@ -18,7 +18,7 @@
 // limitations under the License.
 
 #include <algorithm>
-#include "tensor/tensor-pattern.h"
+#include "tensor/pattern.h"
 
 
 namespace kaldi {
diff --git a/src/tensor/tensor-pattern.h b/src/tensor/pattern.h
similarity index 99%
rename from src/tensor/tensor-pattern.h
rename to src/tensor/pattern.h
index 39ecd344f50..53eb00929fd 100644
--- a/src/tensor/tensor-pattern.h
+++ b/src/tensor/pattern.h
@@ -1,4 +1,4 @@
-// tensor/tensor-pattern.h
+// tensor/pattern.h
 
 //  Copyright      2019  Johns Hopkins University (author: Daniel Povey)
 
@@ -513,7 +513,7 @@ struct Pattern {
                  // from the start of the originally allocated memory
                  // region
 
-  int32 code;  // pattern code; see ComputePatternCode() in tensor-pattern-utils.h
+  int32 code;  // pattern code; see ComputePatternCode() in pattern-utils.h
                // for details.  If this is negative then it means it has not been
                // computed.  In a valid Pattern the code will always be either
                // negative or up-to-date.
@@ -530,7 +530,7 @@ struct Pattern {
   // namely: dims and strides with index >= num_axes should be
   // 1 and 0 respectively; and the code should either be -1 or or
   // be the same as ComputePatternCode() returns on this pattern.
-  // See also IsCanonical() in tensor-pattern-utils.h.
+  // See also IsCanonical() in pattern-utils.h.
   bool IsValid();
 
   // This comparator induces a total ordering on valid Patterns.  It is a
@@ -578,7 +578,7 @@ struct PatternProperties {
 
 
   // Binary code describing the pattern, see GetPatternCode() in
-  // tensor-pattern-utils.h.
+  // pattern-utils.h.
   int32 code;
 
   // is_contiguous means that the data form a contiguous block in memory; it is
diff --git a/src/tensor/scalar.h b/src/tensor/scalar.h
index dba2c183d9f..6e53291b5ca 100644
--- a/src/tensor/scalar.h
+++ b/src/tensor/scalar.h
@@ -21,7 +21,7 @@
 #define KALDI_TENSOR_TENSOR_H_ 1
 
 #include "tensor/tensor-common.h"
-#include "tensor/tensor-pattern.h"
+#include "tensor/pattern.h"
 #include "tensor/tensor-impl.h"
 #include "tensor/storage.h"
 
diff --git a/src/tensor/tensor-common.h b/src/tensor/tensor-common.h
index 703046645fb..a1446e4fad8 100644
--- a/src/tensor/tensor-common.h
+++ b/src/tensor/tensor-common.h
@@ -91,7 +91,7 @@ enum StridePolicy {
                  // greatest to smallest as in a "C" array in the public
                  // numbering, or smallest to greatest in the private numbering.
                  // Per our policy, any dimension that is 1 will be given a zero stride.
-                 // C.f. "Normalized strides" in tensor-pattern.h
+                 // C.f. "Normalized strides" in pattern.h
   kCopyStrides   // Means: use the exact strides provided.
 };
 
diff --git a/src/tensor/tensor-functions.h b/src/tensor/tensor-functions.h
index 57953fcb7ed..8d1373540bf 100644
--- a/src/tensor/tensor-functions.h
+++ b/src/tensor/tensor-functions.h
@@ -215,7 +215,7 @@ void Scale(Scalar alpha, const Tensor *dest);
    `beta == 0.0.`
 
    Formally equivalent to the following; for the notation, the most relevant
-   glossary entries in tensor-pattern.h are "Dereferencing a memory-index" and
+   glossary entries in pattern.h are "Dereferencing a memory-index" and
    "Memory-index-tuple-set of a Pattern-tuple".
        (1)  For each memory-index `m` in `dest`, do: `*m = 0.0`
        (2)  For each memory-index-tuple `(m_src, m_dest)` in the memory-index-tuple-set
@@ -234,7 +234,7 @@ void Copy(const Tensor &src, const Tensor *dest);
    Equivalent to a special case of Add() with `beta == 1.0`.
 
    Formally equivalent to the following; for the notation, the most relevant
-   glossary entries in tensor-pattern.h are "Dereferencing a memory-index" and
+   glossary entries in pattern.h are "Dereferencing a memory-index" and
    "Memory-index-tuple-set of a Pattern-tuple".
        (1)  For each memory-index `m` in `dest`, do: `*m = 0.0`
        (2)  For each memory-index-tuple `(m_src, m_dest)` in the memory-index-tuple-set
@@ -256,7 +256,7 @@ void CopyScaled(Scalar alpha, const Tensor &src, const Tensor *dest);
    Equivalent to a special case of Add() with `beta == 1.0`.
 
    Formally equivalent to the following; for the notation, the most relevant
-   glossary entries in tensor-pattern.h are "Dereferencing a memory-index" and
+   glossary entries in pattern.h are "Dereferencing a memory-index" and
    "Memory-index-tuple-set of a Pattern-tuple".
        (1)  For each memory-index `m` in `dest`, do: `*m = 0.0`
        (2)  For each memory-index-tuple `(m_src, m_dest)` in the memory-index-tuple-set
@@ -389,14 +389,14 @@ std::shared_ptr<TensorImpl> View(const Tensor &src, ArrayRef<int32> dims);
    More formally, we can express the relationship as follows.  Suppose this
    function returns a Tensor called `dest`; and write d = src.Dim(axis1).
    For an index-tuple i in I(src) [c.f.: "Index-tuple-set of a Pattern" in
-   tensor-pattern.h], split up its indexes as:
+   pattern.h], split up its indexes as:
       i = j + k + l
    where '+' in this context means appending the tuples, and 'k' corresponds
    to the range of axes (axis1, axis1+1, ... axis1+num_axes_to_merge-1).
    Let K be the set of such k values encountered from splitting up each
    i in I(src) this way, and let f be a function from tuples to integers
    that maps list(K) to a sequence of consecutive integers starting from
-   zero (search for "list:" in tensor-pattern.h for explanation).
+   zero (search for "list:" in pattern.h for explanation).
    Let g be a function from tuples to possibly-shorter tuples that
    maps j + k + l to j + (f(k),) + l, here using Python-like notation to
    interpret (x,) as a tuple with a single element x and "+" meaning appending.
@@ -440,7 +440,7 @@ std::shared_ptr<TensorImpl> MergeAxes(const Tensor &src, int32 axis1,
    this).  Let `dims` be the vector of dims supplied; let I(dims) be the
    memory-index-set of a Pattern with dimensions equal to `dims`; let
    list(I(dims)) be that set ordered as in the natural ordering (c.f. "Natural
-   order of index-tuples" in tensor-pattern.h), and let f(i) be the function
+   order of index-tuples" in pattern.h), and let f(i) be the function
    from index-tuple to integers that when applied to list(I(dims)), produces a
    sequence of consecutive integers starting from zero.  Let g be the
    function from index-tuples to index-tuples that when applied on an
@@ -449,7 +449,7 @@ std::shared_ptr<TensorImpl> MergeAxes(const Tensor &src, int32 axis1,
    function of f.  Then this function returns a Tensor `dest` sharing the same
    storage as `src`, such that dest[g(i)] = src[i] for i in I(src) and
    I(dest) = g(I(src))
-   (Relevant glossary entries in tensor-pattern.h to understand the notation
+   (Relevant glossary entries in pattern.h to understand the notation
    include "Index-tuple-set of a Pattern" and "Indexing a Pattern").
 
       @param [in] src   The source Tensor whose axis is to be split
diff --git a/src/tensor/tensor-impl-linear.cc b/src/tensor/tensor-impl-linear.cc
index 9f122270756..b75d62b6c41 100644
--- a/src/tensor/tensor-impl-linear.cc
+++ b/src/tensor/tensor-impl-linear.cc
@@ -62,7 +62,7 @@ void AddProduct(float alpha, float beta,
     The case-statement values in the switch statement below may be
     interpreted in groups of 3 hex characters, are 0xAAABBBCCC,
     pertaining to Tensors a, b and c respectively.  See
-    GetPatternCode() in tensor-pattern-utils.h for documentation on
+    GetPatternCode() in pattern-utils.h for documentation on
     the meanings of the values and our notation with X,x,1.
    */
   switch(combined_code) {
diff --git a/src/tensor/tensor-impl-utils.h b/src/tensor/tensor-impl-utils.h
index c4d82590b1f..da2f1ec3272 100644
--- a/src/tensor/tensor-impl-utils.h
+++ b/src/tensor/tensor-impl-utils.h
@@ -21,7 +21,7 @@
 #define KALDI_TENSOR_IMPL_UTILS_H_ 1
 
 #include "tensor/tensor-impl.h"
-#include "tensor/tensor-patterns-utils.h"
+#include "tensor/pattern-utils.h"
 
 
 /**
@@ -51,14 +51,14 @@ inline bool Compatible(const TensorImpl &a, const TensorImpl &b,
 
 /**
   This function returns true if the patterns of a and b are broadcastable.
-  See similar function in tensor-pattern-utils.h for more information.
+  See similar function in pattern-utils.h for more information.
 */
 inline bool Broadcastable(const TensorImpl &a, const TensorImpl &b,
                           bool b_non_reducing = false);
 
 /**
   This function returns true if the patterns of a, b and c are broadcastable.
-  See similar function in tensor-pattern-utils.h for more information.
+  See similar function in pattern-utils.h for more information.
 */
 inline bool Broadcastable(const TensorImpl &a, const TensorImpl &b,
                           const TensorImpl &c, bool c_non_reducing = false);
diff --git a/src/tensor/tensor-impl.h b/src/tensor/tensor-impl.h
index 6eaa8b6a98b..2396469020a 100644
--- a/src/tensor/tensor-impl.h
+++ b/src/tensor/tensor-impl.h
@@ -21,7 +21,7 @@
 #define KALDI_TENSOR_TENSOR_IMPL_H_ 1
 
 #include "tensor/tensor-common.h"
-#include "tensor/tensor-pattern.h"
+#include "tensor/pattern.h"
 
 namespace kaldi {
 namespace tensor {
@@ -60,7 +60,7 @@ struct TensorImpl {
   // Returns the dimension on the supplied axis, using the public axis
   // numbering, with negative index interpreted as an offset from the end.
   //
-  //  @param [in] eaxis  Eaxis-index (see definition in tensor-pattern.h)
+  //  @param [in] eaxis  Eaxis-index (see definition in pattern.h)
   //                    Require -NumAxes() <= eaxis < NumAxes().
   //  @return        Returns the dimension on this axis, a number >= 1.
   inline int32 Dim(int32 eaxis);
@@ -69,7 +69,7 @@ struct TensorImpl {
   // supplied axis, using the public axis numbering, with negative index
   // interpreted as an offset from the end.
   //
-  //  @param [in] eaxis  Eaxis-index (see definition in tensor-pattern.h)
+  //  @param [in] eaxis  Eaxis-index (see definition in pattern.h)
   //                    Require -NumAxes() <= eaxis < NumAxes().
   //  @return          Returns the stride on this axis, which will be 0 if
   //                   Dim(axis) == 1, and otherwise nonzero.
@@ -140,7 +140,7 @@ struct TensorImpl {
   /**
      Initializes a TensorImpl with the provided dimensions, creating a new
      storage object for it.  The strides will be as for a "C" array; see
-     "Default strides:" in tensor-pattern.h.
+     "Default strides:" in pattern.h.
 
         @param [in] dims  The dimensions for each axis (in the public
                        numbering).  All elements must be nonnegative,
@@ -170,7 +170,7 @@ struct TensorImpl {
                       kKeepStrideOrder -> use the same order of abs(stride) as
                                           in 'meta'
                       kNormalized -> use normalized strides (see definition
-                       in tensor-pattern.h); basically, the normal order we'd use
+                       in pattern.h); basically, the normal order we'd use
                        for a new Tensor.
                       kCopyStrides -> use the exact strides from the source
                        pattern.
diff --git a/src/tensor/tensor-utils.h b/src/tensor/tensor-utils.h
index 5e810a7343f..f34aea14716 100644
--- a/src/tensor/tensor-utils.h
+++ b/src/tensor/tensor-utils.h
@@ -22,7 +22,7 @@
 
 
 #include "tensor/tensor-impl.h"
-#include "tensor/tensor-pattern-utils.h"
+#include "tensor/pattern-utils.h"
 #include "tensor/tensor.h"
 
 namespace kaldi {
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index 336d0c0fca1..2a871c47789 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -21,7 +21,7 @@
 #define KALDI_TENSOR_TENSOR_H_ 1
 
 #include "tensor/tensor-common.h"
-#include "tensor/tensor-pattern.h"
+#include "tensor/pattern.h"
 #include "tensor/tensor-impl.h"
 #include "tensor/storage.h"
 
@@ -120,7 +120,7 @@
 
     Whole Tensor:  A whole Tensor is a Tensor through which one can
             access every byte of the storage region underlying it.
-            W.r.t. the notation in tensor-pattern.h (and using words
+            W.r.t. the notation in pattern.h (and using words
             that describe Patterns to describe Tensors having those patterns),
             this is equivalent to saying that Tensor is compact and
             justified, and the size of its memory-index-set times the
diff --git a/src/tensor/variable-inl.h b/src/tensor/variable-inl.h
index 8179ee38971..f7f3f4fe68e 100644
--- a/src/tensor/variable-inl.h
+++ b/src/tensor/variable-inl.h
@@ -50,7 +50,7 @@ Tensor VariableImpl::GetGradForView(const Tensor &data) {
     // The grad will have exactly the same offset, dims and strides as the data.
     // This is the normal case, which we encounter when the Variable was
     // constructed from a Tensor that is justified and contiguous (see glossary
-    // in tensor-pattern.h for meanings).
+    // in pattern.h for meanings).
     return Tensor(ans);
   } else {
     if (!aux_)

From 670b2d5a87e0de1cff6c820a33a08078d1302f8e Mon Sep 17 00:00:00 2001
From: Daniel Povey <povey@fb.com>
Date: Sun, 5 May 2019 11:53:09 -0400
Subject: [PATCH 036/163] [src] Small changes

---
 src/tensor/storage.h           |  6 +-----
 src/tensor/tensor-impl-utils.h | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/tensor/storage.h b/src/tensor/storage.h
index bec164d82ff..1c2120c53bd 100644
--- a/src/tensor/storage.h
+++ b/src/tensor/storage.h
@@ -35,11 +35,7 @@ struct StorageAux;
 class Storage {
  public:
 
-
-  void RecordChange(int32 element_size,
-                    const Pattern &pattern);
-
-
+  
   // This initializes a ChangeTracker object in this->tracker if it
   // does not already exist, and returns its address.
   ChangeTracker *GetChangeTracker();
diff --git a/src/tensor/tensor-impl-utils.h b/src/tensor/tensor-impl-utils.h
index da2f1ec3272..e154ae22ff5 100644
--- a/src/tensor/tensor-impl-utils.h
+++ b/src/tensor/tensor-impl-utils.h
@@ -222,6 +222,20 @@ inline void RegisterTensorChange(const TensorImpl &impl) {
   }
 }
 
+/**
+   If DebugMode() is true, records a change to impl's storage covering
+   impl.pattern via the storage's ChangeTracker; otherwise does nothing.
+   Kinds of use to handle (author's list; presumably to be distinguished
+   later): read; read and write; read and invalidation; invalidation.
+ */
+inline void RegisterOp(const TensorImpl &impl) {
+  if (DebugMode()) {
+    impl.storage_->GetChangeTracker()->RecordChange(
+        SizeOf(impl.dtype), impl.pattern);
+  }
+}
+
+
 inline int64 NumElements(const TensorImpl &a) {
   return NumElements(a.pattern);
 }

From cf95fe47a9e0a66d5370649c5ff4494614b48def Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sun, 5 May 2019 11:56:24 -0400
Subject: [PATCH 037/163] [src] Add definition

---
 src/tensor/pattern.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/tensor/pattern.h b/src/tensor/pattern.h
index 53eb00929fd..19741866179 100644
--- a/src/tensor/pattern.h
+++ b/src/tensor/pattern.h
@@ -32,6 +32,14 @@ namespace tensor {
 /*
   PATTERN GLOSSARY   (note: see also TENSOR GLOSSARY in tensor.h)
 
+
+
+    Adjacent:         Two Patterns are said to be adjacent if their memory-index-sets
+                      are disjoint and a Pattern exists whose memory-index-set is
+                      their union.
+                      [TODO: come up with algorithm for testing adjacency and
+                      merging the adjacent Patterns.]
+
     Axis:             An axis is one of the (dim, stride) pairs that form part
                       of a Pattern.  We will sometimes use the word "axis"
                       to refer to the integer index of the axis, as in, for example,
@@ -385,7 +393,6 @@ namespace tensor {
                             exactly and dim(j) = 1.
                         (ii) Either k == num_axes, or dim(i) * stride(i) <= stride(k),
 
-
                       The reader may notice that if we were to restrict k to
                       equal i + 1, then this would be equivalent to the
                       axis-dominance property (property (v)) plus the

From 5d5d38704c04e87123bee02c87f3a85661291f72 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sat, 11 May 2019 22:06:39 -0400
Subject: [PATCH 038/163] [src] Various progress

---
 src/tensor/memory-checker.h     | 197 ++++++++++++++++++++++++++------
 src/tensor/op.h                 | 125 +++++++++++++++++++-
 src/tensor/storage.h            |   5 +-
 src/tensor/tensor-common.h      |   6 +
 src/tensor/tensor-impl-linear.h |   2 +
 src/tensor/tensor-impl-utils.h  |  27 ++---
 src/tensor/tensor-settings.h    |  22 +++-
 src/tensor/tensor.h             |   3 +
 src/tensor/variable.h           |  25 ++--
 9 files changed, 339 insertions(+), 73 deletions(-)

diff --git a/src/tensor/memory-checker.h b/src/tensor/memory-checker.h
index 1dc84f06306..a11dc5eacca 100644
--- a/src/tensor/memory-checker.h
+++ b/src/tensor/memory-checker.h
@@ -70,9 +70,9 @@ class ChangeTracker {
 
 
   /**
-     Record a change to this storage region at the current time (obtained by
-     GetTick()).  Just appends it to the vector of changes after canonicalizing
-     the pattern.  Inlined since it's only called from Storage::ChangedSince().
+     Record a write to this storage region at the current time (obtained by
+     GetTick()).  Just appends it to the vector of writes after canonicalizing
+     the pattern.  Inlined since it's only called from Storage::WrittenSince().
 
      @param [in] element_size  The size in bytes of the data type being stored
                              here: for example, 4 for float.
@@ -80,21 +80,25 @@ class ChangeTracker {
                             to canonical form (c.f. CanonicalizePattern())
                             before being stored.
    */
-  inline void RecordChange(int32 element_size,
-                           const Pattern &pattern);
+  inline void RecordWrite(int32 element_size,
+                          const Pattern &pattern);
 
 
   /**
      Returns true if any element covered by this pattern has been
      changed since the time given by 'tick'.  Inlined since it's only
-     called from Storage::ChangedSince().
+     called from Storage::WrittenSince().
 
-      @param [in] tick  The time (obtained by GetTick()) since when
-                     we want to know about changes
+      @param [in] element_size  The size in bytes of the data type being stored
+                       here: for example, 4 for float.
       @param [in] pattern  The pattern that we are checking
+      @param [in] tick  The time (obtained by GetTick()) since when
+                       we want to know about changes
+
    */
-  inline bool ChangedSince(int64 tick,
-                           const Pattern &pattern);
+  inline bool WrittenSince(int32 element_size,
+                           const Pattern &pattern,
+                           int64 tick);
 
  private:
 
@@ -111,7 +115,7 @@ class ChangeTracker {
   int32 element_size_;
 
 
-  struct ChangeRecord {
+  struct WriteRecord {
     Pattern pattern;  // The pattern (offset, dims, strides) that was
                             // changed within this storage region.  This pattern
                             // will have been reduced to canonical form.  View
@@ -121,36 +125,36 @@ class ChangeTracker {
     int64 tick;             // The time, in ticks (c.f. NextTick()) at which
                             // this set of memory-indexes was changed.
 
-    // Next in a singly linked list of ChangeRecord.
-    std::unique_ptr<ChangeRecord> tail;
+    // Next in a singly linked list of WriteRecord.
+    std::unique_ptr<WriteRecord> tail;
   };
 
 
   // Head of a singly linked list of changes.  When RecordChange() is called, we
   // will add to the head of this (and then de-dupe; see doc for change_map)).
-  // When ChangedSince() is called, we will traverse it element by element until
-  // we get to the tick passed to ChangedSince, and if there is any overlap with
+  // When WrittenSince() is called, we will traverse it element by element until
+  // we get to the tick passed to WrittenSince, and if there is any overlap with
   // the passed-in pattern, we'll return true.
-  std::unique_ptr<ChangeRecord> changes_;
+  std::unique_ptr<WriteRecord> changes_;
 
 
-  // This is a map from a pointer to the Pattern in ChangeRecord::pattern
-  // (hashing the pattern itself, not the pointer value), to the ChangeRecord
+  // This is a map from a pointer to the Pattern in WriteRecord::pattern
+  // (hashing the pattern itself, not the pointer value), to the WriteRecord
   // that holds it.  We actually map to the address of the std::unique_ptr
-  // pointing to that ChangeRecord, which might be the address of this->changes_
-  // or ChangeRecord::tail, because we need to be able to write to that to
-  // remove a ChangeRecord from the singly linked list.  This map is used
+  // pointing to that WriteRecord, which might be the address of this->changes_
+  // or WriteRecord::tail, because we need to be able to write to that to
+  // remove a WriteRecord from the singly linked list.  This map is used
   // in de-duping the list of changes, so that if someone provides the
   // exact same pattern twice, we only keep the most recent tick; this
   // keeps memory usage under control.
-  std::unordered_map<Pattern*, std::unique_ptr<ChangeRecord>*,
+  std::unordered_map<Pattern*, std::unique_ptr<WriteRecord>*,
                      PatternPtrHasher, PatternPtrEqual> change_map_;
 };
 
 
 
 // This class is a common base-class for UninitializedDataChecker and
-// InvalidDataChecker.
+// InvalidatedDataChecker.
 class DataCheckerBase {
  protected:
   DataCheckerBase(int64 num_bytes);
@@ -299,7 +303,7 @@ class UninitializedDataChecker: public DataCheckerBase {
                   function records the write.
    */
   inline void RecordWrite(int32 element_size,
-                   const Pattern &pattern) {
+                          const Pattern &pattern) {
     RecordEvent(element_size, pattern);
   }
 
@@ -350,20 +354,22 @@ class UninitializedDataChecker: public DataCheckerBase {
 
    The way we handle this is: we assume by default that any time we do an
    operation that sets a Variable but does not depend on its previously existing
-   value, the memory underlying it was not previously written to in an operation
-   that required derivative-tracking.  But if that is not the case (i.e.
-   if you do something that does require overwriting previously-written data
-   that required derivative tracking, like the above), you can inform the
-   framework by doing
+   value, the memory underlying it has not been previously written to in an
+   operation that required derivative-tracking.  That is, the framework assumes
+   by default that you DO NOT REUSE MEMORY, except for in-place operations.  If
+   you do want to re-use memory (specifically: if you do something that does
+   require overwriting previously-written data that required derivative
+   tracking, like the above), you can inform the framework that you plan to do
+   this as follows:
      DoSomethingWith(a, b, &c.Overwrite());
    instead of
      DoSomethingWith(a, b, &c);
    (here a, b and c are Variables; and let's suppose this operation
    DoSomethingWith() ignores the previous value of `c`).
 
-   This purpose of this class is to detect cases where someone should have
-   invoked Overwrite() because tracked data was overwritten, but failed to
-   do so.
+   The purpose of class InvalidatedDataChecker is to detect cases where someone
+   should have invoked Overwrite() because tracked data was overwritten, but
+   failed to do so.
 
    See also the comment for the overwrite_ member of class VariableImpl, and
    the Untouched() member of Variable.
@@ -409,11 +415,138 @@ class InvalidatedDataChecker: public DataCheckerBase {
   */
   void RecordRead(int32 element_size,
                   const Pattern &pattern);
+};
+
+
+class MemoryChecker {
+ public:
+
+  /**
+     Constructor: constructs a MemoryChecker object for a storage region
+
+        @param [in] num_bytes   Number of bytes in the storage region
+        @param [in] new_region  True if this object is being allocated at
+                     the same time as we are allocating this region.
+                     (may be false if debug mode was not active when
+                     the region was first allocated).
+  */
+  MemoryChecker(int64 num_bytes,
+                bool new_region): num_bytes_(num_bytes) {
+    Initialize(new_region);
+  }
+
+  /**
+     This is called by functions that implement low-level functions on tensors,
+     before or after actually accessing the memory.  From a user's perspective
+     the only thing this function might do is crash-- which it is designed to
+     do if it detects various "disallowed" things.
+        @param [in] element_size  Size in bytes of the data type being accessed
+        @param [in] pattern   The pattern describing the memory being accessed
+        @param [in] use_type  The kind of use: kRead, kReadWrite or kWrite
+  */
+  void RecordUse(int32 element_size, const Pattern &pattern,
+                 TensorUseEnum use_type) {
+    KALDI_PARANOID_ASSERT(DebugMode());
+    if (debug_tick_ != DebugTick())
+      Initialize(false);  // false means: not a new region.
+    if (use_type == kRead || use_type == kReadWrite) {
+      if (invalidated_checker_)  // only allocated by RecordInvalidation().
+        invalidated_checker_->RecordRead(element_size, pattern);
+      if (uninitialized_checker_)
+        uninitialized_checker_->RecordRead(element_size, pattern);
+    }
+    if (use_type == kWrite || use_type == kReadWrite) {
+      // This must happen after checking the reads above: uninitialized_checker_
+      // would never find an error in RecordRead() if it ran after RecordWrite().
+      if (uninitialized_checker_)
+        uninitialized_checker_->RecordWrite(element_size, pattern);
+      if (change_tracker_)  // CheckUnchangedSince() also allows for NULL.
+        change_tracker_->RecordWrite(element_size, pattern);
+    }
+  }
+
+  /**
+     Record the invalidation of data.  This occurs in certain backprop
+     operations as a way to avoid unnecessary zeroing operations.  See
+     the documentation for class InvalidatedDataChecker for a longer
+     explanation.
+   */
+  void RecordInvalidation(int32 element_size,
+                          const Pattern &pattern) {
+    if (!invalidated_checker_)
+      invalidated_checker_.reset(new InvalidatedDataChecker(num_bytes_));
+    invalidated_checker_->RecordInvalidation(element_size, pattern);
+  }
+
+  /**
+     Record that the entire storage region has been zeroed.
+     (This avoids the need to use uninitialized_checker_, so we delete it
+      if it was set).
+   */
+  inline void RecordZeroing() { uninitialized_checker_.reset(); }
+
+
+  /**
+     This function is called by the backprop code in Ops when it wants to
+     make sure that certain data stored from the forward pass has not
+     been written to since the specified tick.
+   */
+  void CheckUnchangedSince(
+      int32 element_size,
+      const Pattern &pattern,
+      int64 tick) {
+    if (change_tracker_ &&
+        change_tracker_->WrittenSince(element_size, pattern, tick)) {
+      KALDI_ERR << "Quantity needed during backprop has changed since "
+          "the value used in the forward pass.  You have likely used "
+          "an in-place or overwriting operation in a way that's not "
+          "allowed.  Solution: don't overwrite data if you want "
+          "to do backprop.";
+    }
+  }
+
+ private:
+  /**
+     Initialize all members of this object except for num_bytes_ (which is set
+     in the constructor).  This is called from the constructor, but also whenever
+     we detect that debug mode has been turned off and then on again.
+   */
+  void Initialize(bool new_region);
+
+  // the number of bytes in the region, set only in the constructor.
+  int64 num_bytes_;
+
+  // debug_tick_ is the value of DebugTick() at the time when Initialize() was
+  // most recently called.  I.e. it's the start of the current debug cycle.
+  // It's used to detect when debug mode has been turned off and then on, which
+  // requires us to re-initialize this object.
+  int64 debug_tick_;
+
+  // Checker object for uninitialized data.  This is only non-NULL if
+  // the following two conditions hold:
+  //   (a) `new_region` as passed to Initialize() was true (because if we
+  //      started debugging after this region was already created, we
+  //      wouldn't know whether any data in it was uninitialized, so
+  //      this check is meaningless).
+  //   (b) No-one has called RecordZeroing() since Initialize() was
+  //      last called.  (This records that the entire region was
+  //      zeroed, which means there would be no uninitialized data).
+  std::unique_ptr<UninitializedDataChecker> uninitialized_checker_;
+
+  // Checker object for invalidated data.  Will only be allocated if
+  // RecordInvalidation() has been called since Initialize().  See docs for
+  // InvalidatedDataChecker for explanation of what this means.
+  std::unique_ptr<InvalidatedDataChecker> invalidated_checker_;
+
+  // Checker object that checks that we don't overwrite quantities
+  // that will be needed in the backward pass.
+  std::unique_ptr<ChangeTracker> change_tracker_;
 
 
 };
 
 
+
 }  // namespace tensor
 }  // namespace kaldi
 
diff --git a/src/tensor/op.h b/src/tensor/op.h
index caa23d57376..72d2e20a520 100644
--- a/src/tensor/op.h
+++ b/src/tensor/op.h
@@ -70,15 +70,18 @@ class Op {
   virtual void Backprop();
 
  protected:
-  // The time (`GetTick()`) at which this Op was created.
+
+  /**
+     The time (`GetTick()`) at which this Op was created; should be set
+     in child classes by doing:
+      `tick_ = GetTick()`
+     as the last statement of the constructor.   (This ensures the
+     tick is later-numbered than any ticks stored in the ChangeTracker
+     code by operations called from the constructor.)
+  */
   int64 tick_;
 
 
-  inline void RegisterTensorChange(const Tensor &tensor) {
-    if (DebugMode()) {
-    }
-  }
-
   /*
     This function intended to be called from the Backprop() routines
     of child classes, for example:
@@ -225,6 +228,116 @@ class GenericOp: public Op {
 
 
 class AddToOp: public Op {
+ public:
+
+  // This Op corresponds to the computation:
+  //   \f$  b  :=  alpha a  +   beta b.  \f$
+  // with broadcasting or summation depending on the dimensions
+  // involved.  Alpha and beta are constants, and differentiation w.r.t. them is
+  // not supported (you wouldn't reach this code if alpha or beta were actual
+  // Variables.)
+  //
+  // The Op is only constructed if b.Tracked() (which it would normally be if
+  // a.Tracked()).
+  AddToOp(float alpha, float beta,
+          const Variable &a, const Variable &b):
+      Op({a}),
+      alpha_(alpha),
+      beta_(beta),
+      a_data_(a.GetData()),
+      a_grad_(a.GetGradIfPresent()),
+      b_data_(b.GetData()),
+      b_grad_(b.GetGrad()) {
+    Add(alpha, beta, *a_data_, b_data_.get());
+    tick_ = GetTick();  // Must be the last statement; see Op::tick_.
+  }
+
+
+  void Backward() {
+    // Do: a_grad += alpha * b_grad.
+    if (a_grad_ != nullptr)
+      AddTo(alpha_, 1.0, *b_grad_, a_grad_.get());
+    // Then do: b_grad *= beta (after a_grad has consumed the unscaled b_grad).
+    if (beta_ != 1.0)
+      Scale(beta_, b_grad_.get());
+  }
+
+ private:
+
+  float alpha_;
+  float beta_;
+
+  // We hold onto all inputs that are not also outputs
+  // (here just a_) for dependency tracking.
+  Variable a_;
+
+  std::shared_ptr<Node> a_node_;
+
+  std::shared_ptr<Tensor> a_data_;
+  // a_grad_ will be NULL if a was not tracked.
+  std::shared_ptr<Tensor> a_grad_;
+  std::shared_ptr<Tensor> b_data_;
+  std::shared_ptr<Tensor> b_grad_;
+
+  Variable b_;
+  bool must_scale_b_grad_;
+
+};
+
+
+class CopyOp: public Op {
+ public:
+
+  // This Op corresponds to the computation:
+  //   \f$  b := a  \f$
+  // with broadcasting or summation depending on the dimensions.
+  //
+  // Constructing this Op calls b.GetGrad() unconditionally (a's grad is optional).
+  CopyOp(const Variable &a, const Variable &b):
+      Op({a}),
+      a_data_(a.GetData()),
+      a_grad_(a.GetGradIfPresent()),
+      b_data_(b.GetData()),
+      b_grad_(b.GetGrad()) {
+    Copy(*a_data_, b_data_.get());
+
+    tick_ = GetTick();  // Must be the last statement; see Op::tick_.
+  }
+
+
+  void Backward() {
+    // Do: a_grad += alpha * b_grad   (alpha_ == 1 for a plain copy).
+    if (a_grad_ != nullptr)
+      AddTo(alpha_, 1.0, *b_grad_, a_grad_.get());
+    // Then do: b_grad *= beta   (beta_ == 0: the old value of b was discarded).
+    if (beta_ != 1.0)
+      Scale(beta_, b_grad_.get());
+  }
+
+ private:
+
+  float alpha_ = 1.0f;  // b := a is the alpha = 1, beta = 0 case of AddToOp.
+  float beta_ = 0.0f;
+
+  // We hold onto all inputs that are not also outputs
+  // (here just a_) for dependency tracking.
+  Variable a_;
+
+  std::shared_ptr<Node> a_node_;
+
+  std::shared_ptr<Tensor> a_data_;
+  // a_grad_ will be NULL if a was not tracked.
+  std::shared_ptr<Tensor> a_grad_;
+  std::shared_ptr<Tensor> b_data_;
+  std::shared_ptr<Tensor> b_grad_;
+
+  Variable b_;
+  bool must_scale_b_grad_;
+
+};
+
+
+class CopyOp: public Op {
  public:
 
   // This Op corresponds to the computation:
diff --git a/src/tensor/storage.h b/src/tensor/storage.h
index 1c2120c53bd..492f89bb34f 100644
--- a/src/tensor/storage.h
+++ b/src/tensor/storage.h
@@ -35,13 +35,12 @@ struct StorageAux;
 class Storage {
  public:
 
-  
+
   // This initializes a ChangeTracker object in this->tracker if it
   // does not already exist, and returns its address.
   ChangeTracker *GetChangeTracker();
 
-  inline bool Allocated() {  return (data != NULL);  }
-
+  inline bool Allocated() { return (data != NULL); }
 
   // Returns the raw data pointer.
   inline void *Data() {
diff --git a/src/tensor/tensor-common.h b/src/tensor/tensor-common.h
index a1446e4fad8..5662cbf0277 100644
--- a/src/tensor/tensor-common.h
+++ b/src/tensor/tensor-common.h
@@ -129,6 +129,12 @@ enum BinaryFunctionEnum {
 };
 
 
+enum TensorUseEnum {
+  kRead,
+  kReadWrite,
+  kWrite
+};
+
 
 // In practice we don't expect user-owned tensors with num-axes greater than 5
 // to exist, but there are certain manipulations we do when simplifying matrix
diff --git a/src/tensor/tensor-impl-linear.h b/src/tensor/tensor-impl-linear.h
index 7ad98f13aea..0334f9a31a2 100644
--- a/src/tensor/tensor-impl-linear.h
+++ b/src/tensor/tensor-impl-linear.h
@@ -67,6 +67,8 @@ void AddProduct(float alpha, float beta,
 
 /**
    Copy elements from Tensor a to Tensor b, possibly broadcasting
+   or summing
+
       @param [in]  a    The source Tensor.
       @param [out] b   The destination Tensor.  We require
                        Broadcastable(a, b, true).
diff --git a/src/tensor/tensor-impl-utils.h b/src/tensor/tensor-impl-utils.h
index e154ae22ff5..eb36671a983 100644
--- a/src/tensor/tensor-impl-utils.h
+++ b/src/tensor/tensor-impl-utils.h
@@ -211,26 +211,19 @@ void Select(int32 axis, int32 index, const TensorImpl &src,
             TensorImpl *dest);
 
 
-/**
-
-
- */
-inline void RegisterTensorChange(const TensorImpl &impl) {
-  if (DebugMode()) {
-    impl.storage_->GetChangeTracker()->RecordChange(
-        SizeOf(impl.dtype), impl.pattern);
-  }
-}
 
 /**
-   read
-   read and write
-   read and invalidation
-   invalidation
- */
-inline void RegisterOp(const TensorImpl &impl) {
+   This is to be called when any operation makes use of the memory underlying a
+   Tensor.
+      kRead
+      kReadWrite
+      kReadInvalidate
+      kInvalidate
+*/
+inline void RecordUse(const TensorImpl &impl,
+                      TensorUseEnum use_type) {
   if (DebugMode()) {
-    impl.storage_->GetChangeTracker()->RecordChange(
+    impl.storage_->GetMemoryChecker()->RecordUse(
         SizeOf(impl.dtype), impl.pattern);
   }
 }
diff --git a/src/tensor/tensor-settings.h b/src/tensor/tensor-settings.h
index 02bf02cd96c..0de7dca2cf2 100644
--- a/src/tensor/tensor-settings.h
+++ b/src/tensor/tensor-settings.h
@@ -123,9 +123,25 @@ inline int64 NextTick() { return ++g_tick_counter; }
 // debug_mode activates code that checks for invalidated data in the backprop
 // pass; see "Invalidated:" in glossary in tensor.h.
 // Don't access this variable directly,
-extern thread_local bool debug_mode;
-inline bool DebugMode() { return debug_mode; }
-inline void SetDebugMode(bool b) { debug_mode = b; }
+extern bool debug_mode;     // Do not access directly!
+extern int64 debug_start_tick;   // Do not access directly!
+
+inline bool DebugMode() {
+  return debug_mode;
+}
+inline void SetDebugMode(bool b) {
+  if (b && !debug_mode)  // Take a new start tick only on false -> true.
+    debug_start_tick = NextTick();
+  debug_mode = b;
+}
+/**
+   Returns the tick at which debug mode most recently changed from false to
+   true.
+ */
+inline int64 DebugTick() {
+  KALDI_PARANOID_ASSERT(debug_mode);
+  return debug_start_tick;
+}
 
 class WithDebugModeAs {
  public:
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index 2a871c47789..d77a4484885 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -34,6 +34,9 @@
              Each Variable has a base Variable; a base Variable's
              base Variable is itself.  See also: "View Variable".
 
+    Debug mode:  A global bool that says whether we are debugging
+             (activates checks on computation correctness that are slow).
+
     Invalidated:  if some data used in backprop needs to have been unchanged since
               a particular tick (as recorded in an Op), but it has been changed
               since then, we say that it has been invalided.  This is an error,
diff --git a/src/tensor/variable.h b/src/tensor/variable.h
index 709bce8afb4..f89abafa4ee 100644
--- a/src/tensor/variable.h
+++ b/src/tensor/variable.h
@@ -195,17 +195,15 @@ class VariableImpl {
   // Variables, its value is undefined.
   bool rebase_grad_;
 
-  // overwrite_ is part of a mechanism that avoids unnecessary zeroing of
-  // parts of derivatives during the backprop phase.  By default we
-  // assume that if we write to a Variable in a way that doesn't
-  // depend on the previous value (e.g. we set it, rather than
-  // add to it or multiply in-place), then the previous memory underlying
-  // that Variable has not previously participated in any operations
-  // requiring derivatives.
+  // overwrite_ is part of a mechanism that avoids unnecessary zeroing of parts
+  // of derivatives during the backprop phase.  By default we assume that if we
+  // write to a Variable in a way that doesn't depend on the previous value
+  // (e.g. we set it, rather than add to it or multiply in-place), then the
+  // previous memory underlying that Variable has not previously participated in
+  // any operations requiring derivatives.
   //
-  // If you are about
-  // to modify a Variable c that *has* previously participated in
-  // operations requiring derivatives, then, instead of, say:
+  // If you are about to modify a Variable c that *has* previously participated
+  // in operations requiring derivatives, then, instead of, say:
   //  DoSomethingWith(a, b, &c);
   // (and let's suppose this operation ignores the previous value of `c`),
   // you could do:
@@ -213,8 +211,11 @@ class VariableImpl {
   // whereby you assert that the memory underlying this variable may have
   // previously participated in operations requiring derivative tracking
   // (and hence we need to an extra zeroing after the backprop).
-  // The call to Overwrite() sets the `overwrite_` bool, and then
-  // the DoSomethingWith() call should unset it.
+  // The call to Overwrite() sets the `overwrite_` bool, and then the
+  // DoSomethingWith() call should unset it.  (Note: even if that operation for
+  // some reason doesn't unset it, it doesn't really matter, as it would be safe
+  // to set it always).  The overwrite_ variable is intended to be read,
+  // and reset to false, within sub-classes of class Op.
   //
   // Look at the comment for class InvalidatedDataChecker in change-tracker.h
   // for more information.

From f6e9281944a28f03c36667b9616461b6a1fdfa4f Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Fri, 17 May 2019 15:45:49 -0400
Subject: [PATCH 039/163] [src] Small tensor changes prior to rewrite

---
 src/tensor/memory-checker.h | 7 +++----
 src/tensor/pattern.h        | 3 ---
 src/tensor/storage.h        | 7 ++++++-
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/tensor/memory-checker.h b/src/tensor/memory-checker.h
index a11dc5eacca..031ed8a4453 100644
--- a/src/tensor/memory-checker.h
+++ b/src/tensor/memory-checker.h
@@ -468,7 +468,7 @@ class MemoryChecker {
   /**
      Record the invalidation of data.  This occurs in certain backprop
      operations as a way to avoid unnecessary zeroing operations.  See
-     the documentation for class InvalidatedDatChecker for a longer
+     the documentation for class InvalidatedDataChecker for a longer
      explanation.
    */
   void RecordInvalidation(int32 element_size,
@@ -479,9 +479,8 @@ class MemoryChecker {
   }
 
   /**
-     Record that the entire storage region has been zeroed.
-     (This avoids the need to use uninitialized_checker_, so we delete it
-      if it was set).
+     Record that the entire storage region is being zeroed.  (This avoids the
+     need to use uninitialized_checker_, so we delete it if it was set).
    */
   inline void RecordZeroing() { uninitialized_checker_ = NULL; }
 
diff --git a/src/tensor/pattern.h b/src/tensor/pattern.h
index 19741866179..5c2f2fe300b 100644
--- a/src/tensor/pattern.h
+++ b/src/tensor/pattern.h
@@ -285,9 +285,6 @@ namespace tensor {
                       for example: Broadcastable(P, Q).
 
 
-    An object of type Pattern, representing the dims, strides
-                      and offset of a Tensor.
-
     Public numbering: The numbering of axes used in the public interface of class
                       Tensor.  We use the index `axis` when in the public numbering.
                       We use square brackets when describing dims or strides ordered
diff --git a/src/tensor/storage.h b/src/tensor/storage.h
index 492f89bb34f..5d8fa004304 100644
--- a/src/tensor/storage.h
+++ b/src/tensor/storage.h
@@ -108,7 +108,12 @@ class Storage {
      data in a storage region.  Rather than physically zeroing the data, it
      records the intention to zero it as soon as it is allocated (see "Lazy
      allocation" in tensor.h).  Later on, when the data is allocated, it may
-     actually not have to be zeroed if the AllowUndefined() is called.
+     actually not have to be zeroed if AllowUndefined() is called by the
+     operation that acts on it.
+
+     This is anticipated to be used mostly in backprop code, for deriv_
+     matrices, since conceptually the main operation we do on deriv_ matrices is
+     to add to them.
   */
   inline void ZeroUponAllocation() { zero_upon_allocation_ = true; }
 

From 931496f9c1f41c79272486dfd872a71a42ff51b9 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Fri, 24 May 2019 11:39:32 -0400
Subject: [PATCH 040/163] [src] Some major changes in rough draft

---
 src/tensor/context.h       | 403 +++++++++++++++++++++++++++++++++++++
 src/tensor/deriv-map.h     | 316 +++++++++++++++++++++++++++++
 src/tensor/op.h            |  98 +++++++--
 src/tensor/pattern-utils.h |  17 +-
 src/tensor/pattern.h       |  85 ++++----
 src/tensor/storage.h       |   6 +
 6 files changed, 867 insertions(+), 58 deletions(-)
 create mode 100644 src/tensor/context.h
 create mode 100644 src/tensor/deriv-map.h

diff --git a/src/tensor/context.h b/src/tensor/context.h
new file mode 100644
index 00000000000..e899816d575
--- /dev/null
+++ b/src/tensor/context.h
@@ -0,0 +1,403 @@
+// tensor/context.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_CONTEXT_H_
+#define KALDI_TENSOR_CONTEXT_H_ 1
+
+#include <cstdint>
+#include <vector>
+#include <string>
+#include "tensor/tensor-common.h"
+
+
+/**
+   This file contains certain mechanisms to set settings about default
+   data types and devices within scopes, some related things like
+   an equivalent of PyTorch's .no_grad().  Also the `Tick()` mechanism
+   is here.
+*/
+
+namespace kaldi {
+namespace tensor {
+
+
+// class Context contains various configurations that we will sometimes need
+// when we do operations on Tensors.  Things like the default data type, the
+// debug mode, and so on.  This will be passed around
+class Context {
+  // The default DataType for newly created Tensors
+  DataType default_dtype_;
+  // The default Device for newly created Tensors
+  Device default_device_;
+};
+
+
+
+// ExecutionContext is used when executing Ops (or doing other things
+// with them, e.g. just storing them); we explicitly pass this
+// object into functions that might want to execute Ops.
+class ExecutionContext: public Context {
+ public:
+  // This function takes ownership of the Op in 'op', and then does something
+  // with it (e.g. does op->Do() for simple execution).
+  virtual void DoOp(Op *op);
+
+  virtual ~ExecutionContext() {}
+};
+
+
+// SimpleExecutionContext means we just execute an Op and then immediately
+// delete it.  It's used when we are just computing something with no
+// autograd.  You could, of course, just call the version of the
+// Op that doesn't take an ExecutionContext, but this option makes
+// it easier to switch between autograd and no-autograd.
+class SimpleExecutionContext: public ExecutionContext {
+ public:
+  virtual void DoOp(Op *op) {  op->Do();  delete op;  }
+  virtual ~SimpleExecutionContext() {}
+};
+
+
+
+
+/**
+   Execution context that you use while doing a forward computation, that
+   executes the forward commands and stores the things required to later do the
+   backprop.  This is a simple version that does the 'normal case'; also
+   see GeneralBackpropExecutionContext.
+*/
+class BackpropExecutionContext: public ExecutionContext {
+ public:
+  /**
+     Constructor of BackpropExecutionContext from an existing DerivMap, which
+     might map, for instance, parameters to their derivatives.  Its contents are
+     *copied* (so that we have a base set of things that we know need
+     derivatives); we don't want to add entries to that 'deriv_map' because it
+     has a longer lifetime than we need, and would waste memory.
+
+      @param [in] deriv_map   An existing DerivMap, to which the user will
+                      likely have added the model parameters and anything
+                      else that derivatives are needed for, with its
+                      Deriv() function.  This is *copied*, not held as a
+                      reference, by this object, to avoid a kind of memory
+                      leakage.
+      @param [in] base_context  The base execution context, which would
+                      normally be SimpleExecutionContext; it is used to
+                      execute both the forward and backward commands.
+
+
+   */
+  BackpropExecutionContext(const DerivMap &deriv_map,
+                           ExecutionContext *base_context);
+
+
+
+
+  /**
+     Constructor taking just a Context (for default dtype and device).  You
+     shouldn't very often have to use this; the constructor taking a DerivMap
+     as well is more useful.
+  */
+  BackpropExecutionContext(const Context &context);
+
+
+  // Returns pointer to this deriv_map_ (still owned by this class).  This may
+  // be used, for instance, to do
+  // backprop_context.GetDerivMap()->Deriv(some_tensor) if we want to ensure
+  // that 'some_tensor' gets a derivative.  This shouldn't very often be
+  // necessary as the usually more correct way would be to supply a DerivMap
+  // containing all the things whose derivatives you need, to the constructor.
+  // The pointer returned is still owned by this class-- don't delete it.
+  // Also, a subsequent call to FreeDerivMap() might free it, so you
+  // should probably use it immediately and not keep it around unless you
+  // know that FreeDerivMap() will not be called
+  // (note: FreeDerivMap is implicitly called by Backprop()).
+  DerivMap *GetDerivMap() { return deriv_map_.get(); }
+
+
+  /**
+     Does the backprop on a Tensor t; propagates the derivative back to whatever
+     quantities you had added derivs for in the DerivMap passed to the constructor.
+
+     The backprop commands will be executed with a SimpleExecutionContext
+     whose Context base-class is a copy of this class's one.  (A variant of
+     Backprop for fancier uses is not declared yet; see the note below.)
+
+     If retain_info is false, it will delete deriv_map_ and clear backward_ops_.
+     This is recommended in most cases; it's more memory efficient.
+   */
+  void Backprop(const Tensor &t,
+                bool retain_info = false);
+
+
+  // NOTE(review): an exact duplicate of the Backprop() declaration stood here
+  // (ill-formed); presumably a second overload was intended -- TODO confirm.
+
+
+  virtual void DoOp(Op *op) {
+    // TODO.  NOTE(review): 'op' is owned by us but is not yet stored or freed.
+  }
+
+  virtual ~BackpropExecutionContext() { }
+
+
+
+ private:
+  std::vector<std::unique_ptr<Op> > backward_ops_;
+  std::unique_ptr<DerivMap> deriv_map_;
+  ExecutionContext *base_context_;
+
+};
+
+
+
+
+class AutogradContext: public Context {
+ public:
+
+
+  /**
+
+   */
+  inline void DoSomething(std::unique_ptr<Op> op) {
+
+  }
+
+
+ private:
+  // The default DataType for newly created Tensors
+  DataType default_dtype_;
+  // The default Device for newly created Tensors
+  Device default_device_;
+
+
+  // If true, all Tensors will be tracked, even ones that are functions
+  // of Tensors that are not tracked.  (Note: the notion of 'tracked'
+  // is only meaningful in the context of a specific AutogradContext).
+  bool all_tracked_;
+
+
+  // If this is non-NULL, whenever we execute commands we will store
+  // the Ops needed for the backprop here.
+  std::shared_ptr<std::vector<std::unique_ptr<Op> > > backward_deriv_commands_;
+
+  // If this is non-NULL, whenever we execute commands we will store the
+  // corresponding Ops in this vector.  This would allow us to do backprop
+  // later, but it's not the normal pattern.  Note: these Ops will refer
+  // to variables used in the forward pass, so
+  std::shared_ptr<std::vector<std::unique_ptr<Op> > > forward_deriv_commands_;
+
+
+  // If this is non-NULL, whenever we execute commands we will store the
+  // corresponding Ops in this vector.  This would allow us to do backprop
+  // later, but it's not the normal pattern.  Note: these Ops will refer
+  // to variables used in the forward pass, so
+  std::shared_ptr<std::vector<Op> > forward_commands_;
+
+
+  bool store_backprop_;
+
+  // if deriv_mapper_ is non-NULL
+  std::shared_ptr<DerivMap> deriv_mapper_;
+
+
+
+
+};
+
+// Once create a new Op, do something as in
+// std::function<void(Op*)>
+// my_func (op).
+// Could be a closure.
+//
+//
+// Examples:
+//   ExecuteOp().
+//   ExecuteAndStoreOp()  [closure with vector<Op>]
+//   StoreOp()
+//   ExecuteAndStoreBackwardOp()  [ closure with vector<Op> to store
+//                                  backward pass, if tracked. ]
+//   ExecuteAndForwardOp()   [Executes the forward function and also,
+//                            if this op is tracked, the forward autodiff;
+//                            that has its own AutogradContext.
+//
+//
+//
+
+Device GetDefaultDevice();
+void SetDefaultDevice(Device device);
+
+// Mechanism to set the default device within a scope by constructing a variable
+// that exists only within that scope.
+class WithDeviceAs {
+ public:
+  // Example:
+  // {
+  //   WithDeviceAs _(kCudaDevice);
+  //   // code in this block uses this default.  the variable
+  //   // name is _ because we don't need to access it.
+  // }
+  inline WithDeviceAs(DeviceType device_type):
+      prev_default_(GetDefaultDevice()) {
+    SetDefaultDevice(Device(device_type));
+  }
+  inline WithDeviceAs(Device device):
+      prev_default_(GetDefaultDevice()) {
+    SetDefaultDevice(device);
+  }
+  ~WithDeviceAs() { SetDefaultDevice(prev_default_); }
+
+ private:
+  Device prev_default_;
+};
+
+
+
+DataType GetDefaultDtype();
+void SetDefaultDtype(DataType dtype);
+
+class WithDtypeAs {
+ public:
+  // Example:
+  // {
+  //   WithDtypeAs _(kDoubleDtype);
+  //   // code in this block uses this default.  the variable
+  //   // name is _ because we don't need to access it.
+  // }
+  inline WithDtypeAs(DataType dtype):
+      prev_default_(GetDefaultDtype()) {
+    SetDefaultDtype(dtype);
+  }
+  ~WithDtypeAs() { SetDefaultDtype(prev_default_); }
+
+ private:
+  DataType prev_default_;
+};
+
+
+
+// struct TensorOptions is used as an arg for some constructors
+// when creating Tensors and Variables; it allows flexibility
+// in specifying the device and/or dtype.  See the examples
+// shown where constructors of Tensor or Variable are declared.
+struct TensorOptions {
+  DataType dtype;
+  Device device;
+  // Any field not given to a constructor takes the current global default.
+  TensorOptions(): dtype(GetDefaultDtype()),
+                   device(GetDefaultDevice()) { }
+  TensorOptions(DataType dtype):
+      dtype(dtype), device(GetDefaultDevice()) { }
+  TensorOptions(Device device):
+      dtype(GetDefaultDtype()), device(device) { }
+  TensorOptions(DeviceType device_type):
+      dtype(GetDefaultDtype()), device(device_type) { }
+  TensorOptions(DataType dtype, Device device):
+      dtype(dtype), device(device) { }
+  TensorOptions(DataType dtype, DeviceType device_type):
+      dtype(dtype), device(device_type) { }
+  TensorOptions(const TensorOptions &other):
+      dtype(other.dtype), device(other.device) { }
+};
+
+
+// Global variable, initialized from zero, that is used in NextTick().
+// This is defined in tensor-settings.cc.
+extern int64 g_tick_counter;
+inline int64 NextTick() { return ++g_tick_counter; }
+
+
+// debug_mode activates code that checks for invalidated data in the backprop
+// pass; see "Invalidated:" in glossary in tensor.h.
+// Don't access these variables directly; use the functions below.
+extern bool debug_mode;     // Do not access directly!
+extern int64 debug_start_tick;   // Do not access directly!
+
+inline bool DebugMode() {
+  return debug_mode;
+}
+inline void SetDebugMode(bool b) {
+  if (b && !debug_mode)  // Take a new start tick only on false -> true.
+    debug_start_tick = NextTick();
+  debug_mode = b;
+}
+/**
+   Returns the tick at which debug mode most recently changed from false to
+   true.
+ */
+inline int64 DebugTick() {
+  KALDI_PARANOID_ASSERT(debug_mode);
+  return debug_start_tick;
+}
+
+class WithDebugModeAs {
+ public:
+  // Example:
+  // {
+  //   WithDebugModeAs _(true);
+  //   // code in this block uses debug mode.
+  //   // variable name is _ because we won't use it.
+  // }
+  inline WithDebugModeAs(bool b):
+      prev_default_(DebugMode()) {
+    SetDebugMode(b);
+  }
+  ~WithDebugModeAs() { SetDebugMode(prev_default_); }
+
+ private:
+  bool prev_default_;
+};
+
+
+
+// allow_grad means that gradient tracking is allowed; allow_grad = true
+// is the normal case, and means that if gradient tracking is required
+// (e.g. if the user created a Variable with requires_grad = true, and we do
+// operations that depend on it), then we'll track gradients.
+// It is our way to implement an equivalent of PyTorch's `with torch.no_grad()`.
+// Do not access this variable directly; use AllowGrad() and SetAllowGrad().
+extern thread_local bool allow_grad;
+inline bool AllowGrad() { return allow_grad; }
+inline void SetAllowGrad(bool b) { allow_grad = b; }
+
+
+class WithNoGrad {
+ public:
+  // Example:
+  // {
+  //   WithNoGrad _;
+  //   // code in this block has gradient tracking disabled.
+  //   // variable name is _ because we won't use it.
+  //
+  // }
+  inline WithNoGrad():
+      prev_default_(AllowGrad()) {
+    SetAllowGrad(false);
+  }
+  ~WithNoGrad() { SetAllowGrad(prev_default_); }
+ private:
+  bool prev_default_;
+};
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_CONTEXT_H_
diff --git a/src/tensor/deriv-map.h b/src/tensor/deriv-map.h
new file mode 100644
index 00000000000..6c69ac1d59a
--- /dev/null
+++ b/src/tensor/deriv-map.h
@@ -0,0 +1,316 @@
+// tensor/deriv-map.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_DERIV_MAP_H_
+#define KALDI_TENSOR_DERIV_MAP_H_ 1
+
+#include <cstdint>
+#include <vector>
+#include <string>
+#include "tensor/tensor-common.h"
+
+
+
+
+namespace kaldi {
+namespace tensor {
+
+
+/*
+  Derivative shape:
+
+  For a quantity of shape, say, [ 2 3 ], the derivative will have the exact
+  same shape [ 2 3 ] if ExtraDim() == 0, but if ExtraDim == x with x > 0,
+  the derivative will have the shape [ x 2 3 ].  This makes it possible
+  to compute derivatives w.r.t. vector-value quantities (of course, this
+  would be more expensive).
+
+*/
+class DerivMap {
+ public:
+  DerivMap(const DerivMap &other);
+
+  // Default constructor, constructs an empty DerivMap taking the derivative
+  // w.r.t a scalar (or of a scalar w.r.t. the things in the forward pass).
+  DerivMap();
+
+
+  // Constructor where you can provide a vector of extra dimensions that the
+  // derivatives will have (ordered as in the public numbering, in which
+  // they will appear before the dimensions of the things used in the
+  // forward pass).  This is for when you are taking the derivative w.r.t.
+  // a more-than-scalar-valued quantity (in backward mode) or taking the
+  // derivative of a more-than-scalar-valued quantity w.r.t. things
+  // (in forward mode).
+  // This should rarely be used.
+  DerivMap(const std::vector<int32> &extra_dims);
+
+  // Returns the derivative Tensor for Tensor 't', if one exists already; else
+  // NULL.  (To explain return type, see "Optional Tensor" in tensor.h).
+  std::shared_ptr<TensorImpl> DerivIfPresent(const Tensor &t) const;
+
+  /**
+     Returns the derivative for Tensor t, creating it if it did not already
+     exist.  The mapping from t to its derivative is only stored in this class.
+     See "Derivative shape:" above for explanation of the shape of this Tensor;
+     it will usually be the same as the shape of t.
+     In order to make sure that a Tensor t has an entry in this DerivMap,
+     you can call this function and ignore the return value.
+
+     Note: the derivative objects are created at the level of the Storage
+     region, so when any Tensor that uses a particular storage region
+     becomes tracked, all other Tensors using that storage region also
+     become tracked.
+
+         @param [in] t  The Tensor whose derivative the user is requesting
+  */
+  Tensor Deriv(const Tensor &t);
+
+
+  /**
+     Must be called when the DerivMap is empty; set the dimension of the
+     quantity that we are computing the derivative of (or with respect to).
+     Would be 0 in most situations, meaning the derivative is w.r.t.  a scalar,
+     but if it is >0, the derivatives returned by this DerivMap will have an
+     extra dimension (search for "Derivative shape" above).
+  */
+  void SetExtraDim(int32 extra_dim);
+
+
+  /**
+     Returns the product of extra_dims_ (always positive; normally 1).
+  */
+  int64 ExtraDimsProd() const  { return extra_dims_prod_; }
+  // Returns the extra dimensions (read-only); see extra_dims_ below.
+  const std::vector<int32> &ExtraDims() const  { return extra_dims_; }
+
+ private:
+
+  // extra_dims_ is the shape (in the public numbering) of the thing that we are taking
+  // the derivative of (in backward mode) or with respect to (in forward mode).
+  // It would normally be the empty vector, meaning we're taking the derivative
+  // w.r.t. a scalar.  All elements must be positive.
+  std::vector<int32> extra_dims_;
+  // extra_dims_prod_ is the product of the elements of extra_dims_.
+  // It will normally be 1.
+  int64 extra_dims_prod_;
+
+
+  // The record relating to the map from one source Storage object to the
+  // corresponding derivative.  The num_bytes of the deriv_storage object will
+  // be equal to the num_bytes of src_storage times extra_dims_prod_.
+  struct DerivRecord {
+    std::weak_ptr<Storage> src_storage;
+    std::weak_ptr<Storage> deriv_storage;
+  };
+
+  // The key in this map is the int64 tick value when the src Storage
+  // object was created (see its Id() function).
+  // The value pairs that src Storage with its derivative Storage.
+  std::unordered_map<int64, DerivRecord> map_;
+
+
+};
+
+
+// class Context contains various configurations that we will sometimes need
+// when we do operations on Tensors.  Things like the default data type, the
+// debug mode, and so on.  This will be passed around
+class Context {
+
+};
+
+class AutogradContext: public Context {
+ public:
+
+
+ private:
+  DataType default_dtype_;
+  Device default_device_;
+
+
+  std::shared_ptr<
+
+
+  bool store_ops_;
+
+};
+
+
+Device GetDefaultDevice();
+void SetDefaultDevice(Device device);
+
+// Mechanism to set the default device within a scope by constructing a variable
+// that exists only within that scope.
+class WithDeviceAs {
+ public:
+  // Example:
+  // {
+  //   WithDeviceAs _(kCudaDevice);
+  //   // code in this block uses this default.  the variable
+  //   // name is _ because we don't need to access it.
+  // }
+  inline WithDeviceAs(DeviceType device_type):
+      prev_default_(GetDefaultDevice()) {
+    SetDefaultDevice(Device(device_type));
+  }
+  inline WithDeviceAs(Device device):
+      prev_default_(GetDefaultDevice()) {
+    SetDefaultDevice(device);
+  }
+  ~WithDeviceAs() { SetDefaultDevice(prev_default_); }
+
+ private:
+  Device prev_default_;
+};
+
+
+
+DataType GetDefaultDtype();
+void SetDefaultDtype(DataType dtype);
+
+class WithDtypeAs {
+ public:
+  // Example:
+  // {
+  //   WithDtypeAs _(kDoubleDtype);
+  //   // code in this block uses this default.  the variable
+  //   // name is _ because we don't need to access it.
+  // }
+  inline WithDtypeAs(DataType dtype):
+      prev_default_(GetDefaultDtype()) {
+    SetDefaultDtype(dtype);
+  }
+  ~WithDtypeAs() { SetDefaultDtype(prev_default_); }
+
+ private:
+  DataType prev_default_;
+};
+
+
+
+// struct TensorOptions is used as an arg for some constructors
+// when creating Tensors and Variables; it allows flexibility
+// in specifying the device and/or dtype.  See the examples
+// shown where constructors of Tensor or Variable are declared.
+struct TensorOptions {
+  DataType dtype;   // falls back to GetDefaultDtype() when not given
+  Device device;    // falls back to GetDefaultDevice() when not given
+
+  TensorOptions(): dtype(GetDefaultDtype()),
+                   device(GetDefaultDevice()) { }
+  TensorOptions(DataType dtype):
+      dtype(dtype), device(GetDefaultDevice()) { }
+  TensorOptions(Device device):
+      dtype(GetDefaultDtype()), device(device) { }
+  TensorOptions(DeviceType device_type):
+      dtype(GetDefaultDtype()), device(device_type) { }
+  TensorOptions(DataType dtype, Device device):
+      dtype(dtype), device(device) { }
+  TensorOptions(DataType dtype, DeviceType device_type):  // was a duplicate (DataType, Device)
+      dtype(dtype), device(device_type) { }
+  TensorOptions(const TensorOptions &other):
+      dtype(other.dtype), device(other.device) { }
+};
+
+
+// Global variable, initialized from zero, that is used in NextTick().
+// This is defined in tensor-settings.cc.
+extern int64 g_tick_counter;
+inline int64 NextTick() { return ++g_tick_counter; }
+
+
+// debug_mode activates code that checks for invalidated data in the backprop
+// pass; see "Invalidated:" in glossary in tensor.h.
+// Don't access this variable directly; use DebugMode() and SetDebugMode().
+extern bool debug_mode;     // Do not access directly!
+extern int64 debug_start_tick;   // Do not access directly!
+
+inline bool DebugMode() {
+  return debug_mode;
+}
+inline void SetDebugMode(bool b) {
+  if (b && !debug_mode)  // stamp the tick only on a false -> true transition
+    debug_start_tick = NextTick();
+  debug_mode = b;
+}
+/**
+   Returns the tick at which debug mode most recently changed from false to
+   true.
+ */
+inline int64 DebugTick() {
+  KALDI_PARANOID_ASSERT(debug_mode);
+  return debug_start_tick;
+}
+
+class WithDebugModeAs {
+ public:
+  // Example:
+  // {
+  //   WithDebugModeAs _(true);
+  //   // code in this block uses debug mode.
+  //   // variable name is _ because we won't use it.
+  // }
+  inline WithDebugModeAs(bool b):
+      prev_default_(DebugMode()) {
+    SetDebugMode(b);
+  }
+  ~WithDebugModeAs() { SetDebugMode(prev_default_); }
+
+ private:
+  bool prev_default_;
+};
+
+
+
+// allow_grad means that gradient tracking is allowed; allow_grad = true
+// is the normal case, and means that if gradient tracking is required
+// (e.g. if the user created a Variable with requires_grad = true, and we do
+// operations that depend on it), then we'll track gradients.
+// It is our way to implement an equivalent of PyTorch's `with torch.no_grad()`.
+// Do not access this variable directly; use AllowGrad() and SetAllowGrad().
+extern thread_local bool allow_grad;
+inline bool AllowGrad() { return allow_grad; }
+inline void SetAllowGrad(bool b) { allow_grad = b; }
+
+
+class WithNoGrad {
+ public:
+  // Example:
+  // {
+  //   WithNoGrad _;
+  //   // code in this block has gradient tracking disabled.
+  //   // variable name is _ because we won't use it.
+  //
+  // }
+  inline WithNoGrad():
+      prev_default_(AllowGrad()) {
+    SetAllowGrad(false);
+  }
+  ~WithNoGrad() { SetAllowGrad(prev_default_); }
+ private:
+  bool prev_default_;
+};
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_TENSOR_SETTINGS_H_
diff --git a/src/tensor/op.h b/src/tensor/op.h
index 72d2e20a520..fd3cfb33dfd 100644
--- a/src/tensor/op.h
+++ b/src/tensor/op.h
@@ -29,24 +29,94 @@ class Variable;
 
 
 /**
-   class Op is a base-class for objects that are created when we compute
-   functions of Variables; they exist as long as we retain the computation
-   graph.  In fact, the Ops (together with the Variables) *are* the
-   computation graph.  An op may in general have multiple input Variables
-   and multiple output Variables.
-
-   Every base Variable (see variable.h for definition) that is tracked
-   has a singly linked list of Ops that changed that base Variable,
-   ordered from most recent to least recent.
-
-   When a user calls Backprop() on a Variable, the backprop code works out a
-   topological order of Ops and calls the Ops in (essentially) the reverse order
-   in which they were created.  The backprop code also frees gradients of
-   Variables when it knows they will no longer be needed.
+   class Op is a base-class for objects that are created when we do operations
+   on Variables.  The important thing to know here is that the Variables in
+   question will always have been allocated with particular dimensions,
+   and possibly even contain defined values, before we get to the Op.
+   Examples of Ops include,
+      a := b * c
+      a += b
+      a *= b
+   where the interpretation of the commands above will depend on the
+   dimensions of the Tensors involved.
+
+   Notice that all the member functions of class Op are `const`, i.e. they
+   shouldn't change this class (although of course they may change the
+   underlying Tensor data).  This is to remind users that Ops are supposed
+   to be reusable, and calls to this object shouldn't affect the behavior
+   of subsequent calls, except to the extent that the underlying Tensor
+   data has been changed.
  */
 class Op {
  public:
 
+  /**
+     Do whatever it is that this Op does (e.g. execute the command `a += b`,
+     if that was what this Op did)
+   */
+  virtual void Do() const;
+
+  /**
+     Return a copy of this object.  (This won't be needed very often but might
+     possibly be needed in the context of computing higher-order derivatives).
+  */
+  virtual Op *Copy() const;
+
+  /**
+     This is for forward-mode automatic differentiation (a rarely-used thing).
+     It appends to 'ops' the commands corresponding to the forward-mode
+     automatic differentiation w.r.t. this Op.
+
+       @param [in,out] 'map' is the map that maps from tensors to the
+             corresponding derivative values.  May be modified by adding
+             new key/value pairs.
+       @param [out] ops  This function will *append* to `ops` the
+             commands for computing the derivatives associated with
+             this Op in forward-mode automatic differentiation.  If none
+             of the inputs to the Op were tracked w.r.t. `map`,
+             nothing will be done.
+
+     Example: if the command was "a += b", the derivative operation would
+     be: deriv(a) += deriv(b).  In most cases these Ops would be executed
+     immediately and then deleted.
+   */
+  virtual void GetForwardDerivOps(DerivMap *map,
+                                  std::vector<std::unique_ptr<Op> > *ops) const;
+
+
+
+  /**
+     This is for reverse-mode automatic differentiation (the normal type of
+     autograd).
+
+       @param [in,out] map   This object maps from tensors to the
+                       corresponding derivative values.  It may be changed by
+                       adding new elements to the map, if its Deriv() function
+                       is called.
+       @param [out]    ops  This function may *append* to 'ops' the commands
+                       used in the reverse-mode automatic differentiation.
+                       (Note: nothing will be appended if none of the inputs
+                       to the Op were already tracked w.r.t. 'map'.)
+
+     Example: if the command was "a += b * c", the operations added to
+     'ops' would correspond to `deriv(b) += deriv(a) * c` and
+     `deriv(c) += deriv(a) * b`.
+  */
+  virtual void GetBackwardDerivOps(DerivMap *map,
+                                   std::vector<std::unique_ptr<Op> > *ops) const;
+
+
+  /** Destructor.  It's important for efficiency of memory use to destroy Ops as
+      soon as you won't need them any more, because it may trigger the freeing
+      of Tensors and hence Storage regions.
+  */
+  virtual ~Op();
+};
+
+
+
+class Op {
+
   Op(): tick_(GetTick()) { }
 
   /// InputIteratorBegin() and InputIteratorEnd() form the begin and
diff --git a/src/tensor/pattern-utils.h b/src/tensor/pattern-utils.h
index d539d9a5e70..0e951a0065d 100644
--- a/src/tensor/pattern-utils.h
+++ b/src/tensor/pattern-utils.h
@@ -424,19 +424,20 @@ bool Broadcastable(const Pattern &a, const Pattern &b,
 
 
 /**
-   Returns true if the dims-vectors of a and b are the same after padding as for
-   broadcasting.  See definition of "Dims-vector of a Pattern" in
-   pattern.h, and the entry for "PyTorch-style broadcasting".  What this
-   means in terms of the physical storage of the patterns is that a->dims and
-   b->dims contain the same elements, without requiring the num_axes to be the
-   same.
+   Returns true if the shapes of a and b (see "Shape of a Pattern" in pattern.h)
+   are the same after adding 1's on the left (padding) as for broadcasting.  See
+   definition of "Shape of a Pattern-tuple" in pattern.h, and the entry for
+   "Padding".  What this means in terms of the physical storage of the patterns
+   is that a->dims and b->dims contain the same elements, without requiring the
+   num_axes to be the same.
 
    This is a stronger condition than Broadcastable(a, b).
          @param [in] a  The first pattern.  Must be valid.
          @param [in] b  The second pattern.  Must be valid.
-         @return      Return true if the dims-vectors vectors of
-                      a and b are the same after padding as for broadcasting.
+         @return      Return true if the shapes of
+                      a and b are the same after padding.
    See also the 3-arg version of SamePaddedDims(), and SameDims().
+   This is a stronger condition than Broadcastable(a, b).
 */
 bool SamePaddedDims(const Pattern &a, const Pattern &b);
 
diff --git a/src/tensor/pattern.h b/src/tensor/pattern.h
index 5c2f2fe300b..816eb2bbeff 100644
--- a/src/tensor/pattern.h
+++ b/src/tensor/pattern.h
@@ -136,21 +136,6 @@ namespace tensor {
                      in the relevant storage region; we will assume that it is obvious
                      from the context which storage region.   See also: "Storage region"
 
-    Dims-vector of a Pattern: The vector of dimension of a Pattern: e.g. [] for
-                    a Pattern with num_axes = 1 or [2 3] for a Pattern with
-                    num-axes = 2.  Note: whenever we display dims vectors in
-                    square brackets as opposed to curly, it implies we are
-                    displaying them in the public numbering.
-
-    Dims-vector of a Pattern-tuple:  The dims vector of a Pattern-tuple is
-                    formed by taking the dims-vectors of each Pattern in the
-                    tuple, extending them on the left with 1's as necessary
-                    to make the the same size, then taking the largest
-                    dim on each axis (i.e. the one that is not equal to 1,
-                    if they are different).  For example, for a Pattern-tuple
-                    of Patterns whose dims-vectors were ([4 1 5], [6 1], [5]),
-                    the dims-vector of the tuple would be [4 6 5].
-
     Disjoint Patterns:  When we speak of disjoint Patterns we mean that
                     their memory-index-sets are disjoint; see memory-index-set.
 
@@ -211,10 +196,8 @@ namespace tensor {
 
     Index-tuple-set of a Pattern-tuple:  The index-tuple-set I(P, Q) of a Pattern-tuple
                       (P, Q) is the index-tuple-set that you would obtain for a
-                      Pattern whose dims equal the dims-vector of that
-                      Pattern-tuple.  See "dims-vector of a Pattern-tuple" for
-                      explanation of what that is.  View I(P, Q) as simply
-                      shorthand for I((P, Q)).
+                      Pattern whose dims equal the shape of that Pattern-tuple
+                      (See "Shape of a Pattern-tuple").
 
     Justified:        We say that a Pattern is justified if least (i.e. most
                       negative) memory-index in its memory-index-set is zero.  For
@@ -263,21 +246,28 @@ namespace tensor {
     Num-axes:        The number of axes that a Tensor has.  This is a number in the
                      range [0, KALDI_TENSOR_MAX_DIM], i.e. 0 through 6.
 
-    Offset:           The memory-index of the element with index-tuple = (all zeros)
-                      of a Tensor.  Offsets will always be >= 0 because they are to
-                      be used as an index into a memory-region, and negative
-                      index would be outside that region.
+    Offset:          The memory-index of the element with index-tuple = (all zeros)
+                     of a Tensor.  Offsets will always be >= 0 because they are to
+                     be used as an index into a memory-region, and negative
+                     index would be outside that region.
+
+    Padding:         This refers to the fact that when testing whether Patterns
+                     are broadcastable, if their num-axes are different we
+                     pad the shorter one by adding "1" on the left (in the public
+                     numbering).  So if we are doing an operation on Tensors
+                     with shapes [7 3 2]  and [3 2], we treat the second one
+                     as having shape [1 3 2].
 
-    Pattern:          An object representing the dims, strides and offset of a Tensor.
-                      (see struct Pattern).  The Pattern has
-                      an 'offset' which is the memory-index of the element of the Tensor
-                      whose index-tuple is all zeros; the Pattern also
-                      has a number of axes, `0 <= num_axes < KALDI_TENSOR_MAX_AXES`,
-                      and for each axis from 0 <= axis < num_axes, it has a dimension
-                      dim(axis) and stride(axis).
+    Pattern:         An object representing the dims, strides and offset of a Tensor.
+                     (see struct Pattern).  The Pattern has
+                     an 'offset' which is the memory-index of the element of the Tensor
+                     whose index-tuple is all zeros; the Pattern also
+                     has a number of axes, `0 <= num_axes < KALDI_TENSOR_MAX_AXES`,
+                     and for each axis from 0 <= axis < num_axes, it has a dimension
+                     dim(axis) and stride(axis).
 
-                      Search below for 'Valid Pattern' for properties a Pattern must
-                      (in most circumstances) satisfy.
+                     Search below for 'Valid Pattern' for properties a Pattern must
+                     (in most circumstances) satisfy.
 
 
     Pattern-tuple:    A pattern-tuple of a tuple of Patterns, say:  (P, Q),
@@ -303,10 +293,11 @@ namespace tensor {
     PyTorch-style broadcasting:  We use this name to refer to the fact that in
                       PyTorch, if an operation is done on two Tensors with
                       dims=[5 6] and dims=[6], the second one would be interpreted
-                      as having dims=[1 6].  That is: we pad with 1's on the left.
-                      Note: whenever we refer to broadcasting we include this feature;
-                      this glossary entry exists just to explain it, not to claim
-                      that we have two different versions of broadcasting.
+                      as having dims=[1 6].  That is: we pad with 1's on the left
+                      (See "Padding").  Note: whenever we refer to broadcasting
+                      we include this feature; this glossary entry exists just
+                      to explain it, not to imply that we have two different
+                      versions of broadcasting.
 
     Raxis-index:      We use the term "raxis-index", often just "raxis" for short,
                       to mean the index of an axis in the reversed, private numbering.
@@ -319,6 +310,28 @@ namespace tensor {
     Set-equivalent:   Two Patterns are set-equivalent if their memory-index-sets
                       are identical.
 
+
+    Shape of a Pattern: The vector of the dimensions of a Pattern: e.g. [] for
+                    a Pattern with num_axes = 0 or [2 3] for a Pattern with
+                    num-axes = 2.  Note: whenever we display dims vectors in
+                    square brackets or use "shape" without qualification, it
+                    implies we are displaying them in the public numbering.
+                    The "private shape" of a Pattern is the same in the
+                    private numbering, which is in the reverse order to
+                    the public numbering and which we'd display in curly
+                    braces, like {3, 2}.
+
+    Shape of a Pattern-tuple:  The shape of a Pattern-tuple is
+                    formed by taking the shapes of each Pattern in the tuple,
+                    extending them on the left with 1's as necessary to make
+                    them the same size, then taking the largest dim on each axis
+                    (i.e. the one that is not equal to 1, if they are
+                    different).  For example, for a Pattern-tuple of Patterns
+                    whose shapes were ([4 1 5], [6 1], [5]), the shape of the
+                    tuple would be [4 6 5].  (Note: the Patterns in a
+                    Pattern-tuple must be broadcastable, so if the dims are
+                    different, one of them must be 1.)
+
     Trivial axis:     An axis of a Pattern for which dim=1.  Such axes will have
                       stride=0 if the Pattern is valid.
 
diff --git a/src/tensor/storage.h b/src/tensor/storage.h
index 5d8fa004304..8358e9abca2 100644
--- a/src/tensor/storage.h
+++ b/src/tensor/storage.h
@@ -132,6 +132,8 @@ class Storage {
   // Destructor that frees any data held.
   ~Storage();
 
+  inline int64 Id() const { return id_; }  // creation tick; unique per Storage
+
  private:
 
   // Allocate the data.  It is an error to call this if data_ != NULL.
@@ -154,6 +156,10 @@ class Storage {
   // allocating the memory for the gradients.
   void *data_;
 
+  // The tick (see GetTick()) at which this Storage region was created; serves
+  // as a unique identifier.
+  int64 id_;
+
   bool zero_upon_allocation_;
 
   // num_bytes is the number of bytes in the region we have allocated

From 3c3d9a954cfe1f50b99dd6f0aed1d7bcc0506fa9 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Fri, 31 May 2019 13:28:23 -0400
Subject: [PATCH 041/163] [src] Lots of tensor changes

---
 src/tensor/array-ref.h                        |  19 +-
 src/tensor/context.h                          | 186 +++---
 src/tensor/deriv-map.h                        | 303 +++------
 src/tensor/linear-ops.cc                      | 154 +++++
 src/tensor/linear-ops.h                       | 604 +++++++++++++++++
 src/tensor/linear-special-ops.cc              | 154 +++++
 src/tensor/linear-special-ops.h               | 613 ++++++++++++++++++
 src/tensor/memory-checker.h                   |   1 +
 src/tensor/op.h                               | 412 ++----------
 ...-extra-utils.cc => pattern-tuple-utils.cc} | 134 +++-
 ...rn-extra-utils.h => pattern-tuple-utils.h} |  84 ++-
 src/tensor/pattern-utils.cc                   |   4 +
 src/tensor/pattern-utils.h                    |  88 +--
 src/tensor/pattern.cc                         |   7 +
 src/tensor/pattern.h                          |  39 +-
 src/tensor/tensor-common.h                    |   3 +-
 src/tensor/tensor-functions.h                 |  19 +-
 src/tensor/tensor-impl-utils.h                |   6 +-
 src/tensor/tensor-impl.h                      |   3 +-
 src/tensor/tensor-linear-ops.h                | 501 ++++++++++++++
 src/tensor/tensor-utils.h                     |  26 +-
 src/tensor/tensor.h                           |  36 +-
 22 files changed, 2599 insertions(+), 797 deletions(-)
 create mode 100644 src/tensor/linear-ops.cc
 create mode 100644 src/tensor/linear-ops.h
 create mode 100644 src/tensor/linear-special-ops.cc
 create mode 100644 src/tensor/linear-special-ops.h
 rename src/tensor/{pattern-extra-utils.cc => pattern-tuple-utils.cc} (91%)
 rename src/tensor/{pattern-extra-utils.h => pattern-tuple-utils.h} (90%)
 create mode 100644 src/tensor/tensor-linear-ops.h

diff --git a/src/tensor/array-ref.h b/src/tensor/array-ref.h
index e6de149e756..e37f40630c3 100644
--- a/src/tensor/array-ref.h
+++ b/src/tensor/array-ref.h
@@ -21,20 +21,19 @@
 #include <tensor/tensor-common.h>
 
 
-/**
-   This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
-*/
-
 namespace kaldi {
 namespace tensor {
 
 
-// Similar to llvm/PyTorch's ArrayRef, this is a lightweight way to store an
-// array (zero or more elements of type T).  The array is not owned here; it
-// will generally be unsafe to use an ArrayRef as other than a local variable.
-//
-// ArrayRef has only two members and it will probably make sense to pass it by
-// value most of the time.
+/**
+ Similar to llvm/PyTorch's ArrayRef, this is a lightweight way to store a const
+ array.  The data in the array is not owned here; it will generally be unsafe to use
+ an ArrayRef as other than a local variable.
+
+ ArrayRef has only two members and it will probably make sense to pass it by
+ value most of the time.  Its constructors via std::vector<T> and
+ std::initializer_list<T> will be the usual way of creating it.
+*/
 template <typename T>
 struct ArrayRef final {
   const T *data;
diff --git a/src/tensor/context.h b/src/tensor/context.h
index e899816d575..928254586b6 100644
--- a/src/tensor/context.h
+++ b/src/tensor/context.h
@@ -54,9 +54,9 @@ class Context {
 // object into functions that might want to execute Ops.
 class ExecutionContext: public Context {
 
-  // This function takes ownership of the Op in 'op', and then does something
-  // with it (e.g. does op->Do() for simple execution).
-  virtual void DoOp(Op *op);
+  /// This function executes the Op (op.Do()) and/or does something else
+  /// relating to taking derivatives.
+  virtual void Execute(const Op &op);
 
   virtual ~ExecutionContext() {}
 };
@@ -69,7 +69,7 @@ class ExecutionContext: public Context {
 // it easier to switch between autograd and no-autograd.
 class SimpleExecutionContext: public ExecutionContext {
 
-  virtual void DoOp(Op *op) {  op->Do();  delete op;  }
+  virtual void Execute(const Op &op) {  op.Do();  }
   virtual ~SimpleExecutionContext() {}
 };
 
@@ -79,17 +79,13 @@ class SimpleExecutionContext: public ExecutionContext {
 /**
    Execution context that you use while doing a forward computation, that
    executes the forward commands and stores the things required to later do the
-   backprop.  This is a simple version that does the 'normal case'; also
-   see GeneralBackpropExecutionContext.
+   backprop.  See its Backprop() function for how to execute the backprop.
 */
 class BackpropExecutionContext: public ExecutionContext {
 
   /**
      Constructor of BackpropExecutionContext from an existing DerivMap, which
-     might map, for instance, parameters to their derivatives.  Its contents are
-     *copied* (so that we have a base set of things that we know need
-     derivatives); we don't want to add entries to that 'deriv_map' because it
-     has a longer lifetime than we need, and would waste memory.
+     might map, for instance, parameters to their derivatives.
 
       @param [in] deriv_map   An existing DerivMap, to which the user will
                       likely have added the model parameters and anything
@@ -100,8 +96,10 @@ class BackpropExecutionContext: public ExecutionContext {
       @param [in] base_context  The base execution context, which would
                       normally be SimpleExecutionContext; it is used to
                       execute both the forward and backward commands.
-
-
+                      This class will store the pointer but will not take
+                      ownership; it is the user's responsibility to
+                      make sure it stays alive as long as this object is
+                      alive.
    */
   BackpropExecutionContext(const DerivMap &deriv_map,
                            ExecutionContext *base_context);
@@ -109,28 +107,6 @@ class BackpropExecutionContext: public ExecutionContext {
 
 
 
-  /**
-     Constructor taking just a Context (for default dtype and device).  You
-     shouldn't very often have to use this; the constructor taking a DerivMap
-     as well is more useful.
-  */
-  BackpropExecutionContext(const Context &context);
-
-
-  // Returns pointer to this deriv_map_ (still owned by this class.  This may be
-  // used, for instance, to do
-  // backprop_context.GetDerivMap()->Deriv(some_tensor) if we want to ensure
-  // that 'some_tensor' gets a derivative.  This shouldn't very often be
-  // necessary as the usually more correct way would be to to supply a DerivMap
-  // containing all the things whose derivatives you need, to the constructor.
-  // The pointer returned is still owned by this class-- don't delete it.
-  // Also, a subsequent call to FreeDerivMap() might free it, so you
-  // should probably use it immediately and not keep it around unless you
-  // know that FreeDerivMap() will not be called
-  // (note: FreeDerivMap is implicitly called by Backprop()).
-  DerivMap *GetDerivMap() { return deriv_map_.get(); }
-
-
   /**
      Does the backprop on a Tensor t; propagates the derivative back to whatever
      quantities you had added derivs for in the DerivMap passed to the constructor.
@@ -141,23 +117,50 @@ class BackpropExecutionContext: public ExecutionContext {
 
      If retain_info is false, it will delete deriv_map_ and clear backward_ops_.
      This is recommended in most cases; it's more memory efficient.
-   */
-  void Backprop(const Tensor &t,
-                bool retain_info = false);
-
 
+        @param [in] t    The Tensor that we are taking the derivative with
+                        respect to.
+        @param [in] deriv  The derivative w.r.t. t of the function we
+                        are taking the derivative of.  Might be just
+                        1.0.  Must satisfy Broadcastable(deriv, t).
+                        Note: deriv may have more axes than t, in which
+                        case the extra leading axes are required to
+                        have dimensions equal to deriv_map_->ExtraDims().
+                        If deriv_map_->ExtraDims() is nonempty,
+                        the num-axes of 'deriv' is required to equal
+                        `t.NumAxes() + deriv_map_->ExtraDims().size()`.
+   */
   void Backprop(const Tensor &t,
-                bool retain_info = false);
+                const Tensor &deriv) {
+    if (deriv_map_ == nullptr)
+      KALDI_ERR << "You cannot call Backprop twice on the same "
+          "BackpropExecutionContext";
+
+    // Delete deriv_map_.  This will help ensure that derivative
+    // quantities are deleted as soon as they are no longer needed
+    // (since once we delete the deriv_map_ and the ops referring
+    // to those derivative matrices, they will be garbage collected).
+    deriv_map_ = nullptr;
+
+    for (auto iter = backward_ops_.rbegin();
+         iter != backward_ops_.rend(); ++iter){
+      base_context_->Execute(**iter);
+      // Delete this op.  Deleting the ops also deletes the associated
+      // derivative matrices, via shared_ptr garbage collection.
+      *iter = nullptr;
+    }
+    backward_ops_.clear();
+  }
 
 
-  virtual void DoOp(Op *op) {
-    // TODO.
+  virtual void Execute(const Op &op) {
+    base_context_->Execute(op);
+    op.GetBackwardDerivOps(deriv_map_.get(), &backward_ops_);  // raw DerivMap*, not unique_ptr*
   }
 
   virtual ~BackpropExecutionContext() { }
 
 
-
  private:
   std::vector<unique_ptr<Op> > backward_ops_;
   unique_ptr<DerivMap> deriv_map_;
@@ -166,82 +169,57 @@ class BackpropExecutionContext: public ExecutionContext {
 };
 
 
-
-
-class AutogradContext: public Context {
- public:
-
+/**
+   Execution context that you use while doing a forward computation, that
+   executes the forward commands and also computes forward derivatives
+   w.r.t. something.
+*/
+class ForwardPropExecutionContext: public ExecutionContext {
 
   /**
+     Constructor of ForwardPropExecutionContext from an existing DerivMap, which
+     might map, for instance, some input x to dx/da, where a is the thing
+     we're taking the derivative of.
 
+      @param [in] deriv_map   An existing DerivMap, to which the user will
+                      likely have added the thing we are taking the derivative
+                      w.r.t. (e.g. some input where we want to see its
+                      effect on the computation).  deriv_map is *copied*,
+                      not held as a reference, by this object, to avoid
+                      a kind of memory leakage.
+      @param [in] base_context  The base execution context, which would
+                      normally be SimpleExecutionContext; it is used to
+                      execute both the forward and forward-derivative commands.
+                      This class will store the pointer but will not take
+                      ownership; it is the user's responsibility to
+                      make sure it stays alive as long as this object is
+                      alive.
    */
-  inline void DoSomething(std::unique_ptr<Op> op) {
+  ForwardPropExecutionContext(const DerivMap &deriv_map,
+                              ExecutionContext *base_context);
 
-  }
-
-
- private:
-  // The default DataType for newly created Tensors
-  DataType default_dtype_;
-  // The default Device for newly created Tensors
-  Device default_device_;
-
-
-  // If true, all Tensors will be tracked, even ones that are functions
-  // of Tensors that are not tracked.  (Note: the notion of 'tracked'
-  // is only meaningful in the context of a specific AutogradContext).
-  bool all_tracked_;
-
-
-  // If this is non-NULL, whenever we execute commands we will store
-  // the Ops needed for the backprop here.
-  std::shared_ptr<std::vector<std::unique_ptr<Op> > > backward_deriv_commands_;
 
-  // If this is non-NULL, whenever we execute commands we will store the
-  // corresponding Ops in this vector.  This would allow us to do backprop
-  // later, but it's not the normal pattern.  Note: these Ops will refer
-  // to variables used in the forward pass, so
-  std::shared_ptr<std::vector<std::unique_ptr<Op> > > forward_deriv_commands_;
-
-
-  // If this is non-NULL, whenever we execute commands we will store the
-  // corresponding Ops in this vector.  This would allow us to do backprop
-  // later, but it's not the normal pattern.  Note: these Ops will refer
-  // to variables used in the forward pass, so
-  std::shared_ptr<std::vector<Op> > forward_commands_;
+  virtual void Execute(const Op &op) {
+    base_context_->Execute(op);
+    std::vector<std::unique_ptr<Op> > ops;
+    op.GetForwardDerivOps(deriv_map_.get(), &ops);  // raw DerivMap*, as in GetDerivMap()
+    for (auto iter = ops.begin(); iter != ops.end(); ++iter)
+      base_context_->Execute(**iter);  // deref iterator, then the unique_ptr
+    // and let the ops in 'ops' go out of scope and get deleted.
+  }
 
 
-  bool store_backprop_;
+  // Returns pointer to this deriv_map_ (still owned by this class).
+  // May be used to query the derivative of some Tensor w.r.t. the
+  // input, e.g. forward_context.GetDerivMap()->DerivIfPresent(some_tensor).
+  DerivMap *GetDerivMap() { return deriv_map_.get(); }
 
-  // if deriv_mapper_ is non-NULL
-  std::shared_ptr<DerivMap> deriv_mapper_;
 
+};
 
 
 
-};
-
-// Once create a new Op, do something as in
-// std::function<void(Op*)>
-// my_func (op).
-// Could be a closure.
-//
-//
-// Examples:
-//   ExecuteOp().
-//   ExecuteAndStoreOp()  [closure with vector<Op>]
-//   StoreOp()
-//   ExecuteAndStoreBackwardOp()  [ closure with vector<Op> to store
-//                                  backward pass, if tracked. ]
-//   ExecuteAndForwardOp()   [Executes the forward function and also,
-//                            if this op is tracked, the forward autodiff;
-//                            that has its own AutogradContext.
-//
-//
-/
 
-Device GetDefaultDevice();
-void SetDefaultDevice(Device device);
 
 // Mechanism to set the default device within a scope by constructing a variable
 // that exists only within that scope.
diff --git a/src/tensor/deriv-map.h b/src/tensor/deriv-map.h
index 6c69ac1d59a..c5c8fc8e89a 100644
--- a/src/tensor/deriv-map.h
+++ b/src/tensor/deriv-map.h
@@ -33,33 +33,85 @@ namespace tensor {
 
 
 /*
+  class DerivMap stores and updates a map from a Tensor to some derivative
+  quantity related to that Tensor.  We store this map separately from
+  the Tensor itself because this seems to generalize more naturally
+  to things like higher-order derivative, and helps keep the code
+  easy to understand.
+
+  Note: the memory for the derivatives is actually allocated for the whole
+  Storage region underlying a Tensor, so if we call Deriv() to create the
+  derivative for some Tensor, all Tensors sharing the same underlying storage
+  region will now also have an entry in the DerivMap.
+
   Derivative shape:
 
-  For a quantity of shape, say, [ 2 3 ], the derivative will have the exact
-  same shape [ 2 3 ] if ExtraDim() == 0, but if ExtraDim == x with x > 0,
-  the derivative will have the shape [ x 2 3 ].  This makes it possible
+  For a quantity of shape, say, [ 2 3 ], the derivative will normally have the exact
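+  same shape, e.g. [ 2 3 ].  But if the extra_dims given to the constructor are
+  nonempty, the shape is first padded with leading 1's up to axis_offset axes
+  and then the extra_dims are prepended.  For example, if extra_dims = [ x ]
+  with x > 0, and axis_offset = 5,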
+  the derivative will have the shape [ x 1 1 1 2 3 ].  This makes it possible
   to compute derivatives w.r.t. vector-value quantities (of course, this
   would be more expensive).
-
 */
 class DerivMap {
  public:
-  DerivMap(const DerivMap &other);
+  /** Construct a new, empty DerivMap.
+       @param [in] context  Context that determines the dtype and device
+                     for derivatives we create.
+  */
+  DerivMap(const Context &context);
+
+
+  /**
+     Constructor where you can provide a vector of extra dimensions that the
+     derivatives will have (ordered as in the public numbering, in which
+     they will appear before the dimensions of the things used in the
+     forward pass).  This is for when you are taking the derivative w.r.t.
+     a more-than-scalar-valued quantity (in backward mode) or taking the
+     derivative of a more-than-scalar-valued quantity w.r.t. things
+     (in forward mode).  This should rarely be used.
+
+        @param [in] context  Object that sets the default device and dtype
+        @param [in] extra_dims   Extra dimensions, ordered as in the public
+                       numbering, that the derivative has, e.g. in reverse-mode
+                       autograd (backprop) this is used when we are taking
+                       derivatives w.r.t a non-scalar quantity.
+        @param [in] axis_offset  The user should set this to a number >= the
+                       largest num_axes of any of the Tensors with which
+                       we will call Deriv() or DerivIfPresent() with this
+                       object.  (Note: any matrix multiplication implicitly
+                       adds an axis, so for example if you are doing matrix
+                       multiplication on Tensors with 3 axes, you should
+                       make sure axis_offset is at least 4) axis_offset
+                       ensures that the 'extra_dims' always appear
+                       at the same position regardless of the num_axes
+                       of the Tensor we called Deriv() with.
+                       Technically, axis_offset is only an axis offset in
+                       the private numbering; in the public numbering it's the
+                       num_axes to which we pad the Tensors supplied to Deriv()
+                       before prepending extra_dims.
+
+     Example: if extra_dims = [2 3] and axis_offset = 4, and someone calls
+     Deriv() with a Tensor of shape [7 8], the derivative Tensor will have
+     shape [2 3 1 1 7 8].  (Note: any unused/trivial axes will have no effect
+     on the actual computation).
+  */
+  DerivMap(const Context &context,
+           ArrayRef<int32> extra_dims,
+           int32 axis_offset);
 
-  // Default constructor, constructs an empty DerivMap taking the derivative
-  // w.r.t a scalar (or of a scalar w.r.t. the things in the forward pass).
-  DerivMap();
+
+  /**
+     Copy constructor.  This is expected to be used in typical neural net
+     training workflows, where we create a DerivMap for the parameters, and then
+     use it with the copy constructor to initialize a fresh DerivMap that will
+     also store the derivatives for the temporary quantities.
+  */
+  DerivMap(const DerivMap &other);
 
 
-  // Constructor where you can provide a vector of extra dimensions that the
-  // derivatives will have (ordered as in the public numbering, in which
-  // they will appear before the dimensions of the things used in the
-  // forwardpass).  This is for when you are taking the derivative w.r.t.
-  // a more-than-scalar-valued quantity (in backward mode) or taking the
-  // derivative of a more-than-scalar-valued quantity w.r.t. things
-  // (in forward mode).
-  // This should rarely be used.
-  DerivMap(conststd::vector<int32> &extra_dims);
 
   // Returns the derivative Tensor for Tensor 't', if one exists already; else
   // NULL.  (To explain return type, see "Optional Tensor" in tensor.h).
@@ -83,16 +135,6 @@ class DerivMap {
   Tensor Deriv(const Tensor &t);
 
 
-  /**
-     Must be called when the DerivMap is empty; set the dimension of the
-     quantity that we are computing the derivative of (or with respect to).
-     Would be 0 in most situations, meaning the derivative is w.r.t.  a scalar,
-     but if it is >0, the derivatives returned by this DerivMap will have an
-     extra dmension (search for "Derivative shape" above).
-  */
-  void SetExtraDim(int32 extra_dim);
-
-
   /**
      Returns a value that is always positive and normally 1, which is the product of extra_dims_.
   */
@@ -102,13 +144,22 @@ class DerivMap {
 
  private:
 
-  // extra_dims_ is the shape (in the public numbering) of the thing that we are taking
-  // the derivative of (in backward mod) or with respect to (in forward mode).
-  // It would normally be the empty vector, meaning we're taking the derivative
-  // w.r.t. a scalar.  All elements must be positive.
+  Context context_;  // Dictates default dtype and device.
+
+  // extra_dims_ is the shape (in the public numbering) of the thing that we are
+  // taking the derivative of (in backward mode) or with respect to (in forward
+  // mode).  It would normally be the empty vector, meaning we're taking the
+  // derivative w.r.t. a scalar.  All elements must be positive.
   std::vector<int32> extra_dims_;
-  // extra_dims_prod_ is the product of the elements of extra_dims_.
-  // It will normally be 1.
+
+  // determines where we place the extra_dims_ (in the private numbering); or,
+  // in the public numbering, what num-axes we pad the arg to Deriv to, before
+  // prepending the dims in extra_dims_.   See example given in the doc
+  // for the 3-arg constructor.
+  int32 axis_offset_;
+
+  // extra_dims_prod_ is the product of the elements of extra_dims_.  It will
+  // normally be 1.
   int64 extra_dims_prod_;
 
 
@@ -120,194 +171,14 @@ class DerivMap {
     std::weak_ptr<Storage> deriv_storage;
   };
 
-  // The key in this map is the int64 tick value when the src Storage
-  // object was created (see its Id() function).
-  // The value
+  // The key in this map is the int64 tick value when the src_storage object was
+  // created (see its Id() function).  (We don't use its memory address, since
+  // those can be re-used).
   std::unordered_map<int64, DerivRecord> map_;
-
-
-};
-
-
-// class Context contains various configurations that we will sometimes need
-// when we do operations on Tensors.  Things like the default data type, the
-// debug mode, and so on.  This will be passed around
-class Context {
-
-};
-
-class AutogradContext: public Context {
- public:
-
-
- private:
-  DataType default_dtype_;
-  Device default_device_;
-
-
-  std::shared_ptr<
-
-
-  bool store_ops_;
-
-};
-
-
-Device GetDefaultDevice();
-void SetDefaultDevice(Device device);
-
-// Mechanism to set the default device within a scope by constructing a variable
-// that exists only within that scope.
-class WithDeviceAs {
- public:
-  // Example:
-  // {
-  //   WithDeviceAs _(kCudaDevice);
-  //   // code in this block uses this default.  the variable
-  //   // name is _ because we don't need to access it.
-  // }
-  inline WithDeviceAs(DeviceType device_type):
-      prev_default_(GetDefaultDevice()) {
-    SetDefaultDevice(Device(device_type));
-  }
-  inline WithDeviceAs(Device device):
-      prev_default_(GetDefaultDevice()) {
-    SetDefaultDevice(device);
-  }
-  ~WithDeviceAs() { SetDefaultDevice(prev_default_); }
-
- private:
-  Device prev_default_;
 };
 
 
 
-DataType GetDefaultDtype();
-void SetDefaultDtype(DataType dtype);
-
-class WithDtypeAs {
- public:
-  // Example:
-  // {
-  //   WithDtypeAs _(kDoubleDtype);
-  //   // code in this block uses this default.  the variable
-  //   // name is _ because we don't need to access it.
-  // }
-  inline WithDtypeAs(DataType dtype):
-      prev_default_(GetDefaultDtype()) {
-    SetDefaultDtype(dtype);
-  }
-  ~WithDtypeAs() { SetDefaultDtype(prev_default_); }
-
- private:
-  DataType prev_default_;
-};
-
-
-
-// struct TensorOptions is used as an arg for some constructors
-// when creating Tensors and Variables; it allows flexibility
-// in specifying the device and/or dtype.  See the examples
-// shown where constructors of Tensor or Variable are declared.
-struct TensorOptions {
-  DataType dtype;
-  Device device;
-
-  TensorOptions(): dtype(GetDefaultDtype()),
-                   device(GetDefaultDevice()) { }
-  TensorOptions(DataType dtype):
-      dtype(dtype), device(GetDefaultDevice()) { }
-  TensorOptions(Device device):
-      dtype(GetDefaultDtype()), device(device) { }
-  TensorOptions(DeviceType device_type):
-      dtype(GetDefaultDtype()), device(device_type) { }
-  TensorOptions(DataType dtype, Device device):
-      dtype(dtype), device(device) { }
-  TensorOptions(DataType dtype, Device device_type):
-      dtype(dtype), device(device_type) { }
-  TensorOptions(const TensorOptions &other):
-      dtype(other.dtype), device(other.device) { }
-};
-
-
-// Global variable, initialized from zero, that is used in GetTick().
-// This is defined in tensor-settings.cc.
-extern int64 g_tick_counter;
-inline int64 NextTick() { return ++g_tick_counter; }
-
-
-// debug_mode activates code that checks for invalidated data in the backprop
-// pass; see "Invalidated:" in glossary in tensor.h.
-// Don't access this variable directly,
-extern bool debug_mode;     // Do not access directly!
-extern int64 debug_start_tick;   // Do not access directly!
-
-inline bool DebugMode() {
-  return debug_mode;
-}
-inline void SetDebugMode(bool b) {
-  if (!debug_mode)
-    debug_start_tick = NextTick();
-  debug_mode = b;
-}
-/**
-   Returns the tick at which debug mode most recently changed from false to
-   true.
- */
-inline int64 DebugTick() {
-  KALDI_PARANOID_ASSERT(debug_mode);
-  return debug_start_tick;
-}
-
-class WithDebugModeAs {
- public:
-  // Example:
-  // {
-  //   WithDebugModeAs _(true);
-  //   // code in this block uses debug mode.
-  //   // variable name is _ because we won't use it.
-  // }
-  inline WithDebugModeAs(bool b):
-      prev_default_(DebugMode()) {
-    SetDebugMode(b);
-  }
-  ~WithDebugModeAs() { SetDebugMode(prev_default_); }
-
- private:
-  bool prev_default_;
-};
-
-
-
-// allow_grad means that gradient tracking is allowed; allow_grad = true
-// is the normal case, and means that if gradient tracking is required
-// (e.g. if the user created a Variable with requires_grad = true, and we do
-// operations that depend on it), then we'll track gradients.
-// It is our way to implement an equivalent of PyTorch's `with torch.no_grad()`.
-// Do not access this variable directly; use AllowGrad() and
-extern thread_local bool allow_grad;
-inline bool AllowGrad() { return allow_grad; }
-inline void SetAllowGrad(bool b) { allow_grad = b; }
-
-
-class WithNoGrad {
- public:
-  // Example:
-  // {
-  //   WithNoGrad _;
-  //   // code in this block has gradient tracking disabled.
-  //   // variable name is _ because we won't use it.
-  //
-  // }
-  inline WithNoGrad():
-      prev_default_(AllowGrad()) {
-    SetAllowGrad(false);
-  }
-  ~WithNoGrad() { SetAllowGrad(prev_default_); }
- private:
-  bool prev_default_;
-};
-
 
 }  // namespace tensor
 }  // namespace kaldi
diff --git a/src/tensor/linear-ops.cc b/src/tensor/linear-ops.cc
new file mode 100644
index 00000000000..ae4defeae44
--- /dev/null
+++ b/src/tensor/linear-ops.cc
@@ -0,0 +1,154 @@
+// tensor/linear-ops.cc
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tensor/linear-ops.h"
+
+namespace kaldi {
+namespace tensor {
+
+void AddOp::Expand(std::vector<std::unique_ptr<Op> > *ops) const {
+
+  Pattern a_pattern = a_.Impl().pattern,
+      b_pattern = b_.Impl().pattern;
+  NormalizePatterns({a_pattern, b_pattern});
+
+  Tensor a(a_), b(b_);
+
+  if (a_pattern != a_.Impl().pattern)
+    a = WithPattern(a, a_pattern);
+  if (b_pattern != b_.Impl().pattern)
+    b = WithPattern(b, b_pattern);
+
+  /*
+    The case-statement values in the switch statement below may be
+    interpreted in groups of 3 hex characters, are 0xAAABBBCCC,
+    pertaining to Tensors a, b and c respectively.  See
+    GetPatternCode() in pattern-utils.h for documentation on
+    the meanings of the values and our notation with X,x,1.
+
+  */
+  int64 combined_code = CombineCodes(a_pattern.GetCode(),
+                                     b_pattern.GetCode());
+
+  // We are doing a += b.
+  switch(combined_code) {
+    // A scalar plus a scalar
+    case 0x000000000:
+
+
+
+}
+
+// Dispatches to the CPU or (when compiled with HAVE_CUDA) GPU implementation
+// according to a.device.device_type; other device types go to KALDI_ERR.
+inline static void AddProductScalar3(
+    float alpha, float beta,
+    const TensorImpl &a, const TensorImpl &b, const TensorImpl *c) {
+  switch (a.device.device_type) {
+    case kCpuDevice:
+      AddProductScalar3Cpu(alpha, beta, a, b, c);
+      return;
+#ifdef HAVE_CUDA
+    case kGpuDevice:
+      AddProductScalar3Gpu(alpha, beta, a, b, c);
+      return;
+#endif
+    default:
+      KALDI_ERR << "Unsupported device type " << a.ToString();
+  }
+}
+// Dispatches on the combined pattern codes of a, b and *c to a specialized
+// AddProduct* routine.  (Presumably c := alpha a*b + beta c -- TODO confirm.)
+void AddProduct(float alpha, float beta,
+                const TensorImpl &a, const TensorImpl &b, const TensorImpl *c){
+
+  if (a.pattern.code < b.pattern.code) {
+    // Ensure, via a recursion, that a.pattern.code >= b.pattern.code.
+    // This avoids us having to test for the swapped versions of the patterns.
+    AddProduct(alpha, beta, b, a, c);
+    return;
+  }
+
+  CheckDeviceAndDtype(a, b, *c);
+
+
+  int64 combined_code = CombineCodes(a.pattern.code, b.pattern.code,
+                                     c->pattern.code);
+
+  /*
+    The case-statement values in the switch statement below may be
+    interpreted in groups of 3 hex characters, are 0xAAABBBCCC,
+    pertaining to Tensors a, b and c respectively.  See
+    GetPatternCode() in pattern-utils.h for documentation on
+    the meanings of the values and our notation with X,x,1.
+   */
+  switch(combined_code) {
+    case 0x000000000:
+      // () * () -> ()
+      // scalar * scalar -> scalar
+      AddProductScalar3(alpha, beta, a, b, c);
+      return;
+    case 0x101000101:
+      //  (X) * ()-> (X)
+      // vector * scalar -> vector
+      AddProductVecScalarVec(a, b, c);
+      return;
+    case 0x101101101:
+      // (X) * (X) -> (X)
+      // vector .* vector -> vector
+      AddProductVec3(a, b, c);
+      return;
+    case 0x103101202:
+      // (x,X) * (X)  -> (X,1)
+      // vector * matrix -> vector.unsqueeze(-1)
+      AddProductMatVecVec(a, b, c);
+      return;
+    case 0x203101202:
+      // (X,x) * (X) -> (X,1)
+      // transposed-matrix * vector -> vector.unsqueeze(-1)
+      AddProductTmatVecVec(a, b, c);
+      return;
+    case 0x202101103:
+      // (X,1) * (X) -> (x,X)
+      // vector * vector -> matrix (outer product)
+      AddProductVec2Mat(a, b, c);
+      return;
+    // NOTE(review): the other AddProduct* helpers above are called without
+    // alpha and beta; their signatures are not visible here -- confirm.
+    default:
+      break;
+
+  }
+
+  // If we reached this point, it means we could
+  // not handle this request with any of the basic operations above.
+  // NOTE(review): the fallback below is unfinished; it only pads and
+  // compresses the patterns and does not yet compute anything.  PadAxes
+  // presumably should act on the *_temp copies rather than const inputs.
+  SubTensor a_temp(a), b_temp(b), c_temp(*c);
+
+  PadAxes(&(a.pattern), &(b.pattern), &(c->pattern));
+
+  CompressPatterns({&a_temp, &b_temp, &c_temp});
+}
+
+
+
+}  // namespace kaldi
+}  // namespace tensor
diff --git a/src/tensor/linear-ops.h b/src/tensor/linear-ops.h
new file mode 100644
index 00000000000..3eade973772
--- /dev/null
+++ b/src/tensor/linear-ops.h
@@ -0,0 +1,604 @@
+// tensor/linear-ops.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR__LINEAR_OPS_H_
+#define KALDI_TENSOR__LINEAR_OPS_H_ 1
+
+#include "tensor/tensor.h"
+
+
+// Note: user-level code will not interact directly with these Ops.  See
+// tensor-linear.h for the user-level code.
+namespace kaldi {
+namespace tensor {
+
+
+/**
+   Add operation taking two Tensors (T), i.e. a += b, which may include
+   summation and/or broadcasting depending on the dimensions of a and b
+
+   May not be used if a and b overlap.
+*/
+class AddOp: public Op {
+ public:
+  // Stores copies of a and b; asserts no overlap and broadcast-compatibility.
+  AddOp(const Tensor &a, const Tensor &b):
+      a_(a), b_(b) {
+    KALDI_ASSERT(!Overlap(a, b) &&
+                 BroadcastableAndCompatible(a, b));
+  }
+  AddOp(const AddOp &other):
+      a_(other.a_), b_(other.b_) { }
+  // NOTE(review): Op must be declared before this class (it is currently
+  // declared further down in this header).
+  int32 Properties() { return 0 ; }  // Not concrete.
+
+  Op *Copy() const override {
+    return new AddOp(*this);
+  }
+
+  // Defined in linear-ops.cc; this function works out the more concrete
+  // structure (e.g. vectors, matrices, things like that) and chooses the
+  // appropriate implementation
+  void Expand(std::vector<std::unique_ptr<Op> > *ops) const override;
+  // Reverse-mode: for a += b, appends the Op doing deriv(b) += deriv(a).
+  void GetBackwardDerivOps(
+      DerivMap *map,
+      std::vector<std::unique_ptr<Op> > *ops) const override {
+    std::shared_ptr<TensorImpl> b_deriv = map->DerivIfPresent(b_);
+    if (b_deriv == nullptr)  // b wasn't tracked, so a won't be.
+      return;
+    // else return the Op corresponding to:
+    // b_deriv_ += a_deriv_.
+    ops->push_back(std::unique_ptr<Op>(new AddOp(AsTensor(b_deriv),
+                                                 map->Deriv(a_))));
+
+  }
+  void GetForwardDerivOps(
+      DerivMap *map,
+      std::vector<std::unique_ptr<Op> > *ops) const override {
+    std::shared_ptr<TensorImpl> b_deriv = map->DerivIfPresent(b_);
+    if (b_deriv == nullptr)  // b wasn't tracked, so a won't be.
+      return;
+    // else return the Op corresponding to:
+    // a_deriv_ += b_deriv_.
+    ops->push_back(std::unique_ptr<Op>(new AddOp(map->Deriv(a_),
+                                                 AsTensor(b_deriv))));
+  }
+
+
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+
+
+/**
+   Assign operation, doing
+      b := a,
+   which may actually do summation and/or broadcasting depending on the
+   dimensions of b and a.  Formally, and with reference to the notation
+   in pattern.h, we can describe its operation as follows:
+       - Set all elements of b to zero
+       - For each index-tuple i in the index-tuple-set of b, b[i] += a[i].
+   Must not be used if b and a overlap.
+
+   "Assign" means that this is the first time we are setting the memory
+   involved, except possibly for things that don't generate any derivative
+   for various reasons.
+
+   See also SetOp, which is for when the memory might previously have
+   been written to by something differentiable.]
+
+   Note: in the backprop for AssignOp, we can do Unset() after, which
+   means the memory concerned must no longer be read from.
+*/
+class AssignOp: public Op {
+ public:
+  // Stores copies of a and b; asserts no overlap and broadcast-compatibility.
+  AssignOp(const Tensor &a, const Tensor &b):
+      a_(a), b_(b) {
+    KALDI_ASSERT(!Overlap(a, b) &&
+                 BroadcastableAndCompatible(a, b));
+  }
+  AssignOp(const AssignOp &other):
+      a_(other.a_), b_(other.b_) { }
+  // NOTE(review): Set()'s signature is not visible here; confirm the call.
+  void Do() const override {
+    Set(a_, &b_);  // b := a
+  }
+  Op *Copy() const override {
+    return new AssignOp(*this);
+  }
+  // Reverse-mode: for b := a, appends the Op doing deriv(a) += deriv(b).
+  void GetBackwardDerivOps(
+      DerivMap *map,
+      std::vector<std::unique_ptr<Op> > *ops) const override {
+    std::shared_ptr<TensorImpl> a_deriv = map->DerivIfPresent(a_);
+    if (a_deriv == nullptr)  // a wasn't tracked, so b won't be.
+      return;
+    // Return the Op corresponding to:
+    // a_deriv_ += b_deriv_.  (AddOp(x, y) is documented as x += y.)
+    ops->push_back(std::unique_ptr<Op>(new AddOp(AsTensor(a_deriv),
+                                                 map->Deriv(b_))));
+  }
+  // Forward-mode: for b := a, appends the Op doing deriv(b) := deriv(a).
+  void GetForwardDerivOps(
+      DerivMap *map,
+      std::vector<std::unique_ptr<Op> > *ops) const override {
+    std::shared_ptr<TensorImpl> a_deriv = map->DerivIfPresent(a_);
+    if (a_deriv == nullptr)  // a wasn't tracked, so b won't be.
+      return;
+    // else return the Op corresponding to:
+    // b_deriv_ := a_deriv_.
+    ops->push_back(std::unique_ptr<Op>(new AssignOp(AsTensor(a_deriv),
+                                                    map->Deriv(b_))));
+  }
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+
+
+
+
+/**
+   class Op is a base-class for objects that are created when we do operations
+   on Variables.  The important thing to know here is that the Variables in
+   question will always have been allocated with particular dimensions,
+   and possibly even contain defined values, before we get to the Op.
+   Examples of Ops include,
+      a := b * c
+      a += b
+      a *= b
+   where the interpretation of the commands above will depend on the
+   dimensions of the Tensors involved.
+
+   Notice that all the member functions of class Op are `const`, i.e. they
+   shouldn't change this class (although of course they may change the
+   underlying Tensor data).  This is to remind users that Ops are supposed
+   to be reusable, and calls to this object shouldn't affect the behavior
+   of subsequent calls, except to the extent that the underlying Tensor
+   data has been changed.
+ */
+class Op {
+ public:
+
+  /**
+     Do whatever it is that this Op does (e.g. execute the command `a += b`,
+     if that was what this Op did)
+   */
+  virtual void Do() const;
+
+  /**
+     Return a copy of this object.  (This won't be needed very often but might
+     possibly be needed in the context of computing higher-order derivatives).
+  */
+  virtual Op *Copy() const;
+
+  /**
+     This is for forward-mode automatic differentiation (a rarely-used thing).
+     It appends to 'ops' the commands corresponding to the forward-mode
+     automatic differentiation w.r.t. this Op.
+
+       @param [in,out] 'map' is the map that maps from tensors to the
+             corresponding derivative values.  May be modified by adding
+             new key/value pairs.
+       @param [out] ops  This function will *append* to `ops` the
+             commands for computing the derivatives associated with
+             this Op in forward-mode automatic differentiation.  If none
+             of the inputs to the Op were tracked w.r.t. `map`,
+             nothing will be done.
+
+     Example: if the command was "a += b", the derivative operation would
+     be: deriv(a) += deriv(b).  In most cases these Ops would be executed
+     immediately and then deleted.
+   */
+  virtual void GetForwardDerivOps(DerivMap *map,
+                                  std::vector<std::unique_ptr<Op> > *ops) const;
+
+
+
+  /**
+     This is for reverse-mode automatic differentiation (the normal type of
+     autograd).
+
+       @param [in,out] map   This object maps from tensors to the
+                       corresponding derivative values.  It may be changed by
+                       adding new elements to the map, if its Deriv() function
+                       is called.
+       @param [out]    ops  This function may *append* to 'ops' the commands
+                       used in the reverse-mode automatic differentiation.
+                       (Note: nothing will be appended if none of the inputs
+                       to the Op were already tracked w.r.t. 'map'.)
+
+     Example: if the command was "a += b * c", the operations added to
+     'ops' would correspond to `deriv(b) += deriv(a) * c` and
+     `deriv(c) += deriv(a) * b`.
+  */
+  virtual void GetBackwardDerivOps(DerivMap *map,
+                                   std::vector<std::unique_ptr<Op> > *ops) const;
+
+
+  /** Destructor.  It's important for efficiency of memory use to destroy Ops as
+      soon as you won't need them any more, because it may trigger the freeing
+      of Tensors and hence Storage regions.
+  */
+  virtual ~Op();
+};
+
+
+
+class Op {
+
+  Op(): tick_(GetTick()) { }
+
+  /// InputIteratorBegin() and InputIteratorEnd() form the begin and
+  /// end points of a list of Variables that were inputs of this Op
+  /// but were not outputs.  This is used by the backprop code when finding
+  /// the topological order of ops.  (Note: output variables themselves
+  /// refer to Ops, so if we included them in the input list we'd
+  /// get a cycle in the graph).  These Variables are expected to
+  /// still have their graph information (i.e. sub-classes of class Op
+  /// class must not call RemoveGraph() on the members of this list).
+  virtual Op *DepIteratorBegin() = 0;
+  virtual Op *DepIteratorEnd() = 0;
+
+
+
+  // This number >= 0 is used to determine the order of Ops in a graph; each
+  // time we generate an Op we increment a global counter.  Doing it this way,
+  // rather than via topological sorting, is simpler.
+  int64 GetTimestamp() const final { return tick_; }
+
+  virtual void Backprop();
+
+ protected:
+
+  /**
+     The time (`GetTick()`) at which this Op was created; should be set
+     in child classes by doing:
+      `tick_ = GetTick()`
+     as the last statement of the constructor.   (This ensures the
+     tick is later-numbered than any ticks stored in the ChangeTracker
+     code by operations called from the constructor.)
+  */
+  int64 tick_;
+
+
+  /*
+    This function intended to be called from the Backprop() routines
+    of child classes, for example:
+       ` if (DebugMode()) {  CheckTensorTime(*a_);  } `
+    This will die if the memory underlying the Tensor being checked has been
+    modified more recently than tick_.
+  */
+  inline void CheckTensorTime(const Tensor &tensor) {
+    if (DebugMode()) {
+    }
+  }
+
+
+
+
+};
+
+
+template <class OpImpl>
+class OpPointer {
+
+  std::shared_ptr<OpImpl>
+
+}
+
+
+
+/**
+   This is a special version of base-class Op that is created when
+   any SharedGrad is allocated for a non-leaf Variable.  Its purpose
+   is to ensure that, when we get to this Op in the backprop, we deallocate
+   the data underlying the gradient Tensor (so we don't keep gradient
+   Tensors around for longer than is needed).
+*/
+class DeallocateOp: public Op {
+
+  // This operator has no dependencies as it will be created when a SharedGrad
+  // is first initialized, when no Ops have been done on it.
+  Op *DepIteratorBegin() override { return NULL; }
+  Op *DepIteratorEnd() override { return NULL; }
+
+  void Backprop() override {
+    if (auto s = tensor_to_deallocate_.lock())
+      ZeroDeallocating(s.get());
+  }
+
+ private:
+  // Since we just want to deallocate its underlying data, there is no point
+  // increasing its ref-count; we can just shrug our shoulders if it has
+  // already been deleted.
+  std::weak_ptr<Tensor> tensor_to_deallocate_;
+};
+
+
+/**
+   A slight simplification of class UnaryOp for cases where it's
+   done in-place.
+ */
+class InPlaceUnaryOp: public Op {
+
+};
+
+
+class UnaryOp: public Op {
+
+  //
+  UnaryOp(const Variable &input, const Variable &output) {
+    if
+
+
+
+    if (SameVariable(input, output)) {
+
+    } else {
+    }
+  }
+
+ public:
+
+  std::shared_ptr<Op> op1_;
+  std::shared_ptr<Op> op2_;
+
+
+
+
+}
+
+class GenericOp: public Op {
+
+  // GenericOp is a child of class Op that is intended as a generic base-class
+  // for expressions.
+
+
+
+ protected:
+  // Constructor, to be used from child classes.  This base-class takes care
+  // of storing the list of input Variables for purposes of tracing dependencies;
+  //
+  //  @param [in] input_vars  The list of input Variables (meaning: Variables
+  //                   that are inputs to, but not outputs of, i.e. not modified
+  //                   by, this Op).
+  //  @param [in] output_var  The output Variable of this Op, i.e. the Variable
+  //                   which is modified or set by it.  We may provide another
+  //                   constructor taking ArrayRef<Variable> in this position,
+  //                   as and when we need to support Ops that operate on
+  //                   multiple output Variables.
+  void Op(const ArrayRef<Variable> &input_vars,
+          const Variable &output_var);
+
+
+  // TODO: maybe have a constructor of Op that takes an ArrayRef of the inputs
+  // that are not also outputs?  Could use that for graph traversal.
+
+ private:
+
+  // num_inputs_ is the number of base Variables that are the base Variables of
+  // inputs of this Op (but not of outputs).  These are stored in the
+  // array 'inputs_'.
+
+  // inputs_ is a pointer to an array of shared_ptr<Variable> of size num_inputs_, which
+  // will be be allocated by new [] in the constructor and deleted by delete []
+  // in the destructor.
+
+  // This is a list of the Op-input-nodes (see glossary in tensor.h for explanation).
+  // We don't store the Op-output-nodes here; instead, they refer to this Op in
+  // their op_lists.
+  // (We don't store the Node(s) that is(are) the outputs of the Op here; its own
+  // op_list refers to this Op).
+  std::shared_ptr<Node> *inputs_;
+
+  int32 num_inputs_;
+
+  // If num_inputs_ is 1, then inputs_ is
+  void *inputs_;
+
+  int64 n_;  // initialized from the counter when this object is created.
+  std::shared_ptr<Op> tail_;  // TODO: make it unique_ptr?
+ protected:
+  // Return true if this is not the last Op in the list of Ops attached to this
+  // base Variable (can be useful to know whether we need bother to scale the
+  // derivative in a scaling operation, for instance).
+  bool HasTail() const { return tail_ != nullptr; }
+};
+
+
+class AddToOp: public Op {
+ public:
+
+  // This Op corresponds to the computation:
+  //   \f$  b  :=  alpha a  +   beta b.  \f$
+  // with broadcasting or summation depending on the dimensions
+  // involved.  Alpha and beta are constants, and differentiation w.r.t. them is
+  // not supported (you wouldn't reach this code if a or b were actual
+  // variables.)
+  //
+  // The Op is only constructed if b.Tracked() (which it would normally if
+  // a.Tracked()).
+  AddToOp(float alpha, float beta,
+          const Variable &a, const Variable &b):
+      Op({a}),
+      alpha_(alpha),
+      beta_(beta),
+      a_data_(a.GetData()),
+      a_grad_(a.GetGradIfPresent()),
+      b_data_(b.GetData()),
+      b_grad_(b.GetGrad()) {
+
+    Add(alpha, beta, *a_data_, b_data_.get());
+  }
+
+  // Backprop: a_grad += alpha * b_grad (if a is tracked), then b_grad *= beta.
+  void Backward() {
+    // Do: a_grad += alpha * b_grad.  (Args mirror Add() above; TODO confirm.)
+    if (a_grad_ != nullptr)
+      AddTo(alpha_, 1.0, *b_grad_, a_grad_.get());
+
+    if (beta_ != 1.0)
+      Scale(beta_, b_grad_.get());
+  }
+
+ private:
+
+  float alpha_;
+  float beta_;
+
+  // We hold onto all inputs that are not also outputs
+  // (here just a_) for dependency tracking.
+  Variable a_;
+
+  std::shared_ptr<Node> a_node_;
+
+  std::shared_ptr<Tensor> a_data_;
+  // a_grad_ will be NULL if a was not tracked.
+  std::shared_ptr<Tensor> a_grad_;
+  std::shared_ptr<Tensor> b_data_;
+  std::shared_ptr<Tensor> b_grad_;
+
+  Variable b_;
+  bool must_scale_b_grad_;
+
+};
+
+
+class AssignOp: public Op {
+ public:
+
+  // This Op corresponds to the computation:
+  //   \f$  b := a  \f$
+  // with broadcasting or summation depending on the dimensions.
+  //
+  // Constructing this Op will make b tracked if it was already.
+  AssignOp(const Variable &a, const Variable &b):
+      Op({a}),
+      alpha_(1.0),  // b := a is the alpha = 1, beta = 0 case of
+      beta_(0.0),   // b := alpha a + beta b; Backward() reads both.
+      a_data_(a.GetData()),
+      a_grad_(a.GetGradIfPresent()),
+      b_data_(b.GetData()),
+      b_grad_(b.GetGrad()) {
+    Copy(a_data_, b_data_);
+    tick_ = GetTick();  // kept as the constructor's last statement.
+  }
+
+  void Backward() {
+    // Do: a_grad += alpha * b_grad  (skipped when a has no grad).
+    if (a_grad_ != nullptr)
+      AddTo(alpha_, 1.0, *b_grad_, a_grad_.get());
+    // Do: b_grad *= beta.
+    if (beta_ != 1.0)
+      Scale(beta_, b_grad_.get());
+  }
+
+ private:
+
+  float alpha_;
+  float beta_;
+
+  // We hold onto all inputs that are not also outputs
+  // (here just a_) for dependency tracking.
+  Variable a_;
+
+  std::shared_ptr<Node> a_node_;
+
+  std::shared_ptr<Tensor> a_data_;
+  // a_grad_ will be NULL if a was not tracked.
+  std::shared_ptr<Tensor> a_grad_;
+  std::shared_ptr<Tensor> b_data_;
+  std::shared_ptr<Tensor> b_grad_;
+
+  Variable b_;
+  bool must_scale_b_grad_;
+
+};
+
+
+class AssignOp: public Op {
+ public:
+  // NOTE(review): class is named AssignOp but its constructor is AddToOp -- reconcile.
+  // This Op corresponds to the computation:
+  //   \f$  b  :=  alpha a  +   beta b.  \f$
+  // with broadcasting or summation depending on the dimensions
+  // involved.  Obviously alpha and beta are constants,
+  // and differentiation w.r.t. them is not supported.
+  //
+  // The Op is only constructed if b_.Tracked() (which it
+  // would normally if a_.Tracked()).
+  AddToOp(float alpha, float beta,
+          const Variable &a, const Variable &b):
+      Op({a}),
+      alpha_(alpha),
+      beta_(beta),
+      a_data_(a.GetData()),
+      a_grad_(a.GetGradIfPresent()),
+      b_data_(b.GetData()),
+      b_grad_(b.GetGrad()) {
+    // The forward computation is executed here, at construction time.
+    Add(alpha, beta, *a_data_, b_data_.get());
+  }
+
+  // Backward pass; operates on the shared_ptr members a_grad_ and b_grad_.
+  void Backward() {
+    // Do: a_grad += alpha * b_grad  (skipped when a has no grad).
+    if (a_grad_ != nullptr)
+      AddTo(alpha_, 1.0, *b_grad_, a_grad_.get());
+    // Do: b_grad *= beta  (nothing to do when beta == 1).
+    if (beta_ != 1.0)
+      Scale(beta_, b_grad_.get());
+  }
+
+ private:
+
+  float alpha_;
+  float beta_;
+
+  // We hold onto all inputs that are not also outputs
+  // (here just a_) for dependency tracking.
+  Variable a_;
+
+  std::shared_ptr<Node> a_node_;
+
+  std::shared_ptr<Tensor> a_data_;
+  // a_grad_ will be NULL if a was not tracked.
+  std::shared_ptr<Tensor> a_grad_;
+  std::shared_ptr<Tensor> b_data_;
+  std::shared_ptr<Tensor> b_grad_;
+
+  Variable b_;
+  bool must_scale_b_grad_;
+
+};
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR__LINEAR_OPS_H_
diff --git a/src/tensor/linear-special-ops.cc b/src/tensor/linear-special-ops.cc
new file mode 100644
index 00000000000..ae4defeae44
--- /dev/null
+++ b/src/tensor/linear-special-ops.cc
@@ -0,0 +1,154 @@
+// tensor/linear-special-ops.cc
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tensor/linear-ops.h"
+
+namespace kaldi {
+namespace tensor {
+
+void AddOp::Expand(std::vector<std::unique_ptr<Op> > *ops) const {
+
+  Pattern a_pattern = a_.Impl().pattern,
+      b_pattern = b_.Impl().pattern;
+  NormalizePatterns({a_pattern, b_pattern});
+
+  Tensor a(a_), b(b_);
+
+  if (a_pattern != a_.Impl().pattern)
+    a = WithPattern(a, a_pattern);
+  if (b_pattern != b_.Impl().pattern)
+    b = WithPattern(b, b_pattern);
+
+  /*
+    The case-statement values in the switch statement below may be
+    interpreted in groups of 3 hex characters, are 0xAAABBBCCC,
+    pertaining to Tensors a, b and c respectively.  See
+    GetPatternCode() in pattern-utils.h for documentation on
+    the meanings of the values and our notation with X,x,1.
+
+  */
+  int64 combined_code = CombineCodes(a_pattern.GetCode(),
+                                     b_pattern.GetCode());
+
+  // We are doing a += b.
+  switch(combined_code) {
+    // A scalar plus a scalar
+    case 0x000000000:
+
+
+
+}
+
+// Dispatches to AddProductScalar3Cpu or (only when built with HAVE_CUDA)
+// AddProductScalar3Gpu, chosen by a.device.device_type; otherwise KALDI_ERR.
+inline static void AddProductScalar3(
+    float alpha, float beta,
+    const TensorImpl &a, const TensorImpl &b, const TensorImpl *c) {
+  switch (a.device.device_type) {
+    case kCpuDevice:
+      AddProductScalar3Cpu(alpha, beta, a, b, c);
+      return;
+#ifdef HAVE_CUDA
+    case kGpuDevice:
+      AddProductScalar3Gpu(alpha, beta, a, b, c);
+      return;
+#endif
+    default:
+      KALDI_ERR << "Unsupported device type " << a.ToString();
+  }
+}
+// Dispatches on the combined pattern codes of a, b and *c to a specialized
+// kernel.  Presumably computes c := alpha * (a * b) + beta * c -- TODO confirm.
+void AddProduct(float alpha, float beta,
+                const TensorImpl &a, const TensorImpl &b, const TensorImpl *c){
+
+  if (a.pattern.code < b.pattern.code) {
+    // Ensure, via a recursion, that a.pattern.code >= b.pattern.code.
+    // This avoids us having to test for the swapped versions of the patterns.
+    AddProduct(alpha, beta, b, a, c);
+    return;
+  }
+
+  CheckDeviceAndDtype(a, b, *c);
+
+
+  int64 combined_code = CombineCodes(a.pattern.code, b.pattern.code,
+                                     c->pattern.code);
+
+  /*
+    The case-statement values in the switch statement below may be
+    interpreted in groups of 3 hex characters, i.e. 0xAAABBBCCC,
+    pertaining to Tensors a, b and c respectively.  See
+    GetPatternCode() in pattern-utils.h for documentation on
+    the meanings of the values and our notation with X,x,1.
+   */
+  switch(combined_code) {
+    case 0x000000000:
+      // () * () -> ()
+      // scalar * scalar -> scalar
+      AddProductScalar3(alpha, beta, a, b, c);
+      return;
+    case 0x101000101:
+      //  (X) * ()-> (X)
+      // vector * scalar -> vector
+      AddProductVecScalarVec(a, b, c);
+      return;
+    case 0x101101101:
+      // (X) * (X) -> (X)
+      // vector .* vector -> vector
+      AddProductVec3(a, b, c);
+      return;
+    case 0x103101202:
+      // (x,X) * (X)  -> (X,1)
+      // matrix * vector -> vector.unsqueeze(-1)
+      AddProductMatVecVec(a, b, c);
+      return;
+    case 0x203101202:
+      // (X,x) * (X) -> (X,1)
+      // transposed-matrix * vector -> vector.unsqueeze(-1)
+      AddProductTmatVecVec(a, b, c);
+      return;
+    case 0x202101103:
+      // (X,1) * (X) -> (x,X)
+      // vector * vector -> matrix (outer product)
+      AddProductVec2Mat(a, b, c);
+      return;
+    // NOTE(review): the AddProductVec*/Mat* helpers above are declared outside
+    // this file chunk; confirm whether they too should receive alpha and beta.
+    default:
+      break;
+
+  }
+
+  // If we reached this point, it means we could
+  // not handle this request with any of the basic operations above.
+  // NOTE(review): this generic fallback is unfinished -- the code below only
+  // prepares temporaries and patterns; nothing is computed into *c yet.
+  // Also confirm whether PadAxes should pad the *_temp copies (a, b, *c are const).
+  SubTensor a_temp(a), b_temp(b), c_temp(*c);
+
+  PadAxes(&(a.pattern), &(b.pattern), &(c->pattern));
+
+  CompressPatterns({&a_temp, &b_temp, &c_temp});
+}
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/linear-special-ops.h b/src/tensor/linear-special-ops.h
new file mode 100644
index 00000000000..49e9961cb54
--- /dev/null
+++ b/src/tensor/linear-special-ops.h
@@ -0,0 +1,613 @@
+// tensor/linear-special-ops.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_LINEAR_SPECIAL_OPS_H_
+#define KALDI_TENSOR_LINEAR_SPECIAL_OPS_H_ 1
+
+#include "tensor/tensor.h"
+
+
+// These Ops are more specialized forms of the Ops declared in linear-ops.h; these
+// correspond to more specific combinations of Tensor shapes.
+namespace kaldi {
+namespace tensor {
+
+
+/**
+   Add operation taking two Tensors (T), i.e. a += b, which may include
+   summation and/or broadcasting depending on the dimensions of a and b
+
+   May not be used if a and b overlap.
+*/
+class AddScalarScalarOp: public Op {
+ public:
+  // Both a and b must have zero axes, must not overlap and must be Compatible.
+  AddScalarScalarOp(const Tensor &a, Tensor &b):
+      a_(a), b_(b) {
+    KALDI_PARANOID_ASSERT(
+        a.NumAxes() == 0 && b.NumAxes() == 0 &&
+        !Overlap(a, b) && Compatible(a, b));
+  }
+
+  virtual void Do() const {
+
+    KALDI_ERR << "Execution not supported for this Op (not concrete); "
+        "please expand ";
+  }
+
+  // Copy constructor (used by Copy()).
+  AddScalarScalarOp(const AddScalarScalarOp &other):
+      a_(other.a_), b_(other.b_) { }
+
+
+  int32 Properties() const override { return 0 ; }  // Not concrete.
+
+  Op *Copy() const override {
+    return new AddScalarScalarOp(*this);
+  }
+
+  // Defined in linear-ops.cc; this function works out the more concrete
+  // structure (e.g. vectors, matrices, things like that) and chooses the
+  // appropriate implementation
+  void Expand(std::vector<std::unique_ptr<Op> > *ops) const override;
+
+  void GetBackwardDerivOps(
+      DerivMap *map,
+      std::vector<std::unique_ptr<Op> > *ops) const override {
+    std::shared_ptr<TensorImpl> b_deriv = map->DerivIfPresent(b_);
+    if (b_deriv == nullptr)  // b wasn't tracked, so a won't be.
+      return;
+    // else return the Op corresponding to:
+    // b_deriv_ += a_deriv_.
+    ops->push_back(std::unique_ptr<Op>(new AddOp(AsTensor(b_deriv),
+                                                 map->Deriv(a_))));
+
+  }
+  void GetForwardDerivOps(
+      DerivMap *map,
+      std::vector<std::unique_ptr<Op> > *ops) const override {
+    std::shared_ptr<TensorImpl> a_deriv = map->DerivIfPresent(a_);
+    if (a_deriv == nullptr)  // no deriv stored for a_: nothing to append.
+      return;
+    // else return the Op corresponding to:
+    // a_deriv_ += b_deriv_.
+    ops->push_back(std::unique_ptr<Op>(new AddOp(AsTensor(a_deriv),
+                                                 map->Deriv(b_))));
+  }
+
+
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+
+
+/**
+   Assign operation, doing
+      b := a,
+   which may actually do summation and/or broadcasting depending on the
+   dimensions of b and a.  Formally, and with reference to the notation
+   in pattern.h, we can describe its operation as follows:
+       - Set all elements of b to zero
+       - For each index-tuple i in the index-tuple-set of b, b[i] += a[i].
+   Must not be used if b and a overlap.
+
+   "Assign" means that this is the first time we are setting the memory
+   involved, except possibly for things that don't generate any derivative
+   for various reasons.
+
+   See also SetOp, which is for when the memory might previously have
+   been written to by something differentiable.
+
+   Note: in the backprop for AssignOp, we can do Unset() after, which
+   means the memory concerned must no longer be read from.
+*/
+class AssignOp: public Op {
+ public:
+  // a and b must not overlap and must be broadcastable and compatible.
+  AssignOp(const Tensor &a, Tensor &b):
+      a_(a), b_(b) {
+    KALDI_ASSERT(!Overlap(a, b) &&
+                 BroadcastableAndCompatible(a, b));
+  }
+  AssignOp(const AssignOp &other):
+      a_(other.a_), b_(other.b_) { }
+
+  void Do() const override {
+    Set(a_, &b_);  // b := a, using the stored members
+  }
+  Op *Copy() const override {
+    return new AssignOp(*this);
+  }
+
+  void GetBackwardDerivOps(
+      DerivMap *map,
+      std::vector<std::unique_ptr<Op> > *ops) const override {
+    std::shared_ptr<TensorImpl> a_deriv = map->DerivIfPresent(a_);
+    if (a_deriv == nullptr)  // a wasn't tracked, so b won't be.
+      return;
+    // Return the Op corresponding to:
+    // a_deriv_ += b_deriv_.
+    ops->push_back(std::unique_ptr<Op>(new AddOp(map->Deriv(b_),
+                                                 AsTensor(a_deriv))));
+  }
+
+  void GetForwardDerivOps(
+      DerivMap *map,
+      std::vector<std::unique_ptr<Op> > *ops) const override {
+    std::shared_ptr<TensorImpl> a_deriv = map->DerivIfPresent(a_);
+    if (a_deriv == nullptr)  // a wasn't tracked, so b won't be.
+      return;
+    // else return the Op corresponding to:
+    // b_deriv_ := a_deriv_.
+    ops->push_back(std::unique_ptr<Op>(new AssignOp(AsTensor(a_deriv),
+                                                  map->Deriv(b_))));
+  }
+ private:
+   Tensor a_;
+   Tensor b_;
+};
+
+
+
+
+
+/**
+   class Op is a base-class for objects that are created when we do operations
+   on Variables.  The important thing to know here is that the Variables in
+   question will always have been allocated with particular dimensions,
+   and possibly even contain defined values, before we get to the Op.
+   Examples of Ops include,
+      a := b * c
+      a += b
+      a *= b
+   where the interpretation of the commands above will depend on the
+   dimensions of the Tensors involved.
+
+   Notice that all the member functions of class Op are `const`, i.e. they
+   shouldn't change this class (although of course they may change the
+   underlying Tensor data).  This is to remind users that Ops are supposed
+   to be reusable, and calls to this object shouldn't affect the behavior
+   of subsequent calls, except to the extent that the underlying Tensor
+   data has been changed.
+ */
+class Op {
+ public:
+
+  /**
+     Do whatever it is that this Op does (e.g. execute the command `a += b`,
+     if that was what this Op did)
+   */
+  virtual void Do() const;
+
+  /**
+     Return a copy of this object.  (This won't be needed very often but might
+     possibly be needed in the context of computing higher-order derivatives).
+  */
+  virtual Op *Copy() const;
+
+  /**
+     This is for forward-mode automatic differentiation (a rarely-used thing).
+     It appends to 'ops' the commands corresponding to the forward-mode
+     automatic differentiation w.r.t. this Op.
+
+       @param [in,out] 'map' is the map that maps from tensors to the
+             corresponding derivative values.  May be modified by adding
+             new key/value pairs.
+       @param [out] ops  This function will *append* to `ops` the
+             commands for computing the derivatives associated with
+             this Op in forward-mode automatic differentiation.  If none
+             of the inputs to the Op were tracked w.r.t. `map`,
+             nothing will be done.
+
+     Example: if the command was "a += b", the derivative operation would
+     be: deriv(a) += deriv(b).  In most cases these Ops would be executed
+     immediately and then deleted.
+   */
+  virtual void GetForwardDerivOps(DerivMap *map,
+                                  std::vector<std::unique_ptr<Op> > *ops) const;
+
+
+
+  /**
+     This is for reverse-mode automatic differentiation (the normal type of
+     autograd).
+
+       @param [in,out] map   This object maps from tensors to the
+                       corresponding derivative values.  It may be changed by
+                       adding new elements to the map, if its Deriv() function
+                       is called.
+       @param [out]    ops  This function may *append* to 'ops' the commands
+                       used in the reverse-mode automatic differentiation.
+                       (Note: nothing will be appended if none of the inputs
+                       to the Op were already tracked w.r.t. 'map'.)
+
+     Example: if the command was "a += b * c", the operations added to
+     'ops' would correspond to `deriv(b) += deriv(a) * c` and
+     `deriv(c) += deriv(a) * b`.
+  */
+  virtual void GetBackwardDerivOps(DerivMap *map,
+                                   std::vector<std::unique_ptr<Op> > *ops) const;
+
+
+  /** Destructor.  It's important for efficiency of memory use to destroy Ops as
+      soon as you won't need them any more, because it may trigger the freeing
+      of Tensors and hence Storage regions.
+  */
+  virtual ~Op();
+};
+
+
+
+class Op {
+
+  Op(): tick_(GetTick()) { }
+
+  /// DepIteratorBegin() and DepIteratorEnd() form the begin and
+  /// end points of a list of Variables that were inputs of this Op
+  /// but were not outputs.  This is used by the backprop code when finding
+  /// the topological order of ops.  (Note: output variables themselves
+  /// refer to Ops, so if we included them in the input list we'd
+  /// get a cycle in the graph).  These Variables are expected to
+  /// still have their graph information (i.e. sub-classes of class Op
+  /// class must not call RemoveGraph() on the members of this list).
+  virtual Op *DepIteratorBegin() = 0;
+  virtual Op *DepIteratorEnd() = 0;
+
+
+
+  // This number >= 0 is used to determine the order of Ops in a graph; each
+  // time we generate an Op we increment a global counter.  Doing it this way,
+  // rather than via topological sorting, is simpler.
+  int64 GetTimestamp() const final { return tick_; }
+
+  virtual void Backprop();
+
+ protected:
+
+  /**
+     The time (`GetTick()`) at which this Op was created; should be set
+     in child classes by doing:
+      `tick_ = GetTick()`
+     as the last statement of the constructor.   (This ensures the
+     tick is later-numbered than any ticks stored in the ChangeTracker
+     code by operations called from the constructor.)
+  */
+  int64 tick_;
+
+
+  /*
+    This function intended to be called from the Backprop() routines
+    of child classes, for example:
+       ` if (DebugMode()) {  CheckTensorTime(*a_);  } `
+    This will die if the memory underlying the Tensor being checked has been
+    modified more recently than tick_.
+  */
+  inline void CheckTensorTime(const Tensor &tensor) {
+    if (DebugMode()) {
+    }
+  }
+
+
+
+
+};
+
+
+template <class OpImpl>
+class OpPointer {
+
+  std::shared_ptr<OpImpl>
+
+}
+
+
+
+/**
+   This is a special version of base-class Op that is created when
+   any SharedGrad is allocated for a non-leaf Variable.  Its purpose
+   is to ensure that, when we get to this Op in the backprop, we deallocate
+   the data underlying the gradient Tensor (so we don't keep gradient
+   Tensors around for longer than is needed).
+*/
+class DeallocateOp: public Op {
+
+  // This operator has no dependencies as it will be created when a SharedGrad
+  // is first initialized, when no Ops have been done on it.
+  Op *DepIteratorBegin() override { return NULL; }
+  Op *DepIteratorEnd() override { return NULL; }
+  // If the Tensor is still alive, pass it to ZeroDeallocating(); else do nothing.
+  void Backprop() override {
+    if (auto s = tensor_to_deallocate_.lock())
+      ZeroDeallocating(s.get());
+  }
+
+ private:
+  // Since we just want to deallocate its underlying data, there is no point
+  // increasing its ref-count; we can just shrug our shoulders if it has
+  // already been deleted.
+  std::weak_ptr<Tensor> tensor_to_deallocate_;
+};
+
+
+/**
+   A slight simplification of class UnaryOp for cases where it's
+   done in-place.
+ */
+class InPlaceUnaryOp: public Op {
+
+};
+
+
+class UnaryOp: public Op {
+
+  //
+  UnaryOp(const Variable &input, const Variable &output) {
+    if
+
+
+
+    if (SameVariable(input, output)) {
+
+    } else {
+    }
+  }
+
+ public:
+
+  std::shared_ptr<Op> op1_;
+  std::shared_ptr<Op> op2_;
+
+
+
+
+}
+
+class GenericOp: public Op {
+
+  // GenericOp is a child of class Op that is intended as a generic base-class
+  // for expressions.
+
+
+
+ protected:
+  // Constructor, to be used from child classes.  This base-class takes care
+  // of storing the list of input Variables for purposes of tracing dependencies;
+  //
+  //  @param [in] input_vars  The list of input Variables (meaning: Variables
+  //                   that are inputs to, but not outputs of, i.e. not modified
+  //                   by, this Op).
+  //  @param [in] output_var  The output Variable of this Op, i.e. the Variable
+  //                   which is modified or set by it.  We may provide another
+  //                   constructor taking ArrayRef<Variable> in this position,
+  //                   as and when we need to support Ops that operate on
+  //                   multiple output Variables.
+  void Op(const ArrayRef<Variable> &input_vars,
+          const Variable &output_var);
+
+
+  // TODO: maybe have a constructor of Op that takes an ArrayRef of the inputs
+  // that are not also outputs?  Could use that for graph traversal.
+
+ private:
+
+  // num_inputs_ is the number of base Variables that are the base Variables of
+  // inputs of this Op (but not of outputs).  These are stored in the
+  // array 'inputs_'.
+
+  // inputs_ is a pointer to an array of shared_ptr<Variable> of size num_inputs_, which
+  // will be allocated by new [] in the constructor and deleted by delete []
+  // in the destructor.
+
+  // This is a list of the Op-input-nodes (see glossary in tensor.h for explanation).
+  // We don't store the Op-output-nodes here; instead, they refer to this Op in
+  // their op_lists.
+  // (We don't store the Node(s) that is(are) the outputs of the Op here; its own
+  // op_list refers to this Op).
+  std::shared_ptr<Node> *inputs_;
+
+  int32 num_inputs_;
+
+  // If num_inputs_ is 1, then inputs_ is
+  void *inputs_;
+
+  int64 n_;  // initialized from the counter when this object is created.
+  std::shared_ptr<Op> tail_;  // TODO: make it unique_ptr?
+ protected:
+  // Return true if this is not the last Op in the list of Ops attached to this
+  // base Variable (can be useful to know whether we need bother to scale the
+  // derivative in a scaling operation, for instance).
+  bool HasTail() const { return tail_ != nullptr; }
+};
+
+
+class AddToOp: public Op {
+ public:
+
+  // This Op corresponds to the computation:
+  //   \f$  b  :=  alpha a  +   beta b.  \f$
+  // with broadcasting or summation depending on the dimensions
+  // involved.  Alpha and beta are constants, and differentiation w.r.t. them is
+  // not supported (you wouldn't reach this code if a or b were actual
+  // variables.)
+  //
+  // The Op is only constructed if b.Tracked() (which it would normally if
+  // a.Tracked()).
+  AddToOp(float alpha, float beta,
+          const Variable &a, const Variable &b):
+      Op({a}),
+      alpha_(alpha),
+      beta_(beta),
+      a_data_(a.GetData()),
+      a_grad_(a.GetGradIfPresent()),
+      b_data_(b.GetData()),
+      b_grad_(b.GetGrad()) {
+    // The forward computation is executed here, at construction time.
+    Add(alpha, beta, *a_data_, b_data_.get());
+  }
+
+  // Backward pass; operates on the shared_ptr members a_grad_ and b_grad_.
+  void Backward() {
+    // Do: a_grad += alpha * b_grad  (skipped when a has no grad).
+    if (a_grad_ != nullptr)
+      AddTo(alpha_, 1.0, *b_grad_, a_grad_.get());
+    // Do: b_grad *= beta  (nothing to do when beta == 1).
+    if (beta_ != 1.0)
+      Scale(beta_, b_grad_.get());
+  }
+
+ private:
+
+  float alpha_;
+  float beta_;
+
+  // We hold onto all inputs that are not also outputs
+  // (here just a_) for dependency tracking.
+  Variable a_;
+
+  std::shared_ptr<Node> a_node_;
+
+  std::shared_ptr<Tensor> a_data_;
+  // a_grad_ will be NULL if a was not tracked.
+  std::shared_ptr<Tensor> a_grad_;
+  std::shared_ptr<Tensor> b_data_;
+  std::shared_ptr<Tensor> b_grad_;
+
+  Variable b_;
+  bool must_scale_b_grad_;
+
+};
+
+
+class AssignOp: public Op {
+ public:
+
+  // This Op corresponds to the computation:
+  //   \f$  b := a  \f$
+  // with broadcasting or summation depending on the dimensions.
+  //
+  // Constructing this Op will make b tracked if it was already.
+  AssignOp(const Variable &a, const Variable &b):
+      Op({a}),
+      alpha_(1.0),  // b := a is the alpha = 1, beta = 0 case of
+      beta_(0.0),   // b := alpha a + beta b; Backward() reads both.
+      a_data_(a.GetData()),
+      a_grad_(a.GetGradIfPresent()),
+      b_data_(b.GetData()),
+      b_grad_(b.GetGrad()) {
+    Copy(a_data_, b_data_);
+    tick_ = GetTick();  // kept as the constructor's last statement.
+  }
+
+  void Backward() {
+    // Do: a_grad += alpha * b_grad  (skipped when a has no grad).
+    if (a_grad_ != nullptr)
+      AddTo(alpha_, 1.0, *b_grad_, a_grad_.get());
+    // Do: b_grad *= beta.
+    if (beta_ != 1.0)
+      Scale(beta_, b_grad_.get());
+  }
+
+ private:
+
+  float alpha_;
+  float beta_;
+
+  // We hold onto all inputs that are not also outputs
+  // (here just a_) for dependency tracking.
+  Variable a_;
+
+  std::shared_ptr<Node> a_node_;
+
+  std::shared_ptr<Tensor> a_data_;
+  // a_grad_ will be NULL if a was not tracked.
+  std::shared_ptr<Tensor> a_grad_;
+  std::shared_ptr<Tensor> b_data_;
+  std::shared_ptr<Tensor> b_grad_;
+
+  Variable b_;
+  bool must_scale_b_grad_;
+
+};
+
+
+class AssignOp: public Op {
+ public:
+  // NOTE(review): class is named AssignOp but its constructor is AddToOp -- reconcile.
+  // This Op corresponds to the computation:
+  //   \f$  b  :=  alpha a  +   beta b.  \f$
+  // with broadcasting or summation depending on the dimensions
+  // involved.  Obviously alpha and beta are constants,
+  // and differentiation w.r.t. them is not supported.
+  //
+  // The Op is only constructed if b_.Tracked() (which it
+  // would normally if a_.Tracked()).
+  AddToOp(float alpha, float beta,
+          const Variable &a, const Variable &b):
+      Op({a}),
+      alpha_(alpha),
+      beta_(beta),
+      a_data_(a.GetData()),
+      a_grad_(a.GetGradIfPresent()),
+      b_data_(b.GetData()),
+      b_grad_(b.GetGrad()) {
+    // The forward computation is executed here, at construction time.
+    Add(alpha, beta, *a_data_, b_data_.get());
+  }
+
+  // Backward pass; operates on the shared_ptr members a_grad_ and b_grad_.
+  void Backward() {
+    // Do: a_grad += alpha * b_grad  (skipped when a has no grad).
+    if (a_grad_ != nullptr)
+      AddTo(alpha_, 1.0, *b_grad_, a_grad_.get());
+    // Do: b_grad *= beta  (nothing to do when beta == 1).
+    if (beta_ != 1.0)
+      Scale(beta_, b_grad_.get());
+  }
+
+ private:
+
+  float alpha_;
+  float beta_;
+
+  // We hold onto all inputs that are not also outputs
+  // (here just a_) for dependency tracking.
+  Variable a_;
+
+  std::shared_ptr<Node> a_node_;
+
+  std::shared_ptr<Tensor> a_data_;
+  // a_grad_ will be NULL if a was not tracked.
+  std::shared_ptr<Tensor> a_grad_;
+  std::shared_ptr<Tensor> b_data_;
+  std::shared_ptr<Tensor> b_grad_;
+
+  Variable b_;
+  bool must_scale_b_grad_;
+
+};
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_LINEAR_SPECIAL_OPS_H_
diff --git a/src/tensor/memory-checker.h b/src/tensor/memory-checker.h
index 031ed8a4453..fb159da51bd 100644
--- a/src/tensor/memory-checker.h
+++ b/src/tensor/memory-checker.h
@@ -441,6 +441,7 @@ class MemoryChecker {
          kRead
          kReadWrite
          kWrite
+         kCheckUninitialized
      From a user's perspective the only thing this function might do is crash--
      which it is designed to do if it detects various "disallowed" things.
   */
diff --git a/src/tensor/op.h b/src/tensor/op.h
index fd3cfb33dfd..6fbc767190b 100644
--- a/src/tensor/op.h
+++ b/src/tensor/op.h
@@ -28,6 +28,15 @@ namespace tensor {
 class Variable;
 
 
+enum OpProperties {
+  kConcreteOp = 1,  // An Op that is concrete is one that can be executed
+                    // directly, i.e. its Do() function works; these Ops will
+                    // generally correspond to a single function call, e.g. a
+                    // particular BLAS call.  If an Op is not concrete, you should
+                    // keep expanding via Expand() until you get concrete ops,
+                    // and then execute those.
+};
+
 /**
    class Op is a base-class for objects that are created when we do operations
    on Variables.  The important thing to know here is that the Variables in
@@ -52,15 +61,37 @@ class Op {
 
   /**
      Do whatever it is that this Op does (e.g. execute the command `a += b`,
-     if that was what this Op did)
-   */
-  virtual void Do() const;
+     if that was what this Op did).  Only needs to be defined for Ops that
+     are concrete, i.e. for which Properties() & kConcreteOp is nonzero.
+  */
+  virtual void Do() const {
+    KALDI_ERR << "Execution not supported for this Op (not concrete); "
+        "please expand ";
+  }
 
   /**
-     Return a copy of this object.  (This won't be needed very often but might
-     possibly be needed in the context of computing higher-order derivatives).
+     Return a copy of this object, newly allocated using new.
   */
-  virtual Op *Copy() const;
+  virtual Op *Copy() const = 0;
+
+
+  /**
+     Properties of this Op, a bunch of boolean flags such as kConcreteOp
+     (may add more in future)
+   */
+  virtual int32 Properties() const = 0;
+
+  /**
+     To be called only for non-concrete Ops, i.e. Ops for which Properties() &
+     kConcreteOp is zero.  Calling this function will expand this Op into one or
+     more concrete Ops, appending them to 'ops'.  Declared const, like the other
+        members of this class and like the overrides in derived Ops.
+        @param [out] ops  Operations will be *appended* to `ops`.  These
+                     operations will be fully-expanded versions of this Op
+                     (i.e. they will be concrete).
+   */
+  virtual void Expand(std::vector<std::unique_ptr<Op> > *ops) const = 0;
+
 
   /**
      This is for forward-mode automatic differentiation (a rarely-used thing).
@@ -79,10 +110,11 @@ class Op {
      Example: if the command was "a += b", the derivative operation would
      be: deriv(a) += deriv(b).  In most cases these Ops would be executed
      immediately and then deleted.
-   */
+  */
   virtual void GetForwardDerivOps(DerivMap *map,
-                                  std::vector<std::unique_ptr<Op> > *ops) const;
-
+                                  std::vector<std::unique_ptr<Op> > *ops) const {
+    KALDI_ERR << "Forward-mode autograd not supported for this Op";
+  }
 
 
   /**
@@ -103,7 +135,10 @@ class Op {
      `deriv(c) += deriv(a) * b`.
   */
   virtual void GetBackwardDerivOps(DerivMap *map,
-                                   std::vector<std::unique_ptr<Op> > *ops) const;
+                                   std::vector<std::unique_ptr<Op> > *ops) const {
+    KALDI_ERR << "Reverse-mode autograd not supported for this Op";
+  }
+
 
 
   /** Destructor.  It's important for efficiency of memory use to destroy Ops as
@@ -111,359 +146,24 @@ class Op {
       of Tensors and hence Storage regions.
   */
   virtual ~Op();
-};
-
-
-
-class Op {
-
-  Op(): tick_(GetTick()) { }
-
-  /// InputIteratorBegin() and InputIteratorEnd() form the begin and
-  /// end points of a list of Variables that were inputs of this Op
-  /// but were not outputs.  This is used by the backprop code when finding
-  /// the topological order of ops.  (Note: output variables themselves
-  /// refer to Ops, so if we included them in the input list we'd
-  /// get a cycle in the graph).  These Variables are expected to
-  /// still have their graph information (i.e. sub-classes of class Op
-  /// class must not call RemoveGraph() on the members of this list).
-  virtual Op *DepIteratorBegin() = 0;
-  virtual Op *DepIteratorEnd() = 0;
-
-
-
-  // This number >= 0 is used to determine the order of Ops in a graph; each
-  // time we generate an Op we increment a global counter.  Doing it this way,
-  // rather than via topological sorting, is simpler.
-  int64 GetTimestamp() const final { return tick_; }
-
-  virtual void Backprop();
-
  protected:
 
-  /**
-     The time (`GetTick()`) at which this Op was created; should be set
-     in child classes by doing:
-      `tick_ = GetTick()`
-     as the last statement of the constructor.   (This ensures the
-     tick is later-numbered than any ticks stored in the ChangeTracker
-     code by operations called from the constructor.)
-  */
-  int64 tick_;
-
-
-  /*
-    This function intended to be called from the Backprop() routines
-    of child classes, for example:
-       ` if (DebugMode()) {  CheckTensorTime(*a_);  } `
-    This will die if the memory underlying the Tensor being checked has been
-    modified more recently than tick_.
-  */
-  inline void CheckTensorTime(const Tensor &tensor) {
-    if (DebugMode()) {
-    }
-  }
-
-
-
-
-};
-
-
-template <class OpImpl>
-class OpPointer {
-
-  std::shared_ptr<OpImpl>
-
-}
-
-
-
-/**
-   This is a special version of base-class Op that is created when
-   any SharedGrad is allocated for a non-leaf Variable.  Its purpose
-   is to ensure that, when we get to this Op in the backprop, we deallocate
-   the data underlying the gradient Tensor (so we don't keep gradient
-   Tensors around for longer than is needed).
-*/
-class DeallocateOp: public Op {
-
-  // This operator has no dependencies as it will be created when a SharedGrad
-  // is first initialized, when no Ops have been done on it.
-  Op *DepIteratorBegin() override { return NULL; }
-  Op *DepIteratorEnd() override { return NULL; }
-
-  void Backprop() override {
-    if (auto s = tensor_to_deallocate_.lock())
-      ZeroDeallocating(s.get());
-  }
-
- private:
-  // Since we just want to deallocate its underlying data, there is no point
-  // increasing its ref-count; we can just shrug our shoulders if it has
-  // already been deleted.d
-  std::weak_ptr<Tensor> tensor_to_deallocate_;
-};
-
-
-/**
-   A slight simplification of class UnaryOp for cases where it's
-   done in-place.
- */
-class InPlaceUnaryOp: public Op {
-
-};
-
-
-class UnaryOp: public Op {
-
-  //
-  UnaryOp(const Variable &input, const Variable &output) {
-    if
-
-
-
-    if (SameVariable(input, output)) {
-
-    } else {
+  // Ensures that the *last element* of `ops` is fully expanded.  At entry,
+  // `ops` is a nonempty vector of Op pointers, all concrete except possibly the
+  // last.  At exit, `ops` is a nonempty vector of Op pointers which are all
+  // concrete (given that Expand() appends only concrete Ops).  Usually called
+  // from Expand() after code that appends a possibly non-concrete Op to `ops`.
+  void EnsureExpanded(std::vector<std::unique_ptr<Op> > *ops) {
+    if (!(ops->back()->Properties() & kConcreteOp)) {
+      std::unique_ptr<Op> op(ops->back().release());  // keep Op alive past pop
+      ops->pop_back();
+      op->Expand(ops);
     }
   }
 
- public:
-
-  std::shared_ptr<Op> op1_;
-  std::shared_ptr<Op> op2_;
-
-
-
-
-}
-
-class GenericOp: public Op {
-
-  // GenericOp is a child of class Op that is intended as a generic base-class
-  // for expressions.
-
-
-
- protected:
-  // Constructor, to be used from child classes.  This base-class takes care
-  // of storing the list of input Variables for purposes of tracing dependencies;
-  //
-  //  @param [in] input_vars  The list of input Variables (meaning: Variables
-  //                   that are inputs to, but not outputs of, i.e. not modified
-  //                   by, this Op).
-  //  @param [in] output_var  The output Variable of this Op, i.e. the Variable
-  //                   which is modified or set by it.  We may provide another
-  //                   constructor taking ArrayRef<Variable> in this position,
-  //                   as and when we need to support Ops that operate on
-  //                   multiple output Variables.
-  void Op(const ArrayRef<Variable> &input_vars,
-          const Variable &output_var);
-
-
-  // TODO: maybe have a constructor of Op that takes an ArrayRef of the inputs
-  // that are not also outputs?  Could use that for graph traversal.
-
- private:
-
-  // num_inputs_ is the number of base Variables that are the base Variables of
-  // inputs of this Op (but not of outputs).  These are stored in the
-  // array 'inputs_'.
-
-  // inputs_ is a pointer to an array of shared_ptr<Variable> of size num_inputs_, which
-  // will be be allocated by new [] in the constructor and deleted by delete []
-  // in the destructor.
-
-  // This is a list of the Op-input-nodes (see glossary in tensor.h for explanation).
-  // We don't store the Op-output-nodes here; instead, they refer to this Op in
-  // their op_lists.
-  // (We don't store the Node(s) that is(are) the outputs of the Op here; its own
-  // op_list refers to this Op).
-  std::shared_ptr<Node> *inputs_;
-
-  int32 num_inputs_;
-
-  // If num_inputs_ is 1, then inputs_ is
-  void *inputs_;
-
-  int64 n_;  // initialized from the counter when this object is created.
-  std::shared_ptr<Op> tail_;  // TODO: make it unique_ptr?
- protected:
-  // Return true if this is not the last Op in the list of Ops attached to this
-  // base Variable (can be useful to know whether we need bother to scale the
-  // derivative in a scaling operation, for instance).
-  bool HasTail() const { return tail_ != nullptr; }
-};
-
-
-class AddToOp: public Op {
- public:
-
-  // This Op corresponds to the computation:
-  //   \f$  b  :=  alpha a  +   beta b.  \f$
-  // with broadcasting or summation depending on the dimensions
-  // involved.  Alpha and beta are constants, and differentiation w.r.t. them is
-  // not supported (you wouldn't reach this code if a or b were actual
-  // variables.)
-  //
-  // The Op is only constructed if b.Tracked() (which it would normally if
-  // a.Tracked()).
-  AddToOp(float alpha, float beta,
-          const Variable &a, const Variable &b):
-      Op({a}),
-      alpha_(alpha),
-      beta_(beta),
-      a_data_(a.GetData()),
-      a_grad_(a.GetGradIfPresent()),
-      b_data_(b.GetData()),
-      b_grad_(b.GetGrad()) {
-
-    Add(alpha, beta, *a_data_, b_data_.get());
-  }
-
-
-  void Backward() {
-    // Do: a_grad += alpha * b_grad.
-    if (a_grad_ != nullptr)
-      AddTo(alpha_, 1.0, b_grad, &a_grad);
-
-    if (beta_ != 1.0)
-      Scale(beta_, b_grad.get());
-  }
-
- private:
-
-  float alpha_;
-  float beta_;
-
-  // We hold onto all inputs that are not also outputs
-  // (here just a_) for dependency tracking.
-  Variable a_;
-
-  std::shared_ptr<Node> a_node_;
-
-  std::shared_ptr<Tensor> a_data_;
-  // a_grad_ will be NULL if a was not tracked.
-  std::shared_ptr<Tensor> a_grad_;
-  std::shared_ptr<Tensor> b_data_;
-  std::shared_ptr<Tensor> b_grad_;
-
-  Variable b_;
-  bool must_scale_b_grad_;
-
-};
-
-
-class CopyOp: public Op {
- public:
-
-  // This Op corresponds to the computation:
-  //   \f$  b := a  \f$
-  // with broadcasting or summation depending on the dimensions.
-  //
-  // Constructing this Op will make b tracked if it was already.
-  CopyOp(const Variable &a, const Variable &b):
-      Op({a}),
-      a_data_(a.GetData()),
-      a_grad_(a.GetGradIfPresent()),
-      b_data_(b.GetData()),
-      b_grad_(b.GetGrad()) {
-    Copy(a_data_, b_data_);
-
-      `tick_ = GetTick()`
-  }
-
-
-  void Backward() {
-    // Do: a_grad += alpha * b_grad.
-    if (a_grad_ != nullptr)
-      AddTo(alpha_, 1.0, b_grad, &a_grad);
-
-    if (beta_ != 1.0)
-      Scale(beta_, b_grad.get());
-  }
-
- private:
-
-  float alpha_;
-  float beta_;
-
-  // We hold onto all inputs that are not also outputs
-  // (here just a_) for dependency tracking.
-  Variable a_;
-
-  std::shared_ptr<Node> a_node_;
-
-  std::shared_ptr<Tensor> a_data_;
-  // a_grad_ will be NULL if a was not tracked.
-  std::shared_ptr<Tensor> a_grad_;
-  std::shared_ptr<Tensor> b_data_;
-  std::shared_ptr<Tensor> b_grad_;
-
-  Variable b_;
-  bool must_scale_b_grad_;
-
 };
 
-
-class CopyOp: public Op {
- public:
-
-  // This Op corresponds to the computation:
-  //   \f$  b  :=  alpha a  +   beta b.  \f$
-  // with broadcasting or summation depending on the dimensions
-  // involved.  Obviously alpha and beta are constants,
-  // and differentiation w.r.t. them is not supported.
-  //
-  // The Op is only constructed if b_.Tracked() (which it
-  // would normally if a_.Tracked()).
-  AddToOp(float alpha, float beta,
-          const Variable &a, const Variable &b):
-      Op({a}),
-      alpha_(alpha),
-      beta_(beta),
-      a_data_(a.GetData()),
-      a_grad_(a.GetGradIfPresent()),
-      b_data_(b.GetData()),
-      b_grad_(b.GetGrad()) {
-
-    Add(alpha, beta, *a_data_, b_data_.get());
-  }
-
-
-  void Backward() {
-    // Do: a_grad += alpha * b_grad.
-    if (a_grad_ != nullptr)
-      AddTo(alpha_, 1.0, b_grad, &a_grad);
-
-    if (beta_ != 1.0)
-      Scale(beta_, b_grad.get());
-  }
-
- private:
-
-  float alpha_;
-  float beta_;
-
-  // We hold onto all inputs that are not also outputs
-  // (here just a_) for dependency tracking.
-  Variable a_;
-
-  std::shared_ptr<Node> a_node_;
-
-  std::shared_ptr<Tensor> a_data_;
-  // a_grad_ will be NULL if a was not tracked.
-  std::shared_ptr<Tensor> a_grad_;
-  std::shared_ptr<Tensor> b_data_;
-  std::shared_ptr<Tensor> b_grad_;
-
-  Variable b_;
-  bool must_scale_b_grad_;
-
-};
-
-
+// See linear-ops.h and nonlinear-ops.h for concrete examples of Ops.
 
 }  // namespace tensor
 }  // namespace kaldi
diff --git a/src/tensor/pattern-extra-utils.cc b/src/tensor/pattern-tuple-utils.cc
similarity index 91%
rename from src/tensor/pattern-extra-utils.cc
rename to src/tensor/pattern-tuple-utils.cc
index 6ebe7fcb987..60d60d9939d 100644
--- a/src/tensor/pattern-extra-utils.cc
+++ b/src/tensor/pattern-tuple-utils.cc
@@ -481,7 +481,7 @@ std::vector<int32>  RandomIndexFromHyperrectangle(const Hyperrectangle &a) {
   auto a_iter = a.begin();
   for (; ans_iter != ans_end; ++ans_iter, ++a_iter)
     *ans_iter = RandInt(a_iter->first, a_iter->second - 1);
-  
+
 
 }
 
@@ -974,7 +974,7 @@ bool ComputeDifference(const Pattern &pattern1,
     // Some of the code below with num_axes - 1 would crash in this case, so
     // handle it separately.  Note: for 1-element patterns, if their offsets are
     // different, they don't intersect.
-    if (pattern1.offset != pattern2.offset) 
+    if (pattern1.offset != pattern2.offset)
       difference->push_back(pattern1);
     return true;
   }
@@ -1010,7 +1010,7 @@ bool ComputeDifference(const Pattern &pattern1,
     int64 begin_mindex2 = sub_pattern2.offset,
       end_mindex2 = begin_mindex2 +
       sub_pattern2.strides[num_axes - 1] * sub_pattern2.dims[num_axes - 1];
-    
+
     for (auto iter = cur_difference.begin(); iter != cur_difference.end(); ++iter){
       const Pattern &sub_pattern1 = *iter;
       // as before, end_mindex1 is strictly greater than the actual largest
@@ -1026,11 +1026,11 @@ bool ComputeDifference(const Pattern &pattern1,
         continue;
       }
 
-      std::vector<Hyperrectangle> cur_rects(1); 
+      std::vector<Hyperrectangle> cur_rects(1);
       // Get a hyperrectangle that represents all index-tuples into
       // sub_pattern1.
       GetFullHyperrectangleOfPattern(sub_pattern1, &cur_rects.back());
-      
+
       // each member of `offsets` represents one part of the intersection
       // between sub_pattern1 and sub_pattern2.  Each of these will be converted
       // to a hyperrectangle representing the set of indexes it covers within
@@ -1040,7 +1040,7 @@ bool ComputeDifference(const Pattern &pattern1,
       // after subracting  previous things.
       std::vector<std::vector<int32> > offsets;
       FindOffsets(sub_pattern1, sub_pattern2, true, &offsets);
-      
+
       std::vector<Hyperrectangle> next_rects;
       for (const std::vector<int32> &offset: offsets) {
         Hyperrectangle h;
@@ -1056,7 +1056,7 @@ bool ComputeDifference(const Pattern &pattern1,
         cur_rects.swap(next_rects);
         next_rects.clear();
       }
-      for (auto hiter = cur_rects.begin(); 
+      for (auto hiter = cur_rects.begin();
            hiter !=  cur_rects.end(); ++hiter) {
         // *hiter represents one piece of the difference sub_pattern1 -
         // sub_pattern2, expressed as indexes into sub_pattern1.  We turn
@@ -1078,7 +1078,7 @@ bool PatternIsSubsetOf(const Pattern &p,
   std::vector<Pattern> intersection;
   ComputeIntersection(p, q, true, &intersection);
   int64 total_size = 0;
-  for (Pattern &r : intersection) 
+  for (Pattern &r : intersection)
     total_size += NumElements(r);
   return (total_size == NumEements(p));
 }
@@ -1300,5 +1300,123 @@ void MakeCompactNormalizedAndJustified(const Pattern &src,
 
 
 
+/**
+   Class TupleAxisComparator is used when we want to sort the axes of a tuple
+   of Patterns.  It helps to reduce the possible number of axis orderings that
+   we have to handle in implementation code.  (I.e. it reduces the number
+   of case statements that we have to handle in certain Ops).
+
+   Each stride is first converted to a number 0, 1 or 2, where 0 and 1
+   correspond to strides of 0 and 1 respectively and 2 means "any other value".
+   Call this number a stride-code.  The first comparison we do is on the first
+   pattern; we produce an order such that the stride-codes of the first pattern
+   are ordered from least to greatest value in the private numbering.
+
+   In case of ties on the stride-codes of the first pattern, we then sort on the
+   sum of squares of the stride-codes of the other patterns.  (Using the
+   sum of squares rather than the simple sum reduces the chance of ties,
+   i.e. we don't get cases where 1 + 1 == 2 introduces a tie, because the
+   2's become 4's).
+
+   Note: the ordering this induces on the axes is not a total order for every
+   Pattern-tuple, so this comparator cannot be used as part of a
+   "canonicalization" process for Pattern-tuples.
+ */
+class TupleAxisComparator {
+ public:
+  /**
+     Comparator function (like a less-than operator on axes).
+        @param [in] raxis1  Axis in the private numbering, must be
+                            in range [0, num_axes - 1] of the Patterns.
+        @param [in] raxis2  Axis in the private numbering, satisfying
+                            the same conditions as raxis1
+        @return             Returns true if raxis1 should come before
+                            raxis2 in the sorted axis ordering.
+  */
+  bool operator () (int32 raxis1, int32 raxis2) const {
+    KALDI_PARANOID_ASSERT(static_cast<uint32>(raxis1) <
+                          static_cast<uint32>(patterns_[0]->num_axes) &&
+                          static_cast<uint32>(raxis2) <
+                          static_cast<uint32>(patterns_[0]->num_axes));
+    uint32 stride_code1 = std::min<uint32>(patterns_[0]->strides[raxis1], 2),
+        stride_code2 = std::min<uint32>(patterns_[0]->strides[raxis2], 2);
+    if (stride_code1 < stride_code2) return true;
+    else if (stride_code1 > stride_code2) return false;
+    uint32 stride_code1_sumsq = 0,
+        stride_code2_sumsq = 0;
+    // NOTE(review): assumes ArrayRef provides size() as a method -- confirm.
+    for (size_t i = 1; i < patterns_.size(); i++) {
+      stride_code1 = std::min<uint32>(patterns_[i]->strides[raxis1], 2);
+      stride_code2 = std::min<uint32>(patterns_[i]->strides[raxis2], 2);
+      stride_code1_sumsq += stride_code1 * stride_code1;
+      stride_code2_sumsq += stride_code2 * stride_code2;
+    }
+    return stride_code1_sumsq < stride_code2_sumsq;
+  }
+
+  /**
+     Constructor
+            @param [in] patterns   The tuple of Patterns.  Must be
+                          a valid Pattern-tuple; search for
+                          "Valid Pattern-tuple" in pattern.h.
+  */
+  TupleAxisComparator(ArrayRef<Pattern*> patterns): patterns_(patterns) {
+    KALDI_PARANOID_ASSERT(IsValidPatternTuple(patterns_));
+  }
+
+private:
+  ArrayRef<Pattern*> patterns_;
+};
+
+
+/**
+   This object is to be instantiated when you want to know what permutation
+   you'd get if you were to sort the axes of this tuple of Patterns using
+   TupleAxisComparator.  Note: this is not a total order for all pattern-tuples,
+   so its behavior may not be completely deterministic, especially across
+   different versions of the stl library.
+ */
+class OutOfPlaceTupleAxisSorter {
+ public:
+  // Constructor.  `src` must be a valid Pattern-tuple (checked in paranoid mode).
+  inline OutOfPlaceTupleAxisSorter(ArrayRef<Pattern*> src) {
+    KALDI_PARANOID_ASSERT(IsValidPatternTuple(src));
+    int32 num_axes = src[0]->num_axes;
+    for (int32 raxis = 0; raxis < num_axes; raxis++)
+      orig_raxis_[raxis] = raxis;
+    TupleAxisComparator c(src);
+    std::sort(orig_raxis_, orig_raxis_ + num_axes, c);
+  }
+  // Returns the 'source' raxis-index for a particular destination
+  // raxis-index, e.g.:  `src_raxis = GetIndex(dest_raxis)`.
+  // Copying as e.g. `dest.strides[dest_raxis] = src.strides[src_raxis]`,
+  // and the same for the dims, would give you a `dest` with axes
+  // sorted in the order defined by TupleAxisComparator.
+  inline int32 GetIndex(int32 raxis) const { return orig_raxis_[raxis]; }
+
+ private:
+  int32 orig_raxis_[KALDI_TENSOR_MAX_DIM];
+};
+
+
+// Applies the axis permutation chosen by OutOfPlaceTupleAxisSorter to every
+// Pattern in `patterns` (the same permutation is used for all of them).
+void SortTupleAxes(ArrayRef<Pattern*> patterns) {
+  OutOfPlaceTupleAxisSorter sorter(patterns);
+  int32 num_axes = patterns[0]->num_axes;
+  for (size_t p = 0; p < patterns.size(); p++) {
+    Pattern &this_pattern = *(patterns[p]);
+    // Permute from a snapshot so no source axis is overwritten before use.
+    const Pattern orig_pattern = this_pattern;
+    for (int32 raxis = 0; raxis < num_axes; raxis++) {
+      int32 src_raxis = sorter.GetIndex(raxis);
+      this_pattern.strides[raxis] = orig_pattern.strides[src_raxis];
+      this_pattern.dims[raxis] = orig_pattern.dims[src_raxis];
+    }
+    this_pattern.code = -1;  // cached code may be stale; negative = not computed
+  }
+}
+
+
 }  // namespace kaldi
 }  // namespace tensor
diff --git a/src/tensor/pattern-extra-utils.h b/src/tensor/pattern-tuple-utils.h
similarity index 90%
rename from src/tensor/pattern-extra-utils.h
rename to src/tensor/pattern-tuple-utils.h
index bf724706aae..1a9d9bc25df 100644
--- a/src/tensor/pattern-extra-utils.h
+++ b/src/tensor/pattern-tuple-utils.h
@@ -1,4 +1,4 @@
-// tensor/pattern-extra-utils.h
+// tensor/pattern-tuple-utils.h
 
 //  Copyright      2019  Johns Hopkins University (author: Daniel Povey)
 
@@ -17,8 +17,8 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef KALDI_TENSOR_TENSOR_PATTERN_EXTRA_UTILS_H_
-#define KALDI_TENSOR_TENSOR_PATTERN_EXTRA_UTILS_H_ 1
+#ifndef KALDI_TENSOR_TENSOR_PATTERN_TUPLE_UTILS_H_
+#define KALDI_TENSOR_TENSOR_PATTERN_TUPLE_UTILS_H_ 1
 
 #include "tensor/tensor-common.h"
 #include "tensor/pattern.h"
@@ -267,7 +267,7 @@ bool PatternContains(const Pattern &pattern,
    memory-index-set of pattern q.  Note: the algorithm is not super trivial or
    fast (although the tiem taken doesn't grow with the dims or strides, only
    with the number of axes).
-   
+
       @param [in] p   First pattern; must be valid.
       @param [in] q   Second pattern; must be valid.
       @return   Returns true if memory-index-set of p is a subset of
@@ -630,9 +630,9 @@ class OutOfPlaceAxisSorter {
   // Constructor.
   inline OutOfPlaceAxisSorter(const Pattern &src) {
     int32 num_axes = src.num_axes;
-    for (int32 raxis = 0; raxis < src.num_axes; raxis++)
+    for (int32 raxis = 0; raxis < num_axes; raxis++)
       orig_raxis_[raxis] = raxis;
-    std::sort(orig_raxis_, orig_raxis_ + src.num_axes,
+    std::sort(orig_raxis_, orig_raxis_ + num_axes,
               // a comparator (less-than) operator implemented as a lambda is
               // below.  Sort from least to greatest abs(stride), disambiguating
               // based on dim.
@@ -657,11 +657,79 @@ class OutOfPlaceAxisSorter {
 
 
 
+/**
+   This function sorts the axes in 'patterns' (which must be a valid
+   pattern-tuple, see pattern.h for explanation) using TupleAxisComparator.  See
+   its documentation in pattern-tuple-utils.cc for description of what this
+   order is.
+
+     @param [in,out] patterns  The patterns whose axes are to be sorted.  The
+                     same permutation will be applied to all the patterns.
+ */
+void SortTupleAxes(ArrayRef<Pattern*> patterns);
+
+/**
+   Compresses a Pattern-tuple by removing or combining as many axes as possible.
+   See the documentation for CompressOnePattern() in pattern-utils.h for the
+   basic concept of compressing a single Pattern to a pattern with possibly
+   fewer axes (and maybe with negative strides converted to positive), which
+   covers the same set of memory locations as the original Tensor.
+
+   The difference with just calling CompressOnePattern() several times is
+   that CompressPatternTuple() preserves the relationships between the tensors.
+   In the language developed in pattern.h, this means the memory-index-tuple-set
+   is preserved.
+
+   Note: while the first Pattern will have no negative strides at output,
+   the others may.
+
+     @param [in,out] patterns   A nonempty array of the patterns
+                         to be jointly compressed.
+
+      @return  Returns true if it made any change to the patterns,
+               false if they were unchanged.
+
+ Examples are below, where we write a Pattern as
+ `{{dim1,dim2,..}, {stride1,stride2,..}}`.
+
+\verbatim
+    src1                src2              dest1,offset1       dest2,offset2
+  {{10},{1}}           {{10},{1}}        {{10},{1}},0        {{10},{1}},0  # no-op
+  {{8},{1}}            {{1},{0}}         {{8},{1}},0         {{1},{0}},0   # no-op
+  {{7},{-1}}           {{7},{1}}         {{7},{1}},-6         {{7},{-1}},6 # flip sign
+ {{3,4},{4,1}}        {{3,4},{4,1}}      {{12},{1}},0         {{12},{1}},0 # combine dims
+ {{3,4},{4,1}}        {{3,1},{4,0}}      {{3,4},{4,1}}        {{3,1},{4,0}} # can't combine, would be incompatible
+ {{3,4},{4,1}}        {{1,1},{0,0}}      {{12},{1}}           {{1},{0}}    # combine
+\endverbatim
+
+   See also SortTupleAxes() and NormalizePatternTuple().
+ */
+bool CompressPatternTuple(ArrayRef<Pattern*> patterns);
+
+
+/**
+   Reduces a pattern-tuple to a normalized form.  (Caution: this may not be 100%
+   deterministic, i.e. there may be two pattern-tuples in normalized form,
+   i.e. the form produced by this function, which share the same
+   memory-index-tuple-set but are not equal).
+
+   This just calls CompressPatternTuple() and then SortTupleAxes().
+
+     @param [in,out] patterns  The pattern-tuple to normalize, modified in place.
+     @return  Returns the value returned by CompressPatternTuple(); a change
+              made only by SortTupleAxes() is not reflected in it.
+*/
+inline bool NormalizePatternTuple(ArrayRef<Pattern*> patterns) {
+  bool changed = CompressPatternTuple(patterns);
+  SortTupleAxes(patterns);
+  return changed;
+}
+
 
 }  // namespace tensor
 }  // namespace kaldi
 
 // Include implementation of inline functions.
-#include "tensor/pattern-extra-utils-inl.h"
+#include "tensor/pattern-tuple-utils-inl.h"
 
-#endif  // KALDI_TENSOR_TENSOR_PATTERN_EXTRA_UTILS_H_
+#endif  // KALDI_TENSOR_TENSOR_PATTERN_TUPLE_UTILS_H_
diff --git a/src/tensor/pattern-utils.cc b/src/tensor/pattern-utils.cc
index 35d62f3ad9e..2eeae14949c 100644
--- a/src/tensor/pattern-utils.cc
+++ b/src/tensor/pattern-utils.cc
@@ -449,6 +449,10 @@ void SortAxes(Pattern *pattern) {
   }
 }
 
+void SortTupleAxes(ArrayRef<Pattern*> patterns) {
+  // NOTE(review): empty stub; pattern-tuple-utils.cc also defines this function.
+}
+
 void Transpose(int32 raxis1, int32 raxis2, Pattern *p) {
   if (static_cast<uint32>(raxis1) >= static_cast<uint32>(p->num_axes) ||
       static_cast<uint32>(raxis2) >= static_cast<uint32>(p->num_axes)) {
diff --git a/src/tensor/pattern-utils.h b/src/tensor/pattern-utils.h
index 0e951a0065d..a1fc20f50fb 100644
--- a/src/tensor/pattern-utils.h
+++ b/src/tensor/pattern-utils.h
@@ -390,14 +390,14 @@ inline void Squeeze(int32 axis, Pattern *p) {
 
        @param [in] a  The pattern of the first Tensor
        @param [in] b  The pattern of the second Tensor
-       @param [in] b_non_reducing   If true, then we do not allow a dim of
+       @param [in] b_not_smaller   If true, then we do not allow a dim of
                       b to be 1 while corresponding dim of a is >1.
        @return  Returns true if a and b are broadcastable (with
                 an additional constraint that `a.dims[i] <= b.dims[i]` if
-                `b_non_reducing == true`.
+                `b_not_smaller == true`.
  */
 bool Broadcastable(const Pattern &a, const Pattern &b,
-                   bool b_non_reducing = false);
+                   bool b_not_smaller = false);
 
 
 /**  This function returns true if the dimensions of tensor patterns
@@ -409,17 +409,17 @@ bool Broadcastable(const Pattern &a, const Pattern &b,
        @param [in] a  The pattern of the first Tensor
        @param [in] b  The pattern of the second Tensor
        @param [in] c  The pattern of the third Tensor
-       @param [in] c_non_reducing   If true, then we do not allow a dim of
+       @param [in] c_not_smaller   If true, then we do not allow a dim of
                       c to be 1 while corresponding dims of a or b
                       are > 1.
        @return  Returns true if a, b and c are broadcastable (with
                 an additional constraint that
                 `max(a.dims[i], b.dims[i]) <= c.dims[i]` if
-                `c_non_reducing == true`).
+                `c_not_smaller == true`).
 
  */
 bool Broadcastable(const Pattern &a, const Pattern &b,
-                   const Pattern &c, bool c_non_reducing = false);
+                   const Pattern &c, bool c_not_smaller = false);
 
 
 
@@ -561,32 +561,6 @@ bool IsCanonical(const Pattern &pattern);
 int64 NumElements(const Pattern &pattern);
 
 
-/**
-   This version of SortAxes() sorts the axes in 'patterns' (which must be
-   nonempty and all have the same number of axes), by ordering them from the
-   most negative stride value in patterns[0] to the most positive stride value
-   in patterns[0], using the strides in the other patterns to disambiguate the
-   order only in case of ties (which could only happen if some strides were
-   zero), and then the dims in the same order if the strides are all the same
-   (the strides would only be the same if they were zero, if the patterns were
-   valid).  Roughly, it's a lexical order on the (strides, then dims) of the
-   patterns.  Note: the most-negative-to-most-positive ordering is in terms of
-   the private, `raxis` numbering; it would be most-positive-to-most-negative in
-   the public numbering.
-
-   TODO: work out what the ordering should be; should it really be negative-to-
-   positive, or based on abs(stride), and do we need disambiguation with the
-   dims?
-
-   TODO: do we even need this??
-
-     @param [in,out]  The patterns whose axes are to be sorted.  All
-                    will have their axes subject to the same permutation.
-                    The ordering is based on the strides of patterns[0],
-                    but using the strides of later numbered patterns in
-                    case of ties.
- */
-void SortAxes(ArrayRef<Pattern*> patterns);
 
 /**
   Multiplies all strides and the offset in 'pattern' by 'scale', which must be >
@@ -599,6 +573,8 @@ void ScaleStridesAndOffset(int32 scale, Pattern *pattern);
 
 
 
+
+
 /// Hashing object, used when we need an unordered_map containing Pattern.
 class PatternHasher {
   size_t operator () (const Pattern &pattern) const;
@@ -609,51 +585,11 @@ class PatternHasher {
   CompressTwoPatterns() is a special case of CompressPatterns() where there
   are exactly two patterns to be jointly compressed.  See documentation of
   CompressPatterns() for explanation.
- */
+*/
 void CompressTwoPatterns(Pattern *a,
                          Pattern *b);
 
 
-/**
-   Compresses one or more Pattern by removing or combining as many axes as
-   possible.  See the documentation for CompressOnePattern() to understand the
-   basic concept of compressing a single Pattern to a pattern with possibly
-   fewer axes (and maybe with negative strides converted to positive),
-   which covers the same set of memory locations as the original Tensor.
-
-   The difference with just calling CompressOnePattern() several times is
-   that CompressPatterns() preserves the relationships between the tensors.
-
-   In technical terms (and you will have to follow definitions several deep
-   in the glossary to find all the definitions), this operation
-   preserves the memory-index-tuple-set of the Pattern-tuple, and
-   also the memory-index-set of each of the Patterns (we have to specify
-   the part after "and" to disallow swapping the Patterns).
-
-   Note: while the first Pattern will have no negative strides at output,
-   the others may.
-
-      @param [in,out] patterns   An nonempty array of the patterns
-                         to be jointly compressed.
-
-      @return  Returns true if it made any change to the patterns,
-               false if they were unchanged.
-
- Examples are below, where we write a Pattern as
- `{{dim1,dim2,..}, {stride1,stride2,..}}`.
-
-\verbatim
-    src1                src2              dest1,offset1       dest2,offset2
-  {{10},{1}}           {{10},{1}}        {{10},{1}},0        {{10},{1}},0  # no-op
-  {{8},{1}}            {{1},{0}}         {{8},{1}},0         {{1},{0}},0   # no-op
-  {{7},{-1}}           {{7},{1}}         {{7},{1}},-6         {{7},{-1}},6 # flip sign
- {{3,4},{4,1}}        {{3,4},{4,1}}      {{12},{1}},0         {{12},{1}},0 # combine dims
- {{3,4},{4,1}}        {{3,1},{4,0}}      {{3,4},{4,1}}        {{3,1},{4,0}} # can't combine, would be incompatible
- {{3,4},{4,1}}        {{1,1},{0,0}}      {{12},{1}}           {{1},{0}}    # combine
-\endverbatim
- */
-bool CompressPatterns(ArrayRef<Pattern*> patterns);
-
 /**
    Compresses a Pattern by removing or combining as many axes as possible,
    while preserving the memory-index-set of the pattern (see glossary for
@@ -818,6 +754,12 @@ void HasCStrides(const Pattern &pattern);
 bool PatternsOverlap(const Pattern &pattern1,
                      const Pattern &pattern2);
 
+/**
+   Returns true if this is a valid pattern-tuple (see "Valid pattern-tuple"
+   in pattern.h)
+ */
+bool IsValidPatternTuple(ArrayRef<Pattern*> patterns);
+
 
 
 /**
diff --git a/src/tensor/pattern.cc b/src/tensor/pattern.cc
index f4e0c237a45..de1d1eba307 100644
--- a/src/tensor/pattern.cc
+++ b/src/tensor/pattern.cc
@@ -19,6 +19,7 @@
 
 #include <algorithm>
 #include "tensor/pattern.h"
+#include "tensor/pattern-utils.h"
 
 
 namespace kaldi {
@@ -85,6 +86,12 @@ bool Pattern::Check(bool check_code) {
 }
 
 
+int32 Pattern::GetCode() {
+  // Compute the code on first use and cache it; negative means "not computed".
+  if (code < 0) code = ComputePatternCode(*this);
+  return code;
+}
+
 // MAY DELETE THIS.  It's not up to date anyway.
 void PatternProperties::UpdateProperties(const Pattern &pattern) {
   KALDI_PARANOID_ASSERT(pattern.IsValid());
diff --git a/src/tensor/pattern.h b/src/tensor/pattern.h
index 816eb2bbeff..ed879018854 100644
--- a/src/tensor/pattern.h
+++ b/src/tensor/pattern.h
@@ -270,9 +270,18 @@ namespace tensor {
                      (in most circumstances) satisfy.
 
 
-    Pattern-tuple:    A pattern-tuple of a tuple of Patterns, say:  (P, Q),
+    Pattern-tuple:    A pattern-tuple is a tuple of Patterns, say:  (P, Q),
                       where the patterns in the tuple are broadcastable, meaning,
-                      for example: Broadcastable(P, Q).
+                      for example: Broadcastable(P, Q).  The order of the tuple
+                      must be at least one (i.e. at least one Pattern).
+
+           [Valid Pattern-tuple:]
+                     This describes the properties that Pattern-tuples are expected
+                     to satisfy in most situations where we might pass them into
+                     functions (this will usually be as ArrayRef<Pattern*>).
+                     The tuple must contain at least one pattern; each pattern must
+                     be valid; and they must be broadcastable, i.e.
+                     Broadcastable(p1, p2) for each pair of Patterns.
 
 
     Public numbering: The numbering of axes used in the public interface of class
@@ -530,10 +539,13 @@ struct Pattern {
                  // from the start of the originally allocated memory
                  // region
 
-  int32 code;  // pattern code; see ComputePatternCode() in pattern-utils.h
-               // for details.  If this is negative then it means it has not been
-               // computed.  In a valid Pattern the code will always be either
-               // negative or up-to-date.
+  int32 code;  // pattern code; from user-level code it should be accessed via
+               // GetCode(), which ensures it is set.  See documentation for
+               // ComputePatternCode() in pattern-utils.h for details of what
+               // this represents.  If this is negative then it means it has not
+               // been computed.  In a valid Pattern the code will always be
+               // either negative or up-to-date; GetCode(), which assumes the
+               // Pattern was valid, computes the code if it was negative.
 
   int32 properties;  // More occasionally-needed properties.  This is similar to
                      // OpenFst's notion of properties, where we compute them
@@ -550,6 +562,11 @@ struct Pattern {
   // See also IsCanonical() in pattern-utils.h.
   bool IsValid();
 
+  // Returns the pattern's code (the correct code, not -1).  Requires
+  // the Pattern to be valid (and in a valid Pattern, the code must be -1
+  // or the correct code, so we may assume a code >= 0 is the correct one).
+  int32 GetCode();
+
   // This comparator induces a total ordering on valid Patterns.  It is a
   // lexical comparison on the offset, num_axes, dims and strides.  (The code
   // does not need to be compared because, if not -1, it is a function of the
@@ -557,12 +574,16 @@ struct Pattern {
   bool operator < (const Pattern &other) const;
 
 
-  // Equality operator on Pattern.  Compares the num_axes, offset, and
-  // dims and strides indexed [0... num_axes-1].  (In patterns that satisfy IsValid(),
+  // Equality operator on Pattern.  Compares the num_axes, offset, and dims and
+  // strides indexed [0... num_axes-1].  (In patterns that satisfy IsValid(),
   // the remaining dims and strides would be 1 and 0 respectively, so checking
-  // the is pointless).
+  // them is pointless).
   bool operator == (const Pattern &other) const;
 
+  inline bool operator != (const Pattern &other) const {
+    return !(*this == other);
+  }
+
   // Assignment operator (copies all members).
   bool operator = (const Pattern &other) const;
 };
diff --git a/src/tensor/tensor-common.h b/src/tensor/tensor-common.h
index 5662cbf0277..f4947ae92d6 100644
--- a/src/tensor/tensor-common.h
+++ b/src/tensor/tensor-common.h
@@ -132,7 +132,8 @@ enum BinaryFunctionEnum {
 enum TensorUseEnum {
   kRead,
   kReadWrite,
-  kWrite
+  kWrite,
+  kCheckUninitialized
 };
 
 
diff --git a/src/tensor/tensor-functions.h b/src/tensor/tensor-functions.h
index 8d1373540bf..e534414bc72 100644
--- a/src/tensor/tensor-functions.h
+++ b/src/tensor/tensor-functions.h
@@ -62,8 +62,7 @@ inline void Transpose(int32 axis1, int32 axis2, Tensor *t) {
 
 /**
    Copy the data from tensor 'src' to tensor 'dest', allowing broadcasting
-   (so a dim of src can be 1 while the corresponding dim of 'dest' is >1).
-   Requires Broadcastable(src, *dest, true).
+   or summation.  Requires Broadcastable(src, *dest).
 
    Does not require that the Dtype() or Device() of src and dest be the same
    (i.e. does not require Compatible(src, *dest)).  This is the only way in
@@ -321,9 +320,23 @@ void AddTo(Scalar alpha, Scalar beta, const Tensor &src, const Tensor *dest);
                      `BroadcastableAndCompatible(src, *dest) &&
                      !Overlap(src, *dest) || Identical(src, *dest))`,
 */
-void AddTo(const Tensor &alpha, const Tensor &beta,
+void AddTo(const Scalar &alpha, const Scalar &beta,
            const Tensor &src, const Tensor *dest);
 
+/**
+   Does
+       dest += src
+   (note: this may involve broadcasting or summation depending on
+   the dimensions of dest and src).  Viewing dest and src
+   as patterns, the technical definition, with respect to the
+   notation in pattern.h, is: for each index-tuple i in the
+   index-tuple-set of (src, *dest), do: dest[i] += src[i].
+
+   Requires BroadcastableAndCompatible(src, *dest) && !Overlap(src, *dest).
+
+ */
+void AddTo(const Tensor &src, const Tensor *dest);
+
 
 
 
diff --git a/src/tensor/tensor-impl-utils.h b/src/tensor/tensor-impl-utils.h
index eb36671a983..f31c71ae79f 100644
--- a/src/tensor/tensor-impl-utils.h
+++ b/src/tensor/tensor-impl-utils.h
@@ -86,9 +86,9 @@ void CreateTensorStorage(TensorImpl *impl);
 
 
 /**
-   Returns true if the provided TensorImpl covers the whole of the
-   allocated storage region, i.e. if every byte of the storage region
-   is accessible through `impl`.
+   Returns true if the provided TensorImpl covers the whole of the allocated
+   storage region, i.e. if every byte of the storage region is accessible
+   through `impl`.
  */
 bool IsWhole(const TensorImpl &impl);
 
diff --git a/src/tensor/tensor-impl.h b/src/tensor/tensor-impl.h
index 2396469020a..b5b70352fb3 100644
--- a/src/tensor/tensor-impl.h
+++ b/src/tensor/tensor-impl.h
@@ -82,8 +82,7 @@ struct TensorImpl {
   // allocated.
   inline void* GetData() const;
 
-
-
+  inline int32 GetCode() { return pattern.GetCode(); }
 
   /**
     Returns true if this TensorImpl is valid, false otherwise.
diff --git a/src/tensor/tensor-linear-ops.h b/src/tensor/tensor-linear-ops.h
new file mode 100644
index 00000000000..de2efd7274f
--- /dev/null
+++ b/src/tensor/tensor-linear-ops.h
@@ -0,0 +1,501 @@
+// tensor/tensor-linear-ops.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_TENSOR_LINEAR_OPS_H_
+#define KALDI_TENSOR_TENSOR_LINEAR_OPS_H_ 1
+
+#include "tensor/tensor.h"
+
+
+// Note: user-level code will not interact directly with these Ops.  See
+// tensor-linear.h for the user-level code.
+namespace kaldi {
+namespace tensor {
+
+
+/**
+   Add operation taking two Tensors (T), i.e. b += a, which
+   may include summation and/or broadcasting depending on
+   the dimensions of b and a.
+
+   May not be used if b and a overlap.  Probably should not be used if b is
+   known to be zero (e.g. if it hasn't been allocated)-- in that case, it's
+   better to use CopyOp.
+*/
+class AddTTOp {
+ public:
+
+  AddTTOp(const Tensor &a, Tensor &b):
+      a_(a), b_(b) {
+    KALDI_ASSERT(!Overlap(a, b) &&
+                 BroadcastableAndCompatible(a, b));
+  }
+  void Do() const override {
+
+  }
+
+
+ private:
+   Tensor a_;
+   Tensor b_;
+};
+
+
+/**
+   class Op is a base-class for objects that are created when we do operations
+   on Variables.  The important thing to know here is that the Variables in
+   question will always have been allocated with particular dimensions,
+   and possibly even contain defined values, before we get to the Op.
+   Examples of Ops include,
+      a := b * c
+      a += b
+      a *= b
+   where the interpretation of the commands above will depend on the
+   dimensions of the Tensors involved.
+
+   Notice that all the member functions of class Op are `const`, i.e. they
+   shouldn't change this class (although of course they may change the
+   underlying Tensor data).  This is to remind users that Ops are supposed
+   to be reusable, and calls to this object shouldn't affect the behavior
+   of subsequent calls, except to the extent that the underlying Tensor
+   data has been changed.
+ */
+class Op {
+ public:
+
+  /**
+     Do whatever it is that this Op does (e.g. execute the command `a += b`,
+     if that was what this Op did)
+   */
+  virtual void Do() const;
+
+  /**
+     Return a copy of this object.  (This won't be needed very often but might
+     possibly be needed in the context of computing higher-order derivatives).
+  */
+  virtual Op *Copy() const;
+
+  /**
+     This is for forward-mode automatic differentiation (a rarely-used thing).
+     It appends to 'ops' the commands corresponding to the forward-mode
+     automatic differentiation w.r.t. this Op.
+
+       @param [in,out] 'map' is the map that maps from tensors to the
+             corresponding derivative values.  May be modified by adding
+             new key/value pairs.
+       @param [out] ops  This function will *append* to `ops` the
+             commands for computing the derivatives associated with
+             this Op in forward-mode automatic differentiation.  If none
+             of the inputs to the Op were tracked w.r.t. `map`,
+             nothing will be done.
+
+     Example: if the command was "a += b", the derivative operation would
+     be: deriv(a) += deriv(b).  In most cases these Ops would be executed
+     immediately and then deleted.
+   */
+  virtual void GetForwardDerivOps(DerivMap *map,
+                                  std::vector<std::unique_ptr<Op> > *ops) const;
+
+
+
+  /**
+     This is for reverse-mode automatic differentiation (the normal type of
+     autograd).
+
+       @param [in,out] map   This object maps from tensors to the
+                       corresponding derivative values.  It may be changed by
+                       adding new elements to the map, if its Deriv() function
+                       is called.
+       @param [out]    ops  This function may *append* to 'ops' the commands
+                       used in the reverse-mode automatic differentiation.
+                       (Note: nothing will be appended if none of the inputs
+                       to the Op were already tracked w.r.t. 'map'.)
+
+     Example: if the command was "a += b * c", the operations added to
+     'ops' would correspond to `deriv(b) += deriv(a) * c` and
+     `deriv(c) += deriv(a) * b`.
+  */
+  virtual void GetBackwardDerivOps(DerivMap *map,
+                                   std::vector<std::unique_ptr<Op> > *ops) const;
+
+
+  /** Destructor.  It's important for efficiency of memory use to destroy Ops as
+      soon as you won't need them any more, because it may trigger the freeing
+      of Tensors and hence Storage regions.
+  */
+  virtual ~Op();
+};
+
+
+
+class Op {
+
+  Op(): tick_(GetTick()) { }
+
+  /// DepIteratorBegin() and DepIteratorEnd() form the begin and
+  /// end points of a list of Variables that were inputs of this Op
+  /// but were not outputs.  This is used by the backprop code when finding
+  /// the topological order of ops.  (Note: output variables themselves
+  /// refer to Ops, so if we included them in the input list we'd
+  /// get a cycle in the graph).  These Variables are expected to
+  /// still have their graph information (i.e. sub-classes of class Op
+  /// must not call RemoveGraph() on the members of this list).
+  virtual Op *DepIteratorBegin() = 0;
+  virtual Op *DepIteratorEnd() = 0;
+
+
+
+  // This number >= 0 is used to determine the order of Ops in a graph; each
+  // time we generate an Op we increment a global counter.  Doing it this way,
+  // rather than via topological sorting, is simpler.
+  int64 GetTimestamp() const final { return tick_; }
+
+  virtual void Backprop();
+
+ protected:
+
+  /**
+     The time (`GetTick()`) at which this Op was created; should be set
+     in child classes by doing:
+      `tick_ = GetTick()`
+     as the last statement of the constructor.   (This ensures the
+     tick is later-numbered than any ticks stored in the ChangeTracker
+     code by operations called from the constructor.)
+  */
+  int64 tick_;
+
+
+  /*
+    This function intended to be called from the Backprop() routines
+    of child classes, for example:
+       ` if (DebugMode()) {  CheckTensorTime(*a_);  } `
+    This will die if the memory underlying the Tensor being checked has been
+    modified more recently than tick_.
+  */
+  inline void CheckTensorTime(const Tensor &tensor) {
+    if (DebugMode()) {
+    }
+  }
+
+
+
+
+};
+
+
+template <class OpImpl>
+class OpPointer {
+
+  std::shared_ptr<OpImpl>
+
+}
+
+
+
+/**
+   This is a special version of base-class Op that is created when
+   any SharedGrad is allocated for a non-leaf Variable.  Its purpose
+   is to ensure that, when we get to this Op in the backprop, we deallocate
+   the data underlying the gradient Tensor (so we don't keep gradient
+   Tensors around for longer than is needed).
+*/
+class DeallocateOp: public Op {
+
+  // This operator has no dependencies as it will be created when a SharedGrad
+  // is first initialized, when no Ops have been done on it.
+  Op *DepIteratorBegin() override { return NULL; }
+  Op *DepIteratorEnd() override { return NULL; }
+
+  void Backprop() override {
+    if (auto s = tensor_to_deallocate_.lock())
+      ZeroDeallocating(s.get());
+  }
+
+ private:
+  // Since we just want to deallocate its underlying data, there is no point
+  // increasing its ref-count; we can just shrug our shoulders if it has
+  // already been deleted.
+  std::weak_ptr<Tensor> tensor_to_deallocate_;
+};
+
+
+/**
+   A slight simplification of class UnaryOp for cases where it's
+   done in-place.
+ */
+class InPlaceUnaryOp: public Op {
+
+};
+
+
+class UnaryOp: public Op {
+
+  //
+  UnaryOp(const Variable &input, const Variable &output) {
+    if
+
+
+
+    if (SameVariable(input, output)) {
+
+    } else {
+    }
+  }
+
+ public:
+
+  std::shared_ptr<Op> op1_;
+  std::shared_ptr<Op> op2_;
+
+
+
+
+}
+
+class GenericOp: public Op {
+
+  // GenericOp is a child of class Op that is intended as a generic base-class
+  // for expressions.
+
+
+
+ protected:
+  // Constructor, to be used from child classes.  This base-class takes care
+  // of storing the list of input Variables for purposes of tracing dependencies;
+  //
+  //  @param [in] input_vars  The list of input Variables (meaning: Variables
+  //                   that are inputs to, but not outputs of, i.e. not modified
+  //                   by, this Op).
+  //  @param [in] output_var  The output Variable of this Op, i.e. the Variable
+  //                   which is modified or set by it.  We may provide another
+  //                   constructor taking ArrayRef<Variable> in this position,
+  //                   as and when we need to support Ops that operate on
+  //                   multiple output Variables.
+  void Op(const ArrayRef<Variable> &input_vars,
+          const Variable &output_var);
+
+
+  // TODO: maybe have a constructor of Op that takes an ArrayRef of the inputs
+  // that are not also outputs?  Could use that for graph traversal.
+
+ private:
+
+  // num_inputs_ is the number of base Variables that are the base Variables of
+  // inputs of this Op (but not of outputs).  These are stored in the
+  // array 'inputs_'.
+
+  // inputs_ is a pointer to an array of shared_ptr<Variable> of size num_inputs_, which
+  // will be allocated by new [] in the constructor and deleted by delete []
+  // in the destructor.
+
+  // This is a list of the Op-input-nodes (see glossary in tensor.h for explanation).
+  // We don't store the Op-output-nodes here; instead, they refer to this Op in
+  // their op_lists.
+  // (We don't store the Node(s) that is(are) the outputs of the Op here; its own
+  // op_list refers to this Op).
+  std::shared_ptr<Node> *inputs_;
+
+  int32 num_inputs_;
+
+  // If num_inputs_ is 1, then inputs_ is
+  void *inputs_;
+
+  int64 n_;  // initialized from the counter when this object is created.
+  std::shared_ptr<Op> tail_;  // TODO: make it unique_ptr?
+ protected:
+  // Return true if this is not the last Op in the list of Ops attached to this
+  // base Variable (can be useful to know whether we need bother to scale the
+  // derivative in a scaling operation, for instance).
+  bool HasTail() const { return tail_ != nullptr; }
+};
+
+
+class AddToOp: public Op {
+ public:
+
+  // This Op corresponds to the computation:
+  //   \f$  b  :=  alpha a  +   beta b.  \f$
+  // with broadcasting or summation depending on the dimensions
+  // involved.  Alpha and beta are constants, and differentiation w.r.t. them is
+  // not supported (you wouldn't reach this code if a or b were actual
+  // variables.)
+  //
+  // The Op is only constructed if b.Tracked() (which it would normally if
+  // a.Tracked()).
+  AddToOp(float alpha, float beta,
+          const Variable &a, const Variable &b):
+      Op({a}),
+      alpha_(alpha),
+      beta_(beta),
+      a_data_(a.GetData()),
+      a_grad_(a.GetGradIfPresent()),
+      b_data_(b.GetData()),
+      b_grad_(b.GetGrad()) {
+
+    Add(alpha, beta, *a_data_, b_data_.get());
+  }
+
+
+  void Backward() {
+    // Do: a_grad += alpha * b_grad.
+    if (a_grad_ != nullptr)
+      AddTo(alpha_, 1.0, b_grad, &a_grad);
+
+    if (beta_ != 1.0)
+      Scale(beta_, b_grad.get());
+  }
+
+ private:
+
+  float alpha_;
+  float beta_;
+
+  // We hold onto all inputs that are not also outputs
+  // (here just a_) for dependency tracking.
+  Variable a_;
+
+  std::shared_ptr<Node> a_node_;
+
+  std::shared_ptr<Tensor> a_data_;
+  // a_grad_ will be NULL if a was not tracked.
+  std::shared_ptr<Tensor> a_grad_;
+  std::shared_ptr<Tensor> b_data_;
+  std::shared_ptr<Tensor> b_grad_;
+
+  Variable b_;
+  bool must_scale_b_grad_;
+
+};
+
+
+class CopyOp: public Op {
+ public:
+
+  // This Op corresponds to the computation:
+  //   \f$  b := a  \f$
+  // with broadcasting or summation depending on the dimensions.
+  //
+  // Constructing this Op will make b tracked if it was already.
+  CopyOp(const Variable &a, const Variable &b):
+      Op({a}),
+      a_data_(a.GetData()),
+      a_grad_(a.GetGradIfPresent()),
+      b_data_(b.GetData()),
+      b_grad_(b.GetGrad()) {
+    Copy(a_data_, b_data_);
+    // Set the tick last, so it is later than any tick recorded by the above.
+    tick_ = GetTick();
+  }
+
+
+  void Backward() {
+    // Do: a_grad += alpha * b_grad.
+    if (a_grad_ != nullptr)
+      AddTo(alpha_, 1.0, b_grad, &a_grad);
+
+    if (beta_ != 1.0)
+      Scale(beta_, b_grad.get());
+  }
+
+ private:
+
+  float alpha_;
+  float beta_;
+
+  // We hold onto all inputs that are not also outputs
+  // (here just a_) for dependency tracking.
+  Variable a_;
+
+  std::shared_ptr<Node> a_node_;
+
+  std::shared_ptr<Tensor> a_data_;
+  // a_grad_ will be NULL if a was not tracked.
+  std::shared_ptr<Tensor> a_grad_;
+  std::shared_ptr<Tensor> b_data_;
+  std::shared_ptr<Tensor> b_grad_;
+
+  Variable b_;
+  bool must_scale_b_grad_;
+
+};
+
+
+class CopyOp: public Op {
+ public:
+
+  // This Op corresponds to the computation:
+  //   \f$  b  :=  alpha a  +   beta b.  \f$
+  // with broadcasting or summation depending on the dimensions
+  // involved.  Obviously alpha and beta are constants,
+  // and differentiation w.r.t. them is not supported.
+  //
+  // The Op is only constructed if b_.Tracked() (which it
+  // would normally if a_.Tracked()).
+  AddToOp(float alpha, float beta,
+          const Variable &a, const Variable &b):
+      Op({a}),
+      alpha_(alpha),
+      beta_(beta),
+      a_data_(a.GetData()),
+      a_grad_(a.GetGradIfPresent()),
+      b_data_(b.GetData()),
+      b_grad_(b.GetGrad()) {
+
+    Add(alpha, beta, *a_data_, b_data_.get());
+  }
+
+
+  void Backward() {
+    // Do: a_grad += alpha * b_grad.
+    if (a_grad_ != nullptr)
+      AddTo(alpha_, 1.0, b_grad, &a_grad);
+
+    if (beta_ != 1.0)
+      Scale(beta_, b_grad.get());
+  }
+
+ private:
+
+  float alpha_;
+  float beta_;
+
+  // We hold onto all inputs that are not also outputs
+  // (here just a_) for dependency tracking.
+  Variable a_;
+
+  std::shared_ptr<Node> a_node_;
+
+  std::shared_ptr<Tensor> a_data_;
+  // a_grad_ will be NULL if a was not tracked.
+  std::shared_ptr<Tensor> a_grad_;
+  std::shared_ptr<Tensor> b_data_;
+  std::shared_ptr<Tensor> b_grad_;
+
+  Variable b_;
+  bool must_scale_b_grad_;
+
+};
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_TENSOR_LINEAR_OPS_H_
diff --git a/src/tensor/tensor-utils.h b/src/tensor/tensor-utils.h
index f34aea14716..557cba8dfd9 100644
--- a/src/tensor/tensor-utils.h
+++ b/src/tensor/tensor-utils.h
@@ -50,7 +50,8 @@ inline bool BroadcastableAndCompatible(const Tensor &a, const Tensor &b,
 
 
 inline bool Overlap(const Tensor &a, const Tensor &b) {
-  return Compatible(*a.impl_, *b.impl_);
+  return a.impl_->storage.get() == b.impl_->storage.get() &&
+      PatternsOverlap(a.impl_->pattern, b.impl_->pattern);
 }
 
 
@@ -163,6 +164,29 @@ inline int64 NumElements(const Tensor &a) {
   return NumElements(*a.impl_);
 }
 
+/**
+   This is the Tensor-level version of CanonicalizePattern() from
+   pattern-utils.h.  It ensures that the Tensor's pattern is canonical.
+   If this changes the Pattern, this will involve allocating a new
+   TensorImpl (since we always assume that TensorImpl's may be shared
+   by other Tensors).
+*/
+void CanonicalizeTensor(Tensor *tensor);
+
+/**
+   This is the Tensor-level version of CompressPatterns() from pattern-utils.h.
+   It ensures that the Tensors
+ */
+void CompressTensors(ArrayRef<Tensor*> tensors);
+
+
+/**
+   Returns a Tensor referencing a new TensorImpl; it will be as t except the
+   pattern will be the one provided.
+ */
+Tensor WithPattern(const Tensor &t, const Pattern &pattern);
+
+
 
 }  // namespace tensor
 }  // namespace kaldi
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index d77a4484885..d3641c07f61 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -156,7 +156,7 @@ namespace tensor {
 class Tensor {
  public:
 
-  /// Return the number of axes (a number in {0,1,2,3,4,5,6}).  In mathematical
+  // Return the number of axes (a number in {0,1,2,3,4,5,6}).  In mathematical
   // contexts, this is sometimes known as the rank of the tensor, or sometimes
   // even its dimension, but these terms are ambiguous so we avoid them, and use
   // the terms 'number of axes' or 'axis' throughout.
@@ -364,6 +364,36 @@ class Tensor {
   Tensor(const std::shared_ptr<const TensorImpl> &&impl): impl_(impl) { }
 
 
+  /**
+     Shallow copy: point this at the TensorImpl in `other`; returns *this.
+   */
+  Tensor &operator =(const Tensor &other) { impl_ = other.impl_; return *this; }
+
+
+  /**
+     Return a copy of the TensorImpl underlying this Tensor;
+     this would normally be done when you want to change
+     something in the TensorImpl but don't want to invalidate
+     this Tensor or others sharing the same TensorImpl object.
+
+   */
+  TensorImpl *CopyImpl();
+
+
+  /**
+     Returns the data pointer cast to type T, with the offset from
+     the pattern included.  Calling this will force allocation of
+     the storage region if it was not already allocated.
+   */
+  template <class T> T* GetData() const;
+
+  /**
+     Returns the data pointer cast to type T, but without the offset from the
+     pattern.
+   */
+  template <class T> T* GetRawData() const;
+
+
  private:
 
   // It might seem odd that we contain a shared_ptr to *const* TensorImpl.
@@ -391,8 +421,8 @@ class Tensor {
    treat it as a Tensor.  You should view the type `std::shared_ptr<const
    TensorImpl>` as "might be Tensor, might be NULL".
 */
-inline Tensor &AsTensor(std::shared_ptr<const TensorImpl> &impl) {
-  return reinterpret_cast<Tensor&>(impl);
+inline const Tensor &AsTensor(std::shared_ptr<const TensorImpl> &impl) {
+  return reinterpret_cast<const Tensor&>(impl);
 }
 
 

From 3dd5e7ec3c68502e6e907577b00aac8a56ca7836 Mon Sep 17 00:00:00 2001
From: Dan Povey <dpovey@gmail.com>
Date: Mon, 17 Jun 2019 16:43:46 -0400
Subject: [PATCH 042/163] [src] Refactoring of matrix directory to separate out
 the cblas wrappers and extensions

---
 src/cblasext/cblas-extensions.cc          |  66 +++++
 src/cblasext/cblas-extensions.h           |  95 +++++++
 src/{matrix => cblasext}/cblas-wrappers.h | 317 +++++++++-------------
 src/{matrix => cblasext}/kaldi-blas.h     |   4 +-
 src/matrix/Makefile                       |   4 +-
 src/matrix/jama-svd.h                     |   2 +-
 src/matrix/kaldi-matrix.cc                |  35 ++-
 src/matrix/kaldi-vector.cc                |  24 +-
 src/matrix/matrix-common.h                |  14 +-
 src/matrix/matrix-lib-test.cc             |   2 +-
 src/matrix/packed-matrix.cc               |  23 +-
 src/matrix/qr.cc                          |   4 +-
 src/matrix/sp-matrix.cc                   |  22 +-
 src/matrix/tp-matrix.cc                   |   2 +-
 14 files changed, 365 insertions(+), 249 deletions(-)
 create mode 100644 src/cblasext/cblas-extensions.cc
 create mode 100644 src/cblasext/cblas-extensions.h
 rename src/{matrix => cblasext}/cblas-wrappers.h (52%)
 rename src/{matrix => cblasext}/kaldi-blas.h (96%)

diff --git a/src/cblasext/cblas-extensions.cc b/src/cblasext/cblas-extensions.cc
new file mode 100644
index 00000000000..c7d65996635
--- /dev/null
+++ b/src/cblasext/cblas-extensions.cc
@@ -0,0 +1,66 @@
+// cblasext/cblas-extensions.cc
+
+// Copyright 2019       Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cblasext/cblas-wrappers.h"
+#include "cblasext/cblas-extensions.h"
+
+namespace kaldi {
+
+template<typename Real>
+void cblasext_Xgemv_sparsevec(CBLAS_TRANSPOSE trans, KaldiBlasInt num_rows,
+                     KaldiBlasInt num_cols, Real alpha, const Real *Mdata,
+                     KaldiBlasInt stride, const Real *xdata,
+                     KaldiBlasInt incX, Real beta, Real *ydata,
+                     KaldiBlasInt incY) {
+  if (trans == CblasNoTrans) {
+    if (beta != 1.0) cblas_Xscal(num_rows, beta, ydata, incY);
+    for (KaldiBlasInt i = 0; i < num_cols; i++) {
+      Real x_i = xdata[i * incX];
+      if (x_i == 0.0) continue;
+      // Add to ydata, the i'th column of M, times alpha * x_i
+      cblas_Xaxpy(num_rows, x_i * alpha, Mdata + i, stride, ydata, incY);
+    }
+  } else {
+    if (beta != 1.0) cblas_Xscal(num_cols, beta, ydata, incY);
+    for (KaldiBlasInt i = 0; i < num_rows; i++) {
+      Real x_i = xdata[i * incX];
+      if (x_i == 0.0) continue;
+      // Add to ydata, the i'th row of M, times alpha * x_i
+      cblas_Xaxpy(num_cols, x_i * alpha,
+                  Mdata + (i * stride), 1, ydata, incY);
+    }
+  }
+}
+
+
+template
+void cblasext_Xgemv_sparsevec(CBLAS_TRANSPOSE trans, KaldiBlasInt num_rows,
+                              KaldiBlasInt num_cols, float alpha, const float *Mdata,
+                              KaldiBlasInt stride, const float *xdata,
+                              KaldiBlasInt incX, float beta, float *ydata,
+                              KaldiBlasInt incY);
+template
+void cblasext_Xgemv_sparsevec(CBLAS_TRANSPOSE trans, KaldiBlasInt num_rows,
+                              KaldiBlasInt num_cols, double alpha, const double *Mdata,
+                              KaldiBlasInt stride, const double *xdata,
+                              KaldiBlasInt incX, double beta, double *ydata,
+                              KaldiBlasInt incY);
+
+
+} // namespace kaldi
diff --git a/src/cblasext/cblas-extensions.h b/src/cblasext/cblas-extensions.h
new file mode 100644
index 00000000000..a9346dc0330
--- /dev/null
+++ b/src/cblasext/cblas-extensions.h
@@ -0,0 +1,95 @@
+// cblasext/cblas-extensions.h
+
+// Copyright 2012-2019  Johns Hopkins University (author: Daniel Povey);
+//                      Haihua Xu; Wei Shi
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//  http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+#ifndef KALDI_MATRIX_CBLAS_EXTENSIONS_H_
+#define KALDI_MATRIX_CBLAS_EXTENSIONS_H_ 1
+
+
+#include "cblasext/kaldi-blas.h"
+#include "cblasext/cblas-wrappers.h"
+
+// In directories other than this directory, this file is intended to mostly be
+// included from .cc files, not from headers, since it includes cblas headers
+// (via kaldi-blas.h) and those can be quite polluting.
+
+// This file contains templated wrappers for CBLAS functions, which enable C++
+// code calling these functions to be templated.
+namespace kaldi {
+
+
+
+// This has the same interface as cblas_Xgemv, i.e. it does y = alpha M x + beta y;
+// it is just specialized for the case where the vector 'x' has a lot of zeros.
+template<typename Real>
+void cblasext_Xgemv_sparsevec(CBLAS_TRANSPOSE trans, KaldiBlasInt num_rows,
+                              KaldiBlasInt num_cols, Real alpha, const Real *Mdata,
+                              KaldiBlasInt stride, const Real *xdata,
+                              KaldiBlasInt incX, Real beta, Real *ydata,
+                              KaldiBlasInt incY);
+
+
+
+/// This is not really a wrapper for CBLAS as CBLAS does not have this; in future we could
+/// extend this somehow.
+inline void mul_elements(
+    const KaldiBlasInt dim,
+    const double *a,
+    double *b) { // does b *= a, elementwise.
+  double c1, c2, c3, c4;
+  KaldiBlasInt i;
+  for (i = 0; i + 4 <= dim; i += 4) {
+    c1 = a[i] * b[i];
+    c2 = a[i+1] * b[i+1];
+    c3 = a[i+2] * b[i+2];
+    c4 = a[i+3] * b[i+3];
+    b[i] = c1;
+    b[i+1] = c2;
+    b[i+2] = c3;
+    b[i+3] = c4;
+  }
+  for (; i < dim; i++)
+    b[i] *= a[i];
+}
+
+inline void mul_elements(
+    const KaldiBlasInt dim,
+    const float *a,
+    float *b) { // does b *= a, elementwise.
+  float c1, c2, c3, c4;
+  KaldiBlasInt i;
+  for (i = 0; i + 4 <= dim; i += 4) {
+    c1 = a[i] * b[i];
+    c2 = a[i+1] * b[i+1];
+    c3 = a[i+2] * b[i+2];
+    c4 = a[i+3] * b[i+3];
+    b[i] = c1;
+    b[i+1] = c2;
+    b[i+2] = c3;
+    b[i+3] = c4;
+  }
+  for (; i < dim; i++)
+    b[i] *= a[i];
+}
+
+
+
+}
+// namespace kaldi
+
+#endif
diff --git a/src/matrix/cblas-wrappers.h b/src/cblasext/cblas-wrappers.h
similarity index 52%
rename from src/matrix/cblas-wrappers.h
rename to src/cblasext/cblas-wrappers.h
index 2a4e774a9ce..39fa12931ca 100644
--- a/src/matrix/cblas-wrappers.h
+++ b/src/cblasext/cblas-wrappers.h
@@ -1,7 +1,7 @@
-// matrix/cblas-wrappers.h
+// cblasext/cblas-wrappers.h
 
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey);
-//                 Haihua Xu; Wei Shi
+// Copyright 2012-2019  Johns Hopkins University (author: Daniel Povey);
+//                      Haihua Xu; Wei Shi
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -21,281 +21,251 @@
 #define KALDI_MATRIX_CBLAS_WRAPPERS_H_ 1
 
 
-#include <limits>
-#include "matrix/sp-matrix.h"
-#include "matrix/kaldi-vector.h"
-#include "matrix/kaldi-matrix.h"
-#include "matrix/matrix-functions.h"
-#include "matrix/kaldi-blas.h"
+#include "cblasext/kaldi-blas.h"
 
-// Do not include this file directly.  It is to be included
-// by .cc files in this directory.
+// In directories other than this directory, this file is intended to mostly be
+// included from .cc files, not from headers, since it includes cblas headers
+// (via kaldi-blas.h) and those can be quite polluting.
 
+// This file contains templated wrappers for CBLAS functions, which enable C++
+// code calling these functions to be templated.
 namespace kaldi {
 
 
-inline void cblas_Xcopy(const int N, const float *X, const int incX, float *Y,
-                        const int incY) {
+inline void cblas_Xcopy(const KaldiBlasInt N, const float *X, const KaldiBlasInt incX, float *Y,
+                        const KaldiBlasInt incY) {
   cblas_scopy(N, X, incX, Y, incY);
 }
 
-inline void cblas_Xcopy(const int N, const double *X, const int incX, double *Y,
-                        const int incY) {
+inline void cblas_Xcopy(const KaldiBlasInt N, const double *X, const KaldiBlasInt incX, double *Y,
+                        const KaldiBlasInt incY) {
   cblas_dcopy(N, X, incX, Y, incY);
 }
 
-
-inline float cblas_Xasum(const int N, const float *X, const int incX) {
+inline float cblas_Xasum(const KaldiBlasInt N, const float *X, const KaldiBlasInt incX) {
   return cblas_sasum(N, X, incX);
 }
 
-inline double cblas_Xasum(const int N, const double *X, const int incX) {
+inline double cblas_Xasum(const KaldiBlasInt N, const double *X, const KaldiBlasInt incX) {
   return cblas_dasum(N, X, incX);
 }
 
-inline void cblas_Xrot(const int N, float *X, const int incX, float *Y,
-                       const int incY, const float c, const float s) {
+inline void cblas_Xrot(const KaldiBlasInt N, float *X, const KaldiBlasInt incX, float *Y,
+                       const KaldiBlasInt incY, const float c, const float s) {
   cblas_srot(N, X, incX, Y, incY, c, s);
 }
-inline void cblas_Xrot(const int N, double *X, const int incX, double *Y,
-                       const int incY, const double c, const double s) {
+inline void cblas_Xrot(const KaldiBlasInt N, double *X, const KaldiBlasInt incX, double *Y,
+                       const KaldiBlasInt incY, const double c, const double s) {
   cblas_drot(N, X, incX, Y, incY, c, s);
 }
-inline float cblas_Xdot(const int N, const float *const X,
-                        const int incX, const float *const Y,
-                        const int incY) {
+inline float cblas_Xdot(const KaldiBlasInt N, const float *const X,
+                        const KaldiBlasInt incX, const float *const Y,
+                        const KaldiBlasInt incY) {
   return cblas_sdot(N, X, incX, Y, incY);
 }
-inline double cblas_Xdot(const int N, const double *const X,
-                        const int incX, const double *const Y,
-                        const int incY) {
+inline double cblas_Xdot(const KaldiBlasInt N, const double *const X,
+                        const KaldiBlasInt incX, const double *const Y,
+                        const KaldiBlasInt incY) {
   return cblas_ddot(N, X, incX, Y, incY);
 }
-inline void cblas_Xaxpy(const int N, const float alpha, const float *X,
-                        const int incX, float *Y, const int incY) {
+inline void cblas_Xaxpy(const KaldiBlasInt N, const float alpha, const float *X,
+                        const KaldiBlasInt incX, float *Y, const KaldiBlasInt incY) {
   cblas_saxpy(N, alpha, X, incX, Y, incY);
 }
-inline void cblas_Xaxpy(const int N, const double alpha, const double *X,
-                        const int incX, double *Y, const int incY) {
+inline void cblas_Xaxpy(const KaldiBlasInt N, const double alpha, const double *X,
+                        const KaldiBlasInt incX, double *Y, const KaldiBlasInt incY) {
   cblas_daxpy(N, alpha, X, incX, Y, incY);
 }
-inline void cblas_Xscal(const int N, const float alpha, float *data,
-                        const int inc) {
+inline void cblas_Xscal(const KaldiBlasInt N, const float alpha, float *data,
+                        const KaldiBlasInt inc) {
   cblas_sscal(N, alpha, data, inc);
 }
-inline void cblas_Xscal(const int N, const double alpha, double *data,
-                        const int inc) {
+inline void cblas_Xscal(const KaldiBlasInt N, const double alpha, double *data,
+                        const KaldiBlasInt inc) {
   cblas_dscal(N, alpha, data, inc);
 }
-inline void cblas_Xtpmv(MatrixTransposeType trans, const float *Mdata,
-                        const int num_rows, float *y, const int y_inc) {
+inline void cblas_Xtpmv(CBLAS_TRANSPOSE trans, const float *Mdata,
+                        const KaldiBlasInt num_rows, float *y, const KaldiBlasInt y_inc) {
   cblas_stpmv(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
               CblasNonUnit, num_rows, Mdata, y, y_inc);
 }
-inline void cblas_Xtpmv(MatrixTransposeType trans, const double *Mdata,
-                        const int num_rows, double *y, const int y_inc) {
+inline void cblas_Xtpmv(CBLAS_TRANSPOSE trans, const double *Mdata,
+                        const KaldiBlasInt num_rows, double *y, const KaldiBlasInt y_inc) {
   cblas_dtpmv(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
               CblasNonUnit, num_rows, Mdata, y, y_inc);
 }
 
 
-inline void cblas_Xtpsv(MatrixTransposeType trans, const float *Mdata,
-                        const int num_rows, float *y, const int y_inc) {
+inline void cblas_Xtpsv(CBLAS_TRANSPOSE trans, const float *Mdata,
+                        const KaldiBlasInt num_rows, float *y, const KaldiBlasInt y_inc) {
   cblas_stpsv(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
               CblasNonUnit, num_rows, Mdata, y, y_inc);
 }
-inline void cblas_Xtpsv(MatrixTransposeType trans, const double *Mdata,
-                        const int num_rows, double *y, const int y_inc) {
+inline void cblas_Xtpsv(CBLAS_TRANSPOSE trans, const double *Mdata,
+                        const KaldiBlasInt num_rows, double *y, const KaldiBlasInt y_inc) {
   cblas_dtpsv(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
               CblasNonUnit, num_rows, Mdata, y, y_inc);
 }
 
 // x = alpha * M * y + beta * x
-inline void cblas_Xspmv(MatrixIndexT dim, float alpha, const float *Mdata,
-                        const float *ydata, MatrixIndexT ystride,
-                        float beta, float *xdata, MatrixIndexT xstride) {
+inline void cblas_Xspmv(KaldiBlasInt dim, float alpha, const float *Mdata,
+                        const float *ydata, KaldiBlasInt ystride,
+                        float beta, float *xdata, KaldiBlasInt xstride) {
   cblas_sspmv(CblasRowMajor, CblasLower, dim, alpha, Mdata,
               ydata, ystride, beta, xdata, xstride);
 }
-inline void cblas_Xspmv(MatrixIndexT dim, double alpha, const double *Mdata,
-                        const double *ydata, MatrixIndexT ystride,
-                        double beta, double *xdata, MatrixIndexT xstride) {
+inline void cblas_Xspmv(KaldiBlasInt dim, double alpha, const double *Mdata,
+                        const double *ydata, KaldiBlasInt ystride,
+                        double beta, double *xdata, KaldiBlasInt xstride) {
   cblas_dspmv(CblasRowMajor, CblasLower, dim, alpha, Mdata,
               ydata, ystride, beta, xdata, xstride);
 }
 
 // Implements  A += alpha * (x y'  + y x'); A is symmetric matrix.
-inline void cblas_Xspr2(MatrixIndexT dim, float alpha, const float *Xdata,
-                        MatrixIndexT incX, const float *Ydata, MatrixIndexT incY,
+inline void cblas_Xspr2(KaldiBlasInt dim, float alpha, const float *Xdata,
+                        KaldiBlasInt incX, const float *Ydata, KaldiBlasInt incY,
                           float *Adata) {
   cblas_sspr2(CblasRowMajor, CblasLower, dim, alpha, Xdata,
               incX, Ydata, incY, Adata);
 }
-inline void cblas_Xspr2(MatrixIndexT dim, double alpha, const double *Xdata,
-                        MatrixIndexT incX, const double *Ydata, MatrixIndexT incY,
+inline void cblas_Xspr2(KaldiBlasInt dim, double alpha, const double *Xdata,
+                        KaldiBlasInt incX, const double *Ydata, KaldiBlasInt incY,
                         double *Adata) {
   cblas_dspr2(CblasRowMajor, CblasLower, dim, alpha, Xdata,
               incX, Ydata, incY, Adata);
 }
 
 // Implements  A += alpha * (x x'); A is symmetric matrix.
-inline void cblas_Xspr(MatrixIndexT dim, float alpha, const float *Xdata,
-                       MatrixIndexT incX, float *Adata) {
+inline void cblas_Xspr(KaldiBlasInt dim, float alpha, const float *Xdata,
+                       KaldiBlasInt incX, float *Adata) {
   cblas_sspr(CblasRowMajor, CblasLower, dim, alpha, Xdata, incX, Adata);
 }
-inline void cblas_Xspr(MatrixIndexT dim, double alpha, const double *Xdata,
-                       MatrixIndexT incX, double *Adata) {
+inline void cblas_Xspr(KaldiBlasInt dim, double alpha, const double *Xdata,
+                       KaldiBlasInt incX, double *Adata) {
   cblas_dspr(CblasRowMajor, CblasLower, dim, alpha, Xdata, incX, Adata);
 }
 
 // sgemv,dgemv: y = alpha M x + beta y.
-inline void cblas_Xgemv(MatrixTransposeType trans, MatrixIndexT num_rows,
-                        MatrixIndexT num_cols, float alpha, const float *Mdata,
-                        MatrixIndexT stride, const float *xdata,
-                        MatrixIndexT incX, float beta, float *ydata, MatrixIndexT incY) {
+inline void cblas_Xgemv(CBLAS_TRANSPOSE trans, KaldiBlasInt num_rows,
+                        KaldiBlasInt num_cols, float alpha, const float *Mdata,
+                        KaldiBlasInt stride, const float *xdata,
+                        KaldiBlasInt incX, float beta, float *ydata, KaldiBlasInt incY) {
   cblas_sgemv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), num_rows,
               num_cols, alpha, Mdata, stride, xdata, incX, beta, ydata, incY);
 }
-inline void cblas_Xgemv(MatrixTransposeType trans, MatrixIndexT num_rows,
-                        MatrixIndexT num_cols, double alpha, const double *Mdata,
-                        MatrixIndexT stride, const double *xdata,
-                        MatrixIndexT incX, double beta, double *ydata, MatrixIndexT incY) {
+inline void cblas_Xgemv(CBLAS_TRANSPOSE trans, KaldiBlasInt num_rows,
+                        KaldiBlasInt num_cols, double alpha, const double *Mdata,
+                        KaldiBlasInt stride, const double *xdata,
+                        KaldiBlasInt incX, double beta, double *ydata, KaldiBlasInt incY) {
   cblas_dgemv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), num_rows,
               num_cols, alpha, Mdata, stride, xdata, incX, beta, ydata, incY);
 }
 
-// sgbmv, dgmmv: y = alpha M x +  + beta * y.
+// sgbmv, dgbmv: y = alpha M x + beta y.
-inline void cblas_Xgbmv(MatrixTransposeType trans, MatrixIndexT num_rows,
-                        MatrixIndexT num_cols, MatrixIndexT num_below,
-                        MatrixIndexT num_above, float alpha, const float *Mdata,
-                        MatrixIndexT stride, const float *xdata,
-                        MatrixIndexT incX, float beta, float *ydata, MatrixIndexT incY) {
+inline void cblas_Xgbmv(CBLAS_TRANSPOSE trans, KaldiBlasInt num_rows,
+                        KaldiBlasInt num_cols, KaldiBlasInt num_below,
+                        KaldiBlasInt num_above, float alpha, const float *Mdata,
+                        KaldiBlasInt stride, const float *xdata,
+                        KaldiBlasInt incX, float beta, float *ydata, KaldiBlasInt incY) {
   cblas_sgbmv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), num_rows,
               num_cols, num_below, num_above, alpha, Mdata, stride, xdata,
               incX, beta, ydata, incY);
 }
-inline void cblas_Xgbmv(MatrixTransposeType trans, MatrixIndexT num_rows,
-                        MatrixIndexT num_cols, MatrixIndexT num_below,
-                        MatrixIndexT num_above, double alpha, const double *Mdata,
-                        MatrixIndexT stride, const double *xdata,
-                        MatrixIndexT incX, double beta, double *ydata, MatrixIndexT incY) {
+inline void cblas_Xgbmv(CBLAS_TRANSPOSE trans, KaldiBlasInt num_rows,
+                        KaldiBlasInt num_cols, KaldiBlasInt num_below,
+                        KaldiBlasInt num_above, double alpha, const double *Mdata,
+                        KaldiBlasInt stride, const double *xdata,
+                        KaldiBlasInt incX, double beta, double *ydata, KaldiBlasInt incY) {
   cblas_dgbmv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), num_rows,
               num_cols, num_below, num_above, alpha, Mdata, stride, xdata,
               incX, beta, ydata, incY);
 }
 
-
-template<typename Real>
-inline void Xgemv_sparsevec(MatrixTransposeType trans, MatrixIndexT num_rows,
-                            MatrixIndexT num_cols, Real alpha, const Real *Mdata,
-                            MatrixIndexT stride, const Real *xdata,
-                            MatrixIndexT incX, Real beta, Real *ydata,
-                            MatrixIndexT incY) {
-  if (trans == kNoTrans) {
-    if (beta != 1.0) cblas_Xscal(num_rows, beta, ydata, incY);
-    for (MatrixIndexT i = 0; i < num_cols; i++) {
-      Real x_i = xdata[i * incX];
-      if (x_i == 0.0) continue;
-      // Add to ydata, the i'th column of M, times alpha * x_i
-      cblas_Xaxpy(num_rows, x_i * alpha, Mdata + i, stride, ydata, incY);
-    }
-  } else {
-    if (beta != 1.0) cblas_Xscal(num_cols, beta, ydata, incY);
-    for (MatrixIndexT i = 0; i < num_rows; i++) {
-      Real x_i = xdata[i * incX];
-      if (x_i == 0.0) continue;
-      // Add to ydata, the i'th row of M, times alpha * x_i
-      cblas_Xaxpy(num_cols, x_i * alpha,
-                  Mdata + (i * stride), 1, ydata, incY);
-    }
-  }
-}
-
 inline void cblas_Xgemm(const float alpha,
-                        MatrixTransposeType transA,
+                        CBLAS_TRANSPOSE transA,
                         const float *Adata,
-                        MatrixIndexT a_num_rows, MatrixIndexT a_num_cols, MatrixIndexT a_stride,
-                        MatrixTransposeType transB,
-                        const float *Bdata, MatrixIndexT b_stride,
+                        KaldiBlasInt a_num_rows, KaldiBlasInt a_num_cols, KaldiBlasInt a_stride,
+                        CBLAS_TRANSPOSE transB,
+                        const float *Bdata, KaldiBlasInt b_stride,
                         const float beta,
                         float *Mdata,
-                        MatrixIndexT num_rows, MatrixIndexT num_cols,MatrixIndexT stride) {
+                        KaldiBlasInt num_rows, KaldiBlasInt num_cols,KaldiBlasInt stride) {
   cblas_sgemm(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(transA),
               static_cast<CBLAS_TRANSPOSE>(transB),
-              num_rows, num_cols, transA == kNoTrans ? a_num_cols : a_num_rows,
+              num_rows, num_cols, transA == CblasNoTrans ? a_num_cols : a_num_rows,
               alpha, Adata, a_stride, Bdata, b_stride,
               beta, Mdata, stride);
 }
 inline void cblas_Xgemm(const double alpha,
-                        MatrixTransposeType transA,
+                        CBLAS_TRANSPOSE transA,
                         const double *Adata,
-                        MatrixIndexT a_num_rows, MatrixIndexT a_num_cols, MatrixIndexT a_stride,
-                        MatrixTransposeType transB,
-                        const double *Bdata, MatrixIndexT b_stride,
+                        KaldiBlasInt a_num_rows, KaldiBlasInt a_num_cols, KaldiBlasInt a_stride,
+                        CBLAS_TRANSPOSE transB,
+                        const double *Bdata, KaldiBlasInt b_stride,
                         const double beta,
                         double *Mdata,
-                        MatrixIndexT num_rows, MatrixIndexT num_cols,MatrixIndexT stride) {
+                        KaldiBlasInt num_rows, KaldiBlasInt num_cols,KaldiBlasInt stride) {
   cblas_dgemm(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(transA),
               static_cast<CBLAS_TRANSPOSE>(transB),
-              num_rows, num_cols, transA == kNoTrans ? a_num_cols : a_num_rows,
+              num_rows, num_cols, transA == CblasNoTrans ? a_num_cols : a_num_rows,
               alpha, Adata, a_stride, Bdata, b_stride,
               beta, Mdata, stride);
 }
 
 
 inline void cblas_Xsymm(const float alpha,
-                        MatrixIndexT sz,
-                        const float *Adata,MatrixIndexT a_stride,
-                        const float *Bdata,MatrixIndexT b_stride,
+                        KaldiBlasInt sz,
+                        const float *Adata,KaldiBlasInt a_stride,
+                        const float *Bdata,KaldiBlasInt b_stride,
                         const float beta,
-                        float *Mdata, MatrixIndexT stride) {
+                        float *Mdata, KaldiBlasInt stride) {
   cblas_ssymm(CblasRowMajor, CblasLeft, CblasLower, sz, sz, alpha, Adata,
               a_stride, Bdata, b_stride, beta, Mdata, stride);
 }
 inline void cblas_Xsymm(const double alpha,
-                        MatrixIndexT sz,
-                        const double *Adata,MatrixIndexT a_stride,
-                        const double *Bdata,MatrixIndexT b_stride,
+                        KaldiBlasInt sz,
+                        const double *Adata,KaldiBlasInt a_stride,
+                        const double *Bdata,KaldiBlasInt b_stride,
                         const double beta,
-                        double *Mdata, MatrixIndexT stride) {
+                        double *Mdata, KaldiBlasInt stride) {
   cblas_dsymm(CblasRowMajor, CblasLeft, CblasLower, sz, sz, alpha, Adata,
               a_stride, Bdata, b_stride, beta, Mdata, stride);
 }
 // ger: M += alpha x y^T.
-inline void cblas_Xger(MatrixIndexT num_rows, MatrixIndexT num_cols, float alpha,
-                       const float *xdata, MatrixIndexT incX, const float *ydata,
-                       MatrixIndexT incY, float *Mdata, MatrixIndexT stride) {
+inline void cblas_Xger(KaldiBlasInt num_rows, KaldiBlasInt num_cols, float alpha,
+                       const float *xdata, KaldiBlasInt incX, const float *ydata,
+                       KaldiBlasInt incY, float *Mdata, KaldiBlasInt stride) {
   cblas_sger(CblasRowMajor, num_rows, num_cols, alpha, xdata, 1, ydata, 1,
              Mdata, stride);
 }
-inline void cblas_Xger(MatrixIndexT num_rows, MatrixIndexT num_cols, double alpha,
-                       const double *xdata, MatrixIndexT incX, const double *ydata,
-                       MatrixIndexT incY, double *Mdata, MatrixIndexT stride) {
+inline void cblas_Xger(KaldiBlasInt num_rows, KaldiBlasInt num_cols, double alpha,
+                       const double *xdata, KaldiBlasInt incX, const double *ydata,
+                       KaldiBlasInt incY, double *Mdata, KaldiBlasInt stride) {
   cblas_dger(CblasRowMajor, num_rows, num_cols, alpha, xdata, 1, ydata, 1,
              Mdata, stride);
 }
 
 // syrk: symmetric rank-k update.
-// if trans==kNoTrans, then C = alpha A A^T + beta C
+// if trans==CblasNoTrans, then C = alpha A A^T + beta C
 // else C = alpha A^T A + beta C.
 // note: dim_c is dim(C), other_dim_a is the "other" dimension of A, i.e.
-// num-cols(A) if kNoTrans, or num-rows(A) if kTrans.
+// num-cols(A) if CblasNoTrans, or num-rows(A) if CblasTrans.
 // We only need the row-major and lower-triangular option of this, and this
 // is hard-coded.
 inline void cblas_Xsyrk (
-    const MatrixTransposeType trans, const MatrixIndexT dim_c,
-    const MatrixIndexT other_dim_a, const float alpha, const float *A,
-    const MatrixIndexT a_stride, const float beta, float *C,
-    const MatrixIndexT c_stride) {
+    const CBLAS_TRANSPOSE trans, const KaldiBlasInt dim_c,
+    const KaldiBlasInt other_dim_a, const float alpha, const float *A,
+    const KaldiBlasInt a_stride, const float beta, float *C,
+    const KaldiBlasInt c_stride) {
   cblas_ssyrk(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
               dim_c, other_dim_a, alpha, A, a_stride, beta, C, c_stride);
 }
 
 inline void cblas_Xsyrk(
-    const MatrixTransposeType trans, const MatrixIndexT dim_c,
-    const MatrixIndexT other_dim_a, const double alpha, const double *A,
-    const MatrixIndexT a_stride, const double beta, double *C,
-    const MatrixIndexT c_stride) {
+    const CBLAS_TRANSPOSE trans, const KaldiBlasInt dim_c,
+    const KaldiBlasInt other_dim_a, const double alpha, const double *A,
+    const KaldiBlasInt a_stride, const double beta, double *C,
+    const KaldiBlasInt c_stride) {
   cblas_dsyrk(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
               dim_c, other_dim_a, alpha, A, a_stride, beta, C, c_stride);
 }
@@ -305,7 +275,7 @@ inline void cblas_Xsyrk(
 /// elementwise multiplication.  We miss some of the arguments out of this
 /// wrapper.
 inline void cblas_Xsbmv1(
-    const MatrixIndexT dim,
+    const KaldiBlasInt dim,
     const double *A,
     const double alpha,
     const double *x,
@@ -316,7 +286,7 @@ inline void cblas_Xsbmv1(
 }
 
 inline void cblas_Xsbmv1(
-    const MatrixIndexT dim,
+    const KaldiBlasInt dim,
     const float *A,
     const float alpha,
     const float *x,
@@ -326,49 +296,6 @@ inline void cblas_Xsbmv1(
               1, x, 1, beta, y, 1);
 }
 
-/// This is not really a wrapper for CBLAS as CBLAS does not have this; in future we could
-/// extend this somehow.
-inline void mul_elements(
-    const MatrixIndexT dim,
-    const double *a,
-    double *b) { // does b *= a, elementwise.
-  double c1, c2, c3, c4;
-  MatrixIndexT i;
-  for (i = 0; i + 4 <= dim; i += 4) {
-    c1 = a[i] * b[i];
-    c2 = a[i+1] * b[i+1];
-    c3 = a[i+2] * b[i+2];
-    c4 = a[i+3] * b[i+3];
-    b[i] = c1;
-    b[i+1] = c2;
-    b[i+2] = c3;
-    b[i+3] = c4;
-  }
-  for (; i < dim; i++)
-    b[i] *= a[i];
-}
-
-inline void mul_elements(
-    const MatrixIndexT dim,
-    const float *a,
-    float *b) { // does b *= a, elementwise.
-  float c1, c2, c3, c4;
-  MatrixIndexT i;
-  for (i = 0; i + 4 <= dim; i += 4) {
-    c1 = a[i] * b[i];
-    c2 = a[i+1] * b[i+1];
-    c3 = a[i+2] * b[i+2];
-    c4 = a[i+3] * b[i+3];
-    b[i] = c1;
-    b[i+1] = c2;
-    b[i+2] = c3;
-    b[i+3] = c4;
-  }
-  for (; i < dim; i++)
-    b[i] *= a[i];
-}
-
-
 
 // add clapack here
 #if !defined(HAVE_ATLAS)
@@ -441,36 +368,36 @@ void inline clapack_Xsptrf(KaldiBlasInt *num_rows, double *Mdata,
   dsptrf_(const_cast<char *>("U"), num_rows, Mdata, ipiv, result);
 }
 #else
-inline void clapack_Xgetrf(MatrixIndexT num_rows, MatrixIndexT num_cols,
-                           float *Mdata, MatrixIndexT stride,
-                           int *pivot, int *result) {
+inline void clapack_Xgetrf(KaldiBlasInt num_rows, KaldiBlasInt num_cols,
+                           float *Mdata, KaldiBlasInt stride,
+                           KaldiBlasInt *pivot, KaldiBlasInt *result) {
   *result = clapack_sgetrf(CblasColMajor, num_rows, num_cols,
                               Mdata, stride, pivot);
 }
 
-inline void clapack_Xgetrf(MatrixIndexT num_rows, MatrixIndexT num_cols,
-                           double *Mdata, MatrixIndexT stride,
-                           int *pivot, int *result) {
+inline void clapack_Xgetrf(KaldiBlasInt num_rows, KaldiBlasInt num_cols,
+                           double *Mdata, KaldiBlasInt stride,
+                           KaldiBlasInt *pivot, KaldiBlasInt *result) {
   *result = clapack_dgetrf(CblasColMajor, num_rows, num_cols,
                               Mdata, stride, pivot);
 }
 //
-inline int clapack_Xtrtri(int num_rows, float *Mdata, MatrixIndexT stride) {
+inline KaldiBlasInt clapack_Xtrtri(KaldiBlasInt num_rows, float *Mdata, KaldiBlasInt stride) {
   return  clapack_strtri(CblasColMajor, CblasUpper, CblasNonUnit, num_rows,
                               Mdata, stride);
 }
 
-inline int clapack_Xtrtri(int num_rows, double *Mdata, MatrixIndexT stride) {
+inline KaldiBlasInt clapack_Xtrtri(KaldiBlasInt num_rows, double *Mdata, KaldiBlasInt stride) {
   return  clapack_dtrtri(CblasColMajor, CblasUpper, CblasNonUnit, num_rows,
                               Mdata, stride);
 }
 //
-inline void clapack_Xgetri(MatrixIndexT num_rows, float *Mdata, MatrixIndexT stride,
-                      int *pivot, int *result) {
+inline void clapack_Xgetri(KaldiBlasInt num_rows, float *Mdata, KaldiBlasInt stride,
+                      KaldiBlasInt *pivot, KaldiBlasInt *result) {
   *result = clapack_sgetri(CblasColMajor, num_rows, Mdata, stride, pivot);
 }
-inline void clapack_Xgetri(MatrixIndexT num_rows, double *Mdata, MatrixIndexT stride,
-                      int *pivot, int *result) {
+inline void clapack_Xgetri(KaldiBlasInt num_rows, double *Mdata, KaldiBlasInt stride,
+                      KaldiBlasInt *pivot, KaldiBlasInt *result) {
   *result = clapack_dgetri(CblasColMajor, num_rows, Mdata, stride, pivot);
 }
 #endif
diff --git a/src/matrix/kaldi-blas.h b/src/cblasext/kaldi-blas.h
similarity index 96%
rename from src/matrix/kaldi-blas.h
rename to src/cblasext/kaldi-blas.h
index 8a06540bba2..88ba12a0be1 100644
--- a/src/matrix/kaldi-blas.h
+++ b/src/cblasext/kaldi-blas.h
@@ -122,10 +122,8 @@ typedef integer KaldiBlasInt;
 #ifdef HAVE_MKL
 typedef MKL_INT KaldiBlasInt;
 #endif
-
 #ifdef HAVE_ATLAS
-// in this case there is no need for KaldiBlasInt-- this typedef is only needed
-// for Svd code which is not included in ATLAS (we re-implement it).
+typedef int KaldiBlasInt;
 #endif
 
 
diff --git a/src/matrix/Makefile b/src/matrix/Makefile
index e39be1ffec9..59c28419d81 100644
--- a/src/matrix/Makefile
+++ b/src/matrix/Makefile
@@ -2,7 +2,7 @@
 
 all:
 
-OPENFST_CXXFLAGS = 
+OPENFST_CXXFLAGS =
 OPENFST_LDLIBS =
 
 include ../kaldi.mk
@@ -18,7 +18,7 @@ OBJFILES = kaldi-matrix.o kaldi-vector.o packed-matrix.o sp-matrix.o tp-matrix.o
 
 LIBNAME = kaldi-matrix
 
-ADDLIBS = ../base/kaldi-base.a 
+ADDLIBS = ../base/kaldi-base.a ../cblasext/kaldi-cblasext.a
 
 include ../makefiles/default_rules.mk
 
diff --git a/src/matrix/jama-svd.h b/src/matrix/jama-svd.h
index 8304dac63e3..33a3ef9e083 100644
--- a/src/matrix/jama-svd.h
+++ b/src/matrix/jama-svd.h
@@ -31,7 +31,7 @@
 
 #include "matrix/kaldi-matrix.h"
 #include "matrix/sp-matrix.h"
-#include "matrix/cblas-wrappers.h"
+#include "cblasext/cblas-wrappers.h"
 
 namespace kaldi {
 
diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc
index 16b75d08c1a..3b079b27781 100644
--- a/src/matrix/kaldi-matrix.cc
+++ b/src/matrix/kaldi-matrix.cc
@@ -22,11 +22,14 @@
 // limitations under the License.
 
 #include "matrix/kaldi-matrix.h"
+#include "matrix/kaldi-vector.h"
 #include "matrix/sp-matrix.h"
 #include "matrix/jama-svd.h"
 #include "matrix/jama-eig.h"
 #include "matrix/compressed-matrix.h"
 #include "matrix/sparse-matrix.h"
+#include "cblasext/cblas-wrappers.h"
+#include "cblasext/cblas-extensions.h"
 
 static_assert(int(kaldi::kNoTrans) == int(CblasNoTrans) && int(kaldi::kTrans) == int(CblasTrans),
     "kaldi::kNoTrans and kaldi::kTrans must be equal to the appropriate CBLAS library constants!");
@@ -181,8 +184,10 @@ void MatrixBase<Real>::AddMatMat(const Real alpha,
                || (transA == kTrans && transB == kTrans && A.num_rows_ == B.num_cols_ && A.num_cols_ == num_rows_ && B.num_rows_ == num_cols_));
   KALDI_ASSERT(&A !=  this && &B != this);
   if (num_rows_ == 0) return;
-  cblas_Xgemm(alpha, transA, A.data_, A.num_rows_, A.num_cols_, A.stride_,
-              transB, B.data_, B.stride_, beta, data_, num_rows_, num_cols_, stride_);
+  cblas_Xgemm(alpha, static_cast<CBLAS_TRANSPOSE>(transA),
+              A.data_, A.num_rows_, A.num_cols_, A.stride_,
+              static_cast<CBLAS_TRANSPOSE>(transB),
+              B.data_, B.stride_, beta, data_, num_rows_, num_cols_, stride_);
 
 }
 
@@ -259,7 +264,8 @@ void MatrixBase<Real>::SymAddMat2(const Real alpha,
   MatrixIndexT A_other_dim = (transA == kNoTrans ? A.num_cols_ : A.num_rows_);
 
   // This function call is hard-coded to update the lower triangle.
-  cblas_Xsyrk(transA, num_rows_, A_other_dim, alpha, A.Data(),
+  cblas_Xsyrk(static_cast<CBLAS_TRANSPOSE>(transA),
+              num_rows_, A_other_dim, alpha, A.Data(),
               A.Stride(), beta, this->data_, this->stride_);
 }
 
@@ -288,16 +294,18 @@ void MatrixBase<Real>::AddMatSmat(const Real alpha,
     for (MatrixIndexT c = 0; c < num_cols; c++) {
       // for each column of *this, do
       // [this column] = [alpha * A * this column of B] + [beta * this column]
-      Xgemv_sparsevec(transA, Arows, Acols, alpha, Adata, Astride,
-                      Bdata + c, Bstride, beta, data + c, stride);
+      cblasext_Xgemv_sparsevec(static_cast<CBLAS_TRANSPOSE>(transA),
+                               Arows, Acols, alpha, Adata, Astride,
+                               Bdata + c, Bstride, beta, data + c, stride);
     }
   } else {
     // Iterate over the columns of *this and the rows of B.
     for (MatrixIndexT c = 0; c < num_cols; c++) {
       // for each column of *this, do
       // [this column] = [alpha * A * this row of B] + [beta * this column]
-      Xgemv_sparsevec(transA, Arows, Acols, alpha, Adata, Astride,
-                      Bdata + (c * Bstride), 1, beta, data + c, stride);
+      cblasext_Xgemv_sparsevec(static_cast<CBLAS_TRANSPOSE>(transA),
+                               Arows, Acols, alpha, Adata, Astride,
+                               Bdata + (c * Bstride), 1, beta, data + c, stride);
     }
   }
 }
@@ -325,16 +333,18 @@ void MatrixBase<Real>::AddSmatMat(const Real alpha,
     for (MatrixIndexT r = 0; r < num_rows; r++) {
       // for each row of *this, do
       // [this row] = [alpha * (this row of A) * B^T] + [beta * this row]
-      Xgemv_sparsevec(invTransB, Brows, Bcols, alpha, Bdata, Bstride,
-                      Adata + (r * Astride), 1, beta, data + (r * stride), 1);
+      cblasext_Xgemv_sparsevec(static_cast<CBLAS_TRANSPOSE>(invTransB),
+                               Brows, Bcols, alpha, Bdata, Bstride,
+                               Adata + (r * Astride), 1, beta, data + (r * stride), 1);
     }
   } else {
     // Iterate over the rows of *this and the columns of A.
     for (MatrixIndexT r = 0; r < num_rows; r++) {
       // for each row of *this, do
       // [this row] = [alpha * (this column of A) * B^T] + [beta * this row]
-      Xgemv_sparsevec(invTransB, Brows, Bcols, alpha, Bdata, Bstride,
-                      Adata + r, Astride, beta, data + (r * stride), 1);
+      cblasext_Xgemv_sparsevec(static_cast<CBLAS_TRANSPOSE>(invTransB),
+                               Brows, Bcols, alpha, Bdata, Bstride,
+                               Adata + r, Astride, beta, data + (r * stride), 1);
     }
   }
 }
@@ -661,8 +671,7 @@ void MatrixBase<Real>::AddMatMatElements(const Real alpha,
 template<typename Real>
 void MatrixBase<Real>::LapackGesvd(VectorBase<Real> *s, MatrixBase<Real> *U_in,
                                    MatrixBase<Real> *V_in) {
-  KALDI_ASSERT(s != NULL && U_in != this && V_in != this &&
-               s->Stride() == 1);
+  KALDI_ASSERT(s != NULL && U_in != this && V_in != this);
 
   Matrix<Real> tmpU, tmpV;
   if (U_in == NULL) tmpU.Resize(this->num_rows_, 1);  // work-space if U_in empty.
diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc
index d1d66b7bda4..1017154c43f 100644
--- a/src/matrix/kaldi-vector.cc
+++ b/src/matrix/kaldi-vector.cc
@@ -25,12 +25,14 @@
 
 #include <algorithm>
 #include <string>
-#include "matrix/cblas-wrappers.h"
+#include "cblasext/cblas-wrappers.h"
+#include "cblasext/cblas-extensions.h"
 #include "matrix/kaldi-vector.h"
 #include "matrix/kaldi-matrix.h"
 #include "matrix/sp-matrix.h"
 #include "matrix/sparse-matrix.h"
 
+
 namespace kaldi {
 
 template<typename Real> inline const Real* Get64Ones() {
@@ -106,7 +108,8 @@ void VectorBase<Real>::AddMatVec(const Real alpha,
   KALDI_ASSERT((trans == kNoTrans && M.NumCols() == v.dim_ && M.NumRows() == dim_)
                || (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_));
   KALDI_ASSERT(&v != this);
-  cblas_Xgemv(trans, M.NumRows(), M.NumCols(), alpha, M.Data(), M.Stride(),
+  cblas_Xgemv(static_cast<CBLAS_TRANSPOSE>(trans),
+              M.NumRows(), M.NumCols(), alpha, M.Data(), M.Stride(),
               v.Data(), 1, beta, data_, 1);
 }
 
@@ -119,8 +122,9 @@ void VectorBase<Real>::AddMatSvec(const Real alpha,
   KALDI_ASSERT((trans == kNoTrans && M.NumCols() == v.dim_ && M.NumRows() == dim_)
                || (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_));
   KALDI_ASSERT(&v != this);
-  Xgemv_sparsevec(trans, M.NumRows(), M.NumCols(), alpha, M.Data(), M.Stride(),
-                  v.Data(), 1, beta, data_, 1);
+  cblasext_Xgemv_sparsevec(static_cast<CBLAS_TRANSPOSE>(trans),
+                           M.NumRows(), M.NumCols(), alpha, M.Data(), M.Stride(),
+                           v.Data(), 1, beta, data_, 1);
   return;
   /*
   MatrixIndexT this_dim = this->dim_, v_dim = v.dim_,
@@ -161,14 +165,16 @@ template<typename Real>
 void VectorBase<Real>::MulTp(const TpMatrix<Real> &M,
                               const MatrixTransposeType trans) {
   KALDI_ASSERT(M.NumRows() == dim_);
-  cblas_Xtpmv(trans, M.Data(), M.NumRows(), data_, 1);
+  cblas_Xtpmv(static_cast<CBLAS_TRANSPOSE>(trans),
+              M.Data(), M.NumRows(), data_, 1);
 }
 
 template<typename Real>
 void VectorBase<Real>::Solve(const TpMatrix<Real> &M,
                         const MatrixTransposeType trans) {
   KALDI_ASSERT(M.NumRows() == dim_);
-  cblas_Xtpsv(trans, M.Data(), M.NumRows(), data_, 1);
+  cblas_Xtpsv(static_cast<CBLAS_TRANSPOSE>(trans),
+              M.Data(), M.NumRows(), data_, 1);
 }
 
 
@@ -737,7 +743,7 @@ void VectorBase<Real>::AddRowSumMat(Real alpha, const MatrixBase<Real> &M, Real
   for (MatrixIndexT row_offset = 0; row_offset < num_rows; row_offset += 64) {
     MatrixIndexT this_num_rows =
         std::min<MatrixIndexT>(64, num_rows - row_offset);
-    cblas_Xgemv(kTrans, this_num_rows, M.NumCols(), alpha,
+    cblas_Xgemv(CblasTrans, this_num_rows, M.NumCols(), alpha,
                 M.RowData(row_offset), M.Stride(), ones, 1,
                 beta, data_, 1);
     beta = 1.0;
@@ -756,7 +762,7 @@ void VectorBase<Real>::AddColSumMat(Real alpha, const MatrixBase<Real> &M, Real
   for (MatrixIndexT col_offset = 0; col_offset < num_cols; col_offset += 64) {
     MatrixIndexT this_num_cols =
         std::min<MatrixIndexT>(64, num_cols - col_offset);
-    cblas_Xgemv(kNoTrans, M.NumRows(), this_num_cols, alpha,
+    cblas_Xgemv(CblasNoTrans, M.NumRows(), this_num_cols, alpha,
                 M.Data() + col_offset, M.Stride(),
                 ones, 1,
                 beta, data_, 1);
@@ -1009,7 +1015,7 @@ void VectorBase<Real>::AddVecVec(Real alpha, const VectorBase<Real> &v,
   KALDI_ASSERT(v.data_ != this->data_ && r.data_ != this->data_);
   // We pretend that v is a band-diagonal matrix.
   KALDI_ASSERT(dim_ == v.dim_ && dim_ == r.dim_);
-  cblas_Xgbmv(kNoTrans, dim_, dim_, 0, 0, alpha, v.data_, 1,
+  cblas_Xgbmv(CblasNoTrans, dim_, dim_, 0, 0, alpha, v.data_, 1,
               r.data_, 1, beta, this->data_, 1);
 }
 
diff --git a/src/matrix/matrix-common.h b/src/matrix/matrix-common.h
index b0dad6a0cdb..5d8e264c4a5 100644
--- a/src/matrix/matrix-common.h
+++ b/src/matrix/matrix-common.h
@@ -24,11 +24,19 @@
 // files in this directory.
 
 #include "base/kaldi-common.h"
+#include "cblasext/kaldi-blas.h"
+#include "cblasext/cblas-wrappers.h"
 
 namespace kaldi {
-// this enums equal to CblasTrans and CblasNoTrans constants from CBLAS library
-// we are writing them as literals because we don't want to include here matrix/kaldi-blas.h,
-// which puts many symbols into global scope (like "real") via the header f2c.h
+
+
+// Define Kaldi's MatrixTransposeType (which is basically equivalent to enum
+// CBLAS_TRANSPOSE) in case we're including this in a context where it was not
+// already defined.  This is part of a kludge to be able to use this enum while
+// not including the cblas headers in our headers; cblas headers can cause
+// problems because they can bring in a lot of junk (types in the global
+// namespace; preprocessor macros), and there are different flavors of cblas
+// which might put different *kinds* of junk there.
 typedef enum {
   kTrans    = 112, // = CblasTrans
   kNoTrans  = 111  // = CblasNoTrans
diff --git a/src/matrix/matrix-lib-test.cc b/src/matrix/matrix-lib-test.cc
index 8097ab119b5..afc0340c310 100644
--- a/src/matrix/matrix-lib-test.cc
+++ b/src/matrix/matrix-lib-test.cc
@@ -28,7 +28,7 @@
 #include <numeric>
 #include <time.h> // This is only needed for UnitTestSvdSpeed, you can
 // comment it (and that function) out if it causes problems.
-#include <matrix/cblas-wrappers.h>
+#include <cblasext/cblas-wrappers.h>
 
 namespace kaldi {
 
diff --git a/src/matrix/packed-matrix.cc b/src/matrix/packed-matrix.cc
index 80bf5891998..40aed24f938 100644
--- a/src/matrix/packed-matrix.cc
+++ b/src/matrix/packed-matrix.cc
@@ -23,9 +23,10 @@
  *
  * Implementation of specialized PackedMatrix template methods
  */
-#include "matrix/cblas-wrappers.h"
+#include "cblasext/cblas-wrappers.h"
 #include "matrix/packed-matrix.h"
 #include "matrix/kaldi-vector.h"
+#include "matrix/kaldi-matrix.h"
 
 namespace kaldi {
 
@@ -49,7 +50,7 @@ void PackedMatrix<Real>::SetRandn() {
   Real *data = data_;
   size_t dim = num_rows_, size = ((dim*(dim+1))/2);
   for (size_t i = 0; i < size; i++)
-    data[i] = RandGauss();  
+    data[i] = RandGauss();
 }
 
 template<typename Real>
@@ -242,7 +243,7 @@ void PackedMatrix<Real>::Write(std::ostream &os, bool binary) const {
   KALDI_ASSERT(this->NumRows() == (MatrixIndexT) size);
   MatrixIndexT num_elems = ((size+1)*(MatrixIndexT)size)/2;
 
-  if(binary) {  
+  if(binary) {
     std::string my_token = (sizeof(Real) == 4 ? "FP" : "DP");
     WriteToken(os, binary, my_token);
     WriteBasicType(os, binary, size);
@@ -256,7 +257,7 @@ void PackedMatrix<Real>::Write(std::ostream &os, bool binary) const {
     else {
       os<<"[\n";
       MatrixIndexT i = 0;
-      for (int32 j = 0; j < size; j++) {  
+      for (int32 j = 0; j < size; j++) {
         for (int32 k = 0; k < j + 1; k++) {
           WriteBasicType(os, binary, data_[i++]);
         }
@@ -337,7 +338,7 @@ void PackedMatrix<Real>::Read(std::istream& is, bool binary, bool add) {
       goto bad;
     }
     //new format it is
-    is_new_format = true; 
+    is_new_format = true;
   }
   if(!is_new_format) {
     ReadBasicType(is, binary, &size);  // throws on error.
@@ -378,7 +379,7 @@ void PackedMatrix<Real>::Read(std::istream& is, bool binary, bool add) {
         }
         //now process the data:
         num_lines = int32(sqrt(data.size()*2));
-        
+
         KALDI_ASSERT(data.size() == num_lines*(num_lines+1)/2);
 
         this->Resize(num_lines);
@@ -392,12 +393,12 @@ void PackedMatrix<Real>::Read(std::istream& is, bool binary, bool add) {
         //std::cout<<"here!!!!!hxu!!!!!"<<std::endl;
       }
       else if ( (i >= '0' && i <= '9') || i == '-' ) {  // A number...
-        Real r; 
+        Real r;
         is >> r;
         if (is.fail()) {
           specific_error << "Stream failure/EOF while reading matrix data.";
           goto bad;
-        } 
+        }
         data.push_back(r);
       }
       else if (isspace(i)) {
@@ -415,9 +416,9 @@ void PackedMatrix<Real>::Read(std::istream& is, bool binary, bool add) {
         } else {
           specific_error << "Expecting numeric matrix data, got " << str;
           goto bad;
-        } 
-      }       
-    } 
+        }
+      }
+    }
   }
 bad:
   KALDI_ERR << "Failed to read packed matrix from stream. " << specific_error.str()
diff --git a/src/matrix/qr.cc b/src/matrix/qr.cc
index 8912d2892ce..efa7a301527 100644
--- a/src/matrix/qr.cc
+++ b/src/matrix/qr.cc
@@ -23,7 +23,7 @@
 #include "matrix/kaldi-vector.h"
 #include "matrix/kaldi-matrix.h"
 #include "matrix/matrix-functions.h"
-#include "matrix/cblas-wrappers.h"
+#include "cblasext/cblas-wrappers.h"
 
 // This file contains an implementation of the Symmetric QR Algorithm
 // for the symmetric eigenvalue problem.  See Golub and Van Loan,
@@ -201,7 +201,7 @@ void SpMatrix<Real>::Tridiagonalize(MatrixBase<Real> *Q) {
       // We do (in Matlab notation):
       // Q(0:k-1,:) = (I - beta v v') * Q, i.e.:
       // Q(:,0:i-1) += -beta v (v' Q(:,0:k-1)v .. let x = -beta Q(0:k-1,:)^T v.
-      cblas_Xgemv(kTrans, k, n, -beta, qdata, qstride, v, 1, 0.0, x, 1);
+      cblas_Xgemv(CblasTrans, k, n, -beta, qdata, qstride, v, 1, 0.0, x, 1);
       // now x = -beta Q(:,0:k-1) v.
       // The next line does: Q(:,0:k-1) += v x'.
       cblas_Xger(k, n, 1.0, v, 1, x, 1, qdata, qstride);
diff --git a/src/matrix/sp-matrix.cc b/src/matrix/sp-matrix.cc
index d63e1b1aed1..32c8ccd9df0 100644
--- a/src/matrix/sp-matrix.cc
+++ b/src/matrix/sp-matrix.cc
@@ -25,7 +25,8 @@
 #include "matrix/kaldi-vector.h"
 #include "matrix/kaldi-matrix.h"
 #include "matrix/matrix-functions.h"
-#include "matrix/cblas-wrappers.h"
+#include "cblasext/cblas-wrappers.h"
+#include "cblasext/cblas-extensions.h"
 
 namespace kaldi {
 
@@ -1010,13 +1011,15 @@ void SpMatrix<Real>::AddMat2Sp(
   if (transM == kNoTrans) {
     for (MatrixIndexT r = 0; r < dim; r++, p_row_data += r) {
       cblas_Xspmv(A.NumRows(), 1.0, p_A_data, M.RowData(r), 1, 0.0, tmp_vec_data, 1);
-      cblas_Xgemv(transM, r+1, M_other_dim, alpha, M_data, M_stride,
+      cblas_Xgemv(static_cast<CBLAS_TRANSPOSE>(transM),
+                  r+1, M_other_dim, alpha, M_data, M_stride,
                   tmp_vec_data, 1, beta, p_row_data, 1);
     }
   } else {
     for (MatrixIndexT r = 0; r < dim; r++, p_row_data += r) {
       cblas_Xspmv(A.NumRows(), 1.0, p_A_data, M.Data() + r, M.Stride(), 0.0, tmp_vec_data, 1);
-      cblas_Xgemv(transM, M_other_dim, r+1, alpha, M_data, M_stride,
+      cblas_Xgemv(static_cast<CBLAS_TRANSPOSE>(transM),
+                  M_other_dim, r+1, alpha, M_data, M_stride,
                   tmp_vec_data, 1, beta, p_row_data, 1);
     }
   }
@@ -1064,15 +1067,17 @@ void SpMatrix<Real>::AddSmat2Sp(
     // The column of M^T corresponds to the rows of the supplied matrix.
     for (MatrixIndexT i = 0; i < dim; i++, data += i) {
       MatrixIndexT num_rows = i + 1, num_cols = Adim;
-      Xgemv_sparsevec(kNoTrans, num_rows, num_cols, alpha, MAdata,
-                      temp_MA_stride, Mdata + (i * Mstride), 1, beta, data, 1);
+      cblasext_Xgemv_sparsevec(CblasNoTrans, num_rows, num_cols, alpha, MAdata,
+                               temp_MA_stride, Mdata + (i * Mstride),
+                               1, beta, data, 1);
     }
   } else {
     // The column of M^T corresponds to the columns of the supplied matrix.
     for (MatrixIndexT i = 0; i < dim; i++, data += i) {
       MatrixIndexT num_rows = i + 1, num_cols = Adim;
-      Xgemv_sparsevec(kNoTrans, num_rows, num_cols, alpha, MAdata,
-                      temp_MA_stride, Mdata + i, Mstride, beta, data, 1);
+      cblasext_Xgemv_sparsevec(CblasNoTrans, num_rows, num_cols, alpha, MAdata,
+                               temp_MA_stride, Mdata + i, Mstride,
+                               beta, data, 1);
     }
   }
 }
@@ -1129,7 +1134,8 @@ void SpMatrix<Real>::AddMat2(const Real alpha, const MatrixBase<Real> &M,
   // doesn't dominate O(N) time.
 
   // This function call is hard-coded to update the lower triangle.
-  cblas_Xsyrk(transM, this_dim, m_other_dim, alpha, M.Data(),
+  cblas_Xsyrk(static_cast<CBLAS_TRANSPOSE>(transM),
+              this_dim, m_other_dim, alpha, M.Data(),
               M.Stride(), beta, temp_mat.Data(), temp_mat.Stride());
 
   this->CopyFromMat(temp_mat, kTakeLower);
diff --git a/src/matrix/tp-matrix.cc b/src/matrix/tp-matrix.cc
index 6e34dc643e9..322d3253c6e 100644
--- a/src/matrix/tp-matrix.cc
+++ b/src/matrix/tp-matrix.cc
@@ -21,7 +21,7 @@
 #include "matrix/tp-matrix.h"
 #include "matrix/sp-matrix.h"
 #include "matrix/kaldi-matrix.h"
-#include "matrix/cblas-wrappers.h"
+#include "cblasext/cblas-wrappers.h"
 
 
 namespace kaldi {

From b254c833e689803ab36556f880a0100e241562d8 Mon Sep 17 00:00:00 2001
From: Dan Povey <dpovey@gmail.com>
Date: Mon, 17 Jun 2019 18:11:21 -0400
Subject: [PATCH 043/163] [src] Add more things to cblasext

---
 src/cblasext/cblas-extensions.cc | 57 +++++++++++++++++++++++++++++
 src/cblasext/cblas-extensions.h  | 62 ++++++++++++--------------------
 src/matrix/kaldi-matrix.cc       | 17 +++------
 3 files changed, 84 insertions(+), 52 deletions(-)

diff --git a/src/cblasext/cblas-extensions.cc b/src/cblasext/cblas-extensions.cc
index c7d65996635..8b5ea941081 100644
--- a/src/cblasext/cblas-extensions.cc
+++ b/src/cblasext/cblas-extensions.cc
@@ -63,4 +63,61 @@ void cblasext_Xgemv_sparsevec(CBLAS_TRANSPOSE trans, KaldiBlasInt num_rows,
                               KaldiBlasInt incY);
 
 
+template <typename Real>
+void cblasext_mul_elements_vec(
+    const KaldiBlasInt dim,  // number of elements read from a and updated in b
+    const Real *a,  // multiplier array; only read, never written
+    Real *b) { // does b *= a, elementwise; b is overwritten in place.
+  Real c1, c2, c3, c4;
+  KaldiBlasInt i;
+  for (i = 0; i + 4 <= dim; i += 4) {  // main loop: four elements per iteration
+    c1 = a[i] * b[i];  // all four products are computed first...
+    c2 = a[i+1] * b[i+1];
+    c3 = a[i+2] * b[i+2];
+    c4 = a[i+3] * b[i+3];
+    b[i] = c1;  // ...and only then stored back into b
+    b[i+1] = c2;
+    b[i+2] = c3;
+    b[i+3] = c4;
+  }
+  for (; i < dim; i++)  // tail: elements left over by the 4-wide loop above
+    b[i] *= a[i];
+}
+
+template void cblasext_mul_elements_vec(const KaldiBlasInt dim,
+                                        const float *a, float *b);
+template void cblasext_mul_elements_vec(const KaldiBlasInt dim,
+                                        const double *a, double *b);
+
+
+template <typename Real>
+void cblasext_mul_elements_mat(  // does B *= A elementwise; B is modified in place
+    const Real *Adata,  // first element of A; only read
+    KaldiBlasInt a_num_rows,  // row count, used for both A and B
+    KaldiBlasInt a_num_cols,  // column count, used for both A and B
+    KaldiBlasInt a_stride,  // element offset between consecutive rows of A
+    Real *Bdata,  // first element of B; overwritten with the products
+    KaldiBlasInt b_stride) {  // element offset between consecutive rows of B
+  if (a_num_cols == a_stride && a_num_cols == b_stride) {  // neither matrix has gaps between rows
+    cblasext_mul_elements_vec(a_num_rows * a_num_cols, Adata, Bdata);  // so treat both as one long vector
+  } else {  // otherwise multiply one row at a time
+    for (KaldiBlasInt i = 0; i < a_num_rows; i++) {
+      cblasext_mul_elements_vec(a_num_cols, Adata, Bdata);  // current row of B *= current row of A
+      Adata += a_stride;  // step to the start of the next row of A
+      Bdata += b_stride;  // step to the start of the next row of B
+    }
+  }
+}
+
+
+template void cblasext_mul_elements_mat(
+    const float *Adata, KaldiBlasInt a_num_rows,
+    KaldiBlasInt a_num_cols, KaldiBlasInt a_stride,
+    float *Bdata, KaldiBlasInt b_stride);
+template void cblasext_mul_elements_mat(
+    const double *Adata, KaldiBlasInt a_num_rows,
+    KaldiBlasInt a_num_cols, KaldiBlasInt a_stride,
+    double *Bdata, KaldiBlasInt b_stride);
+
+
 } // namespace kaldi
diff --git a/src/cblasext/cblas-extensions.h b/src/cblasext/cblas-extensions.h
index a9346dc0330..aaf12004c25 100644
--- a/src/cblasext/cblas-extensions.h
+++ b/src/cblasext/cblas-extensions.h
@@ -45,47 +45,31 @@ void cblasext_Xgemv_sparsevec(CBLAS_TRANSPOSE trans, KaldiBlasInt num_rows,
 
 
 
-/// This is not really a wrapper for CBLAS as CBLAS does not have this; in future we could
-/// extend this somehow.
-inline void mul_elements(
+/**
+   Does, elementwise for 0 <= i < dim,
+     b[i] *= a[i].
+*/
+template <typename Real>
+void cblasext_mul_elements_vec(
     const KaldiBlasInt dim,
-    const double *a,
-    double *b) { // does b *= a, elementwise.
-  double c1, c2, c3, c4;
-  KaldiBlasInt i;
-  for (i = 0; i + 4 <= dim; i += 4) {
-    c1 = a[i] * b[i];
-    c2 = a[i+1] * b[i+1];
-    c3 = a[i+2] * b[i+2];
-    c4 = a[i+3] * b[i+3];
-    b[i] = c1;
-    b[i+1] = c2;
-    b[i+2] = c3;
-    b[i+3] = c4;
-  }
-  for (; i < dim; i++)
-    b[i] *= a[i];
-}
+    const Real *a,
+    Real *b);
+
+
+/**
+   Does b *= a, elementwise, where a and b are matrices of the same dimension.
+   Does not currently support transpose.
+
+   Requires that a and b do not overlap (but this is not checked).
+*/
+template <typename Real>
+void cblasext_mul_elements_mat(
+    const Real *Adata,
+    KaldiBlasInt a_num_rows, KaldiBlasInt a_num_cols, KaldiBlasInt a_stride,
+    Real *Bdata,
+    KaldiBlasInt b_stride);
+
 
-inline void mul_elements(
-    const KaldiBlasInt dim,
-    const float *a,
-    float *b) { // does b *= a, elementwise.
-  float c1, c2, c3, c4;
-  KaldiBlasInt i;
-  for (i = 0; i + 4 <= dim; i += 4) {
-    c1 = a[i] * b[i];
-    c2 = a[i+1] * b[i+1];
-    c3 = a[i+2] * b[i+2];
-    c4 = a[i+3] * b[i+3];
-    b[i] = c1;
-    b[i+1] = c2;
-    b[i+2] = c3;
-    b[i+3] = c4;
-  }
-  for (; i < dim; i++)
-    b[i] *= a[i];
-}
 
 
 
diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc
index 3b079b27781..917b8848c78 100644
--- a/src/matrix/kaldi-matrix.cc
+++ b/src/matrix/kaldi-matrix.cc
@@ -1152,17 +1152,8 @@ template<typename Real>
 void MatrixBase<Real>::MulElements(const MatrixBase<Real> &a) {
   KALDI_ASSERT(a.NumRows() == num_rows_ && a.NumCols() == num_cols_);
 
-  if (num_cols_ == stride_ && num_cols_ == a.stride_) {
-    mul_elements(num_rows_ * num_cols_, a.data_, data_);
-  } else {
-    MatrixIndexT a_stride = a.stride_, stride = stride_;
-    Real *data = data_, *a_data = a.data_;
-    for (MatrixIndexT i = 0; i < num_rows_; i++) {
-      mul_elements(num_cols_, a_data, data);
-      a_data += a_stride;
-      data += stride;
-    }
-  }
+  cblasext_mul_elements_mat(a.Data(), a.NumRows(), a.NumCols(),
+                            a.Stride(), data_, stride_);
 }
 
 template<typename Real>
@@ -2657,8 +2648,8 @@ bool AttemptComplexPower(double *x_re, double *x_im, double power);
 
 template <typename Real>
 Real TraceMatMat(const MatrixBase<Real> &A,
-                  const MatrixBase<Real> &B,
-                  MatrixTransposeType trans) {  // tr(A B), equivalent to sum of each element of A times same element in B'
+                 const MatrixBase<Real> &B,
+                 MatrixTransposeType trans) {  // tr(A B), equivalent to sum of each element of A times same element in B'
   MatrixIndexT aStride = A.stride_, bStride = B.stride_;
   if (trans == kNoTrans) {
     KALDI_ASSERT(A.NumRows() == B.NumCols() && A.NumCols() == B.NumRows());

From f566e8173c4a19284ad470d6591bebfcc82f1d15 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Wed, 27 Mar 2019 15:11:55 -0400
Subject: [PATCH 044/163] [scripts] Fix non-randomness in getting utt2uniq,
 introduced in #3142 (#3175)

---
 egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh
index ae4a0474a24..0185b9fbaad 100755
--- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh
+++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh
@@ -155,7 +155,8 @@ if [ -f $data/utt2uniq ]; then
   # Must hold out all augmented versions of the same utterance.
   echo "$0: File $data/utt2uniq exists, so ensuring the hold-out set" \
        "includes all perturbed versions of the same source utterance."
-  utils/utt2spk_to_spk2utt.pl $data/utt2uniq 2>/dev/null |
+  utils/utt2spk_to_spk2utt.pl $data/utt2uniq 2>/dev/null | \
+      utils/shuffle_list.pl 2>/dev/null | \
     awk -v max_utt=$num_utts_subset '{
         for (n=2;n<=NF;n++) print $n;
         printed += NF-1;

From 560594e188719fe4c4d6e7a1752ea6e4c4e222f8 Mon Sep 17 00:00:00 2001
From: Justin Luitjens <luitjens@users.noreply.github.com>
Date: Wed, 27 Mar 2019 14:38:19 -0600
Subject: [PATCH 045/163] [build] Don't build for Tegra sm_XX versions on
 x86/ppc and vice versa; allow --cuda-arch overrides to have multiple versions
 (#3171)

---
 src/configure | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/src/configure b/src/configure
index b21cc48f7ee..1013a3c162e 100755
--- a/src/configure
+++ b/src/configure
@@ -118,7 +118,7 @@ function rel2abs {
 }
 
 function read_value {
-  local val=`expr "X$1" : '[^=]*=\(.*\)'`;
+  local val=`expr "X$*" : '[^=]*=\(.*\)'`;
   echo $val
 }
 
@@ -430,14 +430,27 @@ function configure_cuda {
     fi
 
     if [ -z "$CUDA_ARCH" ]; then
-      case $CUDA_VERSION in
-        5_5) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35" ;;
-        6_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50" ;;
-        7_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53" ;;
-        8_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62" ;;
-        9_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_70,code=sm_70" ;;
-        10_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_72,code=sm_72 -gencode arch=compute_75,code=sm_75" ;;
-        *) echo "Unsupported CUDA_VERSION (CUDA_VERSION=$CUDA_VERSION), please report it to Kaldi mailing list, together with 'nvcc -h' or 'ptxas -h' which lists allowed -gencode values..."; exit 1 ;;
+      case `uname -m` in
+        x86_64|ppc64le)
+          case $CUDA_VERSION in
+            5_5) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35" ;;
+            6_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50" ;;
+            7_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52" ;;
+            8_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61" ;;
+            9_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70" ;;
+            10_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75" ;;
+            *) echo "Unsupported CUDA_VERSION (CUDA_VERSION=$CUDA_VERSION), please report it to Kaldi mailing list, together with 'nvcc -h' or 'ptxas -h' which lists allowed -gencode values..."; exit 1 ;;
+          esac
+        ;;
+        aarch64)
+          case $CUDA_VERSION in
+            7_*)     CUDA_ARCH="-gencode arch=compute_53,code=sm_53" ;;
+            8_*|9_*) CUDA_ARCH="-gencode arch=compute_53,code=sm_53 -gencode arch=compute_62,code=sm_62" ;;
+            10_*)    CUDA_ARCH="-gencode arch=compute_53,code=sm_53 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_72,code=sm_72" ;;
+            *) echo "Unsupported CUDA_VERSION (CUDA_VERSION=$CUDA_VERSION), please report it to Kaldi mailing list, together with 'nvcc -h' or 'ptxas -h' which lists allowed -gencode values..."; exit 1 ;;
+          esac
+        ;;
+        *) echo "Unsupported architecture for use of Kaldi with CUDA.  Please report it to Kaldi mailing list."; exit 1 ;;
       esac
     fi
 

From 4264512709e2771d3027815260d493db9110e08f Mon Sep 17 00:00:00 2001
From: Ashish Arora <ashisharora88888@gmail.com>
Date: Fri, 29 Mar 2019 16:30:53 -0400
Subject: [PATCH 046/163] [egs] Fixes Re encoding to IAM, uw3 recipes (#3012)

---
 .../s5b/local/nnet3/run_ivector_common.sh     |  1 -
 .../v1/local/unk_arc_post_to_transcription.py | 15 ++++---
 egs/madcat_ar/v1/RESULTS                      | 18 ++++++++
 egs/rimes/v1/RESULTS                          | 45 +++++++++++++++++++
 .../v1/local/unk_arc_post_to_transcription.py | 15 ++++---
 5 files changed, 79 insertions(+), 15 deletions(-)
 create mode 100644 egs/madcat_ar/v1/RESULTS
 create mode 100644 egs/rimes/v1/RESULTS

diff --git a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh
index f071842dc0b..a03cc5b2fa3 100755
--- a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh
+++ b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh
@@ -138,7 +138,6 @@ if [ $stage -le 5 ]; then
   # Also extract iVectors for the test data, but in this case we don't need the speed
   # perturbation (sp).
   for data in ${test_sets}; do
-    nspk=$(wc -l <data/${data}_hires/spk2utt)
     steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \
       data/${data}_hires exp/nnet3${nnet3_affix}/extractor \
       exp/nnet3${nnet3_affix}/ivectors_${data}_hires
diff --git a/egs/iam/v1/local/unk_arc_post_to_transcription.py b/egs/iam/v1/local/unk_arc_post_to_transcription.py
index f8b69820601..1f1404b5165 100755
--- a/egs/iam/v1/local/unk_arc_post_to_transcription.py
+++ b/egs/iam/v1/local/unk_arc_post_to_transcription.py
@@ -25,6 +25,7 @@
       data/lang/oov.int
 """
 import argparse
+import io
 import os
 import sys
 parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""")
@@ -42,17 +43,17 @@
 args = parser.parse_args()
 
 ### main ###
-phone_handle = open(args.phones, 'r', encoding='latin-1') # Create file handles 
-word_handle = open(args.words, 'r', encoding='latin-1')
-unk_handle = open(args.unk,'r', encoding='latin-1')
+phone_handle = open(args.phones, 'r', encoding='utf8') # Create file handles 
+word_handle = open(args.words, 'r', encoding='utf8')
+unk_handle = open(args.unk,'r', encoding='utf8')
 if args.one_best_arc_post == '-':
-    arc_post_handle = sys.stdin
+    arc_post_handle = io.TextIOWrapper(sys.stdin.buffer, encoding='utf8')
 else:
-    arc_post_handle = open(args.one_best_arc_post, 'r', encoding='latin-1')
+    arc_post_handle = open(args.one_best_arc_post, 'r', encoding='utf8')
 if args.output_text == '-':
-    output_text_handle = sys.stdout
+    output_text_handle = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
 else:
-    output_text_handle = open(args.output_text, 'w', encoding='latin-1')
+    output_text_handle = open(args.output_text, 'w', encoding='utf8')
 
 id2phone = dict() # Stores the mapping from phone_id (int) to phone (char)
 phones_data = phone_handle.read().strip().split("\n")
diff --git a/egs/madcat_ar/v1/RESULTS b/egs/madcat_ar/v1/RESULTS
new file mode 100644
index 00000000000..357d209f6b9
--- /dev/null
+++ b/egs/madcat_ar/v1/RESULTS
@@ -0,0 +1,18 @@
+
+Subset - Dev: 852, Train: 23564, Test: 923
+
+BPE:  (subset) (run_end2end.sh)
+  • %WER 19.34 [ 932 / 4819, 71 ins, 75 del, 786 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_7_0.0
+  • %WER 13.70 [ 660 / 4819, 52 ins, 65 del, 543 sub ] exp/chain/cnn_e2eali_1b/decode_test/wer_8_1.0
+
+word-based: (subset) (run_end2end.sh.word)
+  • %WER 27.39 [ 1320 / 4819, 209 ins, 50 del, 1061 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_10_1.0
+  • %WER 24.26 [ 1169 / 4819, 123 ins, 80 del, 966 sub ] exp/chain/cnn_e2eali_1b/decode_test/wer_13_1.0
+
+BPE:  (subset) (run.sh)
+  • %WER 15.98 [ 770 / 4819, 64 ins, 48 del, 658 sub ] exp/chain/cnn_1a/decode_test/wer_8_0.5
+  
+  
+word-based: (subset) (run.sh.word)
+  • %WER 24.20 [ 1166 / 4819, 121 ins, 69 del, 976 sub ] exp/chain/cnn_1a/decode_test/wer_11_1.0
+    %WER 24.28 [ 1170 / 4819, 126 ins, 104 del, 940 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_10_1.0
diff --git a/egs/rimes/v1/RESULTS b/egs/rimes/v1/RESULTS
new file mode 100644
index 00000000000..4a9d7225e33
--- /dev/null
+++ b/egs/rimes/v1/RESULTS
@@ -0,0 +1,45 @@
+Run_end2end.sh Word-based system WER using lang_unk and lang. WER at line-level and paragraph level
+flat_start:
+Line-level:
+  • %WER 13.97 [ 788 / 5639, 136 ins, 62 del, 590 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_10_1.0
+  • %WER 16.56 [ 934 / 5639, 158 ins, 75 del, 701 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_13_1.0
+
+Paragraph-level:
+  • %WER 12.89 [ 727 / 5639, 116 ins, 42 del, 569 sub ] exp/chain/e2e_cnn_1a/decode_test/para/wer_10_1.0
+  • %WER 15.50 [ 874 / 5639, 133 ins, 50 del, 691 sub ] exp/chain/e2e_cnn_1a/decode_test/para/wer_13_1.0
+
+cnn_e2eali_1a:
+Line-level:
+  • %WER 10.43 [ 588 / 5639, 115 ins, 57 del, 416 sub ] exp/chain/cnn_e2eali_1a/decode_test/wer_7_1.0
+  • %WER 13.78 [ 777 / 5639, 153 ins, 58 del, 566 sub ] exp/chain/cnn_e2eali_1a/decode_test/wer_10_1.0
+
+Paragraph-level:
+  • %WER 9.35 [ 527 / 5639, 89 ins, 31 del, 407 sub ] exp/chain/cnn_e2eali_1a/decode_test/para/wer_7_1.0
+  • %WER 12.70 [ 716 / 5639, 134 ins, 39 del, 543 sub ] exp/chain/cnn_e2eali_1a/decode_test/para/wer_10_1.0
+
+
+Run_end2end.sh BPE-based system WER using lang. WER at line-level and paragraph level
+flat_start:
+Line-level:
+  • %WER 11.58 [ 653 / 5639, 72 ins, 67 del, 514 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_8_1.0
+
+Paragraph-level:
+  • %WER 10.50 [ 592 / 5639, 54 ins, 49 del, 489 sub ] exp/chain/e2e_cnn_1a/decode_test/para/wer_8_1.0
+
+cnn_e2eali_1a:
+Line-level:
+  • %WER 8.48 [ 478 / 5639, 56 ins, 54 del, 368 sub ] exp/chain/cnn_e2eali_1a/decode_test/wer_9_0.5
+
+Paragraph-level:
+  • %WER 7.41 [ 418 / 5639, 38 ins, 36 del, 344 sub ] exp/chain/cnn_e2eali_1a/decode_test/para/wer_9_0.5
+
+
+Run_end2end.sh BPE-based system WER using lang with optional open-source extra corpus text. 
+WER at line-level and paragraph level.
+
+cnn_e2eali_1a:
+Line-level:
+  • %WER 7.66 [ 432 / 5639, 50 ins, 38 del, 344 sub ] exp/chain/cnn_e2eali_1a/decode_test/para/wer_7_0.5
+
+Paragraph-level:
+  • %WER 6.85 [ 386 / 5639, 35 ins, 36 del, 315 sub ] exp/chain/cnn_e2eali_1a/decode_test/para/wer_8_1.0
diff --git a/egs/uw3/v1/local/unk_arc_post_to_transcription.py b/egs/uw3/v1/local/unk_arc_post_to_transcription.py
index f8b69820601..1f1404b5165 100755
--- a/egs/uw3/v1/local/unk_arc_post_to_transcription.py
+++ b/egs/uw3/v1/local/unk_arc_post_to_transcription.py
@@ -25,6 +25,7 @@
       data/lang/oov.int
 """
 import argparse
+import io
 import os
 import sys
 parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""")
@@ -42,17 +43,17 @@
 args = parser.parse_args()
 
 ### main ###
-phone_handle = open(args.phones, 'r', encoding='latin-1') # Create file handles 
-word_handle = open(args.words, 'r', encoding='latin-1')
-unk_handle = open(args.unk,'r', encoding='latin-1')
+phone_handle = open(args.phones, 'r', encoding='utf8') # Create file handles 
+word_handle = open(args.words, 'r', encoding='utf8')
+unk_handle = open(args.unk,'r', encoding='utf8')
 if args.one_best_arc_post == '-':
-    arc_post_handle = sys.stdin
+    arc_post_handle = io.TextIOWrapper(sys.stdin.buffer, encoding='utf8')
 else:
-    arc_post_handle = open(args.one_best_arc_post, 'r', encoding='latin-1')
+    arc_post_handle = open(args.one_best_arc_post, 'r', encoding='utf8')
 if args.output_text == '-':
-    output_text_handle = sys.stdout
+    output_text_handle = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
 else:
-    output_text_handle = open(args.output_text, 'w', encoding='latin-1')
+    output_text_handle = open(args.output_text, 'w', encoding='utf8')
 
 id2phone = dict() # Stores the mapping from phone_id (int) to phone (char)
 phones_data = phone_handle.read().strip().split("\n")

From 67872828deab4338b1c934afeb9d214c78f8fad9 Mon Sep 17 00:00:00 2001
From: Justin Luitjens <luitjens@users.noreply.github.com>
Date: Sat, 30 Mar 2019 14:05:45 -0600
Subject: [PATCH 047/163] [src] Efficiency improvement and extra checking for
 cudamatrix, RE default stream  (#3182)

---
 src/cudamatrix/cu-block-matrix.cc  |  4 ++-
 src/cudamatrix/cu-device.cc        |  2 +-
 src/cudamatrix/cu-matrix.cc        | 48 ++++++++++++++++++++----------
 src/cudamatrix/cu-packed-matrix.cc | 10 ++++---
 src/cudamatrix/cu-packed-matrix.h  |  6 ++--
 src/cudamatrix/cu-value.h          |  9 ++++--
 src/cudamatrix/cu-vector.cc        | 31 +++++++++++--------
 7 files changed, 71 insertions(+), 39 deletions(-)

diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc
index fc8f4b7ce72..e0c64912207 100644
--- a/src/cudamatrix/cu-block-matrix.cc
+++ b/src/cudamatrix/cu-block-matrix.cc
@@ -140,7 +140,9 @@ void CuBlockMatrix<Real>::SetCudaData() {
     size_t size = NumBlocks() * sizeof(CuBlockMatrixData);
     cu_data_ = static_cast<CuBlockMatrixData*>(
         CuDevice::Instantiate().Malloc(size));
-    CU_SAFE_CALL(cudaMemcpy(cu_data_, &(tmp_cu_data[0]), size, cudaMemcpyHostToDevice));
+    CU_SAFE_CALL(cudaMemcpyAsync(cu_data_, &(tmp_cu_data[0]), size, 
+                                 cudaMemcpyHostToDevice, cudaStreamPerThread));
+    CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
     CuDevice::Instantiate().AccuProfile(__func__, tim);    
   }
 #endif
diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc
index 85c2492c074..e5d161521fd 100644
--- a/src/cudamatrix/cu-device.cc
+++ b/src/cudamatrix/cu-device.cc
@@ -427,7 +427,7 @@ void CuDevice::AccuProfile(const char *function_name,
     // per-thread default stream.  Since we compile with
     // -DCUDA_API_PER_THREAD_DEFAULT_STREAM, this equates to a per-thread
     // stream.
-    cudaStreamSynchronize(0);
+    CU_SAFE_CALL(cudaStreamSynchronize(0));
     double elapsed = timer.Elapsed();
     if (profile_map_.find(key) == profile_map_.end())
       profile_map_[key] = elapsed;
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index 1f09ff278ce..efe8dec7652 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -324,7 +324,7 @@ void CuMatrixBase<Real>::CopyFromMat(const MatrixBase<Real> &src,
       CU_SAFE_CALL(cudaMemcpy2DAsync(data_, dst_pitch, src.Data(), src_pitch,
                                 width, src.NumRows(), cudaMemcpyHostToDevice,
                                 cudaStreamPerThread));
-      cudaStreamSynchronize(cudaStreamPerThread);
+      CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
 
       CuDevice::Instantiate().AccuProfile("CuMatrixBase::CopyFromMat(from CPU)", tim);
     } else {
@@ -431,9 +431,10 @@ void CuMatrixBase<Real>::CopyToMat(MatrixBase<OtherReal> *dst,
       MatrixIndexT src_pitch = stride_*sizeof(Real);
       MatrixIndexT dst_pitch = dst->Stride()*sizeof(Real);
       MatrixIndexT width = NumCols()*sizeof(Real);
-      CU_SAFE_CALL(cudaMemcpy2D(dst->Data(), dst_pitch, this->data_, src_pitch,
-                                width, this->num_rows_, cudaMemcpyDeviceToHost));
-
+      CU_SAFE_CALL(cudaMemcpy2DAsync(dst->Data(), dst_pitch, this->data_, 
+                                     src_pitch, width, this->num_rows_, 
+                                     cudaMemcpyDeviceToHost, cudaStreamPerThread));
+      CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
       CuDevice::Instantiate().AccuProfile("CuMatrix::CopyToMatD2H", tim);
     }
   } else
@@ -1670,7 +1671,10 @@ void CuMatrix<Real>::CompObjfAndDeriv(const std::vector<MatrixElement<Real> >& s
       return;
     }
     void *addr = CuDevice::Instantiate().Malloc(sv_labels.size() * sizeof(MatrixElement<Real>));
-    CU_SAFE_CALL(cudaMemcpy(addr, sv_labels.data(), sv_labels.size() * sizeof(MatrixElement<Real>), cudaMemcpyHostToDevice));
+    CU_SAFE_CALL(cudaMemcpyAsync(addr, sv_labels.data(), sv_labels.size() * 
+                                 sizeof(MatrixElement<Real>), 
+                                 cudaMemcpyHostToDevice, 
+                                 cudaStreamPerThread));
     CuTimer tim;
     CuVector<Real> tmp(2, kUndefined);
     int dimBlock(CU1DBLOCK);
@@ -2245,7 +2249,9 @@ void AddMatMatBatched(const Real alpha, std::vector<CuSubMatrix<Real>* > &C,
       host_c_array[i] = C[i]->data_;
     }
 
-    CU_SAFE_CALL(cudaMemcpy(device_abc_array, host_abc_array, 3*size*sizeof(Real*), cudaMemcpyHostToDevice));
+    CU_SAFE_CALL(cudaMemcpyAsync(device_abc_array, host_abc_array, 
+                                 3*size*sizeof(Real*), cudaMemcpyHostToDevice,
+                                 cudaStreamPerThread));
 
     CUBLAS_SAFE_CALL(cublas_gemmBatched(GetCublasHandle(),
                                         (transB==kTrans? CUBLAS_OP_T:CUBLAS_OP_N),
@@ -2325,15 +2331,21 @@ void CuMatrixBase<Real>::CopyRowsFromVec(const VectorBase<Real> &v) {
     if (v.Dim() == num_rows_*num_cols_) {
       if (stride_ == num_cols_) {
         const Real* v_data = v.Data();
-        cudaMemcpy(data_, v_data, sizeof(Real)*num_rows_*num_cols_, cudaMemcpyHostToDevice);
+        CU_SAFE_CALL(cudaMemcpyAsync(data_, v_data, 
+                                     sizeof(Real)*num_rows_*num_cols_, 
+                                     cudaMemcpyHostToDevice, 
+                                     cudaStreamPerThread));
       } else {
         const Real *v_data = v.Data();
         for (MatrixIndexT r = 0; r < num_rows_; r++) {
           Real *row_data = RowData(r);
-          cudaMemcpy(row_data, v_data, sizeof(Real)*num_cols_, cudaMemcpyHostToDevice);
+          CU_SAFE_CALL(cudaMemcpyAsync(row_data, v_data, sizeof(Real)*num_cols_, 
+                                       cudaMemcpyHostToDevice, 
+                                       cudaStreamPerThread));
           v_data += num_cols_;
         }
       }
+      CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
     } else if (v.Dim() == num_cols_) {
       dim3 dimGrid, dimBlock;
       GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
@@ -2599,16 +2611,19 @@ void VectorBase<Real>::CopyRowsFromMat(const CuMatrixBase<Real> &mat) {
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     if (mat.Stride() == mat.NumCols()) {
-      cudaMemcpy(data_, mat.Data(), sizeof(Real)*dim_, cudaMemcpyDeviceToHost);
+      CU_SAFE_CALL(cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_, 
+                   cudaMemcpyDeviceToHost, cudaStreamPerThread));
     } else {
       // we could definitely do better than the following.
       Real* vec_data = data_;
       for (MatrixIndexT r = 0; r < mat.NumRows(); r++) {
-        cudaMemcpy(vec_data, mat.RowData(r), sizeof(Real) * mat.NumCols(),
-                   cudaMemcpyDeviceToHost);
+        CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.RowData(r), 
+                     sizeof(Real) * mat.NumCols(), cudaMemcpyDeviceToHost, 
+                     cudaStreamPerThread));
         vec_data += mat.NumCols();
       }
     }
+    CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
     CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyRowsFromMat", tim);
   } else
 #endif
@@ -3257,9 +3272,9 @@ void CuMatrixBase<Real>::AddElements(Real alpha,
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     void *addr = CuDevice::Instantiate().Malloc(input.size() * sizeof(MatrixElement<Real>));
-    CU_SAFE_CALL(cudaMemcpy(addr, input.data(),
-                        input.size() * sizeof(MatrixElement<Real>),
-                            cudaMemcpyHostToDevice));
+    CU_SAFE_CALL(cudaMemcpyAsync(addr, input.data(),
+                                 input.size() * sizeof(MatrixElement<Real>),
+                                 cudaMemcpyHostToDevice, cudaStreamPerThread));
 
     CuTimer tim;
     int dimBlock(CU1DBLOCK);
@@ -3289,8 +3304,9 @@ void CuMatrixBase<Real>::AddElements(Real alpha, const CuArrayBase<Int32Pair> &i
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     CuVector<Real> tmp_vec(indexes.Dim(), kUndefined);
-    CU_SAFE_CALL(cudaMemcpy(tmp_vec.Data(), input, indexes.Dim() * sizeof(Real),
-                            cudaMemcpyHostToDevice));
+    CU_SAFE_CALL(cudaMemcpyAsync(tmp_vec.Data(), input, 
+                                 indexes.Dim() * sizeof(Real),
+                                 cudaMemcpyHostToDevice, cudaStreamPerThread));
 
     int dimBlock(CU1DBLOCK);
     int dimGrid = n_blocks(indexes.Dim(), CU1DBLOCK);
diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc
index 7581b043ae0..c331920c61f 100644
--- a/src/cudamatrix/cu-packed-matrix.cc
+++ b/src/cudamatrix/cu-packed-matrix.cc
@@ -162,8 +162,9 @@ void CuPackedMatrix<Real>::CopyFromPacked(const PackedMatrix<Real> &src) {
   if (CuDevice::Instantiate().Enabled()) {
     if (num_rows_ == 0) return; // Nothing to do.
     CuTimer tim;
-    CU_SAFE_CALL(cudaMemcpy(data_, src.data_, src.SizeInBytes(),
-                            cudaMemcpyHostToDevice));
+    CU_SAFE_CALL(cudaMemcpyAsync(data_, src.data_, src.SizeInBytes(),
+                                 cudaMemcpyHostToDevice, cudaStreamPerThread));
+    CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
     CuDevice::Instantiate().AccuProfile("CuPackedMatrix::CopyFromPacked2", tim);
   } else
 #endif
@@ -184,8 +185,9 @@ void CuPackedMatrix<Real>::CopyToPacked(PackedMatrix<Real> *dst) const {
     size_t nr = static_cast<size_t>(num_rows_),
       num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real);
 
-    CU_SAFE_CALL(cudaMemcpy(dst->data_, data_, num_bytes,
-                            cudaMemcpyDeviceToHost));
+    CU_SAFE_CALL(cudaMemcpyAsync(dst->data_, data_, num_bytes,
+                                 cudaMemcpyDeviceToHost, cudaStreamPerThread));
+    CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
     CuDevice::Instantiate().AccuProfile("CuPackedMatrix::CopyToPackedD2H", tim);
   } else
 #endif
diff --git a/src/cudamatrix/cu-packed-matrix.h b/src/cudamatrix/cu-packed-matrix.h
index 0131ba6c101..8ed7ed79f7b 100644
--- a/src/cudamatrix/cu-packed-matrix.h
+++ b/src/cudamatrix/cu-packed-matrix.h
@@ -122,8 +122,10 @@ class CuPackedMatrix {
 #if HAVE_CUDA == 1
     if (CuDevice::Instantiate().Enabled()) {    
       Real value;
-      CU_SAFE_CALL(cudaMemcpy(&value, this->data_ + (r * (r+1)) / 2 + c,
-                              sizeof(Real), cudaMemcpyDeviceToHost));
+      CU_SAFE_CALL(cudaMemcpyAsync(&value, this->data_ + (r * (r+1)) / 2 + c,
+                                   sizeof(Real), cudaMemcpyDeviceToHost,
+                                   cudaStreamPerThread));
+      CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
       return value;
     } else
 #endif
diff --git a/src/cudamatrix/cu-value.h b/src/cudamatrix/cu-value.h
index cab0a3235d7..b5b65479e57 100644
--- a/src/cudamatrix/cu-value.h
+++ b/src/cudamatrix/cu-value.h
@@ -54,7 +54,9 @@ class CuValue {
   inline Real operator = (Real r) { // assignment from Real
 #if HAVE_CUDA == 1
     if (CuDevice::Instantiate().Enabled()) {
-      CU_SAFE_CALL(cudaMemcpy(data_, &r, sizeof(Real), cudaMemcpyHostToDevice));
+      CU_SAFE_CALL(cudaMemcpyAsync(data_, &r, sizeof(Real), 
+            cudaMemcpyHostToDevice, cudaStreamPerThread));
+      CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
       return r;
     } else
 #endif
@@ -71,8 +73,9 @@ class CuValue {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     Real value;
-    CU_SAFE_CALL(cudaMemcpy(&value, data_,
-                            sizeof(Real), cudaMemcpyDeviceToHost));
+    CU_SAFE_CALL(cudaMemcpyAsync(&value, data_, sizeof(Real), 
+                 cudaMemcpyDeviceToHost, cudaStreamPerThread));
+    CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
     return value;
   } else
 #endif
diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc
index 7c968c6550d..2e06cffad48 100644
--- a/src/cudamatrix/cu-vector.cc
+++ b/src/cudamatrix/cu-vector.cc
@@ -221,18 +221,18 @@ void CuVectorBase<Real>::CopyRowsFromMat(const MatrixBase<Real> &mat) {
     if (dim_ == 0) return;
     CuTimer tim;
     if (mat.Stride() == mat.NumCols()) {
-      CU_SAFE_CALL(cudaMemcpy(data_, mat.Data(), sizeof(Real)*dim_,
-                              cudaMemcpyHostToDevice));
+      CU_SAFE_CALL(cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_,
+                              cudaMemcpyHostToDevice, cudaStreamPerThread));
     } else {
       Real* vec_data = data_;
       for (MatrixIndexT r = 0; r < mat.NumRows(); r++) {
-        CU_SAFE_CALL(cudaMemcpy(vec_data, mat.RowData(r),
+        CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.RowData(r),
                                 sizeof(Real) * mat.NumCols(),
-                                cudaMemcpyHostToDevice));
+                                cudaMemcpyHostToDevice, cudaStreamPerThread));
         vec_data += mat.NumCols();
       }
     }
-    CU_SAFE_CALL(cudaGetLastError());
+    CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
@@ -249,18 +249,21 @@ void MatrixBase<Real>::CopyRowsFromVec(const CuVectorBase<Real> &v) {
     if (num_rows_ == 0) return;
     CuTimer tim;
     if (Stride() == NumCols()) {
-      CU_SAFE_CALL(cudaMemcpy(data_, v.Data(),
+      CU_SAFE_CALL(cudaMemcpyAsync(data_, v.Data(),
                               sizeof(Real)*v.Dim(),
-                              cudaMemcpyDeviceToHost));
+                              cudaMemcpyDeviceToHost,
+                              cudaStreamPerThread));
     } else {
       const Real* vec_data = v.Data();
       for (MatrixIndexT r = 0; r < NumRows(); r++) {
-        CU_SAFE_CALL(cudaMemcpy(RowData(r), vec_data,
+        CU_SAFE_CALL(cudaMemcpyAsync(RowData(r), vec_data,
                                 sizeof(Real) * NumCols(),
-                                cudaMemcpyDeviceToHost));
+                                cudaMemcpyDeviceToHost,
+                                cudaStreamPerThread));
         vec_data += NumCols();
       }
     }
+    CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
@@ -886,7 +889,9 @@ void CuVectorBase<Real>::CopyFromVec(const VectorBase<OtherReal> &src) {
       KALDI_ASSERT(src.Dim() == dim_);
       if (dim_ == 0) return;
       CuTimer tim;
-      CU_SAFE_CALL(cudaMemcpy(data_, src.Data(), src.Dim()*sizeof(Real), cudaMemcpyHostToDevice));
+      CU_SAFE_CALL(cudaMemcpyAsync(data_, src.Data(), src.Dim()*sizeof(Real), 
+                                   cudaMemcpyHostToDevice, cudaStreamPerThread));
+      CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
       CuDevice::Instantiate().AccuProfile("CuVector::CopyFromVecH2D", tim);
     }
   } else
@@ -917,8 +922,10 @@ void CuVectorBase<Real>::CopyToVec(VectorBase<OtherReal> *dst) const {
     } else {
       if (dim_ == 0) return;
       CuTimer tim;
-      CU_SAFE_CALL(cudaMemcpy(dst->Data(), this->data_,
-                              sizeof(Real) * dim_, cudaMemcpyDeviceToHost));
+      CU_SAFE_CALL(cudaMemcpyAsync(dst->Data(), this->data_,
+                              sizeof(Real) * dim_, cudaMemcpyDeviceToHost,
+                              cudaStreamPerThread));
+      CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
       CuDevice::Instantiate().AccuProfile(__func__, tim);
     }
   } else

From 8a1acde21ac926d8285650bc614a510d005d8a7e Mon Sep 17 00:00:00 2001
From: Shujian2015 <Shujian2015@users.noreply.github.com>
Date: Sat, 30 Mar 2019 16:13:17 -0400
Subject: [PATCH 048/163] [egs] Fix small typo in tedlium download script
 (#3178)

---
 egs/tedlium/s5_r3/local/download_data.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/egs/tedlium/s5_r3/local/download_data.sh b/egs/tedlium/s5_r3/local/download_data.sh
index c51effdd6fa..0b31a258613 100755
--- a/egs/tedlium/s5_r3/local/download_data.sh
+++ b/egs/tedlium/s5_r3/local/download_data.sh
@@ -25,7 +25,7 @@ else
     echo "$0: extracting TEDLIUM_release-3 data"
     tar xf "TEDLIUM_release-3.tgz"
   else
-    echo "$0: not downloading or un-tarring TEDLIUM_release2 because it already exists."
+    echo "$0: not downloading or un-tarring TEDLIUM_release3 because it already exists."
   fi
 fi
 

From 5f00d0d24dff92849c5cc5645bb42e17c5572532 Mon Sep 17 00:00:00 2001
From: "kkm (aka Kirill Katsnelson)" <kkm@smartaction.ai>
Date: Sun, 31 Mar 2019 10:29:00 -0700
Subject: [PATCH 049/163] [github] Add GitHub issue templates (#3187)

---
 .github/ISSUE_TEMPLATE/bug_report.md           | 18 ++++++++++++++++++
 .../feature-proposal-discussion.md             | 18 ++++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md
 create mode 100644 .github/ISSUE_TEMPLATE/feature-proposal-discussion.md

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 00000000000..660c62884be
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,18 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: bug
+assignees: ''
+
+---
+
+<!--
+    WARNING: THE KALDI ISSUE TRACKER IS **ONLY** USED FOR KALDI DEVELOPMENT!
+
+    If you have a question about using Kaldi, please use the kaldi-help discussion group:
+
+    https://groups.google.com/forum/#!forum/kaldi-help
+
+    Instructions for joining are available at: http://kaldi-asr.org/forums.html
+-->
diff --git a/.github/ISSUE_TEMPLATE/feature-proposal-discussion.md b/.github/ISSUE_TEMPLATE/feature-proposal-discussion.md
new file mode 100644
index 00000000000..61e797b9ca1
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature-proposal-discussion.md
@@ -0,0 +1,18 @@
+---
+name: Feature proposal or discussion
+about: Suggest an idea for Kaldi
+title: ''
+labels: discussion
+assignees: ''
+
+---
+
+<!--
+    WARNING: THE KALDI ISSUE TRACKER IS **ONLY** USED FOR KALDI DEVELOPMENT!
+
+    If you have a question about using Kaldi, please use the kaldi-help discussion group:
+
+    https://groups.google.com/forum/#!forum/kaldi-help
+
+    Instructions for joining are available at: http://kaldi-asr.org/forums.html
+-->

From 6e998a97a17f90fd6a5eb51eeb4ec41a50289034 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sun, 31 Mar 2019 17:34:34 -0400
Subject: [PATCH 050/163] [build] Add missing dependency to Makefile (#3191)

---
 src/Makefile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 88da5ed1e55..d63e642b095 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -146,7 +146,9 @@ lm: base util matrix fstext
 decoder: base util matrix gmm hmm tree transform lat
 lat: base util hmm tree matrix
 cudamatrix: base util matrix
-nnet3: base util matrix lat gmm hmm tree transform cudamatrix chain fstext
+nnet: base util hmm tree matrix cudamatrix
+nnet2: base util matrix lat gmm hmm tree transform cudamatrix
+nnet3: base util matrix decoder lat gmm hmm tree transform cudamatrix chain fstext
 rnnlm: base util matrix cudamatrix nnet3 lm hmm
 chain: lat hmm tree fstext matrix cudamatrix util base
 ivector: base util matrix transform tree gmm

From bf0af1d470e383e668163b949372358ded825e4d Mon Sep 17 00:00:00 2001
From: hainan-xv <hainan.xv@gmail.com>
Date: Sun, 31 Mar 2019 23:21:00 -0400
Subject: [PATCH 051/163] [src] Fix bug in pruned lattice rescoring when input
 lattice has epsilons (#3190)

---
 src/lat/compose-lattice-pruned.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/lat/compose-lattice-pruned.cc b/src/lat/compose-lattice-pruned.cc
index c6e4dafc008..57a7432dca0 100644
--- a/src/lat/compose-lattice-pruned.cc
+++ b/src/lat/compose-lattice-pruned.cc
@@ -771,7 +771,14 @@ void PrunedCompactLatticeComposer::ProcessTransition(int32 src_composed_state,
   // Note: we expect that ilabel == olabel, since this is a CompactLattice, but this
   // may not be so if we extend this to work with Lattice.
   fst::StdArc lm_arc;
-  if (!det_fst_->GetArc(src_info->lm_state, olabel, &lm_arc)) {
+
+  // the input lattice might have epsilons
+  if (olabel == 0) {
+    lm_arc.ilabel = 0;
+    lm_arc.olabel = 0;
+    lm_arc.nextstate = src_info->lm_state;
+    lm_arc.weight = fst::StdArc::Weight(0.0);
+  } else if (!det_fst_->GetArc(src_info->lm_state, olabel, &lm_arc)) {
     // for normal language models we don't expect this to happen, but the
     // appropriate behavior is to do nothing; the composed arc does not exist,
     // so there is no arc to add and no new state to create.

From 7371a9525b8723a7960ec5e09828a1c9f66e6079 Mon Sep 17 00:00:00 2001
From: armusc <46787089+armusc@users.noreply.github.com>
Date: Tue, 2 Apr 2019 03:34:56 +0200
Subject: [PATCH 052/163] [scripts] Fix bug in extend_lang.sh regarding
 extra_disambig.txt (#3195)

---
 egs/wsj/s5/utils/lang/extend_lang.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/egs/wsj/s5/utils/lang/extend_lang.sh b/egs/wsj/s5/utils/lang/extend_lang.sh
index c13d5d3e78b..c8f680a12fb 100755
--- a/egs/wsj/s5/utils/lang/extend_lang.sh
+++ b/egs/wsj/s5/utils/lang/extend_lang.sh
@@ -131,7 +131,7 @@ for n in $(seq 0 $ndisambig); do
   sym='#'$n; if ! grep -w -q "$sym" $dir/phones/disambig.txt; then echo "$sym"; fi
 done > $tmpdir/extra_disambig.txt
 highest_number=$(tail -n 1 $srcdir/phones.txt | awk '{print $2}')
-awk -v start=$highest_number '{print $1, NR+start}' <$tmpdir/extra_disambig.txt >>$dir/words.txt
+awk -v start=$highest_number '{print $1, NR+start}' <$tmpdir/extra_disambig.txt >>$dir/phones.txt
 echo "$0: added $(wc -l <$tmpdir/extra_disambig.txt) extra disambiguation symbols to phones.txt"
 
 

From 32496b41cb2056b5d2b57d150de930052869bd94 Mon Sep 17 00:00:00 2001
From: jyhnnhyj <48015613+jyhnnhyj@users.noreply.github.com>
Date: Wed, 3 Apr 2019 17:55:10 +0200
Subject: [PATCH 053/163] [egs] Update Tedlium s5_r3 example with more
 up-to-date chain TDNN configuration

---
 .../s5_r3/local/chain/compare_wer_general.sh  |   4 +-
 egs/tedlium/s5_r3/local/chain/run_tdnn.sh     |   1 +
 egs/tedlium/s5_r3/local/chain/run_tdnnf.sh    |   1 -
 .../s5_r3/local/chain/tuning/run_tdnn_1c.sh   | 249 ++++++++++++++++++
 4 files changed, 252 insertions(+), 3 deletions(-)
 create mode 120000 egs/tedlium/s5_r3/local/chain/run_tdnn.sh
 delete mode 120000 egs/tedlium/s5_r3/local/chain/run_tdnnf.sh
 create mode 100755 egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1c.sh

diff --git a/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh b/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh
index 88dde1ff0e2..c709e351e1e 100755
--- a/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh
+++ b/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh
@@ -55,7 +55,7 @@ for n in 0 1 2 3; do
    for x in $*; do
      set_names $x  # sets $dirname and $epoch_infix
      decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore)
-     wer=$(grep Sum $dirname/decode_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}')
+     wer=$(grep WER $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}')
      printf "% 10s" $wer
    done
    echo
@@ -64,7 +64,7 @@ for n in 0 1 2 3; do
      for x in $*; do
        set_names $x  # sets $dirname and $epoch_infix
        decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore)
-       wer=$(grep Sum $dirname/decode_looped_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}')
+       wer=$(grep WER $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}')
        printf "% 10s" $wer
      done
      echo
diff --git a/egs/tedlium/s5_r3/local/chain/run_tdnn.sh b/egs/tedlium/s5_r3/local/chain/run_tdnn.sh
new file mode 120000
index 00000000000..d48449e28bd
--- /dev/null
+++ b/egs/tedlium/s5_r3/local/chain/run_tdnn.sh
@@ -0,0 +1 @@
+tuning/run_tdnn_1c.sh
\ No newline at end of file
diff --git a/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh b/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh
deleted file mode 120000
index 61f8f499182..00000000000
--- a/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh
+++ /dev/null
@@ -1 +0,0 @@
-tuning/run_tdnn_1b.sh
\ No newline at end of file
diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1c.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1c.sh
new file mode 100755
index 00000000000..faac365af54
--- /dev/null
+++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1c.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+
+# This is copied from tedlium/s5_r2/local/chain/tuning/run_tdnn_1g.sh setup, and it replaces the current run_tdnn_1b.sh script. 
+
+# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnnf_1b exp/chain_cleaned/tdnnf_1c
+# System                 tdnnf_1b  tdnnf_1c
+# WER on dev(orig)           8.15      8.03
+# WER on dev(rescored)       7.69      7.44
+# WER on test(orig)          8.19      8.30
+# WER on test(rescored)      7.77      7.85
+# Final train prob        -0.0692   -0.0669
+# Final valid prob        -0.0954   -0.0838
+# Final train prob (xent)   -0.9369   -0.9596
+# Final valid prob (xent)   -1.0730   -1.0780
+# Num-params                25741728   9463968
+
+
+# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnnf_1b/
+# exp/chain_cleaned/tdnnf_1b/: num-iters=945 nj=2..6 num-params=25.7M dim=40+100->3664 combine=-0.074->-0.071 (over 6) xent:train/valid[628,944,final]=(-1.07,-0.959,-0.937/-1.20,-1.10,-1.07) logprob:train/valid[628,944,final]=(-0.088,-0.070,-0.069/-0.111,-0.098,-0.095)
+# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnnf_1c
+# exp/chain_cleaned/tdnn1c/: num-iters=228 nj=3..12 num-params=9.5M dim=40+100->3664 combine=-0.068->-0.068 (over 4) xent:train/valid[151,227,final]=(-1.15,-0.967,-0.960/-1.25,-1.09,-1.08) logprob:train/valid[151,227,final]=(-0.090,-0.068,-0.067/-0.102,-0.05,-0.084)
+
+## how you run this (note: this assumes that the run_tdnn.sh soft link points here;
+## otherwise call it directly in its location).
+# by default, with cleanup:
+# local/chain/run_tdnn.sh
+
+# without cleanup:
+# local/chain/run_tdnn.sh  --train-set train --gmm tri3 --nnet3-affix "" &
+
+set -e -o pipefail
+
+# First the options that are passed through to run_ivector_common.sh
+# (some of which are also used in this script directly).
+stage=0
+nj=15
+decode_nj=15
+xent_regularize=0.1
+dropout_schedule='0,0@0.20,0.5@0.50,0'
+
+train_set=train_cleaned
+gmm=tri3_cleaned  # the gmm for the target data
+num_threads_ubm=1
+nnet3_affix=_cleaned  # cleanup affix for nnet3 and chain dirs, e.g. _cleaned
+
+# The rest are configs specific to this script.  Most of the parameters
+# are just hardcoded at this level, in the commands below.
+train_stage=-10
+tree_affix=  # affix for tree directory, e.g. "a" or "b", in case we change the configuration.
+tdnn_affix=1c  #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration.
+common_egs_dir=  # you can set this to use previously dumped egs.
+remove_egs=true
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+local/nnet3/run_ivector_common.sh --stage $stage \
+                                  --nj $nj \
+                                  --train-set $train_set \
+                                  --gmm $gmm \
+                                  --num-threads-ubm $num_threads_ubm \
+                                  --nnet3-affix "$nnet3_affix"
+
+
+gmm_dir=exp/$gmm
+ali_dir=exp/${gmm}_ali_${train_set}_sp
+tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix}
+lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats
+dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp
+train_data_dir=data/${train_set}_sp_hires
+lores_train_data_dir=data/${train_set}_sp
+train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
+
+
+for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
+    $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz $gmm_dir/final.mdl; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+if [ $stage -le 14 ]; then
+  echo "$0: creating lang directory with one state per phone."
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  if [ -d data/lang_chain ]; then
+    if [ data/lang_chain/L.fst -nt data/lang/L.fst ]; then
+      echo "$0: data/lang_chain already exists, not overwriting it; continuing"
+    else
+      echo "$0: data/lang_chain already exists and seems to be older than data/lang..."
+      echo " ... not sure what to do.  Exiting."
+      exit 1;
+    fi
+  else
+    cp -r data/lang data/lang_chain
+    silphonelist=$(cat data/lang_chain/phones/silence.csl) || exit 1;
+    nonsilphonelist=$(cat data/lang_chain/phones/nonsilence.csl) || exit 1;
+    # Use our special topology... note that later on may have to tune this
+    # topology.
+    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >data/lang_chain/topo
+  fi
+fi
+
+if [ $stage -le 15 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \
+    data/lang $gmm_dir $lat_dir
+  rm $lat_dir/fsts.*.gz # save space
+fi
+
+if [ $stage -le 16 ]; then
+  # Build a tree using our new topology.  We know we have alignments for the
+  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
+  # those.
+  if [ -f $tree_dir/final.mdl ]; then
+    echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+    exit 1;
+  fi
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --context-opts "--context-width=2 --central-position=1" \
+      --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir
+fi
+
+if [ $stage -le 17 ]; then
+  mkdir -p $dir
+
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python)  # print(...) form runs under both Python 2 and 3
+  affine_opts="l2-regularize=0.008 dropout-proportion=0.0 dropout-per-dim-continuous=true"
+  tdnnf_opts="l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66"
+  linear_opts="l2-regularize=0.008 orthonormal-constraint=-1.0"
+  prefinal_opts="l2-regularize=0.008"
+  output_opts="l2-regularize=0.002"
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1024
+  tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1
+  tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1
+  tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1
+  tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0
+  tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  linear-component name=prefinal-l dim=256 $linear_opts
+
+  prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=256
+  output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
+
+  prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=256
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+
+fi
+
+if [ $stage -le 18 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+  fi
+
+ steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd "$decode_cmd" \
+    --feat.online-ivector-dir $train_ivector_dir \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.0 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --trainer.dropout-schedule $dropout_schedule \
+    --trainer.add-option="--optimization.memory-compression-level=2" \
+    --egs.dir "$common_egs_dir" \
+    --egs.opts "--frames-overlap-per-eg 0 --constrained false" \
+    --egs.chunk-width 150,110,100 \
+    --trainer.num-chunk-per-minibatch 64 \
+    --trainer.frames-per-iter 5000000 \
+    --trainer.num-epochs 6 \
+    --trainer.optimization.num-jobs-initial 3 \
+    --trainer.optimization.num-jobs-final 12 \
+    --trainer.optimization.initial-effective-lrate 0.00025 \
+    --trainer.optimization.final-effective-lrate 0.000025 \
+    --trainer.max-param-change 2.0 \
+    --cleanup.remove-egs $remove_egs \
+    --feat-dir $train_data_dir \
+    --tree-dir $tree_dir \
+    --lat-dir $lat_dir \
+    --dir $dir
+fi
+
+
+
+if [ $stage -le 19 ]; then
+  # Note: it might appear that this data/lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+fi
+
+if [ $stage -le 20 ]; then
+  rm $dir/.error 2>/dev/null || true
+  for dset in dev test; do
+      (
+      steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \
+          --acwt 1.0 --post-decode-acwt 10.0 \
+          --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \
+          --scoring-opts "--min-lmwt 5 " \
+         $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1;
+      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \
+        data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  if [ -f $dir/.error ]; then
+    echo "$0: something went wrong in decoding"
+    exit 1
+  fi
+fi
+exit 0

From 43ba4f272f2600bba023e4ef3240b478543c0a5c Mon Sep 17 00:00:00 2001
From: Lucas Jo <jty016@gmail.com>
Date: Thu, 4 Apr 2019 01:18:53 +0900
Subject: [PATCH 054/163] [scripts] Fix bug in extend_lang.sh causing
 validation failure w/ extra_disambig.txt (#3202)

---
 egs/wsj/s5/utils/lang/extend_lang.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/egs/wsj/s5/utils/lang/extend_lang.sh b/egs/wsj/s5/utils/lang/extend_lang.sh
index c8f680a12fb..236e3ad6dd5 100755
--- a/egs/wsj/s5/utils/lang/extend_lang.sh
+++ b/egs/wsj/s5/utils/lang/extend_lang.sh
@@ -134,6 +134,11 @@ highest_number=$(tail -n 1 $srcdir/phones.txt | awk '{print $2}')
 awk -v start=$highest_number '{print $1, NR+start}' <$tmpdir/extra_disambig.txt >>$dir/phones.txt
 echo "$0: added $(wc -l <$tmpdir/extra_disambig.txt) extra disambiguation symbols to phones.txt"
 
+# add extra_disambig symbols into disambig.txt
+cat $tmpdir/extra_disambig.txt >> $dir/phones/disambig.txt
+utils/sym2int.pl $dir/phones.txt <$dir/phones/disambig.txt >$dir/phones/disambig.int
+utils/sym2int.pl $dir/phones.txt <$dir/phones/disambig.txt | \
+  awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/disambig.csl
 
 silphone=`cat $srcdir/phones/optional_silence.txt` || exit 1;
 [ -z "$silphone" ] && \

From c737d942ecd66b3be49349883259d367473dcec1 Mon Sep 17 00:00:00 2001
From: armusc <46787089+armusc@users.noreply.github.com>
Date: Thu, 4 Apr 2019 19:31:09 +0200
Subject: [PATCH 055/163] [scripts] Bug-fix in make_lexicon_fst.py, which
 failed when --sil-prob=0  (#3206)

---
 egs/wsj/s5/utils/lang/make_lexicon_fst.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/egs/wsj/s5/utils/lang/make_lexicon_fst.py b/egs/wsj/s5/utils/lang/make_lexicon_fst.py
index 790af2f2314..e22222db340 100755
--- a/egs/wsj/s5/utils/lang/make_lexicon_fst.py
+++ b/egs/wsj/s5/utils/lang/make_lexicon_fst.py
@@ -209,7 +209,7 @@ def write_fst_no_silence(lexicon, nonterminals=None, left_context_phones=None):
 
     if nonterminals is not None:
         next_state = write_nonterminal_arcs(
-            start_state, loop_state, next_state,
+            loop_state, loop_state, next_state,
             nonterminals, left_context_phones)
 
     print("{state}\t{final_cost}".format(

From 57d63cc0437af4a5b32ec831febdfb558d3177d5 Mon Sep 17 00:00:00 2001
From: Shujian2015 <Shujian2015@users.noreply.github.com>
Date: Thu, 4 Apr 2019 15:18:00 -0400
Subject: [PATCH 056/163] [egs] Fix very small typo in run_tdnn_1b.sh (#3207)

---
 egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh
index f06ba3fa195..744c964db2f 100755
--- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh
+++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh
@@ -55,7 +55,7 @@ nnet3_affix=_cleaned  # cleanup affix for nnet3 and chain dirs, e.g. _cleaned
 # are just hardcoded at this level, in the commands below.
 train_stage=-10
 tree_affix=  # affix for tree directory, e.g. "a" or "b", in case we change the configuration.
-tdnnf_affix=_1a  #affix for TDNNF directory, e.g. "a" or "b", in case we change the configuration.
+tdnnf_affix=_1b  #affix for TDNNF directory, e.g. "a" or "b", in case we change the configuration.
 common_egs_dir=  # you can set this to use previously dumped egs.
 
 # End configuration section.

From 9393b668ab2d61408886b25829e940837b487ab8 Mon Sep 17 00:00:00 2001
From: "Patrick L. Lange" <patrick.l.lange@gmail.com>
Date: Thu, 4 Apr 2019 12:25:46 -0700
Subject: [PATCH 057/163] [build] Tensorflow version update (#3204)

---
 src/tfrnnlm/Makefile                  |  8 +++++---
 src/tfrnnlmbin/Makefile               | 10 ++++++----
 tools/extras/install_tensorflow_cc.sh |  9 +++++----
 3 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/src/tfrnnlm/Makefile b/src/tfrnnlm/Makefile
index db2b840b959..3dc8d584210 100644
--- a/src/tfrnnlm/Makefile
+++ b/src/tfrnnlm/Makefile
@@ -16,11 +16,13 @@ TENSORFLOW = ../../tools/tensorflow
 
 all:
 
-EXTRA_CXXFLAGS = -Wno-sign-compare -I$(TENSORFLOW)/bazel-tensorflow/external/protobuf/src \
+EXTRA_CXXFLAGS = -Wno-sign-compare \
+                 -I$(TENSORFLOW)/bazel-tensorflow/external/protobuf_archive/src \
                  -I$(TENSORFLOW)/bazel-genfiles -I$(TENSORFLOW) \
                  -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/eigen \
                  -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/nsync/public \
-                 -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/protobuf/src
+                 -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/protobuf/src \
+                 -I${TENSORFLOW}/tensorflow/contrib/makefile/downloads/absl
 
 OBJFILES = tensorflow-rnnlm.o
 
@@ -29,7 +31,7 @@ TESTFILES =
 LIBNAME = kaldi-tensorflow-rnnlm
 
 ADDLIBS = ../lm/kaldi-lm.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
-          ../base/kaldi-base.a 
+          ../base/kaldi-base.a
 LDLIBS +=  -lz -ldl -fPIC -lrt
 LDLIBS += -L$(TENSORFLOW)/bazel-bin/tensorflow -ltensorflow_cc -ltensorflow_framework
 
diff --git a/src/tfrnnlmbin/Makefile b/src/tfrnnlmbin/Makefile
index 4beeeb0d594..6963c0b62d0 100644
--- a/src/tfrnnlmbin/Makefile
+++ b/src/tfrnnlmbin/Makefile
@@ -14,11 +14,13 @@ TENSORFLOW = $(shell pwd)/../../tools/tensorflow
 
 all:
 
-EXTRA_CXXFLAGS = -Wno-sign-compare -I$(TENSORFLOW)/bazel-tensorflow/external/protobuf/src \
+EXTRA_CXXFLAGS = -Wno-sign-compare \
+                 -I$(TENSORFLOW)/bazel-tensorflow/external/protobuf_archive/src \
                  -I$(TENSORFLOW)/bazel-genfiles -I$(TENSORFLOW) \
                  -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/eigen \
                  -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/nsync/public \
-                 -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/protobuf/src
+                 -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/protobuf/src \
+                 -I${TENSORFLOW}/tensorflow/contrib/makefile/downloads/absl
 include ../kaldi.mk
 
 BINFILES = lattice-lmrescore-tf-rnnlm lattice-lmrescore-tf-rnnlm-pruned
@@ -30,11 +32,11 @@ TESTFILES =
 ADDLIBS = ../lat/kaldi-lat.a ../lm/kaldi-lm.a ../fstext/kaldi-fstext.a \
           ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \
           ../matrix/kaldi-matrix.a ../base/kaldi-base.a \
-          ../tfrnnlm/kaldi-tensorflow-rnnlm.a 
+          ../tfrnnlm/kaldi-tensorflow-rnnlm.a
 
 LDLIBS +=  -lz -ldl -fPIC -lrt
 LDLIBS += -L$(TENSORFLOW)/bazel-bin/tensorflow -ltensorflow_cc -ltensorflow_framework
 
-LDFLAGS += -Wl,-rpath=$(shell pwd)/../../tools/tensorflow/bazel-bin/tensorflow/
+LDFLAGS += -Wl,-rpath,$(TENSORFLOW)/bazel-bin/tensorflow/
 
 include ../makefiles/default_rules.mk
diff --git a/tools/extras/install_tensorflow_cc.sh b/tools/extras/install_tensorflow_cc.sh
index 95e81053e74..b13fcbeff44 100755
--- a/tools/extras/install_tensorflow_cc.sh
+++ b/tools/extras/install_tensorflow_cc.sh
@@ -25,7 +25,7 @@ else
 fi
 
 
-[ ! -f bazel.zip ] && wget https://github.com/bazelbuild/bazel/releases/download/0.5.4/bazel-0.5.4-dist.zip -O bazel.zip
+[ ! -f bazel.zip ] && wget https://github.com/bazelbuild/bazel/releases/download/0.15.0/bazel-0.15.0-dist.zip -O bazel.zip
 mkdir -p bazel
 cd bazel
 unzip ../bazel.zip
@@ -33,12 +33,13 @@ unzip ../bazel.zip
 cd ../
 
 # now bazel is built
-git clone https://github.com/tensorflow/tensorflow
+[ ! -d tensorflow ] && git clone https://github.com/tensorflow/tensorflow
 cd tensorflow
-git checkout r1.4
+git fetch --tags
+git checkout r1.12
 ./configure
 
-tensorflow/contrib/makefile/download_dependencies.sh 
+tensorflow/contrib/makefile/download_dependencies.sh
 bazel build -c opt //tensorflow:libtensorflow.so
 bazel build -c opt //tensorflow:libtensorflow_cc.so
 

From 4efc486defa78ddd6e6fb1410127005022d79da5 Mon Sep 17 00:00:00 2001
From: Shiyin Kang <kangshiyin@gmail.com>
Date: Sun, 7 Apr 2019 00:52:09 +0800
Subject: [PATCH 058/163] [src] Optimizations to CUDA kernels (#3209)

---
 src/cudamatrix/cu-kernels.cu | 57 ++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 29 deletions(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 515412ca398..bc5c32714ef 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -2487,7 +2487,7 @@ static void _heaviside(Real*y, const Real*x, MatrixDim d, int src_stride) {
 template<typename Real>
 __global__
 static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) {
-  __shared__ Real smem[CU1DBLOCK];
+  __shared__ Real smem;
   typedef cub::BlockReduce<Real, CU1DBLOCK> BlockReduceT;
   __shared__ typename BlockReduceT::TempStorage temp_storage;
   const int i = blockIdx.x;
@@ -2502,13 +2502,13 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) {
     tmax = fmax(tmax, x[x_start + j]);
   }
   tmax = BlockReduceT(temp_storage).Reduce(tmax, cub::Max());
-  if (tid == 0) {
-    smem[0] = tmax;
-  }
 
   // broadcast max to all threads
+  if (tid == 0) {
+    smem = tmax;
+  }
   __syncthreads();
-  Real max = smem[0];
+  Real max = smem;
 
   // sum_j(exp(x(i,j)-max))
   // reduce to CU1DBLOCK elements per row.
@@ -2517,13 +2517,13 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) {
     tsum += exp(x[x_start + j] - max);
   }
   tsum = BlockReduceT(temp_storage).Sum(tsum);
-  if (tid == 0) {
-    smem[0] = tsum;
-  }
 
   // broadcast sum to all threads
+  if (tid == 0) {
+    smem = tsum;
+  }
   __syncthreads();
-  Real inv_sum = Real(1) / smem[0];
+  Real inv_sum = Real(1) / smem;
 
   // normalize the row
   for (int j = tid; j < d.cols; j += CU1DBLOCK) {
@@ -2565,7 +2565,6 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x,
     tsum += x_row[j] * x_row[j];
   }
   tsum = BlockReduceT(temp_storage).Sum(tsum);
-  __syncthreads();
 
   if (tid == 0) {
     const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
@@ -2680,7 +2679,7 @@ template<typename Real>
 __global__
 static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim,
                                 int x_stride) {
-  __shared__ Real smem[CU1DBLOCK];
+  __shared__ Real smem;
   typedef cub::BlockReduce<Real, CU1DBLOCK> BlockReduceT;
   __shared__ typename BlockReduceT::TempStorage temp_storage;
   const int i = blockIdx.x;
@@ -2695,13 +2694,13 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim,
     tmax = fmax(tmax, x[x_start + j]);
   }
   tmax = BlockReduceT(temp_storage).Reduce(tmax, cub::Max());
-  if (tid == 0) {
-    smem[0] = tmax;
-  }
 
   // broadcast max to all threads
+  if (tid == 0) {
+    smem = tmax;
+  }
   __syncthreads();
-  Real max = smem[0];
+  Real max = smem;
 
   // sum_j(exp(x(i,j)-max))
   // reduce to CU1DBLOCK elements per row.
@@ -2710,13 +2709,13 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim,
     tsum += exp(x[x_start + j] - max);
   }
   tsum = BlockReduceT(temp_storage).Sum(tsum);
-  if (tid == 0) {
-    smem[0] = tsum;
-  }
 
   // broadcast sum to all threads
+  if (tid == 0) {
+    smem = tsum;
+  }
   __syncthreads();
-  Real log_sum = log(smem[0]);
+  Real log_sum = log(smem);
 
   // normalize the row
   for (int j = tid; j < y_dim.cols; j += CU1DBLOCK) {
@@ -2956,7 +2955,7 @@ __global__
 static void _diff_softmax(Real* x, const MatrixDim dim, const Real* value,
                           const int value_stride, const Real* diff,
                           const int diff_stride) {
-  __shared__ Real ssum[CU1DBLOCK];
+  __shared__ Real ssum;
   typedef cub::BlockReduce<Real, CU1DBLOCK> BlockReduceT;
   __shared__ typename BlockReduceT::TempStorage temp_storage;
 
@@ -2972,13 +2971,13 @@ static void _diff_softmax(Real* x, const MatrixDim dim, const Real* value,
     tsum += value[value_start + j] * diff[diff_start + j];
   }
   tsum = BlockReduceT(temp_storage).Sum(tsum);
-  if (tid == 0) {
-    ssum[0] = tsum;
-  }
 
   // Broadcast result to all threads
+  if (tid == 0) {
+    ssum = tsum;
+  }
   __syncthreads();
-  const Real pe = ssum[0];
+  const Real pe = ssum;
 
   // Apply element-wise x = value * (diff - pe)
   for (int j = tid; j < dim.cols; j += CU1DBLOCK) {
@@ -2998,7 +2997,7 @@ static void _diff_log_softmax(const MatrixDim in_deriv_dim,
                               const Real* out_deriv, const int out_deriv_stride,
                               Real* in_deriv) {
 
-  __shared__ Real ssum[CU1DBLOCK];
+  __shared__ Real ssum;
   typedef cub::BlockReduce<Real, CU1DBLOCK> BlockReduceT;
   __shared__ typename BlockReduceT::TempStorage temp_storage;
   const int tid = threadIdx.x;
@@ -3013,13 +3012,13 @@ static void _diff_log_softmax(const MatrixDim in_deriv_dim,
     tsum += out_deriv[out_deriv_start + j];
   }
   tsum = BlockReduceT(temp_storage).Sum(tsum);
-  if (tid == 0) {
-    ssum[0] = tsum;
-  }
 
   // Broadcast result to all threads
+  if (tid == 0) {
+    ssum = tsum;
+  }
   __syncthreads();
-  const Real sum_e = ssum[0];
+  const Real sum_e = ssum;
 
   // Apply element-wise x = out_deriv - exp(value) * sum_e
   for (int j = tid; j < in_deriv_dim.cols; j += CU1DBLOCK) {

From 59523dc16778c5d73a1d9611b356fd50cbab5331 Mon Sep 17 00:00:00 2001
From: Justin Luitjens <luitjens@users.noreply.github.com>
Date: Sat, 6 Apr 2019 20:19:05 -0600
Subject: [PATCH 059/163] [src] Move curand handle out of CuRand class and into
 CuDevice.  (#3196)

---
 src/cudamatrix/cu-device.cc | 19 ++++++++++++++++++
 src/cudamatrix/cu-device.h  | 28 +++++++++++++++++++++-----
 src/cudamatrix/cu-rand.cc   | 25 ++++++++++++++---------
 src/cudamatrix/cu-rand.h    | 40 +++----------------------------------
 4 files changed, 61 insertions(+), 51 deletions(-)

diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc
index e5d161521fd..d0708ef486d 100644
--- a/src/cudamatrix/cu-device.cc
+++ b/src/cudamatrix/cu-device.cc
@@ -123,6 +123,14 @@ void CuDevice::Initialize() {
     // Initialize the cuSPARSE library
     CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_));
     CUSPARSE_SAFE_CALL(cusparseSetStream(cusparse_handle_, cudaStreamPerThread));
+
+    // Initialize the cuRAND generator,
+    CURAND_SAFE_CALL(curandCreateGenerator(
+          &curand_handle_, CURAND_RNG_PSEUDO_DEFAULT));
+    // Seeding happens in SeedGpu() below; call srand() first for repeatability,
+    CURAND_SAFE_CALL(curandSetGeneratorOrdering(
+          curand_handle_, CURAND_ORDERING_PSEUDO_DEFAULT));
+    SeedGpu();
   }
 }
 
@@ -258,6 +266,14 @@ void CuDevice::FinalizeActiveGpu() {
     // Initialize the cuSPARSE library
     CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_));
     CUSPARSE_SAFE_CALL(cusparseSetStream(cusparse_handle_, cudaStreamPerThread));
+    
+    // Initialize the cuRAND generator,
+    CURAND_SAFE_CALL(curandCreateGenerator(
+          &curand_handle_, CURAND_RNG_PSEUDO_DEFAULT));
+    // Seeding happens in SeedGpu() below; call srand() first for repeatability,
+    CURAND_SAFE_CALL(curandSetGeneratorOrdering(
+          curand_handle_, CURAND_ORDERING_PSEUDO_DEFAULT));
+    SeedGpu();
 
     // Notify the user which GPU is being userd.
     char name[128];
@@ -529,6 +545,9 @@ CuDevice::~CuDevice() {
     CUBLAS_SAFE_CALL(cublasDestroy(cublas_handle_));
   if (cusparse_handle_)
     CUSPARSE_SAFE_CALL(cusparseDestroy(cusparse_handle_));
+  if (curand_handle_) {
+    CURAND_SAFE_CALL(curandDestroyGenerator(curand_handle_));
+  }
 }
 
 
diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h
index 8816f9d223b..7cca69f754b 100644
--- a/src/cudamatrix/cu-device.h
+++ b/src/cudamatrix/cu-device.h
@@ -26,6 +26,7 @@
 #if HAVE_CUDA == 1
 #include <cublas_v2.h>
 #include <cusparse.h>
+#include <curand.h>
 #include <map>
 #include <string>
 #include <iostream>
@@ -34,6 +35,7 @@
 #include "base/kaldi-common.h"
 #include "base/timer.h"
 #include "cudamatrix/cu-allocator.h"
+#include "cudamatrix/cu-common.h"
 
 namespace kaldi {
 
@@ -80,7 +82,16 @@ class CuDevice {
 
   inline cublasHandle_t GetCublasHandle() { return cublas_handle_; }
   inline cusparseHandle_t GetCusparseHandle() { return cusparse_handle_; }
-
+  inline curandGenerator_t GetCurandHandle() { return curand_handle_; }
+
+  inline void SeedGpu() {
+    if (CuDevice::Instantiate().Enabled()) {
+      // To get same random sequence, call srand() before the method is invoked,
+      CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(
+            curand_handle_, RandInt(128, RAND_MAX)));
+      CURAND_SAFE_CALL(curandSetGeneratorOffset(curand_handle_, 0));
+    }
+  }
   // We provide functions Malloc(), MallocPitch() and Free() which replace
   // cudaMalloc(), cudaMallocPitch() and cudaFree().  Their function is to cache
   // the results of previous allocations to avoid the very large overhead that
@@ -291,9 +302,8 @@ class CuDevice {
   int32 device_id_copy_;
 
   cublasHandle_t cublas_handle_;
-
   cusparseHandle_t cusparse_handle_;
-
+  curandGenerator_t curand_handle_;
 }; // class CuDevice
 
 
@@ -308,9 +318,17 @@ class CuTimer: public Timer {
 
 // This function is declared as a more convenient way to get the CUDA device handle for use
 // in the CUBLAS v2 API, since we so frequently need to access it.
-inline cublasHandle_t GetCublasHandle() { return CuDevice::Instantiate().GetCublasHandle(); }
+inline cublasHandle_t GetCublasHandle() { 
+  return CuDevice::Instantiate().GetCublasHandle(); 
+}
 // A more convenient way to get the handle to use cuSPARSE APIs.
-inline cusparseHandle_t GetCusparseHandle() { return CuDevice::Instantiate().GetCusparseHandle(); }
+inline cusparseHandle_t GetCusparseHandle() { 
+  return CuDevice::Instantiate().GetCusparseHandle(); 
+}
+
+inline curandGenerator_t GetCurandHandle() { 
+  return CuDevice::Instantiate().GetCurandHandle(); 
+}
 
 
 }  // namespace kaldi
diff --git a/src/cudamatrix/cu-rand.cc b/src/cudamatrix/cu-rand.cc
index 20439834a98..63d858c25e9 100644
--- a/src/cudamatrix/cu-rand.cc
+++ b/src/cudamatrix/cu-rand.cc
@@ -69,7 +69,8 @@ void CuRand<Real>::RandUniform(CuMatrixBase<Real> *tgt) {
     CuMatrix<Real> tmp(tgt->NumRows(), tgt->NumCols(), kUndefined,
                        kStrideEqualNumCols);
     size_t s = static_cast<size_t>(tmp.NumRows()) * static_cast<size_t>(tmp.Stride());
-    CURAND_SAFE_CALL(curandGenerateUniformWrap(gen_, tmp.Data(), s));
+    CURAND_SAFE_CALL(curandGenerateUniformWrap(
+          GetCurandHandle(), tmp.Data(), s));
     tgt->CopyFromMat(tmp);
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
@@ -86,7 +87,8 @@ void CuRand<Real>::RandUniform(CuMatrix<Real> *tgt) {
     CuTimer tim;
     // Here we don't need to use 'tmp' matrix,
     size_t s = static_cast<size_t>(tgt->NumRows()) * static_cast<size_t>(tgt->Stride());
-    CURAND_SAFE_CALL(curandGenerateUniformWrap(gen_, tgt->Data(), s));
+    CURAND_SAFE_CALL(curandGenerateUniformWrap(
+          GetCurandHandle(), tgt->Data(), s));
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
@@ -100,7 +102,8 @@ void CuRand<Real>::RandUniform(CuVectorBase<Real> *tgt) {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
-    CURAND_SAFE_CALL(curandGenerateUniformWrap(gen_, tgt->Data(), tgt->Dim()));
+    CURAND_SAFE_CALL(curandGenerateUniformWrap(
+          GetCurandHandle(), tgt->Data(), tgt->Dim()));
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
@@ -125,7 +128,8 @@ void CuRand<Real>::RandGaussian(CuMatrixBase<Real> *tgt) {
     MatrixIndexT num_cols_even = tgt->NumCols() + (tgt->NumCols() % 2); // + 0 or 1,
     CuMatrix<Real> tmp(tgt->NumRows(), num_cols_even, kUndefined,
                        kStrideEqualNumCols);
-    CURAND_SAFE_CALL(curandGenerateNormalWrap(gen_, tmp.Data(), tmp.NumRows()*tmp.Stride()));
+    CURAND_SAFE_CALL(curandGenerateNormalWrap(
+          GetCurandHandle(), tmp.Data(), tmp.NumRows()*tmp.Stride()));
     tgt->CopyFromMat(tmp.ColRange(0,tgt->NumCols()));
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
@@ -143,7 +147,8 @@ void CuRand<Real>::RandGaussian(CuMatrix<Real> *tgt) {
     // Here we don't need to use 'tmp' matrix, if the number of elements is even,
     MatrixIndexT num_elements = tgt->NumRows() * tgt->Stride();
     if (0 == (num_elements % 2)) {
-      CURAND_SAFE_CALL(curandGenerateNormalWrap(gen_, tgt->Data(), num_elements));
+      CURAND_SAFE_CALL(curandGenerateNormalWrap(
+            GetCurandHandle(), tgt->Data(), num_elements));
     } else {
       // We use 'tmp' matrix with one column added, this guarantees an even
       // number of elements.  Use the option kStrideEqualNumCols to ensure
@@ -152,8 +157,8 @@ void CuRand<Real>::RandGaussian(CuMatrix<Real> *tgt) {
       MatrixIndexT num_cols_even = tgt->NumCols() + (tgt->NumCols() % 2); // + 0 or 1,
       CuMatrix<Real> tmp(tgt->NumRows(), num_cols_even, kUndefined,
                          kStrideEqualNumCols);
-      CURAND_SAFE_CALL(curandGenerateNormalWrap(gen_, tmp.Data(),
-                                            tmp.NumRows() * tmp.Stride()));
+      CURAND_SAFE_CALL(curandGenerateNormalWrap(
+            GetCurandHandle(), tmp.Data(), tmp.NumRows() * tmp.Stride()));
       tgt->CopyFromMat(tmp.ColRange(0,tgt->NumCols()));
     }
     CuDevice::Instantiate().AccuProfile(__func__, tim);
@@ -174,11 +179,13 @@ void CuRand<Real>::RandGaussian(CuVectorBase<Real> *tgt) {
     // curandGenerateUniform(), curandGenerateUniformDouble().
     MatrixIndexT num_elements = tgt->Dim();
     if (0 == (num_elements % 2)) {
-      CURAND_SAFE_CALL(curandGenerateNormalWrap(gen_, tgt->Data(), tgt->Dim()));
+      CURAND_SAFE_CALL(curandGenerateNormalWrap(
+            GetCurandHandle(), tgt->Data(), tgt->Dim()));
     } else {
       MatrixIndexT dim_even = tgt->Dim() + (tgt->Dim() % 2); // + 0 or 1,
       CuVector<Real> tmp(dim_even, kUndefined);
-      CURAND_SAFE_CALL(curandGenerateNormalWrap(gen_, tmp.Data(), tmp.Dim()));
+      CURAND_SAFE_CALL(curandGenerateNormalWrap(
+            GetCurandHandle(), tmp.Data(), tmp.Dim()));
       tgt->CopyFromVec(tmp.Range(0,tgt->Dim()));
     }
     CuDevice::Instantiate().AccuProfile(__func__, tim);
diff --git a/src/cudamatrix/cu-rand.h b/src/cudamatrix/cu-rand.h
index fafc747df8d..6e0be648270 100644
--- a/src/cudamatrix/cu-rand.h
+++ b/src/cudamatrix/cu-rand.h
@@ -20,10 +20,7 @@
 #ifndef KALDI_CUDAMATRIX_CU_RAND_H_
 #define KALDI_CUDAMATRIX_CU_RAND_H_
 
-#if HAVE_CUDA == 1
-  #include <curand.h>
-#endif
-
+#include "cudamatrix/cu-device.h"
 #include "cudamatrix/cu-matrix.h"
 #include "cudamatrix/cu-vector.h"
 #include "base/kaldi-math.h"
@@ -33,36 +30,10 @@ namespace kaldi {
 template<typename Real>
 class CuRand {
  public:
-  CuRand() {
-  #if HAVE_CUDA == 1
-    if (CuDevice::Instantiate().Enabled()) {
-      // Initialize the generator,
-      CURAND_SAFE_CALL(curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT));
-      // To get same random sequence, call srand() before the constructor is invoked,
-      CURAND_SAFE_CALL(curandSetGeneratorOrdering(gen_, CURAND_ORDERING_PSEUDO_DEFAULT));
-      CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(gen_, RandInt(128, RAND_MAX)));
-      CURAND_SAFE_CALL(curandSetGeneratorOffset(gen_, 0));
-    }
-  #endif
-  }
 
-  ~CuRand() {
+   void SeedGpu() {
   #if HAVE_CUDA == 1
-    if (CuDevice::Instantiate().Enabled()) {
-      // Release the generator,
-      CURAND_SAFE_CALL(curandDestroyGenerator(gen_));
-    }
-  #endif
-  }
-
-  /// Generate new seed for the GPU,
-  void SeedGpu() {
-  #if HAVE_CUDA == 1
-    if (CuDevice::Instantiate().Enabled()) {
-      // To get same random sequence, call srand() before the method is invoked,
-      CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(gen_, RandInt(128, RAND_MAX)));
-      CURAND_SAFE_CALL(curandSetGeneratorOffset(gen_, 0));
-    }
+		CuDevice::Instantiate().SeedGpu();
   #endif
   }
 
@@ -88,11 +59,6 @@ class CuRand {
   void BinarizeProbs(const CuMatrix<Real> &probs, CuMatrix<Real> *states);
   /// add gaussian noise to each element,
   void AddGaussNoise(CuMatrix<Real> *tgt, Real gscale = 1.0);
-
- private:
-  #if HAVE_CUDA == 1
-  curandGenerator_t gen_;
-  #endif
 };
 
 }  // namsepace

From da729a58c189045379a5bbd733e267e7e48a57b2 Mon Sep 17 00:00:00 2001
From: "kkm (aka Kirill Katsnelson)" <kkm@smartaction.ai>
Date: Sat, 6 Apr 2019 19:20:26 -0700
Subject: [PATCH 060/163] [build] Make MKL the default BLAS library, add
 installation scripts (#3194)

---
 src/configure                      |   4 +-
 src/doc/build_setup.dox            |  10 +-
 src/doc/matrixwrap.dox             | 235 +++++++++++++++----------
 tools/extras/check_dependencies.sh | 180 ++++++++++----------
 tools/extras/install_mkl.sh        | 265 +++++++++++++++++++++++++++++
 5 files changed, 504 insertions(+), 190 deletions(-)
 create mode 100755 tools/extras/install_mkl.sh

diff --git a/src/configure b/src/configure
index 1013a3c162e..04c33236437 100755
--- a/src/configure
+++ b/src/configure
@@ -809,8 +809,8 @@ threaded_atlas=false
 mkl_threading=sequential
 android=false
 
-MATHLIB='ATLAS'
-ATLASROOT=`rel2abs ../tools/ATLAS_headers/`
+MATHLIB=MKL
+MKLROOT=/opt/intel/mkl
 FSTROOT=`rel2abs ../tools/openfst`
 CUBROOT=`rel2abs ../tools/cub`
 
diff --git a/src/doc/build_setup.dox b/src/doc/build_setup.dox
index 47ff7e033a8..5ea2e212b20 100644
--- a/src/doc/build_setup.dox
+++ b/src/doc/build_setup.dox
@@ -32,12 +32,12 @@
 
  The build process for Windows is separate from the build process for
  UNIX-like systems, and is described in windows/INSTALL (tested some time ago with
- Windows 7 and Microsoft Visual Studio 10.0).  We use scripts to
+ Windows 7 and Microsoft Visual Studio 2013).  We use scripts to
  create the Visual Studio 10.0 solution file.  There are two options for
- the math library on Windows: either you can use Cygwin to compile ATLAS, or you
- can use the Intel MKL library.  Detailed instructions are provided.  However, note
+ the math library on Windows: either Intel MKL, or use Cygwin to compile ATLAS.
+ Detailed instructions are provided.  However, note
  that the Windows setup is becoming out of date and is not regularly tested,
- and not all the code currently compiles on it.
+ and not all the code may compile.
 
  \section build_setup_configure How our configure script works (for UNIX variants)
 
@@ -143,6 +143,6 @@ preprocessor variables, setting compile options, linking with libraries, and so
 
 We have compiled Kaldi on Windows, Cygwin, various flavors of Linux (including
 Ubuntu, CentOS, Debian, Red Hat and SUSE), and Darwin. We recommend you use g++ version
-4.4 or above, although other compilers such as llvm and Intel's icc are also known to work.
+4.7 or above, although other compilers such as llvm and Intel's icc are also known to work.
 
 */
diff --git a/src/doc/matrixwrap.dox b/src/doc/matrixwrap.dox
index fb595d581fe..9cf5e92ca48 100644
--- a/src/doc/matrixwrap.dox
+++ b/src/doc/matrixwrap.dox
@@ -22,93 +22,155 @@ namespace kaldi {
 
 /** \page matrixwrap External matrix libraries
 
-  Here we describe how our \ref matrix "matrix library" makes use of 
+  Here we describe how our \ref matrix "matrix library" makes use of
   external libraries.
 
   \section matrixwrap_summary Overview
- 
-  The matrix code in Kaldi is mostly a wrapper on top of the
-  linear-algebra libraries BLAS and LAPACK.  The code has been designed to be as flexible
-  as possible in terms of what libraries it can use.  Currently it supports four options:
+
+  The matrix code in Kaldi is mostly a wrapper on top of the linear-algebra
+  libraries BLAS and LAPACK.  The code has been designed to be as flexible as
+  possible in terms of what libraries it can use.  Currently it supports four
+  options:
+    -  Intel MKL, which provides both BLAS and LAPACK (the default)
+    -  OpenBLAS, which provides BLAS and LAPACK
     -  ATLAS, which is an implementation of BLAS plus a subset of LAPACK (with a different interface)
     -  Some implementation of BLAS plus CLAPACK (note: this has not been tested recently).
-    -  Intel's MKL, which provides both BLAS and LAPACK
-    -  OpenBLAS, which provides BLAS and LAPACK
 
-  The code has to "know" which of these four options is being used, because although in principle
-  BLAS and LAPACK are standardized, there are some differences in the interfaces.
-  The Kaldi code requires exactly one
-  of the three strings HAVE_ATLAS, HAVE_CLAPACK, HAVE_OPENBLAS or HAVE_MKL to be defined 
-  (e.g. using -DHAVE_ATLAS as an option to the compiler).  It must then be 
-  linked with the appropriate libraries.  The code that deals most directly
-  with including the external libraries and setting up the appropriate
-  typedef's and defines, is in \ref kaldi-blas.h.   However, the rest of
-  the matrix code is not completely insulated from these issues because the ATLAS
-  and CLAPACK versions of higher-level routines are called differently (so
-  we have a lot of "#ifdef HAVE_ATLAS" directives and the like).  Additionally, some routines
-  are not even available in ATLAS so we have had to implement them ourselves.
-
-  The "configure" script in the "src" directory is responsible for setting up Kaldi to use the libraries.
-  It does this by creating the file "kaldi.mk" in the "src" directory, which gives appropriate flags
-  to the compiler.   If called with no arguments it will use any ATLAS installation it can find in "normal" places
-  in your system, but it is quite configurable.  See the script itself for usage.
-
- \section matrixwrap_blas Basic Linear Algebra Subroutines (BLAS)
-
-   Because we refer a lot to BLAS in this section, we briefly explain what it is. 
-   BLAS is a set of subroutine declarations that correspond to low-level
-   matrix-vector operations.  There is Level 1 Blas (vector-vector), Level 2
-   (vector-matrix) and Level 3 (matrix-matrix).  They have names like daxpy (for
-   double-precision a*x plus y), and dgemm (for double general matrix-matrix
-   multiply).  BLAS has various actual implementations.  The "reference BLAS",
-   supplied I believe by Netlib (the folks who also brought us the most common version
-   of LAPACK), is one.  ATLAS is another one (but it also implements some functions
-   from LAPACK).
-
- \section matrixwrap_lapack Linear Algebra PACKage (LAPACK)
-
-   Lapack is a set of linear-algebra routines, originally written in Fortran.  It includes
-   higher-level routines than BLAS, such as matrix inversion, SVD, etc.  
-   Netlib has implemented this (this is the "normal" LAPACK).  LAPACK requires
-   BLAS.  It is possible to mix-and-match LAPACK and BLAS implementations 
-   (e.g. Netlib's LAPACK with ATLAS's BLAS).
- 
-  CLAPACK is a version of LAPACK that has been converted from Fortan to C automatically
-  using the f2c utility.  When we talk about using LAPACK, we are actually
-  talking about using CLAPACK.  Because CLAPACK has been converted to C using the
-  f2c utility, when we link against it we need to include the f2c library (e.g. -lf2c,
-  or -lg2c if using recent versions of gcc), otherwise we will get linking errors.
-
-
-  \section matrixwrap_atlas Automatically Tuned Linear Algebra Software (ATLAS) 
+  The code has to "know" which of these four options is being used, because
+  although in principle BLAS and LAPACK are standardized, there are some
+  differences in the interfaces.  The Kaldi code requires exactly one of the
+  four macros \c HAVE_ATLAS, \c HAVE_CLAPACK, \c HAVE_OPENBLAS or \c HAVE_MKL
+  to be defined (e.g. using \c -DHAVE_ATLAS as an option to the compiler).
+  It must then be linked with the appropriate libraries.  The code that deals
+  most directly with including the external libraries and setting up the
+  appropriate typedef's and defines, is in \ref kaldi-blas.h.  However, the rest
+  of the matrix code is not completely insulated from these issues because the
+  ATLAS and CLAPACK versions of higher-level routines are called differently (so
+  we have a lot of "#ifdef HAVE_ATLAS" directives and the like).  Additionally,
+  some routines are not even available in ATLAS so we have had to implement them
+  ourselves.
+
+  The "configure" script in the "src" directory is responsible for setting up
+  Kaldi to use the libraries.  It does this by creating the file "kaldi.mk" in
+  the "src" directory, which gives appropriate flags to the compiler. If called
+  with no arguments it will use any Intel MKL installation it can find in
+  "normal" places in your system, but it is configurable. Run the script with
+  the \c \--help option for the complete option list.
+
+ \section matrixwrap_matalgebra Understanding BLAS and LAPACK
+
+  Because we refer a lot to BLAS (and more often CBLAS) and LAPACK (or, rarely,
+  CLAPACK) in this section, we briefly explain what they are.
+
+ \subsection matrixwrap_blas Basic Linear Algebra Subroutines (BLAS)
+
+  BLAS is a set of subroutine declarations that correspond to low-level
+  matrix-vector operations.  There is BLAS Level 1 (vector-vector), Level 2
+  (vector-matrix) and Level 3 (matrix-matrix). They have names like \c daxpy
+  (for \"<b>d</b>ouble-precision \b a \b x <b>p</b>lus \b y\"), and \c dgemm
+  (for "double-precision general matrix-matrix multiply"). BLAS has various
+  actual implementations. The <a href="http://www.netlib.org/blas/">reference
+  implementation of BLAS</a> originated back in 1979, and has been maintained
+  since by Netlib. The reference implementation lacks any optimization
+  whatsoever, and exists solely as a touchstone to validate the correctness of
+  other implementations. MKL, ATLAS and OpenBLAS provide optimized
+  implementations of BLAS.
+
+  CBLAS is just the C language interface to BLAS.
+
+ \subsection matrixwrap_lapack Linear Algebra PACKage (LAPACK)
+
+  LAPACK is a set of linear-algebra routines, originally written in Fortran.  It
+  includes higher-level routines than BLAS, such as matrix inversion, SVD, etc.
+  The <a href="https://github.com/Reference-LAPACK">reference implementation of
+  LAPACK</a> was implemented and has been maintained by Netlib.  LAPACK
+  internally uses BLAS. It is possible to mix-and-match LAPACK and BLAS
+  implementations (e.g. Netlib's LAPACK with ATLAS's BLAS).
+
+  CLAPACK is a version of LAPACK that has been converted from Fortran to C
+  automatically using the f2c utility. Because of this, the f2c library is
+  required during linking with the "original" CLAPACK (usually \c -lg2c or
+  \c -lf2c).
+
+  MKL provides complete C-callable interfaces for its own BLAS and LAPACK
+  implementations; no additional libraries are required.
+
+ \section matrixwrap_mkl Intel Math Kernel Library (MKL)
+
+  Intel MKL provides a C-language interface to a high-performance implementation
+  of the BLAS and LAPACK routines, and is currently the preferred CBLAS/CLAPACK
+  provider for Kaldi. To use MKL with Kaldi use the \c -DHAVE_MKL compiler flag.
+
+  MKL used to be a paid product. Starting in 2017, Intel made MKL freely
+  available and allows royalty-free runtime redistribution even for commercial
+  applications (although, just like, for example, CUDA, it is still a
+  closed-source commercial product).
+
+  MKL provides a very highly optimized implementation of linear algebra
+  routines, especially on Intel CPUs. In fact, the library contains multiple
+  code paths, which are selected at runtime depending on individual features of
+  the CPU it is being loaded on. Thus with MKL you will automatically benefit
+  from all features and instruction sets (such as AVX2 and AVX512) if they are
+  available on your CPU, without any additional configuration. These
+  instructions accelerate linear algebra operations on CPU significantly.  It is
+  usually a good idea to use a recent MKL version if your CPU is of a newer
+  architecture.
+
+  To simplify MKL setup on Linux, we provide a script
+  \c tools/extras/install_mkl.sh. We install only 64-bit binaries for MKL, but
+  after the \c install_mkl.sh script completes successfully, the Intel
+  repositories are registered on your system, and you can both obtain new
+  versions and 32-bit libraries using your system's package manager.
+
+  For Mac and Windows, <a href="https://software.intel.com/mkl/choose-download">
+  download the installer from Intel's Web site</a> (registration may be
+  required).  Refer to the same page in case the above Linux script does not
+  support your Linux distribution. The Intel installers (Mac, Windows) let you
+  select the 32-bit and 64-bit packages separately. To run Kaldi training
+  recipes only the 64-bit version is required.
+
+  We have tested Kaldi extensively with 64-bit libraries under Linux and
+  Windows.
+
+  The <a href="http://software.intel.com/articles/intel-mkl-link-line-advisor/">
+  MKL Link Line Advisor</a> is an interactive Web tool that allows configuring
+  the compiler flags for various systems and compilers, in case our "configure"
+  script does not cover it.
+  \n \b NOTE: Do not use the multithreaded mode for
+  Kaldi training (select "sequential" as the threading option). Our script and
+  binary setups are designed to run multiple processes on a single machine,
+  presumably maxing out its CPU, and an attempt to multi-thread linear algebra
+  computations will only adversely impact the performance.
+
+  \section matrixwrap_atlas Automatically Tuned Linear Algebra Software (ATLAS)
 
   ATLAS is a well known implementation of BLAS plus a subset of LAPACK.  The
   general idea of ATLAS is to tune to the particular processor setup, so the
   compilation process is quite complex and can take a while.  For this reason,
-  it can be quite tricky to compile ATLAS.  On UNIX-based systems, you can't even do it unless you 
+  it can be quite tricky to compile ATLAS.  On UNIX-based systems, you can't even do it unless you
   are root or are friendly with your system administrator, because to compile
   it you need to turn off CPU throttling; and on Windows, ATLAS does not compile
   "natively", only in Cygwin.  Sometimes it can be a better bet to find libraries that
   have been compiled by someone else for your particular platform, but we can't offer
-  much advice on how to do this.  ATLAS generally performs better 
+  much advice on how to do this.  ATLAS generally performs better
   than the "reference BLAS" available from Netlib.   ATLAS only includes
   a few LAPACK routines.  These include matrix inversion and Cholesky factorization,
   but not SVD.  For this reason we have implemented a couple more of the LAPACK
-  routines (SVD and eigenvalue decomposition); see 
+  routines (SVD and eigenvalue decomposition); see
   the next section.
-  
+
   ATLAS conforms to the BLAS interface, but its interface for the subset of
-  LAPACK routines that it provides is not the same as Netlib's (it's more
-  C-like and less FORTRAN-ish).  For this reason, there are quite a number of #ifdef's in our code
-  to switch between the calling styles, depending whether we are
+  LAPACK routines that it provides is not the same as Netlib's (it's more C-like
+  and less FORTRAN-ish).  For this reason, there are quite a number of \#ifdef's
+  in our code to switch between the calling styles, depending whether we are
   linking with ATLAS or CLAPACK.
-  
+
   \subsection matrixwrap_atlas_install_windows Installing ATLAS (on Windows)
 
   For instructions on how to install ATLAS on Windows (and note that these
   instructions require Cygwin), see the file windows/INSTALL.atlas
   in our source distribution.  Note that our Windows setup is not being
-  actvely maintained at the moment and we don't anticipate that it will work
+  actively maintained at the moment and we don't anticipate that it will work
   very cleanly.
 
   \subsection matrixwrap_atlas_install_linux Installing ATLAS (on Linux)
@@ -118,39 +180,31 @@ namespace kaldi {
   pre-built binaries available, they may not be the best binaries possible for your
   architecture so it is probably a better idea to compile from source.
   The easiest way to do this
-  is to cd from "src" to "../tools" and to run ./install_atlas.sh.  
+  is to cd from "src" to "../tools" and to run ./install_atlas.sh.
   If this does not work, the detailed installation
-  instructions can be found at: http://math-atlas.sourceforge.net/atlas_install/. 
-	
+  instructions can be found at: http://math-atlas.sourceforge.net/atlas_install/.
+
   One useful note is that before installing ATLAS you should turn off CPU
- throttling using "cpufreq-selector -g performance" (cpufreq-selector may be in
- sbin), if it is enabled (see the ATLAS install page).  You can first try running the 
- "install_atlas.sh" script before doing this, to see whether it works-- if CPU
+  throttling using "cpufreq-selector -g performance" (cpufreq-selector may be in
+  sbin), if it is enabled (see the ATLAS install page).  You can first try running the
+  "install_atlas.sh" script before doing this, to see whether it works-- if CPU
   throttling is enabled, the ATLAS installation scripts will die with an error.
-	
-	\section matrixwrap_mkl Intel Math Kernel Library (MKL)
-	Intel MKL also provides C-language interface to the BLAS and LAPACK routines,
-	and can be used with Kaldi by using the -DHAVE_MKL compiler flag. The linker
-	flags for MKL tend to be quite different depending on the OS, architecture, 
-	compiler, etc. used. We have tested Kaldi on 32-bit Windows and x86_64 (or EMT64) Linux.
-	Flags for other platforms can be obtained from:
-  http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/
 
   \section matrixwrap_openblas OpenBLAS
 
-    Kaldi now supports linking against the OpenBLAS library, which  is an implementation
+  Kaldi now supports linking against the OpenBLAS library, which  is an implementation
   of BLAS and parts of LAPACK.  OpenBLAS also automatically compiles Netlib's implementation of LAPACK,
-  so that it can explort LAPACK in its entirety.
+  so that it can export LAPACK in its entirety.
   OpenBLAS is a fork of the GotoBLAS project (an assembler-heavy implementation of BLAS) which is no longer being
   maintained.  In order to use GotoBLAS you can cd from "src" to "../tools", type
   "make openblas", then cd to "../src" and give the correct option to the "configure" script
   to use OpenBLAS (look at the comments at the top of the configure script to find this option).
   Thanks to Sola Aina for suggesting this and helping us to get this to work.
-  
+
   \section matrixwrap_jama Java Matrix Package (JAMA)
 
   JAMA is an implementation of linear-algebra routines for Java, written
-  in collaboration between NIST and MathWorks and put into the public domain 
+  in collaboration between NIST and MathWorks and put into the public domain
   (see math.nist.gov/javanumerics/jama).  We used some of this code to fill
   in a couple of holes in ATLAS-- specifically, if we're compiling with
  -DHAVE_ATLAS, we don't have the CLAPACK routines for SVD and eigenvalue
@@ -165,7 +219,7 @@ namespace kaldi {
   directory and see if it succeeds.  A lot of compilation issues will manifest themselves
   as linking errors.  In this section we give a summary of some of the more common
   linking errors (at least, those that relate specifically to the matrix library).
- 
+
    Depending on the compilation option (-DHAVE_CLAPACK, -DHAVE_LAPACK or -DHAVE_MKL),
   the code will be expecting to link with different things.  When debugging linking
   errors, bear in mind that the problem could be a mismatch between the compilation
@@ -182,7 +236,7 @@ namespace kaldi {
    s_cat, pow_dd, r_sign, pow_ri, pow_di, s_copy, s_cmp, d_sign
 
   \subsection matrix_err_clapack CLAPACK linking errors
-    
+
    You will get these errors if you compiled with -DHAVE_CLAPACK but did
    not provide the CLAPACK library.  The symbols you will be missing are:
 
@@ -195,15 +249,15 @@ namespace kaldi {
   but it supplies different symbols.   The native CLAPACK version of liblapack
   has symbols like those above (e.g. sgesvd_, sgetrf_), but the ATLAS version
   has symbols like clapack_sgetrf and also ones like ATL_sgetrf.
-  
+
   \subsection matrix_err_blas BLAS linking errors
-  
+
    You will get these errors if you failed to link against an implementation
    of BLAS.  These errors can also occur if libraries are linked in the wrong
    order.  CLAPACK requires BLAS, so you have to link BLAS after CLAPACK.
-   
+
    The symbols you will see if you failed to link with BLAS include:
-  
+
    cblas_sger, cblas_saxpy, cblas_dapy, cblas_ddot, cblas_sdot, cblas_sgemm, cblas_dgemm
 
    To fix these, link with a static library like libcblas.a, or do -lcblas (assuming
@@ -220,7 +274,7 @@ namespace kaldi {
   CLAPACK.  The cblaswrap library should be invoked before the cblas one.  If you
   are missing cblaswrap, you will see errors about symbols like:
 
-  f2c_sgemm, f2c_strsm, f2c_sswap, f2c_scopy, f2c_sspmv, f2c_sdot, f2c_sgemv 
+  f2c_sgemm, f2c_strsm, f2c_sswap, f2c_scopy, f2c_sspmv, f2c_sdot, f2c_sgemv
 
   and so on (there are a lot of these symbols).
 
@@ -235,15 +289,15 @@ namespace kaldi {
 
   \subsection matrix_err_atl_clapack Missing the ATLAS implementation of (parts of) CLAPACK
 
-  These errors can only occur if you compiled wiht the -DHAVE_ATLAS option.
+  These errors can only occur if you compiled with the -DHAVE_ATLAS option.
   Atlas's name for the CLAPACK routines are different from clapack's own (they
   have clapack_ prepended to indicate the origin, which can be quite confusing).
 
   If you have undefined references to the following symbols:
-  
+
    clapack_sgetrf, clapack_sgetri, clapack_dgetrf, clapack_dgetri
 
-  then it means you failed to link with an ATLAS library containing these symbols.  
+  then it means you failed to link with an ATLAS library containing these symbols.
   This may be variously called liblapack.a, libclapack.a or liblapack_atlas.a,
   but you can tell that it is the right one if it defines a symbol called ATL_cgetrf
   (type "nm <library-name> | grep ATL_cgetrf" to see).  You may be able to link
@@ -254,7 +308,6 @@ namespace kaldi {
   out is to look inside it using "nm" or "strings".
 
 
-
 */
 
 }
diff --git a/tools/extras/check_dependencies.sh b/tools/extras/check_dependencies.sh
index 1b63c4c99d9..0ee7e5b38dc 100755
--- a/tools/extras/check_dependencies.sh
+++ b/tools/extras/check_dependencies.sh
@@ -10,48 +10,45 @@ debian_packages=
 opensuse_packages=
 
 function add_packages {
-  redhat_packages="$redhat_packages $1";
-  debian_packages="$debian_packages $2";
-  opensuse_packages="$opensuse_packages $3";
+  redhat_packages="$redhat_packages $1"
+  debian_packages="$debian_packages ${2:-$1}"
+  opensuse_packages="$opensuse_packages ${3:-$1}"
 }
 
-if ! which which >&/dev/null; then
-  echo "$0: which is not installed."
-  add_packages which debianutils which
-fi
+function have { type -t "$1" >/dev/null; }
 
-COMPILER_VER_INFO=$($CXX --version 2>/dev/null)
-case $COMPILER_VER_INFO in
+compiler_ver_info=$($CXX --version 2>/dev/null)
+case $compiler_ver_info in
   "")
-    echo "$0: $CXX is not installed."
+    echo "$0: Compiler '$CXX' is not installed."
     echo "$0: You need g++ >= 4.8.3, Apple Xcode >= 5.0 or clang >= 3.3."
-    add_packages gcc-c++ g++ gcc-c++
+    add_packages gcc-c++ g++
     status=1
     ;;
   "g++ "* )
-    GCC_VER=$($CXX -dumpversion)
-    GCC_VER_NUM=$(echo $GCC_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d")
-    if [ $GCC_VER_NUM -lt 40803 ]; then
-        echo "$0: $CXX (g++-$GCC_VER) is not supported."
+    gcc_ver=$($CXX -dumpversion)
+    gcc_ver_num=$(echo $gcc_ver | sed 's/\./ /g' | xargs printf "%d%02d%02d")
+    if [ $gcc_ver_num -lt 40803 ]; then
+        echo "$0: Compiler '$CXX' (g++-$gcc_ver) is not supported."
         echo "$0: You need g++ >= 4.8.3, Apple clang >= 5.0 or LLVM clang >= 3.3."
         status=1
     fi
     ;;
   "Apple LLVM "* )
     # See https://gist.github.com/yamaya/2924292
-    CLANG_VER=$(echo $COMPILER_VER_INFO | grep version | sed "s/.*version \([0-9\.]*\).*/\1/")
-    CLANG_VER_NUM=$(echo $COMPILER_VER_INFO | grep version | sed "s/.*clang-\([0-9]*\).*/\1/")
-    if [ $CLANG_VER_NUM -lt 500 ]; then
-        echo "$0: $CXX (Apple clang-$CLANG_VER) is not supported."
+    clang_ver=$(echo $compiler_ver_info | grep version | sed "s/.*version \([0-9\.]*\).*/\1/")
+    clang_ver_num=$(echo $compiler_ver_info | grep version | sed "s/.*clang-\([0-9]*\).*/\1/")
+    if [ $clang_ver_num -lt 500 ]; then
+        echo "$0: Compiler '$CXX' (Apple clang-$clang_ver) is not supported."
         echo "$0: You need g++ >= 4.8.3, Apple clang >= 5.0 or LLVM clang >= 3.3."
         status=1
     fi
     ;;
   "clang "* )
-    CLANG_VER=$(echo $COMPILER_VER_INFO | grep version | sed "s/.*version \([0-9\.]*\).*/\1/")
-    CLANG_VER_NUM=$(echo $CLANG_VER | sed 's/\./ /g' | xargs printf "%d%02d")
-    if [ $CLANG_VER_NUM -lt 303 ]; then
-        echo "$0: $CXX (LLVM clang-$CLANG_VER) is not supported."
+    clang_ver=$(echo $compiler_ver_info | grep version | sed "s/.*version \([0-9\.]*\).*/\1/")
+    clang_ver_num=$(echo $clang_ver | sed 's/\./ /g' | xargs printf "%d%02d")
+    if [ $clang_ver_num -lt 303 ]; then
+        echo "$0: Compiler '$CXX' (LLVM clang-$clang_ver) is not supported."
         echo "$0: You need g++ >= 4.8.3, Apple clang >= 5.0 or LLVM clang >= 3.3."
         status=1
     fi
@@ -61,53 +58,55 @@ case $COMPILER_VER_INFO in
     ;;
 esac
 
-if ! echo "#include <zlib.h>" | $CXX -E - >&/dev/null; then
+# Cannot check this without a compiler.
+if have "$CXX" && ! echo "#include <zlib.h>" | $CXX -E - >&/dev/null; then
   echo "$0: zlib is not installed."
-  add_packages zlib-devel zlib1g-dev zlib-devel
+  add_packages zlib-devel zlib1g-dev
 fi
 
 for f in make automake autoconf patch grep bzip2 gzip unzip wget git sox; do
-  if ! which $f >&/dev/null; then
+  if ! have $f; then
     echo "$0: $f is not installed."
-    add_packages $f $f $f
+    add_packages $f
   fi
 done
 
-if ! which libtoolize >&/dev/null && ! which glibtoolize >&/dev/null; then
+if ! have libtoolize && ! have glibtoolize; then
   echo "$0: neither libtoolize nor glibtoolize is installed"
-  add_packages libtool libtool libtool
+  add_packages libtool
 fi
 
-if ! which svn >&/dev/null; then
+if ! have svn; then
   echo "$0: subversion is not installed"
-  add_packages subversion subversion subversion
+  add_packages subversion
 fi
 
-if ! which awk >&/dev/null; then
+if ! have awk; then
   echo "$0: awk is not installed"
-  add_packages gawk gawk gawk
+  add_packages gawk
 fi
 
 pythonok=true
-if ! which python2.7 >&/dev/null; then
+if ! have python2.7; then
   echo "$0: python2.7 is not installed"
-  add_packages python2.7 python2.7
+  add_packages python2.7
   pythonok=false
 fi
 
-if ! which python3 >&/dev/null; then
+if ! have python3; then
   echo "$0: python3 is not installed"
-  add_packages python3 python3
+  add_packages python3
   pythonok=false
 fi
 
 (
 #Use a subshell so that sourcing env.sh does not have an influence on the rest of the script
 [ -f ./env.sh ] && . ./env.sh
-if $pythonok && ! which python2 >&/dev/null; then
+if $pythonok && ! have python2; then
   mkdir -p $PWD/python
-  echo "$0: python2.7 is installed, but the python2 binary does not exist. Creating a symlink and adding this to tools/env.sh"
-  ln -s $(which python2.7) $PWD/python/python2
+  echo "$0: python2.7 is installed, but the python2 binary does not exist." \
+       "Creating a symlink and adding this to tools/env.sh"
+  ln -s $(command -v python2.7) $PWD/python/python2
   echo "export PATH=$PWD/python:\${PATH}" >> env.sh
 fi
 
@@ -115,13 +114,15 @@ if [[ -f $PWD/python/.use_default_python && -f $PWD/python/python ]]; then
   rm $PWD/python/python
 fi
 
-if $pythonok && which python >&/dev/null && [[ ! -f $PWD/python/.use_default_python ]]; then
-  version=`python 2>&1 --version | awk '{print $2}' `
+if $pythonok && have python && [[ ! -f $PWD/python/.use_default_python ]]; then
+  version=$(python 2>&1 --version | awk '{print $2}')
   if [[ $version != "2.7"* ]] ; then
-    echo "$0: WARNING python 2.7 is not the default python. We fixed this by adding a correct symlink more prominently on the path."
-    echo "$0: If you really want to use python $version as default, add an empty file $PWD/python/.use_default_python and run this script again."
+    echo "$0: WARNING python 2.7 is not the default python. We fixed this by" \
+         "adding a correct symlink more prominently on the path."
+    echo " ... If you really want to use python $version as default, add an" \
+         "empty file $PWD/python/.use_default_python and run this script again."
     mkdir -p $PWD/python
-    ln -s $(which python2.7) $PWD/python/python
+    ln -s $(command -v python2.7) $PWD/python/python
     echo "export PATH=$PWD/python:\${PATH}" >> env.sh
   fi
 fi
@@ -129,66 +130,61 @@ fi
 
 printed=false
 
-if which apt-get >&/dev/null && ! which zypper >/dev/null; then
-  # if we're using apt-get [but we're not OpenSuse, which uses zypper as the
-  # primary installer, but sometimes installs apt-get for some compatibility
-  # reason without it really working]...
-  if [ ! -z "$debian_packages" ]; then
-    echo "$0: we recommend that you run (our best guess):"
-    echo " sudo apt-get install $debian_packages"
-    printed=true
-    status=1
-  fi
-  if ! dpkg -l | grep -E 'libatlas3gf|libatlas3-base' >/dev/null; then
-    echo "You should probably do: "
-    echo " sudo apt-get install libatlas3-base"
-    printed=true
-  fi
-elif which yum >&/dev/null; then
-  if [ ! -z "$redhat_packages" ]; then
-    echo "$0: we recommend that you run (our best guess):"
-    echo " sudo yum install $redhat_packages"
-    printed=true
-    status=1
-  fi
-  if ! rpm -qa|  grep atlas >/dev/null; then
-    echo "You should probably do something like: "
-    echo "sudo yum install atlas.x86_64"
-    printed=true
-  fi
-elif which zypper >&/dev/null; then
-  if [ ! -z "$opensuse_packages" ]; then
-    echo "$0: we recommend that you run (our best guess):"
-    echo " sudo zypper install $opensuse_packages"
-    printed=true
-    status=1
-  fi
-  if ! zypper search -i | grep -E 'libatlas3|libatlas3-devel' >/dev/null; then
-    echo "You should probably do: "
-    echo "sudo zypper install libatlas3-devel"
-    printed=true
+# MKL. We do not know if compiler exists at this point, so double-check
+# the well-known mkl.h file location. The compiler test would still find
+# it if installed in an alternative location (this is unlikely).
+if [ ! -f /opt/intel/mkl/include/mkl.h ] &&
+   ! echo '#include <mkl.h>' | $CXX -I /opt/intel/mkl/include -E - >&/dev/null; then
+  if [[ $(uname) == Linux ]]; then
+    echo "$0: Intel MKL is not installed. Run extras/install_mkl.sh to install it."
+  else
+    echo "$0: Intel MKL is not installed. Download the installer package for your
+ ... system from: https://software.intel.com/mkl/choose-download."
   fi
+ echo "\
+ ... You can also use other matrix algebra libraries. For information, see:
+ ... http://kaldi-asr.org/doc/matrixwrap.html"
+  printed=true
 fi
 
-if [ ! -z "$debian_packages" ]; then
-  # If the list of packages to be installed is nonempty,
-  # we'll exit with error status.  Check this outside of
-  # checking for yum or apt-get, as we want it to exit with
-  # error even if we're not on Debian or red hat.
+# Report missing programs and libraries.
+if [ -n "$debian_packages" ]; then
+  install_pkg_command=$(
+    # Guess the package manager from ID/ID_LIKE in /etc/os-release. The $( )
+    # subshell keeps the eval'd variables from leaking into this script.
+    eval $(grep 2>/dev/null ^ID /etc/os-release) 2>/dev/null
+    for rune in ${ID-} ${ID_LIKE-}; do
+      # The case '(pattern)' syntax is necessary in subshell for bash 3.x.
+      case $rune in
+        (rhel|centos|redhat) echo "yum install $redhat_packages"; break;;
+        (fedora) echo "dnf install $redhat_packages"; break;;
+        (suse) echo "zypper install $opensuse_packages"; break;;
+        (debian) echo "apt-get install $debian_packages"; break;;
+      esac
+    done
+  )
+
+  # Print the suggestion to install missing packages.
+  if [ -n "$install_pkg_command" ]; then
+    echo "$0: Some prerequisites are missing; install them using the command:"
+    echo "  sudo" $install_pkg_command
+  else
+    echo "$0: The following prerequisites are missing; install them first:"
+    echo "  " $debian_packages
+  fi
   status=1
 fi
 
-
 if [ $(pwd | wc -w) -gt 1 ]; then
   echo "*** $0: Warning: Kaldi scripts will fail if the directory name contains a space."
   echo "***  (it's OK if you just want to compile a few tools -> disable this check)."
-  status=1;
+  status=1
 fi
 
-if which grep >&/dev/null && pwd | grep -E 'JOB|LMWT' >/dev/null; then
+if pwd | grep -E 'JOB|LMWT' >/dev/null; then
   echo "*** $0: Kaldi scripts will fail if the directory name contains"
   echo "***  either of the strings 'JOB' or 'LMWT'."
-  status=1;
+  status=1
 fi
 
 if ! $printed && [ $status -eq 0 ]; then
diff --git a/tools/extras/install_mkl.sh b/tools/extras/install_mkl.sh
new file mode 100755
index 00000000000..fe2ea7bdb65
--- /dev/null
+++ b/tools/extras/install_mkl.sh
@@ -0,0 +1,265 @@
+#!/bin/bash
+
+# Intel MKL is now freely available even for commercial use. This script
+# attempts to install the MKL package automatically from Intel's repository.
+#
+# For manual repository setup instructions, see:
+#   https://software.intel.com/articles/installing-intel-free-libs-and-python-yum-repo
+#   https://software.intel.com/articles/installing-intel-free-libs-and-python-apt-repo
+#
+# For other package managers, or non-Linux platforms, see:
+#   https://software.intel.com/mkl/choose-download
+
+set -o pipefail
+
+default_package=intel-mkl-64bit-2019.2-057
+
+yum_repo='https://yum.repos.intel.com/mkl/setup/intel-mkl.repo'
+apt_repo='https://apt.repos.intel.com/mkl'
+intel_key_url='https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB'
+
+Usage () {
+  cat >&2 <<EOF
+Usage: $0 [-s] [<MKL-package>]
+
+Checks if MKL is present on the system, and/or attempts to install it.
+
+If <MKL-package> is not provided, ${default_package} will be installed.
+
+Intel packages are installed under the /opt/intel directory. You should be root
+to install MKL into this directory; run this script using the sudo command.
+
+Options:
+  -s  - Skip check for MKL being already present.
+  -p <suse|redhat|debian|fedora> -- Force type of package management. Use only
+                                    if automatic detection fails, as instructed.
+  -h  - Show this message.
+
+Environment:
+  CC   The C compiler to use for MKL check. If not set, uses 'cc'.
+EOF
+  exit 2
+}
+
+Fatal () { echo >&2 "$0: $*"; exit 1; }  # Print a fatal error on stderr, exit 1.
+
+Have () { type -t "$1" >/dev/null; }
+
+# Option values.
+skip_cc=
+distro=
+
+while getopts ":hksp:" opt; do  # Leading ':' = silent mode; we report errors.
+  case ${opt} in
+    h) Usage ;;
+    s) skip_cc=yes ;;
+    p) case $OPTARG in
+         suse|redhat|debian|fedora) distro=$OPTARG ;;
+         *) Fatal "invalid value -p '${OPTARG}'.  Allowed: 'suse', 'redhat', 'debian' or 'fedora'." ;;
+       esac ;;
+    :) Fatal "option -${OPTARG} requires an argument; see '$0 -h'." ;;
+    \?) echo >&2 "$0: invalid option -${OPTARG}."; Usage ;;
+  esac
+done
+shift $((OPTIND-1))
+
+orig_arg_package=${1-''}
+package=${1:-$default_package}
+
+# Check that we are actually on Linux, otherwise give a helpful reference.
+[[ $(uname) == Linux ]] || Fatal "\
+This script can be used on Linux only, and your system is $(uname).
+
+Installer packages for Mac and Windows are available for download from Intel:
+https://software.intel.com/mkl/choose-download"
+
+# Test if MKL is already installed on the system.
+if [[ ! $skip_cc ]]; then
+  : ${CC:=cc}
+  Have "$CC" || Fatal "\
+C compiler $CC not found.
+
+You can skip the check for MKL presence by invoking this script with the '-s'
+option to this script, but you will need a functional compiler anyway, so we
+recommend that you install it first."
+
+  mkl_version=$($CC -E -I /opt/intel/mkl/include - <<< \
+                      '#include <mkl_version.h>
+           __INTEL_MKL__.__INTEL_MKL_MINOR__.__INTEL_MKL_UPDATE__' 2>/dev/null |
+                  tail -n 1 ) || mkl_version=
+  mkl_version=${mkl_version// /}
+
+  [[ $mkl_version ]] && Fatal "\
+MKL version $mkl_version is already installed.
+
+You can skip the check for MKL presence by invoking this script with the '-s'
+option and proceed with automated installation, but we highly discourage
+this. This script will register Intel repositories with your system, and it
+seems that they have been already registered, or MKL has been installed some
+other way.
+
+You should use your package manager to check which MKL package is already
+installed. Note that Intel packages register the latest installed version of
+the library as the default. If your installed version is older than
+$package, it makes sense to upgrade."
+fi
+
+# Try to determine which package manager the distro uses, unless overridden.
+if [[ ! $distro ]]; then
+  dist_vars=$(cat /etc/os-release 2>/dev/null)
+  eval "$dist_vars"
+  for rune in $CPE_NAME $ID $ID_LIKE; do
+    case "$rune" in
+      cpe:/o:fedoraproject:fedora:2[01]) distro=redhat; break;;  # Use yum.
+      rhel|centos) distro=redhat; break;;
+      redhat|suse|fedora|debian) distro=$rune; break;;
+    esac
+  done
+
+  # Certain old distributions do not have /etc/os-release. We are unlikely to
+  # encounter these in the wild, but just in case.
+  # NOTE: Do not try to guess Fedora specifically here! Fedora 20 and below
+  #       detect as redhat, and this is good, because they use yum by default.
+  [[ ! $distro && -f /etc/redhat-release ]] && distro=redhat
+  [[ ! $distro && -f /etc/SuSE-release ]]   && distro=suse
+  [[ ! $distro && -f /etc/debian_version ]] && distro=debian
+
+  [[ ! $distro ]] && Fatal "\
+Unable to determine package management style.
+
+Invoke this script with the option '-p <style>', where <style> can be:
+  redhat -- RedHat-like, uses yum and rpm for package management.
+  fedora -- Fedora 22+, also RedHat-like, but uses dnf instead of yum.
+  suse   -- SUSE-like, uses zypper and rpm.
+  debian -- Debian-like, uses apt and dpkg.
+
+We do not currently support other package management systems. Check the Intel's
+documentation at https://software.intel.com/mkl/choose-download for other
+install options."
+
+  echo >&2 "$0: Your system is using ${distro}-style package management."
+fi
+
+# Check for root.
+if [[ "$(id -u)" -ne 0 ]]; then
+  echo >&2 "$0: You must be root to install MKL.
+
+Restart this script using the 'sudo' command, as:
+
+  sudo $0 -sp $distro $package
+
+We recommend adding the '-sp $distro' options to skip the MKL and distro
+detection, since this has already been done. This minimizes the number of
+programs invoked with the root privileges to keep your system safe from
+unexpected or erroneous changes. Also, if you are setting the CC environment
+variable, sudo might not allow it to propagate to the command that it invokes."
+
+  if [ -t 0 ]; then
+    echo; read -ep "Run the above sudo command now? [Y/n]:"
+    case $REPLY in
+      ''|[Yy]*) set -x; exec sudo "$0" -sp "$distro" "$package"
+    esac
+  fi
+  exit 0
+fi
+
+# The install variants, each in a function to simplify error reporting.
+# Each one invokes a subshell with a 'set -x' to show system-modifying
+# commands it runs. The subshells simply limit the scope of these diagnostics
+# and avoid creating noise (if we were using 'set +x', it would be printed).
+Install_redhat () {
+  # yum-utils contains yum-config-manager, in case the user does not have it.
+  ( set -x
+    yum -y install yum-utils &&
+    yum-config-manager --add-repo "$yum_repo" &&
+    yum -y install "$package" )
+}
+
+Install_fedora () {
+  ( set -x
+    dnf -y install 'dnf-command(config-manager)' &&
+    dnf config-manager --add-repo "$yum_repo" &&
+    dnf -y install "$package" )
+}
+
+Install_suse () {
+  # zypper bug until libzypp-17.6.4: '--gpg-auto-import-keys' is ignored.
+  # See https://github.com/openSUSE/zypper/issues/144#issuecomment-418685933
+  # We must disable gpg checks with '--no-gpg-checks'. I won't bend backwards
+  # as far as check the installed .so version...
+  ( set -x
+    zypper addrepo "$yum_repo" &&
+    zypper --gpg-auto-import-keys --no-gpg-checks \
+           --non-interactive install "$package" )
+}
+
+Install_debian () {
+  local keyring='/usr/share/keyrings/intel-sw-products.gpg' \
+        sources_d='/etc/apt/sources.list.d' \
+        trusted_d='/etc/apt/trusted.gpg.d' \
+        apt_maj= apt_min= apt_ver=
+
+  # apt before 1.2 does not understand the signed-by option, and always
+  # looks for the keyring in its trusted.gpg.d directory. This is not
+  # considered a good security practice any more. If apt is old, add a link
+  # to the keyring file and remind the user to delete it when apt is upgraded.
+  IFS=' .' builtin read _ apt_maj apt_min _ < <(apt-get --version)
+  apt_ver=$(builtin printf '%03d%03d' $apt_maj $apt_min)
+
+  # Get alternative location of /etc/apt/sources.list.d, if so configured.
+  eval $(apt-config shell sources_d Dir::Etc::sourceparts/f \
+                          trusted_d Dir::Etc::trustedparts/f)
+
+  # apt is much more involved to configure than other package managers, as far
+  # as third-party security keys go.
+  ( set -x;
+    apt-get update &&
+    apt-get install -y wget apt-transport-https ca-certificates gnupg &&
+    wget -qO- "$intel_key_url" | apt-key --keyring "$keyring" add - &&
+    echo "deb [signed-by=${keyring}] $apt_repo all main" \
+         > "$sources_d/intel-mkl.list" ) || return 1
+
+  if [[ $apt_ver < '001002' ]]; then
+    ( set -x; ln -s "$keyring" "${trusted_d}/" ) || return 1
+  fi
+
+  ( set -x  # Trace the final install too, like the other modifying steps.
+    apt-get update &&
+    apt-get install -y "$package" ) || return 1
+
+  # Print the message after the large install, so the user may notice. I hope...
+  if [[ $apt_ver < '001002' ]]; then
+    echo >&2 "$0: Your apt-get version is earlier than 1.2.
+
+This version does not understand individual repositories signing keys, and
+trusts all keys in $trusted_d. We have created a link
+$trusted_d/$(basename "$keyring") pointing to the file
+$keyring. If/when you upgrade your system to
+a higher version of apt, removing this link will help make it more secure.
+
+This is not considered a severe security issue, but separating keyrings is the
+current recommended security practice."
+  fi
+}
+
+# Register MKL .so libraries with the ld.so.
+ConfigLdSo() {
+  [ -d /etc/ld.so.conf.d ] || return 0
+  type -t ldconfig >/dev/null || return 0
+  echo >&2 "$0: Configuring ld runtime bindings"
+  ( set -x;
+    echo >/etc/ld.so.conf.d/intel-mkl.conf "\
+/opt/intel/lib/intel64
+/opt/intel/mkl/lib/intel64"
+    ldconfig )
+}
+
+# Invoke installation.
+if Install_${distro} && ConfigLdSo; then
+  echo >&2 "$0: MKL package $package was successfully installed"
+else
+  Fatal "MKL package $package installation FAILED.
+
+Please open an issue with us at https://github.com/kaldi-asr/kaldi/ if you
+believe this is a bug."
+fi

From c8ada0cbd6aa3d1f582f848f4aa9fc5095e6ae55 Mon Sep 17 00:00:00 2001
From: Dr-Desty-Nova <49311045+Dr-Desty-Nova@users.noreply.github.com>
Date: Sun, 7 Apr 2019 19:16:02 +0300
Subject: [PATCH 061/163] [build] check for i686 as a valid prefix for Android
 triplets (#3213)

Add check for i686 as a valid prefix for Android triplets.

Kaldi configure script for 32-bit x86 expects x86 cross-compile
toolchains to start with x86.

Android compilers, however, have triplets like i686-linux-android-gcc
and i686-linux-android-ar, using i686 as a prefix.

Fixes: #3208
---
 src/configure | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/configure b/src/configure
index 04c33236437..a43b97a0989 100755
--- a/src/configure
+++ b/src/configure
@@ -991,7 +991,8 @@ if is_set $HOST; then
   IFS='-' read -ra PARTS <<< "$HOST"
   # The first field in the PARTS list is the target architecture.
   TARGET_ARCH="$PARTS"
-  if [[ "$TARGET_ARCH" != aarch64* && "$TARGET_ARCH" != arm* && "$TARGET_ARCH" != ppc64le && "$TARGET_ARCH" != x86* ]] ; then
+  if [[ "$TARGET_ARCH" != aarch64* && "$TARGET_ARCH" != arm* && "$TARGET_ARCH" != ppc64le && \
+        "$TARGET_ARCH" != x86* && "$TARGET_ARCH" != i686* ]] ; then
     # We currently only support building for x86[_64], arm*, aarch64* and ppc64le.
     # If TARGET_ARCH was read from the HOST variable, it must be one of these.
     failure "$TARGET_ARCH is not a supported architecture.

From e1ac00f4980456c76ae835b591df6058c3b2efd4 Mon Sep 17 00:00:00 2001
From: kkm <kkm@smartaction.com>
Date: Tue, 9 Apr 2019 09:01:16 -0700
Subject: [PATCH 062/163] [build] Fix configure breakage from #3194 (MKL
 default)

Establish MATHLIB as the single ground-truth source for the chosen
matrix algebra library.
---
 src/configure | 104 +++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 87 insertions(+), 17 deletions(-)

diff --git a/src/configure b/src/configure
index a43b97a0989..a5adbb65788 100755
--- a/src/configure
+++ b/src/configure
@@ -46,6 +46,15 @@
 # script, i.e. any change affecting kaldi.mk or the build system as a whole.
 CONFIGURE_VERSION=10
 
+# We support bash version 3.2 (Macs still ship with this version as of 2019)
+# and above. Compare numeric fields: a string compare misorders e.g. '10.0'.
+(( BASH_VERSINFO[0] < 3 || (BASH_VERSINFO[0] == 3 && BASH_VERSINFO[1] < 2) )) && {
+  echo >&2 "bash version ${BASH_VERSION} is too old, cannot continue." \
+           "You won't be able to run Kaldi recipes with it anyway." \
+           "Please upgrade. bash version 3.2 or higher is required."
+  exit 1;
+}
+
 if ! [ -x "$PWD/configure" ]; then
   echo 'You must run "configure" from the src/ directory.'
   exit 1
@@ -129,6 +138,8 @@ function read_dirname {
   echo $retval
 }
 
+# TODO(kkm): Kill this. `[[ ${var-} ]]' is the idiomatic equivalent in bash.
+#   Even better, do not rely on uninitialized variables.
 function is_set {
   local myvar=${1:-notset}
   if [ "$myvar" == "notset" ]; then
@@ -138,6 +149,11 @@ function is_set {
   fi
 }
 
+# Lowercase/uppercase argument. Only bash 4.2+ has internal facilities for this,
+# and we support versions down to 3.2.
+lcase () { awk '{print tolower($0)}' <<<"$1" ; }
+ucase () { awk '{print toupper($0)}' <<<"$1" ; }
+
 function failure {
   echo "***configure failed: $* ***" >&2
   if [ -f kaldi.mk ]; then rm kaldi.mk; fi
@@ -475,14 +491,17 @@ function configure_cuda {
     elif [ "`uname -m`" == "ppc64le" ]; then
       cat makefiles/cuda_64bit.mk >> kaldi.mk
     else
-      echo "CUDA will not be used! CUDA is not supported with 32-bit builds."
+      echo "\
+WARNING: CUDA will not be used!
+         CUDA is not supported with 32-bit builds."
       exit 1;
     fi
 
   else
-    echo "CUDA will not be used! If you have already installed cuda drivers "
-    echo "and cuda toolkit, try using --cudatk-dir=... option.  Note: this is"
-    echo "only relevant for neural net experiments"
+    echo "\
+WARNING: CUDA will not be used! If you have already installed cuda drivers
+         and CUDA toolkit, try using the --cudatk-dir= option. A GPU and CUDA
+         are required to run neural net experiments in a realistic time."
   fi
 }
 
@@ -776,14 +795,17 @@ function linux_configure_dynamic {
 
 # If configuration sets any of these variables, we will switch the external
 # math library. Here we unset them so that we can check later.
-unset MKLROOT
-unset CLAPACKROOT
-unset OPENBLASROOT
-unset MKLLIBDIR
+#TODO(kkm): Maybe allow env vars to provide defaults?
+ATLASROOT=
+CLAPACKROOT=
+MATHLIB=
+MKLLIBDIR=
+MKLROOT=
+OPENBLASROOT=
 
 # This variable identifies the type of system where built programs and
 # libraries will run. It is set by the configure script when cross compiling.
-unset HOST
+HOST=
 
 # These environment variables can be used to override the default toolchain.
 CXX=${CXX:-g++}
@@ -809,8 +831,6 @@ threaded_atlas=false
 mkl_threading=sequential
 android=false
 
-MATHLIB=MKL
-MKLROOT=/opt/intel/mkl
 FSTROOT=`rel2abs ../tools/openfst`
 CUBROOT=`rel2abs ../tools/cub`
 
@@ -1002,13 +1022,63 @@ else
   TARGET_ARCH="`uname -m`"
 fi
 
-# If one of these variables is set, we switch the external math library.
-is_set $MKLLIBDIR && echo "Configuring KALDI to use MKL" && export MATHLIB="MKL"
-is_set $MKLROOT && echo "Configuring KALDI to use MKL"&& export MATHLIB="MKL"
-is_set $CLAPACKROOT && echo "Configuring KALDI to use CLAPACK"&& export MATHLIB="CLAPACK"
-is_set $OPENBLASROOT && echo "Configuring KALDI to use OPENBLAS"&& export MATHLIB="OPENBLAS"
+#------------------------------------------------------------------------------
+# Matrix algebra library selection and validation.
+#--------------
+
+declare -a mathlibs   # Contains e. g. 'atlas', 'mkl'
+declare -a incompat   # Contains mutually-inconsistent switches, if any.
+auto_lib=             # Deduced lib name, used when $MATHLIB is not set.
+
+# Validate the (optionally) provided MATHLIB value.
+case $MATHLIB in
+  ''|ATLAS|CLAPACK|MKL|OPENBLAS) : ;;
+  *) failure "Unknown --mathlib='${MATHLIB}'. Supported libs: ATLAS CLAPACK MKL OPENBLAS" ;;
+esac
+
+# See which library-root switches are set, what mathlib they imply, and whether
+# there are any conflicts between the switches.
+[[ $MKLLIBDIR || $MKLROOT ]] && { mathlibs+=(mkl); auto_lib=MKL; }
+[[ $CLAPACKROOT  ]] && { mathlibs+=(clapack); auto_lib=CLAPACK; }
+[[ $OPENBLASROOT ]] && { mathlibs+=(openblas); auto_lib=OPENBLAS; }
+[[ $ATLASROOT    ]] && { mathlibs+=(atlas); auto_lib=ATLAS; }
+
+# When --mathlib= is explicitly provided, and some mathlib(s) deduced, but
+# MATHLIB is not among them, record a conflict for the --mathlib= value.
+shopt -s nocasematch
+[[ $MATHLIB && $mathlibs && ! " ${mathlibs[@]} " =~ " $MATHLIB " ]] &&
+  incompat+=(--mathlib=$MATHLIB)
+shopt -u nocasematch
+
+# If more than one library specified, or a conflict has been recorded above
+# already, then add all deduced libraries as conflicting options (not all may
+# be conflicting sensu stricto, but let the user deal with it).
+if [[ ${#mathlibs[@]} -gt 1 || $incompat ]]; then
+  for libpfx in "${mathlibs[@]}"; do
+    # Handle --mkl-libdir out of common pattern.
+    [[ $libpfx == mkl && $MKLLIBDIR ]] && incompat+=(--mkl-libdir=)
+    # All other switches follow the pattern --$libpfx-root.
+    incompat+=(--$(lcase $libpfx)-root=)
+  done
+  failure "Incompatible configuration switches: ${incompat[@]}"
+fi
+
+# When no library roots were provided, so that auto_lib is not deduced, and
+# MATHLIB is also not explicitly provided by the user, then default to MKL.
+[[ ! $auto_lib && ! $MATHLIB ]] && auto_lib=MKL
+: ${MATHLIB:=$auto_lib}
+export MATHLIB  #TODO(kkm): Likely not needed. Briefly tested without,
+                #    but left in the hotfix. Remove when doing the #3192.
+
+# Define default library roots where known (others may be found by probing).
+case $MATHLIB in
+  MKL) [[ ! $MKLLIBDIR && ! $MKLROOT ]] && MKLROOT=/opt/intel/mkl ;;
+  ATLAS) : ${ATLASROOT:=$(rel2abs ../tools/ATLAS_headers/)} ;;
+esac
+
+unset auto_lib incompat libpfx mathlibs
 
-echo "Configuring ..."
+echo "Configuring KALDI to use ${MATHLIB}."
 
 # Back up the old kaldi.mk in case we modified it
 if [ -f kaldi.mk ]; then

From c54b5e51d3cc7e8d1e2527e8fe7848ee8fbaa59f Mon Sep 17 00:00:00 2001
From: Tien-Hong Lo <teinhonglo@gmail.com>
Date: Wed, 10 Apr 2019 08:48:18 +0800
Subject: [PATCH 063/163] [build] Add missing line continuation '\' in
 tfrnnlmbin/Makefile (#3218)

---
 src/tfrnnlmbin/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tfrnnlmbin/Makefile b/src/tfrnnlmbin/Makefile
index 6963c0b62d0..77fe58c088c 100644
--- a/src/tfrnnlmbin/Makefile
+++ b/src/tfrnnlmbin/Makefile
@@ -15,7 +15,7 @@ TENSORFLOW = $(shell pwd)/../../tools/tensorflow
 all:
 
 EXTRA_CXXFLAGS = -Wno-sign-compare \
-                 -I$(TENSORFLOW)/bazel-tensorflow/external/protobuf_archive/src
+                 -I$(TENSORFLOW)/bazel-tensorflow/external/protobuf_archive/src \
                  -I$(TENSORFLOW)/bazel-genfiles -I$(TENSORFLOW) \
                  -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/eigen \
                  -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/nsync/public \

From 519493f2e132a4066d2b62147bdceba87d434a8a Mon Sep 17 00:00:00 2001
From: huangruizhe <eraser567@163.com>
Date: Thu, 11 Apr 2019 22:34:47 -0400
Subject: [PATCH 064/163] [src] Fix nnet2 DctComponent test failure (#3225)

---
 src/nnet2/nnet-component-test.cc | 915 +++++++++++++++++++++++++++++++
 1 file changed, 915 insertions(+)
 create mode 100644 src/nnet2/nnet-component-test.cc

diff --git a/src/nnet2/nnet-component-test.cc b/src/nnet2/nnet-component-test.cc
new file mode 100644
index 00000000000..40e6bef5a9f
--- /dev/null
+++ b/src/nnet2/nnet-component-test.cc
@@ -0,0 +1,915 @@
+// nnet2/nnet-component-test.cc
+
+// Copyright 2012-2014  Johns Hopkins University (author:  Daniel Povey)
+//                2015  Guoguo Chen
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "nnet2/nnet-component.h"
+#include "util/common-utils.h"
+
+namespace kaldi {
+namespace nnet2 {
+
+
+void UnitTestGenericComponentInternal(const Component &component,
+                                      const ChunkInfo in_info,
+                                      const ChunkInfo out_info)  {
+
+  CuMatrix<BaseFloat> input(in_info.NumRows(), in_info.NumCols()),
+      output(1, out_info.NumRows() * out_info.NumCols());
+  input.SetRandn();
+  CuVector<BaseFloat> objf_vec(out_info.NumCols()); // objective function is linear function of output.
+  objf_vec.SetRandn(); // set to Gaussian noise.
+
+  int32 rand_seed = Rand();
+
+  RandomComponent *rand_component =
+      const_cast<RandomComponent*>(dynamic_cast<const RandomComponent*>(&component));
+  if (rand_component != NULL) {
+    srand(rand_seed);
+    rand_component->ResetGenerator();
+  }
+  component.Propagate(in_info, out_info, input, &output);
+  {
+    bool binary = (Rand() % 2 == 0);
+    Output ko("tmpf", binary);
+    component.Write(ko.Stream(), binary);
+  }
+  Component *component_copy;
+  {
+    bool binary_in;
+    Input ki("tmpf", &binary_in);
+    component_copy = Component::ReadNew(ki.Stream(), binary_in);
+  }
+  unlink("tmpf");
+
+  { // Test backward derivative is correct.
+    CuVector<BaseFloat> output_objfs(out_info.NumRows());
+    output_objfs.AddMatVec(1.0, output, kNoTrans, objf_vec, 0.0);
+    BaseFloat objf = output_objfs.Sum();
+
+
+    CuMatrix<BaseFloat> output_deriv(output.NumRows(), output.NumCols());
+    for (int32 i = 0; i < output_deriv.NumRows(); i++)
+      output_deriv.Row(i).CopyFromVec(objf_vec);
+
+    CuMatrix<BaseFloat> input_deriv(input.NumRows(), input.NumCols());
+
+
+    CuMatrix<BaseFloat> empty_mat;
+    CuMatrix<BaseFloat> &input_ref =
+        (component_copy->BackpropNeedsInput() ? input : empty_mat),
+        &output_ref =
+        (component_copy->BackpropNeedsOutput() ? output : empty_mat);
+
+    component_copy->Backprop(in_info, out_info, input_ref, output_ref,
+                             output_deriv, NULL, &input_deriv);
+
+    int32 num_ok = 0, num_bad = 0, num_tries = 10;
+    KALDI_LOG << "Comparing feature gradients " << num_tries << " times.";
+    for (int32 i = 0; i < num_tries; i++) {
+      CuMatrix<BaseFloat> perturbed_input(input.NumRows(), input.NumCols());
+      {
+        RandomComponent *rand_component =
+            const_cast<RandomComponent*>(dynamic_cast<const RandomComponent*>(&component));
+        if (rand_component != NULL) {
+          srand(rand_seed);
+          rand_component->ResetGenerator();
+        }
+      }
+      perturbed_input.SetRandn();
+      perturbed_input.Scale(1.0e-04); // scale by a small amount so it's like a delta.
+      BaseFloat predicted_difference = TraceMatMat(perturbed_input,
+                                                   input_deriv, kTrans);
+      perturbed_input.AddMat(1.0, input); // now it's the input + a delta.
+      { // Compute objf with perturbed input and make sure it matches
+        // prediction.
+        CuMatrix<BaseFloat> perturbed_output(output.NumRows(), output.NumCols());
+        {
+          RandomComponent *rand_component =
+              const_cast<RandomComponent*>(dynamic_cast<const RandomComponent*>(&component));
+          if (rand_component != NULL) {
+            srand(rand_seed);
+            rand_component->ResetGenerator();
+          }
+        }
+        component.Propagate(in_info, out_info, perturbed_input, &perturbed_output);
+        CuVector<BaseFloat> perturbed_output_objfs(out_info.NumRows());
+        perturbed_output_objfs.AddMatVec(1.0, perturbed_output, kNoTrans,
+                                         objf_vec, 0.0);
+        BaseFloat perturbed_objf = perturbed_output_objfs.Sum(),
+             observed_difference = perturbed_objf - objf;
+        KALDI_LOG << "Input gradients: comparing " << predicted_difference
+                  << " and " << observed_difference;
+        if (fabs(predicted_difference - observed_difference) >
+            0.15 * fabs((predicted_difference + observed_difference)/2) &&
+            fabs(predicted_difference - observed_difference) > 1.0e-06) {
+          KALDI_WARN << "Bad difference!";
+          num_bad++;
+        } else {
+          num_ok++;
+        }
+      }
+    }
+    KALDI_LOG << "Succeeded for " << num_ok << " out of " << num_tries
+              << " tries.";
+    if (num_ok <= num_bad) {
+      delete component_copy;
+      KALDI_ERR << "Feature-derivative check failed";
+    }
+  }
+
+  UpdatableComponent *ucomponent =
+      dynamic_cast<UpdatableComponent*>(component_copy);
+
+  if (ucomponent != NULL) { // Test parameter derivative is correct.
+
+    int32 num_ok = 0, num_bad = 0, num_tries = 10;
+    KALDI_LOG << "Comparing model gradients " << num_tries << " times.";
+    for (int32 i = 0; i < num_tries; i++) {
+      UpdatableComponent *perturbed_ucomponent =
+          dynamic_cast<UpdatableComponent*>(ucomponent->Copy()),
+          *gradient_ucomponent =
+          dynamic_cast<UpdatableComponent*>(ucomponent->Copy());
+      KALDI_ASSERT(perturbed_ucomponent != NULL);
+      gradient_ucomponent->SetZero(true); // set params to zero and treat as gradient.
+      BaseFloat perturb_stddev = 5.0e-04;
+      perturbed_ucomponent->PerturbParams(perturb_stddev);
+
+      CuVector<BaseFloat> output_objfs(out_info.NumRows());
+      output_objfs.AddMatVec(1.0, output, kNoTrans, objf_vec, 0.0);
+      BaseFloat objf = output_objfs.Sum();
+
+      CuMatrix<BaseFloat> output_deriv(output.NumRows(), output.NumCols());
+      for (int32 i = 0; i < output_deriv.NumRows(); i++)
+        output_deriv.Row(i).CopyFromVec(objf_vec);
+      CuMatrix<BaseFloat> input_deriv; // (input.NumRows(), input.NumCols());
+
+      // This will compute the parameter gradient.
+      ucomponent->Backprop(in_info, out_info, input, output, output_deriv,
+                           gradient_ucomponent, &input_deriv);
+
+      // Now compute the perturbed objf.
+      BaseFloat objf_perturbed;
+      {
+        CuMatrix<BaseFloat> output_perturbed; // (num_egs, output_dim);
+        {
+          RandomComponent *rand_component =
+              const_cast<RandomComponent*>(dynamic_cast<const RandomComponent*>(&component));
+          if (rand_component != NULL) {
+            srand(rand_seed);
+            rand_component->ResetGenerator();
+          }
+        }
+        perturbed_ucomponent->Propagate(in_info, out_info, input, &output_perturbed);
+        CuVector<BaseFloat> output_objfs_perturbed(out_info.NumRows());
+        output_objfs_perturbed.AddMatVec(1.0, output_perturbed,
+                                         kNoTrans, objf_vec, 0.0);
+        objf_perturbed = output_objfs_perturbed.Sum();
+      }
+
+      BaseFloat delta_objf_observed = objf_perturbed - objf,
+          delta_objf_predicted = (perturbed_ucomponent->DotProduct(*gradient_ucomponent) -
+                                  ucomponent->DotProduct(*gradient_ucomponent));
+
+      KALDI_LOG << "Model gradients: comparing " << delta_objf_observed
+                << " and " << delta_objf_predicted;
+      if (fabs(delta_objf_predicted - delta_objf_observed) >
+          0.05 * (fabs(delta_objf_predicted + delta_objf_observed)/2) &&
+          fabs(delta_objf_predicted - delta_objf_observed) > 1.0e-06) {
+        KALDI_WARN << "Bad difference!";
+        num_bad++;
+      } else {
+        num_ok++;
+      }
+      delete perturbed_ucomponent;
+      delete gradient_ucomponent;
+    }
+    if (num_ok < num_bad) {
+      delete component_copy;
+      KALDI_ERR << "model-derivative check failed";
+    }
+  }
+  delete component_copy; // No longer needed.
+}
+
+void UnitTestGenericComponentInternal(const Component &component) {
+  int32 input_dim = component.InputDim(),
+      output_dim = component.OutputDim();
+
+  KALDI_LOG << component.Info();
+  int32 num_egs = 10 + Rand() % 5;
+  int32 num_chunks = 1,
+        first_offset = 0,
+        last_offset = num_egs-1;
+
+  ChunkInfo in_info(input_dim, num_chunks, first_offset, last_offset);
+  ChunkInfo out_info(output_dim, num_chunks, first_offset, last_offset);
+  UnitTestGenericComponentInternal(component, in_info, out_info);
+}
+
+
+
+void UnitTestSigmoidComponent() {
+  // We're testing that the gradients are computed correctly:
+  // the input gradients and the model gradients.
+
+  int32 input_dim = 10 + Rand() % 50;
+  {
+    SigmoidComponent sigmoid_component(input_dim);
+    UnitTestGenericComponentInternal(sigmoid_component);
+  }
+  {
+    SigmoidComponent sigmoid_component;
+    sigmoid_component.InitFromString("dim=15");
+    UnitTestGenericComponentInternal(sigmoid_component);
+  }
+}
+
+template<class T>
+void UnitTestGenericComponent(std::string extra_str = "") {
+  // works if it has an initializer from int,
+  // e.g. tanh, sigmoid.
+
+  // We're testing that the gradients are computed correctly:
+  // the input gradients and the model gradients.
+
+  int32 input_dim = 10 + Rand() % 50;
+  {
+    T component(input_dim);
+    UnitTestGenericComponentInternal(component);
+  }
+  {
+    T component;
+    component.InitFromString(static_cast<std::string>("dim=15 ") + extra_str);
+    UnitTestGenericComponentInternal(component);
+  }
+}
+
+void UnitTestMaxoutComponent() {
+  // Runs the generic component checks on MaxoutComponent, first with random
+  // dimensions and then with a component initialized from a config string.
+
+  // We're testing that the gradients are computed correctly:
+  // the input gradients and the model gradients.
+
+  for (int32 i = 0; i < 5; i++) {
+    int32 output_dim = 10 + Rand() % 20,
+        group_size = 1 + Rand() % 10,
+        input_dim = output_dim * group_size;
+
+    MaxoutComponent component(input_dim, output_dim);
+    UnitTestGenericComponentInternal(component);
+  }
+
+  {
+    MaxoutComponent component;
+    component.InitFromString("input-dim=15 output-dim=5");
+    UnitTestGenericComponentInternal(component);
+  }
+}
+
+void UnitTestPnormComponent() {
+  // We're testing that the gradients are computed correctly:
+  // the input gradients and the model gradients.
+
+  int32 num_fail = 0, num_tries = 4;
+  for (int32 i = 0; i < num_tries; i++) {
+    try {
+      int32 output_dim = 10 + Rand() % 20,
+          group_size = 1 + Rand() % 10,
+          input_dim = output_dim * group_size;
+      BaseFloat p = 1.0 + 0.1 * (Rand() % 20);
+
+      PnormComponent component(input_dim, output_dim, p);
+      UnitTestGenericComponentInternal(component);
+    } catch (...) {
+      KALDI_WARN << "Ignoring test failure in UnitTestPnormComponent().";
+      num_fail++;
+    }
+  }
+  if (num_fail >= num_tries/2) {
+    KALDI_ERR << "Too many test failures.";
+  }
+}
+
+void UnitTestMaxpoolingComponent() {
+  // Runs the generic component checks on MaxpoolingComponent, first with random
+  // pool settings and then with a component initialized from a config string.
+  // We're testing that the gradients are computed correctly:
+  // the input gradients and the model gradients.
+
+  for (int32 i = 0; i < 5; i++) {
+    int32 pool_stride = 5 + Rand() % 10,
+          pool_size = 2 + Rand() % 3,
+          num_pools = 1 + Rand() % 10;
+    int32 output_dim = num_pools * pool_stride;
+    int32 num_patches = num_pools * pool_size;
+    int32 input_dim = pool_stride * num_patches;
+
+    MaxpoolingComponent component(input_dim, output_dim,
+                                  pool_size, pool_stride);
+    UnitTestGenericComponentInternal(component);
+  }
+
+  {
+    MaxpoolingComponent component;
+    component.InitFromString("input-dim=192 output-dim=64 pool-size=3 pool-stride=16");
+    UnitTestGenericComponentInternal(component);
+  }
+}
+
+
+void UnitTestAffineComponent() {
+  BaseFloat learning_rate = 0.01,
+      param_stddev = 0.1, bias_stddev = 1.0;
+  int32 input_dim = 5 + Rand() % 10, output_dim = 5 + Rand() % 10;
+  {
+    AffineComponent component;
+    if (Rand() % 2 == 0) {
+      component.Init(learning_rate, input_dim, output_dim,
+                     param_stddev, bias_stddev);
+    } else {
+      Matrix<BaseFloat> mat(output_dim + 1, input_dim);
+      mat.SetRandn();
+      mat.Scale(param_stddev);
+      WriteKaldiObject(mat, "tmpf", true);
+      Sleep(0.5);
+      component.Init(learning_rate, "tmpf");
+      unlink("tmpf");
+    }
+    UnitTestGenericComponentInternal(component);
+  }
+  {
+    const char *str = "learning-rate=0.01 input-dim=10 output-dim=15 param-stddev=0.1";
+    AffineComponent component;
+    component.InitFromString(str);
+    UnitTestGenericComponentInternal(component);
+  }
+}
+
+void UnitTestConvolutional1dComponent() {
+  BaseFloat learning_rate = 0.01,
+            param_stddev = 0.1, bias_stddev = 1.0;
+  int32 patch_stride = 10, patch_step = 1, patch_dim = 4;
+  int32 num_patches = 1 + (patch_stride - patch_dim) / patch_step;
+  int32 num_splice = 5 + Rand() % 10, num_filters = 5 + Rand() % 10;
+  int32 input_dim = patch_stride * num_splice;
+  int32 filter_dim = patch_dim * num_splice;
+  int32 output_dim = num_patches * num_filters;
+  {
+    Convolutional1dComponent component;
+    if (Rand() % 2 == 0) {
+      component.Init(learning_rate, input_dim, output_dim,
+                     patch_dim, patch_step, patch_stride,
+                     param_stddev, bias_stddev, true);
+    } else {
+      Matrix<BaseFloat> mat(num_filters, filter_dim + 1);
+      mat.SetRandn();
+      mat.Scale(param_stddev);
+      WriteKaldiObject(mat, "tmpf", true);
+      Sleep(0.5);
+      component.Init(learning_rate, patch_dim,
+                     patch_step, patch_stride, "tmpf", false);
+      unlink("tmpf");
+    }
+    UnitTestGenericComponentInternal(component);
+  }
+  {
+    // appended-conv is false by default
+    const char *str = "learning-rate=0.01 input-dim=100 output-dim=70 param-stddev=0.1 patch-dim=4 patch-step=1 patch-stride=10";
+    Convolutional1dComponent component;
+    component.InitFromString(str);
+    UnitTestGenericComponentInternal(component);
+  }
+  {
+    const char *str = "learning-rate=0.01 input-dim=100 output-dim=70 param-stddev=0.1 patch-dim=4 patch-step=1 patch-stride=10 appended-conv=true";
+    Convolutional1dComponent component;
+    component.InitFromString(str);
+    UnitTestGenericComponentInternal(component);
+  }
+}
+
+void UnitTestDropoutComponent() {
+  // We're testing that the gradients are computed correctly:
+  // the input gradients and the model gradients.
+
+  int32 num_fail = 0, num_tries = 4;  // a single failing trial is tolerated; see the threshold below
+  for (int32 i = 0; i < num_tries; i++) {
+    try {
+      int32 input_dim = 10 + Rand() % 50;
+      {  // constructed directly from a random dimension and fixed numeric arguments
+        DropoutComponent dropout_component(input_dim, 0.5, 0.3);
+        UnitTestGenericComponentInternal(dropout_component);
+      }
+      {  // configured from an option string instead of constructor arguments
+        DropoutComponent dropout_component;
+        dropout_component.InitFromString("dim=15 dropout-proportion=0.6 dropout-scale=0.1");
+        UnitTestGenericComponentInternal(dropout_component);
+      }
+    } catch (...) {  // any exception thrown by a trial is counted as one failure
+      KALDI_WARN << "Ignoring test failure in UnitTestDropoutComponent().";
+      num_fail++;
+    }
+  }
+  if (num_fail >= num_tries/2) {  // with num_tries == 4: error out on 2 or more failed trials
+    KALDI_ERR << "Too many test failures.";
+  }
+}
+
+void UnitTestAdditiveNoiseComponent() {
+  // We're testing that the gradients are computed correctly:
+  // the input gradients and the model gradients.
+
+  int32 num_fail = 0, num_tries = 4;  // a single failing trial is tolerated; see the threshold below
+  for (int32 i = 0; i < num_tries; i++) {
+    try {
+      int32 input_dim = 10 + Rand() % 50;
+      {  // constructed directly from a random dimension and a fixed second argument
+        AdditiveNoiseComponent additive_noise_component(input_dim, 0.1);
+        UnitTestGenericComponentInternal(additive_noise_component);
+      }
+      {  // configured from an option string instead of constructor arguments
+        AdditiveNoiseComponent additive_noise_component;
+        additive_noise_component.InitFromString("dim=15 stddev=0.2");
+        UnitTestGenericComponentInternal(additive_noise_component);
+      }
+    } catch (...) {  // any exception thrown by a trial is counted as one failure
+      KALDI_WARN << "Ignoring failure in AdditiveNoiseComponent test";
+      num_fail++;
+    }
+  }
+  if (num_fail >= num_tries/2) {  // with num_tries == 4: error out on 2 or more failed trials
+    KALDI_ERR << "Too many test failures.";
+  }
+}
+
+void UnitTestScaleComponent() {
+  int32 dim = 1 + Rand() % 10;
+  BaseFloat scale = 0.1 + Rand() % 3;
+  {
+    ScaleComponent component;
+    if (Rand() % 2 == 0) {
+      component.Init(dim, scale);
+    } else {
+      std::ostringstream str;
+      str << "dim=" << dim << " scale=" << scale;
+      component.InitFromString(str.str());
+    }
+    UnitTestGenericComponentInternal(component);
+  }
+}
+
+
+void UnitTestAffineComponentPreconditioned() {
+  BaseFloat learning_rate = 0.01,
+      param_stddev = 0.1, bias_stddev = 1.0, alpha = 0.01,
+      max_change = 100.0;
+  int32 input_dim = 5 + Rand() % 10, output_dim = 5 + Rand() % 10;
+  {
+    AffineComponentPreconditioned component;
+    if (Rand() % 2 == 0) {
+      component.Init(learning_rate, input_dim, output_dim,
+                     param_stddev, bias_stddev,
+                     alpha, max_change);
+    } else {
+      Matrix<BaseFloat> mat(output_dim + 1, input_dim);
+      mat.SetRandn();
+      mat.Scale(param_stddev);
+      WriteKaldiObject(mat, "tmpf", true);
+      Sleep(0.5);
+      component.Init(learning_rate, alpha, max_change, "tmpf");
+      unlink("tmpf");
+    }
+    UnitTestGenericComponentInternal(component);
+  }
+  {
+    const char *str = "learning-rate=0.01 input-dim=16 output-dim=15 param-stddev=0.1 alpha=0.01";
+    AffineComponentPreconditioned component;
+    component.InitFromString(str);
+    UnitTestGenericComponentInternal(component);
+  }
+}
+
+
+void UnitTestAffineComponentPreconditionedOnline() {
+  BaseFloat learning_rate = 0.01,
+      param_stddev = 0.1, bias_stddev = 1.0, num_samples_history = 2000.0, alpha = 4.0,
+      max_change_per_sample = 0.1, update_period = 1;
+  int32 input_dim = 5 + Rand() % 10, output_dim = 5 + Rand() % 10,
+      rank_in = 1 + Rand() % 5, rank_out = 1 + Rand() % 5;
+  {
+    AffineComponentPreconditionedOnline component;
+    if (Rand() % 2 == 0) {
+      component.Init(learning_rate, input_dim, output_dim,
+                     param_stddev, bias_stddev,
+                     rank_in, rank_out, update_period,
+                     num_samples_history, alpha,
+                     max_change_per_sample);
+    } else {
+      Matrix<BaseFloat> mat(output_dim + 1, input_dim);
+      mat.SetRandn();
+      mat.Scale(param_stddev);
+      WriteKaldiObject(mat, "tmpf", true);
+      Sleep(0.5);
+      component.Init(learning_rate, rank_in, rank_out,
+                     update_period, num_samples_history, alpha,
+                     max_change_per_sample, "tmpf");
+      unlink("tmpf");
+    }
+    UnitTestGenericComponentInternal(component);
+  }
+  {
+    const char *str = "learning-rate=0.01 input-dim=16 output-dim=15 param-stddev=0.1 num-samples-history=3000 alpha=2.0 update-period=1 rank-in=5 rank-out=6";
+    AffineComponentPreconditionedOnline component;
+    component.InitFromString(str);
+    UnitTestGenericComponentInternal(component);
+  }
+}
+
+void UnitTestBlockAffineComponent() {
+  BaseFloat learning_rate = 0.01,
+      param_stddev = 0.1, bias_stddev = 0.1;
+  int32 num_blocks = 1 + Rand() % 3,
+         input_dim = num_blocks * (2 + Rand() % 4),
+        output_dim = num_blocks * (2 + Rand() % 4);
+
+  {
+    BlockAffineComponent component;
+    component.Init(learning_rate, input_dim, output_dim,
+                   param_stddev, bias_stddev, num_blocks);
+    UnitTestGenericComponentInternal(component);
+  }
+  {
+    const char *str = "learning-rate=0.01 input-dim=10 output-dim=15 param-stddev=0.1 num-blocks=5";
+    BlockAffineComponent component;
+    component.InitFromString(str);
+    UnitTestGenericComponentInternal(component);
+  }
+}
+
+void UnitTestBlockAffineComponentPreconditioned() {
+  BaseFloat learning_rate = 0.01,
+      param_stddev = 0.1, bias_stddev = 1.0, alpha = 3.0;
+  int32 num_blocks = 1 + Rand() % 3,
+         input_dim = num_blocks * (2 + Rand() % 4),
+        output_dim = num_blocks * (2 + Rand() % 4);
+
+  {
+    BlockAffineComponentPreconditioned component;
+    component.Init(learning_rate, input_dim, output_dim,
+                   param_stddev, bias_stddev, num_blocks, alpha);
+    UnitTestGenericComponentInternal(component);
+  }
+  {
+    const char *str = "learning-rate=0.01 input-dim=10 output-dim=15 param-stddev=0.1 num-blocks=5 alpha=3.0";
+    BlockAffineComponentPreconditioned component;
+    component.InitFromString(str);
+    UnitTestGenericComponentInternal(component);
+  }
+}
+
+
+void UnitTestSumGroupComponent() {
+  std::vector<int32> sizes;
+  int32 num_sizes = 1 + Rand() % 5;
+  for (int32 i = 0; i < num_sizes; i++)
+    sizes.push_back(1 + Rand() % 5);
+
+  {
+    SumGroupComponent component;
+    component.Init(sizes);
+    UnitTestGenericComponentInternal(component);
+  }
+  {
+    const char *str = "sizes=3:4:5";
+    SumGroupComponent component;
+    component.InitFromString(str);
+    UnitTestGenericComponentInternal(component);
+  }
+}
+
+
+void UnitTestDctComponent() {
+  int32 m = 3 + Rand() % 4, n = 3 + Rand() % 4,
+  dct_dim = m, dim = m * n;
+  bool reorder = (Rand() % 2 == 0);
+  {
+    DctComponent component;
+    component.Init(dim, dct_dim, reorder);
+    UnitTestGenericComponentInternal(component);
+  }
+  {
+    const char *str = "dim=10 dct-dim=5 reorder=true";
+    DctComponent component;
+    component.InitFromString(str);
+    UnitTestGenericComponentInternal(component);
+  }
+  {
+    const char *str = "dim=10 dct-dim=5 reorder=true dct-keep-dim=1";
+    DctComponent component;
+    component.InitFromString(str);
+    UnitTestGenericComponentInternal(component);
+  }
+  {
+    const char *str = "dim=10 dct-dim=5 reorder=true dct-keep-dim=2";
+    DctComponent component;
+    component.InitFromString(str);
+    UnitTestGenericComponentInternal(component);
+  }
+  {
+    const char *str = "dim=10 dct-dim=5 reorder=true dct-keep-dim=3";
+    DctComponent component;
+    component.InitFromString(str);
+    UnitTestGenericComponentInternal(component);
+  }
+  {
+    const char *str = "dim=10 dct-dim=5 reorder=true dct-keep-dim=4";
+    DctComponent component;
+    component.InitFromString(str);
+    UnitTestGenericComponentInternal(component);
+  }
+}
+
+
+void UnitTestFixedLinearComponent() {
+  int32 m = 1 + Rand() % 4, n = 1 + Rand() % 4;
+  {
+    CuMatrix<BaseFloat> mat(m, n);
+    mat.SetRandn();
+    FixedLinearComponent component;
+    component.Init(mat);
+    UnitTestGenericComponentInternal(component);
+  }
+}
+
+
+void UnitTestFixedAffineComponent() {
+  int32 m = 15 + Rand() % 4, n = 15 + Rand() % 4;
+  {
+    CuMatrix<BaseFloat> mat(m, n);
+    mat.SetRandn();
+    FixedAffineComponent component;
+    component.Init(mat);
+    UnitTestGenericComponentInternal(component);
+  }
+}
+
+void UnitTestFixedScaleComponent() {
+  int32 m = 1 + Rand() % 20;
+  {
+    CuVector<BaseFloat> vec(m);
+    vec.SetRandn();
+    FixedScaleComponent component;
+    component.Init(vec);
+    UnitTestGenericComponentInternal(component);
+  }
+}
+
+void UnitTestFixedBiasComponent() {
+  int32 m = 1 + Rand() % 20;
+  {
+    CuVector<BaseFloat> vec(m);
+    vec.SetRandn();
+    FixedBiasComponent component;
+    component.Init(vec);
+    UnitTestGenericComponentInternal(component);
+  }
+}
+
+
+
+void UnitTestParsing() {  // Exercises ParseFromString() for int32, BaseFloat, bool and vector<int32> targets.
+  int32 i;
+  BaseFloat f;
+  bool b;
+  std::vector<int32> v;
+  std::string s = "x=y";  // no "foo" key here: every call must return false and leave s unchanged
+  KALDI_ASSERT(ParseFromString("foo", &s, &i) == false
+               && s == "x=y");
+  KALDI_ASSERT(ParseFromString("foo", &s, &f) == false
+               && s == "x=y");
+  KALDI_ASSERT(ParseFromString("foo", &s, &v) == false
+               && s == "x=y");
+  KALDI_ASSERT(ParseFromString("foo", &s, &b) == false
+               && s == "x=y");
+  {  // int32: on success the "x=1" token is removed from s and the other tokens remain
+    std::string s = "x=1";
+    KALDI_ASSERT(ParseFromString("x", &s, &i) == true
+                 && i == 1 && s == "");
+    s = "a=b x=1";
+    KALDI_ASSERT(ParseFromString("x", &s, &i) == true
+                 && i == 1 && s == "a=b");
+  }
+  {  // bool: both "false" and "true" spellings, with the key in the middle of the string
+    std::string s = "foo=false";
+    KALDI_ASSERT(ParseFromString("foo", &s, &b) == true
+                 && b == false && s == "");
+    s = "x=y foo=true a=b";
+    KALDI_ASSERT(ParseFromString("foo", &s, &b) == true
+                 && b == true && s == "x=y a=b");
+  }
+
+  {  // BaseFloat: tokens without '=' ("foobar", "bxy") are left in place
+    std::string s = "foobar x=1";
+    KALDI_ASSERT(ParseFromString("x", &s, &f) == true
+                 && f == 1.0 && s == "foobar");
+    s = "a=b x=1 bxy";
+    KALDI_ASSERT(ParseFromString("x", &s, &f) == true
+                 && f == 1.0 && s == "a=b bxy");
+  }
+  {  // vector<int32>: colon-separated list
+    std::string s = "x=1:2:3";
+    KALDI_ASSERT(ParseFromString("x", &s, &v) == true
+                 && v.size() == 3 && v[0] == 1 && v[1] == 2 && v[2] == 3
+                 && s == "");
+    s = "a=b x=1:2:3 c=d";  // check the parsed vector here, not the float left over from the block above
+    KALDI_ASSERT(ParseFromString("x", &s, &v) == true
+                 && v.size() == 3 && v[0] == 1 && v[1] == 2 && v[2] == 3 && s == "a=b c=d");
+  }
+
+}
+
+void UnitTestSpliceComponent() {  // Tests SpliceComponent with contiguous, then random non-contiguous, splice indexes.
+  int32 feat_dim = RandInt(1, 20),
+      const_dim =  RandInt(0, 10),
+      left_context = RandInt(-5, 0),
+      right_context = RandInt(0, 5),
+      num_chunks = RandInt(1, 20);
+        // multiple chunks are required as splice component
+        // has separate index computation logic for more than one chunks
+  KALDI_LOG << " Feat_dim :" << feat_dim << " const_dim: " << const_dim  ;
+  std::vector<bool> contiguous(2);
+  contiguous[0] = true;
+  contiguous[1] = false;
+  for (int32 i = 0; i < contiguous.size(); i++) {
+    std::vector<int32> splice_indexes;
+    if (contiguous[i]) {
+      // create contiguous set of splice indexes in the range
+      // [left_context, right_context]  (left_context is already <= 0)
+      KALDI_LOG << "Testing contiguous splice component";
+      splice_indexes.reserve(right_context - left_context + 1);
+      for (int32 t = left_context; t <= right_context; t++)
+        splice_indexes.push_back(t);
+    } else  {
+      // generate random splice indexes in range [left_context, right_context]
+      KALDI_LOG << "Testing non-contiguous splice component";
+      int32 num_left_splice_indexes = RandInt(0, -left_context) + 1;
+      int32 num_right_splice_indexes = RandInt(0, right_context);
+      splice_indexes.reserve(num_left_splice_indexes + num_right_splice_indexes);
+      while (splice_indexes.size() < num_left_splice_indexes)  {
+        int32 new_index = RandInt(left_context, 0);
+        // check if the index already exists in the vector
+        if (std::find(splice_indexes.begin(), splice_indexes.end(), new_index)
+            == splice_indexes.end())  {
+          splice_indexes.push_back(new_index);
+        }
+      }
+      while (splice_indexes.size() < num_left_splice_indexes + num_right_splice_indexes)  {
+        int32 new_index = RandInt(0, right_context);
+        // check if the index already exists in the vector
+        if (std::find(splice_indexes.begin(), splice_indexes.end(), new_index)
+            == splice_indexes.end())  {
+          splice_indexes.push_back(new_index);
+        }
+      }
+      std::sort(splice_indexes.begin(), splice_indexes.end());
+      if (splice_indexes.back() < 0) // will fail assertion in init of component
+        splice_indexes.push_back(0);
+    }
+    std::vector<int32> input_offsets;  // each splice index relative to the smallest one
+    for (int32 j = 0; j < splice_indexes.size(); j++) {
+      input_offsets.push_back(splice_indexes[j] - splice_indexes.front());
+      KALDI_LOG << j << " : " << splice_indexes[j] << " : " << input_offsets[j] ;
+    }
+    int32 output_offset = -splice_indexes.front();
+    SpliceComponent component;  // stack object, like the other tests: nothing leaks if the test below throws
+    component.Init(feat_dim + const_dim, splice_indexes, const_dim);
+    ChunkInfo in_info = ChunkInfo(feat_dim + const_dim, num_chunks,
+                                  input_offsets),
+              out_info = ChunkInfo(feat_dim * splice_indexes.size() + const_dim,
+                                   num_chunks, output_offset, output_offset);
+    UnitTestGenericComponentInternal(component, in_info, out_info);
+    // 'component' is destroyed automatically at the end of each loop pass.
+  }
+}
+
+void BasicDebugTestForSpliceMax(bool output=false) {
+  int32 C=5,
+        context_len=2,
+        R= 3 + 2*context_len;
+
+  SpliceMaxComponent *c = new SpliceMaxComponent();
+  std::vector<int32> context(2 * context_len + 1);
+  for (int32 i = -1 * context_len; i <= context_len; i++)
+    context[i + context_len] = i;
+  c->Init(C, context);
+  CuMatrix<BaseFloat> in(R, C), in_deriv(R, C);
+  CuMatrix<BaseFloat> out(R, c->OutputDim());
+  ChunkInfo in_info = ChunkInfo(C, 1, 0, R - 1),
+            out_info = ChunkInfo(C, 1, context_len, R - 1 - context_len);
+
+  in.SetRandn();
+  if (output)
+    KALDI_LOG << in;
+
+  c->Propagate(in_info, out_info, in, &out);
+
+  if (output)
+    KALDI_LOG << out;
+
+  out.Set(5.0);
+
+  if (output)
+    KALDI_LOG << out;
+
+  c->Backprop(in_info, out_info, in, in, out, c, &in_deriv);
+
+  if (output)
+    KALDI_LOG << in_deriv;
+
+  delete c;
+}
+
+
+} // namespace nnet2
+} // namespace kaldi
+
+#include "matrix/matrix-functions.h"
+
+
+int main() {
+  using namespace kaldi;
+  using namespace kaldi::nnet2;
+
+  int32 loop = 0;  // stays 0 when built without CUDA, where the outer loop below is compiled out
+#if HAVE_CUDA == 1
+  for (loop = 0; loop < 2; loop++) {
+    //// Uncomment the following line to expose the bug in UnitTestDropoutComponent
+    //CuDevice::Instantiate().SetDebugStrideMode(true);
+    if (loop == 0)
+      CuDevice::Instantiate().SelectGpuId("no"); // first pass: run without a GPU
+    else
+      CuDevice::Instantiate().SelectGpuId("optional"); // second pass: use a GPU if one is available
+#endif
+
+    BasicDebugTestForSpliceMax(true);
+    // We used to test this 3 times, but now that nnet2 is rarely changed,
+    // reducing it to once.
+    for (int32 i = 0; i < 1; i++) {
+      UnitTestGenericComponent<SigmoidComponent>();
+      UnitTestGenericComponent<TanhComponent>();
+      UnitTestGenericComponent<PowerComponent>("power=1.5");
+      UnitTestGenericComponent<PowerComponent>("power=1.0");
+      UnitTestGenericComponent<PermuteComponent>();
+      UnitTestGenericComponent<SoftmaxComponent>();
+      UnitTestGenericComponent<LogSoftmaxComponent>();
+      UnitTestGenericComponent<RectifiedLinearComponent>();
+      UnitTestGenericComponent<SoftHingeComponent>();
+      UnitTestSpliceComponent();
+      UnitTestMaxoutComponent();
+      UnitTestPnormComponent();
+      UnitTestMaxpoolingComponent();
+      UnitTestGenericComponent<NormalizeComponent>();
+      UnitTestSigmoidComponent();
+      UnitTestAffineComponent();
+      UnitTestScaleComponent();
+      UnitTestBlockAffineComponent();
+      UnitTestBlockAffineComponentPreconditioned();
+      UnitTestSumGroupComponent();
+      UnitTestDctComponent();
+      UnitTestFixedLinearComponent();
+      UnitTestFixedAffineComponent();
+      UnitTestFixedScaleComponent();
+      UnitTestFixedBiasComponent();
+      UnitTestAffineComponentPreconditioned();
+      UnitTestAffineComponentPreconditionedOnline();
+      UnitTestConvolutional1dComponent();
+      UnitTestDropoutComponent();
+      UnitTestAdditiveNoiseComponent();
+      UnitTestParsing();
+      if (loop == 0)
+        KALDI_LOG << "Tests without GPU use succeeded.";
+      else
+        KALDI_LOG << "Tests with GPU use (if available) succeeded.";
+    }
+#if HAVE_CUDA == 1
+  } // Closes the outer 'loop' for; that loop exists only when HAVE_CUDA == 1.
+  CuDevice::Instantiate().PrintProfile();
+#endif
+  return 0;
+}

From d7685cb226763d1e6a8ab25a71099d20d1e2fd92 Mon Sep 17 00:00:00 2001
From: Shiyin Kang <kangshiyin@gmail.com>
Date: Fri, 12 Apr 2019 12:27:48 +0800
Subject: [PATCH 065/163] [src] Update  CUDA code to avoid synchronization
 errors on compute capability 7.x (#3211)

---
 src/cudamatrix/cu-kernels.cu | 138 ++++++++++++++++-------------------
 1 file changed, 63 insertions(+), 75 deletions(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index bc5c32714ef..9ecf0d0a4de 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -949,14 +949,18 @@ static void _copy_cols_from_vec(Real* m_out, MatrixDim d, const Real* v_in) {
 // _trace_mat_mat reduce the partial sum to
 // value[blockIdx.y * gridDim.x + blockIdx.x]
 // It use shared mem to transpose matrix B to ensure coalesced memory access
+// 2D block (8x32) is used
 template<int TileDim, typename Real>
 __global__
 static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA,
                            int B_stride, Real* value) {
   // Reuse shared mem and make indexing easier. "+1" to avoid bank conflict
+  const int kWarpSize = 32;
+  typedef cub::BlockReduce<Real, kWarpSize, cub::BLOCK_REDUCE_WARP_REDUCTIONS,
+      CU1DBLOCK / kWarpSize> BlockReduceT;
   __shared__ union {
     Real trans[TileDim][TileDim + 1];
-    Real sum[CU1DBLOCK];
+    typename BlockReduceT::TempStorage sum;
   } smem;
 
   // linear thread id;
@@ -999,39 +1003,27 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA,
     jb += grid_height;
   }
 
-  smem.sum[tid] = tsum;
-  __syncthreads();
-
   // Block reduce
-# pragma unroll
-  for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
-    if (tid < shift)
-      smem.sum[tid] += smem.sum[tid + shift];
-    __syncthreads();
-  }
-
-  // Warp reduce. Implicitly synchronized within a warp.
-  if (tid < warpSize) {
-#   pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
-      smem.sum[tid] += smem.sum[tid + shift];
-    }
-  }
+  tsum = BlockReduceT(smem.sum).Sum(tsum);
 
   // output 1 sum per thread block
   if (tid == 0) {
-    value[blockIdx.y * gridDim.x + blockIdx.x] = smem.sum[0];
+    value[blockIdx.y * gridDim.x + blockIdx.x] = tsum;
   }
 
 }
 
 // _trace_mat_mat_trans reduce the partial sum to
 // value[blockIdx.y * gridDim.x + blockIdx.x]
+// 2D block (8x32) is used
 template<typename Real>
 __global__
 static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA,
                                  int B_stride, Real* value) {
-  __shared__ Real ssum[CU1DBLOCK];
+  const int kWarpSize = 32;
+  typedef cub::BlockReduce<Real, kWarpSize, cub::BLOCK_REDUCE_WARP_REDUCTIONS,
+      CU1DBLOCK / kWarpSize> BlockReduceT;
+  __shared__ typename BlockReduceT::TempStorage smem;
 
   // linear thread id;
   const int32_cuda tid = threadIdx.y * blockDim.x + threadIdx.x;
@@ -1047,28 +1039,13 @@ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA,
       i += grid_height;
     }
   }
-  ssum[tid] = tsum;
-  __syncthreads();
-  
-  // Block reduce
-# pragma unroll
-  for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
-    if (tid < shift)
-      ssum[tid] += ssum[tid + shift];
-    __syncthreads();
-  }
 
-  // Warp reduce. Implicitly synchronized within a warp.
-  if (tid < warpSize) {
-#   pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
-      ssum[tid] += ssum[tid + shift];
-    }
-  }
+  // Block reduce
+  tsum = BlockReduceT(smem).Sum(tsum);
 
   // output 1 sum per thread block
   if (tid == 0) {
-    value[blockIdx.y * gridDim.x + blockIdx.x] = ssum[0];
+    value[blockIdx.y * gridDim.x + blockIdx.x] = tsum;
   }
 }
 
@@ -1079,7 +1056,8 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M,
                                   const MatrixDim dim_M, const Real* N,
                                   const int stride_N, const Real beta,
                                   Real* v) {
-  __shared__ Real ssum[CU1DBLOCK];
+  typedef cub::BlockReduce<Real, CU1DBLOCK> BlockReduceT;
+  __shared__ typename BlockReduceT::TempStorage smem;
   const int tid = threadIdx.x;
   const int i = blockIdx.x;
   const int m_start = i * dim_M.stride;
@@ -1090,28 +1068,13 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M,
   for (int j = tid; j < dim_M.cols; j += CU1DBLOCK) {
     tsum += M[m_start + j] * N[n_start + j];
   }
-  ssum[tid] = tsum;
-  __syncthreads();
 
-  // Tree reduce to 2x warpSize elements.
-# pragma unroll
-  for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
-    if (tid < shift)
-      ssum[tid] += ssum[tid + shift];
-    __syncthreads();
-  }
-
-  // Warp reduce to 1 element. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-#   pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
-      ssum[tid] += ssum[tid + shift];
-    }
-  }
+  // Block reduce
+  tsum = BlockReduceT(smem).Sum(tsum);
 
   // output 1 sum per thread block
   if (tid == 0) {
-    v[i] = alpha * ssum[0] + beta * v[i];
+    v[i] = alpha * tsum + beta * v[i];
   }
 }
 
@@ -1152,11 +1115,13 @@ static void _add_diag_mat_mat_MTN(const Real alpha, const Real* M,
   }
 
   // Warp reduce to 1 element per column.
-  // Threads implicitly synchronized within a warp.
   if (tid < warpSize) {
 #   pragma unroll
     for (int shift = warpSize; shift >= TileDim; shift >>= 1) {
-      ssum[tid] += ssum[tid + shift];
+      Real buf = ssum[tid + shift];
+      __syncwarp();
+      ssum[tid] += buf;
+      __syncwarp();
     }
   }
 
@@ -1235,11 +1200,13 @@ static void _add_diag_mat_mat_MN(const Real alpha, const Real* M,
   }
 
   // Warp reduce to 1 element per column.
-  // Threads implicitly synchronized within a warp.
   if (tid < warpSize) {
 #   pragma unroll
     for (int shift = warpSize; shift >= TileDim; shift >>= 1) {
-      smem.sum[tid] += smem.sum[tid + shift];
+      Real buf = smem.sum[tid + shift];
+      __syncwarp();
+      smem.sum[tid] += buf;
+      __syncwarp();
     }
   }
 
@@ -1688,10 +1655,14 @@ static void _vec_transform_reduce(
     __syncthreads();
   }
 
-  // Reduce last warp. Threads implicitly synchronized within a warp.
+  // Reduce last warp.
   if (tid < warpSize) {
+# pragma unroll
     for (int shift = warpSize; shift > 0; shift >>= 1) {
-      sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
+      Real buf = op.Reduce(sdata[tid], sdata[tid + shift]);
+      __syncwarp();
+      sdata[tid] = buf;
+      __syncwarp();
     }
   }
 
@@ -1727,10 +1698,15 @@ static void _transform_reduce_mat_cols(
     __syncthreads();
   }
 
-  // Reduce last warp. Threads implicitly synchronized within a warp.
+  // Reduce last warp.
   if (tid < warpSize) {
-    for (int shift = warpSize; shift > 0; shift >>= 1)
-      sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
+# pragma unroll
+    for (int shift = warpSize; shift > 0; shift >>= 1) {
+      Real buf = op.Reduce(sdata[tid], sdata[tid + shift]);
+      __syncwarp();
+      sdata[tid] = buf;
+      __syncwarp();
+    }
   }
 
   // Output to vector result.
@@ -1784,13 +1760,15 @@ static void _group_transform_reduce(
     }
 
     // Warp-reduce to 1 element per group.
-    // Threads implicitly synchronized within the warp.
     const int warp_reduce_size =
         threads_per_group / 2 < warpSize ? threads_per_group / 2 : warpSize;
     if (threadIdx.x < warp_reduce_size) {
 #     pragma unroll
       for (int shift = warp_reduce_size; shift > 0; shift >>= 1) {
-        sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]);
+        Real buf = op.Reduce(sreduction[tid], sreduction[tid + shift]);
+        __syncwarp();
+        sreduction[tid] = buf;
+        __syncwarp();
       }
     }
 
@@ -2628,8 +2606,12 @@ static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv,
   if (tid < warpSize) {
 #   pragma unroll
     for (int shift = warpSize; shift > 0; shift >>= 1) {
-      sprod[tid] += sprod[tid + shift];
-      snorm[tid] += snorm[tid + shift];
+      Real buf_prod = sprod[tid + shift];
+      Real buf_norm = snorm[tid + shift];
+      __syncwarp();
+      sprod[tid] += buf_prod;
+      snorm[tid] += buf_norm;
+      __syncwarp();
     }
   }
 
@@ -2899,6 +2881,7 @@ static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id,
   sidx[tid] = tidx;
 
   // Parallel reduce
+  // Reduce to warpSize elements per row
 #pragma unroll
   for (int32_cuda num_working_threads = CU1DBLOCK / 2;
       num_working_threads >= warpSize; num_working_threads >>= 1) {
@@ -2910,16 +2893,21 @@ static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id,
       }
     }
   }
-  // Warp reduce without __syncthreads()
-  // (note.: synchronizes implicitly within a warp at the multiprocessor)
+
+  // Warp reduce to 1 element per row
   if (tid < warpSize / 2) {
 #pragma unroll
     for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0;
         num_working_threads >>= 1) {
-      if (smax[tid + num_working_threads] > smax[tid]) {
-        smax[tid] = smax[tid + num_working_threads];
-        sidx[tid] = sidx[tid + num_working_threads];
+      Real max_tid = smax[tid];
+      Real max_tid_nwt = smax[tid + num_working_threads];
+      int32_cuda idx_tid_nwt = sidx[tid + num_working_threads];
+      __syncwarp();
+      if (max_tid_nwt > max_tid) {
+        smax[tid] = max_tid_nwt;
+        sidx[tid] = idx_tid_nwt;
       }
+      __syncwarp();
     }
   }
 

From cbdb930d617eb4cb26e20c624e3811301efa5749 Mon Sep 17 00:00:00 2001
From: huangruizhe <eraser567@163.com>
Date: Fri, 12 Apr 2019 02:03:28 -0400
Subject: [PATCH 066/163] [src] fix nnet2 DCTCompnent test failure -- removing
 anther dct_keep_dim=1 (#3226)

---
 src/nnet2/nnet-component-test.cc | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/nnet2/nnet-component-test.cc b/src/nnet2/nnet-component-test.cc
index 40e6bef5a9f..5aeaf28cd1e 100644
--- a/src/nnet2/nnet-component-test.cc
+++ b/src/nnet2/nnet-component-test.cc
@@ -619,12 +619,6 @@ void UnitTestDctComponent() {
     component.InitFromString(str);
     UnitTestGenericComponentInternal(component);
   }
-  {
-    const char *str = "dim=10 dct-dim=5 reorder=true dct-keep-dim=1";
-    DctComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
   {
     const char *str = "dim=10 dct-dim=5 reorder=true dct-keep-dim=2";
     DctComponent component;

From d22530fc87c88cf4f45b1207b470c3f8c044d6e7 Mon Sep 17 00:00:00 2001
From: "kkm (aka Kirill Katsnelson)" <kkm@smartaction.ai>
Date: Sun, 14 Apr 2019 07:28:28 -0700
Subject: [PATCH 067/163] [build] Remove references to deprecated MKL libs in
 gst_plugin (#3229)

These libraries are long gone, see
https://software.intel.com/en-us/articles/intel-math-kernel-library-intel-mkl-112-deprecations

Fixes #2202
---
 src/gst-plugin/Makefile | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/gst-plugin/Makefile b/src/gst-plugin/Makefile
index 92af0483a6e..4d4764b6006 100644
--- a/src/gst-plugin/Makefile
+++ b/src/gst-plugin/Makefile
@@ -34,11 +34,6 @@ ifneq ($(wildcard ../../tools/portaudio/install/include/pa_linux_alsa.h),)
     EXTRA_LDLIBS += -lasound
 endif
 
-# MKL libs required when linked via shared library
-ifdef MKLROOT
-	EXTRA_LDLIBS+=-lmkl_p4n -lmkl_def
-endif
-
 # Library so name and rpath
 CXX_VERSION=$(shell $(CXX) --version 2>/dev/null)
 ifneq (,$(findstring clang, $(CXX_VERSION)))

From e0cce5b19a4720cae7c6d27c8fa63df021412f12 Mon Sep 17 00:00:00 2001
From: rezame <36230722+rezame@users.noreply.github.com>
Date: Mon, 15 Apr 2019 02:16:16 +0430
Subject: [PATCH 068/163] [scripts] Fix default params in nnet3 segmentation
 script (#3230)

---
 .../cleanup/clean_and_segment_data_nnet3.sh   | 22 ++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh b/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh
index 35b07d184f4..cc8da298d2f 100755
--- a/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh
+++ b/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh
@@ -23,12 +23,12 @@ cleanup=true  # remove temporary directories and files
 nj=4
 # Decode options
 graph_opts=
+scale_opts=
 beam=15.0
 lattice_beam=1.0
 
 acwt=0.1  # Just a default value, used for adaptation and beam-pruning..
-post_decode_acwt=1.0  # can be used in 'chain' systems to scale acoustics by 10 so the
-                      # regular scoring script works.
+lmwt=10
 
 # Contexts must ideally match training
 extra_left_context=0  # Set to some large value, typically 40 for LSTM (must match training)
@@ -109,6 +109,22 @@ cp $srcdir/cmvn_opts $dir
 cp $srcdir/{splice_opts,delta_opts,final.mat,final.alimdl} $dir 2>/dev/null || true
 cp $srcdir/frame_subsampling_factor $dir 2>/dev/null || true
 
+if [ -f $srcdir/frame_subsampling_factor ]; then  # file present => treat the model as a chain system
+  echo "$0: guessing that this is a chain system, checking parameters."
+  if [ -z "$scale_opts" ]; then  # quoted: a user-supplied value may contain several words
+    echo "$0: setting scale_opts"
+    scale_opts="--self-loop-scale=1.0 --transition-scale=1.0"
+  fi
+  if [ "$acwt" == 0.1 ]; then  # still at the script default, so switch to the chain value
+    echo "$0: setting acwt=1.0"
+    acwt=1.0
+  fi
+  if [ "$lmwt" == 10 ]; then  # still at the script default, so switch to the chain value
+    echo "$0: setting lmwt=1"
+    lmwt=1
+  fi
+fi
+
 utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt
 cp $lang/phones.txt $dir
 
@@ -142,7 +158,7 @@ if [ $stage -le 3 ]; then
   echo "$0: Decoding with biased language models..."
 
   steps/cleanup/decode_segmentation_nnet3.sh \
-    --acwt $acwt --post-decode-acwt $post_decode_acwt \
+    --acwt $acwt  \
     --beam $beam --lattice-beam $lattice_beam --nj $nj --cmd "$cmd --mem 4G" \
     --skip-scoring true --allow-partial false \
     --extra-left-context $extra_left_context \

From cbd1aa3c2f52bcfff9ec04e324297bf46c92dba2 Mon Sep 17 00:00:00 2001
From: Karel Vesely <vesis84@gmail.com>
Date: Tue, 16 Apr 2019 02:32:11 +0200
Subject: [PATCH 069/163] [src] Correct sanity check in nnet-example-utils.cc
 (nnet3) (#3232)

---
 src/nnet3/nnet-example-utils.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc
index cc5fe3cc050..15004092eaa 100644
--- a/src/nnet3/nnet-example-utils.cc
+++ b/src/nnet3/nnet-example-utils.cc
@@ -214,8 +214,8 @@ void GetComputationRequest(const Nnet &nnet,
     const NnetIo &io = eg.io[i];
     const std::string &name = io.name;
     int32 node_index = nnet.GetNodeIndex(name);
-    if (node_index == -1 &&
-        !nnet.IsInputNode(node_index) && !nnet.IsOutputNode(node_index))
+    if (node_index == -1 ||
+        (!nnet.IsInputNode(node_index) && !nnet.IsOutputNode(node_index)))
       KALDI_ERR << "Nnet example has input or output named '" << name
                 << "', but no such input or output node is in the network.";
 

From a12ee7354488d4857252c7dfd9fbea3fd3fed10c Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Tue, 16 Apr 2019 12:29:18 -0700
Subject: [PATCH 070/163] Revert "[src] Update  CUDA code to avoid
 synchronization errors on compute capability 7.x (#3211)" (#3236)

This reverts commit 4cfbd217ad7e0324dba90eb31d9467dae2da4e17.
---
 src/cudamatrix/cu-kernels.cu | 138 +++++++++++++++++++----------------
 1 file changed, 75 insertions(+), 63 deletions(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 9ecf0d0a4de..bc5c32714ef 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -949,18 +949,14 @@ static void _copy_cols_from_vec(Real* m_out, MatrixDim d, const Real* v_in) {
 // _trace_mat_mat reduce the partial sum to
 // value[blockIdx.y * gridDim.x + blockIdx.x]
 // It use shared mem to transpose matrix B to ensure coalesced memory access
-// 2D block (8x32) is used
 template<int TileDim, typename Real>
 __global__
 static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA,
                            int B_stride, Real* value) {
   // Reuse shared mem and make indexing easier. "+1" to avoid bank conflict
-  const int kWarpSize = 32;
-  typedef cub::BlockReduce<Real, kWarpSize, cub::BLOCK_REDUCE_WARP_REDUCTIONS,
-      CU1DBLOCK / kWarpSize> BlockReduceT;
   __shared__ union {
     Real trans[TileDim][TileDim + 1];
-    typename BlockReduceT::TempStorage sum;
+    Real sum[CU1DBLOCK];
   } smem;
 
   // linear thread id;
@@ -1003,27 +999,39 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA,
     jb += grid_height;
   }
 
+  smem.sum[tid] = tsum;
+  __syncthreads();
+
   // Block reduce
-  tsum = BlockReduceT(smem.sum).Sum(tsum);
+# pragma unroll
+  for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
+    if (tid < shift)
+      smem.sum[tid] += smem.sum[tid + shift];
+    __syncthreads();
+  }
+
+  // Warp reduce. Implicitly synchronized within a warp.
+  if (tid < warpSize) {
+#   pragma unroll
+    for (int shift = warpSize; shift > 0; shift >>= 1) {
+      smem.sum[tid] += smem.sum[tid + shift];
+    }
+  }
 
   // output 1 sum per thread block
   if (tid == 0) {
-    value[blockIdx.y * gridDim.x + blockIdx.x] = tsum;
+    value[blockIdx.y * gridDim.x + blockIdx.x] = smem.sum[0];
   }
 
 }
 
 // _trace_mat_mat_trans reduce the partial sum to
 // value[blockIdx.y * gridDim.x + blockIdx.x]
-// 2D block (8x32) is used
 template<typename Real>
 __global__
 static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA,
                                  int B_stride, Real* value) {
-  const int kWarpSize = 32;
-  typedef cub::BlockReduce<Real, kWarpSize, cub::BLOCK_REDUCE_WARP_REDUCTIONS,
-      CU1DBLOCK / kWarpSize> BlockReduceT;
-  __shared__ typename BlockReduceT::TempStorage smem;
+  __shared__ Real ssum[CU1DBLOCK];
 
   // linear thread id;
   const int32_cuda tid = threadIdx.y * blockDim.x + threadIdx.x;
@@ -1039,13 +1047,28 @@ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA,
       i += grid_height;
     }
   }
-
+  ssum[tid] = tsum;
+  __syncthreads();
+  
   // Block reduce
-  tsum = BlockReduceT(smem).Sum(tsum);
+# pragma unroll
+  for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
+    if (tid < shift)
+      ssum[tid] += ssum[tid + shift];
+    __syncthreads();
+  }
+
+  // Warp reduce. Implicitly synchronized within a warp.
+  if (tid < warpSize) {
+#   pragma unroll
+    for (int shift = warpSize; shift > 0; shift >>= 1) {
+      ssum[tid] += ssum[tid + shift];
+    }
+  }
 
   // output 1 sum per thread block
   if (tid == 0) {
-    value[blockIdx.y * gridDim.x + blockIdx.x] = tsum;
+    value[blockIdx.y * gridDim.x + blockIdx.x] = ssum[0];
   }
 }
 
@@ -1056,8 +1079,7 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M,
                                   const MatrixDim dim_M, const Real* N,
                                   const int stride_N, const Real beta,
                                   Real* v) {
-  typedef cub::BlockReduce<Real, CU1DBLOCK> BlockReduceT;
-  __shared__ typename BlockReduceT::TempStorage smem;
+  __shared__ Real ssum[CU1DBLOCK];
   const int tid = threadIdx.x;
   const int i = blockIdx.x;
   const int m_start = i * dim_M.stride;
@@ -1068,13 +1090,28 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M,
   for (int j = tid; j < dim_M.cols; j += CU1DBLOCK) {
     tsum += M[m_start + j] * N[n_start + j];
   }
+  ssum[tid] = tsum;
+  __syncthreads();
 
-  // Block reduce
-  tsum = BlockReduceT(smem).Sum(tsum);
+  // Tree reduce to 2x warpSize elements.
+# pragma unroll
+  for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
+    if (tid < shift)
+      ssum[tid] += ssum[tid + shift];
+    __syncthreads();
+  }
+
+  // Warp reduce to 1 element. Threads implicitly synchronized within a warp.
+  if (tid < warpSize) {
+#   pragma unroll
+    for (int shift = warpSize; shift > 0; shift >>= 1) {
+      ssum[tid] += ssum[tid + shift];
+    }
+  }
 
   // output 1 sum per thread block
   if (tid == 0) {
-    v[i] = alpha * tsum + beta * v[i];
+    v[i] = alpha * ssum[0] + beta * v[i];
   }
 }
 
@@ -1115,13 +1152,11 @@ static void _add_diag_mat_mat_MTN(const Real alpha, const Real* M,
   }
 
   // Warp reduce to 1 element per column.
+  // Threads implicitly synchronized within a warp.
   if (tid < warpSize) {
 #   pragma unroll
     for (int shift = warpSize; shift >= TileDim; shift >>= 1) {
-      Real buf = ssum[tid + shift];
-      __syncwarp();
-      ssum[tid] += buf;
-      __syncwarp();
+      ssum[tid] += ssum[tid + shift];
     }
   }
 
@@ -1200,13 +1235,11 @@ static void _add_diag_mat_mat_MN(const Real alpha, const Real* M,
   }
 
   // Warp reduce to 1 element per column.
+  // Threads implicitly synchronized within a warp.
   if (tid < warpSize) {
 #   pragma unroll
     for (int shift = warpSize; shift >= TileDim; shift >>= 1) {
-      Real buf = smem.sum[tid + shift];
-      __syncwarp();
-      smem.sum[tid] += buf;
-      __syncwarp();
+      smem.sum[tid] += smem.sum[tid + shift];
     }
   }
 
@@ -1655,14 +1688,10 @@ static void _vec_transform_reduce(
     __syncthreads();
   }
 
-  // Reduce last warp.
+  // Reduce last warp. Threads implicitly synchronized within a warp.
   if (tid < warpSize) {
-# pragma unroll
     for (int shift = warpSize; shift > 0; shift >>= 1) {
-      Real buf = op.Reduce(sdata[tid], sdata[tid + shift]);
-      __syncwarp();
-      sdata[tid] = buf;
-      __syncwarp();
+      sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
     }
   }
 
@@ -1698,15 +1727,10 @@ static void _transform_reduce_mat_cols(
     __syncthreads();
   }
 
-  // Reduce last warp.
+  // Reduce last warp. Threads implicitly synchronized within a warp.
   if (tid < warpSize) {
-# pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
-      Real buf = op.Reduce(sdata[tid], sdata[tid + shift]);
-      __syncwarp();
-      sdata[tid] = buf;
-      __syncwarp();
-    }
+    for (int shift = warpSize; shift > 0; shift >>= 1)
+      sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
   }
 
   // Output to vector result.
@@ -1760,15 +1784,13 @@ static void _group_transform_reduce(
     }
 
     // Warp-reduce to 1 element per group.
+    // Threads implicitly synchronized within the warp.
     const int warp_reduce_size =
         threads_per_group / 2 < warpSize ? threads_per_group / 2 : warpSize;
     if (threadIdx.x < warp_reduce_size) {
 #     pragma unroll
       for (int shift = warp_reduce_size; shift > 0; shift >>= 1) {
-        Real buf = op.Reduce(sreduction[tid], sreduction[tid + shift]);
-        __syncwarp();
-        sreduction[tid] = buf;
-        __syncwarp();
+        sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]);
       }
     }
 
@@ -2606,12 +2628,8 @@ static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv,
   if (tid < warpSize) {
 #   pragma unroll
     for (int shift = warpSize; shift > 0; shift >>= 1) {
-      Real buf_prod = sprod[tid + shift];
-      Real buf_norm = snorm[tid + shift];
-      __syncwarp();
-      sprod[tid] += buf_prod;
-      snorm[tid] += buf_norm;
-      __syncwarp();
+      sprod[tid] += sprod[tid + shift];
+      snorm[tid] += snorm[tid + shift];
     }
   }
 
@@ -2881,7 +2899,6 @@ static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id,
   sidx[tid] = tidx;
 
   // Parallel reduce
-  // Reduce to warpSize elements per row
 #pragma unroll
   for (int32_cuda num_working_threads = CU1DBLOCK / 2;
       num_working_threads >= warpSize; num_working_threads >>= 1) {
@@ -2893,21 +2910,16 @@ static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id,
       }
     }
   }
-
-  // Warp reduce to 1 element per row
+  // Warp reduce without __syncthreads()
+  // (note.: synchronizes implicitly within a warp at the multiprocessor)
   if (tid < warpSize / 2) {
 #pragma unroll
     for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0;
         num_working_threads >>= 1) {
-      Real max_tid = smax[tid];
-      Real max_tid_nwt = smax[tid + num_working_threads];
-      int32_cuda idx_tid_nwt = sidx[tid + num_working_threads];
-      __syncwarp();
-      if (max_tid_nwt > max_tid) {
-        smax[tid] = max_tid_nwt;
-        sidx[tid] = idx_tid_nwt;
+      if (smax[tid + num_working_threads] > smax[tid]) {
+        smax[tid] = smax[tid + num_working_threads];
+        sidx[tid] = sidx[tid + num_working_threads];
       }
-      __syncwarp();
     }
   }
 

From b2f9c5409d10db982b78da09f2b920be1f6c7f84 Mon Sep 17 00:00:00 2001
From: mcalahan <43411605+mcalahan@users.noreply.github.com>
Date: Wed, 17 Apr 2019 18:35:24 +0200
Subject: [PATCH 071/163] [build] .gitignore autogenerated /tools/python/
 (#3241)

When python2 binary/link doesn't exist, the extra/check_dependencies.sh
creates a symlink for python2 in tools/python/.

This location is not ignored in .gitignore so it causes git to assume
directory is dirty.
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 4cf0fa4efa9..5764bfe22c6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -148,3 +148,4 @@ GSYMS
 /tools/cub-1.8.0.zip
 /tools/cub-1.8.0/
 /tools/cub
+/tools/python/

From 364273997b72a4d92f7c52135a935d898bc23ac2 Mon Sep 17 00:00:00 2001
From: "kkm (aka Kirill Katsnelson)" <kkm@smartaction.ai>
Date: Wed, 17 Apr 2019 17:49:01 -0700
Subject: [PATCH 072/163] [scripts] Enhance argument checks in
 nnet3/align_lats.sh (#3243)

---
 egs/wsj/s5/steps/nnet3/align_lats.sh | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/egs/wsj/s5/steps/nnet3/align_lats.sh b/egs/wsj/s5/steps/nnet3/align_lats.sh
index e4ba7309435..201cc3552ba 100755
--- a/egs/wsj/s5/steps/nnet3/align_lats.sh
+++ b/egs/wsj/s5/steps/nnet3/align_lats.sh
@@ -92,12 +92,16 @@ if [ -f $srcdir/frame_subsampling_factor ]; then
   frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor)
   frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor"
   cp $srcdir/frame_subsampling_factor $dir
-  if [ "$frame_subsampling_factor" -gt 1 ] && \
-     [ "$scale_opts" == "--transition-scale=1.0 --self-loop-scale=0.1" ]; then
-    echo "$0: frame-subsampling-factor is not 1 (so likely a chain system),"
-    echo "...  but the scale opts are the defaults.  You probably want"
-    echo "--scale-opts '--transition-scale=1.0 --self-loop-scale=1.0'"
-    sleep 1
+  if [[ $frame_subsampling_factor -gt 1 ]]; then
+    # Assume a chain system, check argument sanity.
+    if [[ ! ($scale_opts == *--self-loop-scale=1.0* &&
+             $scale_opts == *--transition-scale=1.0* &&
+             $acoustic_scale = '1.0') ]]; then
+      echo "$0: ERROR: frame-subsampling-factor is not 1, assuming a chain system."
+      echo "... You should pass the following options to this script:"
+      echo "  --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0'" \
+           "--acoustic_scale 1.0"
+    fi
   fi
 fi
 

From 507145f3e64a90a2c4b5ae95ca0b78022ed11363 Mon Sep 17 00:00:00 2001
From: Xingyu Na <asr.naxingyu@gmail.com>
Date: Thu, 18 Apr 2019 14:15:40 +0800
Subject: [PATCH 073/163] [egs] invoke 'python2.7' not 'python' when using
 mmseg (#3244)

---
 egs/hkust/s5/local/hkust_data_prep.sh | 4 ++--
 egs/hkust/s5/local/hkust_segment.py   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/egs/hkust/s5/local/hkust_data_prep.sh b/egs/hkust/s5/local/hkust_data_prep.sh
index 207f03af36b..c7d0e3be334 100755
--- a/egs/hkust/s5/local/hkust_data_prep.sh
+++ b/egs/hkust/s5/local/hkust_data_prep.sh
@@ -75,7 +75,7 @@ cat $train_dir/transcripts.txt |\
   sed -e 's/<noise>\(.\+\)<\/noise>/\1/g' |\
   sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\
   local/hkust_normalize.pl |\
-  python local/hkust_segment.py |\
+  local/hkust_segment.py |\
   awk '{if (NF > 1) print $0;}' > $train_dir/text || exit 1;
 
 cat $dev_dir/transcripts.txt |\
@@ -84,7 +84,7 @@ cat $dev_dir/transcripts.txt |\
   sed -e 's/<noise>\(.\+\)<\/noise>/\1/g' |\
   sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\
   local/hkust_normalize.pl |\
-  python local/hkust_segment.py |\
+  local/hkust_segment.py |\
   awk '{if (NF > 1) print $0;}' > $dev_dir/text || exit 1;
 
 # some data is corrupted. Delete them
diff --git a/egs/hkust/s5/local/hkust_segment.py b/egs/hkust/s5/local/hkust_segment.py
index 92d3add0e3e..d4c2b35a668 100755
--- a/egs/hkust/s5/local/hkust_segment.py
+++ b/egs/hkust/s5/local/hkust_segment.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python2.7
 #coding:utf-8
 
 from __future__ import print_function

From db8ed5b80fdf1ed1394b30f9d4b3ce3393c2abf3 Mon Sep 17 00:00:00 2001
From: Karel Vesely <vesis84@gmail.com>
Date: Thu, 18 Apr 2019 22:08:58 +0200
Subject: [PATCH 074/163] [scripts] Make getting nnet3 model context more
 robust  (#3247)

- in 'check_model_contexts()' when parsing "left-context,right-context",
  don't apply 'nnet3-copy --edits=' on 'init.raw' !!!
   - the 'init.raw' is used as feature transform prior to LDA estimation,
     and here we don't need to rename the output layers.
   - renaming the output layers is relevant to multilingual networks,
     where we can have output layers 'output-0 output-1 output-2',
     layer renaming 'output-0 -> output' is used to get the contexts.
     But we cannot use the same rename on 'init.raw', which does not
     have a component with name 'output-0'.

- We also avoid calling pipe with 'nnet3-info {} | head -n 4'
   - if nnet3-info fails, the pipeline still returns zero status
     and no Python stack-trace was displayed.
---
 egs/wsj/s5/steps/nnet3/xconfig_to_configs.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py
index f025eb5b343..4fb7ec63afd 100755
--- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py
+++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py
@@ -243,15 +243,15 @@ def add_nnet_context_info(config_dir, nnet_edits=None,
     if nnet_edits is not None:
         model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits,
                                                           model)
-    out = common_lib.get_command_stdout('nnet3-info "{0}" | head -n 4 '
-                                        .format(model))
+    out = common_lib.get_command_stdout('nnet3-info "{0}"'.format(model))
     # out looks like this
     # left-context: 7
     # right-context: 0
     # num-parameters: 90543902
     # modulus: 1
+    # ...
     info = {}
-    for line in out.split("\n"):
+    for line in out.split("\n")[:4]:  # only the first 4 lines are parsed
         parts = line.split(":")
         if len(parts) != 2:
             continue
@@ -277,17 +277,17 @@ def check_model_contexts(config_dir, nnet_edits=None, existing_model=None):
                                                  None else '',
                                                  config_dir, file_name))
             model = "{0}/{1}.raw".format(config_dir, file_name)
-            if nnet_edits is not None:
+            if nnet_edits is not None and file_name != 'init':
                 model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits,
                                                                   model)
-            out = common_lib.get_command_stdout('nnet3-info "{0}" | head -n 4 '
-                                                .format(model))
+            out = common_lib.get_command_stdout('nnet3-info "{0}"'.format(model))
             # out looks like this
             # left-context: 7
             # right-context: 0
             # num-parameters: 90543902
             # modulus: 1
-            for line in out.split("\n"):
+            # ...
+            for line in out.split("\n")[:4]:  # only the first 4 lines are parsed
                 parts = line.split(":")
                 if len(parts) != 2:
                     continue

From f8de5a81f47d1ee69321c71afcd2c6ca535049ed Mon Sep 17 00:00:00 2001
From: Bayberry Z <zhtclz@hotmail.com>
Date: Thu, 18 Apr 2019 22:07:04 -0500
Subject: [PATCH 075/163] [egs] Fix hkust_data_prep.sh w.r.t. iconv mac
 compatibility issue (#3250)

---
 egs/hkust/s5/local/hkust_data_prep.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/egs/hkust/s5/local/hkust_data_prep.sh b/egs/hkust/s5/local/hkust_data_prep.sh
index c7d0e3be334..af37a156bb3 100755
--- a/egs/hkust/s5/local/hkust_data_prep.sh
+++ b/egs/hkust/s5/local/hkust_data_prep.sh
@@ -35,7 +35,7 @@ n=`cat $train_dir/sph.flist $dev_dir/sph.flist | wc -l`
 
 #collect all trans, convert encodings to utf-8,
 find $hkust_text_dir -iname "*.txt" | grep -i "trans/train" | xargs cat |\
-  iconv -f GBK -t utf-8 - | perl -e '
+  iconv -f GBK -t UTF-8 | perl -e '
     while (<STDIN>) {
       @A = split(" ", $_);
       if (@A <= 1) { next; }
@@ -50,7 +50,7 @@ find $hkust_text_dir -iname "*.txt" | grep -i "trans/train" | xargs cat |\
   ' | sort -k1 > $train_dir/transcripts.txt || exit 1;
 
 find $hkust_text_dir -iname "*.txt" | grep -i "trans/dev" | xargs cat |\
-  iconv -f GBK -t utf-8 - | perl -e '
+  iconv -f GBK -t UTF-8 | perl -e '
     while (<STDIN>) {
       @A = split(" ", $_);
       if (@A <= 1) { next; }

From 68ad4e92299a758942d05d547b41fc26da029437 Mon Sep 17 00:00:00 2001
From: Hosung Park <indra622@gmail.com>
Date: Fri, 19 Apr 2019 12:14:37 +0900
Subject: [PATCH 076/163] [egs] Update RM chain recipe with more recent
 configuration (#3237)

---
 egs/rm/s5/RESULTS                             |   5 +-
 egs/rm/s5/local/chain/run_tdnn.sh             |   1 +
 .../local/chain/{ => tuning}/run_tdnn_5g.sh   |   0
 .../local/chain/{ => tuning}/run_tdnn_5n.sh   |   0
 egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh   | 179 ++++++++++++++++++
 5 files changed, 184 insertions(+), 1 deletion(-)
 create mode 120000 egs/rm/s5/local/chain/run_tdnn.sh
 rename egs/rm/s5/local/chain/{ => tuning}/run_tdnn_5g.sh (100%)
 rename egs/rm/s5/local/chain/{ => tuning}/run_tdnn_5n.sh (100%)
 create mode 100755 egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh

diff --git a/egs/rm/s5/RESULTS b/egs/rm/s5/RESULTS
index a8156e10e14..f27cc0669b4 100644
--- a/egs/rm/s5/RESULTS
+++ b/egs/rm/s5/RESULTS
@@ -233,10 +233,13 @@ for x in exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_*; do grep WER $x/
 # current best chain result with TDNN (check local/chain/run_tdnn_5g.sh)
 %WER 2.86 [ 358 / 12533, 46 ins, 61 del, 251 sub ] exp/chain/tdnn_5g/decode/wer_5_0.0
 %WER 2.71 [ 340 / 12533, 58 ins, 59 del, 223 sub ] exp/chain/tdnn_5n/decode/wer_4_0.0
+# tdnn_5o is a modified version of run_tdnn_5n.sh (see local/chain/tuning/run_tdnn_5o.sh);
+# it uses the xconfig-based config convention adopted for chain models after Kaldi 5.2.
+%WER 1.52 [ 191 / 12533, 14 ins, 39 del, 138 sub ] exp/chain/tdnn_5o/decode/wer_4_0.5
 
 ### WSJ->RM Transfer learning using chain model ###
 %WER 1.68 [ 210 / 12533, 25 ins, 33 del, 152 sub ] exp/chain/tdnn_wsj_rm_1a/decode/wer_2_0.0
-
+ 
 ### nnet1 results ###
 
 # dnn4b, MFCC,LDA,fMLLR feaures, (Karel - 30.7.2015)
diff --git a/egs/rm/s5/local/chain/run_tdnn.sh b/egs/rm/s5/local/chain/run_tdnn.sh
new file mode 120000
index 00000000000..317ebb106b9
--- /dev/null
+++ b/egs/rm/s5/local/chain/run_tdnn.sh
@@ -0,0 +1 @@
+tuning/run_tdnn_5o.sh
\ No newline at end of file
diff --git a/egs/rm/s5/local/chain/run_tdnn_5g.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_5g.sh
similarity index 100%
rename from egs/rm/s5/local/chain/run_tdnn_5g.sh
rename to egs/rm/s5/local/chain/tuning/run_tdnn_5g.sh
diff --git a/egs/rm/s5/local/chain/run_tdnn_5n.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_5n.sh
similarity index 100%
rename from egs/rm/s5/local/chain/run_tdnn_5n.sh
rename to egs/rm/s5/local/chain/tuning/run_tdnn_5n.sh
diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh
new file mode 100755
index 00000000000..15d54ba7431
--- /dev/null
+++ b/egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh
@@ -0,0 +1,179 @@
+#!/bin/bash
+
+# This script is a modified version of run_tdnn_5n.sh. It uses the
+# xconfig-based config convention adopted for chain models after Kaldi 5.2.
+
+
+
+set -e
+
+# configs for 'chain'
+stage=0
+train_stage=-10
+get_egs_stage=-10
+xent_regularize=0.1
+dir=exp/chain/tdnn_5o
+
+# training options
+num_epochs=12
+initial_effective_lrate=0.005
+final_effective_lrate=0.0005
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=2
+num_jobs_final=4
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+#common_egs_dir=exp/chain/tdnn_5g/egs/
+common_egs_dir=
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet2 setup, and you can skip them by setting "--stage 4" if you have already
+# run those things.
+
+ali_dir=exp/tri3b_ali
+treedir=exp/chain/tri4_5o_tree
+lang=data/lang_chain_5o
+
+local/online/run_nnet2_common.sh --stage $stage || exit 1;
+
+if [ $stage -le 4 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat exp/tri3b_ali/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/train \
+    data/lang exp/tri3b exp/tri3b_lats
+  rm exp/tri3b_lats/fsts.*.gz # save space
+fi
+
+if [ $stage -le 5 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 6 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+    --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 7 ]; then
+  mkdir -p $dir
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python)
+  tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true"
+  tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66"
+  linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0"
+  prefinal_opts="l2-regularize=0.01"
+  output_opts="l2-regularize=0.005"
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=50 name=ivector
+  input dim=13 name=input
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=1024
+  tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1
+  tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1
+  tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1
+  tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0
+  tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  linear-component name=prefinal-l dim=192 $linear_opts
+  prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192
+  output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
+  prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 8 ]; then  # stage 8: train the chain model; xent weight follows $xent_regularize as in stage 7
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd "$decode_cmd" \
+    --feat.online-ivector-dir exp/nnet2_online/ivectors \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.00005 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=200" \
+    --egs.dir "$common_egs_dir" \
+    --egs.opts "--frames-overlap-per-eg 0" \
+    --egs.chunk-width $frames_per_eg \
+    --trainer.num-chunk-per-minibatch $minibatch_size \
+    --trainer.frames-per-iter 1000000 \
+    --trainer.num-epochs $num_epochs \
+    --trainer.optimization.num-jobs-initial $num_jobs_initial \
+    --trainer.optimization.num-jobs-final $num_jobs_final \
+    --trainer.optimization.initial-effective-lrate $initial_effective_lrate \
+    --trainer.optimization.final-effective-lrate $final_effective_lrate \
+    --trainer.max-param-change $max_param_change \
+    --cleanup.remove-egs $remove_egs \
+    --feat-dir data/train \
+    --tree-dir $treedir \
+    --lat-dir exp/tri3b_lats \
+    --dir $dir
+fi
+
+if [ $stage -le 9 ]; then
+  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \
+    data/test exp/nnet2_online/extractor exp/nnet2_online/ivectors_test || exit 1;
+fi
+
+if [ $stage -le 10 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+    --scoring-opts "--min-lmwt 1" \
+    --nj 20 --cmd "$decode_cmd" \
+    --online-ivector-dir exp/nnet2_online/ivectors_test \
+    $dir/graph data/test $dir/decode || exit 1;
+fi
+
+if [ $stage -le 11 ]; then
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug
+  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+    --nj 20 --cmd "$decode_cmd" \
+    --online-ivector-dir exp/nnet2_online/ivectors_test \
+    $dir/graph_ug data/test $dir/decode_ug || exit 1;
+fi
+wait;
+exit 0;

From 4831a66fcc2c542f99ff85c40394d3e76b068ad3 Mon Sep 17 00:00:00 2001
From: Soonshin Seo <ssseo@sogang.ac.kr>
Date: Sat, 20 Apr 2019 04:38:37 +0900
Subject: [PATCH 077/163] [egs] Make voxceleb recipe work with latest version
 of the dataset  (#3249)

---
 egs/voxceleb/v1/local/make_voxceleb1_v2.pl | 123 +++++++++++++++++++++
 egs/voxceleb/v1/run.sh                     |  11 +-
 egs/voxceleb/v2/run.sh                     |  11 +-
 3 files changed, 137 insertions(+), 8 deletions(-)
 create mode 100755 egs/voxceleb/v1/local/make_voxceleb1_v2.pl

diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2.pl b/egs/voxceleb/v1/local/make_voxceleb1_v2.pl
new file mode 100755
index 00000000000..905b43d31a6
--- /dev/null
+++ b/egs/voxceleb/v1/local/make_voxceleb1_v2.pl
@@ -0,0 +1,123 @@
+#!/usr/bin/perl
+#
+# Copyright 2018  Ewald Enzinger
+#           2018  David Snyder
+#           2019  Soonshin Seo
+#
+# Usage: make_voxceleb1_v2.pl /export/voxceleb1 dev data/dev
+#
+# The VoxCeleb1 corpus underwent several updates that changed the directory and speaker ID format.
+# The script 'make_voxceleb1.pl' works for the oldest version of the corpus. 
+# This script should be used if you've downloaded the corpus recently.
+
+if (@ARGV != 3) {
+  print STDERR "Usage: $0 <path-to-voxceleb1> <dataset> <path-to-data-dir>\n";
+  print STDERR "e.g. $0 /export/voxceleb1 dev data/dev\n";
+  exit(1);
+}
+
+($data_base, $dataset, $out_dir) = @ARGV;
+
+if ("$dataset" ne "dev" && "$dataset" ne "test") {
+  die "dataset parameter must be 'dev' or 'test'!";
+}
+
+if (system("mkdir -p $out_dir") != 0) {
+  die "Error making directory $out_dir";
+}
+
+opendir my $dh, "$data_base/$dataset/wav" or die "Cannot open directory: $!";
+my @spkr_dirs = grep {-d "$data_base/$dataset/wav/$_" && ! /^\.{1,2}$/} readdir($dh);
+closedir $dh;
+
+if ($dataset eq "dev"){
+  open(SPKR_TRAIN, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk";
+  open(WAV_TRAIN, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp";
+
+  foreach (@spkr_dirs) {
+    my $spkr_id = $_;
+    opendir my $dh, "$data_base/$dataset/wav/$spkr_id/" or die "Cannot open directory: $!";
+    my @rec_dirs = grep {-d "$data_base/$dataset/wav/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh);
+    closedir $dh;
+    foreach (@rec_dirs) {
+	  my $rec_id = $_;
+	  opendir my $dh, "$data_base/$dataset/wav/$spkr_id/$rec_id/" or die "Cannot open directory: $!";
+	  my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh);
+	  closedir $dh;
+  	  foreach (@files) {
+        my $name = $_;
+        my $wav = "$data_base/$dataset/wav/$spkr_id/$rec_id/$name.wav";
+        my $utt_id = "$spkr_id-$rec_id-$name";
+        print WAV_TRAIN "$utt_id", " $wav", "\n";
+        print SPKR_TRAIN "$utt_id", " $spkr_id", "\n";
+      }
+    }
+  }
+  close(SPKR_TRAIN) or die;
+  close(WAV_TRAIN) or die;
+}
+
+if ($dataset eq "test"){  # test set: write trials, utt2spk and wav.scp into $out_dir
+  if (! -s "$data_base/voxceleb1_test_v2.txt") {  # -s: also re-fetch an empty leftover file
+    system("wget -O $data_base/voxceleb1_test_v2.txt http://www.openslr.org/resources/49/voxceleb1_test_v2.txt") == 0 or die "Error downloading $data_base/voxceleb1_test_v2.txt";
+  }
+
+  open(TRIAL_IN, "<", "$data_base/voxceleb1_test_v2.txt") or die "could not open the verification trials file $data_base/voxceleb1_test_v2.txt";
+  open(TRIAL_OUT, ">", "$out_dir/trials") or die "Could not open the output file $out_dir/trials";
+  open(SPKR_TEST, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk";
+  open(WAV_TEST, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp";
+
+  # Rewrite each trial "<label> <path1> <path2>" as "<utt-id1> <utt-id2> target|nontarget".
+  while (<TRIAL_IN>) {
+    chomp;
+    my ($tar_or_non, $path1, $path2) = split;
+    # Create entry for left-hand side of trial
+    my ($spkr_id, $rec_id, $name) = split('/', $path1);
+    $name =~ s/\.wav$//;  # utterance ids in utt2spk/wav.scp carry no file extension
+    my $utt_id1 = "$spkr_id-$rec_id-$name";
+
+    # Create entry for right-hand side of trial
+    ($spkr_id, $rec_id, $name) = split('/', $path2);
+    $name =~ s/\.wav$//;  # same normalization as for the left-hand side
+    my $utt_id2 = "$spkr_id-$rec_id-$name";
+
+    my $target = "nontarget";
+    if ($tar_or_non eq "1") {
+      $target = "target";
+    }
+    print TRIAL_OUT "$utt_id1 $utt_id2 $target\n";
+  }
+
+  foreach (@spkr_dirs) {  # layout walked here: <data_base>/<dataset>/wav/<spkr>/<rec>/<name>.wav
+    my $spkr_id = $_;
+    opendir my $dh, "$data_base/$dataset/wav/$spkr_id/" or die "Cannot open directory: $!";
+    my @rec_dirs = grep {-d "$data_base/$dataset/wav/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh);
+    closedir $dh;
+    foreach (@rec_dirs) {
+      my $rec_id = $_;
+      opendir my $dh, "$data_base/$dataset/wav/$spkr_id/$rec_id/" or die "Cannot open directory: $!";
+      my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh);  # wav basenames, extension removed
+      closedir $dh;
+      foreach (@files) {
+        my $name = $_;
+        my $wav = "$data_base/$dataset/wav/$spkr_id/$rec_id/$name.wav";
+        my $utt_id = "$spkr_id-$rec_id-$name";
+        print WAV_TEST "$utt_id", " $wav", "\n";
+        print SPKR_TEST "$utt_id", " $spkr_id", "\n";
+      }
+    }
+  }
+  close(SPKR_TEST) or die;
+  close(WAV_TEST) or die;
+  close(TRIAL_OUT) or die;
+  close(TRIAL_IN) or die;
+}
+
+if (system(
+  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
+  die "Error creating spk2utt file in directory $out_dir";
+}
+system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir");
+if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
+  die "Error validating directory $out_dir";
+}
diff --git a/egs/voxceleb/v1/run.sh b/egs/voxceleb/v1/run.sh
index 8af2226423d..500c05c5db6 100755
--- a/egs/voxceleb/v1/run.sh
+++ b/egs/voxceleb/v1/run.sh
@@ -14,7 +14,7 @@ set -e
 mfccdir=`pwd`/mfcc
 vaddir=`pwd`/mfcc
 
-# The trials file is downloaded by local/make_voxceleb1.pl.
+# The trials file is downloaded by local/make_voxceleb1_v2.pl.
 voxceleb1_trials=data/voxceleb1_test/trials
 voxceleb1_root=/export/corpora/VoxCeleb1
 voxceleb2_root=/export/corpora/VoxCeleb2
@@ -24,11 +24,14 @@ stage=0
 if [ $stage -le 0 ]; then
   local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train
   local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test
-  # This script reates data/voxceleb1_test and data/voxceleb1_train.
+  # This script creates data/voxceleb1_test and data/voxceleb1_train for latest version of VoxCeleb1.
   # Our evaluation set is the test portion of VoxCeleb1.
-  local/make_voxceleb1.pl $voxceleb1_root data
+  local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train
+  local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test
+  # if you downloaded the dataset soon after it was released, you will want to use the make_voxceleb1.pl script instead.
+  # local/make_voxceleb1.pl $voxceleb1_root data
   # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1.
-  # This should give 7,351 speakers and 1,277,503 utterances.
+  # This should give 7,323 speakers and 1,276,888 utterances.
   utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train
 fi
 
diff --git a/egs/voxceleb/v2/run.sh b/egs/voxceleb/v2/run.sh
index 37bb60fe35c..44340873a80 100755
--- a/egs/voxceleb/v2/run.sh
+++ b/egs/voxceleb/v2/run.sh
@@ -15,7 +15,7 @@ mfccdir=`pwd`/mfcc
 vaddir=`pwd`/mfcc
 
 
-# The trials file is downloaded by local/make_voxceleb1.pl.
+# The trials file is downloaded by local/make_voxceleb1_v2.pl.
 voxceleb1_trials=data/voxceleb1_test/trials
 voxceleb1_root=/export/corpora/VoxCeleb1
 voxceleb2_root=/export/corpora/VoxCeleb2
@@ -27,11 +27,14 @@ stage=0
 if [ $stage -le 0 ]; then
   local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train
   local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test
-  # This script creates data/voxceleb1_test and data/voxceleb1_train.
+  # This script creates data/voxceleb1_test and data/voxceleb1_train for latest version of VoxCeleb1.
   # Our evaluation set is the test portion of VoxCeleb1.
-  local/make_voxceleb1.pl $voxceleb1_root data
+  local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train
+  local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test 
+  # if you downloaded the dataset soon after it was released, you will want to use the make_voxceleb1.pl script instead.
+  # local/make_voxceleb1.pl $voxceleb1_root data
   # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1.
-  # This should give 7,351 speakers and 1,277,503 utterances.
+  # This should give 7,323 speakers and 1,276,888 utterances.
   utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train
 fi
 

From 0534e4946e5d9144748ac8865833ee13b6a898ca Mon Sep 17 00:00:00 2001
From: Hosung Park <indra622@gmail.com>
Date: Sun, 21 Apr 2019 13:11:54 +0900
Subject: [PATCH 078/163] [egs] Improve chain example script for Resource
 Management (RM) (#3252)

---
 egs/rm/s5/RESULTS                           |  7 ++--
 egs/rm/s5/conf/mfcc_hires.conf              |  8 ++++
 egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh | 46 ++++++++++++---------
 egs/rm/s5/local/online/run_nnet2_common.sh  |  8 ++--
 egs/rm/s5/run.sh                            |  2 +-
 5 files changed, 43 insertions(+), 28 deletions(-)
 create mode 100644 egs/rm/s5/conf/mfcc_hires.conf

diff --git a/egs/rm/s5/RESULTS b/egs/rm/s5/RESULTS
index f27cc0669b4..2f1262510fb 100644
--- a/egs/rm/s5/RESULTS
+++ b/egs/rm/s5/RESULTS
@@ -233,10 +233,9 @@ for x in exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_*; do grep WER $x/
 # current best chain result with TDNN (check local/chain/run_tdnn_5g.sh)
 %WER 2.86 [ 358 / 12533, 46 ins, 61 del, 251 sub ] exp/chain/tdnn_5g/decode/wer_5_0.0
 %WER 2.71 [ 340 / 12533, 58 ins, 59 del, 223 sub ] exp/chain/tdnn_5n/decode/wer_4_0.0
-# This is a modified version of run_tdnn_5n.sh. It uses
-# a new configs convention for chain model after kaldi 5.2.
-%WER 1.52 [ 191 / 12533, 14 ins, 39 del, 138 sub ] exp/chain/tdnn_5o/decode/wer_4_0.5
-
+# The topology of this chain model is taken from the mini_librispeech recipe.
+# It uses a new configs convention for chain model after kaldi 5.2.
+%WER 1.32 [ 166 / 12533, 19 ins, 31 del, 116 sub ] exp/chain/tdnn_5o/decode/wer_4_0.0
 ### WSJ->RM Transfer learning using chain model ###
 %WER 1.68 [ 210 / 12533, 25 ins, 33 del, 152 sub ] exp/chain/tdnn_wsj_rm_1a/decode/wer_2_0.0
  
diff --git a/egs/rm/s5/conf/mfcc_hires.conf b/egs/rm/s5/conf/mfcc_hires.conf
new file mode 100644
index 00000000000..19f218f982e
--- /dev/null
+++ b/egs/rm/s5/conf/mfcc_hires.conf
@@ -0,0 +1,8 @@
+# config for high-resolution MFCC features, intended for neural network training
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why 
+# we prefer this method.
+--use-energy=false   # use average of log energy, not energy.
+--low-freq=20     # low cutoff frequency for mel bins... this is high-bandwidth data, so
+                  # there might be some information at the low end.
+--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600)
diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh
index 15d54ba7431..db5944fdbea 100755
--- a/egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh
+++ b/egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh
@@ -15,7 +15,7 @@ xent_regularize=0.1
 dir=exp/chain/tdnn_5o
 
 # training options
-num_epochs=12
+num_epochs=13
 initial_effective_lrate=0.005
 final_effective_lrate=0.0005
 max_param_change=2.0
@@ -96,28 +96,34 @@ if [ $stage -le 7 ]; then
   cat <<EOF > $dir/configs/network.xconfig
   input dim=50 name=ivector
   input dim=13 name=input
+
   # please note that it is important to have input layer with the name=input
   # as the layer immediately preceding the fixed-affine-layer to enable
   # the use of short notation for the descriptor
   fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
   # the first splicing is moved before the lda layer, so no splicing here
-  relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=1024
-  tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1
-  tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1
-  tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1
-  tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0
-  tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
-  tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
-  tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
-  tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
-  tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
-  tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
-  tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
-  tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768
+  tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1
+  tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1
+  tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1
+  tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0
+  tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
   linear-component name=prefinal-l dim=192 $linear_opts
-  prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192
+
+  ## adding the layers for chain branch
+  prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768
   output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
-  prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192
+
+  # adding the layers for xent branch
+  prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768
   output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
 EOF
   steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
@@ -145,7 +151,7 @@ if [ $stage -le 8 ]; then
     --trainer.optimization.final-effective-lrate $final_effective_lrate \
     --trainer.max-param-change $max_param_change \
     --cleanup.remove-egs $remove_egs \
-    --feat-dir data/train \
+    --feat-dir data/train_hires \
     --tree-dir $treedir \
     --lat-dir exp/tri3b_lats \
     --dir $dir
@@ -153,7 +159,7 @@ fi
 
 if [ $stage -le 9 ]; then
   steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \
-    data/test exp/nnet2_online/extractor exp/nnet2_online/ivectors_test || exit 1;
+    data/test_hires exp/nnet2_online/extractor exp/nnet2_online/ivectors_test || exit 1;
 fi
 
 if [ $stage -le 10 ]; then
@@ -165,7 +171,7 @@ if [ $stage -le 10 ]; then
     --scoring-opts "--min-lmwt 1" \
     --nj 20 --cmd "$decode_cmd" \
     --online-ivector-dir exp/nnet2_online/ivectors_test \
-    $dir/graph data/test $dir/decode || exit 1;
+    $dir/graph data/test_hires $dir/decode || exit 1;
 fi
 
 if [ $stage -le 11 ]; then
@@ -173,7 +179,7 @@ if [ $stage -le 11 ]; then
   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
     --nj 20 --cmd "$decode_cmd" \
     --online-ivector-dir exp/nnet2_online/ivectors_test \
-    $dir/graph_ug data/test $dir/decode_ug || exit 1;
+    $dir/graph_ug data/test_hires $dir/decode_ug || exit 1;
 fi
 wait;
 exit 0;
diff --git a/egs/rm/s5/local/online/run_nnet2_common.sh b/egs/rm/s5/local/online/run_nnet2_common.sh
index e0034ddd7d2..fb516375543 100755
--- a/egs/rm/s5/local/online/run_nnet2_common.sh
+++ b/egs/rm/s5/local/online/run_nnet2_common.sh
@@ -36,6 +36,7 @@ else
 fi
 
 train_set=train
+test_set=test
 if [ $stage -le 0 ]; then
   echo "$0: creating high-resolution MFCC features."
   mfccdir=data/${train_set}_hires/data
@@ -48,9 +49,10 @@ if [ $stage -le 0 ]; then
     steps/compute_cmvn_stats.sh data/${datadir}_hires
     utils/fix_data_dir.sh data/${datadir}_hires
   done
+  train_set=${train_set}_hires
+  test_set=${test_set}_hires
 fi
 
-train_set=${train_set}_hires
 if [ ! -f $extractor/final.ie ] && [ $ivector_dim -gt 0 ]; then
   if [ $stage -le 1 ]; then
     mkdir -p exp/nnet2${nnet_affix}
@@ -61,7 +63,7 @@ if [ ! -f $extractor/final.ie ] && [ $ivector_dim -gt 0 ]; then
   if [ $stage -le 2 ]; then
     # use a smaller iVector dim (50) than the default (100) because RM has a very
     # small amount of data.
-    steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 40 \
+    steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
       --ivector-dim $ivector_dim \
      data/${train_set} exp/nnet2${nnet_affix}/diag_ubm $extractor || exit 1;
   fi
@@ -76,5 +78,5 @@ if [ $stage -le 3 ] && [ $ivector_dim -gt 0 ]; then
     data/${train_set}_max2 $extractor exp/nnet2${nnet_affix}/ivectors || exit 1;
 
   steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 10 \
-    data/test_hires $extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1;
+    data/${test_set} $extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1;
 fi
diff --git a/egs/rm/s5/run.sh b/egs/rm/s5/run.sh
index 7d27b2c6d91..f167e590735 100755
--- a/egs/rm/s5/run.sh
+++ b/egs/rm/s5/run.sh
@@ -251,4 +251,4 @@ local/run_sgmm2.sh
 # local/nnet/run_cnn2d.sh
 
 # chain recipe
-# local/chain/run_tdnn_5f.sh
+# local/chain/run_tdnn.sh

From db2ed3276ec9372e54b2ca771f81e4bed30bc674 Mon Sep 17 00:00:00 2001
From: Justin Luitjens <luitjens@users.noreply.github.com>
Date: Mon, 22 Apr 2019 13:48:44 -0600
Subject: [PATCH 079/163] [src] GPU-related changes for speed and correctness
 on newer arch's. (#3221)

Changes:

cu-array-inl.h, cu-packed-matrix.cc:
   Remove unnecessary synchronization.  Synchronization will occur with
   stream semantics

cu-device.h, cu-device.cc, cuda-common.h, cuda_64bit.mk
   Add a handle for cusolverDN library.  Future changes will rely on
   this.

cu-kernels-ansi.h, cu-kernels.cu, cu-kernels.h:
   Add RowSumMat kernel support which mirrors ColSumMat but operates on rows.

cu-matrix.cc:
   make cudaMemset2D asynchronous.  Synchronization is handled via
   streams.

cu-value.h:
   Added -= operator which mirrors += operator

cu-vector.cc, cu-vector.h:
   Added ApplyLogSoftMax which matches CPU version.
   Remove stream synchronization on AddMatVec (handled by streams)
   Use direct kernel for row sum instead of a mat vec.  This is more
   efficient as it avoids extra allocation and memset.

cu-sparse-matrix-test.cc:
   adjusted epsilon to be more tolerant of order of operations floating
   point error.
---
 src/cudamatrix/cu-array-inl.h           |  1 -
 src/cudamatrix/cu-common.h              |  9 ++++
 src/cudamatrix/cu-device.h              |  7 ++++
 src/cudamatrix/cu-kernels-ansi.h        |  6 +++
 src/cudamatrix/cu-kernels.cu            | 55 +++++++++++++++++++++++++
 src/cudamatrix/cu-kernels.h             | 10 +++++
 src/cudamatrix/cu-matrix.cc             |  6 ++-
 src/cudamatrix/cu-packed-matrix.cc      |  1 -
 src/cudamatrix/cu-sparse-matrix-test.cc |  2 +-
 src/cudamatrix/cu-value.h               |  1 +
 src/cudamatrix/cu-vector.cc             | 39 +++++++++++++++---
 src/cudamatrix/cu-vector.h              |  1 +
 src/makefiles/cuda_64bit.mk             |  2 +-
 13 files changed, 129 insertions(+), 11 deletions(-)

diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h
index 567cc0f6d18..53de59fe4fc 100644
--- a/src/cudamatrix/cu-array-inl.h
+++ b/src/cudamatrix/cu-array-inl.h
@@ -221,7 +221,6 @@ void CuArrayBase<T>::SetZero() {
     CuTimer tim;
     CU_SAFE_CALL(cudaMemsetAsync(this->data_, 0, this->dim_ * sizeof(T),
           cudaStreamPerThread));
-    CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
     CuDevice::Instantiate().AccuProfile("CuArray::SetZero", tim);
   } else
 #endif
diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h
index 7446a76bf93..54eda86f572 100644
--- a/src/cudamatrix/cu-common.h
+++ b/src/cudamatrix/cu-common.h
@@ -51,6 +51,15 @@
   } \
 }
 
+#define CUSOLVER_SAFE_CALL(fun) \
+{ \
+  int32 ret; \
+  if ((ret = (fun)) != 0) { \
+    KALDI_ERR << "cusolverStatus_t " << ret << " : \"" << ret << "\" returned from '" << #fun << "'"; \
+  } \
+}
+
+
 #define CUSPARSE_SAFE_CALL(fun) \
 { \
   int32 ret; \
diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h
index 7cca69f754b..d94d1166c04 100644
--- a/src/cudamatrix/cu-device.h
+++ b/src/cudamatrix/cu-device.h
@@ -27,6 +27,7 @@
 #include <cublas_v2.h>
 #include <cusparse.h>
 #include <curand.h>
+#include <cusolverDn.h>
 #include <map>
 #include <string>
 #include <iostream>
@@ -83,6 +84,7 @@ class CuDevice {
   inline cublasHandle_t GetCublasHandle() { return cublas_handle_; }
   inline cusparseHandle_t GetCusparseHandle() { return cusparse_handle_; }
   inline curandGenerator_t GetCurandHandle() { return curand_handle_; }
+  inline cusolverDnHandle_t GetCusolverDnHandle() { return cusolverdn_handle_; }
 
   inline void SeedGpu() {
     if (CuDevice::Instantiate().Enabled()) {
@@ -304,6 +306,7 @@ class CuDevice {
   cublasHandle_t cublas_handle_;
   cusparseHandle_t cusparse_handle_;
   curandGenerator_t curand_handle_;
+  cusolverDnHandle_t cusolverdn_handle_;
 }; // class CuDevice
 
 
@@ -321,6 +324,10 @@ class CuTimer: public Timer {
 inline cublasHandle_t GetCublasHandle() { 
   return CuDevice::Instantiate().GetCublasHandle(); 
 }
+
+inline cusolverDnHandle_t GetCusolverDnHandle() { 
+  return CuDevice::Instantiate().GetCusolverDnHandle(); 
+}
 // A more convenient way to get the handle to use cuSPARSE APIs.
 inline cusparseHandle_t GetCusparseHandle() { 
   return CuDevice::Instantiate().GetCusparseHandle(); 
diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h
index a61bb601e8e..75ebcf79d74 100644
--- a/src/cudamatrix/cu-kernels-ansi.h
+++ b/src/cudamatrix/cu-kernels-ansi.h
@@ -39,6 +39,12 @@ typedef float   BaseFloat;
 #endif
 
 
+void cudaD_add_row_sum_mat(int Gr, int Bl, double* result, const double* mat,
+                           const MatrixDim d, const double alpha,
+                           const double beta);
+void cudaF_add_row_sum_mat(int Gr, int Bl, float* result, const float* mat,
+                           const MatrixDim d, const float alpha,
+                           const float beta);
 void cudaD_add_col_sum_mat(int Gr, int Bl, double* result, const double* mat,
                            const MatrixDim d, const double alpha,
                            const double beta);
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index bc5c32714ef..b89fc54b6ce 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -1700,6 +1700,48 @@ static void _vec_transform_reduce(
     result[blockIdx.x] = op.PostReduce(sdata[0], result[blockIdx.x]);
 }
 
+// Reduce a matrix 'mat' to a row vector 'result': block j reduces column j of 'mat' over all rows.
+template<EnumTransformReduce TransReduceType, typename Real>
+__global__
+static void _transform_reduce_mat_rows(
+    Real *result, const Real *mat, const MatrixDim d,
+    const TransReduceOp<TransReduceType, Real> op) {
+
+  __shared__ Real sdata[CU1DBLOCK];  // one partial result per thread of the block
+  const int tid = threadIdx.x;
+  const int j = blockIdx.x;  // column of 'mat' (and index into 'result') handled by this block
+
+  Real tdata = op.InitValue();
+  for (int i = tid; i < d.rows; i += CU1DBLOCK) {  // thread folds rows tid, tid + CU1DBLOCK, ...
+    // Note: the loads of mat are uncoalesced.  We could eliminate this
+    // with shared memory, but at the matrix sizes we are currently looking
+    // at it probably would not help much and would add a lot of complexity.
+    // Alternatively we could look at something like trov to help loads.
+    tdata = op.Reduce(tdata, op.Transform(mat[i * d.stride + j]));
+  }
+  sdata[tid] = tdata;
+  __syncthreads();
+
+  // Tree reduce in shared memory, halving the active range until only 2 * warpSize partials remain.
+# pragma unroll
+  for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
+    if (tid < shift)
+      sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
+    __syncthreads();
+  }
+
+  // Reduce last warp with no explicit barrier; relies on implicit warp-level sync (NOTE(review): confirm on Volta+).
+  if (tid < warpSize) {
+    for (int shift = warpSize; shift > 0; shift >>= 1)
+      sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
+  }
+
+  // Output to vector result; PostReduce also receives the previous value of result[j].
+  if (tid == 0) {
+    result[j] = op.PostReduce(sdata[0], result[j]);
+  }
+}
+
 // Reduce a matrix 'mat' to a column vector 'result'
 template<EnumTransformReduce TransReduceType, typename Real>
 __global__
@@ -3937,6 +3979,12 @@ void cudaF_sum_mat_cols(int Gr, int Bl, float* result, const float* mat,
   _transform_reduce_mat_cols<<<Gr,Bl>>>(result,mat,d,
       TransReduceOp<SUM,float>());
 }
+void cudaF_add_row_sum_mat(int Gr, int Bl, float* result, const float* mat,
+                           const MatrixDim d, const float alpha,
+                           const float beta) {
+  _transform_reduce_mat_rows<<<Gr, Bl>>>(result, mat, d,
+      TransReduceOp<SUMAB, float>(alpha, beta));
+}
 void cudaF_add_col_sum_mat(int Gr, int Bl, float* result, const float* mat,
                            const MatrixDim d, const float alpha,
                            const float beta) {
@@ -3944,6 +3992,7 @@ void cudaF_add_col_sum_mat(int Gr, int Bl, float* result, const float* mat,
       TransReduceOp<SUMAB, float>(alpha, beta));
 }
 
+
 void cudaF_replace_value(int Gr, int Bl, float *v, int dim, float orig,
                          float changed) {
   _replace_value<<<Gr,Bl>>>(v, dim, orig, changed);
@@ -4645,6 +4694,12 @@ void cudaD_sum_mat_cols(int Gr, int Bl, double* result, const double* mat,
   _transform_reduce_mat_cols<<<Gr,Bl>>>(result,mat,d,
       TransReduceOp<SUM,double>());
 }
+void cudaD_add_row_sum_mat(int Gr, int Bl, double* result, const double* mat,
+                           const MatrixDim d, const double alpha,
+                           const double beta) {
+  _transform_reduce_mat_rows<<<Gr, Bl>>>(result, mat, d,
+      TransReduceOp<SUMAB, double>(alpha, beta));
+}
 void cudaD_add_col_sum_mat(int Gr, int Bl, double* result, const double* mat,
                            const MatrixDim d, const double alpha,
                            const double beta) {
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index 6c24ce0dd58..f93c1e2b2e0 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -38,6 +38,16 @@
 
 namespace kaldi {
 
+inline void cuda_add_row_sum_mat(int Gr, int Bl, double* result,
+                                 const double* mat, const MatrixDim d,
+                                 const double alpha, const double beta) {
+  cudaD_add_row_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
+}
+inline void cuda_add_row_sum_mat(int Gr, int Bl, float* result,
+                                 const float* mat, const MatrixDim d,
+                                 const float alpha, const float beta) {
+  cudaF_add_row_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
+}
 inline void cuda_add_col_sum_mat(int Gr, int Bl, double* result,
                                  const double* mat, const MatrixDim d,
                                  const double alpha, const double beta) {
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index efe8dec7652..ae091370edd 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -482,8 +482,9 @@ void CuMatrixBase<Real>::SetZero() {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
-    CU_SAFE_CALL(cudaMemset2D(data_, stride_ * sizeof(Real), 0,
-                              num_cols_ * sizeof(Real), num_rows_ ));
+    CU_SAFE_CALL(cudaMemset2DAsync(data_, stride_ * sizeof(Real), 0,
+                              num_cols_ * sizeof(Real), num_rows_ , 
+                              cudaStreamPerThread));
     CuDevice::Instantiate().AccuProfile("CuMatrix::SetZero", tim);
   } else
 #endif
@@ -2094,6 +2095,7 @@ void CuMatrixBase<Real>::Cholesky(CuMatrixBase<Real> *inv_cholesky) {
   // (5)(d) zero L12 and M12.
   this_12.SetZero();
   inv_12.SetZero();
+
 }
 
 
diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc
index c331920c61f..756d580c7cf 100644
--- a/src/cudamatrix/cu-packed-matrix.cc
+++ b/src/cudamatrix/cu-packed-matrix.cc
@@ -252,7 +252,6 @@ void CuPackedMatrix<Real>::SetZero() {
 
     CU_SAFE_CALL(cudaMemsetAsync(reinterpret_cast<void*>(this->data_), 0, 
           num_bytes, cudaStreamPerThread));
-    CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
     CuDevice::Instantiate().AccuProfile("CuPackedMatrix::SetZero", tim);
   } else
   #endif
diff --git a/src/cudamatrix/cu-sparse-matrix-test.cc b/src/cudamatrix/cu-sparse-matrix-test.cc
index 38f78c7c5e5..aad34b5dd54 100644
--- a/src/cudamatrix/cu-sparse-matrix-test.cc
+++ b/src/cudamatrix/cu-sparse-matrix-test.cc
@@ -173,7 +173,7 @@ static void UnitTestCuSparseMatrixSum() {
 
     Real sum1 = cu_smat.Sum();
     Real sum2 = mat.Sum();
-    KALDI_ASSERT(fabs(sum1 - sum2) < 1.0e-05);
+    KALDI_ASSERT(fabs(sum1 - sum2) < 1.0e-04);
   }
 }
 
diff --git a/src/cudamatrix/cu-value.h b/src/cudamatrix/cu-value.h
index b5b65479e57..af8e19987ce 100644
--- a/src/cudamatrix/cu-value.h
+++ b/src/cudamatrix/cu-value.h
@@ -67,6 +67,7 @@ class CuValue {
   }
 
   inline Real operator += (Real r) { return (*this = r + Real(*this)); }
+  inline Real operator -= (Real r) { return (*this = Real(*this) - r); }
     
 
   inline operator Real () const { // assignment to Real
diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc
index 2e06cffad48..5f030e7ca03 100644
--- a/src/cudamatrix/cu-vector.cc
+++ b/src/cudamatrix/cu-vector.cc
@@ -478,6 +478,27 @@ void CuVectorBase<Real>::ApplyLog() {
   }
 }
 
+template<typename Real>
+void CuVectorBase<Real>::ApplyLogSoftMax() {  // In-place log-softmax over this vector.
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    if (dim_ == 0) return;  // nothing to do for an empty vector
+    CuTimer tim;
+    size_t dimBlock = CU1DBLOCK;
+    size_t dimGrid = 1;       // dimGrid is the number of rows; the vector is handled as a single row
+    ::MatrixDim dim = { 1, this->dim_, this->dim_};  // presumably {rows, cols, stride} -- TODO confirm
+    
+    cuda_log_softmax_reduce(dimGrid, dimBlock, data_, data_, dim, this->dim_);  // data_ is both output and input
+    CU_SAFE_CALL(cudaGetLastError());
+    CuDevice::Instantiate().AccuProfile(__func__, tim);
+  } else
+#endif
+  {
+    Vec().ApplyLogSoftMax();  // fallback when CUDA is not compiled in or not enabled
+  }
+}
+
+
 
 template<typename Real>
 void CuVectorBase<Real>::AddMatVec(const Real alpha,
@@ -1081,7 +1102,6 @@ void CuVectorBase<Real>::SetZero() {
     CuTimer tim;
     CU_SAFE_CALL(cudaMemsetAsync(data_, 0, dim_*sizeof(Real),
           cudaStreamPerThread));
-    CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
     CuDevice::Instantiate().AccuProfile("CuVector::SetZero", tim);
   } else
 #endif
@@ -1257,10 +1277,19 @@ void CuVectorBase<Real>::AddRowSumMat(Real alpha, const CuMatrixBase<Real> &mat,
   KALDI_ASSERT(mat.NumCols() == Dim());
   if (Dim() == 0)
     return;
-  CuVector<Real> ones(mat.NumRows());
-  ones.Set(1.0);
-  this->AddMatVec(alpha, mat, kTrans, ones, beta);
-
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    CuTimer tim;
+    cuda_add_row_sum_mat(mat.NumCols(), CU1DBLOCK, Data(), mat.Data(),
+                         mat.Dim(), alpha, beta);
+    CU_SAFE_CALL(cudaGetLastError());
+    
+    CuDevice::Instantiate().AccuProfile(__func__, tim);
+  } else 
+#endif
+  {
+    Vec().AddRowSumMat(alpha, mat.Mat(), beta);
+  }
 }
 
 template<typename Real>
diff --git a/src/cudamatrix/cu-vector.h b/src/cudamatrix/cu-vector.h
index 69ca2ae3125..d769b614f86 100644
--- a/src/cudamatrix/cu-vector.h
+++ b/src/cudamatrix/cu-vector.h
@@ -132,6 +132,7 @@ class CuVectorBase {
                     const CuArrayBase<int32> &elements);
 
   void ApplySoftMax();
+  void ApplyLogSoftMax();
   void ApplyExp();
   void ApplyLog();
   void ApplyFloor(Real floor_val, MatrixIndexT *floored_count = NULL);
diff --git a/src/makefiles/cuda_64bit.mk b/src/makefiles/cuda_64bit.mk
index eb8cf743ab3..e4ba1f147c9 100644
--- a/src/makefiles/cuda_64bit.mk
+++ b/src/makefiles/cuda_64bit.mk
@@ -14,4 +14,4 @@ CUDA_FLAGS = --machine 64 -DHAVE_CUDA \
              --verbose -Xcompiler "$(CXXFLAGS)"
 
 CUDA_LDFLAGS += -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64
-CUDA_LDLIBS += -lcublas -lcusparse -lcudart -lcurand -lnvToolsExt #LDLIBS : The .so libs are loaded later than static libs in implicit rule
+CUDA_LDLIBS += -lcublas -lcusparse -lcusolver -lcudart -lcurand -lnvToolsExt #LDLIBS : The .so libs are loaded later than static libs in implicit rule

From 479c7324d1ceb5cfaf356379a8e8991e5e70c6a5 Mon Sep 17 00:00:00 2001
From: jyhnnhyj <48015613+jyhnnhyj@users.noreply.github.com>
Date: Tue, 23 Apr 2019 03:00:48 +0200
Subject: [PATCH 080/163] [egs] Update voxceleb v1 preparation scripts (#3255)

---
 egs/dihard_2018/v1/local/make_voxceleb1_v2.pl | 123 ++++++++++++++++++
 egs/dihard_2018/v1/run.sh                     |  11 +-
 egs/dihard_2018/v2/local/make_voxceleb1_v2.pl |   1 +
 egs/dihard_2018/v2/run.sh                     |  11 +-
 4 files changed, 140 insertions(+), 6 deletions(-)
 create mode 100755 egs/dihard_2018/v1/local/make_voxceleb1_v2.pl
 create mode 120000 egs/dihard_2018/v2/local/make_voxceleb1_v2.pl

diff --git a/egs/dihard_2018/v1/local/make_voxceleb1_v2.pl b/egs/dihard_2018/v1/local/make_voxceleb1_v2.pl
new file mode 100755
index 00000000000..0bc13bea251
--- /dev/null
+++ b/egs/dihard_2018/v1/local/make_voxceleb1_v2.pl
@@ -0,0 +1,123 @@
+#!/usr/bin/perl
+#
+# Copyright 2018  Ewald Enzinger
+#           2018  David Snyder
+#           2019  Soonshin Seo
+#
+# Usage: make_voxceleb1_v2.pl /export/voxceleb1 dev data/dev
+#
+# The VoxCeleb1 corpus underwent several updates that changed the directory and speaker ID format.
+# The script 'make_voxceleb1.pl' works for the oldest version of the corpus. 
+# This script should be used if you've downloaded the corpus recently.
+
+if (@ARGV != 3) {
+  print STDERR "Usage: $0 <path-to-voxceleb1> <dataset> <path-to-data-dir>\n";
+  print STDERR "e.g. $0 /export/voxceleb1 dev data/dev\n";
+  exit(1);
+}
+
+($data_base, $dataset, $out_dir) = @ARGV;
+
+if ("$dataset" ne "dev" && "$dataset" ne "test") {
+  die "dataset parameter must be 'dev' or 'test'!";
+}
+
+if (system("mkdir -p $out_dir") != 0) {
+  die "Error making directory $out_dir";
+}
+print "$data_base/$dataset/wav\n";
+opendir my $dh, "$data_base/$dataset/wav" or die "Cannot open directory: $!";
+my @spkr_dirs = grep {-d "$data_base/$dataset/wav/$_" && ! /^\.{1,2}$/} readdir($dh);
+closedir $dh;
+
+if ($dataset eq "dev"){
+  open(SPKR_TRAIN, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk";
+  open(WAV_TRAIN, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp";
+
+  foreach (@spkr_dirs) {
+    my $spkr_id = $_;
+    opendir my $dh, "$data_base/$dataset/wav/$spkr_id/" or die "Cannot open directory: $!";
+    my @rec_dirs = grep {-d "$data_base/$dataset/wav/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh);
+    closedir $dh;
+    foreach (@rec_dirs) {
+	  my $rec_id = $_;
+	  opendir my $dh, "$data_base/$dataset/wav/$spkr_id/$rec_id/" or die "Cannot open directory: $!";
+	  my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh);
+	  closedir $dh;
+  	  foreach (@files) {
+        my $name = $_;
+        my $wav = "$data_base/$dataset/wav/$spkr_id/$rec_id/$name.wav";
+        my $utt_id = "$spkr_id-$rec_id-$name";
+        print WAV_TRAIN "$utt_id", " $wav", "\n";
+        print SPKR_TRAIN "$utt_id", " $spkr_id", "\n";
+      }
+    }
+  }
+  close(SPKR_TRAIN) or die;
+  close(WAV_TRAIN) or die;
+}
+
+if ($dataset eq "test"){
+  if (! -e "$data_base/voxceleb1_test_v2.txt") {
+    system("wget -O $data_base/voxceleb1_test_v2.txt http://www.openslr.org/resources/49/voxceleb1_test_v2.txt");
+  }
+
+  open(TRIAL_IN, "<", "$data_base/voxceleb1_test_v2.txt") or die "could not open the verification trials file $data_base/voxceleb1_test_v2.txt";
+  open(TRIAL_OUT, ">", "$out_dir/trials") or die "Could not open the output file $out_dir/trials";
+  open(SPKR_TEST, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk";
+  open(WAV_TEST, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp";
+
+  # Each trial line is read as: <label> <spkr>/<rec>/<file> <spkr>/<rec>/<file>
+  while (<TRIAL_IN>) {
+    chomp;
+    my ($tar_or_non, $path1, $path2) = split;
+    # Drop a trailing ".wav" (if any) so that the trial IDs match the
+    # extension-less utterance IDs written to wav.scp and utt2spk below.
+    s/\.wav$// for ($path1, $path2);
+
+    # Create entry for left-hand side of trial
+    my ($spkr_id, $rec_id, $name) = split('/', $path1);
+    my $utt_id1 = "$spkr_id-$rec_id-$name";
+
+    # Create entry for right-hand side of trial
+    ($spkr_id, $rec_id, $name) = split('/', $path2);
+    my $utt_id2 = "$spkr_id-$rec_id-$name";
+
+    # A label of "1" is written out as "target"; anything else as "nontarget".
+    my $target = ($tar_or_non eq "1") ? "target" : "nontarget";
+    print TRIAL_OUT "$utt_id1 $utt_id2 $target\n";
+  }
+
+  foreach (@spkr_dirs) {
+    my $spkr_id = $_;
+    opendir my $dh, "$data_base/$dataset/wav/$spkr_id/" or die "Cannot open directory: $!";
+    my @rec_dirs = grep {-d "$data_base/$dataset/wav/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh);
+    closedir $dh;
+    foreach (@rec_dirs) {
+      my $rec_id = $_;
+      opendir my $dh, "$data_base/$dataset/wav/$spkr_id/$rec_id/" or die "Cannot open directory: $!";
+      my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh);
+      closedir $dh;
+      foreach (@files) {
+        my $name = $_;
+        my $wav = "$data_base/$dataset/wav/$spkr_id/$rec_id/$name.wav";
+        my $utt_id = "$spkr_id-$rec_id-$name";
+        print WAV_TEST "$utt_id", " $wav", "\n";
+        print SPKR_TEST "$utt_id", " $spkr_id", "\n";
+      }
+    }
+  }
+  close(SPKR_TEST) or die;
+  close(WAV_TEST) or die;
+  close(TRIAL_OUT) or die;
+  close(TRIAL_IN) or die;
+}
+
+if (system(
+  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
+  die "Error creating spk2utt file in directory $out_dir";
+}
+system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir");
+if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
+  die "Error validating directory $out_dir";
+}
diff --git a/egs/dihard_2018/v1/run.sh b/egs/dihard_2018/v1/run.sh
index 44af9f48c3f..eb23ac500cd 100755
--- a/egs/dihard_2018/v1/run.sh
+++ b/egs/dihard_2018/v1/run.sh
@@ -28,9 +28,14 @@ stage=0
 if [ $stage -le 0 ]; then
   local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train
   local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test
-  # This script creates data/voxceleb1_test and data/voxceleb1_train.
-  # Our evaluation set is the test portion of VoxCeleb1.
-  local/make_voxceleb1.pl $voxceleb1_root data
+
+  # Now prepare the VoxCeleb1 train and test data.  If you downloaded the corpus soon
+  # after it was first released, you may need to use an older version of the script, which
+  # can be invoked as follows:
+  # local/make_voxceleb1.pl $voxceleb1_root data
+  local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train
+  local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test
+
   # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1.
   # This should give 7,351 speakers and 1,277,503 utterances.
   utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train
diff --git a/egs/dihard_2018/v2/local/make_voxceleb1_v2.pl b/egs/dihard_2018/v2/local/make_voxceleb1_v2.pl
new file mode 120000
index 00000000000..2e7a22eaadc
--- /dev/null
+++ b/egs/dihard_2018/v2/local/make_voxceleb1_v2.pl
@@ -0,0 +1 @@
+../../v1/local/make_voxceleb1_v2.pl
\ No newline at end of file
diff --git a/egs/dihard_2018/v2/run.sh b/egs/dihard_2018/v2/run.sh
index 0da1f330ea7..d330322a5e8 100755
--- a/egs/dihard_2018/v2/run.sh
+++ b/egs/dihard_2018/v2/run.sh
@@ -27,9 +27,14 @@ stage=0
 if [ $stage -le 0 ]; then
   local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train
   local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test
-  # This script creates data/voxceleb1_test and data/voxceleb1_train.
-  # Our evaluation set is the test portion of VoxCeleb1.
-  local/make_voxceleb1.pl $voxceleb1_root data
+
+  # Now prepare the VoxCeleb1 train and test data.  If you downloaded the corpus soon
+  # after it was first released, you may need to use an older version of the script, which
+  # can be invoked as follows:
+  # local/make_voxceleb1.pl $voxceleb1_root data
+  local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train
+  local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test
+
   # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1.
   # This should give 7,351 speakers and 1,277,503 utterances.
   utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train

From 8c197b40afcf6365e04e02e9a6b956c95b9fb0d0 Mon Sep 17 00:00:00 2001
From: "Nickolay V. Shmyrev" <nshmyrev@gmail.com>
Date: Tue, 23 Apr 2019 18:13:41 +0300
Subject: [PATCH 081/163] [build] Note default=MKL; cosmetic fix (#3257)

---
 src/configure | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/configure b/src/configure
index a5adbb65788..0daad1029d1 100755
--- a/src/configure
+++ b/src/configure
@@ -82,7 +82,7 @@ Configuration options:
   --static-fst          Build with static OpenFst libraries [default=no]
   --fst-root=DIR        OpenFst root directory [default=../tools/openfst/]
   --fst-version=STR     OpenFst version string
-  --mathlib=LIB         Math library [default=ATLAS]
+  --mathlib=LIB         Math library [default=MKL]
                         Supported libraries: ATLAS, MKL, CLAPACK, OPENBLAS.
   --static-math         Build with static math libraries [default=no]
   --threaded-math       Build with multi-threaded math libraries [default=no]

From 56dc8d97b94de1fa0037b636ee33fe8d1ed94e16 Mon Sep 17 00:00:00 2001
From: Bayberry Z <zhtclz@hotmail.com>
Date: Tue, 23 Apr 2019 10:21:33 -0500
Subject: [PATCH 082/163] [egs] Fix to hkust_data_prep.sh w.r.t. how mmseg is
 checked for (#3240)

---
 egs/hkust/s5/local/hkust_data_prep.sh | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/egs/hkust/s5/local/hkust_data_prep.sh b/egs/hkust/s5/local/hkust_data_prep.sh
index af37a156bb3..6342ccfe861 100755
--- a/egs/hkust/s5/local/hkust_data_prep.sh
+++ b/egs/hkust/s5/local/hkust_data_prep.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-
+ 
 . ./path.sh || exit 1;
 
 if [ $# != 2 ]; then
@@ -14,6 +14,11 @@ hkust_text_dir=$2
 train_dir=data/local/train
 dev_dir=data/local/dev
 
+# transcripts normalization and segmentation
+# needs external tools
+python2 -c "import mmseg" 2>/dev/null || {
+    echo "Python module mmseg is not found. To install it, run tools/extra/install_mmseg.sh"; exit 1; }
+    
 mkdir -p $train_dir
 mkdir -p $dev_dir
 
@@ -65,10 +70,6 @@ find $hkust_text_dir -iname "*.txt" | grep -i "trans/dev" | xargs cat |\
   ' | sort -k1  > $dev_dir/transcripts.txt || exit 1;
 
 #transcripts normalization and segmentation
-#(this needs external tools),
-python -c "import mmseg" 2>/dev/null || \
-  (echo "mmseg is not found. Checkout tools/extra/install_mmseg.sh" && exit 1;)
-
 cat $train_dir/transcripts.txt |\
   sed -e 's/<foreign language=\"[a-zA-Z]\+\">/ /g' |\
   sed -e 's/<\/foreign>/ /g' |\

From 16c9270eb110aad765f73d73c7a48de3966945f1 Mon Sep 17 00:00:00 2001
From: Karel Vesely <vesis84@gmail.com>
Date: Tue, 23 Apr 2019 17:25:00 +0200
Subject: [PATCH 083/163] [egs] In WSJ run_ivector_common.sh, expose i-vector
 #jobs config to run script (#3248)

* wsj: exposing i-vector #jobs config to the master script,

- there is the 3layer granularity: threads in a binary, mid-level processes
  (summing stats and saving them), top-level processes (-nj).
- in our cluster it is difficult to get a JOB with too many threads
  (the jobs get stuck in the queue).
- if the settings are exposed into the top-level script, it will be
  easier to change to setup to make it work well by modifying the top
  level script only...
---
 egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh | 10 ++++++++++
 egs/wsj/s5/local/nnet3/run_ivector_common.sh |  9 ++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh
index 8f566ccfe6d..ba90afbb213 100755
--- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh
+++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh
@@ -32,7 +32,14 @@ train_set=train_si284
 test_sets="test_dev93 test_eval92"
 gmm=tri4b        # this is the source gmm-dir that we'll use for alignments; it
                  # should have alignments for the specified training data.
+
 num_threads_ubm=32
+
+nj_extractor=10
+# It runs a JOB with '-pe smp N', where N=$[threads*processes]
+num_threads_extractor=4
+num_processes_extractor=4
+
 nnet3_affix=       # affix for exp dirs, e.g. it was _cleaned in tedlium.
 
 # Options which are not passed through to run_ivector_common.sh
@@ -79,6 +86,9 @@ local/nnet3/run_ivector_common.sh \
   --stage $stage --nj $nj \
   --train-set $train_set --gmm $gmm \
   --num-threads-ubm $num_threads_ubm \
+  --nj-extractor $nj_extractor \
+  --num-processes-extractor $num_processes_extractor \
+  --num-threads-extractor $num_threads_extractor \
   --nnet3-affix "$nnet3_affix"
 
 
diff --git a/egs/wsj/s5/local/nnet3/run_ivector_common.sh b/egs/wsj/s5/local/nnet3/run_ivector_common.sh
index 813c6e14aed..7d4c9ef3c48 100755
--- a/egs/wsj/s5/local/nnet3/run_ivector_common.sh
+++ b/egs/wsj/s5/local/nnet3/run_ivector_common.sh
@@ -16,6 +16,12 @@ gmm=tri4b                # This specifies a GMM-dir from the features of the typ
                          # it should contain alignments for 'train_set'.
 
 num_threads_ubm=32
+
+nj_extractor=10
+# It runs a JOB with '-pe smp N', where N=$[threads*processes]
+num_processes_extractor=4
+num_threads_extractor=4
+
 nnet3_affix=             # affix for exp/nnet3 directory to put iVector stuff in (e.g.
                          # in the tedlium recip it's _cleaned).
 
@@ -110,7 +116,8 @@ if [ $stage -le 4 ]; then
   # can be sensitive to the amount of data.  The script defaults to an iVector dimension of
   # 100.
   echo "$0: training the iVector extractor"
-  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
+  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" \
+    --nj $nj_extractor --num-threads $num_threads_extractor --num-processes $num_processes_extractor \
     data/${train_set}_sp_hires exp/nnet3${nnet3_affix}/diag_ubm exp/nnet3${nnet3_affix}/extractor || exit 1;
 fi
 

From 57205cffb9d9398353345960e977f6b2998e9155 Mon Sep 17 00:00:00 2001
From: Daniel Campoverde <alx741@riseup.net>
Date: Tue, 23 Apr 2019 13:22:57 -0500
Subject: [PATCH 084/163] [egs] Add Spanish dimex100 example (#3254)

---
 egs/spanish_dimex100/README.txt            |  22 ++
 egs/spanish_dimex100/s5/.gitignore         |   5 +
 egs/spanish_dimex100/s5/RESULTS            |   7 +
 egs/spanish_dimex100/s5/cmd.sh             |  15 ++
 egs/spanish_dimex100/s5/conf/decode.config |   3 +
 egs/spanish_dimex100/s5/conf/mfcc.conf     |   1 +
 egs/spanish_dimex100/s5/local/data_prep.sh | 286 +++++++++++++++++++++
 egs/spanish_dimex100/s5/local/lang_prep.sh |  53 ++++
 egs/spanish_dimex100/s5/local/lm_prep.sh   |  57 ++++
 egs/spanish_dimex100/s5/local/score.sh     |  53 ++++
 egs/spanish_dimex100/s5/path.sh            |   6 +
 egs/spanish_dimex100/s5/run.sh             | 118 +++++++++
 egs/spanish_dimex100/s5/steps              |   1 +
 egs/spanish_dimex100/s5/utils              |   1 +
 14 files changed, 628 insertions(+)
 create mode 100644 egs/spanish_dimex100/README.txt
 create mode 100644 egs/spanish_dimex100/s5/.gitignore
 create mode 100755 egs/spanish_dimex100/s5/RESULTS
 create mode 100644 egs/spanish_dimex100/s5/cmd.sh
 create mode 100644 egs/spanish_dimex100/s5/conf/decode.config
 create mode 100644 egs/spanish_dimex100/s5/conf/mfcc.conf
 create mode 100755 egs/spanish_dimex100/s5/local/data_prep.sh
 create mode 100755 egs/spanish_dimex100/s5/local/lang_prep.sh
 create mode 100755 egs/spanish_dimex100/s5/local/lm_prep.sh
 create mode 100755 egs/spanish_dimex100/s5/local/score.sh
 create mode 100755 egs/spanish_dimex100/s5/path.sh
 create mode 100755 egs/spanish_dimex100/s5/run.sh
 create mode 120000 egs/spanish_dimex100/s5/steps
 create mode 120000 egs/spanish_dimex100/s5/utils

diff --git a/egs/spanish_dimex100/README.txt b/egs/spanish_dimex100/README.txt
new file mode 100644
index 00000000000..19406641f56
--- /dev/null
+++ b/egs/spanish_dimex100/README.txt
@@ -0,0 +1,22 @@
+About the DIMEx100 corpus:
+    Mexican Spanish clean speech corpus introduced in Pineda, et al. (2001).
+    "DIMEx100: A New Phonetic and Speech Corpus for Mexican Spanish".
+
+        > Studio recorded audio with a total of 6000 phrases by 100 speakers.
+        > Mono/16 bit/44.1 khz
+        > Three different levels of transcription
+        > For additional information about the corpus design and
+            characteristics refer to (Pineda, 2001)
+
+
+    Created by the computer science department of the "Investigaciones en
+    Matemáticas Aplicadas y en Sistemas (IIMAS)" institute at the "National
+    Autonomous University of Mexico (UNAM)".
+
+    DIMEx100 corpus is available free of charge for academic purposes
+    exclusively. For commercial use a formal agreement with UNAM is required.
+    For more information refer to
+    http://turing.iimas.unam.mx/~luis/DIME/CORPUS-DIMEX.html
+
+Example author:
+    Daniel A. Campoverde <alx@sillybytes.net>
diff --git a/egs/spanish_dimex100/s5/.gitignore b/egs/spanish_dimex100/s5/.gitignore
new file mode 100644
index 00000000000..5936e451c95
--- /dev/null
+++ b/egs/spanish_dimex100/s5/.gitignore
@@ -0,0 +1,5 @@
+DVDCorpusDimex100.zip
+CorpusDimex100
+
+data
+*.wav
diff --git a/egs/spanish_dimex100/s5/RESULTS b/egs/spanish_dimex100/s5/RESULTS
new file mode 100755
index 00000000000..dcab09973d6
--- /dev/null
+++ b/egs/spanish_dimex100/s5/RESULTS
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
+exit 0
+
+# Result on decode_test (tri2b_mmi_b0.05)
+%WER 7.58 [ 72 / 950, 50 ins, 0 del, 22 sub ] exp/tri2b_mmi_b0.05/decode_test/wer_15:2
diff --git a/egs/spanish_dimex100/s5/cmd.sh b/egs/spanish_dimex100/s5/cmd.sh
new file mode 100644
index 00000000000..71dd849a93b
--- /dev/null
+++ b/egs/spanish_dimex100/s5/cmd.sh
@@ -0,0 +1,15 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd="queue.pl --mem 2G"
+export decode_cmd="queue.pl --mem 4G"
+export mkgraph_cmd="queue.pl --mem 8G"
diff --git a/egs/spanish_dimex100/s5/conf/decode.config b/egs/spanish_dimex100/s5/conf/decode.config
new file mode 100644
index 00000000000..81c6a7b2745
--- /dev/null
+++ b/egs/spanish_dimex100/s5/conf/decode.config
@@ -0,0 +1,3 @@
+first_beam=10.0
+beam=13.0
+lattice_beam=6.0
diff --git a/egs/spanish_dimex100/s5/conf/mfcc.conf b/egs/spanish_dimex100/s5/conf/mfcc.conf
new file mode 100644
index 00000000000..45d284ad05c
--- /dev/null
+++ b/egs/spanish_dimex100/s5/conf/mfcc.conf
@@ -0,0 +1 @@
+--use-energy=false
diff --git a/egs/spanish_dimex100/s5/local/data_prep.sh b/egs/spanish_dimex100/s5/local/data_prep.sh
new file mode 100755
index 00000000000..50cb3de4f9c
--- /dev/null
+++ b/egs/spanish_dimex100/s5/local/data_prep.sh
@@ -0,0 +1,286 @@
+#!/bin/bash
+
+## Only run this file from the example root directory
+##      $ ./local/data_prep.sh
+
+mkdir -p "data/train" "data/test" "data/local"
+
+source ./path.sh
+
+# Dimex100 unziped corpus root directory
+CORPUS_DIR="$1"
+
+# Corpus data
+#
+#   Number of Different speakers:   100
+#   Speakers common utterances:     10
+#   Speakers individual utterances: 50
+#
+# Training/testing split
+#
+#   Common utterances for training:     10 (100%)
+#   Individual utterances for training: 40 (80%)
+#   Individual utterances for testing:  10 (20%)
+N_SPEAKERS=100
+N_COMMON_UTTERANCES=10
+N_INDIVIDUAL_UTTERANCES=50
+N_INDIVIDUAL_UTTERANCES_TRAINING=40
+N_INDIVIDUAL_UTTERANCES_TESTING=10
+
+# speakerId-utteranceId-[c|i]
+#   c = speaker common utterances (10)
+#   i = speaker individual utterances (50)
+#
+#   e.g.:
+#       s001-01-c
+#       ...
+#       s001-10-c
+#       ...
+#       s001-01-i
+#       ...
+#       s001-50-i
+
+## 80-20 train-test split
+## Only individual utterances are used in testing
+#    10/10 common utterances go into training
+#    40/50 individual utterances go into training
+#    10/50 individual utterances go into testing
+
+function make_speaker_id
+{
+    printf "s%03d" "$1"
+}
+
+function make_sentence_id
+{
+    printf "%02d" "$1"
+}
+
+#####################################
+# Convert wave audio to 16-bit, 16kHz
+#####################################
+
+function convert_to_16khz
+{
+    for i in $(seq 1 $N_SPEAKERS); do
+        speaker_id=$(make_speaker_id $i)
+
+        mkdir -p "$CORPUS_DIR/$speaker_id/audio_16k/comunes"
+        mkdir -p "$CORPUS_DIR/$speaker_id/audio_16k/individuales"
+
+        # Common utterances
+        for j in $(seq 1 $N_COMMON_UTTERANCES); do
+            sentence_id=$(make_sentence_id $j)
+            old_wav_file="$CORPUS_DIR/$speaker_id/audio_editado/comunes/$speaker_id$sentence_id.wav"
+            new_wav_file="$CORPUS_DIR/$speaker_id/audio_16k/comunes/$speaker_id$sentence_id.wav"
+            sox "$old_wav_file" -r 16k "$new_wav_file"
+        done
+
+        # Individual utterances: convert all of them, the test split reads audio_16k too
+        for k in $(seq 1 $N_INDIVIDUAL_UTTERANCES); do
+            sentence_id=$(make_sentence_id $k)
+            old_wav_file="$CORPUS_DIR/$speaker_id/audio_editado/individuales/$speaker_id$sentence_id.wav"
+            new_wav_file="$CORPUS_DIR/$speaker_id/audio_16k/individuales/$speaker_id$sentence_id.wav"
+            sox "$old_wav_file" -r 16k "$new_wav_file"
+        done
+    done
+}
+
+if [[ ! -d "$CORPUS_DIR/s001/audio_16k" ]]; then
+    echo
+    echo Converting audio from 44.1kHz to 16kHz
+    echo
+    convert_to_16khz
+fi
+
+
+
+#################
+# data/train/text
+# data/test/text
+#################
+
+# speakerId-utteranceId-[c|i]
+#   c = speaker common utterances (10)
+#   i = speaker individual utterances (50)
+#
+#   e.g.:
+#       s001-01-c
+#       ...
+#       s001-10-c
+#       ...
+#       s001-01-i
+#       ...
+#       s001-50-i
+
+## 80-20 train-test split
+## Only individual utterances are used in testing
+#    10/10 common utterances go into training
+#    40/50 individual utterances go into training
+#    10/50 individual utterances go into testing
+
+
+
+function clean
+{
+    echo "$1" \
+        | tr -d '\r' \
+        | tr '[:upper:]' '[:lower:]' \
+        | sed \
+            -e 's/á/a/g' -e 's/é/e/g' -e 's/í/i/g' -e 's/ó/o/g' -e 's/ú/u/g' \
+            -e 's/Á/a/g' -e 's/É/e/g' -e 's/Í/i/g' -e 's/Ó/o/g' -e 's/Ú/u/g' \
+            -e 's/ñ/n/g' -e 's/Ñ/n/g' -e 's/ü/u/g' -e 's/Ü/u/g' \
+        | tr -d -c "a-zA-Z0-9 \r\n"
+        # | tr -d -c "_,.;:\-?¿!'\"()" \
+}
+# The iconv redirection always creates the .utf8 file, so test it with -s (non-empty), not -f.
+### Generate data/train/text
+for i in $(seq 1 $N_SPEAKERS); do
+    speaker_id=$(make_speaker_id $i)
+
+    # Common utterances
+    for j in $(seq 1 $N_COMMON_UTTERANCES); do
+        sentence_id=$(make_sentence_id $j)
+        utterance_id="$speaker_id-$sentence_id-c"
+        trans_file="$CORPUS_DIR/$speaker_id/texto/comunes/$speaker_id$sentence_id.txt"
+        iconv -f iso-8859-1 -t utf-8 "$trans_file" > "$trans_file.utf8"
+        if [ -s "$trans_file.utf8" ]; then
+            transcription=$(cat "$trans_file.utf8")
+            transcription=$(clean "$transcription")
+            echo "$utterance_id $transcription" >> "data/train/text"
+        fi
+    done
+
+    # Individual utterances
+    for k in $(seq 1 $N_INDIVIDUAL_UTTERANCES_TRAINING); do
+        sentence_id=$(make_sentence_id $k)
+        utterance_id="$speaker_id-$sentence_id-i"
+        trans_file="$CORPUS_DIR/$speaker_id/texto/individuales/$speaker_id$sentence_id.txt"
+        iconv -f iso-8859-1 -t utf-8 "$trans_file" > "$trans_file.utf8"
+        if [ -s "$trans_file.utf8" ]; then
+            transcription=$(cat "$trans_file.utf8")
+            transcription=$(clean "$transcription")
+            echo "$utterance_id $transcription" >> "data/train/text"
+        fi
+    done
+
+done
+
+
+### Generate data/test/text
+for i in $(seq 1 $N_SPEAKERS); do
+    speaker_id=$(make_speaker_id $i)
+
+    # Individual utterances not used for training (TRAINING+1 .. last)
+    for k in $(seq $((N_INDIVIDUAL_UTTERANCES_TRAINING + 1)) $N_INDIVIDUAL_UTTERANCES); do
+        sentence_id=$(make_sentence_id $k)
+        utterance_id="$speaker_id-$sentence_id-i"
+        trans_file="$CORPUS_DIR/$speaker_id/texto/individuales/$speaker_id$sentence_id.txt"
+        iconv -f iso-8859-1 -t utf-8 "$trans_file" > "$trans_file.utf8"
+        if [ -s "$trans_file.utf8" ]; then
+            transcription=$(cat "$trans_file.utf8")
+            transcription=$(clean "$transcription")
+            echo "$utterance_id $transcription" >> "data/test/text"
+        fi
+    done
+
+done
+
+
+
+
+####################
+# data/train/wav.scp
+# data/test/wav.scp
+####################
+
+
+### Generate data/train/wav.scp
+for i in $(seq 1 $N_SPEAKERS); do
+    speaker_id=$(make_speaker_id $i)
+
+    # Common utterances
+    for j in $(seq 1 $N_COMMON_UTTERANCES); do
+        sentence_id=$(make_sentence_id $j)
+        utterance_id="$speaker_id-$sentence_id-c"
+        wav_file="$CORPUS_DIR/$speaker_id/audio_16k/comunes/$speaker_id$sentence_id.wav"
+        if [ -f "$wav_file" ]; then
+            echo "$utterance_id $wav_file" >> "data/train/wav.scp"
+        fi
+    done
+
+    # Individual utterances
+    for k in $(seq 1 $N_INDIVIDUAL_UTTERANCES_TRAINING); do
+        sentence_id=$(make_sentence_id $k)
+        utterance_id="$speaker_id-$sentence_id-i"
+        wav_file="$CORPUS_DIR/$speaker_id/audio_16k/individuales/$speaker_id$sentence_id.wav"
+        if [ -f "$wav_file" ]; then
+            echo "$utterance_id $wav_file" >> "data/train/wav.scp"
+        fi
+    done
+
+done
+
+
+### Generate data/test/wav.scp
+for i in $(seq 1 $N_SPEAKERS); do
+    speaker_id=$(make_speaker_id $i)
+
+    # Individual utterances not used for training (TRAINING+1 .. last)
+    for k in $(seq $((N_INDIVIDUAL_UTTERANCES_TRAINING + 1)) $N_INDIVIDUAL_UTTERANCES); do
+        sentence_id=$(make_sentence_id $k)
+        utterance_id="$speaker_id-$sentence_id-i"
+        wav_file="$CORPUS_DIR/$speaker_id/audio_16k/individuales/$speaker_id$sentence_id.wav"
+        if [ -f "$wav_file" ]; then
+            echo "$utterance_id $wav_file" >> "data/test/wav.scp"
+        fi
+    done
+
+done
+
+
+
+
+####################
+# data/train/utt2spk
+# data/test/utt2spk
+####################
+
+# Take IDs from 'text' file to avoid including missing data's IDs
+
+### Generate data/train/utt2spk
+utterance_ids=$(cat "data/train/text" | cut -d' ' -f1)
+
+while read -r utterance_id; do
+    speaker_id=$(echo "$utterance_id" | cut -d'-' -f1)
+    echo "$utterance_id $speaker_id" >> "data/train/utt2spk"
+done <<< "$utterance_ids"
+
+
+### Generate data/test/utt2spk
+utterance_ids=$(cat "data/test/text" | cut -d' ' -f1)
+
+while read -r utterance_id; do
+    speaker_id=$(echo "$utterance_id" | cut -d'-' -f1)
+    echo "$utterance_id $speaker_id" >> "data/test/utt2spk"
+done <<< "$utterance_ids"
+
+
+############
+# Sort files
+############
+
+LC_ALL=C sort -o "data/train/text" "data/train/text"
+LC_ALL=C sort -o "data/test/text" "data/test/text"
+LC_ALL=C sort -o "data/train/wav.scp" "data/train/wav.scp"
+LC_ALL=C sort -o "data/test/wav.scp" "data/test/wav.scp"
+LC_ALL=C sort -o "data/train/utt2spk" "data/train/utt2spk"
+LC_ALL=C sort -o "data/test/utt2spk" "data/test/utt2spk"
+
+
+####################
+# data/train/spk2utt
+# data/test/spk2utt
+####################
+utils/utt2spk_to_spk2utt.pl "data/train/utt2spk" > "data/train/spk2utt"
+utils/utt2spk_to_spk2utt.pl "data/test/utt2spk" > "data/test/spk2utt"
diff --git a/egs/spanish_dimex100/s5/local/lang_prep.sh b/egs/spanish_dimex100/s5/local/lang_prep.sh
new file mode 100755
index 00000000000..1ba49bac6d6
--- /dev/null
+++ b/egs/spanish_dimex100/s5/local/lang_prep.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+## Only run this file from the example root directory
+##      $ ./local/lang_prep.sh
+
+CORPUS_DIR="$1"
+
+mkdir -p "data/local/dict"
+
+source ./path.sh
+
+#############################
+# data/local/dict/lexicon.txt
+#############################
+
+export LC_ALL=C
+
+echo -e '!SIL sil\n<UNK> spn' > data/local/dict/lexicon.txt
+cat "$CORPUS_DIR/diccionarios/T22.full.dic" \
+    | tr '[:upper:]' '[:lower:]' \
+    | sed -e 's/([0123456789]*)//g' \
+        -e 's/\([^ ]\)n\~/\1n/g' \
+        -e 's/a_7/a/g' -e 's/e_7/e/g' -e 's/i_7/i/g' -e 's/o_7/o/g' -e 's/u_7/u/g' \
+        -e 's/a-7/a/g' -e 's/e-7/e/g' -e 's/i-7/i/g' -e 's/o-7/o/g' -e 's/u-7/u/g' \
+        -e 's/a_/a/g' -e 's/e_/e/g' -e 's/i_/i/g' -e 's/o_/o/g' -e 's/u_/u/g' \
+    | sed -e 's/_7n.*$//' \
+        -e 's/atl_7tica/atletica/' \
+        -e 's/biol_7gicas/biologicas/' \
+        -e 's/elec_7ctrico/electrico/' \
+        -e 's/gr_7afico/grafico/' \
+        -e 's/s_7lo/solo/' \
+    | sed -e 's/n~/ni/g' -e 's/r(/rh/g' \
+    | sed -e 's/\t/ /g' -e '/^$/d' \
+    | sort | uniq \
+    >> data/local/dict/lexicon.txt
+
+
+#######################################
+# data/local/dict/silence_phones.txt
+# data/local/dict/optional_silence.txt
+# data/local/dict/nonsilence_phones.txt
+# data/local/dict/extra_questions.txt
+#######################################
+
+echo -e 'sil\nspn' > data/local/dict/silence_phones.txt
+echo -e 'sil' > data/local/dict/optional_silence.txt
+cat data/local/dict/lexicon.txt \
+    | grep -v '<UNK>' \
+    | grep -v '!SIL' \
+    | cut -d' ' -f1 --complement \
+    | sed 's/ /\n/g' \
+    | sort -u \
+    > data/local/dict/nonsilence_phones.txt
diff --git a/egs/spanish_dimex100/s5/local/lm_prep.sh b/egs/spanish_dimex100/s5/local/lm_prep.sh
new file mode 100755
index 00000000000..82c3c22cddd
--- /dev/null
+++ b/egs/spanish_dimex100/s5/local/lm_prep.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+## Install SRILM in the `tools` directory (install_srilm.sh)
+
+## Only run this file from the example root directory
+##      $ ./local/lm_prep.sh
+
+mkdir -p "data/local/tmp" "data/lang/tmp"
+
+source ./path.sh
+
+if [ -d "../../../tools/srilm/bin/i686-m64" ]; then
+    ngram_count_exe="../../../tools/srilm/bin/i686-m64/ngram-count"
+elif [ -d "../../../tools/srilm/bin/i686" ]; then
+    ngram_count_exe="../../../tools/srilm/bin/i686/ngram-count"
+else
+    echo
+    echo "[!] Install SRILM in the 'tools' directory (install_srilm.sh)"
+    echo
+    exit 1
+fi
+
+
+########################
+# data/local/tmp/lm_text
+########################
+
+# Text sentences input for language model generation
+# taken from data/[train|test]/text but with utterance IDs removed
+
+cat data/train/text data/test/text | cut -d' ' -f1 --complement > data/local/tmp/lm_text
+
+
+####################################
+# data/local/tmp/3gram_lm.arpa.kn.gz
+####################################
+
+$ngram_count_exe -lm data/local/tmp/3gram_lm.arpa.kn.gz \
+    -order 3 \
+    -write-vocab data/local/tmp/vocab-full.txt \
+    -sort \
+    -wbdiscount \
+    -unk \
+    -map-unk "<UNK>" \
+    -text data/local/tmp/lm_text
+    # -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 \
+    # -kndiscount3 -gt3min 3 -order 3 \
+
+
+#################
+# data/lang/G.fst
+#################
+
+utils/format_lm.sh data/lang \
+    data/local/tmp/3gram_lm.arpa.kn.gz \
+    data/local/dict/lexicon.txt \
+    data/lang
diff --git a/egs/spanish_dimex100/s5/local/score.sh b/egs/spanish_dimex100/s5/local/score.sh
new file mode 100755
index 00000000000..0be7d192282
--- /dev/null
+++ b/egs/spanish_dimex100/s5/local/score.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0
+
+# Scores a decoding directory: for every LM weight in [min_lmwt, max_lmwt] it
+# extracts the best path from the lattices and computes the WER against the
+# reference transcripts, writing the result to <decode-dir>/wer_<LMWT>.
+
+[ -f ./path.sh ] && . ./path.sh
+
+# begin configuration section.
+cmd=run.pl
+min_lmwt=7
+max_lmwt=17
+#end configuration section.
+
+# path.sh is sourced only once (above); sourcing it again here would just
+# prepend the same directories to PATH a second time.
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ]; then
+  echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
+  echo " Options:"
+  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
+  echo "    --min_lmwt <int>                # minimum LM-weight for lattice rescoring "
+  echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
+  exit 1;
+fi
+
+data=$1
+lang_or_graph=$2
+dir=$3
+
+symtab=$lang_or_graph/words.txt
+
+for f in "$symtab" "$dir/lat.1.gz" "$data/text"; do
+  [ ! -f "$f" ] && echo "score.sh: no such file $f" && exit 1;
+done
+
+mkdir -p "$dir/scoring/log" || exit 1
+
+# Reference transcripts with the noise tokens removed.
+sed -e 's:<NOISE>::g' -e 's:<SPOKEN_NOISE>::g' "$data/text" > "$dir/scoring/test_filt.txt" || exit 1
+
+# One job per LM weight: best path through the lattices, written as text.
+$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
+  lattice-best-path --lm-scale=LMWT --word-symbol-table=$symtab \
+    "ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1;
+
+# Note: the double level of quoting for the sed command
+# (presumably because $cmd re-parses its arguments with a shell, so the
+# backslash-escaped `\|` and `\<UNK\>` are unescaped once more -- confirm).
+$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
+   cat $dir/scoring/LMWT.tra \| \
+    utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
+    compute-wer --text --mode=present \
+     ark:$dir/scoring/test_filt.txt  ark,p:- ">&" $dir/wer_LMWT || exit 1;
+
+# Show results
+for f in "$dir"/wer_*; do echo "$f"; grep -E '(WER)|(SER)' < "$f"; done
+
+exit 0;
diff --git a/egs/spanish_dimex100/s5/path.sh b/egs/spanish_dimex100/s5/path.sh
new file mode 100755
index 00000000000..2d17b17a84a
--- /dev/null
+++ b/egs/spanish_dimex100/s5/path.sh
@@ -0,0 +1,16 @@
+# Environment setup for this recipe; meant to be sourced (". ./path.sh") from
+# the recipe root directory, not executed.
+KALDI_ROOT=$(pwd)/../../..
+export KALDI_ROOT
+if [ -f $KALDI_ROOT/tools/env.sh ]; then
+  . $KALDI_ROOT/tools/env.sh
+fi
+PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+export PATH
+# common_path.sh is mandatory: without it the sourcing shell is terminated.
+if [ ! -f $KALDI_ROOT/tools/config/common_path.sh ]; then
+  echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!"
+  exit 1
+fi
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
diff --git a/egs/spanish_dimex100/s5/run.sh b/egs/spanish_dimex100/s5/run.sh
new file mode 100755
index 00000000000..30f1ad0397f
--- /dev/null
+++ b/egs/spanish_dimex100/s5/run.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+
+# End-to-end recipe: download the corpus, prepare data/lang/LM, extract MFCCs,
+# train mono -> tri1 -> tri2b (LDA+MLLT) -> MMI, then decode the test set.
+# Run it from the recipe root directory; every stage aborts the run on failure.
+
+. ./path.sh || exit 1
+. ./cmd.sh || exit 1
+
+########
+# Config
+########
+
+# NOTE: these assignments take precedence over any train_cmd/decode_cmd that
+# cmd.sh (sourced above) may have set.
+train_cmd="utils/run.pl"
+decode_cmd="utils/run.pl"
+
+CORPUS_DIR="CorpusDimex100"
+
+N_HMM=2000 # leaves
+N_GAUSSIANS=11000
+
+
+#################
+# Download corpus
+#################
+
+echo
+echo Downloading corpus
+echo
+if [ ! -d "$CORPUS_DIR" ]; then
+  wget http://turing.iimas.unam.mx/~luis/DIME/DIMEx100/DVD/DVDCorpusDimex100.zip || exit 1;
+  unzip DVDCorpusDimex100.zip || exit 1;
+fi
+
+
+##################
+# Data preparation
+##################
+
+echo
+echo Data preparation
+echo
+# Start from a clean slate: this removes all outputs of previous runs.
+rm -rf data exp mfcc
+local/data_prep.sh "$CORPUS_DIR" || exit 1
+utils/fix_data_dir.sh "data/train" || exit 1
+utils/fix_data_dir.sh "data/test" || exit 1
+
+
+#####################
+# Features generation
+#####################
+
+echo
+echo Features generation
+echo
+steps/make_mfcc.sh --cmd "$train_cmd" "data/train" "exp/make_mfcc/train" mfcc || exit 1
+steps/make_mfcc.sh --cmd "$train_cmd" "data/test"  "exp/make_mfcc/test"  mfcc || exit 1
+
+steps/compute_cmvn_stats.sh "data/train" "exp/make_mfcc/train" mfcc || exit 1
+steps/compute_cmvn_stats.sh "data/test" "exp/make_mfcc/test" mfcc || exit 1
+
+utils/validate_data_dir.sh "data/train" || exit 1
+utils/validate_data_dir.sh "data/test" || exit 1
+
+
+#######################
+# Lang data preparation
+#######################
+
+echo
+echo Language data preparation
+echo
+rm -rf data/local/dict
+local/lang_prep.sh "$CORPUS_DIR" || exit 1
+utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang data/lang || exit 1
+utils/fix_data_dir.sh "data/train" || exit 1
+utils/fix_data_dir.sh "data/test" || exit 1
+
+
+############################
+# Language model preparation
+############################
+
+echo
+echo Language model preparation
+echo
+local/lm_prep.sh || exit 1
+
+
+#######################
+# Training and Decoding
+#######################
+
+echo
+echo Training
+echo
+# utils/subset_data_dir.sh --first data/train 500 data/train_500
+
+# Training and aligning
+steps/train_mono.sh --cmd "$train_cmd" data/train data/lang exp/mono || exit 1
+steps/align_si.sh --cmd "$train_cmd" data/train data/lang exp/mono exp/mono_aligned || exit 1
+steps/train_deltas.sh --cmd "$train_cmd" "$N_HMM" "$N_GAUSSIANS" data/train data/lang exp/mono_aligned exp/tri1 || exit 1
+steps/align_si.sh --cmd "$train_cmd" data/train data/lang exp/tri1 exp/tri1_aligned || exit 1
+
+# train tri2b [LDA+MLLT]
+steps/train_lda_mllt.sh --cmd "$train_cmd" "$N_HMM" "$N_GAUSSIANS" data/train data/lang exp/tri1_aligned exp/tri2b || exit 1;
+utils/mkgraph.sh data/lang exp/tri2b exp/tri2b/graph || exit 1
+steps/align_si.sh --cmd "$train_cmd" data/train data/lang exp/tri2b exp/tri2b_aligned || exit 1
+
+#  Do MMI on top of LDA+MLLT.
+steps/make_denlats.sh --cmd "$train_cmd" data/train data/lang exp/tri2b exp/tri2b_denlats || exit 1;
+steps/train_mmi.sh --cmd "$train_cmd" --boost 0.05 data/train data/lang exp/tri2b_aligned exp/tri2b_denlats exp/tri2b_mmi_b0.05 || exit 1;
+
+
+
+# Decoding
+echo
+echo Decoding
+echo
+steps/decode.sh --config conf/decode.config --cmd "$decode_cmd" exp/tri2b/graph data/test exp/tri2b_mmi_b0.05/decode_test || exit 1
+
+# Summarize the WER of every decoding directory via utils/best_wer.sh.
+for x in exp/*/decode*; do [ -d "$x" ] && grep WER "$x"/wer_* | utils/best_wer.sh; done
diff --git a/egs/spanish_dimex100/s5/steps b/egs/spanish_dimex100/s5/steps
new file mode 120000
index 00000000000..6e99bf5b5ad
--- /dev/null
+++ b/egs/spanish_dimex100/s5/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps
\ No newline at end of file
diff --git a/egs/spanish_dimex100/s5/utils b/egs/spanish_dimex100/s5/utils
new file mode 120000
index 00000000000..b240885218f
--- /dev/null
+++ b/egs/spanish_dimex100/s5/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils
\ No newline at end of file

From a756df20450655b4081b41a6a4ebf6669d638238 Mon Sep 17 00:00:00 2001
From: "kkm (aka Kirill Katsnelson)" <kkm@smartaction.com>
Date: Wed, 24 Apr 2019 17:34:07 -0700
Subject: [PATCH 085/163] [build] Build and configure OpenBLAS; default to it
 on non-x64 machine (#3261)

* Default to OpenBLAS on non-x64 platforms.
* Download and build specific release of OpenBLAS from GitHub.
* Configure looks in the OpenBLAS build location only; the user
  can still specify `--openblas-root=`.
* Refactor switch parsing, chiefly to distinguish whether the
  `--openblas-root=` switch was not given, or was but pointed
  to a non-existing directory.
* Trim some dead ATLAS-related code left after commit 3e77220b3.
* Name all ATLAS configuration functions `*_atlas_*`.
* Move common Linux CUDA and Speex configuration out of math
  library configuration functions.

Fix: #3222
Mention: #3228
---
 src/configure                      | 369 ++++++++++++-----------------
 tools/Makefile                     |  45 ++--
 tools/extras/check_dependencies.sh |  50 ++--
 tools/extras/install_openblas.sh   |   8 +-
 4 files changed, 200 insertions(+), 272 deletions(-)

diff --git a/src/configure b/src/configure
index 0daad1029d1..3fb298ea240 100755
--- a/src/configure
+++ b/src/configure
@@ -1,46 +1,41 @@
 #!/bin/bash
 
-# This configure script is hand-generated, not auto-generated.
-# It creates the file kaldi.mk, which is %included by the Makefiles
-# in the subdirectories.
+# This configure script is hand-generated, not auto-generated.  It creates the
+# file kaldi.mk, which is %included by the Makefiles in the subdirectories.
 # The file kaldi.mk is editable by hand -- for example, you may want to
-# remove the options -g -O0 -DKALDI_PARANOID, or edit the
-# DOUBLE_PRECISION variable (to be 1 not 0).
-
+# uncomment the options -O0 -DKALDI_PARANOID, or edit the DOUBLE_PRECISION
+# variable (to be 1 not 0).
 
 #  Example command lines:
-# ./configure --shared  ## shared libraries.
 # ./configure
-# ./configure --mkl-root=/opt/intel/mkl
-# ./configure --mkl-root=/opt/intel/mkl --threaded-math=yes
-# ./configure --mkl-root=/opt/intel/mkl --threaded-math=yes --mkl-threading=tbb
-#        # This is for MKL 11.3, which does not seem  to provide Intel OMP libs
-# ./configure --openblas-root=../tools/OpenBLAS/install
-#        # Before doing this, cd to ../tools and type "make openblas".
+# ./configure --shared                # Build shared Kaldi libraries.
+# ./configure --mathlib=OPENBLAS      # Build and use OpenBLAS.
+#        # Before doing this, cd to ../tools and type "make -j openblas".
+# ./configure --openblas-root=/usr    # Use system OpenBLAS.
 #        # Note: this is not working correctly on all platforms, do "make test"
 #        # and look out for segmentation faults.
 # ./configure --atlas-root=../tools/ATLAS/build
 # ./configure --use-cuda=no   # disable CUDA detection (will build cpu-only
-#                             # version of kaldi even on CUDA-enabled machine
+#                             # version of kaldi even on CUDA-enabled machine.
 # ./configure --use-cuda --cudatk-dir=/usr/local/cuda/ --cuda-arch=-arch=sm_70
 #        # Use cuda in /usr/local/cuda and set the arch to sm_70
 # ./configure --static --fst-root=/opt/cross/armv8hf \
-# --atlas-root=/opt/cross/armv8hf --host=armv8-rpi3-linux-gnueabihf
-#        # Cross compile for armv8hf, this assumes that you have openfst built
+#   --atlas-root=/opt/cross/armv8hf --host=armv8-rpi3-linux-gnueabihf
+#        # Cross-compile for armv8hf. This assumes that you have OpenFST built
 #        # with the armv8-rpi3-linux-gnueabihf toolchain and installed to
 #        # /opt/cross/armv8hf. It also assumes that you have an ATLAS library
 #        # built for the target install to /opt/cross/armv8hf and that the
-#        # armv8-rpi3-linux-gnueabihf toolchain is available in your path
+#        # armv8-rpi3-linux-gnueabihf toolchain is available in your path.
 # ./configure --static --openblas-root=/opt/cross/arm-linux-androideabi \
-# --fst-root=/opt/cross/arm-linux-androideabi --fst-version=1.4.1 \
-# --android-incdir=/opt/cross/arm-linux-androideabi/sysroot/usr/include \
-# --host=arm-linux-androideabi
-#        # Cross compile for Android on arm. The only difference here is the
+#   --fst-root=/opt/cross/arm-linux-androideabi --fst-version=1.6.9 \
+#   --android-incdir=/opt/cross/arm-linux-androideabi/sysroot/usr/include \
+#   --host=arm-linux-androideabi
+#        # Cross-compile for Android on arm. The only difference here is the
 #        # addition of the the --android-includes flag because the toolchains
 #        # produced by the Android NDK don't always include the C++ stdlib
-#        # headers in the normal cross compile include path.
-# --host=aarch64-linux-android
-#        # support for 64bit ARMv8(AArch64) architecture in Android.
+#        # headers in the normal cross-compile include path.
+#   --host=aarch64-linux-android
+#        # support for 64bit ARMv8 (AArch64) architecture in Android.
 
 # This should be incremented after any significant change to the configure
 # script, i.e. any change affecting kaldi.mk or the build system as a whole.
@@ -76,13 +71,14 @@ Configuration options:
   --shared              Build and link against shared libraries [default=no]
   --use-cuda            Build with CUDA [default=yes]
   --cudatk-dir=DIR      CUDA toolkit directory
-  --cuda-arch=FLAGS     Override the default CUDA_ARCH flags.  See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-examples.
+  --cuda-arch=FLAGS     Override the default CUDA_ARCH flags. See:
+         https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-examples.
   --double-precision    Build with BaseFloat set to double if yes [default=no],
                         mostly useful for testing purposes.
   --static-fst          Build with static OpenFst libraries [default=no]
   --fst-root=DIR        OpenFst root directory [default=../tools/openfst/]
   --fst-version=STR     OpenFst version string
-  --mathlib=LIB         Math library [default=MKL]
+  --mathlib=LIB         Math library [default=MKL|OPENBLAS, based on platform]
                         Supported libraries: ATLAS, MKL, CLAPACK, OPENBLAS.
   --static-math         Build with static math libraries [default=no]
   --threaded-math       Build with multi-threaded math libraries [default=no]
@@ -119,23 +115,38 @@ compiler/linker.
 EOF
 }
 
+# E.g. Die "Invalid switch --foobar"
+Die() { echo >&2 "$0: FATAL:" "$@"; exit 1; }
+
+# E.g. abspath=$(rel2abs "../tools") || exit 1
+#  - Set 'abspath' to existing absolute path of $1, return 0.
+#  - print empty string if path does not exist, return non-0.
 function rel2abs {
-  if [ ! -z "$1" ]; then
-    local retval=`cd $1 2>/dev/null && pwd || exit 1`
-    echo $retval
-  fi
+  [[ $1 ]] && cd -P "$1" 2>/dev/null && pwd
+}
+
+# E.g.: GetSwitchValue var --some-switch=foo
+# Assign variable named 'var' to 'foo'. Return 0 iff value is not empty.
+GetSwitchValue() {
+  IFS='=' read -r -- _ $1 <<< "$2" && [[ ${!1} ]]
 }
 
-function read_value {
-  local val=`expr "X$*" : '[^=]*=\(.*\)'`;
-  echo $val
+# E.g.: GetSwitchValueOrDie var --some-switch=foo
+# Assign variable named 'var' to 'foo'. Die with a fatal error if value is empty.
+GetSwitchValueOrDie() {
+  GetSwitchValue "$@" ||
+    Die "'$2': switch requires a value. See '$0 --help'."
 }
 
-function read_dirname {
-  local dir_name=`read_value $1`
-  local retval=`rel2abs $dir_name`
-  [ -z $retval ] && echo "Bad option '$1': no such directory" && exit 1;
-  echo $retval
+# E.g.: GetSwitchExistingPathOrDie var --some-switch=../tools
+#  - Set 'var' to absolute path of '../tools' if exists, return 0.
+#  - Die with a fatal error if path does not exist or not given in switch.
+GetSwitchExistingPathOrDie() {
+  GetSwitchValueOrDie "$@"  # Already sets variable named $1 to path.
+  local path varname=$1
+  path=$(rel2abs "${!varname}") && [[ -d $path ]] ||
+    Die "'$2': switch must specify an existing directory. See '$0 --help'."
+  builtin printf -v $varname %s "$path"  # Assign $path to variable '$varname'.
 }
 
 # TODO(kkm): Kill this. `[[ ${var-} ]]' is the idiomatic equivalent in bash.
@@ -161,7 +172,7 @@ function failure {
 }
 
 function check_exists {
-  if [ ! -f $1 ]; then failure "$1 not found."; fi
+  if [[ ! -f $1 ]]; then failure "$1 not found."; fi
 }
 
 function check_library {
@@ -308,7 +319,7 @@ function linux_configure_mkl_extra {
   echo "$linkline ${extra_libs[$threaded]}"
 }
 
-function linux_configure_threadinglibdir {
+function linux_configure_mkl_threadinglibdir {
   local library=$1
   local mklroot=$2
   local mkllibdir=$3
@@ -358,9 +369,9 @@ function linux_configure_mkl_threading {
 
   if ! is_set $OMPLIBDIR ; then
     if  $static ; then
-      OMPLIBDIR=`linux_configure_threadinglibdir $library "$MKLROOT" "$MKLLIBDIR" "a"`
+      OMPLIBDIR=`linux_configure_mkl_threadinglibdir $library "$MKLROOT" "$MKLLIBDIR" "a"`
     else
-      OMPLIBDIR=`linux_configure_threadinglibdir $library "$MKLROOT" "$MKLLIBDIR" "so"`
+      OMPLIBDIR=`linux_configure_mkl_threadinglibdir $library "$MKLROOT" "$MKLLIBDIR" "so"`
     fi
   fi
 
@@ -479,7 +490,8 @@ function configure_cuda {
     echo "CUDA_ARCH = $CUDA_ARCH" >> kaldi.mk
     echo >> kaldi.mk
 
-    # 64bit/32bit? We do not support cross compilation with CUDA so, use direct calls to uname -m here
+    # 64bit/32bit? We do not support cross-compilation with CUDA, so use direct
+    # calls to uname -m here.
     if [ "`uname -m`" == "x86_64" ]; then
       if [ "`uname`" == "Darwin" ]; then
         sed 's/lib64/lib/g' < makefiles/cuda_64bit.mk >> kaldi.mk
@@ -521,8 +533,9 @@ function linux_configure_speex {
     spx_type=so
   fi
   if [ ! -f "$SPEEXLIBDIR/libspeex.${spx_type}" ];then
-    echo "Info: configuring Kaldi not to link with Speex (don't worry, it's only needed if you"
-    echo "intend to use 'compress-uncompress-speex', which is very unlikely)"
+    echo "\
+INFO: Configuring Kaldi not to link with Speex. Don't worry, it's only needed if
+      you intend to use 'compress-uncompress-speex', which is very unlikely."
     return
   fi
 
@@ -543,17 +556,11 @@ function linux_configure_speex {
   fi
 }
 
-function linux_atlas_failure {
+function linux_configure_atlas_failure {
   echo ATLASINC = $ATLASROOT/include >> kaldi.mk
   echo ATLASLIBS = [somewhere]/liblapack.a [somewhere]/libcblas.a [somewhere]/libatlas.a [somewhere]/libf77blas.a $ATLASLIBDIR >> kaldi.mk
   echo >> kaldi.mk
-  if [[ "$TARGET_ARCH" == arm* ]]; then
-    cat makefiles/linux_atlas_arm.mk >> kaldi.mk
-  elif [[ "$TARGET_ARCH" == ppc64le ]]; then
-    cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk
-  else
-    cat makefiles/linux_atlas.mk >> kaldi.mk
-  fi
+
   echo "** $* ***"
   echo "**  ERROR   **"
   echo "** Configure cannot proceed automatically."
@@ -567,11 +574,11 @@ function linux_atlas_failure {
   echo "**"
   echo "**  Otherwise (or if you prefer OpenBLAS for speed), you could go the OpenBLAS"
   echo "** route: cd to ../tools, type 'extras/install_openblas.sh', cd back to here,"
-  echo "** and type './configure  --openblas-root=../tools/OpenBLAS/install'"
+  echo "** and type './configure  --mathlib=OPENBLAS'"
   exit 1;
 }
 
-function linux_check_static {
+function linux_atlas_check_static {
   # will exit with success if $dir seems to contain ATLAS libraries with
   # right architecture (compatible with default "nm")
   echo "int main(void) { return 0; }" > test_linking.cc;
@@ -611,19 +618,10 @@ function linux_configure_atlas_generic {
   echo ATLASINC = $ATLASROOT/include >> kaldi.mk
   echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk
   echo >> kaldi.mk
-  if [[ "$TARGET_ARCH" == arm* ]]; then
-    cat makefiles/linux_atlas_arm.mk >> kaldi.mk
-  elif [[ "$TARGET_ARCH" == ppc64le ]]; then
-    cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk
-  else
-    cat makefiles/linux_atlas.mk >> kaldi.mk
-  fi
   echo "Successfully configured ATLAS with ATLASLIBS=$ATLASLIBS"
-  $use_cuda && configure_cuda
-  linux_configure_speex
 }
 
-function linux_configure_redhat_fat {
+function linux_configure_atlas_redhat_fat {
   # This is for when only two so-called 'fat' ATLAS libs are provided:
   # libsatlas.so.3 and libtatlas.so.3.
   # See http://stackoverflow.com/questions/13439296/build-shared-libraries-in-atlas.
@@ -633,18 +631,10 @@ function linux_configure_redhat_fat {
     [ ! -f $f ] && return 1;
   done
   libdir=$(dirname $(echo $ATLASLIBS | awk '{print $1}'))
-  [ -z "$libdir" ] && echo "Error getting libdir in linux_configure_redhat_fat" && exit 1;
+  [ -z "$libdir" ] && echo "Error getting libdir in linux_configure_atlas_redhat_fat" && exit 1;
   echo ATLASINC = $ATLASROOT/include >> kaldi.mk
   echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk
   echo >> kaldi.mk
-  if [[ "$TARGET_ARCH" == arm* ]]; then
-    cat makefiles/linux_atlas_arm.mk >> kaldi.mk
-  elif [[ "$TARGET_ARCH" == ppc64le ]]; then
-    cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk
-  else
-    cat makefiles/linux_atlas.mk >> kaldi.mk
-  fi
-  $use_cuda && configure_cuda
   echo "Successfully configured for red hat [dynamic libraries, fat] with ATLASLIBS =$ATLASLIBS"
 }
 
@@ -654,7 +644,7 @@ function linux_configure_atlas_static {
   if [ -z $ATLASLIBDIR ]; then # Note: it'll pick up the last one below.
     for dir in /usr{,/local}/lib{64,}{,/atlas,/atlas-sse2,/atlas-sse3} \
        /usr/local/atlas/lib{,64} `pwd`/../tools/ATLAS/build/install/lib/ $ATLASROOT/lib; do
-     linux_check_static &&  ATLASLIBDIR=$dir
+     linux_atlas_check_static && ATLASLIBDIR=$dir
     done
     if [ -z $ATLASLIBDIR ]; then # Note: it'll pick up the last one below.
       echo "Could not find libatlas.a in any of the generic-Linux places, but we'll try other stuff..."
@@ -694,103 +684,9 @@ function linux_configure_atlas_static {
   echo ATLASINC = $ATLASROOT/include >> kaldi.mk
   echo ATLASLIBS = $ATLASLIBS >> kaldi.mk
   echo >> kaldi.mk
-  if [[ "$TARGET_ARCH" == arm* ]]; then
-    cat makefiles/linux_atlas_arm.mk >> kaldi.mk
-  elif [[ "$TARGET_ARCH" == ppc64le ]]; then
-    cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk
-  else
-    cat makefiles/linux_atlas.mk >> kaldi.mk
-  fi
-  $use_cuda && configure_cuda
-  linux_configure_speex
   echo "Successfully configured for Linux [static libraries] with ATLASLIBS =$ATLASLIBS"
 }
 
-function linux_check_dynamic {
-  # will exit with success if $dir seems to contain ATLAS libraries with
-  # right architecture (compatible with default "nm")
-  if $threaded_atlas; then pt=t; else pt=s; fi
-  for atlas_libname in libatlas.so lib${pt}atlas.so; do
-    if [ -f $dir/$atlas_libname ]; then # candidate...
-      if nm --dynamic $dir/$atlas_libname 2>&1 | grep "File format not recognized" >/dev/null; then
-        echo "Directory $dir may contain dynamic ATLAS libraries but seems to be wrong architecture";
-        return 1;
-      fi
-        echo "Atlas found in $dir";
-        return 0;
-      fi
-  done
-  # echo "... no {libatlas,lib${pt}atlas}.so in $dir";
-  return 1;
-}
-
-function linux_configure_dynamic {
-  if $threaded_atlas; then pt=t; else pt=s; fi # relevant to "fat" libraries, will change later for separate ones
-  if [ -z $ATLASLIBDIR ]; then # Note: it'll pick up the last one below.
-    for dir in /usr{,/local}/lib{,64}{,/atlas,/atlas-sse2,/atlas-sse3,/x86_64-linux-gnu} \
-      `pwd`/../tools/ATLAS/build/install/lib/ $ATLASROOT/lib; do
-      linux_check_dynamic && ATLASLIBDIR=$dir && ATLASLIBNAME=$atlas_libname
-    done
-    if [ -z $ATLASLIBDIR -o -z $ATLASLIBNAME ]; then
-      echo "Could not find {libatlas,lib${pt}atlas}.so in any of the obvious places, will most likely try static:"
-      return 1;
-    fi
-  fi
-
-  # If using "fat" libraries we only need one file to link against
-  if [ $ATLASLIBNAME != libatlas.so ]; then
-    if [ -f $ATLASLIBDIR/$ATLASLIBNAME ]; then
-      ATLASLIBS="$ATLASLIBDIR/$ATLASLIBNAME"
-    else
-      echo "Configuring dynamic ATLAS library failed: library $ATLASLIBNAME not found in $ATLASLIBDIR"
-      return 1;
-    fi
-  else  # with "thin" libraries, we have several object to link against, and different single/multi-thread names
-    if $threaded_atlas; then pt=pt; else pt=""; fi
-    echo "Validating presence of ATLAS libs in $ATLASLIBDIR"
-    ATLASLIBS=
-    # The Lapack part of ATLAS seems to appear under various different names.. but it
-    # should always have symbols like clapack_cgetrf and ATL_cgetrf defined, so we test for this.
-    for libname in lapack lapack_atlas  clapack; do
-      if [ -f $ATLASLIBDIR/lib${libname}.so -a "$ATLASLIBS" == "" ]; then
-        if nm  --dynamic $ATLASLIBDIR/lib${libname}.so  | grep clapack_cgetrf >/dev/null && \
-           nm  --dynamic $ATLASLIBDIR/lib${libname}.so  | grep ATL_cgetrf >/dev/null; then
-           ATLASLIBS="$ATLASLIBDIR/lib${libname}.so"
-           echo "Using library $ATLASLIBS as ATLAS's CLAPACK library."
-        fi
-      fi
-    done
-    if [ "$ATLASLIBS" == "" ]; then
-      echo Could not find any libraries $ATLASLIBDIR/{liblapack,liblapack_atlas,libclapack} that seem to be an ATLAS CLAPACK library.
-      return 1;
-    fi
-
-    for x in ${pt}cblas atlas ${pt}f77blas; do
-      if [ ! -f $ATLASLIBDIR/lib$x.so ]; then
-        echo "Configuring dynamic ATLAS libraries failed: Could not find library $x in directory $ATLASLIBDIR"
-        return 1;
-      fi
-      ATLASLIBS="$ATLASLIBS $ATLASLIBDIR/lib${x}.so"
-    done
-    if $threaded_atlas; then ATLASLIBS="$ATLASLIBS"; fi
-  fi
-
-  echo ATLASINC = $ATLASROOT/include >> kaldi.mk
-  echo ATLASLIBS = $ATLASLIBS >> kaldi.mk
-  echo ATLASLDFLAGS = -Wl,-rpath,$ATLASLIBDIR >> kaldi.mk
-  echo >> kaldi.mk
-  if [[ "$TARGET_ARCH" == arm* ]]; then
-    cat makefiles/linux_atlas_arm.mk >> kaldi.mk
-  elif [[ "$TARGET_ARCH" == ppc64le ]]; then
-    cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk
-  else
-    cat makefiles/linux_atlas.mk >> kaldi.mk
-  fi
-  $use_cuda && configure_cuda
-  linux_configure_speex
-  echo "Successfully configured for Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS"
-}
-
 #############################    CONFIGURATION    #############################
 
 # If configuration sets any of these variables, we will switch the external
@@ -864,7 +760,7 @@ do
     double_precision=false;
     shift ;;
   --atlas-root=*)
-    ATLASROOT=`read_dirname $1`;
+    GetSwitchExistingPathOrDie ATLASROOT "$1"
     shift ;;
   --threaded-atlas)
     threaded_atlas=true;
@@ -919,56 +815,57 @@ do
     mkl_threading=sequential;
     shift ;;
   --mkl-threading=*)
-    mkl_threading=`read_value $1`;
+    GetSwitchValueOrDie mkl_threading "$1"
     threaded_atlas=true;
     shift ;;
   --fst-root=*)
-    FSTROOT=`read_dirname $1`;
+    GetSwitchExistingPathOrDie FSTROOT "$1"
     shift ;;
   --cub-root=*)
-    CUBROOT=`read_dirname $1`;
+    GetSwitchExistingPathOrDie CUBROOT "$1"
     shift ;;
   --clapack-root=*)
-    CLAPACKROOT=`read_dirname $1`;
+    GetSwitchExistingPathOrDie CLAPACKROOT "$1"
     shift ;;
   --openblas-root=*)
-    OPENBLASROOT=`read_dirname $1`;
+    GetSwitchExistingPathOrDie OPENBLASROOT "$1"
     shift ;;
   --mkl-root=*)
-    MKLROOT=`read_dirname $1`;
+    GetSwitchExistingPathOrDie MKLROOT "$1"
     shift ;;
   --mkl-libdir=*)
-    MKLLIBDIR=`read_dirname $1`;
+    GetSwitchExistingPathOrDie MKLLIBDIR "$1"
     shift ;;
   --speex-root=*)
-    SPEEXROOT=`read_dirname $1`;
+    GetSwitchExistingPathOrDie SPEEXROOT "$1"
     shift ;;
   --speex-libdir=*)
-    SPEEXLIBDIR=`read_dirname $1`;
+    GetSwitchExistingPathOrDie SPEEXLIBDIR "$1"
     shift ;;
   --speex-incdir=*)
-    SPEEXINCDIR=`read_dirname $1`;
+    GetSwitchExistingPathOrDie SPEEXINCDIR "$1"
     shift ;;
   --omp-libdir=*)
-    OMPLIBDIR=`read_dirname $1`;
+    GetSwitchExistingPathOrDie OMPLIBDIR "$1"
     shift ;;
   --mathlib=*)
-    MATHLIB=`read_value $1`;
+    GetSwitchValueOrDie MATHLIB "$1"
     shift ;;
   --cudatk-dir=*)
-    CUDATKDIR=`read_dirname $1`;
-    shift ;; #CUDA is used in src/cudamatrix and src/nnet{,bin} only
+    # CUDA is used in src/cudamatrix and src/nnet{,bin} only.
+    GetSwitchExistingPathOrDie CUDATKDIR "$1"
+    shift ;;
   --cuda-arch=*)
-    CUDA_ARCH=`read_value $1`;
+    GetSwitchValueOrDie CUDA_ARCH "$1"
     shift;;
   --fst-version=*)
-    OPENFST_VER=`read_value $1`;
+    GetSwitchValueOrDie OPENFST_VER "$1"
     shift;;
   --host=*)
     # The type of system where built programs and libraries will run.
     # It should be in the format cpu-vendor-os. If specified, this script
     # will infer the target architecture from the specified host triple.
-    HOST=`read_value $1`;
+    GetSwitchValueOrDie HOST "$1"
     shift ;;
   --android-incdir=*)
     android=true;
@@ -977,7 +874,7 @@ do
     static_fst=true;
     dynamic_kaldi=false;
     MATHLIB='OPENBLAS';
-    ANDROIDINC=`read_dirname $1`;
+    GetSwitchExistingPathOrDie ANDROIDINC "$1"
     shift;;
   *)  echo "Unknown argument: $1, exiting"; usage; exit 1 ;;
   esac
@@ -1065,7 +962,11 @@ fi
 
 # When no library roots were provided, so that auto_lib is not deduced, and
 # MATHLIB is also not explicitly provided by the user, then default to MKL.
-[[ ! $auto_lib && ! $MATHLIB ]] && auto_lib=MKL
+[[ ! $auto_lib && ! $MATHLIB ]] &&
+  case $TARGET_ARCH in
+    x86_64) auto_lib=MKL ;;
+    *) auto_lib=OPENBLAS ;;
+  esac
 : ${MATHLIB:=$auto_lib}
 export MATHLIB  #TODO(kkm): Likely not needed. Briefly tested without,
                 #    but left in the hotfix. Remove when doing the #3192.
@@ -1257,10 +1158,16 @@ elif [ "`uname`" == "Linux" ]; then
       linux_configure_atlas_generic /usr/lib64/atlas "so.3" || \
       linux_configure_atlas_generic /usr/lib/x86_64-linux-gnu/ "so.3" || \
       linux_configure_atlas_generic /usr/lib/x86_64-linux-gnu/ "so" || \
-      linux_configure_redhat_fat 64 || \
-      linux_configure_redhat_fat || \
+      linux_configure_atlas_redhat_fat 64 || \
+      linux_configure_atlas_redhat_fat || \
       linux_configure_atlas_static || \
-      linux_atlas_failure "Failed to configure ATLAS libraries";
+      linux_configure_atlas_failure "Failed to configure ATLAS libraries";
+
+    case $TARGET_ARCH in
+      arm*)    cat makefiles/linux_atlas_arm.mk ;;
+      ppc64le) cat makefiles/linux_atlas_ppc64le.mk ;;
+      *)       cat makefiles/linux_atlas.mk ;;
+    esac >> kaldi.mk
 
   elif [ "$MATHLIB" == "MKL" ]; then
     if [ "$TARGET_ARCH" != "x86_64" ]; then
@@ -1309,8 +1216,6 @@ elif [ "`uname`" == "Linux" ]; then
     cat makefiles/linux_x86_64_mkl.mk >> kaldi.mk
     echo "MKLFLAGS = ${MKL_LINK_LINE} ${THREADING_LINE} $EXTRA_LIBS " >> kaldi.mk
     echo "Successfully configured for Linux with MKL libs from $MKLROOT"
-    $use_cuda && configure_cuda
-    linux_configure_speex
 
   elif [ "$MATHLIB" == "CLAPACK" ]; then
     if [ -z "$CLAPACKROOT" ]; then
@@ -1334,13 +1239,28 @@ elif [ "`uname`" == "Linux" ]; then
     fi
     echo "Warning (CLAPACK): this part of the configure process is not properly tested and may not work."
     echo "Successfully configured for Linux with CLAPACK libs from $CLAPACKROOT"
-    $use_cuda && configure_cuda
-    linux_configure_speex
 
   elif [ "$MATHLIB" == "OPENBLAS" ]; then
-    OPENBLASROOT=`rel2abs "$OPENBLASROOT"`
-    if [ -z "$OPENBLASROOT" ]; then
-      failure "Must specify the location of OPENBLAS with --openblas-root option (and it must exist)"
+    if [[ ! $OPENBLASROOT ]]; then
+      # Either the user specified --mathlib=OPENBLAS or we've autodetected the
+      # system where OpenBLAS is the preferred option (the parser for
+      # --openblas-root fails fatally if the path does not exist, so we trust
+      # that if set, the variable contains the existing path, converted to
+      # absolute form).
+      OPENBLASROOT="$(rel2abs ../tools/OpenBLAS/install)" ||
+        Die "OpenBLAS not found in '../tools/OpenBLAS/install'.
+** This is the only place we look for it. The best option is to build OpenBLAS
+** tuned for your system and CPU. To do that, run the following commands:
+**
+**   cd ../tools; extras/install_openblas.sh
+**
+** Another option is to specify the location of existing OpenBLAS directory
+** with the switch '--openblas-root='. However, even if a package is provided
+** for your system, the packaged version is almost always significantly slower
+** and often older than the above commands can fetch and build.
+**
+** You can also use other matrix algebra libraries. For information, see:
+**   http://kaldi-asr.org/doc/matrixwrap.html"
     fi
     if [ -f $OPENBLASROOT/lib/libopenblas.so ]; then
       OPENBLASLIBDIR=$OPENBLASROOT/lib
@@ -1356,17 +1276,21 @@ elif [ "`uname`" == "Linux" ]; then
       # in REDHAT/CentOS/Ubuntu package installs, the includes are located here
       OPENBLASINCDIR=$OPENBLASROOT/include/openblas
     else
-      echo "$0: ***** Using OpenBlas from $OPENBLASROOT but cblas.h is not found. "
-      echo " ****** Assuming openblas is aleady in a default include path, but"
-      echo " ***** if you get compilation messages about not finding files like cblas.h,"
-      echo " ***** you should look into this (e.g. make sure to install the 'openblas-dev' package,"
-      echo " ***** if it is a package-based install)."
+      echo "$0: ***** Using OpenBLAS from $OPENBLASROOT but cblas.h is not found. "
+      echo "** Assuming openblas is aleady in a default include path, but"
+      echo "** if you get compilation messages about not finding files like cblas.h,"
+      echo "** you should look into this (e.g. make sure to install the 'openblas-dev' package,"
+      echo "** if it is a package-based install)."
       OPENBLASINCDIR="/usr/include"
     fi
     echo "Your math library seems to be OpenBLAS from $OPENBLASROOT.  Configuring appropriately."
+    # TODO(kkm): Probably, OpenBLAS required libgfortran.so.3 at some point, but
+    # no longer does. *My* linker does not complain about a missing library, but
+    # is it safe to keep the reference if no longer required? Try to figure out
+    # how long ago the dependency was dropped.
     if $static_math; then
       echo "Configuring static OpenBlas since --static-math=yes"
-      OPENBLASLIBS="$OPENBLASLIBDIR/libopenblas.a -lgfortran"
+      OPENBLASLIBS="-L$OPENBLASLIBDIR -l:libopenblas.a -lgfortran"
     else
       echo "Configuring dynamically loaded OpenBlas since --static-math=no (the default)"
       OPENBLASLIBS="-L$OPENBLASLIBDIR -lopenblas -lgfortran -Wl,-rpath=$OPENBLASLIBDIR"
@@ -1374,22 +1298,19 @@ elif [ "`uname`" == "Linux" ]; then
     echo "OPENBLASINC = $OPENBLASINCDIR" >> kaldi.mk
     echo "OPENBLASLIBS = $OPENBLASLIBS" >> kaldi.mk
     echo >> kaldi.mk
-    if [[ "$TARGET_ARCH" == arm* ]]; then
-      cat makefiles/linux_openblas_arm.mk >> kaldi.mk
-    elif [[ "$TARGET_ARCH" == aarch64* ]]; then
-      cat makefiles/linux_openblas_aarch64.mk >> kaldi.mk
-    elif [[ "$TARGET_ARCH" == ppc64le ]]; then
-      cat makefiles/linux_openblas_ppc64le.mk >> kaldi.mk
-    else
-      cat makefiles/linux_openblas.mk >> kaldi.mk
-    fi
-    echo "Successfully configured for Linux with OpenBLAS from $OPENBLASROOT"
-    $use_cuda && configure_cuda
-    linux_configure_speex
+    case $TARGET_ARCH in
+      aarch64*) cat makefiles/linux_openblas_aarch64.mk ;;
+      arm*)     cat makefiles/linux_openblas_arm.mk ;;
+      ppc64le)  cat makefiles/linux_openblas_ppc64le.mk ;;
+      *)        cat makefiles/linux_openblas.mk ;;
+    esac >> kaldi.mk
 
+    echo "Successfully configured for Linux with OpenBLAS from $OPENBLASROOT"
   else
     failure "Unsupported linear algebra library '$MATHLIB'"
   fi
+  $use_cuda && configure_cuda
+  linux_configure_speex
 else
   failure "Could not detect the platform or we have not yet worked out the
   appropriate configuration for this platform. Please contact the developers."
@@ -1407,7 +1328,13 @@ if [ -n "$ENV_LDLIBS" ]; then echo "LDLIBS += $ENV_LDLIBS" >> kaldi.mk; fi
 # We check for slow exp implementation just before we exit. This check uses
 # and possibly modifies the kaldi.mk file that we just generated.
 check_for_slow_expf;
-echo "SUCCESS"
-echo "To compile: make clean -j; make depend -j; make -j"
-echo " ... or e.g. -j 10, instead of -j, to use a specified number of CPUs"
-exit 0;
+echo "Kaldi has been successfully configured. To compile:
+
+  make -j clean depend; make -j <NCPU>
+
+where <NCPU> is the number of parallel builds you can afford to do. If unsure,
+use the smaller of the number of CPUs or the amount of RAM in GB divided by 2,
+to stay within safe limits. 'make -j' without the numeric value may not limit
+the number of parallel jobs at all, and overwhelm even a powerful workstation,
+since Kaldi build is highly parallelized."
+exit 0
diff --git a/tools/Makefile b/tools/Makefile
index 094a9b608d3..e690df3da88 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -1,14 +1,15 @@
 # SHELL += -x
 
-CXX = g++
-CC = gcc         # used for sph2pipe
-# CXX = clang++  # Uncomment these lines
-# CC = clang     # to build with Clang.
+CXX ?= g++
+CC ?= gcc        # used for sph2pipe
+# CXX = clang++  # Uncomment these lines...
+# CC = clang     # ...to build with Clang.
 
 # Note: OpenFst requires a relatively recent C++ compiler with C++11 support,
 # e.g. g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3.
 OPENFST_VERSION ?= 1.6.7
 CUB_VERSION ?= 1.8.0
+OPENBLAS_VERSION ?= 0.3.5
 
 # Default features configured for OpenFST; can be overridden in the make command line.
 OPENFST_CONFIGURE ?= --enable-static --enable-shared --enable-far --enable-ngram-fsts
@@ -129,31 +130,6 @@ sph2pipe_v2.5.tar.gz:
 	wget -T 10 -t 3 http://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz || \
 	wget --no-check-certificate -T 10  https://sourceforge.net/projects/kaldi/files/sph2pipe_v2.5.tar.gz
 
-openblas: openblas_compiled
-
-.PHONY: openblas_compiled
-
-fortran_opt = $(shell gcc -v 2>&1 | perl -e '$$x = join(" ", <STDIN>); if($$x =~ m/target=\S+64\S+/) { print "BINARY=64"; }')
-
-
-# note: you can uncomment the line that has USE_THREAD=1 and comment the line
-# that has USE_THREAD=0 if you want Open Blas to use multiple threads.  then
-# you could set, for example, OPENBLAS_NUM_THREADS=2 in your path.sh so that the
-# runtime knows how many threads to use.  Note: if you ever get the error
-# "Program is Terminated. Because you tried to allocate too many memory
-# regions.", this is because OpenBLAS has a fixed buffer size controlled by the
-# Makefile option NUM_THREADS; I believe this limits the product of number of
-# program threads that are calling BLAS by the shell variable
-# OPENBLAS_NUM_THREADS.  In that case it might help to increase the NUM_THREADS
-# option.
-openblas_compiled:
-	echo "Note: see tools/Makefile for options regarding OpenBLAS compilation"
-	-git clone https://github.com/xianyi/OpenBLAS.git
-	-cd OpenBLAS; git pull
-	cd OpenBLAS; sed 's:# FCOMMON_OPT = -frecursive:FCOMMON_OPT = -frecursive:' < Makefile.rule >tmp && mv tmp Makefile.rule
-	# $(MAKE) PREFIX=`pwd`/OpenBLAS/install FC=gfortran $(fortran_opt) DEBUG=1 USE_THREAD=1 NUM_THREADS=64 -C OpenBLAS all install
-	$(MAKE) PREFIX=`pwd`/OpenBLAS/install FC=gfortran $(fortran_opt) DEBUG=1 USE_THREAD=0 -C OpenBLAS all install
-
 
 .PHONY: cub
 cub:
@@ -161,3 +137,14 @@ cub:
 	unzip -oq cub-$(CUB_VERSION).zip
 	rm -f cub
 	ln -s cub-$(CUB_VERSION) cub
+
+# OpenBLAS is not compiled by default. Run 'make -j openblas' in this directory to build.
+.PHONY: openblas
+openblas:
+	@-rm -rf OpenBLAS xianyi-OpenBLAS-*
+	wget -t3 -nv -O- $$( \
+            wget -qO- 'https://api.github.com/repos/xianyi/OpenBLAS/releases/tags/v$(OPENBLAS_VERSION)' | \
+            python -c 'import sys,json;print(json.load(sys.stdin)["tarball_url"])') | \
+	  tar xzf -
+	mv xianyi-OpenBLAS-* OpenBLAS
+	$(MAKE) PREFIX=$$(pwd)/OpenBLAS/install USE_THREAD=0 -C OpenBLAS all install
diff --git a/tools/extras/check_dependencies.sh b/tools/extras/check_dependencies.sh
index 0ee7e5b38dc..e133961e0a3 100755
--- a/tools/extras/check_dependencies.sh
+++ b/tools/extras/check_dependencies.sh
@@ -128,24 +128,40 @@ if $pythonok && have python && [[ ! -f $PWD/python/.use_default_python ]]; then
 fi
 )
 
-printed=false
-
-# MKL. We do not know if compiler exists at this point, so double-check
-# the well-known mkl.h file location. The compiler test would still find
-# it if installed in an alternative location (this is unlikely).
-if [ ! -f /opt/intel/mkl/include/mkl.h ] &&
-   ! echo '#include <mkl.h>' | $CXX -I /opt/intel/mkl/include -E - >&/dev/null; then
-  if [[ $(uname) == Linux ]]; then
-    echo "$0: Intel MKL is not installed. Run extras/install_mkl.sh to install it."
-  else
-    echo "$0: Intel MKL is not installed. Download the installer package for your
+mathlib_missing=false
+case $(uname -m) in
+  x86_64)  # Suggest MKL on an Intel64 system (configure does not like i?86 hosts).
+    # We do not know if compiler exists at this point, so double-check the
+    # well-known mkl.h file location. The compiler test would still find it if
+    # installed in an alternative location (this is unlikely).
+    if [ ! -f /opt/intel/mkl/include/mkl.h ] &&
+         ! echo '#include <mkl.h>' | $CXX -I /opt/intel/mkl/include -E - >&/dev/null; then
+      if [[ $(uname) == Linux ]]; then
+        echo "$0: Intel MKL is not installed. Run extras/install_mkl.sh to install it."
+      else
+        echo "$0: Intel MKL is not installed. Download the installer package for your
  ... system from: https://software.intel.com/mkl/choose-download."
-  fi
- echo "\
+      fi
+      mathlib_missing=true
+    fi
+      ;;
+  *)  # Suggest OpenBLAS on other hardware.
+    if [ ! -f $(pwd)/OpenBLAS/install/include/openblas_config.h ] &&
+         ! echo '#include <openblas_config.h>' |
+            $CXX -I $(pwd)/OpenBLAS/install/include -E - >&/dev/null; then
+      echo "$0: OpenBLAS not detected. Run extras/install_openblas.sh
+ ... to compile it for your platform, or configure with --openblas-root= if you
+ ... have it installed in a location we could not guess. Note that packaged
+ ... library may be significantly slower and/or older than the one the above
+ ... would build."
+      mathlib_missing=true
+    fi
+      ;;
+esac
+$mathlib_missing &&
+  echo "\
  ... You can also use other matrix algebra libraries. For information, see:
- ... http://kaldi-asr.org/doc/matrixwrap.html"
-  printed=true
-fi
+ ...   http://kaldi-asr.org/doc/matrixwrap.html"
 
 # Report missing programs and libraries.
 if [ -n "$debian_packages" ]; then
@@ -187,7 +203,7 @@ if pwd | grep -E 'JOB|LMWT' >/dev/null; then
   status=1
 fi
 
-if ! $printed && [ $status -eq 0 ]; then
+if ! $mathlib_missing && [ $status -eq 0 ]; then
   echo "$0: all OK."
 fi
 
diff --git a/tools/extras/install_openblas.sh b/tools/extras/install_openblas.sh
index 44ff1793018..90afe8e9de4 100755
--- a/tools/extras/install_openblas.sh
+++ b/tools/extras/install_openblas.sh
@@ -1,7 +1,5 @@
 #!/bin/bash
 
-# to be run from ..
-# this script just exists to tell you how you'd make openblas- we actually did it via Makefile rules,
-# but it's not a default target.
-
-make openblas
+# OpenBLAS is downloaded and built by tools/Makefile, but not automatically by
+# its default 'all' target.
+make -j openblas

From 121dbbef4b6733a0455c572a9fb358f52c2e5ef9 Mon Sep 17 00:00:00 2001
From: songyf <songyfxjtu@gmail.com>
Date: Thu, 25 Apr 2019 11:25:26 +0800
Subject: [PATCH 086/163] [scripts] Fix of a bug in segmentation.pl (#3256)

---
 egs/wsj/s5/utils/segmentation.pl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/egs/wsj/s5/utils/segmentation.pl b/egs/wsj/s5/utils/segmentation.pl
index 41d90f4bd9d..fa7c4429927 100755
--- a/egs/wsj/s5/utils/segmentation.pl
+++ b/egs/wsj/s5/utils/segmentation.pl
@@ -221,7 +221,8 @@ ()
         if ($A[$p] == 0) { $num_sil++; }
         else { last; }
       }
-      $num_silence_phones[$n] = $p;
+      
+      $num_silence_phones[$n] = $num_sil; # should be the num of silence
     }
   }
 

From a0b6f3ff206a0a2bad96abefadcc844e19a17824 Mon Sep 17 00:00:00 2001
From: Justin Luitjens <luitjens@users.noreply.github.com>
Date: Thu, 25 Apr 2019 15:05:56 -0600
Subject: [PATCH 087/163] [src] Fixes to cuda unit tests. (#3268)

---
 src/cudamatrix/cu-matrix-test.cc | 245 +++++++++++++++++--------------
 src/cudamatrix/cu-matrixdim.h    |   8 +
 2 files changed, 139 insertions(+), 114 deletions(-)

diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc
index 46bc6ea0cb2..405ef16e97b 100644
--- a/src/cudamatrix/cu-matrix-test.cc
+++ b/src/cudamatrix/cu-matrix-test.cc
@@ -77,8 +77,8 @@ static void UnitTestCuMatrixTraceMatMat() {
   for (int32 i = 0; i < 2; i++) {
     int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200;
     CuMatrix<Real> A(M, N);
-    A.SetRandn();
-    // add a bias to avoid numerical failure when comparing r2 and r3
+    A.SetRandUniform();
+    // Add bias to avoid numbers close to zero
     A.Add(0.1);
     if (i % 2 == 1) {
       CuMatrix<Real> B(M, N);
@@ -143,7 +143,8 @@ template<typename Real>
 static void UnitTestCuMatrixApplyLog() {
   int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200;
   Matrix<Real> H(M, N);
-  H.SetRandn();
+  H.SetRandUniform(); // Using uniform distribution to ensure positive numbers
+  H.Add(0.1);         // Add bias to eliminate zeros
   H.MulElements(H); // make numbers positive
 
   CuMatrix<Real> D(H);
@@ -153,7 +154,7 @@ static void UnitTestCuMatrixApplyLog() {
 
   Matrix<Real> H2(D);
 
-  AssertEqual(H,H2);
+  KALDI_ASSERT(ApproxEqual(H,H2));
 }
 
 
@@ -174,7 +175,7 @@ static void UnitTestCuMatrixApplyExpSpecial() {
 
   Matrix<Real> H2(D);
 
-  AssertEqual(H,H2);
+  KALDI_ASSERT(ApproxEqual(H,H2));
 }
 
 template<typename Real>
@@ -190,7 +191,7 @@ static void UnitTestCuMatrixApplyExp() {
 
   Matrix<Real> H2(D);
 
-  AssertEqual(H,H2);
+  KALDI_ASSERT(ApproxEqual(H,H2));
 }
 
 
@@ -214,7 +215,7 @@ static void UnitTestCuMatrixApplyExpLimited() {
 
   Matrix<Real> H2(D);
 
-  AssertEqual(H,H2);
+  KALDI_ASSERT(ApproxEqual(H,H2));
 }
 
 
@@ -235,7 +236,7 @@ static void UnitTestCuMatrixSigmoid() {
 
     Matrix<Real> H2(E);
 
-    AssertEqual(H, H2);
+    KALDI_ASSERT(ApproxEqual(H, H2));
   }
 }
 
@@ -251,7 +252,7 @@ static void UnitTestCuMatrixScale() {
   H.Scale(scale);
   Matrix<Real> E(D);
 
-  AssertEqual(H, E);
+  KALDI_ASSERT(ApproxEqual(H, E));
 }
 
 template<typename Real>
@@ -266,7 +267,7 @@ static void UnitTestCuMatrixAdd() {
   H.Add(offset);
   Matrix<Real> E(D);
 
-  AssertEqual(H, E);
+  KALDI_ASSERT(ApproxEqual(H, E));
 }
 
 
@@ -285,7 +286,7 @@ static void UnitTestCuMatrixSoftHinge() {
 
   Matrix<Real> H2(E);
 
-  AssertEqual(H,H2);
+  KALDI_ASSERT(ApproxEqual(H,H2));
 }
 
 template<typename Real>
@@ -308,7 +309,7 @@ static void UnitTestCuMatrixGroupPnorm() {
       CuMatrix<Real> E(M, N);
       E.GroupPnorm(D, p);
       Matrix<Real> H2(E);
-      AssertEqual(H, H2);
+      KALDI_ASSERT(ApproxEqual(H, H2));
     }
   }
 }
@@ -330,7 +331,7 @@ static void UnitTestCuMatrixGroupMax() {
     CuMatrix<Real> E(M, N);
     E.GroupMax(D);
     Matrix<Real> H2(E);
-    AssertEqual(H,H2);
+    KALDI_ASSERT(ApproxEqual(H,H2));
   }
 }
 
@@ -344,7 +345,7 @@ static void UnitTestCuMatrixSet() {
     m1.Set(value);
     m2.Set(value);
     Matrix<Real> m3(m1);
-    AssertEqual(m2, m3);
+    KALDI_ASSERT(ApproxEqual(m2, m3));
   }
 }
 
@@ -369,7 +370,7 @@ static void UnitTestCuMatrixApplyPow() {
 
     H.ApplyPow(pow);
     Matrix<Real> H2(cH);
-    AssertEqual(H, H2);
+    KALDI_ASSERT(ApproxEqual(H, H2));
   }
 }
 
@@ -390,7 +391,7 @@ static void UnitTestCuMatrixApplyPowAbs() {
 
     H.ApplyPowAbs(pow, true);
     Matrix<Real> H2(cH);
-    AssertEqual(H, H2);
+    KALDI_ASSERT(ApproxEqual(H, H2));
   }
 }
 
@@ -417,7 +418,7 @@ static void UnitTestCuMatrixCopyRowsFromVec() {
     mat.CopyRowsFromVec(vec);
 
     Matrix<Real> mat2(cu_mat);
-    AssertEqual(mat, mat2);
+    KALDI_ASSERT(ApproxEqual(mat, mat2));
   }
 }
 
@@ -442,7 +443,7 @@ static void UnitTestCuMatrixCopyColsFromVec() {
     mat.CopyColsFromVec(vec);
 
     Matrix<Real> mat2(cu_mat);
-    AssertEqual(mat, mat2);
+    KALDI_ASSERT(ApproxEqual(mat, mat2));
   }
 }
 
@@ -477,8 +478,8 @@ static void UnitTestCuMatrixCopyRows() {
         if (reorder[i] < 0) O(i, j) = 0;
         else O(i, j) = M(reorder[i], j);
 
-    AssertEqual(N1, O);
-    AssertEqual(N2, O);
+    KALDI_ASSERT(ApproxEqual(N1, O));
+    KALDI_ASSERT(ApproxEqual(N2, O));
   }
 }
 
@@ -512,7 +513,7 @@ static void UnitTestCuMatrixCopyToRows() {
     CuArray<Real*> reorder_dst_cuda(reorder_dst);
     M.CopyToRows(reorder_dst_cuda);
 
-    AssertEqual(N, O);
+    KALDI_ASSERT(ApproxEqual(N, O));
   }
 }
 
@@ -552,8 +553,8 @@ static void UnitTestCuMatrixAddRows() {
       }
     }
 
-    AssertEqual(N1, O);
-    AssertEqual(N2, O);
+    KALDI_ASSERT(ApproxEqual(N1, O));
+    KALDI_ASSERT(ApproxEqual(N2, O));
   }
 }
 
@@ -588,7 +589,7 @@ static void UnitTestCuMatrixMulRows() {
       }
     }
 
-    AssertEqual(N1, O);
+    KALDI_ASSERT(ApproxEqual(N1, O));
   }
 }
 
@@ -630,8 +631,8 @@ static void UnitTestCuMatrixAddToRows() {
     CuArray<Real*> reorder_dst_cuda(reorder_dst);
     M.AddToRows(alpha, reorder_dst_cuda);
     M.AddToRows(alpha, reorder_cuda, &N2);
-    AssertEqual(N1, O);
-    AssertEqual(N2, O);
+    KALDI_ASSERT(ApproxEqual(N1, O));
+    KALDI_ASSERT(ApproxEqual(N2, O));
   }
 }
 
@@ -648,13 +649,13 @@ void UnitTestCuMatrixCopyCross() {
       mat2.CopyFromMat(mat1);
       CuMatrix<Real> mat3(M, N);
       mat3.CopyFromMat(mat2);
-      AssertEqual(mat1, mat3);
+      KALDI_ASSERT(ApproxEqual(mat1, mat3));
     } else {
       CuMatrix<float> mat2(N, M);
       mat2.CopyFromMat(mat1, kTrans);
       CuMatrix<Real> mat3(M, N);
       mat3.CopyFromMat(mat2, kTrans);
-      AssertEqual(mat1, mat3);
+      KALDI_ASSERT(ApproxEqual(mat1, mat3));
     }
   }
 }
@@ -669,7 +670,7 @@ template<typename Real> void UnitTestCuMatrixCopyCross2() {
     mat2.CopyFromMat(mat1);
     CuMatrix<Real> mat3(M, N);
     mat3.CopyFromMat(mat2);
-    AssertEqual(mat1, mat3);
+    KALDI_ASSERT(ApproxEqual(mat1, mat3));
   }
 }
 
@@ -708,7 +709,7 @@ static void UnitTestCuMatrixSumColumnRanges() {
     CuArray<Int32Pair> indices_tmp(indices);
     cu_dst.SumColumnRanges(cu_src, indices_tmp);
     Matrix<Real> dst2(cu_dst);
-    AssertEqual(dst, dst2);
+    KALDI_ASSERT(ApproxEqual(dst, dst2));
   }
 }
 
@@ -748,7 +749,7 @@ static void UnitTestCuMatrixAddRowRanges() {
     CuArray<Int32Pair> cu_indexes(indexes);
     cu_dst.AddRowRanges(cu_src, cu_indexes);
     Matrix<Real> dst2(cu_dst);
-    AssertEqual(dst1, dst2);
+    KALDI_ASSERT(ApproxEqual(dst1, dst2));
   }
 }
 
@@ -774,7 +775,7 @@ static void UnitTestCuMatrixCopyCols() {
       for (int32 j = 0; j < num_cols2; j++)
         if (reorder[j] < 0) O(i, j) = 0;
         else O(i, j) = M(i, reorder[j]);
-    AssertEqual(N, O);
+    KALDI_ASSERT(ApproxEqual(N, O));
   }
 }
 
@@ -806,7 +807,7 @@ static void UnitTextCuMatrixAddSmat() {
 
     Matrix<Real> mat2(cumat);
 
-    AssertEqual(mat, mat2);
+    KALDI_ASSERT(ApproxEqual(mat, mat2));
   }
 }
 
@@ -844,7 +845,7 @@ static void UnitTextCuMatrixAddMatSmat() {
 
     Matrix<Real> result2(curesult);
 
-    AssertEqual(result, result2);
+    KALDI_ASSERT(ApproxEqual(result, result2));
   }
 }
 
@@ -882,7 +883,7 @@ static void UnitTextCuMatrixAddSmatMat() {
 
     Matrix<Real> result2(curesult);
 
-    AssertEqual(result, result2);
+    KALDI_ASSERT(ApproxEqual(result, result2));
   }
 }
 
@@ -907,7 +908,7 @@ static void UnitTestCuMatrixAddCols() {
       for (int32 j = 0; j < num_cols2; j++)
         if (reorder[j] < 0) O(i, j) = 0;
         else O(i, j) = M(i, reorder[j]);
-    AssertEqual(N, O);
+    KALDI_ASSERT(ApproxEqual(N, O));
   }
 }
 
@@ -929,7 +930,7 @@ static void UnitTestCuMatrixApplyFloor() {
     H.ApplyFloor(floor);
     Matrix<Real> H2(cH);
 
-    AssertEqual(H, H2);
+    KALDI_ASSERT(ApproxEqual(H, H2));
   }
 }
 
@@ -950,7 +951,7 @@ static void UnitTestCuMatrixApplyCeiling() {
     H.ApplyCeiling(ceiling);
     Matrix<Real> H2(cH);
 
-    AssertEqual(H, H2);
+    KALDI_ASSERT(ApproxEqual(H, H2));
   }
 }
 
@@ -969,7 +970,7 @@ static void UnitTestCuMatrixApplyHeaviside() {
     cH.ApplyHeaviside();
     H.ApplyHeaviside();
     Matrix<Real> H2(cH);
-    AssertEqual(H, H2);
+    KALDI_ASSERT(ApproxEqual(H, H2));
   }
 }
 
@@ -988,7 +989,7 @@ static void UnitTestCuMatrixHeaviside() {
     cH2.Heaviside(cH);
     H.ApplyHeaviside();
     Matrix<Real> H2(cH2);
-    AssertEqual(H, H2);
+    KALDI_ASSERT(ApproxEqual(H, H2));
   }
 }
 
@@ -1014,7 +1015,7 @@ static void UnitTestCuMatrixMulElements() {
     Matrix<Real> Ha2(dimM, dimN);
     Da.CopyToMat(&Ha2);
 
-    AssertEqual(Ha,Ha2);
+    KALDI_ASSERT(ApproxEqual(Ha,Ha2));
   }
 }
 
@@ -1026,7 +1027,9 @@ static void UnitTestCuMatrixDivElements() {
     Matrix<Real> Ha(dimM, dimN);
     Matrix<Real> Hb(dimM, dimN);
     Ha.SetRandn();
-    Hb.SetRandn();
+
+    Hb.SetRandUniform();  // Use uniform distribution to ensure positive numbers
+    Hb.Add(0.1);          // Add bias to ensure we do not divide by zero
 
     CuMatrix<Real> Da(dimM, dimN);
     CuMatrix<Real> Db(dimM, dimN);
@@ -1039,7 +1042,7 @@ static void UnitTestCuMatrixDivElements() {
     Matrix<Real> Ha2(dimM, dimN);
     Da.CopyToMat(&Ha2);
 
-    AssertEqual(Ha,Ha2);
+    KALDI_ASSERT(ApproxEqual(Ha,Ha2));
   }
 }
 
@@ -1052,6 +1055,7 @@ static void UnitTestCuMatrixMax() {
 
   CuMatrix<Real> Da(100,100);
   CuMatrix<Real> Db(100,100);
+  
   Da.CopyFromMat(Ha);
   Db.CopyFromMat(Hb);
 
@@ -1061,7 +1065,7 @@ static void UnitTestCuMatrixMax() {
   Matrix<Real> Ha2(100,100);
   Da.CopyToMat(&Ha2);
 
-  AssertEqual(Ha,Ha2);
+  KALDI_ASSERT(ApproxEqual(Ha,Ha2));
 }
 
 template<typename Real>
@@ -1082,7 +1086,7 @@ static void UnitTestCuMatrixMin() {
   Matrix<Real> Ha2(100,100);
   Da.CopyToMat(&Ha2);
 
-  AssertEqual(Ha, Ha2);
+  KALDI_ASSERT(ApproxEqual(Ha, Ha2));
 }
 
 
@@ -1105,7 +1109,7 @@ static void UnitTestCuMatrixMulColsVec() {
   Matrix<Real> Hm2(100,99);
   Dm.CopyToMat(&Hm2);
 
-  AssertEqual(Hm,Hm2);
+  KALDI_ASSERT(ApproxEqual(Hm,Hm2));
 }
 
 
@@ -1131,7 +1135,7 @@ static void UnitTestCuMatrixMulRowsVec() {
     Matrix<Real> Hm2(dimM, dimN);
     Dm.CopyToMat(&Hm2);
 
-    AssertEqual(Hm,Hm2);
+    KALDI_ASSERT(ApproxEqual(Hm,Hm2));
   }
 }
 
@@ -1157,7 +1161,7 @@ static void UnitTestCuMatrixMulRowsGroupMat() {
 
     Matrix<Real> Hm2(dimM, dimN);
     Dm.CopyToMat(&Hm2);
-    AssertEqual(Hm,Hm2);
+    KALDI_ASSERT(ApproxEqual(Hm,Hm2));
   }
 }
 
@@ -1196,7 +1200,7 @@ static void UnitTestCuMatrixDiffGroupPnorm() {
 
     Matrix<Real> Hid2(dimM, dimN);
     Did.CopyToMat(&Hid2);
-    AssertEqual(Hid, Hid2);
+    KALDI_ASSERT(ApproxEqual(Hid, Hid2));
   }
 }
 
@@ -1231,7 +1235,7 @@ static void UnitTestCuMatrixGroupMaxDeriv() {
   // KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ;
   Matrix<Real> Hr2(dimM, dimN);
   Dr.CopyToMat(&Hr2);
-  AssertEqual(Hr,Hr2);
+  KALDI_ASSERT(ApproxEqual(Hr,Hr2));
 }
 
 template<typename Real> static void UnitTestCuMatrixAddDiagVecMat() {
@@ -1266,7 +1270,7 @@ template<typename Real> static void UnitTestCuMatrixAddDiagVecMat() {
     }
 
     M.AddDiagVecMat(alpha, V, N, trans, beta);
-    AssertEqual(M, Mcheck);
+    KALDI_ASSERT(ApproxEqual(M, Mcheck));
     KALDI_ASSERT(M.Sum() != 0.0);
   }
 }
@@ -1294,7 +1298,7 @@ template<typename Real> static void UnitTestCuMatrixAddMatDiagVec() {
     Mcheck.AddMat(alpha, buf, kNoTrans);
 
     M.AddMatDiagVec(alpha, N, trans, V, beta);
-    AssertEqual(M, Mcheck);
+    KALDI_ASSERT(ApproxEqual(M, Mcheck));
     KALDI_ASSERT(M.Sum() != 0.0);
   }
 }
@@ -1313,7 +1317,7 @@ template<typename Real> static void UnitTestCuMatrixAddMatMatElements() {
   Mcheck.Scale(beta); Mcheck.AddMat(alpha, buf, kNoTrans);
 
   M.AddMatMatElements(alpha, A, B, beta);
-  AssertEqual(M, Mcheck);
+  KALDI_ASSERT(ApproxEqual(M, Mcheck));
   KALDI_ASSERT(M.Sum() != 0.0);
 }
 
@@ -1332,11 +1336,11 @@ template<typename Real> static void UnitTestCuMatrixSetMatMatDivMat() {
   M.SetMatMatDivMat(A,B,C);
   ref.AddMatMatElements(1.0, A, B, 0.0);
   ref.DivElements(C);
-  AssertEqual(M, ref);
+  KALDI_ASSERT(ApproxEqual(M, ref));
 
   C.SetZero();
   M.SetMatMatDivMat(A,B,C);
-  AssertEqual(M, A);
+  KALDI_ASSERT(ApproxEqual(M, A));
 }
 
 template<typename Real>
@@ -1359,7 +1363,7 @@ static void UnitTestCuMatrixDivRowsVec() {
   Matrix<Real> Hm2(dimM, dimN);
   Dm.CopyToMat(&Hm2);
 
-  AssertEqual(Hm, Hm2);
+  KALDI_ASSERT(ApproxEqual(Hm, Hm2));
 }
 
 
@@ -1382,13 +1386,13 @@ static void UnitTestCuMatrixAddMat() {
   Matrix<Real> Ha2(100,100);
   Da.CopyToMat(&Ha2);
 
-  AssertEqual(Ha,Ha2);
+  KALDI_ASSERT(ApproxEqual(Ha,Ha2));
 
   //check use with submatrix
   CuMatrix<Real> mat1(10,10,kSetZero);
   mat1.AddMat(1.0,Da.Range(5,10,12,10)); //different stride for mat1,mat2
   CuMatrix<Real> mat2(Da.Range(5,10,12,10));
-  AssertEqual(mat1,mat2);
+  KALDI_ASSERT(ApproxEqual(mat1,mat2));
 
   for (int i = 0; i < 10; i++) {
     int32 N = 5 * (10 + Rand() % 10),  M = 100 + Rand() % 50;
@@ -1408,14 +1412,14 @@ static void UnitTestCuMatrixAddMat() {
 
     Matrix<Real> Hc2(N,M);
     Dc.CopyToMat(&Hc2);
-    AssertEqual(Hc,Hc2);
+    KALDI_ASSERT(ApproxEqual(Hc,Hc2));
 
     // check use with submatrix
     CuMatrix<Real> mat3(N/5,M,kSetZero);
     mat3.AddMat(1.0, Dd.Range(0,M,0,N/5),kTrans);
 
     CuMatrix<Real> mat4(Dd.Range(0,M,0,N/5),kTrans);
-    AssertEqual(mat3,mat4);
+    KALDI_ASSERT(ApproxEqual(mat3,mat4));
   }
 }
 
@@ -1444,7 +1448,7 @@ static void UnitTestCuMatrixAddMatBlocks1() {
       }
     }
     dst.AddMatBlocks(alpha, src);
-    AssertEqual(dst, dst_copy);
+    KALDI_ASSERT(ApproxEqual(dst, dst_copy));
   }
 }
 
@@ -1471,7 +1475,7 @@ static void UnitTestCuMatrixAddMatBlocks1Trans() {
       }
     }
     dst.AddMatBlocks(alpha, src, kTrans);
-    AssertEqual(dst, dst_copy);
+    KALDI_ASSERT(ApproxEqual(dst, dst_copy));
   }
 }
 
@@ -1501,7 +1505,7 @@ static void UnitTestCuMatrixAddMatBlocks2() {
       }
     }
     dst.AddMatBlocks(alpha, src);
-    AssertEqual(dst, dst_copy);
+    KALDI_ASSERT(ApproxEqual(dst, dst_copy));
   }
 }
 
@@ -1553,7 +1557,7 @@ static void UnitTestCuMatrixAddVecToCols() {
   Matrix<Real> Hm2(100,99);
   Dm.CopyToMat(&Hm2);
 
-  AssertEqual(Hm,Hm2);
+  KALDI_ASSERT(ApproxEqual(Hm,Hm2));
 }
 
 
@@ -1576,7 +1580,7 @@ static void UnitTestCuMatrixAddVecToRows() {
   Matrix<Real> Hm2(100,99);
   Dm.CopyToMat(&Hm2);
 
-  AssertEqual(Hm,Hm2);
+  KALDI_ASSERT(ApproxEqual(Hm,Hm2));
 }
 
 
@@ -1603,7 +1607,7 @@ static void UnitTestCuMatrixSymAddMat2() {
 
     CuTpMatrix<Real> T1(M), T2(M2);
     CuMatrix<Real> X1(T1), X2(T2); // so we can test equality.
-    AssertEqual(X1, X2);
+    KALDI_ASSERT(ApproxEqual(X1, X2));
     KALDI_ASSERT(dimM == 0 || X1.Trace() != 0);
   }
 }
@@ -1683,8 +1687,8 @@ static void UnitTestCuMatrixAddMatMat() {
   Dc1.CopyToMat(&Hc1a);
   Dc2.CopyToMat(&Hc2a);
 
-  AssertEqual(Hc1,Hc1a);
-  AssertEqual(Hc2,Hc2a);
+  KALDI_ASSERT(ApproxEqual(Hc1,Hc1a));
+  KALDI_ASSERT(ApproxEqual(Hc2,Hc2a));
 }
 
 
@@ -1708,7 +1712,7 @@ static void UnitTestCuMatrixAddVecVec() {
   Matrix<Real> A2(100, 200);
   CuA.CopyToMat(&A2);
 
-  AssertEqual(A,A2);
+  KALDI_ASSERT(ApproxEqual(A,A2));
 }
 
 
@@ -1773,8 +1777,8 @@ static void UnitTestCuMatrixAddMatMatBatched() {
     (*HC2[i]).AddMatMat(0.5f, *(HA[i]), kTrans, *(HB[i]), kTrans, 0.0f);
     DC1[i]->CopyToMat(&Hca1);
     DC2[i]->CopyToMat(&Hca2);
-    AssertEqual(*(HC1[i]), Hca1);
-    AssertEqual(*(HC2[i]), Hca2);
+    KALDI_ASSERT(ApproxEqual(*(HC1[i]), Hca1));
+    KALDI_ASSERT(ApproxEqual(*(HC2[i]), Hca2));
     delete Ha[i]; delete Hb[i]; delete Hc1[i]; delete Hc2[i];
     delete HA[i]; delete HB[i]; delete HC1[i]; delete HC2[i];
     delete Da[i]; delete Db[i]; delete Dc1[i]; delete Dc2[i];
@@ -1796,7 +1800,7 @@ static void UnitTestCuMatrixAddToDiag() {
     M.AddToDiag(alpha);
     Mc.AddToDiag(alpha);
     Matrix<Real> M2(Mc);
-    AssertEqual(M, M2);
+    KALDI_ASSERT(ApproxEqual(M, M2));
   }
 }
 
@@ -1810,7 +1814,7 @@ static void UnitTestCuMatrixAdd2() {
     M.Add(alpha);
     Mc.Add(alpha);
     Matrix<Real> M2(Mc);
-    AssertEqual(M, M2);
+    KALDI_ASSERT(ApproxEqual(M, M2));
   }
 }
 
@@ -1826,7 +1830,7 @@ static void UnitTestCuMatrixCopyFromMat() {
     CuMatrix<Real> B(dim, dim);
     B.CopyFromMat(E);
 
-    AssertEqual<Real>(B, E);
+    KALDI_ASSERT(ApproxEqual<Real>(B, E));
   }
 }
 
@@ -1842,7 +1846,7 @@ static void UnitTestCuMatrixCopyFromTp() {
     B.CopyFromTp(A, kNoTrans);
     C.CopyFromTp(E, kNoTrans);
     CuMatrix<Real> D(B);
-    AssertEqual<Real>(D, C);
+    KALDI_ASSERT(ApproxEqual<Real>(D, C));
   }
 }
 
@@ -1865,7 +1869,7 @@ static void UnitTestCuMatrixAddMatTp() {
     D.AddMatTp(1.0, E, kNoTrans, F, kNoTrans, 1.0);
 
     CuMatrix<Real> G(A);
-    AssertEqual<Real>(G, D);
+    KALDI_ASSERT(ApproxEqual<Real>(G, D));
   }
 }
 
@@ -1884,7 +1888,7 @@ static void UnitTestCuMatrixTranspose() {
     Matrix<Real> hA(A);
     Matrix<Real> hB(B);
     hB.Transpose();
-    AssertEqual(hA, hB);
+    KALDI_ASSERT(ApproxEqual(hA, hB));
   }
 }
 
@@ -1907,7 +1911,7 @@ static void UnitTestCuMatrixAddTpMat() {
     D.AddTpMat(1.0, F, kNoTrans, E, kNoTrans, 1.0);
 
     CuMatrix<Real> G(A);
-    AssertEqual<Real>(G, D);
+    KALDI_ASSERT(ApproxEqual<Real>(G, D));
   }
 }
 
@@ -1933,7 +1937,7 @@ static void UnitTestCuVectorAddVec() {
   Vector<Real> Hv2(777);
   Dv.CopyToVec(&Hv2);
 
-  AssertEqual(Hv,Hv2);
+  KALDI_ASSERT(ApproxEqual(Hv,Hv2));
 }
 
 
@@ -1964,7 +1968,7 @@ static void UnitTestCuVectorAddRowSumMat() {
   Vector<Real> Hv2(Y);
   Dv.CopyToVec(&Hv2);
 
-  AssertEqual(Hv,Hv2);
+  KALDI_ASSERT(ApproxEqual(Hv,Hv2));
 }
 
 
@@ -1992,7 +1996,7 @@ static void UnitTestCuVectorAddRowSumMatLarge() {
   Vector<Real> Hv2(990);
   Dv.CopyToVec(&Hv2);
 
-  AssertEqual(Hv,Hv2);
+  KALDI_ASSERT(ApproxEqual(Hv,Hv2));
 }
 
 
@@ -2023,7 +2027,7 @@ static void UnitTestCuVectorAddColSumMat() {
   Vector<Real> Hv2(X);
   Dv.CopyToVec(&Hv2);
 
-  AssertEqual(Hv,Hv2);
+  KALDI_ASSERT(ApproxEqual(Hv,Hv2));
 }
 
 template<typename Real>
@@ -2068,7 +2072,7 @@ static void UnitTestCuVectorAddColSumMatLarge() {
   Vector<Real> Hv2(1000);
   Dv.CopyToVec(&Hv2);
 
-  AssertEqual(Hv,Hv2);
+  KALDI_ASSERT(ApproxEqual(Hv,Hv2));
 }
 
 
@@ -2087,7 +2091,7 @@ static void UnitTestCuVectorInvertElements() {
   Vector<Real> Hv2(777);
   Dv.CopyToVec(&Hv2);
 
-  AssertEqual(Hv,Hv2);
+  KALDI_ASSERT(ApproxEqual(Hv,Hv2));
 }
 
 template<typename Real>
@@ -2104,7 +2108,7 @@ static void UnitTestCuMatrixInvertElements() {
   Matrix<Real> Hm2(77, 77);
   Dm.CopyToMat(&Hm2);
 
-  AssertEqual(Hm,Hm2);
+  KALDI_ASSERT(ApproxEqual(Hm,Hm2));
 }
 
 
@@ -2123,7 +2127,7 @@ static void UnitTestCuMatrixIO() {
     CuMatrix<Real> mat2;
     std::istringstream is(os.str());
     mat2.Read(is, binary);
-    AssertEqual(mat, mat2);
+    KALDI_ASSERT(ApproxEqual(mat, mat2));
   }
 }
 
@@ -2151,7 +2155,7 @@ static void UnitTestCuVectorAddTpVec() {
   Vector<Real> Hv2(300);
   Dv.CopyToVec(&Hv2);
 
-  AssertEqual(Hv,Hv2);
+  KALDI_ASSERT(ApproxEqual(Hv,Hv2));
 }
 
 template<typename Real>
@@ -2189,7 +2193,7 @@ static void UnitTestCuVectorMulTp() {
   Vector<Real> Hv2(300);
   Dv.CopyToVec(&Hv2);
 
-  AssertEqual(Hv,Hv2);
+  KALDI_ASSERT(ApproxEqual(Hv,Hv2));
 }
 
 template<typename Real, typename OtherReal>
@@ -2243,7 +2247,7 @@ static void UnitTestCuSigmoid() {
   Matrix<Real> Ho2(100,111);
   Do.CopyToMat(&Ho2);
 
-  AssertEqual(Ho,Ho2);
+  KALDI_ASSERT(ApproxEqual(Ho,Ho2));
 }
 
 
@@ -2274,7 +2278,7 @@ static void UnitTestCuDiffSigmoid() {
   Matrix<Real> Ho2(100,111);
   Do.CopyToMat(&Ho2);
 
-  AssertEqual(Ho,Ho2);
+  KALDI_ASSERT(ApproxEqual(Ho,Ho2));
 }
 
 
@@ -2317,7 +2321,7 @@ static void UnitTestCuDiffSoftmax() {
     Matrix<Real> Ho2(m, n);
     Do.CopyToMat(&Ho2);
 
-    AssertEqual(Ho, Ho2);
+    KALDI_ASSERT(ApproxEqual(Ho, Ho2));
   }
 }
 
@@ -2362,7 +2366,7 @@ static void UnitTestCuDiffLogSoftmax() {
     Matrix<Real> Ho2(m, n);
     Do.CopyToMat(&Ho2);
 
-    AssertEqual(Ho, Ho2);
+    KALDI_ASSERT(ApproxEqual(Ho, Ho2));
  }
 }
 
@@ -2399,7 +2403,7 @@ static void UnitTestCuSoftmax() {
 
     Matrix<Real> Ho2(Do);
 
-    AssertEqual(Ho,Ho2,0.00001);
+    KALDI_ASSERT(ApproxEqual(Ho,Ho2,(Real)0.00001));
   }
 }
 
@@ -2436,7 +2440,7 @@ static void UnitTestCuLogSoftmax() {
 
     Matrix<Real> Ho2(Do);
 
-    AssertEqual(Ho, Ho2, 0.00001);
+    KALDI_ASSERT(ApproxEqual(Ho, Ho2, (Real)0.00001));
   }
 }
 
@@ -2469,7 +2473,12 @@ static void UnitTestCuFindRowMaxId() {
     std::vector<int32> Hmax2(dimM);
     Dmax.CopyToVec(&Hmax2);
 
-    KALDI_ASSERT(Hmax == Hmax2);
+    // If the maximum value occurs more than once in a row, the GPU and CPU
+    // may return different column indexes; both would be correct.  Thus
+    // check that the max value of each row is the same, not the index.
+    for (MatrixIndexT r=0; r<Hi.NumRows(); r++) {
+      KALDI_ASSERT(Hi(r, Hmax[r]) == Di(r, Hmax2[r]));
+    }
   }
 }
 
@@ -2508,8 +2517,8 @@ static void UnitTestCuDiffXent() {
   Vector<Real> Hlogpost2(X);
   Dlogpost.CopyToVec(&Hlogpost2);
 
-  AssertEqual(Hi,Hi2);
-  AssertEqual(Hlogpost,Hlogpost2);
+  KALDI_ASSERT(ApproxEqual(Hi,Hi2));
+  KALDI_ASSERT(ApproxEqual(Hlogpost,Hlogpost2));
 }
 
 template<typename Real> void UnitTestCheck() {
@@ -2542,8 +2551,8 @@ void UnitTestSwapCu2Cu() {
   Di.CopyToMat(&Hf);
   Matrix<Real> Hf2(Di2.NumRows(), Di2.NumCols());
   Di2.CopyToMat(&Hf2);
-  AssertEqual(Hi,Hf2);
-  AssertEqual(Hi2,Hf);
+  KALDI_ASSERT(ApproxEqual(Hi,Hf2));
+  KALDI_ASSERT(ApproxEqual(Hi2,Hf));
 }
 
 template<typename Real>
@@ -2561,8 +2570,8 @@ void UnitTestSwapCu2M() {
   Di.Swap(&Hi2);
   Matrix<Real> Hf(Di.NumRows(), Di.NumCols());
   Di.CopyToMat(&Hf);
-  AssertEqual(Di2,Hf);
-  AssertEqual(Hi2,Hi);
+  KALDI_ASSERT(ApproxEqual(Di2,Hf));
+  KALDI_ASSERT(ApproxEqual(Hi2,Hi));
 }
 
 
@@ -2582,7 +2591,7 @@ void UnitTestCuTanh() {
   //cpu
   Matrix<Real> Hf(H.NumRows(), H.NumCols());
   Hf.Tanh(H);
-  AssertEqual(Df,Hf);
+  KALDI_ASSERT(ApproxEqual(Df,Hf));
 }
 
 template<typename Real>
@@ -2611,7 +2620,7 @@ static void UnitTestCuDiffTanh() {
   Matrix<Real> Ho2(100,111);
   Do.CopyToMat(&Ho2);
 
-  AssertEqual(Ho,Ho2);
+  KALDI_ASSERT(ApproxEqual(Ho,Ho2));
 }
 
 // just need this for testing function below.  Compute n!!
@@ -2633,7 +2642,7 @@ static void UnitTestCuMatrixSetRandn() {
     M.SetRandn();
     srand(104);
     N.SetRandn();
-    AssertEqual(M, N);
+    KALDI_ASSERT(ApproxEqual(M, N));
   }
 
   for (int32 i = 0; i < 5; i++) {
@@ -2822,29 +2831,37 @@ static void UnitTestCuMatrixAddElements() {
     CuMatrix<Real> M(H);
     int32 num_elements = 100 + Rand() % 10;
     std::vector<MatrixElement<Real> > input;
-    std::vector<Int32Pair> input_index;
+    std::set<Int32Pair> input_index;      //Set used to ensure unique elements
+    std::vector<Int32Pair> input_index_v;
     Real *input_value = new Real[num_elements];
     BaseFloat scale = -1 + (0.33 * (Rand() % 5));
     for (int32 j = 0; j < num_elements; j++) {
-      MatrixIndexT r = Rand() % dimM;
-      MatrixIndexT c = Rand() % dimN;
       Int32Pair tmp_pair;
-      tmp_pair.first = r;
-      tmp_pair.second = c;
+      // Generate a unique random index
+      do {
+        tmp_pair.first = Rand() % dimM;
+        tmp_pair.second = Rand() % dimN;
+      } while (input_index.find(tmp_pair)!=input_index.end());
+      input_index.insert(tmp_pair);  
+
+      MatrixIndexT r = tmp_pair.first;
+      MatrixIndexT c = tmp_pair.second;
+      input_index_v.push_back(tmp_pair);
+
       Real offset = -1 + (0.33 * (Rand() % 5));
       M(r, c) += scale * offset;
       MatrixElement<Real> t = {r, c, offset};
       input.push_back(t);
-      input_index.push_back(tmp_pair);
       input_value[j] = offset;
     }
+    
     H.AddElements(scale, input);
-    CuArray<Int32Pair> cu_input_index(input_index);
+    CuArray<Int32Pair> cu_input_index(input_index_v);
     H_copy.AddElements(scale, cu_input_index, input_value);
     delete[] input_value;
 
-    AssertEqual(H, M);
-    AssertEqual(H_copy, M);
+    KALDI_ASSERT(ApproxEqual(H, M));
+    KALDI_ASSERT(ApproxEqual(H_copy, M));
   }
 }
 
@@ -2866,7 +2883,7 @@ static void UnitTestCuMatrixAddToElements() {
     }
     CuArray<int32> cu_elements(elements);
     A_copy.AddToElements(alpha, cu_elements);
-    AssertEqual(A_copy, A);
+    KALDI_ASSERT(ApproxEqual(A_copy, A));
   }
 }
 
diff --git a/src/cudamatrix/cu-matrixdim.h b/src/cudamatrix/cu-matrixdim.h
index 74912dad6e3..248e08199a1 100644
--- a/src/cudamatrix/cu-matrixdim.h
+++ b/src/cudamatrix/cu-matrixdim.h
@@ -79,6 +79,14 @@ extern "C" {
     int32_cuda first;
     int32_cuda second;
   } Int32Pair;
+
+  // Lexicographic ordering on (first, second), so that Int32Pair can be used
+  // as the key of an ordered container such as std::set.
+  inline bool operator<(const Int32Pair &a, const Int32Pair &b) {
+    if (a.first != b.first)
+      return a.first < b.first;
+    return a.second < b.second;
+  }
 }
 
 #endif

From c415cba14e5c0d6b79a3366205d971b4d5631acc Mon Sep 17 00:00:00 2001
From: Hugo Braun <to@hugobraun.me>
Date: Fri, 26 Apr 2019 18:23:42 +0200
Subject: [PATCH 088/163] [src] Adding GPU/CUDA lattice batched decoder +
 binary (#3114)

---
 src/Makefile                                  |   13 +-
 src/cudadecoder/Makefile                      |   33 +
 src/cudadecoder/README                        |  141 ++
 .../batched-threaded-nnet3-cuda-pipeline.cc   |  651 ++++++
 .../batched-threaded-nnet3-cuda-pipeline.h    |  272 +++
 src/cudadecoder/cuda-decodable-itf.h          |   33 +
 src/cudadecoder/cuda-decoder-common.h         |  468 ++++
 src/cudadecoder/cuda-decoder-kernels-utils.h  |  215 ++
 src/cudadecoder/cuda-decoder-kernels.cu       | 1878 +++++++++++++++++
 src/cudadecoder/cuda-decoder-kernels.h        |  193 ++
 src/cudadecoder/cuda-decoder.cc               | 1594 ++++++++++++++
 src/cudadecoder/cuda-decoder.h                |  755 +++++++
 src/cudadecoder/cuda-fst.cc                   |  209 ++
 src/cudadecoder/cuda-fst.h                    |  122 ++
 src/cudadecoder/decodable-cumatrix.cc         |   62 +
 src/cudadecoder/decodable-cumatrix.h          |   71 +
 src/cudadecoder/thread-pool.h                 |  117 +
 src/cudadecoderbin/Makefile                   |   27 +
 src/cudadecoderbin/batched-wav-nnet3-cuda.cc  |  306 +++
 src/nnet2/nnet-component-test.cc              |  909 --------
 tools/config/common_path.sh                   |    1 +
 21 files changed, 7155 insertions(+), 915 deletions(-)
 create mode 100644 src/cudadecoder/Makefile
 create mode 100644 src/cudadecoder/README
 create mode 100644 src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc
 create mode 100644 src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h
 create mode 100644 src/cudadecoder/cuda-decodable-itf.h
 create mode 100644 src/cudadecoder/cuda-decoder-common.h
 create mode 100644 src/cudadecoder/cuda-decoder-kernels-utils.h
 create mode 100644 src/cudadecoder/cuda-decoder-kernels.cu
 create mode 100644 src/cudadecoder/cuda-decoder-kernels.h
 create mode 100644 src/cudadecoder/cuda-decoder.cc
 create mode 100644 src/cudadecoder/cuda-decoder.h
 create mode 100644 src/cudadecoder/cuda-fst.cc
 create mode 100644 src/cudadecoder/cuda-fst.h
 create mode 100644 src/cudadecoder/decodable-cumatrix.cc
 create mode 100644 src/cudadecoder/decodable-cumatrix.h
 create mode 100644 src/cudadecoder/thread-pool.h
 create mode 100644 src/cudadecoderbin/Makefile
 create mode 100644 src/cudadecoderbin/batched-wav-nnet3-cuda.cc
 delete mode 100644 src/nnet2/nnet-component-test.cc

diff --git a/src/Makefile b/src/Makefile
index d63e642b095..5771df33d2a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -4,12 +4,13 @@
 
 SHELL := /bin/bash
 
-
 SUBDIRS = base matrix util feat tree gmm transform \
           fstext hmm lm decoder lat kws cudamatrix \
           bin fstbin gmmbin fgmmbin featbin \
-          latbin nnet3 rnnlm chain nnet3bin kwsbin \
-          ivector ivectorbin online2 online2bin lmbin chainbin rnnlmbin
+          latbin sgmm2 sgmm2bin nnet3 rnnlm chain nnet3bin kwsbin \
+          ivector ivectorbin online2 online2bin lmbin chainbin rnnlmbin \
+          cudadecoder cudadecoderbin
+
 
 MEMTESTDIRS = base matrix util feat tree gmm transform \
           fstext hmm lm decoder lat kws chain \
@@ -127,7 +128,7 @@ $(EXT_SUBDIRS) : checkversion kaldi.mk mklibdir ext_depend
 ### Dependency list ###
 # this is necessary for correct parallel compilation
 #1)The tools depend on all the libraries
-bin fstbin gmmbin fgmmbin sgmm2bin featbin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin: \
+bin fstbin gmmbin fgmmbin sgmm2bin featbin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin cudadecoderbin: \
  base matrix util feat tree gmm transform sgmm2 fstext hmm \
  lm decoder lat cudamatrix nnet3 ivector chain kws online2 rnnlm
 
@@ -146,8 +147,6 @@ lm: base util matrix fstext
 decoder: base util matrix gmm hmm tree transform lat
 lat: base util hmm tree matrix
 cudamatrix: base util matrix
-nnet: base util hmm tree matrix cudamatrix
-nnet2: base util matrix lat gmm hmm tree transform cudamatrix
 nnet3: base util matrix decoder lat gmm hmm tree transform cudamatrix chain fstext
 rnnlm: base util matrix cudamatrix nnet3 lm hmm
 chain: lat hmm tree fstext matrix cudamatrix util base
@@ -158,3 +157,5 @@ onlinebin: base matrix util feat tree gmm transform sgmm2 fstext hmm lm decoder
 online: decoder gmm transform feat matrix util base lat hmm tree
 online2: decoder gmm transform feat matrix util base lat hmm tree ivector cudamatrix nnet3 chain
 kws: base util hmm tree matrix lat
+cudadecoder:  cudamatrix online2 nnet3 ivector feat fstext lat chain transform
+cudadecoderbin: cudadecoder cudamatrix online2 nnet3 ivector feat fstext lat chain transform
diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile
new file mode 100644
index 00000000000..ede7cfddbe7
--- /dev/null
+++ b/src/cudadecoder/Makefile
@@ -0,0 +1,33 @@
+all:
+
+EXTRA_CXXFLAGS = -Wno-sign-compare
+include ../kaldi.mk
+
+ifeq ($(CUDA), true)
+
+# Make sure kaldi.mk defined CUDA_ARCH; if it did not, 'src/configure' must be run.
+ifndef CUDA_ARCH
+  $(error CUDA_ARCH is undefined, run 'src/configure')
+endif
+
+TESTFILES =
+
+OBJFILES = batched-threaded-nnet3-cuda-pipeline.o decodable-cumatrix.o \
+           cuda-decoder.o cuda-decoder-kernels.o cuda-fst.o
+
+LDFLAGS += $(CUDA_LDFLAGS)
+LDLIBS += $(CUDA_LDLIBS)
+
+LIBNAME = kaldi-cudadecoder
+
+ADDLIBS = ../cudamatrix/kaldi-cudamatrix.a ../base/kaldi-base.a ../matrix/kaldi-matrix.a \
+          ../lat/kaldi-lat.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../gmm/kaldi-gmm.a \
+          ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../gmm/kaldi-gmm.a ../transform/kaldi-transform.a \
+          ../tree/kaldi-tree.a ../online2/kaldi-online2.a ../nnet3/kaldi-nnet3.a
+
+# Implicit rule for kernel compilation
+%.o : %.cu
+	$(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC)
+endif
+
+include ../makefiles/default_rules.mk
diff --git a/src/cudadecoder/README b/src/cudadecoder/README
new file mode 100644
index 00000000000..64aeee3fa35
--- /dev/null
+++ b/src/cudadecoder/README
@@ -0,0 +1,141 @@
+CUDADECODER USAGE AND TUNING GUIDE
+
+INTRODUCTION:
+
+The CudaDecoder was developed by NVIDIA with coordination from Johns Hopkins.
+This work was intended to demonstrate efficient GPU utilization across a range 
+of NVIDIA hardware from SM_35 and on.  The following guide describes how to 
+use and tune the decoder for your models.
+
+A single speech-to-text decode is not enough work to saturate an NVIDIA GPU.
+To fully saturate GPUs we need to decode many audio files concurrently.  The
+solution provided does this through a combination of batching many audio files
+into a single speech pipeline, running multiple pipelines in parallel on the
+device, and using multiple CPU threads to perform feature extraction and
+determinization.  Users of the decoder will need to have a high level
+understanding of the underlying implementation to know how to tune the
+decoder.
+
+The interface to the decoder is defined in
+"batched-threaded-nnet3-cuda-pipeline.h".  A binary example can be found in
+"cudadecoderbin/batched-wav-nnet3-cuda.cc".  Below is a simple usage example.
+/*
+ *  BatchedThreadedCudaDecoderConfig batchedDecoderConfig;
+ *  batchedDecoderConfig.Register(&po);
+ *  po.Read(argc, argv);
+ *  ...
+ *  BatchedThreadedCudaDecoder CudaDecoder(batchedDecoderConfig);
+ *  CudaDecoder.Initialize(*decode_fst, am_nnet, trans_model);
+ *  ...
+ *
+ *  for (; !wav_reader.Done(); wav_reader.Next()) {
+ *    std::string key = wav_reader.Key();
+ *    CudaDecoder.OpenDecodeHandle(key, wav_reader.Value());
+ *    ...
+ *  }
+ *
+ *  while (!processed.empty()) {
+ *    CompactLattice clat;
+ *    CudaDecoder.GetLattice(key, &clat);
+ *    CudaDecoder.CloseDecodeHandle(key);
+ *    ...
+ *  }
+ *
+ *  CudaDecoder.Finalize();
+ */
+
+In the code above we first declare a BatchedThreadedCudaDecoderConfig
+and register its options.  This enables us to tune the configuration
+options.   Next we declare the CudaDecoder with that configuration.
+Before we can use the CudaDecoder we need to initialize it with an
+FST, AmNnetSimple, and TransitionModel.
+
+Next we iterate through waves and enqueue them into the decoder by
+calling OpenDecodeHandle.  Note the key must be unique for each
+decode. Once we have enqueued work we can query the results by calling
+GetLattice on the same key we opened the handle on.  This will automatically
+wait for processing to complete before returning.
+
+The key to get performance is to have many decodes active at the same time
+by opening many decode handles before querying for the lattices.
+
+
+PERFORMANCE TUNING:
+
+The CudaDecoder has a lot of tuning parameters which should be used to
+increase performance on various models and hardware.  Note that it is
+expected that the optimal parameters will vary according to the hardware,
+the model, and the data being decoded.
+
+The following will briefly describe each parameter:
+
+BatchedThreadedCudaDecoderOptions:
+  cuda-control-threads:  Number of CPU threads simultaneously submitting work
+    to the device.  For best performance this should be between 2-4.
+  cuda-worker-threads:  CPU threads for worker tasks like determinization and
+    feature extraction.  For best performance this should take up all spare
+    CPU threads available on the system.
+  max-batch-size:  Maximum batch size in a single pipeline.  This should be as
+    large as possible but is expected to be between 50-200.  
+  batch-drain-size:  How far to drain the batch before getting new work.
+    Draining the batch allows nnet3 to be better batched.  Testing has 
+    indicated that 10-30% of max-batch-size is ideal.
+  determinize-lattice:  Use cuda-worker-threads to determinize the lattice. If
+    this is true then GetRawLattice can no longer be called.
+  max-outstanding-queue-length:  The maximum number of decodes that can be
+    queued and not assigned before OpenDecodeHandle will automatically stall 
+    the submitting thread.  Raising this increases CPU resources.  This should 
+    be set to a few thousand at least.
+
+Decoder Options:
+  beam:  The width of the beam during decoding
+  lattice-beam:  The width of the lattice beam
+  ntokens-preallocated:  number of tokens allocated in host buffers.  If
+    this size is exceeded the buffer will reallocate larger consuming more
+    resources
+  max-tokens-per-frame:  maximum tokens in GPU memory per frame.  If this
+    value is exceeded the beam will tighten and accuracy may decrease.
+  max-active: at the end of each frame computation, we keep only its best max-active tokens (arc instantiations)
+
+Device Options:
+  use-tensor-cores:  Enables tensor core (fp16 math) for gemms.  This is
+    faster but less accurate.  For inference the loss of accuracy is marginal
+
+GPU MEMORY USAGE:
+
+GPU memory is limited.  Large GPUs have between 16-32GB of memory.  Consumer
+GPUs have much less.  For best performance users should have as many
+concurrent decodes as possible.  Thus users should purchase GPUs with as
+much memory as possible.  GPUs with less memory may have to sacrifice either
+performance or accuracy.  On 16GB GPUs for example we are able to support
+around 200 concurrent decodes at a time. This translates into 4
+cuda-control-threads and a max-batch-size of 50 (4x50).  If your model is
+larger or smaller than the models we used when testing you may have to
+raise or lower this.
+
+There are a number of parameters which can be used to control GPU memory
+usage. How they impact memory usage and accuracy is discussed below:
+
+  max-tokens-per-frame: Controls how many buffers can be stored on the GPU for
+    each frame.  This buffer size cannot be exceeded or reallocated.  As this
+    buffer gets closer to being exhausted the beam is reduced possibly reducing
+    quality.  This should be tuned according to the model and data.  For
+    example, a highly accurate model could set this value smaller to enable
+    more concurrent decodes.
+
+  cuda-control-threads:  Each control thread is a concurrent pipeline.  Thus
+    the GPU memory scales linearly with this parameter.  This should always be
+    at least 2 but should probably not be higher than 4 as more concurrent
+    pipelines leads to more driver contention reducing performance.
+
+  max-batch-size:  The number of concurrent decodes in each pipeline.  The
+    memory usage also scales linearly with this parameter.  Setting this smaller
+    will reduce kernel runtime while increasing launch latency overhead.
+    Ideally this should be as large as possible while still fitting into
+    memory.  Note that currently the maximum allowed is 200.
+
+== Acknowledgement ==
+
+We would like to thank Daniel Povey, Zhehuai Chen and Daniel Galvez for their help and expertise during the review process.
+
+
diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc
new file mode 100644
index 00000000000..34c7ea06a9d
--- /dev/null
+++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc
@@ -0,0 +1,651 @@
+// cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc
+//
+// Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+// Hugo Braun, Justin Luitjens, Ryan Leary
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#define SLEEP_BACKOFF_NS 500
+#define SLEEP_BACKOFF_S ((double)SLEEP_BACKOFF_NS/1e9)
+#if HAVE_CUDA == 1
+
+#include "cudadecoder/batched-threaded-nnet3-cuda-pipeline.h"
+#include "base/kaldi-utils.h"
+#include <nvToolsExt.h>
+
+namespace kaldi {
+namespace cuda_decoder {
+
+void BatchedThreadedNnet3CudaPipeline::Initialize(
+    const fst::Fst<fst::StdArc> &decode_fst, const nnet3::AmNnetSimple &am_nnet,
+    const TransitionModel &trans_model) {
+  KALDI_LOG << "BatchedThreadedNnet3CudaPipeline Initialize with "
+            << config_.num_control_threads << " control threads, "
+            << config_.num_worker_threads << " worker threads"
+            << " and batch size " << config_.max_batch_size;
+
+  am_nnet_ = &am_nnet;
+  trans_model_ = &trans_model;
+  cuda_fst_.Initialize(decode_fst, trans_model_);
+
+  feature_info_ = new OnlineNnet2FeaturePipelineInfo(config_.feature_opts);
+  feature_info_->ivector_extractor_info.use_most_recent_ivector = true;
+  feature_info_->ivector_extractor_info.greedy_ivector_extractor = true;
+
+  // reserve one std::thread slot per control thread (they are started below)
+  thread_contexts_.resize(config_.num_control_threads);
+
+  // create the circular work queue; indices wrap modulo max_pending_tasks + 1
+  pending_task_queue_ = new TaskState *[config_.max_pending_tasks + 1];
+  tasks_front_ = 0;
+  tasks_back_ = 0;
+
+  // ensure all allocations/kernels above are complete before launching threads
+  // in different streams.
+  cudaStreamSynchronize(cudaStreamPerThread);
+
+  // Create threadpool for CPU work
+  work_pool_ = new ThreadPool(config_.num_worker_threads);
+
+  exit_ = false;
+  numStarted_ = 0;
+
+  // start the control threads; thread i runs ExecuteWorker(i)
+  for (int i = 0; i < config_.num_control_threads; i++) {
+    thread_contexts_[i] =
+        std::thread(&BatchedThreadedNnet3CudaPipeline::ExecuteWorker, this, i);
+  }
+
+  // wait for threads to start to ensure allocation time isn't in the timings
+  while (numStarted_ < config_.num_control_threads)
+    kaldi::Sleep(SLEEP_BACKOFF_S);
+}
+void BatchedThreadedNnet3CudaPipeline::Finalize() {
+  // Stops the control threads, then releases what Initialize() allocated.
+  // Tell threads to exit and join them
+  exit_ = true;
+
+  for (int i = 0; i < config_.num_control_threads; i++) {
+    thread_contexts_[i].join();
+  }
+
+  cuda_fst_.Finalize();
+
+  // these were allocated with new / new[] in Initialize()
+  delete feature_info_;
+  delete work_pool_;
+  delete[] pending_task_queue_;
+}
+
+// Query a specific key to see if compute on it is complete.  The lookup lock
+// is held for the whole query so 'it' is never used after the lock is dropped.
+bool BatchedThreadedNnet3CudaPipeline::isFinished(const std::string &key) {
+  std::lock_guard<std::mutex> lock(tasks_lookup_mutex_);
+  auto it = tasks_lookup_.find(key);
+  KALDI_ASSERT(it != tasks_lookup_.end());
+  return it->second.finished;
+}
+
+// remove an audio file from the decoding and clean up resources
+void BatchedThreadedNnet3CudaPipeline::CloseDecodeHandle(const std::string &key) {
+  tasks_lookup_mutex_.lock();
+  auto it = tasks_lookup_.find(key);
+  KALDI_ASSERT(it != tasks_lookup_.end());
+  // resolve the entry while locked; 'it' must not be used after unlocking
+  TaskState *task = &it->second;
+  tasks_lookup_mutex_.unlock();
+
+  // wait for task to finish processing
+  while (task->finished != true)
+    kaldi::Sleep(SLEEP_BACKOFF_S);
+  // erase by key: other threads may have inserted entries during the wait
+  tasks_lookup_mutex_.lock();
+  tasks_lookup_.erase(key);
+  tasks_lookup_mutex_.unlock();
+}
+
+// Adds a decoding task for 'wave_data' under 'key'; 'key' must not be in use.
+void BatchedThreadedNnet3CudaPipeline::OpenDecodeHandle(const std::string &key,
+                                                  const WaveData &wave_data) {
+
+  // ensure key is unique (checked while holding the lookup lock)
+  tasks_lookup_mutex_.lock();
+  KALDI_ASSERT(tasks_lookup_.end() == tasks_lookup_.find(key));
+
+  // Create a new task in lookup map
+  TaskState *task = &tasks_lookup_[key];
+  tasks_lookup_mutex_.unlock();
+
+  task->Init(key, wave_data);
+
+  // feature extraction is queued on the worker pool rather than run here
+  work_pool_->enqueue(&BatchedThreadedNnet3CudaPipeline::ComputeOneFeature, this,
+                      task);
+}
+
+// Adds a decoding task from raw samples at 'sample_rate'; 'key' must be unused.
+void BatchedThreadedNnet3CudaPipeline::OpenDecodeHandle(
+    const std::string &key, const VectorBase<BaseFloat> &wave_data,
+    float sample_rate) {
+  // ensure key is unique (checked while holding the lookup lock)
+  tasks_lookup_mutex_.lock();
+  KALDI_ASSERT(tasks_lookup_.end() == tasks_lookup_.find(key));
+
+  // Create a new task in lookup map
+  TaskState *task = &tasks_lookup_[key];
+  tasks_lookup_mutex_.unlock();
+
+  task->Init(key, wave_data, sample_rate);
+
+  // feature extraction is queued on the worker pool rather than run here
+  work_pool_->enqueue(&BatchedThreadedNnet3CudaPipeline::ComputeOneFeature, this,
+                      task);
+}
+
+bool BatchedThreadedNnet3CudaPipeline::GetRawLattice(const std::string &key,
+                                               Lattice *lat) {
+  nvtxRangePushA("GetRawLattice");
+  tasks_lookup_mutex_.lock();
+  auto it = tasks_lookup_.find(key);
+  KALDI_ASSERT(it != tasks_lookup_.end());
+  tasks_lookup_mutex_.unlock();
+
+  TaskState *task = &it->second;
+
+  // wait for task to finish.  This should happens automatically without
+  // intervention from the master thread.
+  while (task->finished == false)
+    kaldi::Sleep(SLEEP_BACKOFF_S);
+
+  // GetRawLattice on a determinized lattice is not supported (Per email from
+  // DanP)
+  KALDI_ASSERT(task->determinized == false);
+
+  if (task->error) {
+    nvtxRangePop();
+    return false;
+  }
+  // Store off the lattice
+  *lat = task->lat;
+  nvtxRangePop();
+  return true;
+}
+
+bool BatchedThreadedNnet3CudaPipeline::GetLattice(const std::string &key,
+                                            CompactLattice *clat) {
+  nvtxRangePushA("GetLattice");
+  tasks_lookup_mutex_.lock();
+  auto it = tasks_lookup_.find(key);
+  KALDI_ASSERT(it != tasks_lookup_.end());
+  tasks_lookup_mutex_.unlock();
+
+  TaskState *task = &it->second;
+
+  // wait for task to finish.  This should happens automatically without
+  // intervention from the master thread.
+  while (task->finished == false)
+    kaldi::Sleep(SLEEP_BACKOFF_S);
+
+  if (task->error) {
+    nvtxRangePop();
+    return false;
+  }
+
+  // if user has not requested a determinized lattice from the decoder then we
+  // must
+  // determinize it here since it was done done already.
+  if (!config_.determinize_lattice && task->determinized == false) {
+    // Determinzation was not done by worker threads so do it here
+    DeterminizeOneLattice(task);
+  }
+  *clat = task->dlat; // grab compact lattice
+  nvtxRangePop();
+  return true;
+}
+
+// Adds task to the pending task queue, retrying via work_pool_ when it is full
+void BatchedThreadedNnet3CudaPipeline::AddTaskToPendingTaskQueue(TaskState *task) {
+  tasks_add_mutex_.lock();
+  if (NumPendingTasks() == config_.max_pending_tasks) {
+    // task queue is full: re-enqueue this same call on the worker pool so it
+    // is retried later, and return now to make room for other work
+    work_pool_->enqueue(&BatchedThreadedNnet3CudaPipeline::AddTaskToPendingTaskQueue,
+                        this, task);
+  } else {
+    // there is room so let's add it
+    // insert at the back of the pending task queue
+    pending_task_queue_[tasks_back_] = task;
+    // advance the back index; it wraps modulo the buffer size
+    // (max_pending_tasks + 1, as allocated in Initialize())
+    tasks_back_ = (tasks_back_ + 1) % (config_.max_pending_tasks + 1);
+  }
+  tasks_add_mutex_.unlock();
+}
+
+// Attempts to fill the batch from the task queue: takes up to one pending task
+// per free channel, so it may not fully fill the batch.
+void BatchedThreadedNnet3CudaPipeline::AquireAdditionalTasks(
+    CudaDecoder &cuda_decoder, ChannelState &channel_state,
+    std::vector<TaskState *> &tasks) {
+  std::vector<ChannelId> &channels = channel_state.channels;
+  std::vector<ChannelId> &free_channels = channel_state.free_channels;
+
+  int tasksRequested = free_channels.size();
+  int tasksAssigned = 0;
+
+  tasks_mutex_.lock(); // lock required because front might change from other
+                       // workers
+  {
+    // compute number of tasks to grab
+    int tasksAvailable = NumPendingTasks();
+    tasksAssigned = std::min(tasksAvailable, tasksRequested);
+
+    // grab tasks from the front of the circular queue
+    for (int i = 0; i < tasksAssigned; i++) {
+      // NOTE(review): AddTaskToPendingTaskQueue guards tasks_back_ with
+      // tasks_add_mutex_, not tasks_mutex_ -- confirm this pairing is safe.
+      tasks.push_back(pending_task_queue_[tasks_front_]);
+      tasks_front_ = (tasks_front_ + 1) % (config_.max_pending_tasks + 1);
+    }
+  }
+  tasks_mutex_.unlock();
+
+  if (tasksAssigned > 0) {
+    // for each assigned task we have to do a little bookkeeping
+
+    // list of channels that need initialization
+    std::vector<ChannelId> init_channels(tasksAssigned);
+
+    for (int i = 0; i < tasksAssigned; i++) {
+      // assign a free channel
+      ChannelId channel = free_channels.back();
+      free_channels.pop_back();
+
+      // add channel to processing list
+      channels.push_back(channel);
+      // add new channel to initialization list
+      init_channels[i] = channel;
+    }
+
+    // Setup cuda_decoder channels
+    cuda_decoder.InitDecoding(init_channels);
+  }
+}
+
+// Runs nnet3 inference for tasks[first, tasks.size()) and stores each posterior
+void BatchedThreadedNnet3CudaPipeline::ComputeBatchNnet(
+    nnet3::NnetBatchComputer &computer, int32 first,
+    std::vector<TaskState *> &tasks) {
+  nvtxRangePushA("NNET3");
+
+  bool output_to_cpu = false;
+  int32 online_ivector_period = 0;
+  int max_pending_minibatches =
+      0; // zero means unlimited.  This API call should not block then.
+
+  // list of nnet tasks for each decode task (only [first, size) is filled in)
+  std::vector<std::vector<nnet3::NnetInferenceTask>> nnet_tasks(tasks.size());
+
+  // for all new decode tasks enqueue up nnet work.
+  for (int i = first; i < tasks.size(); i++) {
+    TaskState &task = *tasks[i];
+    Vector<BaseFloat> &ivector_features = task.ivector_features;
+    Matrix<BaseFloat> &input_features = task.input_features;
+    std::vector<nnet3::NnetInferenceTask> &ntasks = nnet_tasks[i];
+
+    Vector<BaseFloat> *ifeat = NULL;
+    if (ivector_features.Dim() > 0) {
+      ifeat = &ivector_features;
+    }
+    // create task list (the ivector is passed only when one was computed)
+    computer.SplitUtteranceIntoTasks(output_to_cpu, input_features, ifeat, NULL,
+                                     online_ivector_period, &ntasks);
+
+    // Add tasks to computer
+    for (size_t j = 0; j < ntasks.size(); j++) {
+      computer.AcceptTask(&ntasks[j], max_pending_minibatches);
+    }
+  }
+
+  // process all minibatches, we allow partial minibatches but this should only
+  // occur on the last iteration
+  bool allow_partial_minibatch = true;
+  while (computer.Compute(allow_partial_minibatch))
+    ;
+
+  // Extract Posteriors
+  for (int i = first; i < tasks.size(); i++) {
+    TaskState &task = *tasks[i];
+    CuMatrix<BaseFloat> &posteriors = task.posteriors;
+    MergeTaskOutput(nnet_tasks[i], &posteriors);
+
+    // nnet output is no longer necessary as we have copied the output out
+    nnet_tasks[i].resize(0);
+
+    // features are no longer needed so free memory
+    task.ivector_features.Resize(0);
+    task.input_features.Resize(0, 0);
+  }
+
+  nvtxRangePop();
+}
+
+// Extracts the features of one task, then adds it to the pending task queue.
+void BatchedThreadedNnet3CudaPipeline::ComputeOneFeature(TaskState *task_) {
+  nvtxRangePushA("ComputeOneFeature");
+  TaskState &task = *task_;
+  Vector<BaseFloat> &ivector_features = task.ivector_features;
+  Matrix<BaseFloat> &input_features = task.input_features;
+
+  // create decoding state
+  OnlineNnet2FeaturePipeline feature(*feature_info_);
+
+  // Accept waveforms
+  feature.AcceptWaveform(
+      task.sample_frequency,
+      SubVector<BaseFloat>(*task.wave_samples, 0, task.wave_samples->Dim()));
+  feature.InputFinished();
+  // All frames should be ready here
+  int32 numFrames = feature.NumFramesReady();
+  // If we don't have anything to do, we must return now
+  if (numFrames == 0) {
+    nvtxRangePop();  // balance the push above on this early-exit path too
+    task_->finished = true;
+    return;
+  }
+  int32 input_dim = feature.InputFeature()->Dim();
+
+  // create list of frames: every index in [0, numFrames)
+  std::vector<int> frames(numFrames);
+  for (int j = 0; j < numFrames; j++) frames[j] = j;
+
+  // Copy Features
+  input_features.Resize(numFrames, input_dim);
+  feature.InputFeature()->GetFrames(frames, &input_features);
+
+  // Ivectors are optional, if they were not provided skip this step
+  if (feature.IvectorFeature() != NULL) {
+    int32 ivector_dim = feature.IvectorFeature()->Dim();
+    ivector_features.Resize(ivector_dim);
+
+    // Copy Features: only the ivector of the last frame is kept
+    feature.IvectorFeature()->GetFrame(numFrames - 1, &ivector_features);
+  }
+  nvtxRangePop();
+
+  AddTaskToPendingTaskQueue(task_);
+}
+
+// Allocates one mapped decodable per task in tasks[first, tasks.size())
+void BatchedThreadedNnet3CudaPipeline::AllocateDecodables(
+    int32 first, std::vector<TaskState *> &tasks,
+    std::vector<CudaDecodableInterface *> &decodables) {
+  for (int t = first; t < tasks.size(); t++) {
+    // wrap this task's posteriors in a heap-allocated decodable
+    TaskState *cur_task = tasks[t];
+    decodables.push_back(
+        new DecodableCuMatrixMapped(*trans_model_, cur_task->posteriors, 0));
+  }
+}
+
+// Removes all completed channels from the channel list.  Each completed
+// channel is appended to both completed_channels and free_channels.
+void BatchedThreadedNnet3CudaPipeline::RemoveCompletedChannels(
+    CudaDecoder &cuda_decoder, ChannelState &channel_state,
+    std::vector<CudaDecodableInterface *> &decodables,
+    std::vector<TaskState *> &tasks) {
+
+  std::vector<ChannelId> &channels = channel_state.channels;
+  std::vector<ChannelId> &free_channels = channel_state.free_channels;
+  std::vector<ChannelId> &completed_channels = channel_state.completed_channels;
+
+  // Here we will reorder arrays to put finished decodes at the end
+  int cur = 0; // points to the current unchecked decode
+  int back = tasks.size() - completed_channels.size() -
+             1; // points to the last unchecked decode
+
+  // one iteration per active channel: each iteration examines slot 'cur',
+  // which is either swapped to the back (finished) or stepped over (still
+  // running), so tasks/channels/decodables stay index-aligned
+  for (int i = 0; i < channels.size(); i++) {
+    ChannelId channel = channels[cur];
+    int numDecoded = cuda_decoder.NumFramesDecoded(channel);
+    int toDecode = decodables[cur]->NumFramesReady();
+
+    if (toDecode == numDecoded) { // if current task is completed
+      // add channel to free and completed queues
+      completed_channels.push_back(channel);
+      free_channels.push_back(channel);
+
+      // Rearrange queues,
+      // move this element to end and end to this spot
+      std::swap(tasks[cur], tasks[back]);
+      std::swap(channels[cur], channels[back]);
+      std::swap(decodables[cur], decodables[back]);
+
+      // back is a completed decode so decrement it
+      back--;
+    } else {
+      // not completed move to next task
+      cur++;
+    } // end if completed[cur]
+  }   // end for loop
+
+  // removing finished channels from list ('cur' channels are still active)
+  channels.resize(cur);
+}
+
+// Post-decode step for the channels that completed (the tasks at indices
+// [channels.size(), tasks.size())).  For those we need to
+//  fetch the raw lattice from the decoder and free the decodable/posteriors
+//  determinize the lattice on the worker pool (if configured), or else
+//  mark the task finished right away
+void BatchedThreadedNnet3CudaPipeline::PostDecodeProcessing(
+    CudaDecoder &cuda_decoder, ChannelState &channel_state,
+    std::vector<CudaDecodableInterface *> &decodables,
+    std::vector<TaskState *> &tasks) {
+  std::vector<ChannelId> &channels = channel_state.channels;
+  std::vector<ChannelId> &completed_channels = channel_state.completed_channels;
+
+  // Collect the output lattice pointers for GetRawLattice
+  std::vector<Lattice *> lattices(completed_channels.size());
+  for (int i = 0; i < completed_channels.size(); i++) {
+    // reverse order of lattices to match channel order
+    // tasks order was reversed when reordering to the back
+    lattices[i] = &(tasks[tasks.size() - i - 1]->lat);
+  }
+
+  // Fetch the raw lattices of the completed channels from the decoder
+  cuda_decoder.GetRawLattice(completed_channels, lattices, true);
+
+  // clean up datastructures for completed tasks
+  for (int i = channels.size(); i < tasks.size(); i++) {
+    delete decodables[i];
+    tasks[i]->posteriors.Resize(0, 0);
+  }
+
+  if (config_.determinize_lattice) {
+    nvtxRangePushA("DeterminizeLattice");
+    // One more step to do on the tasks.  Determinize will mark task as
+    // finished.
+    for (int i = channels.size(); i < tasks.size(); i++) {
+      work_pool_->enqueue(&BatchedThreadedNnet3CudaPipeline::DeterminizeOneLattice,
+                          this, tasks[i]);
+    }
+    nvtxRangePop();
+  } else {
+    // Task is done and ready for consumption
+    for (int i = channels.size(); i < tasks.size(); i++) {
+      // notify master threads this work is complete
+      tasks[i]->finished = true;
+      ;
+    }
+  }
+
+  tasks.resize(channels.size());
+  decodables.resize(channels.size());
+  completed_channels.resize(0);
+}
+void BatchedThreadedNnet3CudaPipeline::DeterminizeOneLattice(TaskState *task) {
+  nvtxRangePushA("DeterminizeOneLattice");
+  TaskState &state = *task;
+  // Writes state.dlat; the raw lattice state.lat is destroyed in the process.
+  DeterminizeLatticePhonePrunedWrapper(*trans_model_, &state.lat,
+      config_.decoder_opts.lattice_beam, &state.dlat, config_.det_opts);
+  state.determinized = true;
+  state.finished = true;  // set after dlat and determinized are written
+  nvtxRangePop();
+}
+
+void BatchedThreadedNnet3CudaPipeline::ExecuteWorker(int threadId) {
+  // Body of one worker thread: pulls pending tasks and decodes them in
+  // batches until exit_ is set.  threadId is not used in this function.
+  // Initialize this thread's device
+  CuDevice::Instantiate();
+
+  // Data structures that are reusable across decodes but unique to each thread
+  CudaDecoder cuda_decoder(cuda_fst_, config_.decoder_opts,
+                           config_.max_batch_size);
+  nnet3::NnetBatchComputer computer(config_.compute_opts, am_nnet_->GetNnet(),
+                                    am_nnet_->Priors());
+
+  ChannelState channel_state;
+
+  std::vector<TaskState *> tasks; // The state for each decode
+  std::vector<CudaDecodableInterface *> decodables;
+
+  // Initialize reusable data structures
+  {
+    channel_state.channels.reserve(config_.max_batch_size);
+    channel_state.free_channels.reserve(config_.max_batch_size);
+    channel_state.completed_channels.reserve(config_.max_batch_size);
+    tasks.reserve(config_.max_batch_size);
+    decodables.reserve(config_.max_batch_size);
+
+    // add all channels to free channel list
+    for (int i = 0; i < config_.max_batch_size; i++) {
+      channel_state.free_channels.push_back(i);
+    }
+  }
+
+  numStarted_++; // Tell master I have started
+
+  // Main control loop.  At each iteration the thread checks whether it has
+  // been asked to shut down and exits if so.  The loop condition is only
+  // evaluated once all work assigned to this thread has been processed.
+  while (!exit_) {
+    // Main processing loop.  At each iteration the thread will do the
+    // following:
+    // 1) Attempt to grab more work.
+    // 2) Initialize any new work
+    // 3) Process work in a batch, repeating until batch_drain_size channels
+    //    have completed or no channel is active any more
+    // 4) Postprocess any completed work
+    do {
+      // 1) attempt to fill the batch
+      if (tasks_front_ != tasks_back_) { // if work is available grab more work
+        int start = tasks.size(); // Save the current assigned tasks size
+        AquireAdditionalTasks(cuda_decoder, channel_state, tasks);
+
+        // New tasks are now in tasks[start,tasks.size())
+        if (start != tasks.size()) { // if there are new tasks
+          ComputeBatchNnet(computer, start, tasks);
+          AllocateDecodables(start, tasks, decodables);
+        }
+      } // end if(tasks_front_!=tasks_back_)
+
+      // check if there is no active work on this thread.
+      // This can happen if another thread was assigned the work.
+      if (tasks.size() == 0) {
+        //Thread is spinning waiting for work.  Backoff.
+        kaldi::Sleep(SLEEP_BACKOFF_S);
+        break;
+      }
+
+      // try/catch to catch and report errors inside decoder.
+      // errors can be recoverable or non-recoverable
+      // unrecoverable errors will assert
+      // recoverable errors will cancel the batch (output empty lattice)
+      // and print a warning.
+      // There should be no errors and this is just a sanity check
+      try {
+        // This is in a loop in case we want to drain the batch a little.
+        // Draining the batch will cause initialization tasks to be batched.
+        do {
+          // 3) Process outstanding work in a batch
+          // Advance decoding on all open channels
+          cuda_decoder.AdvanceDecoding(channel_state.channels, decodables);
+
+          // Adjust channel state for all completed decodes
+          RemoveCompletedChannels(cuda_decoder, channel_state, decodables,
+                                  tasks);
+
+          // do loop repeats until we meet drain size or run out of work
+        } while (channel_state.completed_channels.size() <
+                     config_.batch_drain_size &&
+                 channel_state.channels.size() > 0);
+
+        // 4) Post process work.  This copies the results of the completed
+        // decodes out and cleans up their data structures
+        PostDecodeProcessing(cuda_decoder, channel_state, decodables, tasks);
+      } catch (CudaDecoderException &e) {
+        // Most errors are unrecoverable, but a user can mark them recoverable,
+        // which cancels the entire batch but keeps the worker processing.
+        if (!e.recoverable) {
+          bool UNRECOVERABLE_EXCEPTION = false;
+          KALDI_LOG << "Error unrecoverable cuda decoder error '" << e.what()
+                    << "'\n";
+          KALDI_ASSERT(UNRECOVERABLE_EXCEPTION);
+        } else {
+          KALDI_LOG << "Error recoverable cuda decoder error '" << e.what()
+                    << "'\n";
+          KALDI_LOG << "    Aborting batch for recovery.  Canceling the "
+                       "following decodes:\n";
+          // Free only the still-active channels.  Completed ones were already
+          // freed and dropped from channels by RemoveCompletedChannels(), so
+          // channels may be shorter than tasks and is not indexed per task.
+          const size_t n_done = channel_state.completed_channels.size();
+          size_t n_active = channel_state.channels.size();
+          if (tasks.size() >= n_done && tasks.size() - n_done < n_active)
+            n_active = tasks.size() - n_done;
+          for (size_t i = 0; i < n_active; i++) {
+            channel_state.free_channels.push_back(channel_state.channels[i]);
+          }
+
+          // Cancel all outstanding tasks (active and completed-but-unreported)
+          for (size_t i = 0; i < tasks.size(); i++) {
+            TaskState &task = *(tasks[i]);
+            KALDI_LOG << "      Canceled: " << task.key << "\n";
+
+            // set error flag
+            task.error = true;
+            task.error_string = e.what();
+
+            // cleanup memory
+            delete decodables[i];
+            task.posteriors.Resize(0, 0);
+
+            // notify master decode is finished
+            task.finished = true;
+          }
+          tasks.resize(0);
+          channel_state.channels.resize(0);
+          channel_state.completed_channels.resize(0);
+          decodables.resize(0);
+        }
+      }
+    } while (tasks.size() > 0); // more work don't check exit condition
+  } // end while(!exit_)
+} // end ExecuteWorker
+
+}  // end namespace cuda_decoder
+}  // end namespace kaldi
+
+#endif  // HAVE_CUDA == 1
diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h
new file mode 100644
index 00000000000..6754f9a2442
--- /dev/null
+++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h
@@ -0,0 +1,272 @@
+// cudadecoder/batched-threaded-nnet3-cuda-pipeline.h
+//
+// Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+// Hugo Braun, Justin Luitjens, Ryan Leary
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_CUDA_DECODER_BATCHED_THREADED_CUDA_DECODER_H_
+#define KALDI_CUDA_DECODER_BATCHED_THREADED_CUDA_DECODER_H_
+
+#include <atomic>
+#include <thread>
+
+#include "cudadecoder/cuda-decoder.h"
+#include "decodable-cumatrix.h"
+#include "feat/wave-reader.h"
+#include "lat/determinize-lattice-pruned.h"
+#include "nnet3/nnet-batch-compute.h"
+#include "online2/online-nnet2-feature-pipeline.h"
+#include "thread-pool.h"
+
+namespace kaldi {
+namespace cuda_decoder {
+
+/* BatchedThreadedNnet3CudaPipelineConfig
+ * Single configuration object shared by the components of the batched,
+ * multi-threaded CUDA pipeline (feature extraction, nnet3 computation,
+ * decoder and lattice determinization), so that they are all controlled
+ * from one place and use matching settings.
+ */
+// NOTE(review): Register() copies max_batch_size into decoder_opts when the
+// options are registered; confirm a later command-line override is honoured.
+struct BatchedThreadedNnet3CudaPipelineConfig {
+  BatchedThreadedNnet3CudaPipelineConfig()
+      : max_batch_size(100), batch_drain_size(10), num_control_threads(2),
+        num_worker_threads(20), determinize_lattice(true),
+        max_pending_tasks(4000) {}
+  // Registers the pipeline options and those of the nested option structs.
+  void Register(OptionsItf *po) {
+    po->Register("max-batch-size", &max_batch_size,
+                 "The maximum batch size to be used by the decoder. "
+                 "Higher->Faster, more GPU memory used");
+    po->Register("batch-drain-size", &batch_drain_size,
+                 "How far to drain the batch before refilling work. "
+                 "This batches pre/post decode work");
+    po->Register("cuda-control-threads", &num_control_threads,
+                 "The number of pipeline control threads for the CUDA work. "
+                 "e.g. 2 control threads -> 2 independent CUDA pipeline "
+                 "(nnet3 and decoder)");
+    po->Register("cuda-worker-threads", &num_worker_threads,
+                 "The total number of CPU threads launched to process CPU "
+                 "tasks");
+    po->Register("determinize-lattice", &determinize_lattice,
+                 "Determinize the lattice before output");
+    po->Register("max-outstanding-queue-length", &max_pending_tasks,
+                 "Number of files to allow to be outstanding at a time. "
+                 "When the number of files is larger than this handles "
+                 "will be closed before opening new ones in FIFO order");
+
+    // Uses the value max_batch_size holds at the time Register() runs.
+    decoder_opts.nlanes = max_batch_size;
+    decoder_opts.nchannels = max_batch_size;
+
+    feature_opts.Register(po);
+    decoder_opts.Register(po);
+    det_opts.Register(po);
+    compute_opts.Register(po);
+  }
+
+  int max_batch_size;        // --max-batch-size
+  int batch_drain_size;      // --batch-drain-size
+  int num_control_threads;   // --cuda-control-threads
+  int num_worker_threads;    // --cuda-worker-threads
+  bool determinize_lattice;  // --determinize-lattice
+  int max_pending_tasks;     // --max-outstanding-queue-length
+
+  OnlineNnet2FeaturePipelineConfig feature_opts;      // constant readonly
+  CudaDecoderConfig decoder_opts;                     // constant readonly
+  fst::DeterminizeLatticePhonePrunedOptions det_opts; // constant readonly
+  nnet3::NnetBatchComputerOptions compute_opts;       // constant readonly
+};
+
+/*
+ * BatchedThreadedNnet3CudaPipeline uses multiple levels of parallelism in order to
+ * decode quickly on CUDA GPUs. This is the primary interface for cuda decoding.
+ * For examples of how to use this decoder see cudadecoder/README and
+ * cudadecoderbin/batched-wav-nnet3-cuda.cc
+ */
+class BatchedThreadedNnet3CudaPipeline {
+public:
+  // Keeps a reference to config (the caller must keep it alive).  Pointers
+  // and counters get defined values here, before Initialize() is called.
+  BatchedThreadedNnet3CudaPipeline(const BatchedThreadedNnet3CudaPipelineConfig &config)
+      : config_(config), trans_model_(NULL), am_nnet_(NULL),
+        decodable_info_(NULL), feature_info_(NULL), tasks_front_(0),
+        tasks_back_(0), pending_task_queue_(NULL), exit_(false),
+        numStarted_(0), work_pool_(NULL) {}
+
+  // allocates reusable objects that are common across all decodings
+  void Initialize(const fst::Fst<fst::StdArc> &decode_fst,
+                  const nnet3::AmNnetSimple &nnet,
+                  const TransitionModel &trans_model);
+
+  // deallocates reusable objects
+  void Finalize();
+
+  // query a specific key to see if compute on it is complete
+  bool isFinished(const std::string &key);
+
+  // remove an audio file from the decoding and clean up resources
+  void CloseDecodeHandle(const std::string &key);
+
+  // Adds a decoding task to the decoder
+  void OpenDecodeHandle(const std::string &key, const WaveData &wave_data);
+  // When passing in a vector of data, the caller must ensure the data exists
+  // until the CloseDecodeHandle is called
+  void OpenDecodeHandle(const std::string &key,
+                        const VectorBase<BaseFloat> &wave_data,
+                        float sample_rate);
+
+  // Copies the raw lattice for decoded handle "key" into lat
+  bool GetRawLattice(const std::string &key, Lattice *lat);
+  // Determinizes raw lattice and returns a compact lattice
+  bool GetLattice(const std::string &key, CompactLattice *lat);
+
+  // (tasks_back_ - tasks_front_) modulo (max_pending_tasks + 1)
+  inline int NumPendingTasks() {
+    return (tasks_back_ - tasks_front_ + config_.max_pending_tasks + 1) %
+           (config_.max_pending_tasks + 1);
+  }
+
+private:
+  // State needed for each decode task.  It can be passed around by reference
+  // or pointer and provides a convenient way to store all decoding state.
+  struct TaskState {
+    Vector<BaseFloat> raw_data; // Wave input data when wave_reader passed
+    SubVector<BaseFloat> *wave_samples; // Used as a pointer to either the raw
+                                        // data or the samples passed
+    std::string key;
+    float sample_frequency;
+    bool error;
+    std::string error_string;
+
+    Lattice lat;                // Raw Lattice output
+    CompactLattice dlat;        // Determinized output (determinize-lattice=true)
+    std::atomic<bool> finished; // Tells master thread the task has finished
+
+    bool determinized;
+
+    Vector<BaseFloat> ivector_features;
+    Matrix<BaseFloat> input_features;
+    CuMatrix<BaseFloat> posteriors;
+
+    TaskState()
+        : wave_samples(NULL), sample_frequency(0), error(false),
+          finished(false), determinized(false) {}
+    ~TaskState() { delete wave_samples; }  // deleting NULL is a no-op
+
+    // Init when wave data is passed directly in.  This data is deep copied.
+    void Init(const std::string &key_in, const WaveData &wave_data_in) {
+      delete wave_samples;  // free the view left by a previous Init(), if any
+      wave_samples = NULL;
+      raw_data.Resize(wave_data_in.Data().NumRows() *
+                          wave_data_in.Data().NumCols(),
+                      kUndefined);
+      // NOTE(review): assumes the source matrix is stored contiguously
+      memcpy(raw_data.Data(), wave_data_in.Data().Data(),
+             raw_data.Dim() * sizeof(BaseFloat));
+      wave_samples = new SubVector<BaseFloat>(raw_data, 0, raw_data.Dim());
+      sample_frequency = wave_data_in.SampFreq();
+      determinized = false;
+      finished = false;
+      key = key_in;
+    }
+    // Init when raw data is passed in.  This data is shallow copied.
+    void Init(const std::string &key_in,
+              const VectorBase<BaseFloat> &wave_data_in, float sample_rate) {
+      delete wave_samples;  // free the view left by a previous Init(), if any
+      wave_samples = NULL;
+      wave_samples =
+          new SubVector<BaseFloat>(wave_data_in, 0, wave_data_in.Dim());
+      sample_frequency = sample_rate;
+      determinized = false;
+      finished = false;
+      key = key_in;
+    }
+  };
+
+  // Holds the current channel state for a worker
+  struct ChannelState {
+    std::vector<ChannelId> channels;
+    std::vector<ChannelId> free_channels;
+    std::vector<ChannelId> completed_channels;
+  };
+
+  // Adds task to the PendingTaskQueue
+  void AddTaskToPendingTaskQueue(TaskState *task);
+
+  // Attempts to fill the batch from the task queue.  May not fully fill the
+  // batch.
+  void AquireAdditionalTasks(CudaDecoder &cuda_decoder,
+                             ChannelState &channel_state,
+                             std::vector<TaskState *> &tasks);
+
+  // Computes Features for a single decode instance.
+  void ComputeOneFeature(TaskState *task);
+
+  // Computes Nnet across the current decode batch
+  void ComputeBatchNnet(nnet3::NnetBatchComputer &computer, int32 first,
+                        std::vector<TaskState *> &tasks);
+
+  // Allocates decodables for tasks in the range of
+  // tasks[first,tasks.size())
+  void AllocateDecodables(int32 first, std::vector<TaskState *> &tasks,
+                          std::vector<CudaDecodableInterface *> &decodables);
+
+  // Removes all completed channels from the channel list.
+  // Also enqueues up work for post processing
+  void RemoveCompletedChannels(CudaDecoder &cuda_decoder,
+                               ChannelState &channel_state,
+                               std::vector<CudaDecodableInterface *> &decodables,
+                               std::vector<TaskState *> &tasks);
+
+  // For each completed decode perform post processing work and clean up
+  void PostDecodeProcessing(CudaDecoder &cuda_decoder,
+                            ChannelState &channel_state,
+                            std::vector<CudaDecodableInterface *> &decodables,
+                            std::vector<TaskState *> &tasks);
+
+  void DeterminizeOneLattice(TaskState *state);
+
+  // Thread execution function.  This is a single worker thread which processes
+  // input.
+  void ExecuteWorker(int threadId);
+
+  const BatchedThreadedNnet3CudaPipelineConfig &config_;
+
+  CudaFst cuda_fst_;
+  const TransitionModel *trans_model_;
+  const nnet3::AmNnetSimple *am_nnet_;
+  nnet3::DecodableNnetSimpleLoopedInfo *decodable_info_;
+  OnlineNnet2FeaturePipelineInfo *feature_info_;
+
+  std::mutex tasks_mutex_; // workers: tasks_front_ and pending_task_queue_
+  std::mutex tasks_add_mutex_; // OpenDecodeHandle called from several threads
+  std::mutex tasks_lookup_mutex_; // protects tasks_lookup_ map
+  std::atomic<int> tasks_front_, tasks_back_;
+  TaskState **pending_task_queue_;
+
+  std::atomic<bool> exit_;      // signals threads to exit
+  std::atomic<int> numStarted_; // signals master how many threads have started
+
+  ThreadPool *work_pool_; // thread pool for CPU work
+
+  std::map<std::string, TaskState> tasks_lookup_; // utterance key -> TaskState
+  std::vector<std::thread> thread_contexts_;      // A list of thread contexts
+};
+
+}  // end namespace cuda_decoder
+} // end namespace kaldi.
+
+#endif  // KALDI_CUDA_DECODER_BATCHED_THREADED_CUDA_DECODER_H_
diff --git a/src/cudadecoder/cuda-decodable-itf.h b/src/cudadecoder/cuda-decodable-itf.h
new file mode 100644
index 00000000000..98d0619b6eb
--- /dev/null
+++ b/src/cudadecoder/cuda-decodable-itf.h
@@ -0,0 +1,33 @@
+// cudadecoder/cuda-decodable-itf.h
+//
+// Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+// Hugo Braun, Justin Luitjens, Ryan Leary
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_CUDA_DECODER_DECODABLE_ITF_H
+#define KALDI_CUDA_DECODER_DECODABLE_ITF_H
+
+#include "itf/decodable-itf.h"
+
+namespace kaldi {
+namespace cuda_decoder {
+// Adds a per-frame BaseFloat pointer accessor (presumably device memory).
+class CudaDecodableInterface : public DecodableInterface {
+public:
+  virtual BaseFloat *GetLogLikelihoodsCudaPointer(int32 subsampled_frame) = 0;
+};
+
+}  // end namespace cuda_decoder
+}  // end namespace kaldi.
+#endif  // KALDI_CUDA_DECODER_DECODABLE_ITF_H
diff --git a/src/cudadecoder/cuda-decoder-common.h b/src/cudadecoder/cuda-decoder-common.h
new file mode 100644
index 00000000000..a66b8c494eb
--- /dev/null
+++ b/src/cudadecoder/cuda-decoder-common.h
@@ -0,0 +1,468 @@
+// cudadecoder/cuda-decoder-common.h
+//
+// Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+// Hugo Braun, Justin Luitjens, Ryan Leary
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_CUDA_DECODER_CUDA_DECODER_UTILS_H_
+#define KALDI_CUDA_DECODER_CUDA_DECODER_UTILS_H_
+#include "cudamatrix/cu-device.h"
+#include "util/stl-utils.h"
+
+// A decoder channel is linked to one utterance. Frames
+// from the same must be sent to the same channel.
+//
+// A decoder lane is where the computation actually happens
+// a decoder lane is given a frame and its associated channel
+// and does the actual computation
+//
+// An analogy would be lane -> a core, channel -> a software thread
+
+// Maximum number of GPU decoder lanes
+#define KALDI_CUDA_DECODER_MAX_N_LANES 200
+
+// If we're at risk of filling the tokens queue,
+// the beam is reduced to keep only the best candidates in the
+// remaining space.
+// We then slowly put the beam back to its default value:
+// beam_next_frame = min(default_beam, RECOVER_RATE * beam_previous_frame)
+#define KALDI_CUDA_DECODER_ADAPTIVE_BEAM_RECOVER_RATE 1.2f
+
+// Defines for the cuda decoder kernels
+// It shouldn't be necessary to change the DIMX of the kernels
+
+// Below that value, we launch the persistent kernel for NonEmitting
+#define KALDI_CUDA_DECODER_NONEM_LT_MAX_NARCS 4096
+
+// We know we will have at least X elements in the hashmap
+// We allocate space for X*KALDI_CUDA_DECODER_HASHMAP_CAPACITY_FACTOR elements
+// to avoid having too many collisions
+#define KALDI_CUDA_DECODER_HASHMAP_CAPACITY_FACTOR 1
+
+// Max size of the total kernel arguments
+// 4kb for compute capability >= 2.0
+#define KALDI_CUDA_DECODER_MAX_KERNEL_ARGUMENTS_BYTE_SIZE (4096)
+
+// When applying the max-active, we need to compute a topk.
+// To perform that (soft) topk, we compute a histogram;
+// here we define the number of bins in that histogram.
+// It has to be less than the number of 1D threads
+#define KALDI_CUDA_DECODER_HISTO_NBINS 255
+
+// Adaptive beam parameters
+// We will decrease the beam when we detect that we are generating too many
+// tokens.
+// For the first segment of the aux_q, we don't do anything (keep the
+// original beam). The first segment is made of
+// (aux_q capacity)/KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT elements.
+// Then we will decrease the beam step by step, until 0.
+// We will decrease the beam every m elements, with:
+// x = (aux_q capacity)/KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT
+//     (the static segment)
+// y = (aux_q capacity) - x
+// m = y / KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS
+//
+// For more information, please refer to the definition of GetAdaptiveBeam in
+// cuda-decoder-kernels.cu
+#define KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT 4
+#define KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS 8
+// When applying max_active we don't keep exactly max_active_ tokens,
+// but a bit more. And we can call ApplyMaxActiveAndReduceBeam multiple times
+// in the first frame (the first times as a pre-filter, the last time at the
+// very end of the frame).
+// Because keeping a bit more than max_active_ is expected, we add the
+// tolerance so that we can avoid triggering ApplyMaxActiveAndReduceBeam for
+// just a few tokens above the limit
+// at the end of the frame.
+#define KALDI_CUDA_DECODER_MAX_ACTIVE_TOLERANCE 0.2
+
+// Integer division of a by b rounded up; arguments are evaluated repeatedly
+#define KALDI_CUDA_DECODER_DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
+// Throws a CudaDecoderException carrying the recoverable flag if val is false
+#define KALDI_CUDA_DECODER_ASSERT(val, recoverable)                     \
+  {                                                                     \
+    if (!(val)) {                                                       \
+      throw CudaDecoderException("KALDI_CUDA_DECODER_ASSERT", __FILE__, \
+                                 __LINE__, recoverable);                \
+    }                                                                   \
+  }
+// Macro for checking cuda errors following a cuda launch or api call
+#define KALDI_DECODER_CUDA_CHECK_ERROR()                                  \
+  {                                                                       \
+    cudaError_t e = cudaGetLastError();                                   \
+    if (e != cudaSuccess) {                                               \
+      throw CudaDecoderException(cudaGetErrorName(e), __FILE__, __LINE__, \
+                                 false);                                  \
+    }                                                                     \
+  }
+// Same check for an explicit return code; e is evaluated exactly once
+#define KALDI_DECODER_CUDA_API_CHECK_ERROR(e)                             \
+  {                                                                       \
+    cudaError_t api_check_err_ = (e);                                     \
+    if (api_check_err_ != cudaSuccess) {                                  \
+      throw CudaDecoderException(cudaGetErrorName(api_check_err_),        \
+                                 __FILE__, __LINE__, false);              \
+    }                                                                     \
+  }
+
+#define KALDI_CUDA_DECODER_1D_KERNEL_LOOP(i, n)                \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+#define KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(offset, th_idx, n) \
+  for (int offset = blockIdx.x * blockDim.x, th_idx = threadIdx.x;        \
+       offset < (n); offset += blockDim.x * gridDim.x)
+
+#define KALDI_CUDA_DECODER_IS_LAST_1D_THREAD() (threadIdx.x == (blockDim.x - 1))
+
+#define KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(i, n) \
+  for (int i = blockIdx.y; i < (n); i += gridDim.y)
+
+#define KALDI_CUDA_DECODER_1D_BLOCK 256
+#define KALDI_CUDA_DECODER_LARGEST_1D_BLOCK 1024
+#define KALDI_CUDA_DECODER_ONE_THREAD_BLOCK 1
+
+namespace kaldi {
+namespace cuda_decoder {
+
+// Grid dimensions for a launch over (N, M) elements: enough blocks of
+// KALDI_CUDA_DECODER_1D_BLOCK threads to cover N in x, and M (usually the
+// batch size) in y.
+inline dim3 KaldiCudaDecoderNumBlocks(int N, int M) {
+  // TODO MAX_NUM_BLOCKS.
+  const int nblocks_x =
+      KALDI_CUDA_DECODER_DIV_ROUND_UP(N, KALDI_CUDA_DECODER_1D_BLOCK);
+  return dim3(nblocks_x, M);
+}
+
+// Basic index and cost types used by the decoder.
+using StateId = int32;
+using CostType = float;
+// IntegerCostType is the type stored in the lookup table d_state_best_cost
+// and in d_cutoff.  There is a 1:1 conversion CostType <--> IntegerCostType;
+// the integer form is used because it triggers native atomic operations
+// (CostType does not).
+using IntegerCostType = int32;
+using LaneId = int32;
+using ChannelId = int32;
+
+// On the device we compute everything by batch
+// Data is stored as 2D matrices (BatchSize, 1D_Size)
+// For example, for the token queue, (BatchSize, max_tokens_per_frame_)
+// DeviceMatrix owns the data but is not used to access it.
+// DeviceMatrix is inherited in DeviceLaneMatrix and DeviceChannelMatrix
+// those two classes do the same thing, except that they belong either to a
+// channel or lane
+// that inheritance is done to clarify the code and help debugging
+//
+// To actually access the data, we should request an view through
+// GetView
+// That view contains both host cuda code to access the data. It does not own
+// the data.
+template <typename T>
+// if necessary, make a version that always use ncols_ as the next power of 2
+class DeviceMatrix {
+  T *data_;  // buffer of nrows_ * ncols_ elements, NULL while unallocated
+  void Allocate() {
+    KALDI_ASSERT(nrows_ > 0);
+    KALDI_ASSERT(ncols_ > 0);
+    KALDI_ASSERT(!data_);
+    data_ = static_cast<T *>(CuDevice::Instantiate().Malloc(
+        (size_t)nrows_ * ncols_ * sizeof(*data_)));
+    KALDI_ASSERT(data_);
+  }
+  void Free() {
+    KALDI_ASSERT(data_);
+    CuDevice::Instantiate().Free(data_);
+    data_ = NULL;  // lets Allocate() run again and prevents a double free
+  }
+
+ protected:
+  int32 ncols_;
+  int32 nrows_;
+
+ public:
+  DeviceMatrix() : data_(NULL), ncols_(0), nrows_(0) {}
+
+  virtual ~DeviceMatrix() { if (data_) Free(); }
+
+  // Frees any existing buffer, then allocates nrows x ncols elements.
+  void Resize(int32 nrows, int32 ncols) {
+    KALDI_ASSERT(nrows > 0);  // validate before touching the old buffer
+    KALDI_ASSERT(ncols > 0);
+    if (data_) Free();
+    nrows_ = nrows;
+    ncols_ = ncols;
+    Allocate();
+  }
+
+  T *MutableData() {
+    KALDI_ASSERT(data_);
+    return data_;
+  }
+  // abstract getInterface...
+};
+
+// Views of DeviceMatrix
+// Those views are created by either DeviceChannelMatrix or
+// DeviceLaneMatrix
+// We can access the data (the matrix) associated with that
+// Device[Channel|Lane]Matrix without owning that data.
+// Which means that we can pass those views by copy
+// without triggering a cudaFree, for instance.
+// Device[Channel|Lane]Matrix owns the data, [Channel|Lane]MatrixInterface just
+// gives access to it
+// Generating both host and device interfaces
+template <typename T>
+struct LaneMatrixView {
+  T *data_;      // start of the matrix storage; the view does not own it
+  int32 ncols_;  // row length, i.e. number of elements per lane
+  __host__ __device__ __inline__ T *lane(const int32 ilane) {
+    return data_ + ilane * ncols_;  // first element of row ilane
+  }
+};
+
+template <typename T>
+struct ChannelMatrixView {
+  T *data_;      // start of the matrix storage; the view does not own it
+  int32 ncols_;  // row length, i.e. number of elements per channel
+  __host__ __device__ __inline__ T *channel(const int32 ichannel) {
+    return data_ + ichannel * ncols_;  // first element of row ichannel
+  }
+};
+
+// Specializing DeviceMatrix into lane and channel variants.
+// Helps with code clarity/debugging
+template <typename T>
+class DeviceLaneMatrix : public DeviceMatrix<T> {
+ public:
+  LaneMatrixView<T> GetView() { return {this->MutableData(), this->ncols_}; }
+  // Address of the first element of row ilane (asserts data is allocated).
+  T *lane(const int32 ilane) {
+    return this->MutableData() + ilane * this->ncols_;
+  }
+};
+
+template <typename T>
+class DeviceChannelMatrix : public DeviceMatrix<T> {
+ public:
+  ChannelMatrixView<T> GetView() { return {this->MutableData(), this->ncols_}; }
+  T *channel(const int32 ichannel) {
+    return this->MutableData() + ichannel * this->ncols_;  // row ichannel
+  }
+};
+
+// LaneCounters/ChannelCounters
+// The counters are all the singular values associated to a lane/channel
+// For instance  the main queue size. Or the min_cost of all tokens in that
+// queue
+// LaneCounters are used during computation
+struct LaneCounters {
+  // Packs main_q_narcs and main_q_end into a single int2.
+  // main_q_end is the end index of the main queue: only tokens at index i
+  // with i < main_q_end are valid tokens.
+  // Each valid token in the subqueue main_q[main_q_local_offset, main_q_end)
+  // has a number of outgoing arcs (out-degree); main_q_narcs is the sum of
+  // those numbers.
+  // We sometimes need to update both end and narcs at the same time using a
+  // single atomic, which is why they are packed together.
+  // NOTE(review): the member name suggests narcs is stored first (.x) and
+  // end second (.y) -- confirm against the kernels.
+  int2 main_q_narcs_and_end;
+  // contains the requested queue length which can
+  // be larger than the actual queue length in the case of overflow
+  int32 main_q_requested;
+  int32 aux_q_requested;
+  int32 aux_q_end;
+  int32 post_expand_aux_q_end;  // used for double buffering
+  // Some tokens in the same frame share the same token.next_state
+  // main_q_n_extra_prev_tokens is the count of those tokens
+  int32 main_q_n_extra_prev_tokens;
+  // Overflow flag for the token queues of this lane.
+  // Depending on the value of the parameter "max_tokens_per_frame"
+  // we can end up with an overflow when generating the tokens for a frame.
+  // We try to prevent this from happening using an adaptive beam.
+  // If an overflow happens, then the kernels no longer insert any data into
+  // the queues and set the overflow flag to true.
+  // Even if that flag is set, we can continue the execution
+  // (the quality of the output can be lowered).
+  // We use that flag to display a warning to the user.
+  int32 q_overflow;
+  // ExpandArcs reads the tokens in the index range [main_q_local_offset, end)
+  int32 main_q_local_offset;
+  // We transfer the tokens back to the host at the end of each frame.
+  // This means that tokens at a frame n > 0 have an offset compared to
+  // those in frame n-1.
+  // main_q_global_offset is the overall offset of the current main_q
+  // since frame 0.
+  //
+  // It is used to set the prev_token index.
+  int32 main_q_global_offset;
+  // Same thing, but for main_q_n_extra_prev_tokens (those are also
+  // transferred back to host)
+  int32 main_q_extra_prev_tokens_global_offset;
+
+  // Minimum token cost for that frame
+  IntegerCostType min_int_cost;
+  // Current beam. Can be different from default_beam,
+  // because of the AdaptiveBeam process, or because of
+  // ApplyMaxActiveAndReduceBeam
+  IntegerCostType int_beam;
+  // Adaptive beam. The validity index says up to which index this adaptive
+  // beam is valid.
+  // After that index, we need to lower the adaptive beam
+  int2 adaptive_int_beam_with_validity_index;
+
+  // min_cost + beam
+  IntegerCostType int_cutoff;
+
+  // --- Only valid after calling GetBestCost
+  // min_cost and its arg. Can be different from min_int_cost, because we may
+  // include final costs
+  int2 min_int_cost_and_arg;
+  // Number of final tokens with cost < best + lattice_beam
+  int32 n_within_lattice_beam;
+  int32 has_reached_final;  // if there's at least one final token in the queue
+};
+
+// Channel counters
+// Their job is to save the state of a channel, when this channel is idle
+// The channel counters are loaded into the lane counters during the context
+// switches
+struct ChannelCounters {
+  // All the following values are just saved values from LaneCounters
+  // from the latest context-switch
+  int2 prev_main_q_narcs_and_end;
+  int32 prev_main_q_n_extra_prev_tokens;
+  int32 prev_main_q_global_offset;
+  int32 prev_main_q_extra_prev_tokens_global_offset;
+  // Unlike LaneCounters::int_beam, saved as a CostType (not a sortable int)
+  CostType prev_beam;
+
+  // Only valid after calling GetBestCost
+  // different than min_int_cost : we include the "final" cost
+  int2 min_int_cost_and_arg_with_final;
+  int2 min_int_cost_and_arg_without_final;
+};
+
+// Exception carrying a message and the source location that raised it.
+class CudaDecoderException : public std::exception {
+ public:
+  CudaDecoderException(const char *str_, const char *file_, int line_,
+                       const bool recoverable_)
+      : str(str_), file(file_), line(line_),
+        // str/file/line are declared (hence initialized) before buffer
+        buffer(std::string(file) + ":" + std::to_string(line) + " :" +
+               std::string(str)),
+        recoverable(recoverable_) {}
+  const char *what() const noexcept override { return buffer.c_str(); }
+
+  const char *str;   // message; not owned, must be non-null
+  const char *file;  // source file; not owned, must be non-null
+  const int line;
+  const std::string buffer;  // "<file>:<line> :<str>", returned by what()
+  const bool recoverable;
+};
+
+// InfoToken contains data that needs to be saved for the backtrack
+// in GetBestPath/GetRawLattice
+// We don't need the token.cost or token.next_state.
+struct __align__(8) InfoToken {
+  int32 prev_token;
+  int32 arc_idx;
+  bool IsUniqueTokenForStateAndFrame() const {
+    // This is a trick used to save space and PCI-E bandwidth (cf
+    // preprocess_in_place kernel)
+    // This token is associated with a next_state s, created during the
+    // processing of frame f.
+    // If we have multiple tokens associated with the state s in the frame f,
+    // arc_idx < 0 and -arc_idx is the
+    // count of such tokens. We will then have to look at another list to read
+    // the actual arc_idx and prev_token values
+    // If the current token is the only one, prev_token and arc_idx are valid
+    // and can be used directly
+    return (arc_idx >= 0);
+  }
+
+  // Called if this token is linked to other tokens in the same frame (cf
+  // comments for IsUniqueTokenForStateAndFrame)
+  // return the {offset,size} pair necessary to list those tokens in the
+  // extra_prev_tokens list
+  // They are stored at offset "offset", and we have "size" of those
+  std::pair<int32, int32> GetSameFSTStateTokensList() const {
+    KALDI_ASSERT(!IsUniqueTokenForStateAndFrame());
+    // In that case prev_token holds the offset and -arc_idx the size
+    return {prev_token, -arc_idx};
+  }
+};
+
+// Device function, used to set a in an InfoToken the [offset,size] related to
+// InfoToken.GetSameFSTStateTokensList
+__device__ __inline__ void SetSameFSTStateTokensList(int32 offset, int32 size,
+                                                     InfoToken *info_token) {
+  const InfoToken list_ref = {offset, -size};  // size > 0, hence arc_idx < 0
+  *info_token = list_ref;
+}
+
+// Used to store the index in the GPU hashmap of that FST state
+// The hashmap is only generated with the final main queue (post max_active_) of
+// each frame
+// Also stores the information of whether or not the owner of that object is the
+// representative of this FSTState
+typedef int32 FSTStateHashIndex;
+
+// 1:1 Conversion float <---> sortable int
+// We convert floats to sortable ints in order
+// to use native atomics operation
+// Those are the host version, used when we transfer an int from the device
+// and we want to convert it to a float
+// (it was created on device by floatToOrderedInt, we'll use
+// orderedIntToFloatHost on host to convert it back to a float)
+__inline__ int32 floatToOrderedIntHost(float floatVal) {
+  int32 bits;
+  memcpy(&bits, &floatVal, sizeof(float));  // bit-copy; no value conversion
+  if (bits < 0) bits ^= 0x7FFFFFFF;  // sign bit set: flip the other 31 bits
+  return bits;
+}
+
+__inline__ float orderedIntToFloatHost(int32 intVal) {
+  // Undo the encoding: negative values had their low 31 bits flipped.
+  if (intVal < 0) intVal ^= 0x7FFFFFFF;
+  float decoded;
+  memcpy(&decoded, &intVal, sizeof(float));  // bit-copy back into a float
+  return decoded;
+}
+
+// Hashmap value. Used when computing the hashmap in PostProcessingMainQueue
+struct __align__(16) HashmapValueT {
+  // Map key : fst state (KALDI_CUDA_DECODER_HASHMAP_NO_KEY when slot is empty)
+  int32 key;
+  // Number of tokens associated to that state (incremented on each insert)
+  int32 count;
+  // minimum cost for that state (.x) + argmin (.y)
+  int2 min_and_argmin_int_cost;
+};
+
+enum OVERFLOW_TYPE {
+  OVERFLOW_NONE = 0,    // no queue overflowed
+  OVERFLOW_MAIN_Q = 1,  // bit 0: OR-ed into LaneCounters::q_overflow
+  OVERFLOW_AUX_Q = 2    // bit 1: presumably set the same way for the aux_q
+};
+
+enum QUEUE_ID { MAIN_Q = 0, AUX_Q = 1 };  // names the two token queues
+
+}  // end namespace cuda_decoder
+}  // end namespace kaldi
+
+#endif  // KALDI_CUDA_DECODER_CUDA_DECODER_UTILS_H_
diff --git a/src/cudadecoder/cuda-decoder-kernels-utils.h b/src/cudadecoder/cuda-decoder-kernels-utils.h
new file mode 100644
index 00000000000..1fd3952ee20
--- /dev/null
+++ b/src/cudadecoder/cuda-decoder-kernels-utils.h
@@ -0,0 +1,215 @@
+// cudadecoder/cuda-decoder-kernels-utils.h
+//
+// Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+// Hugo Braun, Justin Luitjens, Ryan Leary
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_CUDA_DECODER_CUDA_DECODER_KERNELS_UTILS_H_
+#define KALDI_CUDA_DECODER_CUDA_DECODER_KERNELS_UTILS_H_
+
+// NO_KEY == -1 is ok, because all keys will be >= 0 (FST states)
+#define KALDI_CUDA_DECODER_HASHMAP_NO_KEY -1
+#define KALDI_CUDA_DECODER_HASHMAP_NO_VAL                 \
+  {                                                       \
+    KALDI_CUDA_DECODER_HASHMAP_NO_KEY, 0, { INT_MAX, -1 } \
+  }
+
+#include "util/stl-utils.h"
+
+namespace kaldi {
+namespace cuda_decoder {
+
+// MinPlus and PlusPlus
+// int2 operators used in Scan or Reduce operations
+struct MinPlus {
+  // Combines two int2 values: x takes the minimum, y the sum.
+  __device__ int2 operator()(const int2 &a, const int2 &b) const {
+    const int2 combined = {min(a.x, b.x),  // x: smaller of the two
+                           a.y + b.y};     // y: sum
+    return combined;
+  }
+};
+struct PlusPlus {
+  // Component-wise sum of two int2 values.
+  __device__ int2 operator()(const int2 &a, const int2 &b) const {
+    const int2 total = {a.x + b.x,   // x: sum
+                        a.y + b.y};  // y: sum
+    return total;
+  }
+};
+
+// 1:1 Conversion float <---> sortable int
+// We convert floats to sortable ints in order
+// to use native atomics operation, which are
+// way faster than looping over atomicCAS
+__device__ __forceinline__ int32 floatToOrderedInt(float floatVal) {
+  const int32 bits = __float_as_int(floatVal);  // raw bits, no conversion
+  return (bits < 0) ? (bits ^ 0x7FFFFFFF) : bits;
+}
+
+__device__ __forceinline__ float orderedIntToFloat(int32 intVal) {
+  return __int_as_float((intVal < 0) ? (intVal ^ 0x7FFFFFFF) : intVal);
+}
+
+// binsearch_maxle (device)
+// With L=[all indexes low<=i<=high such as vec[i]<= val]
+// binsearch_maxle returns max(L)
+// the array vec must be sorted
+// Finds that value using a binary search
+__device__ __forceinline__ int32 binsearch_maxle(const int32 *vec,
+                                                 const int32 val, int32 low,
+                                                 int32 high) {
+  for (;;) {
+    if (low == high) break;  // range narrowed down to a single index
+    if ((low + 1) == high) {  // two candidates left: prefer the upper one
+      if (vec[high] <= val) low = high;
+      break;
+    }
+    const int32 mid = low + (high - low) / 2;
+    if (vec[mid] <= val) low = mid;  // mid qualifies: new lower bound
+    else high = mid - 1;             // mid too large: drop it and above
+  }
+  return low;
+}
+
+// Atomic operations on int2 (device)
+// atomicAddI2, atomicMinI2, atomicSubI2
+//
+// union used
+union UInt64UnionInt2 {
+  int2 i2;                     // the pair, as seen by callers
+  unsigned long long int ull;  // same 8 bytes, as taken by 64-bit atomics
+};
+
+__device__ __inline__ int2 atomicAddI2(int2 *ptr, int2 val) {
+  // A single 64-bit atomicAdd updates both 32-bit halves of *ptr at once
+  UInt64UnionInt2 addend, previous;
+  addend.i2 = val;
+  previous.ull = atomicAdd(reinterpret_cast<unsigned long long int *>(ptr),
+                           addend.ull);
+  return previous.i2;  // the pair as it was before the add
+}
+
+// We should switch to native atom64 on atomicMinI2 and atomicSubI2
+__device__ __inline__ void atomicMinI2(int2 *ptr, int2 val) {
+  unsigned long long int *ptr64 =
+      reinterpret_cast<unsigned long long int *>(ptr);
+  UInt64UnionInt2 old, assumed, value;
+  old.ull = *ptr64;  // plain read: first guess of the stored pair
+  value.i2 = val;
+  if (old.i2.x <= val.x) return;  // stored x is already <= ours: nothing to do
+  do {  // CAS loop: swap the whole pair in, comparing on .x only
+    assumed = old;
+    old.ull = atomicCAS(ptr64, assumed.ull, value.ull);
+  } while (old.ull != assumed.ull && old.i2.x > value.i2.x);  // lost race: retry
+}
+
+__device__ __inline__ void atomicSubI2(int2 *ptr, int2 sub) {
+  unsigned long long int *ptr64 =
+      reinterpret_cast<unsigned long long int *>(ptr);
+  UInt64UnionInt2 old, assumed, value;
+  old.ull = *ptr64;  // initial guess of the current pair
+  do {  // CAS loop: recompute from the latest value until nobody raced us
+    assumed = old;
+    value.i2.x = assumed.i2.x - sub.x;  // each half is subtracted separately,
+    value.i2.y = assumed.i2.y - sub.y;  // so no borrow crosses from x into y
+    old.ull = atomicCAS(ptr64, assumed.ull, value.ull);
+  } while (old.ull != assumed.ull);
+}
+
+// Hash function used in the hashmap.
+// Using identity for now. The keys are the FST states, some randomness already
+// exists
+__device__ __forceinline__ int hash_func(int key) {
+  return key;  // using identity for now
+}
+
+// hashmap_insert_or_aggregate
+// Inserting a new value into the hashmap. If the key already exists in the
+// hashmap,
+// we'll aggregate the existing value with the new one, and set the result as
+// value for that key.
+// The new value inserted at key is (1, (int_cost, arg_int_cost)
+// With values being [count (int32), [min_cost, argmin_cost] (int2)]
+// If a value already exists for a key, we will aggregate the two values:
+// hashmap[key] = old_value +_ new_value
+// with +_ being (integer +, argmin)
+// It returns the hash_idx, i.e. where the key was inserted in the hashmap
+// The owner will then use that to access the data, and clear it for future use
+// It also returns local_idx, which informs how many values of that same key
+// were inserted before that call.
+// e.g. if thread 23 inserts the key 3, then thread 9 inserts the key 3,
+// thread 23 will have local_idx=0, thread 9 will have local_idx=1
+//
+// We use hashmap_insert in the context of a ReduceByKey. The same thread will
+// always
+// access the same key. Which is why we do not need a hashmap_find, and can
+// simply remember the hash_idx
+// from our last insert.
+//
+// Restriction: that function can only be used if we know that we will have
+// enough space in the hashmap
+// ie hashmap_capacity > total number of keys
+//
+// keys must be >= 0 (to avoid collisions with
+// KALDI_CUDA_DECODER_HASHMAP_NO_KEY)
+__device__ __inline__ void hashmap_insert_or_aggregate(
+    HashmapValueT *d_map_values, int key, int int_cost, int arg_int_cost,
+    int capacity, int *local_idx, int *out_hash_idx) {
+  int hash_idx = hash_func(key) % capacity;
+  int c = 0;  // number of slots probed so far that belonged to another key
+  HashmapValueT *d_val = NULL;
+  do {
+    d_val = &d_map_values[hash_idx];
+    // Atomically claim an empty slot, or detect that it already holds `key`
+    int old = atomicCAS(&d_val->key, KALDI_CUDA_DECODER_HASHMAP_NO_KEY, key);
+    if (old == KALDI_CUDA_DECODER_HASHMAP_NO_KEY || old == key)
+      break;  // found a spot
+    hash_idx = (hash_idx + 1) % capacity;  // linear probing
+    ++c;
+  } while (c < capacity);
+  // Callers guarantee free space, so we must have left the loop via `break`.
+  // d_val is never NULL here, so test the probe count instead of the pointer.
+  assert(c < capacity);
+
+  // Updating values; local_idx is the count of that key before this insert
+  *local_idx = atomicAdd(&d_val->count, 1);
+  *out_hash_idx = hash_idx;
+  atomicMinI2(&d_val->min_and_argmin_int_cost, {int_cost, arg_int_cost});
+}
+
+// In FSTStateHashIndex, we store both the hash_idx and a boolean
+// is_representative
+// which tells if the current thread is responsible for the state stored at
+// index hash_idx
+// We use the bit sign for that
+// Setter and getter
+__device__ __inline__ void SetFSTStateHashIndex(int32 raw_hash_idx,
+                                                bool is_representative,
+                                                FSTStateHashIndex *hash_idx) {
+  if (is_representative) *hash_idx = -raw_hash_idx - 1;  // -1 forces it < 0
+  else *hash_idx = raw_hash_idx;
+}
+
+__device__ __inline__ void GetFSTStateHashIndex(FSTStateHashIndex &hash_idx,
+                                                int32 *raw_hash_idx,
+                                                bool *is_representative) {
+  *is_representative = (hash_idx < 0);  // the sign carries the flag
+  *raw_hash_idx = (hash_idx < 0) ? (-(hash_idx + 1)) : hash_idx;
+}
+
+}  // end namespace cuda_decoder
+}  // end namespace kaldi
+
+#endif  // KALDI_CUDA_DECODER_CUDA_DECODER_KERNELS_UTILS_H_
diff --git a/src/cudadecoder/cuda-decoder-kernels.cu b/src/cudadecoder/cuda-decoder-kernels.cu
new file mode 100644
index 00000000000..cba0ca5b84a
--- /dev/null
+++ b/src/cudadecoder/cuda-decoder-kernels.cu
@@ -0,0 +1,1878 @@
+// cudadecoder/cuda-decoder-kernels.cu
+//
+// Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+// Hugo Braun, Justin Luitjens, Ryan Leary
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cub/cub.cuh>
+#include "cuda-decoder-kernels.h"
+#include "cuda-decoder-kernels-utils.h"
+
+namespace kaldi {
+namespace cuda_decoder {
+
+// Initialize the hashmap with NO_VAL
+// Called in InitDeviceData, when building the CudaDecoder object
+__global__ void init_hashmap_kernel(DeviceParams cst_dev_params) {
+  const int n_lanes = cst_dev_params.max_nlanes;
+  const int n_slots = cst_dev_params.hashmap_capacity;  // same for every lane
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, n_lanes) {
+    KALDI_CUDA_DECODER_1D_KERNEL_LOOP(idx, n_slots) {
+      // Every slot of every lane starts out as "no key, no tokens"
+      cst_dev_params.d_hashmap_values.lane(ilane)[idx] =
+          KALDI_CUDA_DECODER_HASHMAP_NO_VAL;
+    }
+  }
+}
+
+// Initialize initial channel on  device
+// Called by ComputeInitialChannel
+// It is NOT called in InitDecoding
+// In InitDecoding we will clone the initial channel into the channel we called
+// InitDecoding on
+// Here we are actually creating this initial channel
+// we do that once in the CudaDecoder constructor.
+//
+// The initial channel is the state of a channel when
+// it will start decoding a new utterance
+// thread (1, 1, 1)
+// blocks(1, 1, 1);
+__global__ void initialize_initial_lane_kernel(DeviceParams cst_dev_params) {
+  // Builds, in lane 0, the state a lane would have if an ExpandArcsEmitting
+  // had just executed and put the StartState in the aux_q. We will then pick
+  // up a normal execution from there
+  // (calling PruneAndPreprocess, then ExpandArcsNonEmitting..)
+  // Meant to run as a single thread, hence the plain non-atomic stores.
+  const int init_ilane = 0;
+  LaneCounters *lane_counters =
+      cst_dev_params.d_lanes_counters.lane(init_ilane);
+
+  // The main_q starts empty; the aux_q "post expand" end covers one token
+  lane_counters->aux_q_end = 0;
+  lane_counters->aux_q_requested = 0;
+  lane_counters->post_expand_aux_q_end = 1;
+  lane_counters->main_q_global_offset = 0;
+  lane_counters->main_q_local_offset = 0;
+  lane_counters->main_q_n_extra_prev_tokens = 0;
+  lane_counters->main_q_narcs_and_end = {0, 0};
+  lane_counters->main_q_requested = 0;
+  lane_counters->int_beam = floatToOrderedInt(cst_dev_params.default_beam);
+
+  // The single start token: (init_state, init_cost) at index 0 of the aux_q
+  const StateId init_state = cst_dev_params.init_state;
+  const CostType init_cost = cst_dev_params.init_cost;
+  const IntegerCostType int_init_cost = floatToOrderedInt(init_cost);
+  cst_dev_params.d_aux_q_state_and_cost.lane(init_ilane)[0] = {init_state,
+                                                               int_init_cost};
+  cst_dev_params.d_aux_q_info.lane(init_ilane)[0] = {INT_MIN, -1};
+  lane_counters->min_int_cost = int_init_cost;  // only token => the minimum
+  const CostType min_cost = orderedIntToFloat(int_init_cost);
+  lane_counters->int_cutoff =  // cutoff = min cost + beam
+      floatToOrderedInt(min_cost + cst_dev_params.default_beam);
+}
+
+// Called by InitDecoding
+// Called when some channels will start decoding a new utterance
+// do everything that's needed to do on the device to start decoding a new
+// utterance with those channels
+// It clones the initial channel (created in initialize_initial_lane_kernel)
+// into the channels we want to InitDecoding on
+__global__ void init_decoding_on_device_kernel(DeviceParams cst_dev_params,
+                                               KernelParams params) {
+  const int src_channel = cst_dev_params.init_channel_id;
+  const ChannelCounters *src_counters =
+      cst_dev_params.d_channels_counters.channel(src_channel);
+  // .y of the packed pair is the main_q end, i.e. how many tokens to clone
+  const int32 n_tokens = src_counters->prev_main_q_narcs_and_end.y;
+  const int32 nlanes = params.nlanes_used;
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
+    const int32 dst_channel = params.channel_to_compute[ilane];
+    KALDI_CUDA_DECODER_1D_KERNEL_LOOP(idx, n_tokens) {
+      cst_dev_params.d_main_q_state_and_cost.channel(dst_channel)[idx] =
+          cst_dev_params.d_main_q_state_and_cost.channel(src_channel)[idx];
+      cst_dev_params.d_main_q_degrees_prefix_sum.channel(dst_channel)[idx] =
+          cst_dev_params.d_main_q_degrees_prefix_sum.channel(src_channel)[idx];
+      cst_dev_params.d_main_q_arc_offsets.channel(dst_channel)[idx] =
+          cst_dev_params.d_main_q_arc_offsets.channel(src_channel)[idx];
+      // Only the iteration handling idx 0 also sets up the channel counters
+      if (idx == 0) {
+        ChannelCounters *dst_counters =
+            cst_dev_params.d_channels_counters.channel(dst_channel);
+        dst_counters->prev_beam = cst_dev_params.default_beam;
+        dst_counters->prev_main_q_global_offset = 0;
+        dst_counters->prev_main_q_extra_prev_tokens_global_offset = 0;
+        dst_counters->prev_main_q_n_extra_prev_tokens =
+            src_counters->prev_main_q_n_extra_prev_tokens;
+        dst_counters->prev_main_q_narcs_and_end =
+            src_counters->prev_main_q_narcs_and_end;
+      }
+    }
+  }
+}
+
+// Context switch : load
+// Called by LoadChannelsStateToLanes
+// THREADS : (1, 1, 1)
+// BLOCKS : (1, nlanes_used, 1)
+__global__ void load_channels_state_in_lanes_kernel(DeviceParams cst_dev_params,
+                                                    KernelParams params) {
+  const int nlanes = params.nlanes_used;
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
+    // `saved` is the state parked in the channel, `live` the lane being set up
+    const ChannelCounters *saved = cst_dev_params.d_channels_counters.channel(
+        params.channel_to_compute[ilane]);
+    LaneCounters *live = cst_dev_params.d_lanes_counters.lane(ilane);
+    // Values restored from the channel
+    live->main_q_narcs_and_end = saved->prev_main_q_narcs_and_end;
+    live->main_q_n_extra_prev_tokens = saved->prev_main_q_n_extra_prev_tokens;
+    live->main_q_global_offset =
+        saved->prev_main_q_global_offset;  // we'll update it after emitting
+    live->main_q_extra_prev_tokens_global_offset =
+        saved->prev_main_q_extra_prev_tokens_global_offset;
+
+    // Beam: converted to its sortable-int form; also seeds the adaptive beam
+    const IntegerCostType int_beam = floatToOrderedInt(saved->prev_beam);
+    live->int_beam = int_beam;
+    live->adaptive_int_beam_with_validity_index.x = int_beam;
+    live->adaptive_int_beam_with_validity_index.y =
+        cst_dev_params.adaptive_beam_static_segment;
+
+    // Remaining counters restart from fixed values
+    live->min_int_cost = INT_MAX;
+    live->int_cutoff = INT_MAX;
+    live->q_overflow = OVERFLOW_NONE;
+    live->main_q_requested = 0;
+    live->aux_q_requested = 0;
+    live->main_q_local_offset = 0;
+  }
+}
+
+// Context switch : store
+// Called by SaveChannelsStateFromLanes
+// THREADS : (1, 1, 1)
+// BLOCKS : (1, nchannel_to_compute, 1)
+__global__ void save_channels_state_from_lanes_kernel(
+    DeviceParams cst_dev_params, KernelParams params) {
+  const int nlanes = params.nlanes_used;
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
+    // `live` is the lane state being parked, `saved` the channel receiving it
+    const LaneCounters *live = cst_dev_params.d_lanes_counters.lane(ilane);
+    ChannelCounters *saved = cst_dev_params.d_channels_counters.channel(
+        params.channel_to_compute[ilane]);
+
+    // Main queue extent and extra-prev-token count
+    saved->prev_main_q_narcs_and_end = live->main_q_narcs_and_end;
+    saved->prev_main_q_n_extra_prev_tokens = live->main_q_n_extra_prev_tokens;
+    // Global offsets
+    saved->prev_main_q_global_offset = live->main_q_global_offset;
+    saved->prev_main_q_extra_prev_tokens_global_offset =
+        live->main_q_extra_prev_tokens_global_offset;
+    // The lane holds the beam as a sortable int; the channel keeps a CostType
+    saved->prev_beam = orderedIntToFloat(live->int_beam);
+  }
+}
+
+// concatenate_lanes_data
+// Called by PerformConcatenatedCopy
+// Creates a concatenated array in concat,
+// by concatenating all the arrays src.lane(ilane)
+// for ilane=0..params.nlanes_used
+// Used to prepare data for copy to Host. We want to avoid small Device2Host
+// copies.
+template <typename T>
+__global__ void concatenate_lanes_data_kernel(DeviceParams cst_dev_params,
+                                              KernelParams params,
+                                              LaneMatrixView<T> src,
+                                              T *concat) {
+  const int nlanes = params.nlanes_used;
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
+    // Lane i owns the slice [offsets[i], offsets[i + 1]) of `concat`
+    const int32 out_begin = params.main_q_end_lane_offsets[ilane];
+    const int32 count = params.main_q_end_lane_offsets[ilane + 1] - out_begin;
+    T *out = concat + out_begin;
+    KALDI_CUDA_DECODER_1D_KERNEL_LOOP(idx, count) {
+      out[idx] = src.lane(ilane)[idx];
+    }
+  }
+}
+
+// nonemitting_preprocess_and_contract_kernel
+// Called from PruneAndPreprocess
+// This kernel prunes the aux_q, moves the surviving tokens to the main_q,
+// and adds the preprocessing information necessary for the next ExpandArcs
+// (the expand that follows PruneAndPreprocess is always non-emitting)
+// It prunes the tokens using the cutoff, and prepares the data necessary for
+// ExpandArcs:
+// d_main_q_degrees_prefix_sum, d_main_q_arc_offsets_
+// The prefix sum is done in one-pass here, using a trick (we compute the prefix
+// sum
+// as we fill the main_q)
+__global__ void nonemitting_preprocess_and_contract_kernel(
+    DeviceParams cst_dev_params, KernelParams params) {
+  typedef cub::BlockScan<int2, KALDI_CUDA_DECODER_1D_BLOCK> BlockScan;
+  __shared__ typename BlockScan::TempStorage sh_temp_storage;
+  // We need to move the surviving tokens to the main_q
+  //
+  // sh_main_q_global_block_offset has two purposes :
+  // (1) to know where to store the surviving tokens in the main_q
+  // (2) to perform the prefix sum of the degrees (of the surviving tokens)
+  __shared__ int2 sh_main_q_global_block_offset;
+  const int nlanes = params.nlanes_used;
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
+    LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane);
+    const int32 aux_q_end = lane_counters->post_expand_aux_q_end;
+    const IntegerCostType int_cutoff = lane_counters->int_cutoff;
+    // Keeping whole CTA alive. We'll use __syncthreads()
+    KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(block_offset, thread_idx,
+                                                   aux_q_end) {
+      const int32 aux_q_idx = block_offset + thread_idx;
+      const int32 ichannel = params.channel_to_compute[ilane];
+      int32 degree = 0;
+      int32 arc_start = -1;
+      StateId token_state;
+      IntegerCostType token_int_cost;
+      // We've kept the whole CTA alive. Now we keep only those with a valid
+      // token
+      if (aux_q_idx < aux_q_end) {
+        const int2 both =
+            cst_dev_params.d_aux_q_state_and_cost.lane(ilane)[aux_q_idx];
+        token_state = both.x;
+        token_int_cost = both.y;
+
+        if (token_int_cost < int_cutoff) {
+          // We'll keep that token. Loading its arc degree/csr offset now.
+          arc_start = cst_dev_params.d_arc_ne_offsets[token_state];
+          const int32 arc_end =
+              cst_dev_params.d_arc_ne_offsets[token_state + 1];
+          degree = arc_end - arc_start;
+        }
+      }
+
+      // If we've set a different arc_start,
+      // this thread has a valid unpruned token
+      int32 is_pruned = (arc_start == -1);
+
+      // We now know which tokens will be moved to the main_q, the remaining
+      // will be pruned
+      // we now compute a prefix sum inside the CUDA block to determine the
+      // local indexes of the unpruned tokens
+      // the first unpruned token will have an index of 0, the second 1, ...
+      // We also need to compute the prefix sum of the arc degrees
+      // we start by doing a local prefix sum inside the CUDA block
+      int2 block_prefix_sum_narcs_and_end = {degree, (is_pruned ? 0 : 1)};
+      const int2 zero2 = {0, 0};
+
+      // Computing the prefix sum (exclusive)
+      BlockScan(sh_temp_storage)
+          .ExclusiveScan(block_prefix_sum_narcs_and_end,
+                         block_prefix_sum_narcs_and_end, zero2, PlusPlus());
+
+      if (KALDI_CUDA_DECODER_IS_LAST_1D_THREAD()) {
+        // This conditional branch is entered by the last thread
+        // Because it is the last, the prefix_sum of that thread contains the
+        // sum of all elements
+
+        // We also add the value from this thread - the prefix sum is exclusive
+        // For the sum, we want it inclusive
+        int2 block_sum = block_prefix_sum_narcs_and_end;
+        block_sum.x += degree;
+        block_sum.y += is_pruned ? 0 : 1;
+
+        // Doing two things at the same time :
+        // requesting a spot in the main_q to store the surviving tokens from
+        // this CTA
+        // We also increment the narcs value. atomic64.x will contain the
+        // number of arcs in the main_q up until the atomic64.y index
+        // That's all we need to finish our prefix sum. We add this global
+        // offset.
+        // NOTE: the local below shadows the loop's block_offset variable.
+
+        // First atomic to check if we are not overflowing main_q.
+        int block_offset =
+            atomicAdd(&lane_counters->main_q_requested, block_sum.y);
+
+        // Verify that we do not overflow
+        if (block_offset + block_sum.y < cst_dev_params.main_q_capacity) {
+          // we don't overflow we can safely grab a spot in the main_q
+          sh_main_q_global_block_offset =
+              atomicAddI2(&lane_counters->main_q_narcs_and_end, block_sum);
+        } else {
+          // our update would overflow
+          lane_counters->q_overflow |= OVERFLOW_MAIN_Q;  // for the host
+          sh_main_q_global_block_offset.y =
+              cst_dev_params.main_q_capacity;  // used as flag to broadcast the
+                                               // information in the CTA
+        }
+      }
+
+      // Syncing because :
+      // - Broadcasting sh_main_q_global_block_offset
+      // - We may reuse sh_temp_storage (cf CUB doc)
+      __syncthreads();
+
+      // Checking if we are overflowing the main_q
+      // All threads are executing the next line
+      if (sh_main_q_global_block_offset.y == cst_dev_params.main_q_capacity)
+        goto end_lane;  // done for this lane
+
+      // If we are executing the following lines it means that we are not
+      // overflowing the queue
+      // We then continue what we were doing
+      if (!is_pruned) {
+        // we will move our unpruned token to the main_q, at index main_q_idx
+        const int32 main_q_idx =
+            sh_main_q_global_block_offset.y + block_prefix_sum_narcs_and_end.y;
+        // Moving the token to the main q
+        cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[main_q_idx] = {
+            token_state, token_int_cost};
+        cst_dev_params.d_main_q_info.lane(ilane)[main_q_idx] =
+            cst_dev_params.d_aux_q_info.lane(ilane)[aux_q_idx];
+        cst_dev_params.d_main_q_acoustic_cost.lane(ilane)[main_q_idx] =
+            cst_dev_params.d_aux_q_acoustic_cost.lane(ilane)[aux_q_idx];
+        // Saving the global prefix sum
+        const int32 prefix_sum_narcs =
+            sh_main_q_global_block_offset.x + block_prefix_sum_narcs_and_end.x;
+        cst_dev_params.d_main_q_degrees_prefix_sum.channel(
+            ichannel)[main_q_idx] = prefix_sum_narcs;
+        // Saving the CSR arc offset for that token's state
+        // it will be used by the expand kernel, and avoids doing a new random
+        // memory access in the expand kernel
+        cst_dev_params.d_main_q_arc_offsets.channel(ichannel)[main_q_idx] =
+            arc_start;
+      }
+    }
+
+  end_lane:;  // empty statement
+  }
+}
+
+// UpdateAdaptiveBeam is used in ExpandArcs
+// When we generate new tokens by traversing arcs, 
+// we can end up creating a lot of tokens, if the current frame 
+// generated loglikelihoods too uniform for instance (we don't have
+// any good tokens that will reduce the cutoff, so we end up generating
+// a lot of tokens)
+// To avoid overflowing the aux_q, we apply a decreasing beam.
+// With aux_q_end being the current aux_q size, we have a decrease function f, with
+// adaptive_beam = f(aux_q_end)
+// f is a decreasing piecewise constant function
+// Please note that when processing tokens, we usually have tens of thousands of threads
+// generating tokens. Those are already in flight, and will not reload the beam immediately.
+// It means that we need to start reducing the beam as soon as we detect that we are generating more tokens than
+// expected.
+// We can configure the function f using KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT
+// and KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS.
+// We will use default_beam for the first max_tokens_per_frame/KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT
+// tokens in the aux_q.
+// Once we reach that number, the adaptive beam is halved (beam /= 2) every adaptive_beam_bin_width tokens,
+// the bin width presumably being derived from KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS (set on the host side)
+//
+// x-axis : aux_q_end. How many tokens are already in the aux_q
+// y-axis : adaptive_beam = f(aux_q_end)
+// default_beam _| ________________
+//               |               /\ _________
+//               |                |          _________
+//            0 _|   static_segment                   _________
+//               |________________________________________________
+//               |                                             |     
+//   aux_q_end=  0                                    max_tokens_per_frame
+// We have :     
+// static_segment = max_tokens_per_frame/KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT
+// and KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS = 3
+__device__ void UpdateAdaptiveBeam(const DeviceParams &cst_dev_params,
+                                   const int aux_q_index_block_offset,
+                                   IntegerCostType min_int_cost,
+                                   int2 *adaptive_int_beam_with_validity_index,
+                                   LaneCounters *lane_counters) {
+  int32 valid_until = adaptive_int_beam_with_validity_index->y;
+  // The beam held by the caller still covers this offset: nothing to change
+  if (valid_until > aux_q_index_block_offset) return;
+
+  // Halve the beam once for every bin the offset has moved past
+  CostType halved = orderedIntToFloat(adaptive_int_beam_with_validity_index->x);
+  do {
+    halved /= 2;
+    valid_until += cst_dev_params.adaptive_beam_bin_width;
+  } while (valid_until <= aux_q_index_block_offset);
+
+  const IntegerCostType beam_int = floatToOrderedInt(halved);
+  adaptive_int_beam_with_validity_index->x = beam_int;
+  adaptive_int_beam_with_validity_index->y = valid_until;
+  // The atomics below are separate, so another block may briefly observe a
+  // partial update; at worst that block is late in narrowing its beam. They
+  // could be merged into one atomic64 once floatToOrderedInt is unsigned
+  atomicMin(&lane_counters->adaptive_int_beam_with_validity_index.x, beam_int);
+  atomicMax(&lane_counters->adaptive_int_beam_with_validity_index.y,
+            valid_until);
+  IntegerCostType new_cutoff = INT_MAX;
+  if (min_int_cost < INT_MAX)
+    new_cutoff = floatToOrderedInt(orderedIntToFloat(min_int_cost) + halved);
+  atomicMin(&lane_counters->int_cutoff, new_cutoff);
+}
+
+// ExpandArc kernel
+// This kernel does the actual work of traversing arcs
+//
+// Pseudo code :
+// for all token tok in main_q[main_q_offset...end]:
+//      u = tok.next_state
+//      for all arc a(u->v) in the FST:
+//          v_cost = tok.cost + a.cost + acoustic_cost
+//
+//          if v_cost < cutoff and v_cost < best_state_cost[v]
+//              generate token associated to v, add to aux_q
+//              if necessary update cutoff
+//              if aux_q is getting full, reduce beam
+//
+// For more information please refer to http://kaldi-asr.org/doc/decoders.html
+//
+// ExpandArc relies on some preprocessed data to be able to function
+// for instance, it needs the prefix sum of the arc degrees of all token.state in
+// the main_q
+// We need to call a Preprocess kernel before ExpandArc
+//
+// ExpandArc is used for both emitting and nonemitting phases
+// Differences between emitting and nonemitting :
+//      1) cst_dev_params.d_main_q_arc_offsets contains offsets to either
+//      emitting or nonemitting arcs.
+//         It is transparent for this kernel. The differentiation was done in
+//         the Preprocess kernel,
+//         which is responsible for filling the d_main_q_arc_offsets array
+//      2) Computation of the acoustic cost. If nonemitting, it is equal to 0.
+//      If emitting, we need
+//         to use values from the acoustic model (through the
+//         params.loglikelihoods_ptrs array)
+//
+// Note : ExpandArc is not the only kernel able to traverse arcs.
+// FinalizeProcessNonemitting contains a simplified version of expand for only
+// one CUDA block
+template <bool IS_EMITTING>
+__global__ void expand_arcs_kernel(DeviceParams cst_dev_params,
+                                   KernelParams params) {
+  // BlockScan that we will use to compute token indexes in the output queue,
+  // and to find the min cost in the block
+  typedef cub::BlockScan<int2, KALDI_CUDA_DECODER_1D_BLOCK> BlockScan;
+  __shared__ typename BlockScan::TempStorage sh_temp_storage_scan;
+
+  // This kernel writes the new token to the output queue aux_q
+  // We will request a spot to store all the new tokens created by threads in
+  // this CUDA block
+  // sh_aux_q_index_block_offset indicates where to store them in the aux_q
+  // tokens created in this CUDA block will be stored in :
+  // aux_q[sh_aux_q_index_block_offset], aux_q[sh_aux_q_index_block_offset + 1],
+  __shared__ int32 sh_aux_q_index_block_offset;
+  const int nlanes = params.nlanes_used;
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
+    LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane);
+    const int32 main_q_offset = lane_counters->main_q_local_offset;
+    const int32 main_q_end = lane_counters->main_q_narcs_and_end.y;
+    const int32 total_narcs = lane_counters->main_q_narcs_and_end.x;
+    KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(block_offset, thread_idx,
+                                                   total_narcs) {
+      int2 adaptive_int_beam_with_validity_index =
+          lane_counters->adaptive_int_beam_with_validity_index;
+      const int32 ichannel = params.channel_to_compute[ilane];
+      // Important : this thread is not responsible for a token in the input
+      // queue main_q
+      // but for an arc, going out of a token in the main_q
+      // The main_q contains in total total_narcs
+      // and this thread will compute the main_q_arc_index-th arc of the main_q
+      // For instance, first thread in the grid with threadIdx.x == 0 and
+      // blockIdx.x == 0
+      // will process the first arc of the token in main_q[main_q_offset + 0]
+      // (if that token has at least one arc)
+      //
+      // This ensures a perfect one thread = one arc load balancing
+      // but we have work to do to know exactly which arc is the
+      // main_q_arc_index-th arc
+      // (what's its source ? its destination ? its arc_idx in the FST CSR ?)
+      int32 main_q_arc_index = block_offset + thread_idx;
+      // We'll need those variables later in the kernel
+      // we declare them outside of the "valid_input" scope
+      // to be able to access them later
+      int32 main_q_idx;
+      int32 arc_idx;
+      StateId arc_next_state;
+      IntegerCostType int_total_cost = INT_MAX;
+      CostType acoustic_cost = 0.0f;
+      if (main_q_arc_index < total_narcs) {
+        // Current thread must take care of main_q_arc_index-th arc
+        // we need to know what's the source of that arc
+        // ie which token.state in main_q does it start from ?
+        // We use a binary search in the prefix sum of the token's degree to get
+        // that information
+        //
+        // Example : main_q contains 3 tokens
+        // - First token is associated to a state which has 3 outgoing arc
+        // - Second token is associated to a state which has 0 outgoing arc
+        // - Third token is associated to a state which has 2 outgoing arc
+        //
+        // We store the degrees in an array :
+        // [3, 0, 2]
+        //
+        // We then compute the exclusive prefix sum of that array :
+        // [0, 3, 3, 5]
+        //
+        // In total, we have 5 arcs in the main_q. ExpandArc will use 5 threads.
+        //
+        // Let's say we are the fifth thread in ExpandArc.
+        // we have threadIdx.x == 4, and blockIdx.x == 0
+        // it gives us main_q_arc_index == 4
+        // From there we have no idea what we're supposed to do next, we need to
+        // have information about the
+        // arc that we're supposed to traverse
+        //
+        // To do that, we look for the maximum index maxle_i in the prefix sum
+        // array such that prefix_sum[i] <= 4
+        //
+        // [0, 3, 3, 5]
+        //          |
+        //         here
+        // maxle_i = 2
+        // it means that our source token is at index 2 in the main_q
+        // and we are computing the arc at index (main_q_arc_index -
+        // prefix_sum[maxle_i]) of that token
+        // ie the arc at index (4-3) = 1, the second arc of the third token in
+        // main_q (the token at index 2)
+
+        // Searching for the source of the arc that we will process
+        // (main_q_arc_index)
+        // we could preprocess the search in the preprocess kernels - for now
+        // this kernel is fast enough
+        const int32 *degrees_prefix_sum =
+            cst_dev_params.d_main_q_degrees_prefix_sum.channel(ichannel);
+        main_q_idx = binsearch_maxle(degrees_prefix_sum, main_q_arc_index,
+                                     main_q_offset, main_q_end - 1);
+
+        // state_first_arc_idx_in_main_q
+        // d_main_q_degrees_prefix_sum contains the prefix sum of the
+        // degrees of all tokens in the main_q
+        // d_main_q_degrees_prefix_sum[main_q_idx] contains the number of arc
+        // in the main_q until that token
+        const int32 state_first_arc_idx_in_main_q =
+            degrees_prefix_sum[main_q_idx];
+
+        // arc_offset_start is the offset in the CSR, to find the arcs
+        // related to the state main_q_state_[main_q_idx]
+        // it was set by the preprocess kernel
+        const int32 arc_offset_start =
+            cst_dev_params.d_main_q_arc_offsets.channel(ichannel)[main_q_idx];
+
+        // local_arc_index is the (0-based) arc index for that state
+        // if local_arc_index == 2, we will process the third arc
+        // of state main_q_state_[main_q_idx]
+        const int32 local_arc_index =
+            main_q_arc_index - state_first_arc_idx_in_main_q;
+
+        // corresponding arc_idx in the FST
+        arc_idx = arc_offset_start + local_arc_index;
+
+        // Destination of that arc
+        arc_next_state = cst_dev_params.d_arc_nextstates[arc_idx];
+
+        // Building the total cost incrementally
+        // we'll add the acoustic cost and the old token's cost
+        const CostType arc_fixed_cost = cst_dev_params.d_arc_weights[arc_idx];
+        const CostType prev_token_cost = orderedIntToFloat(
+            cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[main_q_idx]
+                .y);
+        CostType total_cost = prev_token_cost + arc_fixed_cost;
+        if (IS_EMITTING) {
+          const int32 arc_ilabel = cst_dev_params.d_arc_pdf_ilabels[arc_idx];
+          acoustic_cost = -params.loglikelihoods_ptrs[ilane][arc_ilabel];
+          total_cost += acoustic_cost;
+        }
+        int_total_cost = floatToOrderedInt(total_cost);
+
+        // If the total_cost is too large compared to our cutoff (beam search)
+        // then let's drop it
+        const IntegerCostType int_cutoff = lane_counters->int_cutoff;
+        if (int_total_cost >= int_cutoff) int_total_cost = INT_MAX;
+      }
+
+      // If int_total_cost < INT_MAX, it means that :
+      // - this thread had a valid input (main_q_arc_index < total_narcs)
+      // - the total_cost of the generated token is < cutoff
+      // We will then add that new token in the output queue, aux_q
+      // We need to know where to put that token in the aux_q
+      // we'll first compute its index inside the CUDA block
+      // the first valid output token in the CUDA block will have index 0,
+      // the second index 1... We compute that using a prefix sum
+      //
+      // We also need to find the overall min cost in the CUDA block
+      // a prefix sum is a scan operation, and a min a reduce operation
+      // we can perform a reduce operation using a scan (using the last value)
+      // we compute the prefix sum and the min in one scan, using an int2
+      // {cost, has_successor} combined with the MinPlus operator
+      const int32 has_successor = (int_total_cost < INT_MAX) ? 1 : 0;
+
+      int2 int_cost_and_index = {int_total_cost, has_successor};
+      BlockScan(sh_temp_storage_scan)
+          .InclusiveScan(int_cost_and_index, int_cost_and_index, MinPlus());
+      // After the inclusive scan, the last thread of the block holds the
+      // block-wide results: .x is used below as the block's min cost and .y
+      // as the number of successors generated by the block
+      if (KALDI_CUDA_DECODER_IS_LAST_1D_THREAD()) {
+        // We are in a divergent branch
+        // This is the last thread. The last value of the inclusive scan is the
+        // total
+        const int32 total_successors_in_block = int_cost_and_index.y;
+        // Requesting a spot of size total_successors_in_block in the aux_q
+
+        // note:  using 2 atomics here to avoid adding another kernel
+        // first request more space
+        const int aux_q_index_block_offset = atomicAdd(
+            &lane_counters->aux_q_requested, total_successors_in_block);
+
+        // check for overflow in aux_q
+        // We try to prevent an overflow from happening using an adaptive beam
+        // (cf UpdateAdaptiveBeam)
+        if (aux_q_index_block_offset + total_successors_in_block <
+            cst_dev_params.aux_q_capacity) {
+          // no overflow
+
+          // grab the aux_q offset
+          sh_aux_q_index_block_offset =
+              atomicAdd(&lane_counters->aux_q_end, total_successors_in_block);
+
+          // We are not overflowing the queue, updating the global values
+          if (IS_EMITTING) {
+            // We can find a lower global_min_cost only in the emitting stage
+            IntegerCostType global_min_int_cost = lane_counters->min_int_cost;
+            IntegerCostType local_min_int_cost = int_cost_and_index.x;
+            // if we found a lower min_cost, update the global value
+            if (local_min_int_cost < global_min_int_cost) {
+              global_min_int_cost = local_min_int_cost;
+              atomicMin(&lane_counters->min_int_cost, global_min_int_cost);
+              CostType beam =
+                  orderedIntToFloat(adaptive_int_beam_with_validity_index.x);
+              IntegerCostType new_int_cutoff = floatToOrderedInt(
+                  orderedIntToFloat(local_min_int_cost) + beam);
+              atomicMin(&lane_counters->int_cutoff, new_int_cutoff);
+            }
+            int32 beam_valid_until_idx =
+                adaptive_int_beam_with_validity_index.y;
+            if (aux_q_index_block_offset >= beam_valid_until_idx) {
+              // This beam is no longer valid. Updating it
+              UpdateAdaptiveBeam(
+                  cst_dev_params, aux_q_index_block_offset, global_min_int_cost,
+                  &adaptive_int_beam_with_validity_index, lane_counters);
+            }
+          }
+        } else {
+          // sh_aux_q_index_block_offset is in shared memory
+          // its value is currently invalid (overflow)
+          // we set it to a special value and use it as a flag to broadcast
+          // the fact that we have an overflow and that all threads should exit
+          sh_aux_q_index_block_offset = cst_dev_params.aux_q_capacity;
+
+          // Setting the flag for the host. It will be used to print a warning
+          // to stderr
+          lane_counters->q_overflow |= OVERFLOW_AUX_Q;
+
+          // We do not jump to end_lane now, because only
+          // the last thread of the block is executing this
+          // We wait until the end of the divergent branch
+        }
+      }
+
+      // Sync'ing for two reasons :
+      // - Broadcasting sh_aux_q_index_block_offset
+      // - reusing sh_temp_storage (cf CUB's doc)
+      __syncthreads();
+      // The only case where we can have that condition met,
+      // is if we detected an overflow in the previous lines
+      if (sh_aux_q_index_block_offset == cst_dev_params.aux_q_capacity)
+        goto end_lane;  // done for this lane
+      //
+      // If we're executing the following lines it means everything
+      // is valid and we are not overflowing the aux_q
+      //
+      int_cost_and_index.y -= has_successor;  // we want the exclusive sum now
+      const int32 aux_q_block_index = int_cost_and_index.y;
+      const int32 aux_q_index = sh_aux_q_index_block_offset + aux_q_block_index;
+      if (has_successor) {
+        // We save the new token to the aux_q
+        cst_dev_params.d_aux_q_state_and_cost.lane(ilane)[aux_q_index] = {
+            arc_next_state, int_total_cost};
+        cst_dev_params.d_aux_q_acoustic_cost.lane(ilane)[aux_q_index] =
+            acoustic_cost;
+        // Index of the parent token
+        // the parent is the token used as input (source of arc)
+        // that parent is at index main_q_idx in the GPU memory
+        // However, the main_q is emptied before processing a new frame
+        // we need to add the offset related to the previous frames index
+        // we add lane_counters->main_q_global_offset
+        const int32 prev_token =
+            lane_counters->main_q_global_offset + main_q_idx;
+        assert(main_q_idx >= 0 && main_q_idx < cst_dev_params.main_q_capacity);
+        cst_dev_params.d_aux_q_info.lane(ilane)[aux_q_index] = {prev_token,
+                                                                arc_idx};
+      }
+    }
+  end_lane:;  // ";" is an empty statement
+  }
+}
+
+// post_expand_kernel
+// Called after expand_arcs_kernel
+// Takes care of what needs to be done after an expand_arcs_kernel
+// execution. Mostly resetting the beam (if adaptive beam was triggered,
+// the max_active_ kernels will take care of selecting a good beam),
+// resetting the number of arcs in the main_q (we've processed them all),
+// etc.
+// Threads (1,1,1)
+// Blocks (1, nlanes_used, 1)
+template <bool IS_EMITTING>
+__global__ void post_expand_kernel(DeviceParams cst_dev_params,
+                                   KernelParams params) {
+  const int nlanes = params.nlanes_used;
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
+    LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane);
+    const int prev_main_q_end = lane_counters->main_q_narcs_and_end.y;
+    const int prev_n_extra_prev_tokens =
+        lane_counters->main_q_n_extra_prev_tokens;
+    const int aux_q_end = lane_counters->aux_q_end;
+    CostType beam = orderedIntToFloat(lane_counters->int_beam);
+    CostType min_cost = orderedIntToFloat(lane_counters->min_int_cost);
+    // The next step is the contracting step from aux_q to main_q
+    // It will need the aux_q_end value. But it will also empty the aux_q
+    // We're resetting aux_q_end to 0 now, but we're saving its old value
+    // in another place
+    lane_counters->post_expand_aux_q_end = aux_q_end;
+    lane_counters->aux_q_end = 0;
+    lane_counters->aux_q_requested = 0;
+    // We are done processing those arcs
+    lane_counters->main_q_narcs_and_end.x = 0;
+    // Resetting the adaptive beam: if it kicked in during the expand, we go
+    // back to lane_counters->int_beam here, and the max-active process will
+    // take care of selecting the right beam
+    lane_counters->adaptive_int_beam_with_validity_index.x =
+        lane_counters->int_beam;
+    lane_counters->adaptive_int_beam_with_validity_index.y =
+        cst_dev_params.adaptive_beam_static_segment;
+    if (IS_EMITTING) {
+      // the main_q contains the tokens from the previous frame
+      // after emitting, we won't use them anymore to create new tokens
+      // we reset the main_q
+      lane_counters->main_q_narcs_and_end = {0, 0};
+      lane_counters->main_q_requested = 0;
+      // The main_q was flushed - we need to update the global_offset
+      lane_counters->main_q_global_offset += prev_main_q_end;
+      if (threadIdx.x == 0 && blockIdx.x == 0)
+        lane_counters->main_q_extra_prev_tokens_global_offset +=
+            prev_n_extra_prev_tokens;
+      // Restarting the cutoff from the min cost and the beam read above
+      lane_counters->int_cutoff = floatToOrderedInt(min_cost + beam);
+    } else {
+      // Moving local offset. Tokens created by last expand
+      // will be pruned, and survivors will be moved at the end
+      // of the main q. Those tokens will be placed after local_offset
+      lane_counters->main_q_local_offset = prev_main_q_end;
+      // reset requested to end of queue
+      lane_counters->main_q_requested = prev_main_q_end;
+    }
+  }
+}
+
+// Meta-kernel (merging preprocess and expand) but only works with 1 CUDA block
+// Used to avoid calling multiple main kernels (such as expand_arcs_kernel)
+// for the tail of non emitting (lots of iterations with small number of arcs)
+//
+// Code is greatly simplified because we use only one CTA / lane
+//
+// Repeat until new queue empty:
+// 1) Preprocess
+// 2) Expand arcs
+//
+// The preprocess stage is not done on the first iteration, because it was
+// already done by the ProcessAndContract kernel. We always call
+// PruneAndPreprocess before calling FinalizeProcessNonemitting
+//
+// At the end, this kernel finalizes the computation for the current frame,
+// so that it's ready for the next ProcessEmitting
+//
+// This kernel works, but can be greatly simplified now.
+__launch_bounds__(KALDI_CUDA_DECODER_LARGEST_1D_BLOCK, 1) __global__
+    void finalize_process_non_emitting_kernel(DeviceParams cst_dev_params,
+                                              KernelParams params) {
+  typedef cub::BlockScan<int2, KALDI_CUDA_DECODER_LARGEST_1D_BLOCK>
+      Int2BlockScan;
+  typedef cub::BlockScan<int, KALDI_CUDA_DECODER_LARGEST_1D_BLOCK> IntBlockScan;
+  __shared__ typename IntBlockScan::TempStorage sh_temp_storage_int_scan;
+  __shared__ typename Int2BlockScan::TempStorage sh_temp_storage_int2_scan;
+
+  const int nlanes = params.nlanes_used;
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
+    LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane);
+    const int32 ichannel = params.channel_to_compute[ilane];
+    ChannelCounters *channel_counters =
+        cst_dev_params.d_channels_counters.channel(ichannel);
+
+    int2 both = lane_counters->main_q_narcs_and_end;
+    int32 main_q_narcs = both.x;
+    int32 main_q_end = both.y;
+    int32 main_q_local_offset = lane_counters->main_q_local_offset;
+    const int32 main_q_global_offset = lane_counters->main_q_global_offset;
+    // aux_q is empty when this kernel is called
+    int32 aux_q_end = 0;
+    IntegerCostType int_cutoff = lane_counters->int_cutoff;
+    while (main_q_narcs > 0) {  // loop until a pass produces no more arcs
+      // Step 1 : ExpandArcs
+      KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(offset, thread_idx,
+                                                     main_q_narcs) {
+        const int32 main_q_arc_idx = offset + thread_idx;
+        // For details on how this code works, please refer to comments in
+        // expand_arcs
+        IntegerCostType total_int_cost = INT_MAX;
+        int32 arc_idx;
+        StateId arc_next_state;
+        int32 main_q_idx;
+        if (main_q_arc_idx < main_q_narcs) {
+          main_q_idx = binsearch_maxle(
+              cst_dev_params.d_main_q_degrees_prefix_sum.channel(ichannel),
+              main_q_arc_idx, main_q_local_offset, main_q_end - 1);
+
+          const int32 state_first_arc_idx_in_main_q =
+              cst_dev_params.d_main_q_degrees_prefix_sum.channel(
+                  ichannel)[main_q_idx];
+          const int32 arc_offset_start =
+              cst_dev_params.d_main_q_arc_offsets.channel(ichannel)[main_q_idx];
+          arc_idx = arc_offset_start +
+                    (main_q_arc_idx - state_first_arc_idx_in_main_q);
+
+          arc_next_state = cst_dev_params.d_arc_nextstates[arc_idx];
+          CostType arc_weight = cst_dev_params.d_arc_weights[arc_idx];
+          CostType prev_token_cost =
+              orderedIntToFloat(cst_dev_params.d_main_q_state_and_cost
+                                    .channel(ichannel)[main_q_idx]
+                                    .y);
+          total_int_cost = floatToOrderedInt(arc_weight + prev_token_cost);
+          if (total_int_cost >= int_cutoff) {
+            total_int_cost = INT_MAX;  // above cutoff
+          }
+        }
+        const int32 has_successor = (total_int_cost < INT_MAX) ? 1 : 0;
+
+        int32 local_aux_q_idx;
+        int32 nsuccessors;
+        IntBlockScan(sh_temp_storage_int_scan)
+            .ExclusiveSum(has_successor, local_aux_q_idx,
+                          nsuccessors);  // aggregate
+
+        // Checking if we are overflowing the aux_q
+        if ((aux_q_end + nsuccessors) >= cst_dev_params.aux_q_capacity) {
+          lane_counters->q_overflow |= OVERFLOW_AUX_Q;
+          // nothing to revert in global memory
+          goto finalize_lane;
+        }
+
+        if (has_successor) {
+          const int32 aux_q_idx = aux_q_end + local_aux_q_idx;
+          const int32 prev_token_idx = main_q_global_offset + main_q_idx;
+          cst_dev_params.d_aux_q_state_and_cost.lane(ilane)[aux_q_idx] = {
+              arc_next_state, total_int_cost};
+          cst_dev_params.d_aux_q_info.lane(ilane)[aux_q_idx] = {prev_token_idx,
+                                                                arc_idx};
+          cst_dev_params.d_aux_q_acoustic_cost.lane(ilane)[aux_q_idx] =
+              0.0f;  // we are always non-emitting in this kernel
+        }
+        aux_q_end += nsuccessors;
+        // sync: sh_temp_storage_int_scan is reused by the next iteration's scan
+        __syncthreads();
+      }
+
+      // Step 2 : PreprocessAndContract
+      // Reset for new iteration
+      main_q_narcs = 0;
+      main_q_local_offset = main_q_end;  // next pass starts at tokens added below
+      KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(offset, thread_idx,
+                                                     aux_q_end) {
+        const int32 aux_q_idx = offset + thread_idx;
+        int32 degree = 0;
+        int32 start = -1;  // stays -1 when this thread has no aux_q token
+        StateId token_state;
+        IntegerCostType token_int_cost;
+        if (aux_q_idx < aux_q_end) {
+          int2 both =
+              cst_dev_params.d_aux_q_state_and_cost.lane(ilane)[aux_q_idx];
+          token_state = both.x;
+          token_int_cost = both.y;
+          // NOTE(review): the token cost is not re-checked against a cutoff here
+          // We are non-emitting in this kernel, using ne offsets
+          start = cst_dev_params.d_arc_ne_offsets[token_state];
+          int32 end = cst_dev_params.d_arc_ne_offsets[token_state + 1];
+          degree = end - start;
+        }
+        int has_valid_nonpruned_token = (start != -1) ? 1 : 0;
+        int2 narcs_and_ntokens_prefix_sum = {degree, has_valid_nonpruned_token};
+        int2 aggregate, zero2 = {0, 0};
+        Int2BlockScan(sh_temp_storage_int2_scan)
+            .ExclusiveScan(narcs_and_ntokens_prefix_sum,
+                           narcs_and_ntokens_prefix_sum, zero2, PlusPlus(),
+                           aggregate);
+        // Checking if we are not overflowing the main_q
+        const int32 total_ntokens = aggregate.y;
+        if ((main_q_end + total_ntokens) >= cst_dev_params.main_q_capacity) {
+          lane_counters->q_overflow |= OVERFLOW_MAIN_Q;
+          goto finalize_lane;
+        }
+        const int32 degree_prefix_sum =
+            main_q_narcs + narcs_and_ntokens_prefix_sum.x;
+        const int32 degree_sum = aggregate.x;
+        main_q_narcs += degree_sum;
+        if (has_valid_nonpruned_token) {
+          const int32 local_main_q_idx = narcs_and_ntokens_prefix_sum.y;
+          const int32 main_q_idx = main_q_end + local_main_q_idx;
+
+          cst_dev_params.d_main_q_arc_offsets.channel(ichannel)[main_q_idx] =
+              start;
+          cst_dev_params.d_main_q_degrees_prefix_sum.channel(
+              ichannel)[main_q_idx] = degree_prefix_sum;
+          cst_dev_params.d_main_q_state_and_cost.channel(
+              ichannel)[main_q_idx] = {token_state, token_int_cost};
+          cst_dev_params.d_main_q_info.lane(ilane)[main_q_idx] =
+              cst_dev_params.d_aux_q_info.lane(ilane)[aux_q_idx];
+          cst_dev_params.d_main_q_acoustic_cost.lane(ilane)[main_q_idx] =
+              cst_dev_params.d_aux_q_acoustic_cost.lane(ilane)[aux_q_idx];
+        }
+        main_q_end += total_ntokens;
+        __syncthreads();  // sh_temp_storage_int2_scan is reused by the next scan
+      }
+      aux_q_end = 0;  // aux_q is now empty
+    }
+
+  finalize_lane:  // also reached through the two overflow gotos above
+    if (threadIdx.x == 0) {
+      // This main_q is now final for that frame
+      int32 min_int_cost = lane_counters->min_int_cost;
+      lane_counters->main_q_narcs_and_end = {0, main_q_end};
+      lane_counters->main_q_local_offset = 0;
+
+      // Resetting values used by GetBestCost
+      // This is just a reset : If we need to read it, we need to call
+      // GetBestCost
+      channel_counters->min_int_cost_and_arg_with_final.x =
+          INT_MAX;  // it will be set with atomicMins
+      channel_counters->min_int_cost_and_arg_without_final.x =
+          min_int_cost;  // we already know what the min cost is
+    }
+  }
+}
+
+// GetBestCost :
+// Finds all tokens with a cost in [min_cost;min_cost+lattice_beam[
+// Add the final_costs if use_final_probs
+// Does the computation in two steps
+//
+// Step 1: Find the value of min_cost, i.e. the minimum cost in the last token
+// queue
+// (the queue generated by the last frame computed)
+// We set both channel_counters->min_int_cost_and_arg_without_final
+// and channel_counters->min_int_cost_and_arg_with_final
+// The second one adds final_cost[token.state] before looking for the min
+__global__ void get_best_cost_step1_kernel(DeviceParams cst_dev_params,
+                                           KernelParams params,
+                                           bool use_final_probs,
+                                           CostType fst_zero) {
+  const int nlanes = params.nlanes_used;
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
+    LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane);
+    const int32 ichannel = params.channel_to_compute[ilane];
+    ChannelCounters *channel_counters =
+        cst_dev_params.d_channels_counters.channel(ichannel);
+    const int32 main_q_end = channel_counters->prev_main_q_narcs_and_end.y;
+    const int32 global_offset = channel_counters->prev_main_q_global_offset;
+    const int32 min_int_cost =
+        channel_counters->min_int_cost_and_arg_without_final.x;
+    KALDI_CUDA_DECODER_1D_KERNEL_LOOP(idx, main_q_end) {
+      if (idx == 0)
+        lane_counters->n_within_lattice_beam =
+            0;  // will be used in the next kernel
+      const int2 both =
+          cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[idx];
+      const int token_state = both.x;
+      const int token_int_cost = both.y;
+      CostType cost = orderedIntToFloat(token_int_cost);
+      IntegerCostType int_cost = floatToOrderedInt(cost);  // int->float->int
+      int32 global_idx = global_offset + idx;
+      // We already know the min cost (without final costs); we only need the
+      // index of one token reaching it (plain store, any tying token may win)
+      if (int_cost == min_int_cost)
+        channel_counters->min_int_cost_and_arg_without_final.y = global_idx;
+
+      if (use_final_probs) {
+        const CostType final_cost =
+            cst_dev_params.d_fst_final_costs[token_state];
+        IntegerCostType int_cost_with_final =
+            floatToOrderedInt(cost + final_cost);
+        if (final_cost != fst_zero) {  // presumably fst_zero = non-final state
+          int2 min_and_arg = {int_cost_with_final,
+                              global_idx};  // sort by cost, put it first
+          atomicMinI2(&channel_counters->min_int_cost_and_arg_with_final,
+                      min_and_arg);
+        }
+      }
+    }
+  }
+}
+
+// Step2: step1 has found the min cost of the queue, with and without final
+// costs. If use_final_probs is true AND at least one token sits on a final FST
+// state, final costs are added and tokens on non-final states are ignored;
+// otherwise raw token costs are used. Every token with a cost strictly below
+// min_cost + lattice_beam is appended to d_list_final_tokens_in_main_q.
+__global__ void get_best_cost_step2_kernel(DeviceParams cst_dev_params,
+                                           KernelParams params,
+                                           bool use_final_probs,
+                                           CostType fst_zero) {
+  const int nlanes = params.nlanes_used;
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
+    LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane);
+    const int32 ichannel = params.channel_to_compute[ilane];
+    const ChannelCounters *channel_counters =
+        cst_dev_params.d_channels_counters.channel(ichannel);
+    const int32 main_q_end = channel_counters->prev_main_q_narcs_and_end.y;
+    const int32 global_offset = channel_counters->prev_main_q_global_offset;
+    const int2 min_int_cost_and_arg_with_final =
+        channel_counters->min_int_cost_and_arg_with_final;
+    const int2 min_int_cost_and_arg_without_final =
+        channel_counters->min_int_cost_and_arg_without_final;
+    bool has_reached_final = (min_int_cost_and_arg_with_final.x != INT_MAX);
+    // Final costs are only taken into account if the caller asked for them
+    // (use_final_probs) and the token list contains at least one final state
+    bool compute_final = use_final_probs && has_reached_final;
+    IntegerCostType min_cost_to_use =
+        compute_final ? min_int_cost_and_arg_with_final.x
+                      : min_int_cost_and_arg_without_final.x;
+
+    // A token belongs to the output lattice if its cost is strictly below
+    // lattice_cutoff = best cost + lattice_beam
+    CostType lattice_cutoff =
+        orderedIntToFloat(min_cost_to_use) + cst_dev_params.lattice_beam;
+    IntegerCostType lattice_int_cutoff = floatToOrderedInt(lattice_cutoff);
+    KALDI_CUDA_DECODER_1D_KERNEL_LOOP(idx, main_q_end) {
+      // The thread handling token 0 copies the results to the lane counters:
+      // channel counters are never moved back to host, lane counters are
+      // (after this kernel). NOTE(review): being inside the token loop, this
+      // presumably never runs for an empty queue - confirm with the loop macro
+      if (idx == 0) {
+        // Best (cost, token index) pair for the mode in use, and final flag
+        lane_counters->min_int_cost_and_arg =
+            compute_final ? min_int_cost_and_arg_with_final
+                          : min_int_cost_and_arg_without_final;
+        lane_counters->has_reached_final = has_reached_final;
+      }
+      // Loading the token; its cost will be compared with lattice_int_cutoff
+      const int2 both =
+          cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[idx];
+      const int32 token_state = both.x;
+      int32 token_int_cost = both.y;
+      if (compute_final) {
+        const CostType final_cost =
+            cst_dev_params.d_fst_final_costs[token_state];
+        const CostType token_cost = orderedIntToFloat(token_int_cost);
+        // final_cost == fst_zero -> not a final state: INT_MAX excludes it
+        token_int_cost = (final_cost != fst_zero)
+                             ? floatToOrderedInt(token_cost + final_cost)
+                             : INT_MAX;
+      }
+      if (token_int_cost < lattice_int_cutoff) {
+        // That token will be included in the lattice (last frame):
+        // reserve a slot, then save its global index and its (final) cost
+        int list_idx = atomicAdd(&lane_counters->n_within_lattice_beam, 1);
+        cst_dev_params.d_list_final_tokens_in_main_q.lane(ilane)[list_idx] = {
+            global_offset + idx, token_int_cost};
+      }
+    }
+  }
+}
+
+// compute_costs_histogram_kernel
+// Used in ApplyMaxActiveAndReduceBeam. For each lane with more than max_active
+// tokens, histograms (token.cost - min_cost) over the aux_q or the main_q
+__global__ void compute_costs_histogram_kernel(DeviceParams cst_dev_params,
+                                               KernelParams params,
+                                               bool use_aux_q) {
+  const int nlanes = params.nlanes_used;
+  typedef cub::BlockHistogram<BinId, KALDI_CUDA_DECODER_1D_BLOCK, 1,
+                              KALDI_CUDA_DECODER_HISTO_NBINS + 1>
+      BlockHistogram;
+  __shared__ typename BlockHistogram::TempStorage temp_storage;
+  __shared__ unsigned int smem_histogram[KALDI_CUDA_DECODER_HISTO_NBINS + 1];
+
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
+    const int32 ichannel = params.channel_to_compute[ilane];
+    const LaneCounters *lane_counters =
+        cst_dev_params.d_lanes_counters.lane(ilane);
+    const int32 q_end = use_aux_q ? lane_counters->post_expand_aux_q_end
+                                  : lane_counters->main_q_narcs_and_end.y;
+    if (q_end <= cst_dev_params.max_active) continue;  // nothing to do
+
+    // Zeroing the shared-memory histogram of this CTA for this lane
+    BlockHistogram(temp_storage).InitHistogram(smem_histogram);
+    CostType beam = orderedIntToFloat(lane_counters->int_beam);
+    CostType min_cost = orderedIntToFloat(lane_counters->min_int_cost);
+    CostType bin_width = beam / KALDI_CUDA_DECODER_HISTO_NBINS;
+
+    // Syncs inside the loop body: all threads of the CTA must iterate
+    KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(block_offset, thread_idx,
+                                                   q_end) {
+      const int32 q_idx = block_offset + thread_idx;
+      // The last bin (index NBINS) is the default bin. It collects everything
+      // we don't want to count: costs not below min_cost + beam, and threads
+      // past the end of the queue
+      BinId bin_id[1];
+      bin_id[0] = KALDI_CUDA_DECODER_HISTO_NBINS;
+      if (q_idx < q_end) {
+        IntegerCostType int_cost =
+            use_aux_q
+                ? cst_dev_params.d_aux_q_state_and_cost.lane(ilane)[q_idx].y
+                : cst_dev_params.d_main_q_state_and_cost
+                      .channel(ichannel)[q_idx]
+                      .y;
+        CostType cost = orderedIntToFloat(int_cost);
+        CostType extra = cost - min_cost;
+        // Token within the beam: bin = extra / bin_width (round-down division)
+        if (extra < beam) {
+          bin_id[0] = (BinId)__fdiv_rd(extra, bin_width);
+        }
+      }
+      BlockHistogram(temp_storage).Composite(bin_id, smem_histogram);  // sync
+      __syncthreads();  // reusing temp_storage
+    }
+
+    // Not using the macros 1D_LOOP because that loop is only within a CTA
+    for (int32 bin_id_w = threadIdx.x;
+         bin_id_w < KALDI_CUDA_DECODER_HISTO_NBINS;
+         bin_id_w += KALDI_CUDA_DECODER_1D_BLOCK) {
+      // Adding the histogram of this CTA to the global histogram of the lane
+      // The last (default) bin is skipped, cf above
+      int32 s_count = (int32)smem_histogram[bin_id_w];
+      atomicAdd(&cst_dev_params.d_histograms.lane(ilane)[bin_id_w], s_count);
+    }
+    // Done reading smem_histogram before it is reset for another lane
+    __syncthreads();
+  }
+}
+
+// update_beam_using_histogram_kernel
+// Used in ApplyMaxActiveAndReduceBeam. For each lane whose queue (aux_q if
+// use_aux_q, main_q otherwise) holds more than max_active tokens, uses the
+// histogram computed in compute_costs_histogram_kernel to find where to cut
+// (where to set the beam) to keep only ~max_active tokens.
+// Important: use only one CTA per lane
+__global__ void update_beam_using_histogram_kernel(DeviceParams cst_dev_params,
+                                                   KernelParams params,
+                                                   bool use_aux_q) {
+  typedef cub::BlockScan<int, KALDI_CUDA_DECODER_1D_BLOCK> BlockScan;
+  __shared__ typename BlockScan::TempStorage temp_storage;
+
+  const int nlanes = params.nlanes_used;
+  const int max_active = cst_dev_params.max_active;
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
+    LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane);
+    const int32 q_end = use_aux_q ? lane_counters->post_expand_aux_q_end
+                                  : lane_counters->main_q_narcs_and_end.y;
+    if (q_end <= max_active) continue;  // nothing to do
+    CostType beam = orderedIntToFloat(lane_counters->int_beam);
+    CostType min_cost = orderedIntToFloat(lane_counters->min_int_cost);
+    // We now have our histogram of the token costs (computed in the previous
+    // kernel). Thread i is responsible for bin i, containing n_i tokens.
+    // The exclusive prefix sum gives each thread s_i = n_0 + ... + n_(i-1),
+    // the number of tokens in the cheaper bins. The thread which detects that
+    // s_i < max_active <= s_i + n_i owns the bin where the max_active-th token
+    // falls: we cut the beam at the upper edge of that bin.
+    //
+    // One thread handles at most one bin (we do not iterate over bins)
+    static_assert(KALDI_CUDA_DECODER_HISTO_NBINS < KALDI_CUDA_DECODER_1D_BLOCK,
+                  "a CTA must have more threads than histogram bins");
+    int bin_id = threadIdx.x;
+    int val = 0;
+    if (bin_id < KALDI_CUDA_DECODER_HISTO_NBINS) {
+      val = cst_dev_params.d_histograms.lane(ilane)[bin_id];
+      cst_dev_params.d_histograms.lane(ilane)[bin_id] =
+          0;  // reset for next time
+    }
+    int prefix_sum;
+    BlockScan(temp_storage).ExclusiveSum(val, prefix_sum);
+
+    if (prefix_sum < max_active && (prefix_sum + val) >= max_active) {
+      // We found our new beam: upper edge of bin bin_id
+      CostType new_beam =
+          (beam / KALDI_CUDA_DECODER_HISTO_NBINS) * (bin_id + 1);
+      IntegerCostType new_int_beam = floatToOrderedInt(new_beam);
+      // Saving our new beam (and the matching cutoff) for this lane
+      lane_counters->int_beam = new_int_beam;
+      lane_counters->adaptive_int_beam_with_validity_index.x = new_int_beam;
+      lane_counters->int_cutoff = floatToOrderedInt(min_cost + new_beam);
+    }
+    // temp_storage is reused if this CTA processes another lane: CUB requires
+    // a barrier between two collectives sharing the same TempStorage
+    __syncthreads();
+  }
+}
+
+//
+// PostProcessingMainQueue kernels.
+// all the following kernels are called when postprocessing a frame
+//
+
+// Filling hashmap values with the tokens that we have in the main queue
+// We do that because multiple tokens associated with the same FST state
+// (but with different arc_idx) can exist in the main_q. We need to detect
+// that situation, count them, detect what the min_cost for that FST state is.
+// It is done using a hashmap
+__global__ void fill_hashmap_with_main_q_kernel(DeviceParams cst_dev_params,
+                                                KernelParams params) {
+  // Each lane has its own hashmap (d_hashmap_values.lane(ilane))
+  const int nlanes = params.nlanes_used;
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
+    const LaneCounters *lane_counters =
+        cst_dev_params.d_lanes_counters.lane(ilane);
+    const int32 main_q_end = lane_counters->main_q_narcs_and_end.y;
+    KALDI_CUDA_DECODER_1D_KERNEL_LOOP(main_q_idx, main_q_end) {
+      // main_q_idx: position of the considered token in the main_q
+      const int32 ichannel = params.channel_to_compute[ilane];
+      if (main_q_idx < main_q_end) {
+        int2 both = cst_dev_params.d_main_q_state_and_cost.channel(
+            ichannel)[main_q_idx];
+        StateId token_state = both.x;
+        IntegerCostType token_int_cost = both.y;
+        int local_idx, hash_idx;  // outputs of the insertion
+        hashmap_insert_or_aggregate(cst_dev_params.d_hashmap_values.lane(ilane),
+                                    token_state, token_int_cost, main_q_idx,
+                                    cst_dev_params.hashmap_capacity, &local_idx,
+                                    &hash_idx);
+        cst_dev_params.d_main_q_n_extra_prev_tokens_local_idx.lane(
+            ilane)[main_q_idx] = local_idx;
+        // local_idx (saved above) is later used as the position of this token
+        // in the list of the tokens sharing its state.
+        // Saving where that token.state ended up in the hashmap (hash_idx)
+        // false = this token is not (yet) the representative of this state.
+        // The representative flag is set in the next kernel: it can only be
+        // decided once all tokens have been added to the hashmap, which will
+        // be the case when this kernel returns.
+        SetFSTStateHashIndex(
+            hash_idx, false,
+            &cst_dev_params.d_main_q_state_hash_idx.lane(ilane)[main_q_idx]);
+      }
+    }
+  }
+}
+
+// emitting_preprocess_and_list_extra_prev_tokens_step[i]_kernel kernels
+// Called in PostProcessingMainQueue
+// They do two things:
+// - do the "emitting preprocessing". I.e. doing the preprocessing necessary for
+// the future ExpandArcsEmitting that may be done next (if the current frame is
+// not the last one)
+// It consists of filling the d_main_q_degrees_prefix_sum of the emitting arc
+// degrees of the tokens + setting d_main_q_arc_offsets
+// - when we have multiple tokens associated with the same FST state S, we will
+// list them in d_main_q_extra_prev_tokens. We need to know where to put them
+// in that array, so we also compute a prefix sum to get those indexes.
+// We'll then save the location of each extra tokens list (its offset and size
+// in d_main_q_extra_prev_tokens) into d_main_q_info for later lattice
+// processing
+//
+// First step (this kernel): reading the hashmap filled by
+// fill_hashmap_with_main_q_kernel() to detect which token is the
+// representative of each FST state: one of the best ones, the best ones being
+// the ones with the lowest cost.
+// This representative will be responsible for K tokens, with K being the
+// number of tokens associated with that FST state. We only consider the cases
+// where K > 1, because if K == 1, then we will not store that token in the
+// special list d_main_q_extra_prev_tokens
+// Each representative is also the only token that will propagate emitting arcs
+// for that FST state. Because a representative has the min_cost for that FST
+// state, it is enough to only propagate that one.
+// Each representative counts the number of emitting arcs it is responsible for
+// (0 if its cost is not below the cutoff), and we compute the CTA-wide
+// exclusive prefix sums of the arc degrees and of the extra token counts.
+// The sums of each CTA-wide tile are saved in d_main_q_block_sums_prefix_sum
+// for the next step
+__global__ void emitting_preprocess_and_list_extra_prev_tokens_step1_kernel(
+    DeviceParams cst_dev_params, KernelParams params) {
+  // Block-wide scan over (degree, n_extra_prev_token) pairs
+  typedef cub::BlockScan<int2, KALDI_CUDA_DECODER_1D_BLOCK> BlockScan;
+  __shared__ typename BlockScan::TempStorage sh_temp_storage;
+  const int nlanes = params.nlanes_used;
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
+    const LaneCounters *lane_counters =
+        cst_dev_params.d_lanes_counters.lane(ilane);
+    const int32 main_q_end = lane_counters->main_q_narcs_and_end.y;
+    // Final cutoff from last ExpandArc execution
+    // The cutoff can have decreased since moving tokens to the main_q
+    // min_cost cannot be lower than before (we only did non-emitting phases
+    // since then)
+    // but the adaptive beam may have lowered the beam
+    const IntegerCostType int_cutoff = lane_counters->int_cutoff;
+    // Keeping all threads in CTA alive
+    // We'll __syncthreads()
+    KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(block_offset, thread_idx,
+                                                   main_q_end) {
+      // We'll take care of the token at index main_q_idx
+      const int32 main_q_idx = block_offset + thread_idx;
+      const int32 ichannel = params.channel_to_compute[ilane];
+      // True if that token is the representative of its FST state
+      // (token.next_state). The representative of a FST state is the token
+      // with the lowest token.cost for that FST state
+      // If multiple tokens have token1.cost == token2.cost ==
+      // min_cost_for_that_state, then one is picked (first come first serve,
+      // was done in fill_hashmap_with_main_q_kernel)
+      bool representing_state = false;
+      // Number of emitting arcs for that token
+      // Only the token representative of that FST state can have degree > 0
+      int32 degree = 0;
+      // If that token is representative of a FST state S,
+      // and if multiple tokens are associated with that state S,
+      // then n_extra_prev_token will contain their count
+      int32 n_extra_prev_token = 0;
+      if (main_q_idx < main_q_end) {
+        int2 both = cst_dev_params.d_main_q_state_and_cost.channel(
+            ichannel)[main_q_idx];
+        StateId token_state = both.x;
+        IntegerCostType token_int_cost = both.y;
+        // Loading info about token.next_state. Are there multiple tokens for
+        // that state ?
+        // How many ? What's the min token.cost for that state ?
+        int32 hash_idx;    // we saved the hash_idx after inserting
+        bool bool_buffer;  // will always be false. We just need it to call the
+                           // function
+        GetFSTStateHashIndex(
+            cst_dev_params.d_main_q_state_hash_idx.lane(ilane)[main_q_idx],
+            &hash_idx, &bool_buffer);
+        HashmapValueT h_val =
+            cst_dev_params.d_hashmap_values.lane(ilane)[hash_idx];
+        // Token index of one of the tokens with the lowest token.cost for that
+        // state
+        const int32 state_best_int_cost_argmin =
+            h_val.min_and_argmin_int_cost.y;
+        // Checking if we're the representative of that state
+        representing_state = (main_q_idx == state_best_int_cost_argmin);
+        // Saving the hash_idx of that fst state + if we're responsible for that
+        // state
+        SetFSTStateHashIndex(
+            hash_idx, representing_state,
+            &cst_dev_params.d_main_q_state_hash_idx.lane(ilane)[main_q_idx]);
+
+        // One of the best tokens for that state will represent that state in
+        // the next frame
+        if (representing_state) {
+          if (token_int_cost < int_cutoff) {
+            // Next step is emitting (next frame), using emitting offsets
+            const int32 start = cst_dev_params.d_arc_e_offsets[token_state];
+            const int32 end = cst_dev_params.d_arc_e_offsets[token_state + 1];
+            degree = end - start;
+            // Saving the start offset for the expand kernel
+            // avoid a new random memory access
+            cst_dev_params.d_main_q_arc_offsets.channel(ichannel)[main_q_idx] =
+                start;
+          }
+          // If that FST state has only one token associated to it, we store
+          // that token directly in
+          // d_main_q_info (its original place)
+          // We only move it into the d_main_q_extra_prev_tokens list if
+          // multiple tokens are associated to that state
+          n_extra_prev_token = (h_val.count > 1) ? (h_val.count) : 0;
+        }
+      }
+
+      // Computing a local prefix sum inside that CUDA block
+      // Other kernels will take care of adding the necessary offset to those
+      // local prefix sums
+      int2 zeroi2 = {0, 0};
+      int2 vali2 = {degree, n_extra_prev_token};
+      int2 aggi2;
+      BlockScan(sh_temp_storage)
+          .ExclusiveScan(vali2, aggi2, zeroi2, PlusPlus());
+      int32 degree_local_prefix_sum = aggi2.x;
+      int32 n_extra_prev_token_prefix_sum = aggi2.y;
+
+      if (main_q_idx < main_q_end) {
+        // This is not the final global prefix sum
+        // Other kernels will add the necessary offset
+        cst_dev_params.d_main_q_degrees_prefix_sum.channel(
+            ichannel)[main_q_idx] = degree_local_prefix_sum;
+        cst_dev_params.d_main_q_extra_prev_tokens_prefix_sum.lane(
+            ilane)[main_q_idx] = n_extra_prev_token_prefix_sum;
+      }
+
+      if (KALDI_CUDA_DECODER_IS_LAST_1D_THREAD()) {
+        // Saving the local sums (degrees, extra tokens) of that CUDA block
+        // That's necessary to compute the global offset of that CUDA block,
+        // and that offset is what we need to transform the local prefix sum
+        // into a global prefix sum
+        const int local_sum_index = block_offset / KALDI_CUDA_DECODER_1D_BLOCK;
+        // the prefix sum was exclusive, adding missing value
+        const int degree_inclusive_sum = degree_local_prefix_sum + degree;
+        const int n_extra_prev_tokens_inclusive_sum =
+            n_extra_prev_token_prefix_sum + n_extra_prev_token;
+        cst_dev_params.d_main_q_block_sums_prefix_sum.lane(
+            ilane)[local_sum_index] = {degree_inclusive_sum,
+                                       n_extra_prev_tokens_inclusive_sum};
+      }
+
+      // Synchronization because:
+      // - we may need to reuse sh_temp_storage if the for loop iterates (cf
+      // CUB's doc)
+      __syncthreads();
+    }
+  }
+}
+
+// Step2: step1 left the sums of each tile (CTA-wide block of tokens) in
+// d_main_q_block_sums_prefix_sum. This kernel turns them, in place, into their
+// exclusive prefix sum, i.e. the offset of each tile (added in step3). It also
+// saves both totals in the lane counters, resets min_int_cost and int_cutoff
+// and updates int_beam. Only one CTA / lane
+__global__ void emitting_preprocess_and_list_extra_prev_tokens_step2_kernel(
+    DeviceParams cst_dev_params, KernelParams params) {
+  typedef cub::BlockScan<int2, KALDI_CUDA_DECODER_1D_BLOCK> BlockScan;
+  __shared__ typename BlockScan::TempStorage sh_temp_storage;
+  const int nlanes = params.nlanes_used;
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
+    LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane);
+    const int main_q_end = lane_counters->main_q_narcs_and_end.y;
+    const int ntiles = KALDI_CUDA_DECODER_DIV_ROUND_UP(
+        main_q_end, KALDI_CUDA_DECODER_1D_BLOCK);
+    // Block-offset loop: keeps the entire CTA alive for the syncs below
+    int2 sum_so_far = {0, 0};  // total of the tiles of previous iterations
+    KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(offset, thread_idx, ntiles) {
+      const int32 itile = offset + thread_idx;
+      const int2 zeroi2 = {0, 0};
+      const int2 val =
+          (itile < ntiles)
+              ? cst_dev_params.d_main_q_block_sums_prefix_sum.lane(ilane)[itile]
+              : zeroi2;
+
+      int2 prefix_sum, sum;
+      BlockScan(sh_temp_storage)
+          .ExclusiveScan(val, prefix_sum, zeroi2, PlusPlus(), sum);
+      PlusPlus pp;
+      prefix_sum = pp(prefix_sum, sum_so_far);
+      sum_so_far = pp(sum_so_far, sum);
+      if (itile < ntiles) {
+        cst_dev_params.d_main_q_block_sums_prefix_sum.lane(ilane)[itile] =
+            prefix_sum;
+      }
+      if (itile == (ntiles - 1)) {
+        const int32 total_narcs = prefix_sum.x + val.x;
+        const int32 total_n_extra_prev_tokens = prefix_sum.y + val.y;
+        lane_counters->main_q_narcs_and_end.x = total_narcs;
+        lane_counters->main_q_n_extra_prev_tokens = total_n_extra_prev_tokens;
+        assert(total_n_extra_prev_tokens >= 0 &&
+               total_n_extra_prev_tokens <= main_q_end);
+      }
+
+      if (itile == 0) {
+        // Last used in the previous kernel. Should move to a final kernel
+        lane_counters->min_int_cost = INT_MAX;
+        lane_counters->int_cutoff = INT_MAX;
+        const CostType current_beam =
+            orderedIntToFloat(lane_counters->int_beam);
+        const CostType new_beam =
+            fmin(cst_dev_params.default_beam,
+                 current_beam * KALDI_CUDA_DECODER_ADAPTIVE_BEAM_RECOVER_RATE);
+        lane_counters->int_beam = floatToOrderedInt(new_beam);
+      }
+
+      // sh_temp_storage is reused by the next iteration (cf CUB's doc)
+      __syncthreads();
+    }
+  }
+}
+
+// Step3: Uses the CTA offsets computed in step2 to transform the CTA-wide
+// prefix sums to global prefix sums
+// The representative of each FST state saves into the hashmap the location of
+// the extra_prev_tokens of that state
+// in d_main_q_extra_prev_tokens. That way each extra token will know where to
+// write itself in the next kernel.
+__global__ void emitting_preprocess_and_list_extra_prev_tokens_step3_kernel(
+    DeviceParams cst_dev_params, KernelParams params) {
+  const int nlanes = params.nlanes_used;
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
+    const LaneCounters *lane_counters =
+        cst_dev_params.d_lanes_counters.lane(ilane);
+    const int32 ichannel = params.channel_to_compute[ilane];
+    const int main_q_end = lane_counters->main_q_narcs_and_end.y;
+    KALDI_CUDA_DECODER_1D_KERNEL_LOOP(main_q_idx, main_q_end) {
+      const int32 local_sum_idx = main_q_idx / KALDI_CUDA_DECODER_1D_BLOCK;
+      const int2 local_sum_offset =
+          cst_dev_params.d_main_q_block_sums_prefix_sum.lane(
+              ilane)[local_sum_idx];
+      cst_dev_params.d_main_q_degrees_prefix_sum.channel(
+          ichannel)[main_q_idx] += local_sum_offset.x;
+      int extra_prev_tokens_offset =
+          cst_dev_params.d_main_q_extra_prev_tokens_prefix_sum.lane(
+              ilane)[main_q_idx] +
+          local_sum_offset.y;
+      // Loading the hash index associated with token.state
+      // If representative, store the location of the extra prev tokens list
+      // for that state in the hashmap, overwriting the argmin field (.y)
+      bool is_representative;
+      int32 hash_idx;
+      GetFSTStateHashIndex(
+          cst_dev_params.d_main_q_state_hash_idx.lane(ilane)[main_q_idx],
+          &hash_idx, &is_representative);
+      if (is_representative) {
+        HashmapValueT &val =
+            cst_dev_params.d_hashmap_values.lane(ilane)[hash_idx];
+        val.min_and_argmin_int_cost.y = extra_prev_tokens_offset;
+      }
+    }
+  }
+}
+
+// Step4: We now know where to store our extra prev tokens in
+// d_main_q_extra_prev_tokens.
+// We will now move the tokens that need to be moved (when multiple tokens are
+// associated to the same FST state)
+// into d_main_q_extra_prev_tokens. In d_main_q_info, we will store the location
+// of that list [offset,size]
+// so that when backtracking, when we read d_main_q_info[token_idx], we know
+// where to look to have the list
+// of the same-state tokens
+// It is the last step of the
+// emitting_preprocess_and_list_extra_prev_tokens_step[i]_kernel pipeline
+__global__ void emitting_preprocess_and_list_extra_prev_tokens_step4_kernel(
+    DeviceParams cst_dev_params, KernelParams params) {
+  const int nlanes = params.nlanes_used;
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
+    const LaneCounters *lane_counters =
+        cst_dev_params.d_lanes_counters.lane(ilane);
+    const int32 ichannel = params.channel_to_compute[ilane];
+    const int main_q_end = lane_counters->main_q_narcs_and_end.y;
+    // Previous frames have filled d_main_q_extra_prev_tokens.
+    // d_main_q_extra_prev_tokens was then flushed to host. The list locations
+    // saved below must be global (global in the sense "for all frames"), to
+    // tell where to read in h_all_tokens_extra_prev_tokens_ on host.
+    // That's why we add main_q_extra_prev_tokens_global_offset to the
+    // offsets, which are local to this frame
+    const int prev_global_idx =
+        lane_counters->main_q_extra_prev_tokens_global_offset;
+    KALDI_CUDA_DECODER_1D_KERNEL_LOOP(main_q_idx, main_q_end) {
+      // We'll take care of token at main_q_idx
+      // Loading hashmap information about token.state
+      bool is_representative;
+      int32 hash_idx;
+      GetFSTStateHashIndex(
+          cst_dev_params.d_main_q_state_hash_idx.lane(ilane)[main_q_idx],
+          &hash_idx, &is_representative);
+
+      HashmapValueT val = cst_dev_params.d_hashmap_values.lane(ilane)[hash_idx];
+      // How many tokens are associated with that fst state token.state
+      int same_count = val.count;
+      bool must_move_to_extra_prev_tokens = (same_count > 1);
+      if (must_move_to_extra_prev_tokens) {
+        // Moving to the extra_prev_tokens list.
+        // Some of those tokens have an extra cost (compared to the best cost
+        // for that FST state)
+        // Generating and saving that extra cost. We will use it when generating
+        // the lattice.
+        CostType token_cost = orderedIntToFloat(
+            cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[main_q_idx]
+                .y);
+        CostType best_cost = orderedIntToFloat(val.min_and_argmin_int_cost.x);
+        CostType extra_cost = token_cost - best_cost;
+        // Loading the token to be moved (before its InfoToken is overwritten)
+        InfoToken inf_tok =
+            cst_dev_params.d_main_q_info.lane(ilane)[main_q_idx];
+        CostType acoustic_cost =
+            cst_dev_params.d_main_q_acoustic_cost.lane(ilane)[main_q_idx];
+        // Where to write this state list in d_main_q_extra_prev_tokens
+        int32 extra_prev_tokens_offset = val.min_and_argmin_int_cost.y;
+        // Place of that specific token in the extra_prev_tokens sublist of that
+        // specific FST state
+        int32 local_idx =
+            cst_dev_params.d_main_q_n_extra_prev_tokens_local_idx.lane(
+                ilane)[main_q_idx];
+        // Saving the location (global offset, size) of the extra prev tokens
+        // list of that state into that InfoToken
+        SetSameFSTStateTokensList(
+            prev_global_idx + extra_prev_tokens_offset, same_count,
+            &cst_dev_params.d_main_q_info.lane(ilane)[main_q_idx]);
+        // Where to write this token in d_main_q_extra_prev_tokens
+        int32 list_idx = extra_prev_tokens_offset + local_idx;
+        // Moving token. Also saving extra_cost and acoustic_cost
+        cst_dev_params.d_main_q_extra_prev_tokens.lane(ilane)[list_idx] =
+            inf_tok;
+        cst_dev_params.d_main_q_extra_and_acoustic_cost.lane(
+            ilane)[list_idx] = {extra_cost, acoustic_cost};
+        assert(inf_tok.prev_token >= (lane_counters->main_q_global_offset -
+                                      cst_dev_params.main_q_capacity) &&
+               inf_tok.prev_token <=
+                   (lane_counters->main_q_global_offset + main_q_end));
+      }
+    }
+  }
+}
+
+// Clear the hashmaps after use
+// Each element in the map has a representative in the main_q
+// Every one of those representatives has the responsibility to reset its
+// corresponding value in the hashmap
+// Once this kernel returns, the hashmaps are cleared
+__global__ void clear_hashmap_kernel(DeviceParams cst_dev_params,
+                                     KernelParams params) {
+  const int nlanes = params.nlanes_used;
+  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
+    LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane);
+    const int main_q_end = lane_counters->main_q_narcs_and_end.y;
+    KALDI_CUDA_DECODER_1D_KERNEL_LOOP(main_q_idx, main_q_end) {
+      bool is_representative;
+      int32 hash_idx;
+      GetFSTStateHashIndex(
+          cst_dev_params.d_main_q_state_hash_idx.lane(ilane)[main_q_idx],
+          &hash_idx, &is_representative);
+      // A representative owns a state: it resets the hashmap entry
+      // (at hash_idx) of its token.state.
+      // Non-representative tokens do nothing
+      if (is_representative) {
+        cst_dev_params.d_hashmap_values.lane(ilane)[hash_idx] =
+            KALDI_CUDA_DECODER_HASHMAP_NO_VAL;  // clear
+      }
+    }
+
+    // This is the last kernel for that frame
+    // Resets q_overflow of the lane (done by a single thread)
+    if (threadIdx.x == 0 && blockIdx.x == 0)
+      lane_counters->q_overflow = OVERFLOW_NONE;
+  }
+}
+
+// Kernel wrappers: host-side functions that launch the kernels defined above
+
+void SaveChannelsStateFromLanesKernel(const dim3 &grid, const dim3 &block,
+                                      const cudaStream_t &st,  // launch stream
+                                      const DeviceParams &cst_dev_params,
+                                      const KernelParams &kernel_params) {
+  save_channels_state_from_lanes_kernel<<<grid, block, 0, st>>>(cst_dev_params,
+                                                                kernel_params);
+  KALDI_DECODER_CUDA_CHECK_ERROR();  // presumably reports launch errors
+}
+
+void LoadChannelsStateInLanesKernel(const dim3 &grid, const dim3 &block,
+                                    const cudaStream_t &st,  // launch stream
+                                    const DeviceParams &cst_dev_params,
+                                    const KernelParams &kernel_params) {
+  load_channels_state_in_lanes_kernel<<<grid, block, 0, st>>>(cst_dev_params,
+                                                              kernel_params);
+  KALDI_DECODER_CUDA_CHECK_ERROR();  // presumably reports launch errors
+}
+
+void InitDecodingOnDeviceKernel(const dim3 &grid, const dim3 &block,
+                                const cudaStream_t &st,  // launch stream
+                                const DeviceParams &cst_dev_params,
+                                const KernelParams &kernel_params) {
+  init_decoding_on_device_kernel<<<grid, block, 0, st>>>(cst_dev_params,
+                                                         kernel_params);
+  KALDI_DECODER_CUDA_CHECK_ERROR();  // presumably reports launch errors
+}
+
+void InitializeInitialLaneKernel(const dim3 &grid, const dim3 &block,
+                                 const cudaStream_t &st,
+                                 const DeviceParams &cst_dev_params) {
+  initialize_initial_lane_kernel<<<grid, block, 0, st>>>(cst_dev_params);
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+}
+
+template <bool IS_EMITTING>
+void ExpandArcsKernel(const dim3 &grid, const dim3 &block,
+                      const cudaStream_t &st,
+                      const DeviceParams &cst_dev_params,
+                      const KernelParams &kernel_params) {
+  expand_arcs_kernel<IS_EMITTING><<<grid, block, 0, st>>>(cst_dev_params,
+                                                          kernel_params);
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+}
+
+template <bool IS_EMITTING>
+void PostExpandKernel(const dim3 &grid, const dim3 &block,
+                      const cudaStream_t &st,
+                      const DeviceParams &cst_dev_params,
+                      const KernelParams &kernel_params) {
+  post_expand_kernel<IS_EMITTING><<<grid, block, 0, st>>>(cst_dev_params,
+                                                          kernel_params);
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+}
+
+void NonEmittingPreprocessAndContractKernel(const dim3 &grid, const dim3 &block,
+                                            const cudaStream_t &st,
+                                            const DeviceParams &cst_dev_params,
+                                            const KernelParams &kernel_params) {
+  nonemitting_preprocess_and_contract_kernel<<<grid, block, 0, st>>>(
+      cst_dev_params, kernel_params);
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+}
+
+void FillHashmapWithMainQKernel(const dim3 &grid, const dim3 &block,
+                                const cudaStream_t &st,
+                                const DeviceParams &cst_dev_params,
+                                const KernelParams &kernel_params) {
+  fill_hashmap_with_main_q_kernel<<<grid, block, 0, st>>>(cst_dev_params,
+                                                          kernel_params);
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+}
+
+void EmittingPreprocessAndListExtraPrevTokensStep1Kernel(
+    const dim3 &grid, const dim3 &block, const cudaStream_t &st,
+    const DeviceParams &cst_dev_params, const KernelParams &kernel_params) {
+  emitting_preprocess_and_list_extra_prev_tokens_step1_kernel<<<grid, block, 0,
+                                                                st>>>(
+      cst_dev_params, kernel_params);
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+}
+
+void EmittingPreprocessAndListExtraPrevTokensStep2Kernel(
+    const dim3 &grid, const dim3 &block, const cudaStream_t &st,
+    const DeviceParams &cst_dev_params, const KernelParams &kernel_params) {
+  emitting_preprocess_and_list_extra_prev_tokens_step2_kernel<<<grid, block, 0,
+                                                                st>>>(
+      cst_dev_params, kernel_params);
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+}
+
+void EmittingPreprocessAndListExtraPrevTokensStep3Kernel(
+    const dim3 &grid, const dim3 &block, const cudaStream_t &st,
+    const DeviceParams &cst_dev_params, const KernelParams &kernel_params) {
+  emitting_preprocess_and_list_extra_prev_tokens_step3_kernel<<<grid, block, 0,
+                                                                st>>>(
+      cst_dev_params, kernel_params);
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+}
+
+void EmittingPreprocessAndListExtraPrevTokensStep4Kernel(
+    const dim3 &grid, const dim3 &block, const cudaStream_t &st,
+    const DeviceParams &cst_dev_params, const KernelParams &kernel_params) {
+  emitting_preprocess_and_list_extra_prev_tokens_step4_kernel<<<grid, block, 0,
+                                                                st>>>(
+      cst_dev_params, kernel_params);
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+}
+
+template <typename T>
+void ConcatenateLanesDataKernel(const dim3 &grid, const dim3 &block,
+                                const cudaStream_t &st,
+                                const DeviceParams &cst_dev_params,
+                                const KernelParams &kernel_params,
+                                const LaneMatrixView<T> &src, T *concat) {
+  concatenate_lanes_data_kernel<<<grid, block, 0, st>>>(
+      cst_dev_params, kernel_params, src, concat);
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+}
+
+void InitHashmapKernel(const dim3 &grid, const dim3 &block,
+                       const cudaStream_t &st,
+                       const DeviceParams &cst_dev_params) {
+  init_hashmap_kernel<<<grid, block, 0, st>>>(cst_dev_params);
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+}
+
+void ClearHashmapKernel(const dim3 &grid, const dim3 &block,
+                        const cudaStream_t &st,
+                        const DeviceParams &cst_dev_params,
+                        const KernelParams &kernel_params) {
+  clear_hashmap_kernel<<<grid, block, 0, st>>>(cst_dev_params, kernel_params);
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+}
+
+void ComputeCostsHistogramKernel(const dim3 &grid, const dim3 &block,
+                                 const cudaStream_t &st,
+                                 const DeviceParams &cst_dev_params,
+                                 const KernelParams &kernel_params,
+                                 bool use_aux_q) {
+  compute_costs_histogram_kernel<<<grid, block, 0, st>>>(
+      cst_dev_params, kernel_params, use_aux_q);
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+}
+
+void UpdateBeamUsingHistogramKernel(const dim3 &grid, const dim3 &block,
+                                    const cudaStream_t &st,
+                                    const DeviceParams &cst_dev_params,
+                                    const KernelParams &kernel_params,
+                                    bool use_aux_q) {
+  update_beam_using_histogram_kernel<<<grid, block, 0, st>>>(
+      cst_dev_params, kernel_params, use_aux_q);
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+}
+
+void FinalizeProcessNonEmittingKernel(const dim3 &grid, const dim3 &block,
+                                      const cudaStream_t &st,
+                                      const DeviceParams &cst_dev_params,
+                                      const KernelParams &kernel_params) {
+  finalize_process_non_emitting_kernel<<<grid, block, 0, st>>>(cst_dev_params,
+                                                               kernel_params);
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+}
+
+void GetBestCostStep1Kernel(const dim3 &grid, const dim3 &block,
+                            const cudaStream_t &st,
+                            const DeviceParams &cst_dev_params,
+                            const KernelParams &kernel_params, bool isfinal,
+                            CostType fst_zero) {
+  get_best_cost_step1_kernel<<<grid, block, 0, st>>>(
+      cst_dev_params, kernel_params, isfinal, fst_zero);
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+}
+
+void GetBestCostStep2Kernel(const dim3 &grid, const dim3 &block,
+                            const cudaStream_t &st,
+                            const DeviceParams &cst_dev_params,
+                            const KernelParams &kernel_params, bool isfinal,
+                            CostType fst_zero) {
+  get_best_cost_step2_kernel<<<grid, block, 0, st>>>(
+      cst_dev_params, kernel_params, isfinal, fst_zero);
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+}
+
+template void ExpandArcsKernel<true>(const dim3 &grid, const dim3 &block,
+                                     const cudaStream_t &st,
+                                     const DeviceParams &cst_dev_params,
+                                     const KernelParams &params);
+template void ExpandArcsKernel<false>(const dim3 &grid, const dim3 &block,
+                                      const cudaStream_t &st,
+                                      const DeviceParams &cst_dev_params,
+                                      const KernelParams &params);
+template void PostExpandKernel<true>(const dim3 &grid, const dim3 &block,
+                                     const cudaStream_t &st,
+                                     const DeviceParams &cst_dev_params,
+                                     const KernelParams &params);
+template void PostExpandKernel<false>(const dim3 &grid, const dim3 &block,
+                                      const cudaStream_t &st,
+                                      const DeviceParams &cst_dev_params,
+                                      const KernelParams &params);
+
+template void ConcatenateLanesDataKernel<InfoToken>(
+    const dim3 &grid, const dim3 &block, const cudaStream_t &st,
+    const DeviceParams &cst_dev_params, const KernelParams &params,
+    const LaneMatrixView<InfoToken> &src, InfoToken *concat);
+
+template void ConcatenateLanesDataKernel<CostType>(
+    const dim3 &grid, const dim3 &block, const cudaStream_t &st,
+    const DeviceParams &cst_dev_params, const KernelParams &params,
+    const LaneMatrixView<CostType> &src, CostType *concat);
+
+template void ConcatenateLanesDataKernel<float2>(
+    const dim3 &grid, const dim3 &block, const cudaStream_t &st,
+    const DeviceParams &cst_dev_params, const KernelParams &params,
+    const LaneMatrixView<float2> &src, float2 *concat);
+
+template void ConcatenateLanesDataKernel<int32>(
+    const dim3 &grid, const dim3 &block, const cudaStream_t &st,
+    const DeviceParams &cst_dev_params, const KernelParams &params,
+    const LaneMatrixView<int32> &src, int32 *concat);
+
+}  // end namespace cuda_decoder
+}  // end namespace kaldi
diff --git a/src/cudadecoder/cuda-decoder-kernels.h b/src/cudadecoder/cuda-decoder-kernels.h
new file mode 100644
index 00000000000..ad5bfff3023
--- /dev/null
+++ b/src/cudadecoder/cuda-decoder-kernels.h
@@ -0,0 +1,193 @@
+// cudadecoder/cuda-decoder-kernels.h
+//
+// Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+// Hugo Braun, Justin Luitjens, Ryan Leary
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_CUDA_DECODER_CUDA_DECODER_KERNELS_H_
+#define KALDI_CUDA_DECODER_CUDA_DECODER_KERNELS_H_
+
+#include "cudadecoder/cuda-decoder-common.h"
+#include "util/stl-utils.h"
+
+namespace kaldi {
+namespace cuda_decoder {
+
+// DeviceParams contains all top-level const data used by the kernels
+// i.e. the data that won't change between kernel calls (such as memory pointers
+// to the main_q). It is filled in by CudaDecoder::InitDeviceParams()
+struct DeviceParams {
+  ChannelMatrixView<ChannelCounters> d_channels_counters;
+  LaneMatrixView<LaneCounters> d_lanes_counters;
+
+  ChannelMatrixView<int2> d_main_q_state_and_cost;
+  ChannelMatrixView<int32> d_main_q_degrees_prefix_sum;
+  ChannelMatrixView<int32> d_main_q_arc_offsets;
+  LaneMatrixView<CostType> d_main_q_acoustic_cost;
+  LaneMatrixView<InfoToken> d_main_q_info;
+  LaneMatrixView<int2> d_aux_q_state_and_cost;
+  LaneMatrixView<CostType> d_aux_q_acoustic_cost;
+  LaneMatrixView<InfoToken> d_aux_q_info;
+  LaneMatrixView<HashmapValueT> d_hashmap_values;
+  LaneMatrixView<int2> d_list_final_tokens_in_main_q;  // reuses aux_q memory
+  LaneMatrixView<float2> d_main_q_extra_and_acoustic_cost;
+  LaneMatrixView<int32> d_histograms;
+  LaneMatrixView<int2> d_main_q_block_sums_prefix_sum;
+  LaneMatrixView<int32> d_main_q_state_hash_idx;
+  LaneMatrixView<int32> d_main_q_extra_prev_tokens_prefix_sum;
+  LaneMatrixView<int32> d_main_q_n_extra_prev_tokens_local_idx;
+  LaneMatrixView<InfoToken> d_main_q_extra_prev_tokens;
+
+  int32 max_nlanes;  // set to the decoder's nlanes_
+  int32 main_q_capacity, aux_q_capacity;  // from CudaDecoderConfig
+  CostType *d_arc_weights;     // fst_.d_arc_weights_
+  int32 *d_arc_nextstates;     // fst_.d_arc_nextstates_
+  int32 *d_arc_pdf_ilabels;    // fst_.d_arc_pdf_ilabels_
+  uint32 *d_arc_e_offsets;     // fst_.d_e_offsets_
+  uint32 *d_arc_ne_offsets;    // fst_.d_ne_offsets_
+  CostType *d_fst_final_costs;  // fst_.d_final_
+  int32 nstates;  // fst_.num_states_
+  CostType default_beam;
+  CostType lattice_beam;
+  int32 init_channel_id;  // id of the special initial channel
+  StateId init_state;     // fst_.Start()
+  CostType init_cost;     // StdWeight::One().Value()
+  int32 hashmap_capacity;  // HASHMAP_CAPACITY_FACTOR * main_q_capacity
+  int32 max_active;
+  int32 adaptive_beam_static_segment;  // aux_q length with static beam
+  int32 adaptive_beam_bin_width;       // adaptive aux_q length / NSTEPS
+};
+
+// KernelParams contains all the kernel arguments that change between kernel
+// calls
+// For instance, a given lane does not always compute the same channel
+struct KernelParams {
+  // In AdvanceDecoding,
+  // the lane lane_id will compute the channel
+  // with channel_id = channel_to_compute[lane_id]
+  ChannelId channel_to_compute[KALDI_CUDA_DECODER_MAX_N_LANES];
+  int32 main_q_end_lane_offsets[KALDI_CUDA_DECODER_MAX_N_LANES];
+  BaseFloat *loglikelihoods_ptrs[KALDI_CUDA_DECODER_MAX_N_LANES];
+  int32 nlanes_used;  // presumably #valid entries above - TODO confirm
+};
+
+// Kernel wrappers
+void SaveChannelsStateFromLanesKernel(const dim3 &grid, const dim3 &block,
+                                      const cudaStream_t &st,
+                                      const DeviceParams &cst_dev_params,
+                                      const KernelParams &kernel_params);
+
+void LoadChannelsStateInLanesKernel(const dim3 &grid, const dim3 &block,
+                                    const cudaStream_t &st,
+                                    const DeviceParams &cst_dev_params,
+                                    const KernelParams &kernel_params);
+
+void InitDecodingOnDeviceKernel(const dim3 &grid, const dim3 &block,
+                                const cudaStream_t &st,
+                                const DeviceParams &cst_dev_params,
+                                const KernelParams &kernel_params);
+
+void InitializeInitialLaneKernel(const dim3 &grid, const dim3 &block,
+                                 const cudaStream_t &st,
+                                 const DeviceParams &cst_dev_params);
+
+template <bool IS_EMITTING>
+void ExpandArcsKernel(const dim3 &grid, const dim3 &block,
+                      const cudaStream_t &st,
+                      const DeviceParams &cst_dev_params,
+                      const KernelParams &kernel_params);
+
+template <bool IS_EMITTING>
+void PostExpandKernel(const dim3 &grid, const dim3 &block,
+                      const cudaStream_t &st,
+                      const DeviceParams &cst_dev_params,
+                      const KernelParams &kernel_params);
+
+void NonEmittingPreprocessAndContractKernel(const dim3 &grid, const dim3 &block,
+                                            const cudaStream_t &st,
+                                            const DeviceParams &cst_dev_params,
+                                            const KernelParams &kernel_params);
+
+void FillHashmapWithMainQKernel(const dim3 &grid, const dim3 &block,
+                                const cudaStream_t &st,
+                                const DeviceParams &cst_dev_params,
+                                const KernelParams &kernel_params);
+
+void EmittingPreprocessAndListExtraPrevTokensStep1Kernel(
+    const dim3 &grid, const dim3 &block, const cudaStream_t &st,
+    const DeviceParams &cst_dev_params, const KernelParams &kernel_params);
+
+void EmittingPreprocessAndListExtraPrevTokensStep2Kernel(
+    const dim3 &grid, const dim3 &block, const cudaStream_t &st,
+    const DeviceParams &cst_dev_params, const KernelParams &kernel_params);
+
+void EmittingPreprocessAndListExtraPrevTokensStep3Kernel(
+    const dim3 &grid, const dim3 &block, const cudaStream_t &st,
+    const DeviceParams &cst_dev_params, const KernelParams &kernel_params);
+
+void EmittingPreprocessAndListExtraPrevTokensStep4Kernel(
+    const dim3 &grid, const dim3 &block, const cudaStream_t &st,
+    const DeviceParams &cst_dev_params, const KernelParams &kernel_params);
+
+template <typename T>
+void ConcatenateLanesDataKernel(const dim3 &grid, const dim3 &block,
+                                const cudaStream_t &st,
+                                const DeviceParams &cst_dev_params,
+                                const KernelParams &kernel_params,
+                                const LaneMatrixView<T> &src, T *concat);
+
+void InitHashmapKernel(const dim3 &grid, const dim3 &block,
+                       const cudaStream_t &st,
+                       const DeviceParams &cst_dev_params);
+
+void ClearHashmapKernel(const dim3 &grid, const dim3 &block,
+                        const cudaStream_t &st,
+                        const DeviceParams &cst_dev_params,
+                        const KernelParams &kernel_params);
+
+void ComputeCostsHistogramKernel(const dim3 &grid, const dim3 &block,
+                                 const cudaStream_t &st,
+                                 const DeviceParams &cst_dev_params,
+                                 const KernelParams &kernel_params,
+                                 bool use_aux_q);
+
+void UpdateBeamUsingHistogramKernel(const dim3 &grid, const dim3 &block,
+                                    const cudaStream_t &st,
+                                    const DeviceParams &cst_dev_params,
+                                    const KernelParams &kernel_params,
+                                    bool use_aux_q);
+
+void FinalizeProcessNonEmittingKernel(const dim3 &grid, const dim3 &block,
+                                      const cudaStream_t &st,
+                                      const DeviceParams &cst_dev_params,
+                                      const KernelParams &kernel_params);
+
+void GetBestCostStep1Kernel(const dim3 &grid, const dim3 &block,
+                            const cudaStream_t &st,
+                            const DeviceParams &cst_dev_params,
+                            const KernelParams &kernel_params, bool isfinal,
+                            CostType fst_zero);
+
+void GetBestCostStep2Kernel(const dim3 &grid, const dim3 &block,
+                            const cudaStream_t &st,
+                            const DeviceParams &cst_dev_params,
+                            const KernelParams &kernel_params, bool isfinal,
+                            CostType fst_zero);
+
+typedef unsigned char BinId;
+
+}  // namespace cuda_decoder
+}  // namespace kaldi
+
+#endif  // KALDI_CUDA_DECODER_CUDA_DECODER_KERNELS_H_
diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc
new file mode 100644
index 00000000000..440ff5aadba
--- /dev/null
+++ b/src/cudadecoder/cuda-decoder.cc
@@ -0,0 +1,1594 @@
+// cudadecoder/cuda-decoder.cc
+//
+// Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+// Hugo Braun, Justin Luitjens, Ryan Leary
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if HAVE_CUDA == 1
+
+#include "cuda-decoder.h"
+#include "cuda-decoder-kernels.h"
+
+#include <algorithm>
+#include <cfloat>
+#include <map>
+#include <tuple>
+#include <cuda_runtime_api.h>
+#include <nvToolsExt.h>
+
+namespace kaldi {
+namespace cuda_decoder {
+// Builds a decoder for fst with nlanes lanes and nchannels channels: allocates
+// and initializes host/device data, then precomputes the initial channel.
+CudaDecoder::CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config,
+                         int32 nlanes, int32 nchannels)
+    : fst_(fst),
+      nlanes_(nlanes),
+      nchannels_(nchannels),
+      default_beam_(config.default_beam),
+      lattice_beam_(config.lattice_beam),
+      ntokens_pre_allocated_(config.ntokens_pre_allocated),
+      max_active_(config.max_active),
+      aux_q_capacity_(config.aux_q_capacity),
+      main_q_capacity_(config.main_q_capacity),
+      extra_cost_min_delta_(0.0f) {
+  // Static asserts on constants
+  CheckStaticAsserts();
+  // Runtime asserts
+  KALDI_ASSERT(nlanes > 0);
+  KALDI_ASSERT(nchannels > 0);
+  KALDI_ASSERT(nlanes_ <= KALDI_CUDA_DECODER_MAX_N_LANES);
+  KALDI_ASSERT(nlanes_ <= nchannels_);
+  // All GPU work in decoder will be sent to compute_st_
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaStreamCreate(&compute_st_));
+  // A special extra channel holds the exact state a channel must have when a
+  // new decode starts: fst.Start(), the non-emitting tokens created from it,
+  // and all the data used by the decoder. InitDecoding() simply clones this
+  // channel. It is counted in nchannels_ only while allocating/initializing.
+  ++nchannels_;                       // adding the special initial channel
+  init_channel_id_ = nchannels_ - 1;  // Using last one as init_channel_params
+  AllocateHostData();
+  AllocateDeviceData();
+  AllocateDeviceKernelParams();
+
+  InitDeviceParams();
+  InitHostData();
+  InitDeviceData();
+
+  ComputeInitialChannel();
+  --nchannels_;  // removing the special initial channel from the count
+
+  // Making sure that everything is ready to use
+  // (the API return code is checked so that a failure is not silently lost)
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaStreamSynchronize(compute_st_));
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+}
+
+void CudaDecoder::AllocateDeviceData() {
+  hashmap_capacity_ =
+      KALDI_CUDA_DECODER_HASHMAP_CAPACITY_FACTOR * main_q_capacity_;
+  d_channels_counters_.Resize(nchannels_, 1);
+  d_lanes_counters_.Resize(nlanes_, 1);
+  d_main_q_state_and_cost_.Resize(nchannels_, main_q_capacity_);
+  d_main_q_info_.Resize(nlanes_, main_q_capacity_);
+  d_aux_q_state_and_cost_.Resize(nlanes_, aux_q_capacity_);
+  d_aux_q_info_.Resize(nlanes_, aux_q_capacity_);
+  d_main_q_degrees_prefix_sum_.Resize(nchannels_, main_q_capacity_);
+  d_histograms_.Resize(nlanes_, KALDI_CUDA_DECODER_HISTO_NBINS);
+  d_main_q_extra_prev_tokens_prefix_sum_.Resize(nlanes_, main_q_capacity_);
+  d_main_q_n_extra_prev_tokens_local_idx_.Resize(nlanes_, main_q_capacity_);
+
+  d_main_q_state_hash_idx_.Resize(nlanes_, main_q_capacity_);
+  d_main_q_extra_prev_tokens_.Resize(nlanes_, main_q_capacity_);
+  d_main_q_extra_and_acoustic_cost_.Resize(nlanes_, main_q_capacity_);
+  d_main_q_block_sums_prefix_sum_.Resize(
+      nlanes_, KALDI_CUDA_DECODER_DIV_ROUND_UP(main_q_capacity_,
+                                               KALDI_CUDA_DECODER_1D_BLOCK) +
+                   1);
+  d_main_q_arc_offsets_.Resize(nchannels_, main_q_capacity_);
+  d_hashmap_values_.Resize(nlanes_, hashmap_capacity_);
+  d_main_q_acoustic_cost_.Resize(nlanes_, main_q_capacity_);
+  d_aux_q_acoustic_cost_.Resize(nlanes_, aux_q_capacity_);
+  d_extra_and_acoustic_cost_concat_matrix.Resize(nlanes_, main_q_capacity_);
+  // Reusing aux_q memory for some of the views below: the aux_q is used in
+  // AdvanceDecoding, d_list_final_tokens_in_main_q_ in GetBestPath (never both)
+  // NOTE(review): aux_q-backed concat views assume aux_q >= main_q size-confirm
+  d_list_final_tokens_in_main_q_ = d_aux_q_state_and_cost_.GetView();
+  d_extra_and_acoustic_cost_concat__ =
+      d_extra_and_acoustic_cost_concat_matrix.lane(0);
+  d_acoustic_cost_concat_ = d_aux_q_acoustic_cost_.lane(0);  // aux_q memory
+  d_infotoken_concat_ = d_aux_q_info_.lane(0);               // aux_q memory
+}
+
+void CudaDecoder::AllocateHostData() {
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(  // pinned (page-locked) host buffers
+      cudaMallocHost(&h_extra_and_acoustic_cost_concat__,
+                     nlanes_ * main_q_capacity_ *
+                         sizeof(*h_extra_and_acoustic_cost_concat__)));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost(
+      &h_acoustic_cost_concat_,
+      nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_)));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost(
+      &h_extra_prev_tokens_concat_,
+      nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_)));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost(
+      &h_infotoken_concat_,
+      nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_)));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(
+      cudaMallocHost(&h_lanes_counters_, nlanes_ * sizeof(*h_lanes_counters_)));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost(
+      &h_channels_counters_, nchannels_ * sizeof(*h_channels_counters_)));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost(
+      &h_list_final_tokens_in_main_q_,
+      main_q_capacity_ * sizeof(*h_list_final_tokens_in_main_q_)));
+
+  h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_.resize(nchannels_);
+  h_all_tokens_acoustic_cost_.resize(nchannels_);  // incl. the init channel
+  h_all_tokens_extra_prev_tokens_.resize(nchannels_);  // (not reserved below)
+  h_all_tokens_info_.resize(nchannels_);
+  for (int32 ichannel = 0; ichannel < nchannels_; ++ichannel) {
+    h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel].reserve(
+        ntokens_pre_allocated_);
+    h_all_tokens_acoustic_cost_[ichannel].reserve(ntokens_pre_allocated_);
+    h_all_tokens_info_[ichannel].reserve(ntokens_pre_allocated_);
+  }
+  h_main_q_end_lane_offsets_.resize(nlanes_ + 1);  // one slot more than lanes
+  h_emitting_main_q_end_lane_offsets_.resize(nlanes_ + 1);
+  h_n_extra_prev_tokens_lane_offsets_.resize(nlanes_ + 1);
+  frame_offsets_.resize(nchannels_);
+  num_frames_decoded_.resize(nchannels_, -1);  // every entry starts at -1
+  main_q_emitting_end_.resize(nlanes_);
+}
+
+void CudaDecoder::InitDeviceData() {
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMemsetAsync(
+      d_histograms_.lane(0), 0,  // NOTE(review): assumes contiguous lanes
+      sizeof(int32) * KALDI_CUDA_DECODER_HISTO_NBINS * nlanes_, compute_st_));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMemsetAsync(
+      d_channels_counters_.MutableData(), 0,
+      nchannels_ * sizeof(*d_channels_counters_.MutableData()), compute_st_));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMemsetAsync(
+      d_lanes_counters_.MutableData(), 0,
+      nlanes_ * sizeof(*d_lanes_counters_.MutableData()), compute_st_));
+  InitHashmapKernel(KaldiCudaDecoderNumBlocks(hashmap_capacity_, nlanes_),
+                    KALDI_CUDA_DECODER_1D_BLOCK, compute_st_,
+                    *h_device_params_);
+  KALDI_DECODER_CUDA_CHECK_ERROR();  // the wrapper above already checks too
+}
+
+void CudaDecoder::InitHostData() {
+  // max_active_ gets a tolerance: the token count can usually not be limited
+  // to exactly max_active_, only brought as close to it as possible, and we
+  // do not want to keep calling the histogram kernels for a few tokens above
+  // the limit
+  const int32 slack = max_active_ * KALDI_CUDA_DECODER_MAX_ACTIVE_TOLERANCE;
+  // Saturate at INT_MAX instead of overflowing when the tolerance is added
+  if (max_active_ < INT_MAX - slack)
+    max_active_thresh_ = max_active_ + slack;
+  else
+    max_active_thresh_ = INT_MAX;
+}
+
+void CudaDecoder::AllocateDeviceKernelParams() {
+  h_device_params_ = new DeviceParams();  // deleted in ~CudaDecoder()
+  h_kernel_params_ = new KernelParams();  // deleted in ~CudaDecoder()
+}
+
+void CudaDecoder::InitDeviceParams() {
+  // Setting the DeviceParams (*h_device_params_)
+  // Sent to cuda kernels by copy, along with the KernelParams
+  // Making sure we'll be able to send both of them to the kernels
+  KALDI_ASSERT((sizeof(KernelParams) + sizeof(DeviceParams)) <
+               KALDI_CUDA_DECODER_MAX_KERNEL_ARGUMENTS_BYTE_SIZE);
+
+  h_device_params_->d_channels_counters = d_channels_counters_.GetView();
+  h_device_params_->d_lanes_counters = d_lanes_counters_.GetView();
+  h_device_params_->d_main_q_state_and_cost =
+      d_main_q_state_and_cost_.GetView();
+  h_device_params_->d_main_q_info = d_main_q_info_.GetView();
+  h_device_params_->d_aux_q_state_and_cost = d_aux_q_state_and_cost_.GetView();
+  h_device_params_->d_main_q_extra_and_acoustic_cost =
+      d_main_q_extra_and_acoustic_cost_.GetView();
+  h_device_params_->d_main_q_acoustic_cost = d_main_q_acoustic_cost_.GetView();
+  h_device_params_->d_aux_q_acoustic_cost = d_aux_q_acoustic_cost_.GetView();
+  h_device_params_->d_aux_q_info = d_aux_q_info_.GetView();
+  h_device_params_->d_main_q_degrees_prefix_sum =
+      d_main_q_degrees_prefix_sum_.GetView();
+  h_device_params_->d_main_q_block_sums_prefix_sum =
+      d_main_q_block_sums_prefix_sum_.GetView();
+  h_device_params_->d_main_q_state_hash_idx =
+      d_main_q_state_hash_idx_.GetView();
+  h_device_params_->d_main_q_extra_prev_tokens_prefix_sum =
+      d_main_q_extra_prev_tokens_prefix_sum_.GetView();
+  h_device_params_->d_main_q_n_extra_prev_tokens_local_idx =
+      d_main_q_n_extra_prev_tokens_local_idx_.GetView();
+  h_device_params_->d_main_q_extra_prev_tokens =
+      d_main_q_extra_prev_tokens_.GetView();
+  h_device_params_->d_main_q_arc_offsets = d_main_q_arc_offsets_.GetView();
+  h_device_params_->d_hashmap_values = d_hashmap_values_.GetView();
+  h_device_params_->d_histograms = d_histograms_.GetView();
+  h_device_params_->d_arc_e_offsets = fst_.d_e_offsets_;
+  h_device_params_->d_arc_ne_offsets = fst_.d_ne_offsets_;
+  h_device_params_->d_arc_pdf_ilabels = fst_.d_arc_pdf_ilabels_;
+  h_device_params_->d_arc_weights = fst_.d_arc_weights_;
+  h_device_params_->d_arc_nextstates = fst_.d_arc_nextstates_;
+  h_device_params_->d_fst_final_costs = fst_.d_final_;
+  h_device_params_->default_beam = default_beam_;
+  h_device_params_->lattice_beam = lattice_beam_;
+  h_device_params_->main_q_capacity = main_q_capacity_;
+  h_device_params_->aux_q_capacity = aux_q_capacity_;
+  h_device_params_->init_channel_id = init_channel_id_;
+  h_device_params_->max_nlanes = nlanes_;
+  h_device_params_->nstates = fst_.num_states_;
+  h_device_params_->init_state = fst_.Start();
+  KALDI_ASSERT(h_device_params_->init_state != fst::kNoStateId);
+  h_device_params_->init_cost = StdWeight::One().Value();
+  h_device_params_->hashmap_capacity = hashmap_capacity_;
+  h_device_params_->max_active = max_active_;
+  // For the first static_beam_q_length elements of the queue, we will keep the
+  // beam static
+  int32 static_beam_q_length =
+      aux_q_capacity_ / KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT;
+  // For the last adaptive_beam_q_length elements of the queue, we will decrease
+  // the beam, segment by segment
+  // For more information, please refer to the definition of GetAdaptiveBeam in
+  // cuda-decoder-kernels.cu
+  int32 adaptive_beam_q_length = (aux_q_capacity_ - static_beam_q_length);
+  int32 adaptive_beam_bin_width =
+      adaptive_beam_q_length / KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS;
+  h_device_params_->adaptive_beam_static_segment = static_beam_q_length;
+  h_device_params_->adaptive_beam_bin_width = adaptive_beam_bin_width;
+
+  // Reusing aux_q memory to list final states in GetLattice
+  // Those cannot be used at the same time
+  h_device_params_->d_list_final_tokens_in_main_q =
+      d_list_final_tokens_in_main_q_;
+}
+
+CudaDecoder::~CudaDecoder() {
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaStreamDestroy(compute_st_));
+  // Pinned host buffers allocated with cudaMallocHost in AllocateHostData()
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaFreeHost(h_lanes_counters_));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaFreeHost(h_channels_counters_));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(
+      cudaFreeHost(h_extra_and_acoustic_cost_concat__));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaFreeHost(h_acoustic_cost_concat_));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaFreeHost(h_extra_prev_tokens_concat_));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaFreeHost(h_infotoken_concat_));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(
+      cudaFreeHost(h_list_final_tokens_in_main_q_));
+  // Host-side param structs (the cudaFrees are called inside destructors)
+  delete h_kernel_params_;
+  delete h_device_params_;
+}
+
+void CudaDecoder::ComputeInitialChannel() {
+  KALDI_ASSERT(nlanes_ > 0);
+  const int32 ilane = 0;  // only lane 0 is used here
+  KALDI_ASSERT(ilane == 0);
+  // The following kernels work on the channel init_channel_id_
+  std::vector<ChannelId> channels = {init_channel_id_};
+  SetChannelsInKernelParams(channels);  // not calling LoadChannelsStateToLanes,
+                                        // init_channel_id_ is a special case
+
+  // Adding the start state to the initial token queue
+  InitializeInitialLaneKernel(KaldiCudaDecoderNumBlocks(1, 1),
+                              KALDI_CUDA_DECODER_ONE_THREAD_BLOCK, compute_st_,
+                              *h_device_params_);
+
+  h_lanes_counters_[ilane].post_expand_aux_q_end = 1;  // host-side counter
+
+  PruneAndPreprocess();
+  FinalizeProcessNonEmittingKernel(
+      KaldiCudaDecoderNumBlocks(1, 1), KALDI_CUDA_DECODER_LARGEST_1D_BLOCK,
+      compute_st_, *h_device_params_, *h_kernel_params_);
+
+  CopyLaneCountersToHostSync();
+  PostProcessingMainQueue();
+  CopyLaneCountersToHostSync();
+
+  const int32 main_q_end = h_lanes_counters_[0].main_q_narcs_and_end.y;
+  KALDI_ASSERT(main_q_end > 0);
+  // All arcs traversed until now are non-emitting
+  h_all_tokens_acoustic_cost_[init_channel_id_].resize(main_q_end, 0.0f);
+
+  // Moving all data linked to init_channel_id_ to host
+  // that data will be cloned to other channels when calling InitDecoding
+  CopyMainQueueDataToHost();
+  SaveChannelsStateFromLanes();
+
+  KALDI_ASSERT(
+      h_channels_counters_[init_channel_id_].prev_main_q_narcs_and_end.x > 0);
+  KALDI_ASSERT(
+      h_channels_counters_[init_channel_id_].prev_main_q_narcs_and_end.y > 0);
+}
+
+void CudaDecoder::InitDecoding(const std::vector<ChannelId> &channels) {
+  // Makes every channel of `channels` start from a copy of the state computed
+  // for init_channel_id_: on the device first, then in the host-side storage
+  SetChannelsInKernelParams(channels);  // gets *h_kernel_params_ ready
+  const int nlanes_used = channels.size();
+
+  // Size of the initial main_q
+  const int32 init_main_q_size =
+      h_channels_counters_[init_channel_id_].prev_main_q_narcs_and_end.y;
+  KALDI_ASSERT(init_main_q_size > 0);
+
+  // Getting the channels ready to compute new utterances
+  InitDecodingOnDeviceKernel(
+      KaldiCudaDecoderNumBlocks(init_main_q_size, nlanes_used),
+      KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, *h_device_params_,
+      *h_kernel_params_);
+  cudaStreamSynchronize(compute_st_);
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+
+  // Host side: deep copies of the tokens from the initial main_q. The source
+  // vectors are the same for all channels, so they are looked up only once
+  const auto &init_info = h_all_tokens_info_[init_channel_id_];
+  const auto &init_costs = h_all_tokens_acoustic_cost_[init_channel_id_];
+  const auto &init_extra = h_all_tokens_extra_prev_tokens_[init_channel_id_];
+  const auto &init_extra_costs =
+      h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[init_channel_id_];
+  const int32 n_initial_tokens = init_info.size();
+  for (const ChannelId ichannel : channels) {
+    h_all_tokens_info_[ichannel] = init_info;
+    h_all_tokens_acoustic_cost_[ichannel] = init_costs;
+    h_all_tokens_extra_prev_tokens_[ichannel] = init_extra;
+    h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel] =
+        init_extra_costs;
+    h_channels_counters_[ichannel] = h_channels_counters_[init_channel_id_];
+    num_frames_decoded_[ichannel] = 0;
+    frame_offsets_[ichannel].clear();
+    frame_offsets_[ichannel].push_back(n_initial_tokens);
+  }
+}
+
+void CudaDecoder::LoadChannelsStateToLanes(
+    const std::vector<ChannelId> &channels) {
+  // Registers the channels, then hands each lane its channel's saved main_q
+  SetChannelsInKernelParams(channels);
+  KALDI_ASSERT(nlanes_used_ > 0);
+  for (LaneId lane = 0; lane < nlanes_used_; ++lane) {
+    const ChannelId channel = h_kernel_params_->channel_to_compute[lane];
+    h_lanes_counters_[lane].main_q_narcs_and_end =
+        h_channels_counters_[channel].prev_main_q_narcs_and_end;
+  }
+  LoadChannelsStateInLanesKernel(
+      KaldiCudaDecoderNumBlocks(1, nlanes_used_),
+      KALDI_CUDA_DECODER_ONE_THREAD_BLOCK, compute_st_, *h_device_params_,
+      *h_kernel_params_);
+}
+
+void CudaDecoder::SaveChannelsStateFromLanes() {  // lane counters -> channels
+  KALDI_ASSERT(nlanes_used_ > 0);
+  for (LaneId lane = 0; lane < nlanes_used_; ++lane) {
+    const auto &lane_state = h_lanes_counters_[lane];
+    auto &saved =
+        h_channels_counters_[h_kernel_params_->channel_to_compute[lane]];
+    saved.prev_main_q_narcs_and_end = lane_state.main_q_narcs_and_end;
+    saved.prev_main_q_global_offset = lane_state.main_q_global_offset;
+  }
+  SaveChannelsStateFromLanesKernel(
+      KaldiCudaDecoderNumBlocks(1, nlanes_used_),
+      KALDI_CUDA_DECODER_ONE_THREAD_BLOCK, compute_st_, *h_device_params_,
+      *h_kernel_params_);
+  ResetChannelsInKernelParams();
+}
+
+// Applies func to the counters of every lane in use and returns the largest
+// result, or 0 when no result is positive (including when no lane is in use).
+int32 CudaDecoder::GetMaxForAllLanes(
+    std::function<int32(const LaneCounters &)> func) {
+  int32 largest = 0;
+  for (LaneId lane = 0; lane < nlanes_used_; ++lane)
+    largest = std::max(largest, func(h_lanes_counters_[lane]));
+  return largest;
+}
+
+void CudaDecoder::CopyLaneCountersToHostAsync() {  // D2H copy, not waited on
+  const auto nbytes = nlanes_used_ * sizeof(*h_lanes_counters_);
+  cudaMemcpyAsync(h_lanes_counters_, d_lanes_counters_.MutableData(), nbytes,
+                  cudaMemcpyDeviceToHost, compute_st_);
+}
+
+void CudaDecoder::CopyLaneCountersToHostSync() {
+  CopyLaneCountersToHostAsync();       // queues the copy on compute_st_
+  cudaStreamSynchronize(compute_st_);  // then waits for compute_st_
+}
+
+template <typename T>
+void CudaDecoder::PerformConcatenatedCopy(
+    std::function<int32(const LaneCounters &)> func, LaneMatrixView<T> src,
+    T *d_concat, T *h_concat, cudaStream_t st,
+    std::vector<int32> *lanes_offsets_ptr) {
+  // func gives the number of elements to take from each lane. The running
+  // sum of those counts (nlanes_used_ + 1 entries, starting at 0) is saved
+  // into *lanes_offsets_ptr and h_kernel_params_->main_q_end_lane_offsets
+  std::vector<int32> &lanes_offsets = *lanes_offsets_ptr;
+  KALDI_ASSERT(lanes_offsets.size() >= (nlanes_used_ + 1));
+  int32 total = 0;    // offset of the current lane, total count at the end
+  int32 longest = 0;  // largest per-lane count, passed to the grid sizing
+
+  for (LaneId lane = 0; lane < nlanes_used_; ++lane) {
+    const int32 count = func(h_lanes_counters_[lane]);
+    longest = std::max(longest, count);
+    h_kernel_params_->main_q_end_lane_offsets[lane] = total;
+    lanes_offsets[lane] = total;
+    total += count;
+  }
+  h_kernel_params_->main_q_end_lane_offsets[nlanes_used_] = total;
+  lanes_offsets[nlanes_used_] = total;
+  if (total == 0) return;  // nothing to do
+
+  // Concatenating lanes data into a single continuous array,
+  // stored into d_concat
+  ConcatenateLanesDataKernel<T>(
+      KaldiCudaDecoderNumBlocks(longest, nlanes_used_),
+      KALDI_CUDA_DECODER_1D_BLOCK, st, *h_device_params_, *h_kernel_params_,
+      src, d_concat);
+
+  // d_concat -> h_concat, asynchronous on st: sync before reading h_concat
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMemcpyAsync(
+      h_concat, d_concat, total * sizeof(T), cudaMemcpyDeviceToHost, st));
+}
+
+// Appends, for each lane in use, the slice
+// [lanes_offsets[lane], lanes_offsets[lane + 1]) of h_concat to the vector
+// of the channel bound to that lane. One sync has to happen between
+// PerformConcatenatedCopy and MoveConcatenatedCopyToVector
+template <typename T>
+void CudaDecoder::MoveConcatenatedCopyToVector(
+    const std::vector<int32> &lanes_offsets, T *h_concat,
+    std::vector<std::vector<T>> *vecvec) {
+  for (LaneId lane = 0; lane < nlanes_used_; ++lane) {
+    const ChannelId channel = h_kernel_params_->channel_to_compute[lane];
+    std::vector<T> &dst = (*vecvec)[channel];
+    dst.insert(dst.end(), h_concat + lanes_offsets[lane],
+               h_concat + lanes_offsets[lane + 1]);
+  }
+}
+
+void CudaDecoder::ApplyMaxActiveAndReduceBeam(enum QUEUE_ID queue_id) {
+  // Works on the aux_q when queue_id == AUX_Q, on the main_q otherwise.
+  // If at least one lane queue is bigger than max_active_thresh_,
+  // we'll apply a topk on that queue (k=max_active_), using the two
+  // histogram kernels launched at the end of this function
+  const bool use_aux_q = (queue_id == AUX_Q);
+  std::function<int32(const LaneCounters &)> q_end;
+  if (use_aux_q)
+    q_end = [](const LaneCounters &c) { return c.post_expand_aux_q_end; };
+  else
+    q_end = [](const LaneCounters &c) { return c.main_q_narcs_and_end.y; };
+  const int32 max_q_end = GetMaxForAllLanes(q_end);
+
+  // The queues are already smaller than max_active_thresh_
+  // nothing to do
+  if (max_q_end <= max_active_thresh_) return;
+
+  // Cost histogram for the selected queue, grid sized from the longest one
+  ComputeCostsHistogramKernel(
+      KaldiCudaDecoderNumBlocks(max_q_end, nlanes_used_),
+      KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, *h_device_params_,
+      *h_kernel_params_, use_aux_q);
+
+  // Beam update from that histogram (UpdateBeamUsingHistogramKernel)
+  UpdateBeamUsingHistogramKernel(
+      KaldiCudaDecoderNumBlocks(1, nlanes_used_), KALDI_CUDA_DECODER_1D_BLOCK,
+      compute_st_, *h_device_params_, *h_kernel_params_, use_aux_q);
+}
+
+int32 CudaDecoder::NumFramesToDecode(
+    const std::vector<ChannelId> &channels,
+    std::vector<CudaDecodableInterface *> &decodables, int32 max_num_frames) {
+  // Number of frames that can be decoded for all lanes at once: the minimum
+  // over the lanes in use of (frames ready - frames already decoded), capped
+  // by max_num_frames when max_num_frames >= 0. decodables[i] is read
+  // together with channels[i].
+  int32 nframes_to_decode = INT_MAX;
+  for (int32 ilane = 0; ilane < nlanes_used_; ++ilane) {
+    const int32 num_frames_decoded = num_frames_decoded_[channels[ilane]];
+    KALDI_ASSERT(num_frames_decoded >= 0 &&
+                 "You must call InitDecoding() before AdvanceDecoding()");
+    const int32 num_frames_ready = decodables[ilane]->NumFramesReady();
+    // num_frames_ready must be >= num_frames_decoded, or else
+    // the number of frames ready must have decreased (which doesn't
+    // make sense) or the decodable object changed between calls
+    // (which isn't allowed).
+    KALDI_ASSERT(num_frames_ready >= num_frames_decoded);
+    nframes_to_decode =
+        std::min(nframes_to_decode, num_frames_ready - num_frames_decoded);
+  }
+
+  if (max_num_frames < 0) return nframes_to_decode;  // no cap requested
+  return std::min(nframes_to_decode, max_num_frames);
+}
+
+void CudaDecoder::ExpandArcsEmitting() {
+  // Emitting stage: expands the arcs counted in main_q_narcs_and_end.x. The
+  // grid is sized from the lane with the most arcs.
+  int32 max_main_q_narcs = GetMaxForAllLanes(
+      [](const LaneCounters &c) { return c.main_q_narcs_and_end.x; });
+  KALDI_ASSERT(max_main_q_narcs > 0);
+
+  // true is for IS_EMITTING
+  ExpandArcsKernel<true>(
+      KaldiCudaDecoderNumBlocks(max_main_q_narcs, nlanes_used_),
+      KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, *h_device_params_,
+      *h_kernel_params_);
+
+  // Updating a few counters, like resetting aux_q_end to 0...
+  PostExpandKernel<true>(KaldiCudaDecoderNumBlocks(1, nlanes_used_),
+                         KALDI_CUDA_DECODER_ONE_THREAD_BLOCK, compute_st_,
+                         *h_device_params_, *h_kernel_params_);
+}
+
+void CudaDecoder::ExpandArcsNonEmitting(bool *should_iterate) {
+  // One non-emitting expansion step. On return, *should_iterate is true when
+  // the heavy load kernels were used (the caller has to iterate again) and
+  // false when the persistent kernel was launched instead.
+  const int32 max_main_q_narcs = GetMaxForAllLanes(
+      [](const LaneCounters &c) { return c.main_q_narcs_and_end.x; });
+
+  // If we have only a few arcs, jumping to the one-CTA per lane
+  // persistent version. If we cannot launch the persistent version, we will
+  // have to iterate on the heavy load kernels
+  const bool few_arcs_left =
+      (max_main_q_narcs < KALDI_CUDA_DECODER_NONEM_LT_MAX_NARCS);
+  *should_iterate = !few_arcs_left;
+
+  if (few_arcs_left) {
+    // Finalizing process non emitting. Takes care of the long tail,
+    // the final iterations with a small numbers of arcs. Do the work inside a
+    // single CTA (per lane),
+    FinalizeProcessNonEmittingKernel(
+        KaldiCudaDecoderNumBlocks(1, nlanes_used_),
+        KALDI_CUDA_DECODER_LARGEST_1D_BLOCK, compute_st_, *h_device_params_,
+        *h_kernel_params_);
+  } else {
+    // Too many arcs for the persistent version: heavy load kernels.
+    // false is for non emitting
+    ExpandArcsKernel<false>(
+        KaldiCudaDecoderNumBlocks(max_main_q_narcs, nlanes_used_),
+        KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, *h_device_params_,
+        *h_kernel_params_);
+
+    // false is for non emitting
+    PostExpandKernel<false>(KaldiCudaDecoderNumBlocks(1, nlanes_used_),
+                            KALDI_CUDA_DECODER_ONE_THREAD_BLOCK, compute_st_,
+                            *h_device_params_, *h_kernel_params_);
+  }
+}
+
+void CudaDecoder::PruneAndPreprocess() {
+  // Launches NonEmittingPreprocessAndContractKernel with a grid sized from
+  // the largest post_expand_aux_q_end among the lanes in use
+  const int32 max_aux_q_end = GetMaxForAllLanes(
+      [](const LaneCounters &c) { return c.post_expand_aux_q_end; });
+
+  // having all aux_q_end == 0 is not likely, but possible
+  // in a valid workflow: there is then nothing to launch
+  if (max_aux_q_end <= 0) return;
+
+  NonEmittingPreprocessAndContractKernel(
+      KaldiCudaDecoderNumBlocks(max_aux_q_end, nlanes_used_),
+      KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, *h_device_params_,
+      *h_kernel_params_);
+}
+
+void CudaDecoder::StartCopyAcousticCostsToHostAsync() {  // needs a later sync
+  auto main_q_end =
+      [](const LaneCounters &c) { return c.main_q_narcs_and_end.y; };
+  PerformConcatenatedCopy(main_q_end, h_device_params_->d_main_q_acoustic_cost,
+                          d_acoustic_cost_concat_, h_acoustic_cost_concat_,
+                          compute_st_, &h_emitting_main_q_end_lane_offsets_);
+  // Recording each lane's main_q end as it is at the time of this call
+  for (LaneId lane = 0; lane < nlanes_used_; ++lane) {
+    main_q_emitting_end_[lane] = main_q_end(h_lanes_counters_[lane]);
+  }
+}
+
+void CudaDecoder::FinalizeCopyAcousticCostsToHost() {  // after the stream sync
+  MoveConcatenatedCopyToVector(h_emitting_main_q_end_lane_offsets_,
+                               h_acoustic_cost_concat_,  // staged host copy
+                               &h_all_tokens_acoustic_cost_);  // per channel
+}
+
+void CudaDecoder::PostProcessingMainQueue() {
+  // End-of-frame processing of the main queues. The kernels launched
+  // directly from this function all go to compute_st_ and share one grid,
+  // sized from the largest main_q end among the lanes in use.
+  int32 max_main_q_end = GetMaxForAllLanes(
+      [](const LaneCounters &c) { return c.main_q_narcs_and_end.y; });
+  KALDI_ASSERT(max_main_q_end > 0);
+  // Same launch grid for every kernel launched below
+  const auto grid = KaldiCudaDecoderNumBlocks(max_main_q_end, nlanes_used_);
+
+  ApplyMaxActiveAndReduceBeam(MAIN_Q);
+
+  FillHashmapWithMainQKernel(grid, KALDI_CUDA_DECODER_1D_BLOCK, compute_st_,
+                             *h_device_params_, *h_kernel_params_);
+
+  // Four consecutive steps: preprocessing for the next emitting expand and
+  // listing of the extra prev tokens
+  EmittingPreprocessAndListExtraPrevTokensStep1Kernel(
+      grid, KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, *h_device_params_,
+      *h_kernel_params_);
+
+  EmittingPreprocessAndListExtraPrevTokensStep2Kernel(
+      grid, KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, *h_device_params_,
+      *h_kernel_params_);
+
+  EmittingPreprocessAndListExtraPrevTokensStep3Kernel(
+      grid, KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, *h_device_params_,
+      *h_kernel_params_);
+
+  EmittingPreprocessAndListExtraPrevTokensStep4Kernel(
+      grid, KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, *h_device_params_,
+      *h_kernel_params_);
+
+  // We need the infos about number of emitting arcs (for next frame),
+  // the number of extra_prev_tokens, etc.
+  CopyLaneCountersToHostAsync();
+
+  // The hashmap is cleared last, after the lane counters copy was queued
+  ClearHashmapKernel(grid, KALDI_CUDA_DECODER_1D_BLOCK, compute_st_,
+                     *h_device_params_, *h_kernel_params_);
+}
+
+void CudaDecoder::CopyMainQueueDataToHost() {  // main_q data -> host vectors
+  auto func_main_q_end = [](const LaneCounters &c) {
+    return c.main_q_narcs_and_end.y;
+  };
+  PerformConcatenatedCopy(func_main_q_end, h_device_params_->d_main_q_info,
+                          d_infotoken_concat_, h_infotoken_concat_, compute_st_,
+                          &h_main_q_end_lane_offsets_);
+
+  // Sync for :
+  // - h_infotoken_concat_ copy done
+  // - using lane_counters.main_q_n_extra_prev_tokens
+  cudaStreamSynchronize(compute_st_);
+  CheckOverflow();
+  // Starting the extra_prev_tokens copies. d_infotoken_concat_ is passed again
+  // as device buffer: the infotoken copy out of it has completed (sync above)
+  {
+    auto func_main_q_n_extra_prev_tokens = [](const LaneCounters &c) {
+      return c.main_q_n_extra_prev_tokens;
+    };
+    PerformConcatenatedCopy(func_main_q_n_extra_prev_tokens,
+                            h_device_params_->d_main_q_extra_prev_tokens,
+                            d_infotoken_concat_, h_extra_prev_tokens_concat_,
+                            compute_st_, &h_n_extra_prev_tokens_lane_offsets_);
+    PerformConcatenatedCopy(func_main_q_n_extra_prev_tokens,
+                            h_device_params_->d_main_q_extra_and_acoustic_cost,
+                            d_extra_and_acoustic_cost_concat__,
+                            h_extra_and_acoustic_cost_concat__, compute_st_,
+                            &h_n_extra_prev_tokens_lane_offsets_);
+  }
+
+  // Moving infotokens to vecs (host-side work while the copies are in flight)
+  MoveConcatenatedCopyToVector(h_main_q_end_lane_offsets_, h_infotoken_concat_,
+                               &h_all_tokens_info_);
+
+  // Waiting for the copies
+  cudaStreamSynchronize(compute_st_);
+
+  // Moving the extra_prev_tokens to vecs (same lane offsets for both calls)
+  MoveConcatenatedCopyToVector(h_n_extra_prev_tokens_lane_offsets_,
+                               h_extra_prev_tokens_concat_,
+                               &h_all_tokens_extra_prev_tokens_);
+  MoveConcatenatedCopyToVector(
+      h_n_extra_prev_tokens_lane_offsets_, h_extra_and_acoustic_cost_concat__,
+      &h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_);
+}
+
+void CudaDecoder::AdvanceDecoding(
+    const std::vector<ChannelId> &channels,
+    std::vector<CudaDecodableInterface *> &decodables, int32 max_num_frames) {
+  // Decodes, for all `channels` in lockstep, the frames that every decodable
+  // has ready (at most max_num_frames of them if max_num_frames >= 0).
+  // decodables[i] must be the decodable of channels[i].
+  if (channels.size() == 0) return;  // nothing to do, no NVTX range opened
+  KALDI_ASSERT(decodables.size() >= channels.size());
+  nvtxRangePushA("AdvanceDecoding");
+  // Context switch : Loading the channels state in lanes
+  LoadChannelsStateToLanes(channels);
+  KALDI_ASSERT(nlanes_used_ > 0);
+
+  // We'll decode nframes_to_decode, such as all channels have at least that
+  // number of frames available
+  int32 nframes_to_decode =
+      NumFramesToDecode(channels, decodables, max_num_frames);
+
+  // Looping over the frames that we will compute
+  nvtxRangePushA("Decoding");
+  for (int32 iframe = 0; iframe < nframes_to_decode; ++iframe) {
+    // Loglikelihoods from the acoustic model
+    nvtxRangePop();  // Decoding
+    // Setting the loglikelihoods pointers for that frame
+    for (LaneId ilane = 0; ilane < nlanes_used_; ++ilane) {
+      ChannelId ichannel = h_kernel_params_->channel_to_compute[ilane];
+      int32 frame = num_frames_decoded_[ichannel];
+      h_kernel_params_->loglikelihoods_ptrs[ilane] =
+          decodables[ilane]->GetLogLikelihoodsCudaPointer(frame);
+    }
+    cudaStreamSynchronize(cudaStreamPerThread);  // Nnet3 sync
+    nvtxRangePushA("Decoding");
+
+    // Processing emitting arcs. We've done the preprocess stage at the end of
+    // the previous frame
+    ExpandArcsEmitting();
+    bool first_nonemitting = true;
+
+    // We'll loop until we have a small enough number of non-emitting arcs
+    // in the token queue. We'll then break the loop
+    while (true) {
+      // Moving the lanes_params to host,
+      // we want to know the size of the aux queues after ExpandArcsEmitting
+      CopyLaneCountersToHostSync();
+
+      // If one of the aux_q contains more than max_active_ tokens,
+      // we'll reduce the beam to only keep max_active_ tokens
+      ApplyMaxActiveAndReduceBeam(AUX_Q);
+      // Prune the aux_q. Apply the latest beam (using the one from
+      // ApplyMaxActiveAndReduceBeam if triggered), move the survival tokens
+      // to the main queue and do the preprocessing for the next ExpandArcs
+      PruneAndPreprocess();
+
+      // We want to know how many tokens were not pruned, and ended up in the
+      // main queue. Because we've already done the preprocess stage on those
+      // tokens, we also know the number of non-emitting arcs out of those
+      // tokens. Copy for main_q_narcs and main_q_end
+      CopyLaneCountersToHostSync();
+
+      // If we're the first iteration after the emitting stage,
+      // we need to copy the acoustic costs back to host.
+      // We'll concatenate the costs from the different lanes into in a single
+      // continuous array.
+      if (first_nonemitting) {
+        // Async: We'll need a sync before calling
+        // FinalizeCopyAcousticCostsToHost
+        StartCopyAcousticCostsToHostAsync();
+        first_nonemitting = false;
+      }
+
+      bool should_iterate;
+      ExpandArcsNonEmitting(&should_iterate);
+      if (!should_iterate) break;
+    }
+    // We now have our final token main queues for that frame
+
+    // Moving back to host the final (for this frame) values of :
+    // - main_q_end
+    // - main_q_narcs
+    CopyLaneCountersToHostAsync();
+
+    // Sync for :
+    // - CopyLaneCountersToHostAsync
+    // - StartCopyAcousticCostsToHostAsync
+    cudaStreamSynchronize(compute_st_);
+
+    FinalizeCopyAcousticCostsToHost();
+
+    // Post processing the tokens for that frame
+    // - preprocess for the next emitting expand (happens with next frame)
+    // - if a state S has more than one token associated to it, generate the
+    //   list of those tokens (allows efficient backtracking in GetRawLattice)
+    // - compute the extra costs
+    PostProcessingMainQueue();
+
+    // Moving the data necessary for GetRawLattice/GetBestPath back to host
+    CopyMainQueueDataToHost();
+
+    // Few sanity checks + adding acoustic costs for non emitting arcs
+    for (LaneId ilane = 0; ilane < nlanes_used_; ++ilane) {
+      const ChannelId ichannel = h_kernel_params_->channel_to_compute[ilane];
+      KALDI_ASSERT(frame_offsets_[ichannel].back() ==
+                   h_lanes_counters_[ilane].main_q_global_offset);
+      KALDI_ASSERT(
+          h_all_tokens_extra_prev_tokens_[ichannel].size() ==
+          (h_lanes_counters_[ilane].main_q_extra_prev_tokens_global_offset +
+           h_lanes_counters_[ilane].main_q_n_extra_prev_tokens));
+      // The two extra_prev_tokens vectors are filled together, sizes match
+      KALDI_ASSERT(
+          h_all_tokens_extra_prev_tokens_[ichannel].size() ==
+          h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel]
+              .size());
+      // We're done processing that frame
+      ++num_frames_decoded_[ichannel];
+      const int32 main_q_end = h_lanes_counters_[ilane].main_q_narcs_and_end.y;
+      // Saving frame offsets for GetRawLattice
+      frame_offsets_[ichannel].push_back(frame_offsets_[ichannel].back() +
+                                         main_q_end);
+
+      // Adding 0.0f acoustic_costs for non-emittings
+      int32 ntokens_nonemitting = main_q_end - main_q_emitting_end_[ilane];
+      auto &vec = h_all_tokens_acoustic_cost_[ichannel];
+      vec.insert(vec.end(), ntokens_nonemitting, 0.0f);
+      KALDI_ASSERT(vec.size() == h_all_tokens_info_[ichannel].size());
+    }
+  }
+  SaveChannelsStateFromLanes();
+  nvtxRangePop();  // Decoding
+  nvtxRangePop();  // End AdvanceDecoding
+}
+
+void CudaDecoder::CheckOverflow() {
+  // Reports, through KALDI_WARN, the queue overflows flagged in the counters
+  // of the lanes in use, and sanity-checks the queue ends of flagged lanes.
+  for (LaneId ilane = 0; ilane < nlanes_used_; ++ilane) {
+    LaneCounters *lane_counters = &h_lanes_counters_[ilane];
+    // Kept as an integer: the flags are bit-tested below, a bool loses them
+    const int32 q_overflow = lane_counters->q_overflow;
+    if (q_overflow != OVERFLOW_NONE) {
+      // An overflow was prevented in a kernel
+      // The algorithm can still go on but quality of the result can be reduced
+      // (less tokens were generated)
+      if ((q_overflow & OVERFLOW_MAIN_Q) == OVERFLOW_MAIN_Q) {
+        KALDI_WARN
+            << "Preventing overflow of main_q. Continuing "
+            << "execution but the quality of the output may be decreased. "
+            << "To prevent this from happening, please increase the parameter "
+               "--main-q-capacity"
+            << " and/or decrease --max-active";
+      }
+      if ((q_overflow & OVERFLOW_AUX_Q) == OVERFLOW_AUX_Q) {
+        KALDI_WARN
+            << "Preventing overflow of aux_q. Continuing "
+            << "execution but the quality of the output may be decreased. "
+            << "To prevent this from happening, please increase the parameter "
+               "--aux-q-capacity"
+            << " and/or decrease --beam";
+      }
+
+      KALDI_ASSERT(lane_counters->main_q_narcs_and_end.y < main_q_capacity_);
+      KALDI_ASSERT(lane_counters->main_q_narcs_and_end.x >= 0);
+      KALDI_ASSERT(lane_counters->main_q_narcs_and_end.y >= 0);
+      KALDI_ASSERT(lane_counters->post_expand_aux_q_end < aux_q_capacity_);
+      KALDI_ASSERT(lane_counters->post_expand_aux_q_end >= 0);
+      KALDI_ASSERT(lane_counters->aux_q_end < aux_q_capacity_);
+      KALDI_ASSERT(lane_counters->aux_q_end >= 0);
+    }
+  }
+}
+
+// GetBestCost
+// For each channel, returns in *argmins the minimum cost among all tokens of
+// the last frame together with the index of one token with that min cost, in
+// *has_reached_final the lane's has_reached_final counter, and in
+// *list_finals_token_idx_and_cost the tokens listed by GetBestCostStep2Kernel.
+// Only called at the end of the computation of one audio file, not optimized
+void CudaDecoder::GetBestCost(const std::vector<ChannelId> &channels,
+                              bool use_final_costs,
+                              std::vector<std::pair<int32, CostType>> *argmins,
+                              std::vector<std::vector<std::pair<int, float>>>
+                                  *list_finals_token_idx_and_cost,
+                              std::vector<bool> *has_reached_final) {
+  if (channels.size() == 0) return;
+  // Getting the lanes ready to be used with those channels
+  LoadChannelsStateToLanes(channels);
+
+  auto func_main_q_end = [](const LaneCounters &c) {
+    return c.main_q_narcs_and_end.y;
+  };
+  int32 max_main_q_end = GetMaxForAllLanes(func_main_q_end);
+
+  // Step1 : Finding the best cost in the last token queue, with and without
+  // final costs.
+  // Also saving the indexes of those min.
+  GetBestCostStep1Kernel(
+      KaldiCudaDecoderNumBlocks(max_main_q_end, nlanes_used_),
+      KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, *h_device_params_,
+      *h_kernel_params_, use_final_costs, StdWeight::Zero().Value());
+
+  // Step2: Now that we know what the minimum cost is, we list all tokens
+  // within [min_cost; min_cost+lattice_beam]
+  // min_cost takes into account the final costs if use_final_costs is true,
+  // AND if a final state is present in the last token queue
+  GetBestCostStep2Kernel(
+      KaldiCudaDecoderNumBlocks(max_main_q_end, nlanes_used_),
+      KALDI_CUDA_DECODER_1D_BLOCK, compute_st_, *h_device_params_,
+      *h_kernel_params_, use_final_costs, StdWeight::Zero().Value());
+  // Moving the min_costs and their arguments to host.
+  // get_best_cost_kernel_step2 also set the number of tokens in [min_cost;
+  // min_cost+lattice_beam]
+  // moving that number as well
+  CopyLaneCountersToHostSync();  // sync copy
+
+  // Resetting the datastructures
+  argmins->clear();
+  has_reached_final->clear();
+  list_finals_token_idx_and_cost->clear();
+  // list_finals_token_idx_and_cost is a vector<vector<>>
+  // Each channel will have its own list of tokens within [best;
+  // best+lattice_beam]
+  list_finals_token_idx_and_cost->resize(nlanes_used_);
+  for (int32 ilane = 0; ilane < nlanes_used_; ++ilane) {
+    int2 minarg = h_lanes_counters_[ilane].min_int_cost_and_arg;
+    // Min cost in that channel last token queue
+    CostType min_cost = orderedIntToFloatHost(minarg.x);
+    // index of that min cost
+    int32 arg = minarg.y;
+    // Saving both in output
+    argmins->push_back({arg, min_cost});
+    // Whether or not the last token queue contains at least one token
+    // associated with a final FST state
+    has_reached_final->push_back(h_lanes_counters_[ilane].has_reached_final);
+    // Number of tokens within [min_cost; min_cost+lattice_beam]
+    int n_within_lattice_beam = h_lanes_counters_[ilane].n_within_lattice_beam;
+    // Loading those tokens
+    (*list_finals_token_idx_and_cost)[ilane].resize(n_within_lattice_beam);
+    // Copying that list (the same host buffer is reused for every lane)
+    cudaMemcpyAsync(
+        h_list_final_tokens_in_main_q_,
+        d_list_final_tokens_in_main_q_.lane(ilane),
+        n_within_lattice_beam * sizeof(*h_list_final_tokens_in_main_q_),
+        cudaMemcpyDeviceToHost, compute_st_);
+    // Waiting for the copy
+    cudaStreamSynchronize(compute_st_);
+    // Moving to output + int2float conversion
+    for (int i = 0; i < n_within_lattice_beam; ++i) {
+      int global_idx = h_list_final_tokens_in_main_q_[i].x;
+      float cost_with_final =
+          orderedIntToFloatHost(h_list_final_tokens_in_main_q_[i].y);
+      (*list_finals_token_idx_and_cost)[ilane][i].first = global_idx;
+      (*list_finals_token_idx_and_cost)[ilane][i].second = cost_with_final;
+    }
+  }
+}
+
+void CudaDecoder::GetBestPath(const std::vector<ChannelId> &channels,
+                              std::vector<Lattice *> &fst_out_vec,
+                              bool use_final_probs) {
+  // For each channels[i], backtracks from the best-cost token found by
+  // GetBestCost and rebuilds *fst_out_vec[i] as a linear lattice, whose
+  // previous content is deleted first.
+  KALDI_ASSERT(channels.size() == fst_out_vec.size());
+  nvtxRangePushA("GetBestPath");
+  GetBestCost(channels, use_final_probs, &argmins_,
+              &list_finals_token_idx_and_cost_, &has_reached_final_);
+
+  std::vector<int32> reversed_path;
+  for (int32 ilane = 0; ilane < channels.size(); ++ilane) {
+    const ChannelId ichannel = channels[ilane];
+    const int32 token_with_best_cost = argmins_[ilane].first;
+    const bool isfinal = has_reached_final_[ilane];
+    TokenId token_idx = token_with_best_cost;
+
+    // Backtracking from the token with best cost to the beginning
+    // (StartState). The first token was inserted at the beginning of the
+    // queue and always has index 0: we backtrack until that first token.
+    // reversed_path stays empty when the best token is that first token.
+    reversed_path.clear();
+    while (token_idx != 0) {
+      InfoToken token = h_all_tokens_info_[ichannel][token_idx];
+      // We want an arc with extra_cost == 0
+      int32 arc_idx;
+      TokenId prev_token_idx;
+      if (token.IsUniqueTokenForStateAndFrame()) {
+        // If we have only one, it is an arc with extra_cost == 0
+        arc_idx = token.arc_idx;
+        prev_token_idx = token.prev_token;
+      } else {
+        // Using the first arc with extra_cost == 0
+        int32 offset, size;
+        std::tie(offset, size) = token.GetSameFSTStateTokensList();
+        bool found_best = false;
+        for (auto i = 0; i < size; ++i) {
+          // The .x component is read as the extra cost of that arc
+          CostType arc_extra_cost =
+              h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel]
+                  [offset + i].x;
+          // Picking one arc on the best path (extra_cost == 0)
+          if (arc_extra_cost == 0.0f) {
+            InfoToken list_token =
+                h_all_tokens_extra_prev_tokens_[ichannel][offset + i];
+            arc_idx = list_token.arc_idx;
+            prev_token_idx = list_token.prev_token;
+            found_best = true;
+            break;
+          }
+        }
+        KALDI_ASSERT(found_best);
+      }
+      reversed_path.push_back(arc_idx);
+      token_idx = prev_token_idx;
+    }
+
+    Lattice *fst_out = fst_out_vec[ilane];
+    fst_out->DeleteStates();
+    // Building the output Lattice
+    OutputLatticeState curr_state = fst_out->AddState();
+    fst_out->SetStart(curr_state);
+    // NOTE(review): the arc at reversed_path[0] is not added below - confirm
+    for (int32 i = reversed_path.size() - 1; i >= 1; i--) {
+      int32 arc_idx = reversed_path[i];
+
+      LatticeArc arc(fst_.h_arc_id_ilabels_[arc_idx],
+                     fst_.h_arc_olabels_[arc_idx],
+                     LatticeWeight(fst_.h_arc_weights_[arc_idx], 0),
+                     fst_.h_arc_nextstate_[arc_idx]);
+
+      arc.nextstate = fst_out->AddState();
+      fst_out->AddArc(curr_state, arc);
+      curr_state = arc.nextstate;
+    }
+    // Adding final cost to final state. With an empty path there is no arc
+    // to read the final state from, so that case falls back to One()
+    if (isfinal && use_final_probs && !reversed_path.empty())
+      fst_out->SetFinal(
+          curr_state,
+          LatticeWeight(fst_.h_final_[fst_.h_arc_nextstate_[reversed_path[0]]],
+                        0.0));
+    else
+      fst_out->SetFinal(curr_state, LatticeWeight::One());
+
+    fst::RemoveEpsLocal(fst_out);
+  }
+  nvtxRangePop();
+}
+
+void CudaDecoder::DebugValidateLattice() {  // currently a no-op, see #if 0
+#if 0  // disabled; uses nframes, ichannel, total_ntokens, not declared here
+	//validate lattice consistency
+	for(int frame=0;frame<nframes;frame++) {
+		int token_start=frame_offsets_[ichannel][frame];
+		int token_end=(frame+1<nframes) ? frame_offsets_[ichannel][frame+1] : total_ntokens;
+		int prev_frame_offset=(frame>0) ? frame_offsets_[ichannel][frame-1] : 0;
+		int cur_frame_offset=token_start;
+		int next_frame_offset=token_end;
+
+		bool found_zero = false;
+		//for each token in frame
+		for(int i=token_start;i<token_end;i++) {
+			if(i==0) continue;  //initial token skip this...
+			InfoToken token=h_all_tokens_info_[ichannel][i];
+			KALDI_ASSERT(token.prev_token>=0);
+
+			if(token.IsUniqueTokenForStateAndFrame()) {
+				//previous token must be lower than the next frame start
+				KALDI_ASSERT(token.prev_token<next_frame_offset);
+				//previous token must be larger then previous frame start
+				KALDI_ASSERT(token.prev_token>=prev_frame_offset);
+			} else {
+				int32 offset, size;
+				std::tie(offset,size) = token.GetNextStateTokensList();
+				KALDI_ASSERT(size>0);
+				KALDI_ASSERT(offset>=0 && offset<h_all_tokens_extra_prev_tokens_[ichannel].size());
+				for(auto j=0; j<size; ++j) {
+					KALDI_ASSERT(offset+j<h_all_tokens_extra_prev_tokens_[ichannel].size());
+					InfoToken extra_token=h_all_tokens_extra_prev_tokens_[ichannel][offset+j];
+					//previous token must be lower than the next frame start
+					KALDI_ASSERT(extra_token.prev_token<next_frame_offset);
+					//previous token must be larger then previous frame start
+					KALDI_ASSERT(extra_token.prev_token>=prev_frame_offset);
+				}
+			}
+		}
+	}
+#endif
+}
+
+CudaDecoder::LatticeStateInternalId CudaDecoder::GetLatticeStateInternalId(
+    int32 total_ntokens, TokenId token_idx, InfoToken token) {
+  // Returns an id identifying the lattice state (frame, fst_state) of a token
+  LatticeStateInternalId internal_id;
+  if (!token.IsUniqueTokenForStateAndFrame()) {
+    // Multiple tokens for this (frame, fst_state): using the
+    // "extra_prev_tokens" offset, which is unique for (frame, fst_state) in
+    // that case. total_ntokens is added to avoid collisions with the ids below
+    internal_id = total_ntokens + token.prev_token;
+  } else {
+    internal_id = token_idx;  // unique token: its index is a unique id
+  }
+  return internal_id;
+}
+
+void CudaDecoder::AddFinalTokensToLattice(LaneId ilane, ChannelId ichannel,
+                                          Lattice *fst_out) {
+  // Seeds the lattice backtrack with the final tokens of the last frame.
+  // Each final token within lattice_beam_ of the best cost gets a lattice
+  // state (created the first time its internal id is met), a final weight,
+  // and an entry in q_curr_frame_todo_.
+  //
+  // ilane selects the results of GetBestCost (argmins_, list of final
+  // tokens, has_reached_final_), ichannel selects the token storage of the
+  // utterance, fst_out is the lattice being built.
+
+  // Total number of tokens for that utterance. Used in GetLatticeStateInternalId
+  const int32 ntokens_in_utt = h_all_tokens_info_[ichannel].size();
+  // Overall best cost for the last frame of that utterance, read from
+  // argmins_ (set by GetBestCost)
+  const CostType lowest_cost = argmins_[ilane].second;
+
+  // Iterating through tokens associated with a final state in the last frame
+  for (auto &idx_and_cost : list_finals_token_idx_and_cost_[ilane]) {
+    // This final token has a final cost of final_cost
+    const CostType final_cost = idx_and_cost.second;
+    // How much worse than the best this final token is. Only paths with a
+    // cost within [best; best+lattice_beam] are kept
+    const CostType extra_cost = final_cost - lowest_cost;
+    if (extra_cost > lattice_beam_) continue;
+
+    const TokenId tok_idx = idx_and_cost.first;
+    InfoToken tok = h_all_tokens_info_[ichannel][tok_idx];
+
+    // Internal id of the lattice state (iframe, fst_state) of that token
+    const LatticeStateInternalId internal_id =
+        GetLatticeStateInternalId(ntokens_in_utt, tok_idx, tok);
+
+    // insert() only inserts if the key is not in the map yet. Its boolean
+    // tells whether the lattice state still has to be created for that id
+    auto insert_result = curr_f_raw_lattice_state_.insert(
+        {internal_id, {FLT_MAX, -1, false}});
+    auto &raw_state = insert_result.first->second;
+
+    if (insert_result.second) {
+      // New state. Because it is a final state, a bit of extra work is
+      // needed to give it its final cost: we have to find out which FST
+      // state the token is associated with. That information was not
+      // transferred from the GPU, but any arc going to that state gives
+      // it back through next_state[arc_idx] in the graph. We just need a
+      // valid arc_idx.
+      int32 incoming_arc;
+      if (tok.IsUniqueTokenForStateAndFrame()) {
+        // Unique token: its own arc_idx can be used directly
+        incoming_arc = tok.arc_idx;
+      } else {
+        // Multiple tokens for that FST state: picking the first one of the
+        // list
+        int32 list_offset;
+        std::tie(list_offset, std::ignore) = tok.GetSameFSTStateTokensList();
+        incoming_arc =
+            h_all_tokens_extra_prev_tokens_[ichannel][list_offset].arc_idx;
+      }
+
+      // Creating the state associated with our internal id in the lattice,
+      // and scheduling the token for the backtrack
+      const OutputLatticeState final_state = fst_out->AddState();
+      raw_state.fst_lattice_state = final_state;
+      q_curr_frame_todo_.push_back({tok_idx, tok});
+
+      if (!has_reached_final_[ilane]) {
+        // No final state was reached: the lattice state is made final with
+        // LatticeWeight::One()
+        fst_out->SetFinal(final_state, LatticeWeight::One());
+      } else {
+        // Final states were reached: adding the final cost of the FST
+        // state the arc leads to
+        const StateId fst_state = fst_.h_arc_nextstate_[incoming_arc];
+        fst_out->SetFinal(final_state,
+                          LatticeWeight(fst_.h_final_[fst_state], 0.0));
+      }
+    }
+
+    // Keeping the smallest extra cost met so far for that lattice state
+    if (extra_cost < raw_state.token_extra_cost)
+      raw_state.token_extra_cost = extra_cost;
+  }
+}
+
+void CudaDecoder::AddArcToLattice(int32 list_arc_idx,
+                                  TokenId list_prev_token_idx,
+                                  InfoToken list_prev_token,
+                                  int32 curr_frame_offset,
+                                  CostType acoustic_cost,
+                                  CostType this_arc_prev_token_extra_cost,
+                                  LatticeStateInternalId src_state_internal_id,
+                                  OutputLatticeState fst_lattice_start,
+                                  OutputLatticeState to_fst_lattice_state,
+                                  Lattice *fst_out, bool *must_replay_frame) {
+  // Adds one arc to the output lattice. Its destination state
+  // (to_fst_lattice_state) is known; this function finds its source state,
+  // creating it if needed, and propagates the extra cost from the
+  // destination to the source of that arc (we go backward).
+  //
+  // list_arc_idx: index of the arc in the graph (labels, weight)
+  // list_prev_token_idx, list_prev_token: token at the source of the arc
+  // curr_frame_offset: offset of the current frame in the token list
+  // acoustic_cost: acoustic cost put on the arc
+  // this_arc_prev_token_extra_cost: extra cost of the source when using
+  //   that arc
+  // src_state_internal_id: internal id of the source lattice state
+  // must_replay_frame: set to true if the current frame must be replayed
+
+  // Unless found otherwise below, the arc leaves the lattice start state
+  OutputLatticeState arc_source = fst_lattice_start;
+
+  // Token 0 is the original state at the beginning of the decode: in that
+  // case the source is simply the start state, nothing to look up
+  if (list_prev_token_idx != 0) {
+    // Having the predecessor in the previous frame
+    // <=> that token is associated to an emitting arc
+    const bool is_emitting = (list_prev_token_idx < curr_frame_offset);
+    // Selecting the right containers
+    // - emitting arc -> previous frame map and todo list
+    // - non emitting arc -> same frame map and todo list
+    auto &state_map =
+        is_emitting ? prev_f_raw_lattice_state_ : curr_f_raw_lattice_state_;
+    auto &todo_list = is_emitting ? q_prev_frame_todo_ : q_curr_frame_todo_;
+
+    // Attempting to insert the source state in the map (no-op if the key
+    // is already there)
+    auto insert_result =
+        state_map.insert({src_state_internal_id, {FLT_MAX, -1, false}});
+    auto &src_state = insert_result.first->second;
+    if (insert_result.second) {
+      // First time that key is inserted: the state goes to the todo list
+      // to be considered next, and gets its state in the output lattice
+      todo_list.push_back({list_prev_token_idx, list_prev_token});
+      src_state.fst_lattice_state = fst_out->AddState();
+    }
+
+    // Updating the source extra cost using that arc. For an arc a->b:
+    // extra_cost(a) = min(extra_cost(a),
+    //                     extra_cost(b) + arc_extra_cost(a->b))
+    const CostType known_extra_cost = src_state.token_extra_cost;
+    if (this_arc_prev_token_extra_cost < known_extra_cost) {
+      // We found a new min. If the source state was already closed, its
+      // extra_cost has been read before, and we are now writing to it
+      // again. When the change is large enough, the frame is replayed so
+      // that this first read is done again with the updated value
+      // (between frames everything is in topological order).
+      const CostType improvement =
+          known_extra_cost - this_arc_prev_token_extra_cost;
+      if (src_state.is_state_closed && improvement > extra_cost_min_delta_)
+        *must_replay_frame = true;
+      src_state.token_extra_cost = this_arc_prev_token_extra_cost;
+    }
+
+    // Reading the OutputLatticeState of the source state
+    arc_source = src_state.fst_lattice_state;
+  }
+
+  // When a frame is replayed the same arc_idx is met again: we don't want
+  // duplicates, so only the first insertion in f_arc_idx_added_ for that
+  // frame adds the arc
+  if (!f_arc_idx_added_.insert(list_arc_idx).second) return;
+
+  // The following reads will most likely end up in cache misses,
+  // we could load everything sooner
+  LatticeArc arc(fst_.h_arc_id_ilabels_[list_arc_idx],
+                 fst_.h_arc_olabels_[list_arc_idx],
+                 LatticeWeight(fst_.h_arc_weights_[list_arc_idx],
+                               acoustic_cost),
+                 to_fst_lattice_state);
+  fst_out->AddArc(arc_source, arc);
+}
+
+void CudaDecoder::ResetDataForGetRawLattice() {
+  // Empties every container used while building one raw lattice.
+  //
+  // Todo lists of the tokens that still need to be computed. Think of those
+  // as FIFO queues, except that the front is not popped right away: a frame
+  // may be replayed, and we need to remember what's in that frame.
+  // We are not iterating through [prev|curr]_f_raw_lattice_state_ instead,
+  // because q_curr_frame_todo_ is sometimes appended to while being read: a
+  // new element could land before the current map iterator (and be missed)
+  q_prev_frame_todo_.clear();
+  q_curr_frame_todo_.clear();
+
+  // One map per frame. We always know to which frame a token belongs.
+  // Using one big map slows everything down
+  curr_f_raw_lattice_state_.clear();
+  prev_f_raw_lattice_state_.clear();
+
+  // Each arc_idx must be unique within one frame. Important because a frame
+  // can be replayed (and could otherwise add the same arc multiple times)
+  f_arc_idx_added_.clear();
+}
+
+void CudaDecoder::GetTokenRawLatticeData(
+    TokenId token_idx, InfoToken token, int32 total_ntokens,
+    CostType *token_extra_cost, OutputLatticeState *to_fst_lattice_state) {
+  // Looks up the current-frame lattice state of a token, returns its extra
+  // cost and its state in the output lattice, then marks it as closed.
+  auto to_map_it = curr_f_raw_lattice_state_.find(
+      GetLatticeStateInternalId(total_ntokens, token_idx, token));
+  // We know this token exists in the output lattice (because it's in
+  // q_curr_frame_todo_)
+  KALDI_ASSERT(to_map_it != curr_f_raw_lattice_state_.end());
+
+  auto &raw_state = to_map_it->second;
+  // Closing the state: its extra cost is being read. If it gets written to
+  // again, the frame will have to be replayed (so that the latest
+  // extra_cost value is read). The reads below do not depend on the flag
+  raw_state.is_state_closed = true;
+  *to_fst_lattice_state = raw_state.fst_lattice_state;
+  *token_extra_cost = raw_state.token_extra_cost;
+}
+
+void CudaDecoder::GetSameFSTStateTokenList(
+    ChannelId ichannel, InfoToken token, InfoToken **tok_beg,
+    float2 **extra_extra_and_acoustic_cost_beg, int32 *nsame) {
+  // Returns, through tok_beg/nsame, the array of InfoToken describing every
+  // token linked to the same (iframe, fst_state) as `token`.
+  // There are two possibilities:
+  // a) only one token is associated with that fst_state in that frame. The
+  // necessary information (arc_idx, prev_token) is stored directly in the
+  // token: the array is the token itself, of size 1, and there is no cost
+  // array (NULL)
+  // b) multiple tokens are associated with that fst_state in that frame.
+  // The token only contains (offset, size), telling where to find the list
+  // in h_all_tokens_extra_prev_tokens_ and the matching costs
+  //
+  // NOTE(review): in case a) *tok_beg receives the address of the by-value
+  // parameter `token`, which no longer exists once this function returns.
+  // Callers must not dereference it; they should use their own copy of the
+  // token instead (or `token` should be taken by reference).
+  if (token.IsUniqueTokenForStateAndFrame()) {
+    *tok_beg = &token;
+    // if we've got only one, extra_cost == 0.0
+    *extra_extra_and_acoustic_cost_beg = NULL;
+    *nsame = 1;
+  } else {
+    int32 offset, size;
+    std::tie(offset, size) = token.GetSameFSTStateTokensList();
+    *tok_beg = &h_all_tokens_extra_prev_tokens_[ichannel][offset];
+    *extra_extra_and_acoustic_cost_beg =
+        &h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel]
+                                                                [offset];
+    *nsame = size;
+  }
+}
+
+void CudaDecoder::ConsiderTokenForLattice(
+    ChannelId ichannel, int32 iprev, int32 total_ntokens, TokenId token_idx,
+    OutputLatticeState fst_lattice_start, InfoToken *tok_beg,
+    float2 *extra_extra_and_acoustic_cost_beg, CostType token_extra_cost,
+    TokenId list_prev_token_idx, int32 list_arc_idx, InfoToken *list_prev_token,
+    CostType *this_arc_prev_token_extra_cost, CostType *acoustic_cost,
+    OutputLatticeState *lattice_src_state, bool *keep_arc,
+    bool *dbg_found_zero) {
+  // Gathers what is needed to decide whether the arc of the iprev-th token
+  // of a (iframe, fst_state) list goes into the lattice: its costs, its
+  // predecessor token, its source state, and the keep/drop decision.
+  // Without a cost array, there is only one token for that (iframe,
+  // fst_state): its arc is the only way to get to that state, so it is the
+  // best one, with an extra_cost of zero.
+  CostType arc_extra_cost = 0.0f;
+  if (extra_extra_and_acoustic_cost_beg == NULL) {
+    *acoustic_cost = h_all_tokens_acoustic_cost_[ichannel][token_idx];
+  } else {
+    const float2 costs = extra_extra_and_acoustic_cost_beg[iprev];
+    arc_extra_cost = costs.x;
+    *acoustic_cost = costs.y;
+  }
+  // We need at least one arc_extra_cost of zero for each (iframe,
+  // fst_state). That boolean is only used in a KALDI_ASSERT: if something
+  // went wrong in the kernels, that property is unlikely to hold by luck.
+  if (arc_extra_cost == 0.0f) *dbg_found_zero = true;
+
+  // Extra cost prev_token will have if we use that arc to go to it (sum of
+  // the extra costs, through the cheapest possible way, to the last frame)
+  const CostType src_extra_cost = token_extra_cost + arc_extra_cost;
+  *this_arc_prev_token_extra_cost = src_extra_cost;
+
+  // Predecessor token, and source of the arc currently considered
+  *list_prev_token = h_all_tokens_info_[ichannel][list_prev_token_idx];
+  if (list_prev_token_idx == 0) {
+    *lattice_src_state = fst_lattice_start;
+  } else {
+    *lattice_src_state = GetLatticeStateInternalId(
+        total_ntokens, list_prev_token_idx, *list_prev_token);
+  }
+  // We only keep the arc if, when using it, we can end up at the last frame
+  // with a cost not worse than (best+lattice_beam)
+  *keep_arc = (src_extra_cost < lattice_beam_);
+}
+
+void CudaDecoder::SwapPrevAndCurrLatticeMap(int32 iframe,
+                                            bool dbg_found_best_path) {
+  // "Previous frame" data becomes "current frame" data; prev restarts empty
+  f_arc_idx_added_.clear();
+  curr_f_raw_lattice_state_.swap(prev_f_raw_lattice_state_);
+  prev_f_raw_lattice_state_.clear();
+  q_curr_frame_todo_.swap(q_prev_frame_todo_);
+  q_prev_frame_todo_.clear();
+  KALDI_ASSERT(q_prev_frame_todo_.empty());
+  if (iframe <= 0) return;
+  // For iframe > 0 there must be something left to process in the new frame
+  KALDI_ASSERT(!q_curr_frame_todo_.empty());
+  if (!dbg_found_best_path) {
+    KALDI_WARN << "Warning didn't find exact best path in GetRawLattice";
+  }
+}
+
+void CudaDecoder::GetRawLattice(const std::vector<ChannelId> &channels,
+                                std::vector<Lattice *> &fst_out_vec,
+                                bool use_final_probs) {
+  // For each channel channels[i], builds the raw lattice of its utterance
+  // in fst_out_vec[i] (previous content is deleted). The lattice is built
+  // backward, frame by frame, starting from the final tokens.
+  KALDI_ASSERT(channels.size() == fst_out_vec.size());
+  // Getting the list of the best costs in the latest token queue.
+  // all costs within [best;best+lattice_beam]
+  GetBestCost(channels, use_final_probs, &argmins_,
+              &list_finals_token_idx_and_cost_, &has_reached_final_);
+
+  for (int32 ilane = 0; ilane < channels.size(); ++ilane) {
+    nvtxRangePushA("GetRawLatticeOneChannel");
+    const ChannelId ichannel = channels[ilane];
+    const int32 nframes = NumFramesDecoded(ichannel);
+
+    // Total number of tokens generated by the utterance on channel ichannel
+    const int32 total_ntokens = h_all_tokens_info_[ichannel].size();
+
+    // Preparing output lattice
+    // The start state has to be 0 (cf some asserts somewhere else in Kaldi)
+    // Adding it now
+    Lattice *fst_out = fst_out_vec[ilane];
+    fst_out->DeleteStates();
+    OutputLatticeState fst_lattice_start = fst_out->AddState();
+    fst_out->SetStart(fst_lattice_start);
+
+    ResetDataForGetRawLattice();
+    // Adding the best tokens returned by GetBestCost to the lattice
+    // We also add them to q_curr_frame_todo, and we'll backtrack from there
+    AddFinalTokensToLattice(ilane, ichannel, fst_out);
+
+    // We're now going to backtrack frame by frame
+    // For each frame we're going to process tokens that need to be inserted
+    // into the output lattice
+    // and add their predecessors to the todo list
+    // iframe == -1 contains the start state and the first non emitting tokens.
+    // It is not linked to a real frame
+    for (int32 iframe = nframes - 1; iframe >= -1; --iframe) {
+      // Tokens for the current frame were inserted after this offset in the
+      // token list
+      const int32 curr_frame_offset =
+          (iframe >= 0) ? frame_offsets_[ichannel][iframe] : 0;
+
+      // must_replay_frame: in some cases we can update an extra_cost that has
+      // already been used. For instance we process arcs in that order:
+      // 1) a -> b, which updates extra_cost[b] using extra_cost[a]
+      // 2) c -> a, which updates extra_cost[a] (using extra_cost[c])
+      // Because the arcs were not considered in topological order, we need
+      // to run step 1 again, to get the correct extra_cost[b] (using the
+      // latest extra_cost[a]). However, we only re-run step 1 if the value
+      // extra_cost[a] has changed more than extra_cost_min_delta_
+      bool must_replay_frame;
+
+      // dbg_found_best_path feeds a sanity check (a warning emitted by
+      // SwapPrevAndCurrLatticeMap), making sure the best path is still there
+      // for each frame. If something went wrong in the kernels, it's not
+      // likely we respect that property out of luck
+      bool dbg_found_best_path = false;
+      do {
+        must_replay_frame = false;
+        // Reading something to do. We are pushing stuff back in
+        // q_curr_frame_todo_ while reading it,
+        // so it's important to always read q_curr_frame_todo_.size() directly
+        // not using a queue, because we may need to recompute the frame (if
+        // must_replay_frame is true)
+        for (int32 u = 0; u < q_curr_frame_todo_.size(); ++u) {
+          TokenId token_idx;
+          InfoToken token;
+          std::tie(token_idx, token) = q_curr_frame_todo_[u];
+          KALDI_ASSERT(token_idx >= curr_frame_offset);
+          CostType token_extra_cost;
+          OutputLatticeState to_fst_lattice_state;
+          // Loading the current extra_cost of that token
+          // + its associated state in the lattice
+          GetTokenRawLatticeData(token_idx, token, total_ntokens,
+                                 &token_extra_cost, &to_fst_lattice_state);
+          dbg_found_best_path |= (token_extra_cost == 0.0f);
+
+          InfoToken *tok_beg;
+          float2 *extra_extra_and_acoustic_cost_beg;
+          int32 nsamestate;
+          // Getting the list of the tokens linked to the same FST state, in
+          // the same frame. In the GPU decoder a token is linked to a single
+          // arc, but we can generate multiple tokens for a same fst_nextstate
+          // in the same frame. In the CPU decoder we would use the
+          // forward_links list to store everything in the same metatoken
+          GetSameFSTStateTokenList(ichannel, token, &tok_beg,
+                                   &extra_extra_and_acoustic_cost_beg,
+                                   &nsamestate);
+          // GetSameFSTStateTokenList takes `token` by value: for a unique
+          // token, tok_beg comes back pointing to that parameter copy, which
+          // is destroyed on return. Re-pointing it to our own copy, which
+          // stays alive for the whole loop below
+          if (token.IsUniqueTokenForStateAndFrame()) tok_beg = &token;
+
+          // Used for debugging. For each FST state, we have a token with the
+          // best cost for that FST state
+          // that token has an extra_cost of 0.0f. This is a sanity check
+          bool dbg_found_zero = false;
+          for (int32 iprev = 0; iprev < nsamestate; ++iprev) {
+            int32 list_prev_token_idx, list_arc_idx;
+            InfoToken list_prev_token;
+            CostType acoustic_cost, this_arc_prev_token_extra_cost;
+            bool keep_arc;
+            LatticeStateInternalId src_state_internal_id;
+            InfoToken list_token = tok_beg[iprev];
+            list_prev_token_idx = list_token.prev_token;
+            list_arc_idx = list_token.arc_idx;
+
+            ConsiderTokenForLattice(
+                ichannel, iprev, total_ntokens, token_idx, fst_lattice_start,
+                tok_beg, extra_extra_and_acoustic_cost_beg, token_extra_cost,
+                list_prev_token_idx, list_arc_idx, &list_prev_token,
+                &this_arc_prev_token_extra_cost, &acoustic_cost,
+                &src_state_internal_id, &keep_arc, &dbg_found_zero);
+
+            if (keep_arc)
+              AddArcToLattice(list_arc_idx, list_prev_token_idx,
+                              list_prev_token, curr_frame_offset, acoustic_cost,
+                              this_arc_prev_token_extra_cost,
+                              src_state_internal_id, fst_lattice_start,
+                              to_fst_lattice_state, fst_out,
+                              &must_replay_frame);
+          }
+          KALDI_ASSERT(dbg_found_zero);
+        }
+
+        if (must_replay_frame) {
+          // We need to replay the frame. Because all states will be read again,
+          // we can reopen them (and they will be closed again when being read
+          // from again)
+          for (auto it = curr_f_raw_lattice_state_.begin();
+               it != curr_f_raw_lattice_state_.end(); ++it) {
+            it->second.is_state_closed = false;
+          }
+        }
+      } while (must_replay_frame);
+
+      // Done processing this frame. Swap the datastructures, move on to
+      // previous frame (we go --iframe)
+      SwapPrevAndCurrLatticeMap(iframe, dbg_found_best_path);
+    }
+
+    nvtxRangePop();
+  }
+}
+
+void CudaDecoder::SetChannelsInKernelParams(
+    const std::vector<ChannelId> &channels) {
+  KALDI_ASSERT(channels.size() <= nchannels_);
+  KALDI_ASSERT(channels.size() <= nlanes_);
+  nlanes_used_ = channels.size();  // same count as in the kernel params
+  h_kernel_params_->nlanes_used = channels.size();
+  for (LaneId ilane = 0; ilane < channels.size(); ++ilane)  // lane -> channel
+    h_kernel_params_->channel_to_compute[ilane] = channels[ilane];
+}
+
+void CudaDecoder::ResetChannelsInKernelParams() {
+  nlanes_used_ = 0;  // no lane in use, here and in the kernel params
+  h_kernel_params_->nlanes_used = 0;
+}
+
+int32 CudaDecoder::NumFramesDecoded(ChannelId ichannel) const {
+  KALDI_ASSERT(ichannel < nchannels_);  // ichannel must be below nchannels_
+  return num_frames_decoded_[ichannel];  // per-channel counter
+}
+
+void CudaDecoder::CheckStaticAsserts() {
+  // Checking if all constants look ok. Despite the name of this function,
+  // these are KALDI_ASSERTs, checked when the function is called.
+  //
+  // HISTO_NBINS must be lower than 1D_BLOCK: we need that to be able to do
+  // the scan in one pass in the kernel update_beam_using_histogram_kernel
+  KALDI_ASSERT(KALDI_CUDA_DECODER_HISTO_NBINS < KALDI_CUDA_DECODER_1D_BLOCK);
+  KALDI_ASSERT(KALDI_CUDA_DECODER_NONEM_LT_MAX_NARCS > 0);
+}
+/*
+        int32 CudaDecoder::NumFramesDecoded() const {
+                return NumFramesDecoded(0);
+        }
+*/
+}  // end namespace cuda_decoder
+}  // end namespace kaldi
+
+#endif  // HAVE_CUDA == 1
diff --git a/src/cudadecoder/cuda-decoder.h b/src/cudadecoder/cuda-decoder.h
new file mode 100644
index 00000000000..78a9047b5b0
--- /dev/null
+++ b/src/cudadecoder/cuda-decoder.h
@@ -0,0 +1,755 @@
+// cudadecoder/cuda-decoder.h
+//
+// Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+// Hugo Braun, Justin Luitjens, Ryan Leary
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_CUDA_DECODER_CUDA_DECODER_H_
+#define KALDI_CUDA_DECODER_CUDA_DECODER_H_
+
+#include "cudadecoder/cuda-decodable-itf.h"
+#include "cudadecoder/cuda-decoder-common.h"
+#include "cudadecoder/cuda-fst.h"
+#include "nnet3/decodable-online-looped.h"
+
+#include <cuda_runtime_api.h>
+#include <tuple>
+#include <vector>
+
+namespace kaldi {
+namespace cuda_decoder {
+
+// Options of CudaDecoder: defaults, option registration and validation.
+struct CudaDecoderConfig {
+  BaseFloat default_beam;
+  BaseFloat lattice_beam;
+  int32 ntokens_pre_allocated;
+  int32 main_q_capacity, aux_q_capacity;
+  int32 nlanes = 0, nchannels = 0;  // not registered as options; default to 0
+  int32 max_active;
+
+  CudaDecoderConfig()
+      : default_beam(15.0),
+        lattice_beam(10.0),
+        ntokens_pre_allocated(2000000),
+        main_q_capacity(50000),
+        aux_q_capacity(500000),
+        max_active(10000) {}
+
+  void Register(OptionsItf *opts) {  // exposes the tunable fields as options
+    opts->Register("beam", &default_beam,
+                   "Decoding beam. Larger->slower, more accurate. If "
+                   "aux-q-capacity is too small, we may decrease the beam "
+                   "dynamically to avoid overflow (adaptive beam, see "
+                   "aux-q-capacity parameter)");
+    opts->Register("lattice-beam", &lattice_beam,
+                   "The width of the lattice beam");
+    opts->Register("max-active", &max_active,
+                   "At the end of each frame computation, we keep only its "
+                   "best max-active tokens. One token is the instantiation of "
+                   "a single arc. Typical values are within the 5k-10k range.");
+    opts->Register("ntokens-pre-allocated", &ntokens_pre_allocated,
+                   "Advanced - Number of tokens pre-allocated in host buffers. "
+                   "If this size is exceeded the buffer will reallocate, "
+                   "reducing performance.");
+    opts->Register("aux-q-capacity", &aux_q_capacity,
+                   "Advanced - Capacity of the auxiliary queue : Maximum "
+                   "number of raw tokens that can be stored *before* pruning "
+                   "for each frame. Lower -> less memory usage, Higher -> More "
+                   "accurate. During the tokens generation, if we detect that "
+                   "we are getting close to saturating that capacity, we will "
+                   "reduce the beam dynamically (adaptive beam) to keep only "
+                   "the best tokens in the remaining space. If the aux queue "
+                   "is still too small, we will print an overflow warning, but "
+                   "prevent the overflow. The computation can safely continue, "
+                   "but the quality of the output may decrease. We strongly "
+                   "recommend keeping aux-q-capacity large (>400k), to avoid "
+                   "triggering the adaptive beam and/or the overflow.");
+    opts->Register("main-q-capacity", &main_q_capacity,
+                   "Advanced - Capacity of the main queue : Maximum number of "
+                   "tokens that can be stored *after* pruning for each frame. "
+                   "Lower -> less memory usage, Higher -> More accurate. "
+                   "Tokens stored in the main queue were already selected "
+                   "through a max-active pre-selection. It means that for each "
+                   "emitting/non-emitting iteration, we can add at most "
+                   "~max-active tokens to the main queue. Typically only the "
+                   "emitting iteration creates a large number of tokens. Using "
+                   "main-q-capacity=k*max-active with k=4..10 should be safe. "
+                   "If main-q-capacity is too small, we will print a warning "
+                   "but prevent the overflow. The computation can safely "
+                   "continue, but the quality of the output may decrease.");
+  }
+  void Check() const {  // asserts the option values are mutually consistent
+    KALDI_ASSERT(default_beam > 0.0 && ntokens_pre_allocated >= 0 &&
+                 main_q_capacity > 0 && aux_q_capacity >= main_q_capacity &&
+                 lattice_beam >= 0.0f && max_active > 1);
+  }
+};
+
+// Forward declaration.
+// Those contain CUDA code. We don't want to include their definition
+// in this header
+class DeviceParams;
+class KernelParams;
+
+class CudaDecoder {
+ public:
+  // Creating a new CudaDecoder, associated to the FST fst
+  // nlanes and nchannels are defined as follow
+
+  // A decoder channel is linked to one utterance.
+  // When we need to perform decoding on an utterance,
+  // we pick an available channel, call InitDecoding on that channel
+  // (with that ChannelId in the channels vector in the arguments)
+  // then call AdvanceDecoding whenever frames are ready for the decoder
+  // for that utterance (also passing the same ChannelId to AdvanceDecoding)
+  //
+  // A decoder lane is where the computation actually happens
+  // a channel is loaded into a lane, which performs the actual decoding
+  // of that channel.
+  // If we have 200 lanes, we can compute 200 utterances (channels)
+  // at the same time. We need many lanes in parallel to saturate the big GPUs
+  //
+  // An analogy would be lane -> a CPU core, channel -> a software thread
+  // A channel saves the current state of the decoding for a given utterance.
+  // It can be kept idle until more frames are ready to be processed
+  //
+  // We will use as many lanes as necessary to saturate the GPU, but not more.
+  // A lane has a higher memory usage than a channel. If you just want to be
+  // able to
+  // keep more audio channels open at the same time (when I/O is the bottleneck
+  // for instance,
+  // typically in the context of online decoding), you should instead use more
+  // channels.
+  //
+  // A channel is typically way smaller in terms of memory usage, and can be
+  // used to oversubscribe lanes in the context of online decoding
+  // For instance, we could choose nlanes=200 because it gives us good
+  // performance
+  // on a given GPU. It gives us an end-to-end performance of 3000 XRTF. We are
+  // doing online,
+  // so we only get audio at realtime speed for a given utterance/channel.
+  // We then decide to receive audio from 2500 audio channels at the same time
+  // (each at realtime speed),
+  // and as soon as we have frames ready for nlanes=200 channels, we call
+  // AdvanceDecoding on those channels
+  // In that configuration, we have nlanes=200 (for performance), and
+  // nchannels=2500 (to have enough audio
+  // available at a given time).
+  // Using nlanes=2500 in that configuration would first not be possible (out of
+  // memory), but also not necessary.
+  // Increasing the number of lanes is only useful if it increases performance.
+  // If the GPU is saturated at nlanes=200,
+  // you should not increase that number
+  CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config, int32 nlanes,
+              int32 nchannels);
+  // Special constructor for nlanes = nchannels. Here for the non-advanced user
+  // Here we can consider nchannels = batch size. If we want to decode 10
+  // utterances at a time,
+  // we can use nchannels = 10
+  CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config,
+              int32 nchannels)
+      : CudaDecoder(fst, config, nchannels, nchannels) {}
+  ~CudaDecoder();
+
+  // InitDecoding initializes the decoding, and should only be used if you
+  // intend to call AdvanceDecoding() on the channels listed in channels
+  void InitDecoding(const std::vector<ChannelId> &channels);
+  // AdvanceDecoding on a given batch
+  // a batch is defined by the channels vector
+  // We can compute N channels at the same time (in the same batch)
+  // where N = number of lanes, as defined in the constructor
+  // AdvanceDecoding will compute as many frames as possible while running the
+  // full batch
+  // when at least one channel has no more frames ready to be computed,
+  // AdvanceDecoding returns
+  // The user then decides what to do, i.e.:
+  //
+  // 1) either remove the empty channel from the channels list
+  // and call again AdvanceDecoding
+  // 2) or swap the empty channel with another one that has frames ready
+  // and call again AdvanceDecoding
+  //
+  // Solution 2) should be preferred because we need to run full, big batches to
+  // saturate the GPU
+  //
+  // If max_num_frames is >= 0 it will decode no more than
+  // that many frames.
+  void AdvanceDecoding(const std::vector<ChannelId> &channels,
+                       std::vector<CudaDecodableInterface *> &decodables,
+                       int32 max_num_frames = -1);
+
+  // Returns the number of frames already decoded in a given channel
+  int32 NumFramesDecoded(ChannelId ichannel) const;
+  // GetBestPath gets the one-best decoding traceback. If "use_final_probs" is
+  // true
+  // AND we reached a final state, it limits itself to final states;
+  // otherwise it gets the most likely token not taking into account
+  // final-probs.
+  void GetBestPath(const std::vector<ChannelId> &channels,
+                   std::vector<Lattice *> &fst_out_vec,
+                   bool use_final_probs = true);
+  // GetRawLattice gets the lattice decoding traceback (using the lattice-beam
+  // in the CudaConfig parameters).
+  // If "use_final_probs" is true
+  // AND we reached a final state, it limits itself to final states;
+  // otherwise it gets the most likely token not taking into account
+  // final-probs.
+  void GetRawLattice(const std::vector<ChannelId> &channels,
+                     std::vector<Lattice *> &fst_out_vec, bool use_final_probs);
+  // GetBestCost finds the best cost in the last tokens queue
+  // for each channel in channels. If isfinal is true,
+  // we also add the final cost to the token costs before
+  // finding the minimum cost
+  // We list all tokens that have a cost within [best; best+lattice_beam]
+  // in list_lattice_tokens.
+  // We also set has_reached_final[ichannel] to true if a token associated to a
+  // final state
+  // exists in the last token queue of that channel
+  void GetBestCost(
+      const std::vector<ChannelId> &channels, bool isfinal,
+      std::vector<std::pair<int32, CostType>> *argmins,
+      std::vector<std::vector<std::pair<int, float>>> *list_lattice_tokens,
+      std::vector<bool> *has_reached_final);
+
+ private:
+  // Data allocation. Called in constructor
+  void AllocateDeviceData();
+  void AllocateHostData();
+  void AllocateDeviceKernelParams();
+  // Data initialization. Called in constructor
+  void InitDeviceData();
+  void InitHostData();
+  void InitDeviceParams();
+  // Computes the initial channel
+  // The initial channel is used to initialize a channel
+  // when a new utterance starts (we clone it into the given channel)
+  void ComputeInitialChannel();
+  // Updates *h_kernel_params using channels
+  void SetChannelsInKernelParams(const std::vector<ChannelId> &channels);
+  void ResetChannelsInKernelParams();
+  // Context-switch functions
+  // Used to perform the context-switch of loading/saving the state of a channel
+  // into a lane. When a channel will be executed on a lane, we load that
+  // channel into that lane (same idea as when we load a software thread into
+  // the registers of a CPU)
+  void LoadChannelsStateToLanes(const std::vector<ChannelId> &channels);
+  void SaveChannelsStateFromLanes();
+  // We compute the decodes by batch. Each decodable in the batch has a
+  // different number of frames ready
+  // We compute the min number of frames ready (so that the full batch is
+  // executing). If max_num_frames
+  // is > 0, we apply that ceiling to the NumFramesToDecode.
+  int32 NumFramesToDecode(const std::vector<ChannelId> &channels,
+                          std::vector<CudaDecodableInterface *> &decodables,
+                          int32 max_num_frames);
+  // Expand the arcs, emitting stage. Must be called after
+  // a preprocess_in_place, which happens in PostProcessingMainQueue.
+  // ExpandArcsEmitting is called first when decoding a frame,
+  // using the preprocessing that happened at the end of the previous frame,
+  // in PostProcessingMainQueue
+  void ExpandArcsEmitting();
+  // ExpandArcs, non-emitting stage. Must be called after PruneAndPreprocess.
+  // if *should_iterate is true, we should do another iteration of the
+  // PruneAndPreprocess/ExpandArcsNonEmitting pair
+  void ExpandArcsNonEmitting(bool *should_iterate);
+  // If we have more than max_active_ tokens in the queue (either after an
+  // expand, or at the end of the frame)
+  // we will compute a new beam that will only keep a number of tokens as close
+  // as possible to max_active_ tokens
+  // (that number is >= max_active_) (soft topk)
+  // All ApplyMaxActiveAndReduceBeam does is find the right beam for that topk
+  // and set it.
+  // We need to then call PruneAndPreprocess (explicitly pruning tokens with
+  // cost > beam)
+  // Or PostProcessingMainQueue (ignoring tokens with cost > beam in the next
+  // frame)
+  void ApplyMaxActiveAndReduceBeam(enum QUEUE_ID queue_id);
+  // Called after an ExpandArcs. Prune the aux_q (output of the ExpandArcs),
+  // move the survival tokens to the main_q, do the preprocessing at the same
+  // time
+  // We don't need it after the last ExpandArcsNonEmitting.
+  void PruneAndPreprocess();
+  // Moving the acoustic_costs of emitting tokens to host in two stage.
+  // StartCopyAcousticCostsToHostAsync concatenate the data on device and start
+  // the device2host copy
+  // FinalizeCopyAcousticCostsToHost move the data that arrived on host into the
+  // right vectors for storage
+  // a sync on compute_st_ has to happen between those two stage
+  void StartCopyAcousticCostsToHostAsync();  // stage 1
+  void FinalizeCopyAcousticCostsToHost();    // stage 2. Need a sync on
+                                             // compute_st_ between 1 and 2
+  // Once the non-emitting is done, the main_q is final for that frame.
+  // We now generate all the data associated with that main_q, such as listing
+  // the different tokens sharing the same token.next_state
+  // we also preprocess for the ExpandArcsEmitting of the next frame
+  // Once PostProcessingMainQueue, all working data is back to its original
+  // state, to make sure we're ready for the next context switch
+  void PostProcessingMainQueue();
+  // Moving the relevant data to host, ie the data that will be needed in
+  // GetBestPath/GetRawLattice.
+  // Happens when PostProcessingMainQueue is done generating that data
+  void CopyMainQueueDataToHost();
+  // CheckOverflow
+  // If a kernel sets the flag h_q_overflow, we send a warning to stderr
+  // Overflows are detected and prevented on the device. It only means
+  // that we've discarded the tokens that were created after the queue was full
+  // That's why we only send a warning. It is not a fatal error
+  void CheckOverflow();
+  // Evaluates the function func for each lane, returning the max of all return
+  // values
+  // (func returns int32)
+  // Used for instance to get the max number of arcs for all lanes
+  // func is called with h_lanes_counters_[ilane] for each lane.
+  // h_lanes_counters_
+  // must be ready to be used when calling GetMaxForAllLanes (you might want to
+  // call
+  // CopyLaneCountersToHost[A|]sync to make sure everything is ready first)
+  int32 GetMaxForAllLanes(std::function<int32(const LaneCounters &)> func);
+  // Copy the lane counters back to host, async or sync
+  // The lanes counters contain all the information such as main_q_end (number
+  // of tokens in the main_q)
+  // main_q_narcs (number of arcs) during the computation. That's why we
+  // frequently copy it back to host
+  // to know what to do next
+  void CopyLaneCountersToHostAsync();
+  void CopyLaneCountersToHostSync();
+  // The selected tokens for each frame will be copied back to host. We will
+  // store them on host memory, and we will use them to create the final lattice
+  // once we've reached the last frame
+  // We will also copy information on those tokens that we've generated on the
+  // device, such as which tokens are associated to the same FST state in the
+  // same frame, or their extra cost.
+  // We cannot call individuals Device2Host copies for each channel, because it
+  // would lead to a lot of small copies, reducing performance. Instead we
+  // concatenate all channels data into a single
+  // continuous array, copy that array to host, then unpack it to the individual
+  // channel vectors
+  // The first step (pack then copy to host, async) is done in
+  // PerformConcatenatedCopy
+  // The second step is done in MoveConcatenatedCopyToVector
+  // A sync on cudaStream st has to happen between the two functions to make
+  // sure that the copy is done
+  //
+  // Each lane contains X elements to be copied, where X = func(ilane)
+  // That data is contained in the array (pointer, X), with pointer = src[ilane]
+  // It will be concatenated in d_concat on device, then copied async into
+  // h_concat
+  // That copy is launched on stream st
+  // The offset of the data of each lane in the concatenate array is saved in
+  // *lanes_offsets_ptr
+  // it will be used for unpacking in MoveConcatenatedCopyToVector
+  //
+  // func is called with h_lanes_counters_[ilane] for each lane.
+  // h_lanes_counters_
+  // must be ready to be used when calling PerformConcatenatedCopy (you might
+  // want to call
+  // CopyLaneCountersToHost[A|]sync to make sure everything is ready first)
+  template <typename T>
+  void PerformConcatenatedCopy(std::function<int32(const LaneCounters &)> func,
+                               LaneMatrixView<T> src, T *d_concat, T *h_concat,
+                               cudaStream_t st,
+                               std::vector<int32> *lanes_offsets_ptr);
+  template <typename T>
+  void MoveConcatenatedCopyToVector(const std::vector<int32> &lanes_offsets,
+                                    T *h_concat,
+                                    std::vector<std::vector<T>> *vecvec);
+  // Computes a set of static asserts on the static values
+  // such as the defines : KALDI_CUDA_DECODER_MAX_N_LANES for example
+  // In theory we should do them at compile time
+  void CheckStaticAsserts();
+  // Can be called in GetRawLattice to do a bunch of deep asserts on the data
+  // Slow, so disabled by default
+  void DebugValidateLattice();
+
+  //
+  // Data members
+  //
+
+  // The CudaFst data structure contains the FST graph
+  // in the CSR format, on both the GPU and CPU memory
+  const CudaFst fst_;
+  // Counters used by a decoder lane
+  // Contains all the single values generated during computation,
+  // such as the current size of the main_q, the number of arcs currently in
+  // that queue
+  // We load data from the channel state during context-switch (for instance the
+  // size of the last token queue for that channel)
+  LaneCounters *h_lanes_counters_;
+  // Counters of channels
+  // Contains all the single values saved to remember the state of a channel
+  // not used during computation. Those values are loaded/saved into/from a lane
+  // during context switching
+  ChannelCounters *h_channels_counters_;
+  // Contain the various counters used by lanes/channels, such as main_q_end,
+  // main_q_narcs. On device memory (equivalent of h_channels_counters on
+  // device)
+  DeviceChannelMatrix<ChannelCounters> d_channels_counters_;
+  DeviceLaneMatrix<LaneCounters> d_lanes_counters_;
+  // Number of lanes and channels, as defined in the constructor arguments
+  int32 nlanes_, nchannels_;
+
+  // We will now define the data used on the GPU
+  // The data is mainly linked to two token queues
+  // - the main queue
+  // - the auxiliary queue
+  //
+  // The auxiliary queue is used to store the raw output of ExpandArcs.
+  // We then prune that aux queue (and apply max-active) and move the survival
+  // tokens in the main queue.
+  // Tokens stored in the main q can then be used to generate new tokens (using
+  // ExpandArcs)
+  // We also generate more information about what's in the main_q at the end of
+  // a frame (in PostProcessingMainQueue)
+  //
+  // As a reminder, here's the data structure of a token :
+  //
+  // struct Token { state, cost, prev_token, arc_idx }
+  //
+  // Please keep in mind that this structure is also used in the context
+  // of lattice decoding. We are not storing a list of forward links like in the
+  // CPU decoder. A token stays an instantiation of a single arc.
+  //
+  // For performance reasons, we split the tokens in three parts :
+  // { state } , { cost }, { prev_token, arc_idx }
+  // Each part has its associated queue
+  // For instance, d_main_q_state[i], d_main_q_cost[i], d_main_q_info[i]
+  // all refer to the same token (at index i)
+  // The data structure InfoToken contains { prev_token, arc_idx }
+  // We also store the acoustic costs independently in d_main_q_acoustic_cost_
+  //
+  // The data is either linked to a channel, or to a lane.
+  //
+  // Channel data (DeviceChannelMatrix):
+  //
+  // The data linked with a channel contains the data of frame i we need to
+  // remember
+  // to compute frame i+1. It is the list of tokens from frame i, with some
+  // additional info
+  // (ie the prefix sum of the emitting arcs degrees from those tokens).
+  // We are only storing d_main_q_state_and_cost_ as channel data because that's
+  // all we need in a token to compute
+  // frame i+1. We don't need token.arc_idx or token.prev_token.
+  // The reason why we also store that prefix sum is because we do the emitting
+  // preprocessing
+  // at the end of frame i. The reason for that is that we need infos from the
+  // hashmap to do that preprocessing.
+  // The hashmap is always cleared at the end of a frame. So we need to do the
+  // preprocessing at the end of frame i,
+  // and then save d_main_q_degrees_prefix_sum_. d_main_q_arc_offsets is
+  // generated also during preprocessing.
+  //
+  // Lane data (DeviceLaneMatrix):
+  //
+  // The lane data is everything we use during computation, but which we reset
+  // at the end of each frame.
+  // For instance we use a hashmap at some point during the computation, but at
+  // the end of each frame we reset it. That
+  // way that hashmap is able to compute whichever channel the next time
+  // AdvanceDecoding is called. The reasons why we do that is :
+  //
+  // - We use context switching. Before and after every frames, we can do a
+  // context switching. Which means that a lane cannot save a channel's state
+  // in any way once AdvanceDecoding returns. e.g., during a call of
+  // AdvanceDecoding, ilane=2 may compute 5 frames from channel=57 (as defined
+  // in the std::vector<ChannelId> channels).
+  // In the next call, the same ilane=2 may compute 10 frames from channel=231.
+  // A lane data has to be reset to its original state at the end of each
+  // AdvanceDecoding call.
+  // If somehow some data has to be saved, it needs to be declared as channel
+  // data.
+  //
+  // - The reason why we make the distinction between lane and channel data (in
+  // theory everything could be consider channel data), is because
+  // a lane uses more memory than a channel. In the context of online decoding,
+  // we need to create a lot channels, and we need them to be as small as
+  // possible in memory.
+  // Everything that can be reused between channels is stored as lane data.
+
+  //
+  // Channel data members:
+  //
+
+  DeviceChannelMatrix<int2> d_main_q_state_and_cost_;
+  // Prefix sum of the arc's degrees in the main_q. Used by ExpandArcs,
+  // set in the preprocess stages (either PruneAndPreprocess or
+  // preprocess_in_place in PostProcessingMainQueue)
+  DeviceChannelMatrix<int32> d_main_q_degrees_prefix_sum_;
+  // d_main_q_arc_offsets[i] = fst_.arc_offsets[d_main_q_state[i]]
+  // we pay the price for the random memory accesses of fst_.arc_offsets in the
+  // preprocess kernel
+  // we cache the results in d_main_q_arc_offsets which will be read in a
+  // coalesced fashion in expand
+  DeviceChannelMatrix<int32> d_main_q_arc_offsets_;
+
+  //
+  // Lane data members:
+  //
+
+  // InfoToken
+  // Usually contains {prev_token, arc_idx}
+  // If more than one token is associated to a fst_state,
+  // it will contain where to find the list of those tokens in
+  // d_main_q_extra_prev_tokens
+  // ie {offset,size} in that list. We differentiate the two situations by
+  // calling InfoToken.IsUniqueTokenForStateAndFrame()
+  DeviceLaneMatrix<InfoToken> d_main_q_info_;
+  // Acoustic cost of a given token
+  DeviceLaneMatrix<CostType> d_main_q_acoustic_cost_;
+  // At the end of a frame, we use a hashmap to detect the tokens that are
+  // associated with the same FST state S
+  // We do it at the very end, to only use the hashmap on post-prune, post-max
+  // active tokens
+  DeviceLaneMatrix<HashmapValueT> d_hashmap_values_;
+  // Reminder: in the GPU lattice decoder, a token is always associated
+  // to a single arc. Which means that multiple tokens in the same frame
+  // can be associated with the same FST state.
+  //
+  // We are NOT listing those duplicates as ForwardLinks in an unique meta-token
+  // like in the CPU lattice decoder
+  //
+  // When more than one token is associated to a single FST state,
+  // we will list those tokens into another list : d_main_q_extra_prev_tokens
+  // we will also save data useful in such a case, such as the extra_cost of a
+  // token compared to the best for that state
+  DeviceLaneMatrix<InfoToken> d_main_q_extra_prev_tokens_;
+  DeviceLaneMatrix<float2> d_main_q_extra_and_acoustic_cost_;
+  // Histogram. Used to perform the histogram of the token costs
+  // in the main_q. Used to perform a soft topk of the main_q (max-active)
+  DeviceLaneMatrix<int32> d_histograms_;
+  // When filling the hashmap in PostProcessingMainQueue, we create a hashmap
+  // value for each FST state
+  // presents in the main_q (if at least one token is associated with that
+  // state)
+  // d_main_q_state_hash_idx_[token_idx] is the index of the state token.state
+  // in the hashmap
+  // Stored into a FSTStateHashIndex, which is actually a int32.
+  // FSTStateHashIndex should only
+  // be accessed through [Get|Set]FSTStateHashIndex, because it uses the bit
+  // sign to also remember if that token is the representative of that state.
+  // If only one token is associated with S, its representative will be itself
+  DeviceLaneMatrix<FSTStateHashIndex> d_main_q_state_hash_idx_;
+  // local_idx of the extra cost list for a state
+  // For a given state S, first token associated with S will have local_idx=0
+  // the second one local_idx=1, etc. The order of the local_idxs is random
+  DeviceLaneMatrix<int32> d_main_q_n_extra_prev_tokens_local_idx_;
+  // Where to write the extra_prev_tokens in the d_main_q_extra_prev_tokens_
+  // queue
+  DeviceLaneMatrix<int32> d_main_q_extra_prev_tokens_prefix_sum_;
+  // Used when computing the prefix_sums in preprocess_in_place. Stores
+  // the local_sums per CTA
+  DeviceLaneMatrix<int2> d_main_q_block_sums_prefix_sum_;
+  // Defining the aux_q. Filled by ExpandArcs.
+  // The tokens are moved to the main_q by PruneAndPreprocess
+  DeviceLaneMatrix<int2> d_aux_q_state_and_cost_;
+  DeviceLaneMatrix<CostType> d_aux_q_acoustic_cost_;
+  DeviceLaneMatrix<InfoToken> d_aux_q_info_;
+  // Dedicated space for the concat of extra_cost. We should reuse memory
+  DeviceLaneMatrix<float2> d_extra_and_acoustic_cost_concat_matrix;
+  // We will list in d_list_final_tokens_in_main_q all tokens within [min_cost;
+  // min_cost+lattice_beam]
+  // It is used when calling GetBestCost
+  // We only use an interface here because we will actually reuse data from
+  // d_aux_q_state_and_cost
+  // We are done using the aux_q when GetBestCost is called, so we can reuse
+  // that memory
+  LaneMatrixView<int2> d_list_final_tokens_in_main_q_;
+  // Parameters used by the kernels
+  // DeviceParams contains all the parameters that won't change
+  // i.e. memory address of the main_q for instance
+  // KernelParams contains information that can change.
+  // For instance which channel is executing on which lane
+  DeviceParams *h_device_params_;
+  KernelParams *h_kernel_params_;
+  int32 nlanes_used_;  // number of lanes used in h_kernel_params_
+  // Initial channel
+  // When starting a new utterance,
+  // init_channel_id is used to initialize a channel
+  int32 init_channel_id_;
+  // CUDA streams used by the decoder
+  cudaStream_t compute_st_;
+  // Parameters extracted from CudaDecoderConfig
+  // Those are defined in CudaDecoderConfig
+  CostType default_beam_;
+  CostType lattice_beam_;
+  int32 ntokens_pre_allocated_;
+  int32 max_active_;         // Target value from the parameters
+  int32 max_active_thresh_;  // Target value + tolerance
+  int32 aux_q_capacity_;
+  int32 main_q_capacity_;
+  // Hashmap capacity. Multiple of max_tokens_per_frame
+  int32 hashmap_capacity_;
+
+  // The first index of all the following vectors (or vector<vector>)
+  // is the ChannelId. e.g., to get the number of frames decoded in channel 2,
+  // look into num_frames_decoded_[2].
+
+  // Keep track of the number of frames decoded in the current file.
+  std::vector<int32> num_frames_decoded_;
+  // Offsets of each frame in h_all_tokens_info_
+  std::vector<std::vector<int32>> frame_offsets_;
+  // Size of the main_q at the end of the emitting stage
+  std::vector<int32> main_q_emitting_end_;
+  // Data storage. We store on host what we will need in
+  // GetRawLattice/GetBestPath
+  std::vector<std::vector<InfoToken>> h_all_tokens_info_;
+  std::vector<std::vector<CostType>> h_all_tokens_acoustic_cost_;
+  std::vector<std::vector<InfoToken>> h_all_tokens_extra_prev_tokens_;
+  std::vector<std::vector<float2>>
+      h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_;
+  // Pinned memory arrays. Used for the DeviceToHost copies
+  float2 *h_extra_and_acoustic_cost_concat__,
+      *d_extra_and_acoustic_cost_concat__;
+  InfoToken *h_infotoken_concat_, *d_infotoken_concat_;
+  CostType *h_acoustic_cost_concat_, *d_acoustic_cost_concat_;
+  InfoToken *h_extra_prev_tokens_concat_;
+  int2 *h_list_final_tokens_in_main_q_;
+  // Offsets used in MoveConcatenatedCopyToVector
+  std::vector<int32> h_main_q_end_lane_offsets_,
+      h_emitting_main_q_end_lane_offsets_;
+  std::vector<int32> h_n_extra_prev_tokens_lane_offsets_;
+  // Used when calling GetBestCost
+  std::vector<std::pair<int32, CostType>> argmins_;
+  std::vector<bool> has_reached_final_;
+  std::vector<std::vector<std::pair<int32, CostType>>>
+      list_finals_token_idx_and_cost_;
+
+  // GetRawLattice helper
+  // Data used when building the lattice in GetRawLattice
+
+  // few typedef to make GetRawLattice easier to understand
+  // Returns a unique id for each (iframe, fst_state) pair
+  // We need to be able to quickly identity a (iframe, fst_state) ID
+  //
+  // A lattice state is defined by the pair (iframe, fst_state)
+  // A token is associated to a lattice state (iframe, token.next_state)
+  // Multiple token in the same frame can be associated to the same lattice
+  // state
+  // (they all go to the same token.next_state)
+  // We need to quickly identify what is the lattice state of a token.
+  // We are able to do that through GetLatticeStateInternalId(token),
+  // which returns the internal unique ID for each lattice state for a token
+  //
+  // When we build the output lattice, we get a new lattice state
+  // output_lattice_state = fst_out->AddState()
+  // We call this one OutputLatticeState
+  // The conversion between the two is done through maps
+  // [curr|prev]_f_raw_lattice_state_
+  typedef int32 LatticeStateInternalId;
+  typedef StateId OutputLatticeState;
+  typedef int32 TokenId;
+  LatticeStateInternalId GetLatticeStateInternalId(int32 total_ntokens,
+                                                   TokenId token_idx,
+                                                   InfoToken token);
+  // Keeping track of a variety of info about states in the lattice
+  // - token_extra_cost. A path going from the current lattice_state to the
+  // end has an extra cost
+  // compared to the best path (which has an extra cost of 0).
+  // token_extra_cost is the minimum of the extra_cost of all paths going from
+  // the current lattice_state
+  // to the final frame.
+  // - fst_lattice_state is the StateId of the lattice_state in fst_out (in
+  // the output lattice). lattice_state is an internal state used in
+  // GetRawLattice.
+  // - is_state_closed is true if the token_extra_cost has been read by
+  // another token. It means that the
+  // token_extra_cost value has been used, and if we modify token_extra_cost
+  // again, we may need to recompute the current frame (so that everyone uses
+  // the latest
+  // token_extra_cost value)
+  struct RawLatticeState {
+    CostType token_extra_cost;  // min extra cost from this state to the end
+    OutputLatticeState fst_lattice_state;  // this state's StateId in fst_out
+    bool is_state_closed;  // true once token_extra_cost was read by a token
+  };
+  // [prev|curr]_f_raw_lattice_state_
+  // Used to get information about a lattice state (i.e. a (iframe, fst_state)
+  // pair)
+  // using its LatticeStateInternalId (its ID inside of the decoder)
+  // It gives us the OutputLatticeState (its ID in the output lattice)
+  // alongside with the extra_cost of that state in the lattice
+  // Those maps are used to build the external lattice using what we know
+  // internally
+  // Using one map per frame. We always know to which frame a token belongs.
+  // Using one big map slows everything down
+  std::unordered_map<LatticeStateInternalId, RawLatticeState>
+      prev_f_raw_lattice_state_, curr_f_raw_lattice_state_;
+  // We want the unicity of each arc_idx for one frame. Important because we
+  // can replay a frame (and possibly add multiple time the same arc)
+  std::unordered_set<int32> f_arc_idx_added_;
+  // When backtracking, we read tokens in the current frame (in
+  // q_curr_frame_todo_),
+  // we backtrack the associated arc, and we add the predecessor either to
+  // q_curr_frame_todo_ (non-emitting arc, same frame)
+  // or q_prev_frame_todo_ (emitting arc, source in previous frame)
+  std::vector<std::pair<TokenId, InfoToken>> q_curr_frame_todo_;
+  std::vector<std::pair<TokenId, InfoToken>> q_prev_frame_todo_;
+  // extra_cost_min_delta_ used in the must_replay_frame situation. Please read
+  // comments
+  // associated with must_replay_frame in GetRawLattice to understand what it
+  // does
+  CostType extra_cost_min_delta_;
+  // Resets the GetRawLattice datastructures for a new lattice generation
+  void ResetDataForGetRawLattice();
+  // Using the output from GetBestPath, we add the best tokens (as selected in
+  // GetBestCost)
+  // from the final frame to the output lattice. We also fill the data
+  // structures
+  // (such as q_curr_frame_todo_, or curr_f_raw_lattice_state_) accordingly
+  void AddFinalTokensToLattice(LaneId ilane, ChannelId ichannel,
+                               Lattice *fst_out);
+  // Check if a token should be added to the lattice. If it should, then
+  // keep_arc will be true
+  void ConsiderTokenForLattice(
+      ChannelId ichannel, int32 iprev, int32 total_ntokens, TokenId token_idx,
+      OutputLatticeState fst_lattice_start, InfoToken *tok_beg,
+      float2 *arc_extra_cost_beg, CostType token_extra_cost,
+      TokenId list_prev_token_idx, int32 list_arc_idx,
+      InfoToken *list_prev_token, CostType *this_arc_prev_token_extra_cost,
+      CostType *acoustic_cost, OutputLatticeState *lattice_src_state,
+      bool *keep_arc, bool *dbg_found_zero);
+  // Add the arc to the lattice. Also updates what needs to be updated in the
+  // GetRawLattice datastructures.
+  void AddArcToLattice(int32 list_arc_idx, TokenId list_prev_token_idx,
+                       InfoToken list_prev_token, int32 curr_frame_offset,
+                       CostType acoustic_cost,
+                       CostType this_arc_prev_token_extra_cost,
+                       LatticeStateInternalId src_state_internal_id,
+                       OutputLatticeState fst_lattice_start,
+                       OutputLatticeState to_fst_lattice_state,
+                       Lattice *fst_out, bool *must_replay_frame);
+  // Read a token information
+  void GetTokenRawLatticeData(TokenId token_idx, InfoToken token,
+                              int32 total_ntokens, CostType *token_extra_cost,
+                              OutputLatticeState *to_fst_lattice_state);
+
+  // A token is an instance of an arc. It goes to an FST state
+  // (token.next_state). Multiple tokens in the same frame can go to the same
+  // FST state.
+  // GetSameFSTStateTokenList returns that list
+  void GetSameFSTStateTokenList(ChannelId ichannel, InfoToken token,
+                                InfoToken **tok_beg,
+                                float2 **arc_extra_cost_beg, int32 *nprevs);
+
+  // Swap datastructures at the end of a frame. prev becomes curr (we go
+  // backward)
+  void SwapPrevAndCurrLatticeMap(int32 iframe, bool dbg_found_best_path);
+
+  KALDI_DISALLOW_COPY_AND_ASSIGN(CudaDecoder);
+};
+
+}  // end namespace cuda_decoder
+}  // end namespace kaldi
+
+#endif  // KALDI_CUDA_DECODER_CUDA_DECODER_H_
diff --git a/src/cudadecoder/cuda-fst.cc b/src/cudadecoder/cuda-fst.cc
new file mode 100644
index 00000000000..6f899d87321
--- /dev/null
+++ b/src/cudadecoder/cuda-fst.cc
@@ -0,0 +1,209 @@
+// cudadecoder/cuda-fst.cc
+//
+// Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+// Hugo Braun, Justin Luitjens, Ryan Leary
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if HAVE_CUDA == 1
+
+#include "cudadecoder/cuda-fst.h"
+
+#include <cuda_runtime_api.h>
+#include <nvToolsExt.h>
+
+namespace kaldi {
+namespace cuda_decoder {
+
+void CudaFst::ComputeOffsets(const fst::Fst<StdArc> &fst) {
+  // The generic fst::Fst interface does not report how many states it has,
+  // so count them by walking the state iterator
+  num_states_ = 0;
+  fst::StateIterator<fst::Fst<StdArc> > siter(fst);
+  while (!siter.Done()) {
+    ++num_states_;
+    siter.Next();
+  }
+
+  // One final cost per state; the offset arrays carry one extra trailing
+  // entry (see below)
+  h_final_.resize(num_states_);
+  h_e_offsets_.resize(num_states_ + 1);
+  h_ne_offsets_.resize(num_states_ + 1);
+
+  // Running totals of the emitting / non-emitting arcs seen so far
+  e_count_ = 0;
+  ne_count_ = 0;
+  h_e_offsets_[0] = 0;
+  h_ne_offsets_[0] = 0;
+
+  for (int state = 0; state < num_states_; state++) {
+    h_final_[state] = fst.Final(state).Value();
+    // An arc is emitting iff its ilabel is non-zero
+    fst::ArcIterator<fst::Fst<StdArc> > aiter(fst, state);
+    for (; !aiter.Done(); aiter.Next()) {
+      if (aiter.Value().ilabel == 0)
+        ++ne_count_;
+      else
+        ++e_count_;
+    }
+    // Entry state + 1 is the exclusive end of this state's arc range
+    h_e_offsets_[state + 1] = e_count_;
+    h_ne_offsets_[state + 1] = ne_count_;
+  }
+
+  // Emitting arcs are stored before the non-emitting ones in the arc list,
+  // so shift every non-emitting offset (all num_states_ + 1 entries, the
+  // trailing one included) by the number of emitting arcs
+  for (unsigned int &ne_offset : h_ne_offsets_) ne_offset += e_count_;
+
+  arc_count_ = e_count_ + ne_count_;
+}
+
+void CudaFst::AllocateData(const fst::Fst<StdArc> &fst) {
+  // Every size used here was computed by ComputeOffsets(); 'fst' is not read
+  auto &device = CuDevice::Instantiate();
+  const size_t offsets_bytes = (num_states_ + 1) * sizeof(*d_e_offsets_);
+  d_e_offsets_ = static_cast<unsigned int *>(device.Malloc(offsets_bytes));
+  d_ne_offsets_ = static_cast<unsigned int *>(device.Malloc(offsets_bytes));
+  d_final_ =
+      static_cast<float *>(device.Malloc(num_states_ * sizeof(*d_final_)));
+
+  // Host-side per-arc arrays (these ilabels use id indexing)
+  h_arc_weights_.resize(arc_count_);
+  h_arc_nextstate_.resize(arc_count_);
+  h_arc_id_ilabels_.resize(arc_count_);
+  h_arc_olabels_.resize(arc_count_);
+
+  d_arc_weights_ = static_cast<float *>(
+      device.Malloc(arc_count_ * sizeof(*d_arc_weights_)));
+  d_arc_nextstates_ = static_cast<StateId *>(
+      device.Malloc(arc_count_ * sizeof(*d_arc_nextstates_)));
+  // Only the emitting arcs need their ilabels on the device, hence e_count_
+  d_arc_pdf_ilabels_ = static_cast<int32 *>(
+      device.Malloc(e_count_ * sizeof(*d_arc_pdf_ilabels_)));
+}
+
+void CudaFst::PopulateArcs(const fst::Fst<StdArc> &fst) {
+  // Two write cursors into the per-arc arrays: emitting arcs are packed
+  // from index 0, non-emitting arcs from index e_count_ (right after the
+  // emitting ones)
+  int next_e = 0;
+  int next_ne = e_count_;
+
+  for (int state = 0; state < num_states_; state++) {
+    fst::ArcIterator<fst::Fst<StdArc> > aiter(fst, state);
+    for (; !aiter.Done(); aiter.Next()) {
+      const StdArc &arc = aiter.Value();
+      // ilabel != 0 marks an emitting arc
+      const int slot = (arc.ilabel != 0) ? next_e++ : next_ne++;
+
+      h_arc_weights_[slot] = arc.weight.Value();
+      h_arc_nextstate_[slot] = arc.nextstate;
+      h_arc_olabels_[slot] = arc.olabel;
+      h_arc_id_ilabels_[slot] = arc.ilabel;
+      // Start with pdf indexing == id indexing. When the two differ,
+      // ApplyTransitionModelOnIlabels() overwrites the emitting entries
+      // using a TransitionModel
+      h_arc_pdf_ilabels_[slot] = arc.ilabel;
+    }
+  }
+}
+
+void CudaFst::ApplyTransitionModelOnIlabels(
+    const TransitionModel &trans_model) {
+  // Map ilabels to pdf indexes up front (avoids reindexing when reading nnet3
+  // output). Only the emitting arcs, i.e. the first e_count_, are converted.
+  for (int e_arc = 0; e_arc < e_count_; ++e_arc) {
+    const int32 ilabel_id = h_arc_id_ilabels_[e_arc];
+    h_arc_pdf_ilabels_[e_arc] = trans_model.TransitionIdToPdf(ilabel_id);
+  }
+}
+
+void CudaFst::CopyDataToDevice() {
+  // Host -> device copies. data() (unlike &v[0]) is valid on empty vectors.
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMemcpy(
+      d_e_offsets_, h_e_offsets_.data(),
+      (num_states_ + 1) * sizeof(*d_e_offsets_), cudaMemcpyHostToDevice));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMemcpy(
+      d_ne_offsets_, h_ne_offsets_.data(),
+      (num_states_ + 1) * sizeof(*d_ne_offsets_), cudaMemcpyHostToDevice));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(
+      cudaMemcpy(d_final_, h_final_.data(), num_states_ * sizeof(*d_final_),
+                 cudaMemcpyHostToDevice));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(
+      cudaMemcpy(d_arc_weights_, h_arc_weights_.data(),
+                 arc_count_ * sizeof(*d_arc_weights_), cudaMemcpyHostToDevice));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMemcpy(
+      d_arc_nextstates_, h_arc_nextstate_.data(),
+      arc_count_ * sizeof(*d_arc_nextstates_), cudaMemcpyHostToDevice));
+  KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMemcpy(
+      d_arc_pdf_ilabels_, h_arc_pdf_ilabels_.data(),
+      e_count_ * sizeof(*d_arc_pdf_ilabels_), cudaMemcpyHostToDevice));
+}
+
+void CudaFst::Initialize(const fst::Fst<StdArc> &fst,
+                         const TransitionModel *trans_model) {
+  nvtxRangePushA("CudaFst constructor");
+  start_ = fst.Start();
+  // Build the CSR representation on host (offsets first, then the arcs)
+  ComputeOffsets(fst);
+  AllocateData(fst);
+  // h_arc_pdf_ilabels_ is only needed on host while the CSR arrays are being
+  // built and copied: it is sized here (AllocateData() does not do it) and
+  // cleared again at the end of this function
+  h_arc_pdf_ilabels_.resize(arc_count_);
+  PopulateArcs(fst);
+  if (trans_model) ApplyTransitionModelOnIlabels(*trans_model);
+  // All device buffers must have been allocated before we copy into them
+  KALDI_ASSERT(d_e_offsets_);
+  KALDI_ASSERT(d_ne_offsets_);
+  KALDI_ASSERT(d_final_);
+  KALDI_ASSERT(d_arc_weights_);
+  KALDI_ASSERT(d_arc_nextstates_);
+  KALDI_ASSERT(d_arc_pdf_ilabels_);
+
+  CopyDataToDevice();
+
+  // Block until the device is idle, making sure the graph is ready
+  cudaDeviceSynchronize();
+  KALDI_DECODER_CUDA_CHECK_ERROR();
+  h_arc_pdf_ilabels_.clear();  // we don't need those on host
+  nvtxRangePop();
+}
+
+void CudaFst::Finalize() {
+  nvtxRangePushA("CudaFst destructor");
+  // Initialize() must have run, and Finalize() must not have run since
+  KALDI_ASSERT(d_e_offsets_ &&
+               "Please call CudaFst::Initialize() before calling Finalize()");
+  KALDI_ASSERT(d_ne_offsets_ && d_final_ && d_arc_weights_);
+  KALDI_ASSERT(d_arc_nextstates_ && d_arc_pdf_ilabels_);
+  CuDevice::Instantiate().Free(d_e_offsets_);
+  CuDevice::Instantiate().Free(d_ne_offsets_);
+  CuDevice::Instantiate().Free(d_final_);
+  CuDevice::Instantiate().Free(d_arc_weights_);
+  CuDevice::Instantiate().Free(d_arc_nextstates_);
+  CuDevice::Instantiate().Free(d_arc_pdf_ilabels_);
+  // Reset so that a second Finalize() asserts instead of double-freeing
+  d_e_offsets_ = d_ne_offsets_ = nullptr;
+  d_final_ = d_arc_weights_ = nullptr;
+  d_arc_nextstates_ = nullptr;
+  d_arc_pdf_ilabels_ = nullptr;
+  nvtxRangePop();
+}
+
+}  // end namespace cuda_decoder
+}  // end namespace kaldi
+
+#endif  // HAVE_CUDA == 1
diff --git a/src/cudadecoder/cuda-fst.h b/src/cudadecoder/cuda-fst.h
new file mode 100644
index 00000000000..1dac627755b
--- /dev/null
+++ b/src/cudadecoder/cuda-fst.h
@@ -0,0 +1,122 @@
+// cudadecoder/cuda-fst.h
+//
+// Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+// Hugo Braun, Justin Luitjens, Ryan Leary
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_CUDA_DECODER_CUDA_FST_H_
+#define KALDI_CUDA_DECODER_CUDA_FST_H_
+#include "cudadecoder/cuda-decoder-common.h"
+#include "cudamatrix/cu-device.h"
+#include "lat/kaldi-lattice.h"
+#include "nnet3/decodable-online-looped.h"  // TransitionModel
+
+namespace kaldi {
+namespace cuda_decoder {
+
+typedef fst::StdArc StdArc;
+typedef StdArc::Weight StdWeight;
+typedef StdArc::Label Label;
+
+// FST in both device and host memory
+// Converts the OpenFst format to the Compressed Sparse Row (CSR) matrix
+// format.
+// https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)
+// Where states = rows and arcs = columns.
+// This format allows us to store the FST in a compact form, and leads to clean
+// memory accesses
+// For instance, when loading the arcs from a given source, we can load all arc
+// informations (destination, weight, etc.) with coalesced reads
+// Emitting arcs and non-emitting arcs are stored as separate matrices for
+// efficiency
+// We then copy the FST to the device (while keeping its original copy on host)
+class CudaFst {
+ public:
+  // Starts empty (no states, no arcs, no start state, no device memory)
+  CudaFst()
+      : num_states_(0), start_(fst::kNoStateId),
+        e_count_(0), ne_count_(0), arc_count_(0),
+        d_e_offsets_(nullptr), d_ne_offsets_(nullptr),
+        d_arc_weights_(nullptr), d_arc_nextstates_(nullptr),
+        d_arc_pdf_ilabels_(nullptr), d_final_(nullptr) {}
+  // Creates a CSR representation of the FST, then copies it to the GPU.
+  // If a TransitionModel is passed, we'll use it to convert the ilabels id
+  // indexes into pdf indexes. If no TransitionModel is passed, we'll assume
+  // TransitionModel == identity.
+  // Important: The CudaDecodable won't apply the TransitionModel. If you use a
+  // TransitionModel, you need to apply it now
+  void Initialize(const fst::Fst<StdArc> &fst,
+                  const TransitionModel *trans_model = NULL);
+  // Frees the device memory allocated by Initialize()
+  void Finalize();
+
+  inline uint32_t NumStates() const { return num_states_; }
+  inline StateId Start() const { return start_; }
+
+ private:
+  friend class CudaDecoder;
+  // Counts arcs and computes offsets of the fst passed in
+  void ComputeOffsets(const fst::Fst<StdArc> &fst);
+  // Allocates memory to store FST
+  void AllocateData(const fst::Fst<StdArc> &fst);
+  // Populate the arcs data (arc.destination, arc.weights, etc.)
+  void PopulateArcs(const fst::Fst<StdArc> &fst);
+  // Converting the id ilabels into pdf ilabels using the transition model
+  // It allows the CudaDecoder to read the acoustic model loglikelihoods at the
+  // right indexes
+  void ApplyTransitionModelOnIlabels(const TransitionModel &trans_model);
+  // Copies fst to device into the pre-allocated datastructures
+  void CopyDataToDevice();
+  // Total number of states (0 until Initialize() is called)
+  unsigned int num_states_;
+  // Starting state of the FST (fst::kNoStateId until Initialize() is called)
+  // Computation should start from state start_
+  StateId start_;
+  // Number of emitting, non-emitting, and total number of arcs
+  unsigned int e_count_, ne_count_, arc_count_;
+  // This data structure is similar to a CSR matrix format
+  // with 2 offsets matrices (one emitting one non-emitting).
+  // Offset arrays are num_states_+1 in size (last state needs
+  // its +1 arc_offset)
+  // Arc values for state i are stored in the range [offset[i], offset[i+1])
+  unsigned int *d_e_offsets_;  // Emitting offset arrays
+  std::vector<unsigned int> h_e_offsets_;
+  unsigned int *d_ne_offsets_;  // Non-emitting offset arrays
+  std::vector<unsigned int> h_ne_offsets_;
+  // These are the values for each arc.
+  // Arcs belonging to state i are found in the half-open range
+  // [offsets[i], offsets[i+1])
+  // Use e_offsets or ne_offsets depending on what you need
+  // (emitting/nonemitting)
+  // Only d_arc_pdf_ilabels_ is of size e_count_; the others are arc_count_
+  std::vector<CostType> h_arc_weights_;
+  CostType *d_arc_weights_;
+  std::vector<StateId> h_arc_nextstate_;
+  StateId *d_arc_nextstates_;
+  std::vector<int32> h_arc_id_ilabels_;
+  int32 *d_arc_pdf_ilabels_;
+  std::vector<int32> h_arc_olabels_;
+  // Final costs
+  // final cost of state i is h_final_[i]
+  std::vector<CostType> h_final_;
+  CostType *d_final_;
+
+  // ilabels (pdf indexing)
+  // only populated during CSR generation, cleared after (not needed on host)
+  std::vector<int32> h_arc_pdf_ilabels_;
+};
+
+}  // end namespace cuda_decoder
+}  // end namespace kaldi
+#endif  // KALDI_CUDA_DECODER_CUDA_FST_H_
diff --git a/src/cudadecoder/decodable-cumatrix.cc b/src/cudadecoder/decodable-cumatrix.cc
new file mode 100644
index 00000000000..d7c1d0359a5
--- /dev/null
+++ b/src/cudadecoder/decodable-cumatrix.cc
@@ -0,0 +1,62 @@
+// cudadecoder/decodable-cumatrix.cc
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION.  All rights reserved.
+ * Authors:  Hugo Braun, Justin Luitjens, Ryan Leary
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if HAVE_CUDA == 1
+
+#include "decodable-cumatrix.h"
+
+namespace kaldi {
+namespace cuda_decoder {
+
+DecodableCuMatrixMapped::DecodableCuMatrixMapped(
+    const TransitionModel &tm, const CuMatrixBase<BaseFloat> &likes,
+    int32 frame_offset)
+    : trans_model_(tm), likes_(&likes), frame_offset_(frame_offset) {
+  // 'likes' must have exactly one column per pdf-id of the transition model
+  if (likes.NumCols() != tm.NumPdfs())
+    KALDI_ERR << "Mismatch, matrix has " << likes.NumCols() << " columns but "
+              << "transition-model has " << tm.NumPdfs() << " pdf-ids.";
+}
+
+int32 DecodableCuMatrixMapped::NumFramesReady() const {
+  return frame_offset_ + likes_->NumRows();  // offset of row 0 + rows held
+}
+
+bool DecodableCuMatrixMapped::IsLastFrame(int32 frame) const {
+  KALDI_ASSERT(frame < NumFramesReady());  // 'frame' must already be ready
+  return (frame == NumFramesReady() - 1);
+}
+
+// Number of transition-ids. Indices are one-based (OpenFst compatibility).
+int32 DecodableCuMatrixMapped::NumIndices() const {
+  return trans_model_.NumTransitionIds();
+}
+
+// Returns the cuda pointer to the row of likes_ that holds the nnet3 output
+// for 'subsampled_frame' (the row index is relative to frame_offset_)
+BaseFloat *
+DecodableCuMatrixMapped::GetLogLikelihoodsCudaPointer(int32 subsampled_frame) {
+  const int32 row = subsampled_frame - frame_offset_;
+  // likes_ is held as pointer-to-const; the interface returns a mutable pointer
+  return const_cast<BaseFloat *>(likes_->Data()) + row * likes_->Stride();
+}
+
+}  // end namespace cuda_decoder
+}  // end namespace kaldi
+
+#endif  // HAVE_CUDA == 1
diff --git a/src/cudadecoder/decodable-cumatrix.h b/src/cudadecoder/decodable-cumatrix.h
new file mode 100644
index 00000000000..d34079cc9c7
--- /dev/null
+++ b/src/cudadecoder/decodable-cumatrix.h
@@ -0,0 +1,71 @@
+// cudadecoder/decodable-cumatrix.h
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION.  All rights reserved.
+ * Authors:  Hugo Braun, Justin Luitjens, Ryan Leary
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef KALDI_CUDA_DECODER_DECODABLE_CUMATRIX_H_
+#define KALDI_CUDA_DECODER_DECODABLE_CUMATRIX_H_
+
+#include "cudadecoder/cuda-decodable-itf.h"
+#include "cudamatrix/cu-matrix.h"
+#include "decoder/decodable-matrix.h"
+
+namespace kaldi {
+namespace cuda_decoder {
+
+/**
+  Cuda Decodable matrix.  Takes transition model and posteriors and provides
+  an interface similar to the Decodable Interface
+  */
+class DecodableCuMatrixMapped : public CudaDecodableInterface {
+public:
+  // This constructor creates an object that will not delete "likes" when done.
+  // the frame_offset is the frame the row 0 of 'likes' corresponds to, would be
+  // greater than zero if this is not the first chunk of likelihoods.
+  DecodableCuMatrixMapped(const TransitionModel &tm,
+                          const CuMatrixBase<BaseFloat> &likes,
+                          int32 frame_offset = 0);
+  // Returns frame_offset plus the number of rows of 'likes'
+  virtual int32 NumFramesReady() const;
+  // True iff 'frame' is NumFramesReady() - 1; asserts frame < NumFramesReady()
+  virtual bool IsLastFrame(int32 frame) const;
+  // Not supported by this class: always fails an assertion
+  virtual BaseFloat LogLikelihood(int32 frame, int32 tid) {
+    KALDI_ASSERT(false);
+    return 0.0f;  // never executed, compiler requests a return
+  };
+
+  // Note: these indices are 1-based.
+  virtual int32 NumIndices() const;
+
+  virtual ~DecodableCuMatrixMapped(){};
+
+  // returns cuda pointer to nnet3 output
+  virtual BaseFloat *GetLogLikelihoodsCudaPointer(int32 subsampled_frame);
+
+private:
+  const TransitionModel &trans_model_; // for tid to pdf mapping
+  const CuMatrixBase<BaseFloat> *likes_;  // not owned, never deleted here
+  // Frame index that row 0 of likes_ corresponds to
+  int32 frame_offset_;
+
+  KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableCuMatrixMapped);
+};
+
+}  // end namespace cuda_decoder
+}  // end namespace kaldi.
+
+#endif  // KALDI_CUDA_DECODER_DECODABLE_CUMATRIX_H_
diff --git a/src/cudadecoder/thread-pool.h b/src/cudadecoder/thread-pool.h
new file mode 100644
index 00000000000..43d4443818d
--- /dev/null
+++ b/src/cudadecoder/thread-pool.h
@@ -0,0 +1,117 @@
+// cudadecoder/thread-pool.h
+// Source:  https://github.com/progschj/ThreadPool
+// Unmodified except for reformatting to Google style
+// Obtained under this license:
+/*
+Copyright (c) 2012 Jakob Progsch, Václav Zeman
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+   1. The origin of this software must not be misrepresented; you must not
+   claim that you wrote the original software. If you use this software
+   in a product, an acknowledgment in the product documentation would be
+   appreciated but is not required.
+
+   2. Altered source versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.
+
+   3. This notice may not be removed or altered from any source
+   distribution.
+*/
+
+#ifndef KALDI_CUDA_DECODER_THREAD_POOL_H_
+#define KALDI_CUDA_DECODER_THREAD_POOL_H_
+
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <stdexcept>
+#include <thread>
+#include <vector>
+
+class ThreadPool {
+public:
+  ThreadPool(size_t);
+  template <class F, class... Args>
+  auto enqueue(F &&f, Args &&... args)
+      -> std::future<typename std::result_of<F(Args...)>::type>;
+  ~ThreadPool();
+
+private:
+  // need to keep track of threads so we can join them
+  std::vector<std::thread> workers;
+  // the task queue
+  std::queue<std::function<void()>> tasks;
+
+  // synchronization
+  std::mutex queue_mutex;
+  std::condition_variable condition;
+  bool stop;
+};
+
+// the constructor just launches some amount of workers
+inline ThreadPool::ThreadPool(size_t threads) : stop(false) {
+  for (size_t i = 0; i < threads; ++i)
+    workers.emplace_back([this] {
+      for (;;) {
+        std::function<void()> task;
+
+        {
+          std::unique_lock<std::mutex> lock(this->queue_mutex);
+          this->condition.wait(
+              lock, [this] { return this->stop || !this->tasks.empty(); });
+          if (this->stop && this->tasks.empty())
+            return;
+          task = std::move(this->tasks.front());
+          this->tasks.pop();
+        }
+
+        task();
+      }
+    });
+}
+
+// add new work item to the pool
+template <class F, class... Args>
+auto ThreadPool::enqueue(F &&f, Args &&... args)
+    -> std::future<typename std::result_of<F(Args...)>::type> {
+  using return_type = typename std::result_of<F(Args...)>::type;
+
+  auto task = std::make_shared<std::packaged_task<return_type()>>(
+      std::bind(std::forward<F>(f), std::forward<Args>(args)...));
+
+  std::future<return_type> res = task->get_future();
+  {
+    std::unique_lock<std::mutex> lock(queue_mutex);
+
+    // don't allow enqueueing after stopping the pool
+    if (stop)
+      throw std::runtime_error("enqueue on stopped ThreadPool");
+
+    tasks.emplace([task]() { (*task)(); });
+  }
+  condition.notify_one();
+  return res;
+}
+
+// the destructor joins all threads
+inline ThreadPool::~ThreadPool() {
+  {
+    std::unique_lock<std::mutex> lock(queue_mutex);
+    stop = true;
+  }
+  condition.notify_all();
+  for (std::thread &worker : workers)
+    worker.join();
+}
+
+#endif  // KALDI_CUDA_DECODER_THREAD_POOL_H_
diff --git a/src/cudadecoderbin/Makefile b/src/cudadecoderbin/Makefile
new file mode 100644
index 00000000000..0692126dacc
--- /dev/null
+++ b/src/cudadecoderbin/Makefile
@@ -0,0 +1,27 @@
+all:
+
+include ../kaldi.mk
+
+ifeq ($(CUDA), true)
+
+LDFLAGS += $(CUDA_LDFLAGS)
+LDLIBS += $(CUDA_LDLIBS)
+
+BINFILES = batched-wav-nnet3-cuda
+
+OBJFILES =
+
+TESTFILES =
+
+ADDLIBS = ../cudadecoder/kaldi-cudadecoder.a  \
+../online2/kaldi-online2.a ../ivector/kaldi-ivector.a \
+../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a ../nnet2/kaldi-nnet2.a \
+../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \
+../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
+../feat/kaldi-feat.a ../transform/kaldi-transform.a \
+../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \
+../matrix/kaldi-matrix.a ../base/kaldi-base.a 
+
+endif
+
+include ../makefiles/default_rules.mk
diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc
new file mode 100644
index 00000000000..a59c1e2a1b1
--- /dev/null
+++ b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc
@@ -0,0 +1,306 @@
+// cudadecoderbin/batched-wav-nnet3-cuda.cc
+//
+// Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+// Hugo Braun, Justin Luitjens, Ryan Leary
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if HAVE_CUDA == 1
+
+#include <cuda.h>
+#include <cuda_profiler_api.h>
+#include <nvToolsExt.h>
+#include <sstream>
+#include "cudadecoder/batched-threaded-nnet3-cuda-pipeline.h"
+#include "cudamatrix/cu-allocator.h"
+#include "fstext/fstext-lib.h"
+#include "lat/lattice-functions.h"
+#include "nnet3/am-nnet-simple.h"
+#include "nnet3/nnet-utils.h"
+#include "util/kaldi-thread.h"
+
+using namespace kaldi;
+using namespace cuda_decoder;
+
+// Adds the best path's frame count/likelihood to the totals and logs its words.
+void GetDiagnosticsAndPrintOutput(const std::string &utt,
+                                  const fst::SymbolTable *word_syms,
+                                  const CompactLattice &clat,
+                                  int64 *tot_num_frames, double *tot_like) {
+  if (clat.NumStates() == 0) {  // nothing to report for an empty lattice
+    KALDI_WARN << "Empty lattice.";
+    return;
+  }
+  // Reduce the lattice to its single best path, in non-compact form.
+  CompactLattice one_best_clat;
+  CompactLatticeShortestPath(clat, &one_best_clat);
+  Lattice one_best;
+  ConvertLattice(one_best_clat, &one_best);
+
+  // Linearize that path into an alignment, a word-id sequence and a weight.
+  std::vector<int32> ali, word_ids;
+  LatticeWeight path_weight;
+  GetLinearSymbolSequence(one_best, &ali, &word_ids, &path_weight);
+
+  // Add this utterance's frame count and likelihood to the caller's totals.
+  const int32 frame_count = ali.size();
+  const double like = -(path_weight.Value1() + path_weight.Value2());
+  *tot_num_frames += frame_count;
+  *tot_like += like;
+  KALDI_VLOG(2) << "Likelihood per frame for utterance " << utt << " is "
+                << (like / frame_count) << " over " << frame_count
+                << " frames.";
+  if (word_syms == NULL) return;  // no symbol table: no transcript
+
+  // Log "<utt> <word> <word> ..." (emitted at warning level).
+  std::ostringstream transcript;
+  transcript << utt << " ";
+  for (int32 id : word_ids) {
+    const std::string sym = word_syms->Find(id);
+    if (sym.empty())
+      transcript << "Word-id " << id << " not in symbol table.";
+    transcript << sym << " ";
+  }
+  KALDI_WARN << transcript.str();
+}
+
+// Retires the oldest entry of `processed`: fetches its lattice, reports
+// diagnostics, optionally writes it, and updates the per-iteration counters.
+void FinishOneDecode(
+    const BatchedThreadedNnet3CudaPipelineConfig &batched_decoder_config,
+    const fst::SymbolTable *word_syms, const bool write_lattice,
+    const int32 total_audio, const int32 count_per_iteration,
+    BatchedThreadedNnet3CudaPipeline *cuda_pipeline,
+    std::queue<std::pair<std::string, std::string>> *processed,
+    CompactLatticeWriter *clat_writer, Timer *timer, int32 *current_count,
+    int64 *num_frames, int32 *output_iter, double *tot_like) {
+  // References into the queue's front element; used only before pop() below.
+  std::string &utt = processed->front().first;
+  std::string &key = processed->front().second;
+  CompactLattice clat;
+  bool valid;
+  if (batched_decoder_config.determinize_lattice) {
+    valid = cuda_pipeline->GetLattice(key, &clat);
+  } else {
+    Lattice lat;
+    valid = cuda_pipeline->GetRawLattice(key, &lat);
+    ConvertLattice(lat, &clat);
+  }
+  if (valid) {
+    GetDiagnosticsAndPrintOutput(utt, word_syms, clat, num_frames, tot_like);
+    if (write_lattice && key == utt) { /*only write output on first iteration*/
+      nvtxRangePushA("Lattice Write");
+      clat_writer->Write(utt, clat);
+      nvtxRangePop();
+    }
+  }
+  cuda_pipeline->CloseDecodeHandle(key);
+  processed->pop();
+  if (++(*current_count) ==
+      count_per_iteration) { /*this utt is the last in an iter*/
+    double total_time = timer->Elapsed();
+    KALDI_VLOG(2) << "Iteration: " << *output_iter
+                  << " ~Aggregate Total Time: " << total_time
+                  << " Total Audio: " << total_audio * *output_iter
+                  << " RealTimeX: " << *output_iter * total_audio / total_time;
+    *current_count = 0;  // reset the caller's counter (not the local pointer)
+    (*output_iter)++;
+  }
+}
+// Driver: decodes each input wav --iterations times via the CUDA pipeline.
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace fst;
+
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Reads in wav file(s) and simulates online decoding with neural nets\n"
+        "(nnet3 setup), with optional iVector-based speaker adaptation and\n"
+        "optional endpointing.  Note: some configuration values and inputs "
+        "are\n"
+        "set via config files whose filenames are passed as options\n"
+        "\n"
+        "Usage: batched-wav-nnet3-cuda [options] <nnet3-in> <fst-in> "
+        "<wav-rspecifier> <lattice-wspecifier>\n";
+
+    std::string word_syms_rxfilename;
+
+    bool write_lattice = true;
+    int num_todo = -1;
+    int iterations = 1;
+    ParseOptions po(usage);
+    int pipeline_length = 4000; // length of pipeline of outstanding requests,
+                                // this is independent of queue lengths in
+                                // decoder
+
+    po.Register("write-lattice", &write_lattice,
+                "Output lattice to a file. Setting to false is useful when "
+                "benchmarking");
+    po.Register("word-symbol-table", &word_syms_rxfilename,
+                "Symbol table for words [for debug output]");
+    po.Register("file-limit", &num_todo,
+                "Limits the number of files that are processed by this driver. "
+                "After N files are processed the remaining files are ignored. "
+                "Useful for profiling");
+    po.Register("iterations", &iterations,
+                "Number of times to decode the corpus. Output will be written "
+                "only once.");
+
+    // Multi-threaded CPU and batched GPU decoder
+    BatchedThreadedNnet3CudaPipelineConfig batched_decoder_config;
+
+    CuDevice::RegisterDeviceOptions(&po);
+    RegisterCuAllocatorOptions(&po);
+    batched_decoder_config.Register(&po);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 4) {
+      po.PrintUsage();
+      return 1;
+    }
+
+    g_cuda_allocator.SetOptions(g_allocator_options);
+    CuDevice::Instantiate().SelectGpuId("yes");
+    CuDevice::Instantiate().AllowMultithreading();
+
+    BatchedThreadedNnet3CudaPipeline cuda_pipeline(batched_decoder_config);
+
+    std::string nnet3_rxfilename = po.GetArg(1), fst_rxfilename = po.GetArg(2),
+                wav_rspecifier = po.GetArg(3), clat_wspecifier = po.GetArg(4);
+
+    TransitionModel trans_model;
+    nnet3::AmNnetSimple am_nnet;
+
+    // read transition model and nnet
+    bool binary;
+    Input ki(nnet3_rxfilename, &binary);
+    trans_model.Read(ki.Stream(), binary);
+    am_nnet.Read(ki.Stream(), binary);
+    SetBatchnormTestMode(true, &(am_nnet.GetNnet()));
+    SetDropoutTestMode(true, &(am_nnet.GetNnet()));
+    nnet3::CollapseModel(nnet3::CollapseModelConfig(), &(am_nnet.GetNnet()));
+
+    CompactLatticeWriter clat_writer(clat_wspecifier);
+
+    fst::Fst<fst::StdArc> *decode_fst =
+        fst::ReadFstKaldiGeneric(fst_rxfilename);
+
+    cuda_pipeline.Initialize(*decode_fst, am_nnet, trans_model);
+
+    delete decode_fst;
+
+    fst::SymbolTable *word_syms = NULL;
+    if (word_syms_rxfilename != "")
+      if (!(word_syms = fst::SymbolTable::ReadText(word_syms_rxfilename)))
+        KALDI_ERR << "Could not read symbol table from file "
+                  << word_syms_rxfilename;
+
+    int32 num_done = 0, num_err = 0;
+    double tot_like = 0.0;
+    int64 num_frames = 0;
+    double total_audio = 0;
+
+    nvtxRangePush("Global Timer");
+
+    // Start the timer only here, after the model and graph have been read
+    // and the pipeline initialized, so that the measured throughput does not
+    // include those allocation/setup overheads.
+    // The Kaldi Timer starts counting in its constructor, so declaring it
+    // is what starts the measurement.
+    Timer timer;
+
+    int count_per_iteration = 0;
+    int current_count = 0;
+    int output_iter = 1;
+
+    std::queue<std::pair<std::string, std::string>> processed;
+    for (int iter = 0; iter < iterations; iter++) {
+      SequentialTableReader<WaveHolder> wav_reader(wav_rspecifier);
+
+      for (; !wav_reader.Done(); wav_reader.Next()) {
+        nvtxRangePushA("Utterance Iteration");
+
+        std::string utt = wav_reader.Key();
+        std::string key = utt;
+        if (iter > 0) {
+          // make key unique for subsequent iterations
+          key = key + "-" + std::to_string(iter);
+        }
+        const WaveData &wave_data = wav_reader.Value();
+
+        if (iter == 0) {
+          // calculating number of utterances per iteration
+          count_per_iteration++;
+          // calculating total audio time per iteration
+          total_audio += wave_data.Duration();
+        }
+
+        cuda_pipeline.OpenDecodeHandle(key, wave_data);
+        processed.push(std::make_pair(utt, key));
+        num_done++;
+
+        while (processed.size() >= pipeline_length) {
+          FinishOneDecode(batched_decoder_config, word_syms, write_lattice,
+                          total_audio, count_per_iteration, &cuda_pipeline,
+                          &processed, &clat_writer, &timer, &current_count,
+                          &num_frames, &output_iter, &tot_like);
+        }  // end while
+
+        nvtxRangePop();
+        if (num_todo != -1 && num_done >= num_todo * (iter + 1))
+          break;  // --file-limit applies to each pass over the corpus
+      } // end utterance loop
+
+    } // end iterations loop
+
+    while (processed.size() > 0) {
+      FinishOneDecode(batched_decoder_config, word_syms, write_lattice,
+                      total_audio, count_per_iteration, &cuda_pipeline,
+                      &processed, &clat_writer, &timer, &current_count,
+                      &num_frames, &output_iter, &tot_like);
+    } // end while
+
+    KALDI_LOG << "Decoded " << num_done << " utterances, " << num_err
+              << " with errors.";
+    KALDI_LOG << "Overall likelihood per frame was " << (tot_like / num_frames)
+              << " per frame over " << num_frames << " frames.";
+
+    // number of seconds elapsed since the creation of timer
+    double total_time = timer.Elapsed();
+    nvtxRangePop();
+
+    KALDI_LOG << "Overall: "
+              << " Aggregate Total Time: " << total_time
+              << " Total Audio: " << total_audio * iterations
+              << " RealTimeX: " << total_audio * iterations / total_time;
+
+    delete word_syms; // will delete if non-NULL.
+
+    clat_writer.Close();
+
+    cuda_pipeline.Finalize();
+    cudaDeviceSynchronize();
+
+    return 0;
+
+    // return (num_done != 0 ? 0 : 1);
+  } catch (const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+} // main()
+
+#endif  // if HAVE_CUDA == 1
diff --git a/src/nnet2/nnet-component-test.cc b/src/nnet2/nnet-component-test.cc
deleted file mode 100644
index 5aeaf28cd1e..00000000000
--- a/src/nnet2/nnet-component-test.cc
+++ /dev/null
@@ -1,909 +0,0 @@
-// nnet2/nnet-component-test.cc
-
-// Copyright 2012-2014  Johns Hopkins University (author:  Daniel Povey)
-//                2015  Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "nnet2/nnet-component.h"
-#include "util/common-utils.h"
-
-namespace kaldi {
-namespace nnet2 {
-
-
-void UnitTestGenericComponentInternal(const Component &component,
-                                      const ChunkInfo in_info,
-                                      const ChunkInfo out_info)  {
-
-  CuMatrix<BaseFloat> input(in_info.NumRows(), in_info.NumCols()),
-      output(1, out_info.NumRows() * out_info.NumCols());
-  input.SetRandn();
-  CuVector<BaseFloat> objf_vec(out_info.NumCols()); // objective function is linear function of output.
-  objf_vec.SetRandn(); // set to Gaussian noise.
-
-  int32 rand_seed = Rand();
-
-  RandomComponent *rand_component =
-      const_cast<RandomComponent*>(dynamic_cast<const RandomComponent*>(&component));
-  if (rand_component != NULL) {
-    srand(rand_seed);
-    rand_component->ResetGenerator();
-  }
-  component.Propagate(in_info, out_info, input, &output);
-  {
-    bool binary = (Rand() % 2 == 0);
-    Output ko("tmpf", binary);
-    component.Write(ko.Stream(), binary);
-  }
-  Component *component_copy;
-  {
-    bool binary_in;
-    Input ki("tmpf", &binary_in);
-    component_copy = Component::ReadNew(ki.Stream(), binary_in);
-  }
-  unlink("tmpf");
-
-  { // Test backward derivative is correct.
-    CuVector<BaseFloat> output_objfs(out_info.NumRows());
-    output_objfs.AddMatVec(1.0, output, kNoTrans, objf_vec, 0.0);
-    BaseFloat objf = output_objfs.Sum();
-
-
-    CuMatrix<BaseFloat> output_deriv(output.NumRows(), output.NumCols());
-    for (int32 i = 0; i < output_deriv.NumRows(); i++)
-      output_deriv.Row(i).CopyFromVec(objf_vec);
-
-    CuMatrix<BaseFloat> input_deriv(input.NumRows(), input.NumCols());
-
-
-    CuMatrix<BaseFloat> empty_mat;
-    CuMatrix<BaseFloat> &input_ref =
-        (component_copy->BackpropNeedsInput() ? input : empty_mat),
-        &output_ref =
-        (component_copy->BackpropNeedsOutput() ? output : empty_mat);
-
-    component_copy->Backprop(in_info, out_info, input_ref, output_ref,
-                             output_deriv, NULL, &input_deriv);
-
-    int32 num_ok = 0, num_bad = 0, num_tries = 10;
-    KALDI_LOG << "Comparing feature gradients " << num_tries << " times.";
-    for (int32 i = 0; i < num_tries; i++) {
-      CuMatrix<BaseFloat> perturbed_input(input.NumRows(), input.NumCols());
-      {
-        RandomComponent *rand_component =
-            const_cast<RandomComponent*>(dynamic_cast<const RandomComponent*>(&component));
-        if (rand_component != NULL) {
-          srand(rand_seed);
-          rand_component->ResetGenerator();
-        }
-      }
-      perturbed_input.SetRandn();
-      perturbed_input.Scale(1.0e-04); // scale by a small amount so it's like a delta.
-      BaseFloat predicted_difference = TraceMatMat(perturbed_input,
-                                                   input_deriv, kTrans);
-      perturbed_input.AddMat(1.0, input); // now it's the input + a delta.
-      { // Compute objf with perturbed input and make sure it matches
-        // prediction.
-        CuMatrix<BaseFloat> perturbed_output(output.NumRows(), output.NumCols());
-        {
-          RandomComponent *rand_component =
-              const_cast<RandomComponent*>(dynamic_cast<const RandomComponent*>(&component));
-          if (rand_component != NULL) {
-            srand(rand_seed);
-            rand_component->ResetGenerator();
-          }
-        }
-        component.Propagate(in_info, out_info, perturbed_input, &perturbed_output);
-        CuVector<BaseFloat> perturbed_output_objfs(out_info.NumRows());
-        perturbed_output_objfs.AddMatVec(1.0, perturbed_output, kNoTrans,
-                                         objf_vec, 0.0);
-        BaseFloat perturbed_objf = perturbed_output_objfs.Sum(),
-             observed_difference = perturbed_objf - objf;
-        KALDI_LOG << "Input gradients: comparing " << predicted_difference
-                  << " and " << observed_difference;
-        if (fabs(predicted_difference - observed_difference) >
-            0.15 * fabs((predicted_difference + observed_difference)/2) &&
-            fabs(predicted_difference - observed_difference) > 1.0e-06) {
-          KALDI_WARN << "Bad difference!";
-          num_bad++;
-        } else {
-          num_ok++;
-        }
-      }
-    }
-    KALDI_LOG << "Succeeded for " << num_ok << " out of " << num_tries
-              << " tries.";
-    if (num_ok <= num_bad) {
-      delete component_copy;
-      KALDI_ERR << "Feature-derivative check failed";
-    }
-  }
-
-  UpdatableComponent *ucomponent =
-      dynamic_cast<UpdatableComponent*>(component_copy);
-
-  if (ucomponent != NULL) { // Test parameter derivative is correct.
-
-    int32 num_ok = 0, num_bad = 0, num_tries = 10;
-    KALDI_LOG << "Comparing model gradients " << num_tries << " times.";
-    for (int32 i = 0; i < num_tries; i++) {
-      UpdatableComponent *perturbed_ucomponent =
-          dynamic_cast<UpdatableComponent*>(ucomponent->Copy()),
-          *gradient_ucomponent =
-          dynamic_cast<UpdatableComponent*>(ucomponent->Copy());
-      KALDI_ASSERT(perturbed_ucomponent != NULL);
-      gradient_ucomponent->SetZero(true); // set params to zero and treat as gradient.
-      BaseFloat perturb_stddev = 5.0e-04;
-      perturbed_ucomponent->PerturbParams(perturb_stddev);
-
-      CuVector<BaseFloat> output_objfs(out_info.NumRows());
-      output_objfs.AddMatVec(1.0, output, kNoTrans, objf_vec, 0.0);
-      BaseFloat objf = output_objfs.Sum();
-
-      CuMatrix<BaseFloat> output_deriv(output.NumRows(), output.NumCols());
-      for (int32 i = 0; i < output_deriv.NumRows(); i++)
-        output_deriv.Row(i).CopyFromVec(objf_vec);
-      CuMatrix<BaseFloat> input_deriv; // (input.NumRows(), input.NumCols());
-
-      // This will compute the parameter gradient.
-      ucomponent->Backprop(in_info, out_info, input, output, output_deriv,
-                           gradient_ucomponent, &input_deriv);
-
-      // Now compute the perturbed objf.
-      BaseFloat objf_perturbed;
-      {
-        CuMatrix<BaseFloat> output_perturbed; // (num_egs, output_dim);
-        {
-          RandomComponent *rand_component =
-              const_cast<RandomComponent*>(dynamic_cast<const RandomComponent*>(&component));
-          if (rand_component != NULL) {
-            srand(rand_seed);
-            rand_component->ResetGenerator();
-          }
-        }
-        perturbed_ucomponent->Propagate(in_info, out_info, input, &output_perturbed);
-        CuVector<BaseFloat> output_objfs_perturbed(out_info.NumRows());
-        output_objfs_perturbed.AddMatVec(1.0, output_perturbed,
-                                         kNoTrans, objf_vec, 0.0);
-        objf_perturbed = output_objfs_perturbed.Sum();
-      }
-
-      BaseFloat delta_objf_observed = objf_perturbed - objf,
-          delta_objf_predicted = (perturbed_ucomponent->DotProduct(*gradient_ucomponent) -
-                                  ucomponent->DotProduct(*gradient_ucomponent));
-
-      KALDI_LOG << "Model gradients: comparing " << delta_objf_observed
-                << " and " << delta_objf_predicted;
-      if (fabs(delta_objf_predicted - delta_objf_observed) >
-          0.05 * (fabs(delta_objf_predicted + delta_objf_observed)/2) &&
-          fabs(delta_objf_predicted - delta_objf_observed) > 1.0e-06) {
-        KALDI_WARN << "Bad difference!";
-        num_bad++;
-      } else {
-        num_ok++;
-      }
-      delete perturbed_ucomponent;
-      delete gradient_ucomponent;
-    }
-    if (num_ok < num_bad) {
-      delete component_copy;
-      KALDI_ERR << "model-derivative check failed";
-    }
-  }
-  delete component_copy; // No longer needed.
-}
-
-void UnitTestGenericComponentInternal(const Component &component) {
-  int32 input_dim = component.InputDim(),
-      output_dim = component.OutputDim();
-
-  KALDI_LOG << component.Info();
-  int32 num_egs = 10 + Rand() % 5;
-  int32 num_chunks = 1,
-        first_offset = 0,
-        last_offset = num_egs-1;
-
-  ChunkInfo in_info(input_dim, num_chunks, first_offset, last_offset);
-  ChunkInfo out_info(output_dim, num_chunks, first_offset, last_offset);
-  UnitTestGenericComponentInternal(component, in_info, out_info);
-}
-
-
-
-void UnitTestSigmoidComponent() {
-  // We're testing that the gradients are computed correctly:
-  // the input gradients and the model gradients.
-
-  int32 input_dim = 10 + Rand() % 50;
-  {
-    SigmoidComponent sigmoid_component(input_dim);
-    UnitTestGenericComponentInternal(sigmoid_component);
-  }
-  {
-    SigmoidComponent sigmoid_component;
-    sigmoid_component.InitFromString("dim=15");
-    UnitTestGenericComponentInternal(sigmoid_component);
-  }
-}
-
-template<class T>
-void UnitTestGenericComponent(std::string extra_str = "") {
-  // works if it has an initializer from int,
-  // e.g. tanh, sigmoid.
-
-  // We're testing that the gradients are computed correctly:
-  // the input gradients and the model gradients.
-
-  int32 input_dim = 10 + Rand() % 50;
-  {
-    T component(input_dim);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    T component;
-    component.InitFromString(static_cast<std::string>("dim=15 ") + extra_str);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-void UnitTestMaxoutComponent() {
-  // works if it has an initializer from int,
-  // e.g. tanh, sigmoid.
-
-  // We're testing that the gradients are computed correctly:
-  // the input gradients and the model gradients.
-
-  for (int32 i = 0; i < 5; i++) {
-    int32 output_dim = 10 + Rand() % 20,
-        group_size = 1 + Rand() % 10,
-        input_dim = output_dim * group_size;
-
-    MaxoutComponent component(input_dim, output_dim);
-    UnitTestGenericComponentInternal(component);
-  }
-
-  {
-    MaxoutComponent component;
-    component.InitFromString("input-dim=15 output-dim=5");
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-void UnitTestPnormComponent() {
-  // We're testing that the gradients are computed correctly:
-  // the input gradients and the model gradients.
-
-  int32 num_fail = 0, num_tries = 4;
-  for (int32 i = 0; i < num_tries; i++) {
-    try {
-      int32 output_dim = 10 + Rand() % 20,
-          group_size = 1 + Rand() % 10,
-          input_dim = output_dim * group_size;
-      BaseFloat p = 1.0 + 0.1 * (Rand() % 20);
-
-      PnormComponent component(input_dim, output_dim, p);
-      UnitTestGenericComponentInternal(component);
-    } catch (...) {
-      KALDI_WARN << "Ignoring test failure in UnitTestPnormComponent().";
-      num_fail++;
-    }
-  }
-  if (num_fail >= num_tries/2) {
-    KALDI_ERR << "Too many test failures.";
-  }
-}
-
-void UnitTestMaxpoolingComponent() {
-  // works if it has an initializer from int,
-  // e.g. tanh, sigmoid.
-  // We're testing that the gradients are computed correctly:
-  // the input gradients and the model gradients.
-
-  for (int32 i = 0; i < 5; i++) {
-    int32 pool_stride = 5 + Rand() % 10,
-          pool_size = 2 + Rand() % 3,
-          num_pools = 1 + Rand() % 10;
-    int32 output_dim = num_pools * pool_stride;
-    int32 num_patches = num_pools * pool_size;
-    int32 input_dim = pool_stride * num_patches;
-
-    MaxpoolingComponent component(input_dim, output_dim,
-                                  pool_size, pool_stride);
-    UnitTestGenericComponentInternal(component);
-  }
-
-  {
-    MaxpoolingComponent component;
-    component.InitFromString("input-dim=192 output-dim=64 pool-size=3 pool-stride=16");
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-
-void UnitTestAffineComponent() {
-  BaseFloat learning_rate = 0.01,
-      param_stddev = 0.1, bias_stddev = 1.0;
-  int32 input_dim = 5 + Rand() % 10, output_dim = 5 + Rand() % 10;
-  {
-    AffineComponent component;
-    if (Rand() % 2 == 0) {
-      component.Init(learning_rate, input_dim, output_dim,
-                     param_stddev, bias_stddev);
-    } else {
-      Matrix<BaseFloat> mat(output_dim + 1, input_dim);
-      mat.SetRandn();
-      mat.Scale(param_stddev);
-      WriteKaldiObject(mat, "tmpf", true);
-      Sleep(0.5);
-      component.Init(learning_rate, "tmpf");
-      unlink("tmpf");
-    }
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "learning-rate=0.01 input-dim=10 output-dim=15 param-stddev=0.1";
-    AffineComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-void UnitTestConvolutional1dComponent() {
-  BaseFloat learning_rate = 0.01,
-            param_stddev = 0.1, bias_stddev = 1.0;
-  int32 patch_stride = 10, patch_step = 1, patch_dim = 4;
-  int32 num_patches = 1 + (patch_stride - patch_dim) / patch_step;
-  int32 num_splice = 5 + Rand() % 10, num_filters = 5 + Rand() % 10;
-  int32 input_dim = patch_stride * num_splice;
-  int32 filter_dim = patch_dim * num_splice;
-  int32 output_dim = num_patches * num_filters;
-  {
-    Convolutional1dComponent component;
-    if (Rand() % 2 == 0) {
-      component.Init(learning_rate, input_dim, output_dim,
-                     patch_dim, patch_step, patch_stride,
-                     param_stddev, bias_stddev, true);
-    } else {
-      Matrix<BaseFloat> mat(num_filters, filter_dim + 1);
-      mat.SetRandn();
-      mat.Scale(param_stddev);
-      WriteKaldiObject(mat, "tmpf", true);
-      Sleep(0.5);
-      component.Init(learning_rate, patch_dim,
-                     patch_step, patch_stride, "tmpf", false);
-      unlink("tmpf");
-    }
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    // appended-conv is false by default
-    const char *str = "learning-rate=0.01 input-dim=100 output-dim=70 param-stddev=0.1 patch-dim=4 patch-step=1 patch-stride=10";
-    Convolutional1dComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "learning-rate=0.01 input-dim=100 output-dim=70 param-stddev=0.1 patch-dim=4 patch-step=1 patch-stride=10 appended-conv=true";
-    Convolutional1dComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-void UnitTestDropoutComponent() {
-  // We're testing that the gradients are computed correctly:
-  // the input gradients and the model gradients.
-
-  int32 num_fail = 0, num_tries = 4;
-  for (int32 i = 0; i < num_tries; i++) {
-    try {
-      int32 input_dim = 10 + Rand() % 50;
-      {
-        DropoutComponent dropout_component(input_dim, 0.5, 0.3);
-        UnitTestGenericComponentInternal(dropout_component);
-      }
-      {
-        DropoutComponent dropout_component;
-        dropout_component.InitFromString("dim=15 dropout-proportion=0.6 dropout-scale=0.1");
-        UnitTestGenericComponentInternal(dropout_component);
-      }
-    } catch (...) {
-      KALDI_WARN << "Ignoring test failure in UnitTestDropoutComponent().";
-      num_fail++;
-    }
-  }
-  if (num_fail >= num_tries/2) {
-    KALDI_ERR << "Too many test failures.";
-  }
-}
-
-void UnitTestAdditiveNoiseComponent() {
-  // We're testing that the gradients are computed correctly:
-  // the input gradients and the model gradients.
-
-  int32 num_fail = 0, num_tries = 4;
-  for (int32 i = 0; i < num_tries; i++) {
-    try {
-      int32 input_dim = 10 + Rand() % 50;
-      {
-        AdditiveNoiseComponent additive_noise_component(input_dim, 0.1);
-        UnitTestGenericComponentInternal(additive_noise_component);
-      }
-      {
-        AdditiveNoiseComponent additive_noise_component;
-        additive_noise_component.InitFromString("dim=15 stddev=0.2");
-        UnitTestGenericComponentInternal(additive_noise_component);
-      }
-    } catch (...) {
-      KALDI_WARN << "Ignoring failure in AdditiveNoiseComponent test";
-      num_fail++;
-    }
-  }
-  if (num_fail >= num_tries/2) {
-    KALDI_ERR << "Too many test failures.";
-  }
-}
-
-void UnitTestScaleComponent() {
-  int32 dim = 1 + Rand() % 10;
-  BaseFloat scale = 0.1 + Rand() % 3;
-  {
-    ScaleComponent component;
-    if (Rand() % 2 == 0) {
-      component.Init(dim, scale);
-    } else {
-      std::ostringstream str;
-      str << "dim=" << dim << " scale=" << scale;
-      component.InitFromString(str.str());
-    }
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-
-void UnitTestAffineComponentPreconditioned() {
-  BaseFloat learning_rate = 0.01,
-      param_stddev = 0.1, bias_stddev = 1.0, alpha = 0.01,
-      max_change = 100.0;
-  int32 input_dim = 5 + Rand() % 10, output_dim = 5 + Rand() % 10;
-  {
-    AffineComponentPreconditioned component;
-    if (Rand() % 2 == 0) {
-      component.Init(learning_rate, input_dim, output_dim,
-                     param_stddev, bias_stddev,
-                     alpha, max_change);
-    } else {
-      Matrix<BaseFloat> mat(output_dim + 1, input_dim);
-      mat.SetRandn();
-      mat.Scale(param_stddev);
-      WriteKaldiObject(mat, "tmpf", true);
-      Sleep(0.5);
-      component.Init(learning_rate, alpha, max_change, "tmpf");
-      unlink("tmpf");
-    }
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "learning-rate=0.01 input-dim=16 output-dim=15 param-stddev=0.1 alpha=0.01";
-    AffineComponentPreconditioned component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-
-void UnitTestAffineComponentPreconditionedOnline() {
-  BaseFloat learning_rate = 0.01,
-      param_stddev = 0.1, bias_stddev = 1.0, num_samples_history = 2000.0, alpha = 4.0,
-      max_change_per_sample = 0.1, update_period = 1;
-  int32 input_dim = 5 + Rand() % 10, output_dim = 5 + Rand() % 10,
-      rank_in = 1 + Rand() % 5, rank_out = 1 + Rand() % 5;
-  {
-    AffineComponentPreconditionedOnline component;
-    if (Rand() % 2 == 0) {
-      component.Init(learning_rate, input_dim, output_dim,
-                     param_stddev, bias_stddev,
-                     rank_in, rank_out, update_period,
-                     num_samples_history, alpha,
-                     max_change_per_sample);
-    } else {
-      Matrix<BaseFloat> mat(output_dim + 1, input_dim);
-      mat.SetRandn();
-      mat.Scale(param_stddev);
-      WriteKaldiObject(mat, "tmpf", true);
-      Sleep(0.5);
-      component.Init(learning_rate, rank_in, rank_out,
-                     update_period, num_samples_history, alpha,
-                     max_change_per_sample, "tmpf");
-      unlink("tmpf");
-    }
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "learning-rate=0.01 input-dim=16 output-dim=15 param-stddev=0.1 num-samples-history=3000 alpha=2.0 update-period=1 rank-in=5 rank-out=6";
-    AffineComponentPreconditionedOnline component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-void UnitTestBlockAffineComponent() {
-  BaseFloat learning_rate = 0.01,
-      param_stddev = 0.1, bias_stddev = 0.1;
-  int32 num_blocks = 1 + Rand() % 3,
-         input_dim = num_blocks * (2 + Rand() % 4),
-        output_dim = num_blocks * (2 + Rand() % 4);
-
-  {
-    BlockAffineComponent component;
-    component.Init(learning_rate, input_dim, output_dim,
-                   param_stddev, bias_stddev, num_blocks);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "learning-rate=0.01 input-dim=10 output-dim=15 param-stddev=0.1 num-blocks=5";
-    BlockAffineComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-void UnitTestBlockAffineComponentPreconditioned() {
-  BaseFloat learning_rate = 0.01,
-      param_stddev = 0.1, bias_stddev = 1.0, alpha = 3.0;
-  int32 num_blocks = 1 + Rand() % 3,
-         input_dim = num_blocks * (2 + Rand() % 4),
-        output_dim = num_blocks * (2 + Rand() % 4);
-
-  {
-    BlockAffineComponentPreconditioned component;
-    component.Init(learning_rate, input_dim, output_dim,
-                   param_stddev, bias_stddev, num_blocks, alpha);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "learning-rate=0.01 input-dim=10 output-dim=15 param-stddev=0.1 num-blocks=5 alpha=3.0";
-    BlockAffineComponentPreconditioned component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-
-void UnitTestSumGroupComponent() {
-  std::vector<int32> sizes;
-  int32 num_sizes = 1 + Rand() % 5;
-  for (int32 i = 0; i < num_sizes; i++)
-    sizes.push_back(1 + Rand() % 5);
-
-  {
-    SumGroupComponent component;
-    component.Init(sizes);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "sizes=3:4:5";
-    SumGroupComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-
-void UnitTestDctComponent() {
-  int32 m = 3 + Rand() % 4, n = 3 + Rand() % 4,
-  dct_dim = m, dim = m * n;
-  bool reorder = (Rand() % 2 == 0);
-  {
-    DctComponent component;
-    component.Init(dim, dct_dim, reorder);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "dim=10 dct-dim=5 reorder=true";
-    DctComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "dim=10 dct-dim=5 reorder=true dct-keep-dim=2";
-    DctComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "dim=10 dct-dim=5 reorder=true dct-keep-dim=3";
-    DctComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-  {
-    const char *str = "dim=10 dct-dim=5 reorder=true dct-keep-dim=4";
-    DctComponent component;
-    component.InitFromString(str);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-
-void UnitTestFixedLinearComponent() {
-  int32 m = 1 + Rand() % 4, n = 1 + Rand() % 4;
-  {
-    CuMatrix<BaseFloat> mat(m, n);
-    mat.SetRandn();
-    FixedLinearComponent component;
-    component.Init(mat);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-
-void UnitTestFixedAffineComponent() {
-  int32 m = 15 + Rand() % 4, n = 15 + Rand() % 4;
-  {
-    CuMatrix<BaseFloat> mat(m, n);
-    mat.SetRandn();
-    FixedAffineComponent component;
-    component.Init(mat);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-void UnitTestFixedScaleComponent() {
-  int32 m = 1 + Rand() % 20;
-  {
-    CuVector<BaseFloat> vec(m);
-    vec.SetRandn();
-    FixedScaleComponent component;
-    component.Init(vec);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-void UnitTestFixedBiasComponent() {
-  int32 m = 1 + Rand() % 20;
-  {
-    CuVector<BaseFloat> vec(m);
-    vec.SetRandn();
-    FixedBiasComponent component;
-    component.Init(vec);
-    UnitTestGenericComponentInternal(component);
-  }
-}
-
-
-
-void UnitTestParsing() {
-  int32 i;
-  BaseFloat f;
-  bool b;
-  std::vector<int32> v;
-  std::string s = "x=y";
-  KALDI_ASSERT(ParseFromString("foo", &s, &i) == false
-               && s == "x=y");
-  KALDI_ASSERT(ParseFromString("foo", &s, &f) == false
-               && s == "x=y");
-  KALDI_ASSERT(ParseFromString("foo", &s, &v) == false
-               && s == "x=y");
-  KALDI_ASSERT(ParseFromString("foo", &s, &b) == false
-               && s == "x=y");
-  {
-    std::string s = "x=1";
-    KALDI_ASSERT(ParseFromString("x", &s, &i) == true
-                 && i == 1 && s == "");
-    s = "a=b x=1";
-    KALDI_ASSERT(ParseFromString("x", &s, &i) == true
-                 && i == 1 && s == "a=b");
-  }
-  {
-    std::string s = "foo=false";
-    KALDI_ASSERT(ParseFromString("foo", &s, &b) == true
-                 && b == false && s == "");
-    s = "x=y foo=true a=b";
-    KALDI_ASSERT(ParseFromString("foo", &s, &b) == true
-                 && b == true && s == "x=y a=b");
-  }
-
-  {
-    std::string s = "foobar x=1";
-    KALDI_ASSERT(ParseFromString("x", &s, &f) == true
-                 && f == 1.0 && s == "foobar");
-    s = "a=b x=1 bxy";
-    KALDI_ASSERT(ParseFromString("x", &s, &f) == true
-                 && f == 1.0 && s == "a=b bxy");
-  }
-  {
-    std::string s = "x=1:2:3";
-    KALDI_ASSERT(ParseFromString("x", &s, &v) == true
-                 && v.size() == 3 && v[0] == 1 && v[1] == 2 && v[2] == 3
-                 && s == "");
-    s = "a=b x=1:2:3 c=d";
-    KALDI_ASSERT(ParseFromString("x", &s, &v) == true
-                 && f == 1.0 && s == "a=b c=d");
-  }
-
-}
-
-void UnitTestSpliceComponent() {
-  int32 feat_dim = RandInt(1, 20),
-      const_dim =  RandInt(0, 10),
-      left_context = RandInt(-5, 0),
-      right_context = RandInt(0, 5),
-      num_chunks = RandInt(1, 20);
-        // multiple chunks are required as splice component
-        // has separate index computation logic for more than one chunks
-  KALDI_LOG << " Feat_dim :" << feat_dim << " const_dim: " << const_dim  ;
-  std::vector<bool> contiguous(2);
-  contiguous[0] = true;
-  contiguous[1] = false;
-  for (int32 i = 0; i < contiguous.size(); i++) {
-    std::vector<int32> splice_indexes;
-    if (contiguous[i]) {
-      // create contiguous set of splice indexes in the range
-      // (-left_context, right_context)
-      KALDI_LOG << "Testing contiguous splice component";
-      splice_indexes.reserve(right_context - left_context + 1);
-      for (int32 i = left_context; i <= right_context; i++)
-        splice_indexes.push_back(i);
-    } else  {
-      // generate random splice indexes in range (-left_context, right_context)
-      KALDI_LOG << "Testing non-contiguous splice component";
-      int32 num_left_splice_indexes = RandInt(0, -left_context) + 1;
-      int32 num_right_splice_indexes = RandInt(0, right_context);
-      splice_indexes.reserve(num_left_splice_indexes + num_right_splice_indexes);
-      while (splice_indexes.size() < num_left_splice_indexes)  {
-        int32 new_index = RandInt(left_context, 0);
-        // check if the index already exists in the vector
-        if (std::find(splice_indexes.begin(), splice_indexes.end(), new_index)
-            == splice_indexes.end())  {
-          splice_indexes.push_back(new_index);
-        }
-      }
-      while (splice_indexes.size() < num_left_splice_indexes + num_right_splice_indexes)  {
-        int32 new_index = RandInt(0, right_context);
-        // check if the index already exists in the vector
-        if (std::find(splice_indexes.begin(), splice_indexes.end(), new_index)
-            == splice_indexes.end())  {
-          splice_indexes.push_back(new_index);
-        }
-      }
-      sort(splice_indexes.begin(), splice_indexes.end());
-      if (splice_indexes.back() < 0) // will fail assertion in init of component
-        splice_indexes.push_back(0);
-    }
-    std::vector<int32> input_offsets;
-    for (int32 i = 0; i < splice_indexes.size(); i++) {
-      input_offsets.push_back(splice_indexes[i] - splice_indexes.front());
-      KALDI_LOG << i << " : " << splice_indexes[i] << " : " << input_offsets[i] ;
-    }
-    int32 output_offset = -splice_indexes.front();
-    SpliceComponent *component = new SpliceComponent();
-    component->Init(feat_dim + const_dim, splice_indexes, const_dim);
-    ChunkInfo in_info = ChunkInfo(feat_dim + const_dim, num_chunks,
-                                  input_offsets),
-              out_info = ChunkInfo(feat_dim * splice_indexes.size() + const_dim,
-                                   num_chunks, output_offset, output_offset);
-    UnitTestGenericComponentInternal(*component, in_info, out_info);
-    delete component;
-  }
-}
-
-void BasicDebugTestForSpliceMax(bool output=false) {
-  int32 C=5,
-        context_len=2,
-        R= 3 + 2*context_len;
-
-  SpliceMaxComponent *c = new SpliceMaxComponent();
-  std::vector<int32> context(2 * context_len + 1);
-  for (int32 i = -1 * context_len; i <= context_len; i++)
-    context[i + context_len] = i;
-  c->Init(C, context);
-  CuMatrix<BaseFloat> in(R, C), in_deriv(R, C);
-  CuMatrix<BaseFloat> out(R, c->OutputDim());
-  ChunkInfo in_info = ChunkInfo(C, 1, 0, R - 1),
-            out_info = ChunkInfo(C, 1, context_len, R - 1 - context_len);
-
-  in.SetRandn();
-  if (output)
-    KALDI_LOG << in;
-
-  c->Propagate(in_info, out_info, in, &out);
-
-  if (output)
-    KALDI_LOG << out;
-
-  out.Set(5.0);
-
-  if (output)
-    KALDI_LOG << out;
-
-  c->Backprop(in_info, out_info, in, in, out, c, &in_deriv);
-
-  if (output)
-    KALDI_LOG << in_deriv;
-
-  delete c;
-}
-
-
-} // namespace nnet2
-} // namespace kaldi
-
-#include "matrix/matrix-functions.h"
-
-
-int main() {
-  using namespace kaldi;
-  using namespace kaldi::nnet2;
-
-  int32 loop = 0;
-#if HAVE_CUDA == 1
-  for (loop = 0; loop < 2; loop++) {
-    //// Uncomment the following line to expose the bug in UnitTestDropoutComponent
-    //CuDevice::Instantiate().SetDebugStrideMode(true);
-    if (loop == 0)
-      CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
-    else
-      CuDevice::Instantiate().SelectGpuId("optional"); // -2 .. automatic selection
-#endif
-
-    BasicDebugTestForSpliceMax(true);
-    // We used to test this 3 times, but now that nnet2 is rarely changed,
-    // reducing it to once.
-    for (int32 i = 0; i < 1; i++) {
-      UnitTestGenericComponent<SigmoidComponent>();
-      UnitTestGenericComponent<TanhComponent>();
-      UnitTestGenericComponent<PowerComponent>("power=1.5");
-      UnitTestGenericComponent<PowerComponent>("power=1.0");
-      UnitTestGenericComponent<PermuteComponent>();
-      UnitTestGenericComponent<SoftmaxComponent>();
-      UnitTestGenericComponent<LogSoftmaxComponent>();
-      UnitTestGenericComponent<RectifiedLinearComponent>();
-      UnitTestGenericComponent<SoftHingeComponent>();
-      UnitTestSpliceComponent();
-      UnitTestMaxoutComponent();
-      UnitTestPnormComponent();
-      UnitTestMaxpoolingComponent();
-      UnitTestGenericComponent<NormalizeComponent>();
-      UnitTestSigmoidComponent();
-      UnitTestAffineComponent();
-      UnitTestScaleComponent();
-      UnitTestBlockAffineComponent();
-      UnitTestBlockAffineComponentPreconditioned();
-      UnitTestSumGroupComponent();
-      UnitTestDctComponent();
-      UnitTestFixedLinearComponent();
-      UnitTestFixedAffineComponent();
-      UnitTestFixedScaleComponent();
-      UnitTestFixedBiasComponent();
-      UnitTestAffineComponentPreconditioned();
-      UnitTestAffineComponentPreconditionedOnline();
-      UnitTestConvolutional1dComponent();
-      UnitTestDropoutComponent();
-      UnitTestAdditiveNoiseComponent();
-      UnitTestParsing();
-      if (loop == 0)
-        KALDI_LOG << "Tests without GPU use succeeded.";
-      else
-        KALDI_LOG << "Tests with GPU use (if available) succeeded.";
-    }
-#if HAVE_CUDA == 1
-  } // No for loop if 'HAVE_CUDA != 1',
-  CuDevice::Instantiate().PrintProfile();
-#endif
-  return 0;
-}
diff --git a/tools/config/common_path.sh b/tools/config/common_path.sh
index 9a7ae2d9b29..fc941c52c17 100644
--- a/tools/config/common_path.sh
+++ b/tools/config/common_path.sh
@@ -22,4 +22,5 @@ ${KALDI_ROOT}/src/rnnlmbin:\
 ${KALDI_ROOT}/src/sgmm2bin:\
 ${KALDI_ROOT}/src/sgmmbin:\
 ${KALDI_ROOT}/src/tfrnnlmbin:\
+${KALDI_ROOT}/src/cudadecoderbin:\
 $PATH

From 423110769a46b1fa78caa9fdd68b94c37caf51ab Mon Sep 17 00:00:00 2001
From: DongjiGao <dgao5@jhu.edu>
Date: Sat, 27 Apr 2019 12:06:15 -0400
Subject: [PATCH 089/163] [src] Fix unit-test failure
 UnitTestCuMatrixSetRandn() (#3274)

---
 src/cudamatrix/cu-matrix-test.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc
index 405ef16e97b..83ed24b9847 100644
--- a/src/cudamatrix/cu-matrix-test.cc
+++ b/src/cudamatrix/cu-matrix-test.cc
@@ -2659,10 +2659,13 @@ static void UnitTestCuMatrixSetRandn() {
       // see http://en.wikipedia.org/wiki/Normal_distribution#Moments,
       // note that mu = 0 and sigma = 1.
       Real expected_moment = (pow % 2 == 1 ? 0 : DoubleFactorial(pow - 1));
+      Real expected_twice_moment = DoubleFactorial(2 * pow - 1);
       Real k = 10.0; // This is just a constant we use to give us some wiggle
                      // room before rejecting the distribution... e.g. 20 sigma,
                      // quite approximately.
-      Real allowed_deviation = k * pow / sqrt(static_cast<Real>(rows * cols));
+      // Var(X) = E(X^2) - (E(X))^2, applied here with X = x^pow, x ~ N(0, 1).
+      Real deviation = sqrt(expected_twice_moment - expected_moment * expected_moment);
+      Real allowed_deviation = k * deviation / sqrt(static_cast<Real>(rows * cols));
       // give it a bit more wiggle room for higher powers.. this is quite
       // unscientific, it would be better to involve the absolute moments or
       // something like that, and use one of those statistical inequalities,

From 25c72899ecba07bbdc8efe76eae91bd3eb981530 Mon Sep 17 00:00:00 2001
From: huangruizhe <eraser567@163.com>
Date: Sat, 27 Apr 2019 12:50:52 -0400
Subject: [PATCH 090/163] [src,build]  Removed cusolver for now (not needed
 yet; caused build problems) (#3276)

---
 src/cudamatrix/cu-common.h  | 9 ---------
 src/cudamatrix/cu-device.h  | 6 ------
 src/makefiles/cuda_64bit.mk | 2 +-
 3 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h
index 54eda86f572..7446a76bf93 100644
--- a/src/cudamatrix/cu-common.h
+++ b/src/cudamatrix/cu-common.h
@@ -51,15 +51,6 @@
   } \
 }
 
-#define CUSOLVER_SAFE_CALL(fun) \
-{ \
-  int32 ret; \
-  if ((ret = (fun)) != 0) { \
-    KALDI_ERR << "cusolverStatus_t " << ret << " : \"" << ret << "\" returned from '" << #fun << "'"; \
-  } \
-}
-
-
 #define CUSPARSE_SAFE_CALL(fun) \
 { \
   int32 ret; \
diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h
index d94d1166c04..cb54324ad29 100644
--- a/src/cudamatrix/cu-device.h
+++ b/src/cudamatrix/cu-device.h
@@ -27,7 +27,6 @@
 #include <cublas_v2.h>
 #include <cusparse.h>
 #include <curand.h>
-#include <cusolverDn.h>
 #include <map>
 #include <string>
 #include <iostream>
@@ -84,7 +83,6 @@ class CuDevice {
   inline cublasHandle_t GetCublasHandle() { return cublas_handle_; }
   inline cusparseHandle_t GetCusparseHandle() { return cusparse_handle_; }
   inline curandGenerator_t GetCurandHandle() { return curand_handle_; }
-  inline cusolverDnHandle_t GetCusolverDnHandle() { return cusolverdn_handle_; }
 
   inline void SeedGpu() {
     if (CuDevice::Instantiate().Enabled()) {
@@ -306,7 +304,6 @@ class CuDevice {
   cublasHandle_t cublas_handle_;
   cusparseHandle_t cusparse_handle_;
   curandGenerator_t curand_handle_;
-  cusolverDnHandle_t cusolverdn_handle_;
 }; // class CuDevice
 
 
@@ -325,9 +322,6 @@ inline cublasHandle_t GetCublasHandle() {
   return CuDevice::Instantiate().GetCublasHandle(); 
 }
 
-inline cusolverDnHandle_t GetCusolverDnHandle() { 
-  return CuDevice::Instantiate().GetCusolverDnHandle(); 
-}
 // A more convenient way to get the handle to use cuSPARSE APIs.
 inline cusparseHandle_t GetCusparseHandle() { 
   return CuDevice::Instantiate().GetCusparseHandle(); 
diff --git a/src/makefiles/cuda_64bit.mk b/src/makefiles/cuda_64bit.mk
index e4ba1f147c9..eb8cf743ab3 100644
--- a/src/makefiles/cuda_64bit.mk
+++ b/src/makefiles/cuda_64bit.mk
@@ -14,4 +14,4 @@ CUDA_FLAGS = --machine 64 -DHAVE_CUDA \
              --verbose -Xcompiler "$(CXXFLAGS)"
 
 CUDA_LDFLAGS += -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64
-CUDA_LDLIBS += -lcublas -lcusparse -lcusolver -lcudart -lcurand -lnvToolsExt #LDLIBS : The .so libs are loaded later than static libs in implicit rule
+CUDA_LDLIBS += -lcublas -lcusparse -lcudart -lcurand -lnvToolsExt #LDLIBS : The .so libs are loaded later than static libs in implicit rule

From e3abc6590a4b486231c29d3fa4f0fe6575f24b0b Mon Sep 17 00:00:00 2001
From: Hossein Hadian <hn.hadian@gmail.com>
Date: Tue, 30 Apr 2019 05:32:18 +0430
Subject: [PATCH 091/163] [scripts] Make fix_data_dir.sh remove utterances
 which have bad duration. (#3275)

---
 egs/wsj/s5/utils/fix_data_dir.sh | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/egs/wsj/s5/utils/fix_data_dir.sh b/egs/wsj/s5/utils/fix_data_dir.sh
index ca0972ca85b..aba9037a080 100755
--- a/egs/wsj/s5/utils/fix_data_dir.sh
+++ b/egs/wsj/s5/utils/fix_data_dir.sh
@@ -155,12 +155,22 @@ function filter_utts {
   maybe_reco2dur=
   [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist.
   [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts
-  for x in feats.scp text segments utt2lang $maybe_wav; do
+
+  maybe_utt2dur=
+  if [ -f $data/utt2dur ]; then
+    cat $data/utt2dur | \
+      awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1
+    maybe_utt2dur=utt2dur.ok
+  fi
+
+  for x in feats.scp text segments utt2lang $maybe_wav $maybe_utt2dur; do
     if [ -f $data/$x ]; then
       utils/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp
       mv $tmpdir/utts.tmp $tmpdir/utts
     fi
   done
+  rm $data/utt2dur.ok 2>/dev/null || true
+
   [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \
     rm $tmpdir/utts && exit 1;
 

From 7a93e7f84f0ff9c1681b47051b242c224da2d9ac Mon Sep 17 00:00:00 2001
From: "kkm (aka Kirill Katsnelson)" <kkm@smartaction.com>
Date: Tue, 30 Apr 2019 17:43:52 -0700
Subject: [PATCH 092/163] [scripts] Make generate_plots.py python3-compatible
 (#3280)

* [scripts] Make generate_plots.py python3-compatible

 * Use raw strings for TeX; python3 has more escapes.
 * Use sorted() for sorting, since many functions return
   transient iterators.

Also, mostly cosmetic fixes:
 * Do not print the missing imports warning before parsing
   the command line (it can use --help).
 * Suppress false-positive matplotlib warning.
 * Add metavars to help message so it reads better.
 * Make logging lines more concise (no point printing time).
 * Simplify logger initialization.
 * Use logging.warning for warnings; warnings.warn() is for
   library code only.

* fixup! [scripts] Make generate_plots.py python3-compatible
---
 .../s5/steps/nnet3/report/generate_plots.py   | 241 ++++++++----------
 1 file changed, 107 insertions(+), 134 deletions(-)

diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py
index 572e2cf08b7..d79db1604fd 100755
--- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py
+++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py
@@ -23,76 +23,61 @@
     import matplotlib.pyplot as plt
     import numpy as np
     from matplotlib.patches import Rectangle
+    # matplotlib issue https://github.com/matplotlib/matplotlib/issues/12513
+    # plt.subplot() generates a false-positive warning, suppress it for now.
+    from matplotlib.cbook import MatplotlibDeprecationWarning
+    warnings.filterwarnings('ignore', category=MatplotlibDeprecationWarning,
+                            message='Adding an axes using the same arguments')
     g_plot = True
 except ImportError:
-    warnings.warn(
-        """This script requires matplotlib and numpy.
-        Please install them to generate plots.
-        Proceeding with generation of tables.
-        If you are on a cluster where you do not have admin rights you could
-        try using virtualenv.""")
     g_plot = False
 
 
-logger = logging.getLogger('libs')
-logger.setLevel(logging.INFO)
-handler = logging.StreamHandler()
-handler.setLevel(logging.INFO)
-formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - "
-                              "%(funcName)s - %(levelname)s ] %(message)s")
-handler.setFormatter(formatter)
-logger.addHandler(handler)
-logger.info('Generating plots')
+logging.basicConfig(format="%(filename)s:%(lineno)s:%(levelname)s:%(message)s",
+                    level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 
 def get_args():
     parser = argparse.ArgumentParser(
-        description="""Parses the training logs and generates a variety of
-        plots.
-        e.g. (deprecated): steps/nnet3/report/generate_plots.py
-        --comparison-dir exp/nnet3/tdnn1 --comparison-dir exp/nnet3/tdnn2
-        exp/nnet3/tdnn exp/nnet3/tdnn/report
-        or (current): steps/nnet3/report/generate_plots.py
-        exp/nnet3/tdnn exp/nnet3/tdnn1 exp/nnet3/tdnn2 exp/nnet3/tdnn/report.
-        Look for the report.pdf in the output (report) directory.""")
-
-    parser.add_argument("--comparison-dir", type=str, action='append',
-                        help="other experiment directories for comparison. "
-                        "These will only be used for plots, not tables"
-                        "Note: this option is deprecated.")
-    parser.add_argument("--start-iter", type=int,
-                        help="Iteration from which plotting will start",
-                        default=1)
-    parser.add_argument("--is-chain", type=str, default=False,
-                        action=common_lib.StrToBoolAction,
-                        help="True if directory contains chain models")
-    parser.add_argument("--is-rnnlm", type=str, default=False,
-                        action=common_lib.StrToBoolAction,
-                        help="True if directory contains RNNLM.")
-    parser.add_argument("--output-nodes", type=str, default=None,
+        prog=sys.argv[0],  # By default, prog is set to the filename only.
+        formatter_class=type('', (argparse.RawDescriptionHelpFormatter,
+                                  argparse.ArgumentDefaultsHelpFormatter), {}),
+        description="Parses the training logs and generates a variety of plots.\n"
+        "e.g.: %(prog)s \\\n"
+        "  exp/nnet3/tdnn exp/nnet3/tdnn1 exp/nnet3/tdnn2 exp/nnet3/tdnn/report.\n"
+        "The report file 'report.pdf' will be generated in the <output_dir> directory.")
+
+    parser.add_argument("--start-iter", type=int, metavar='N', default=1,
+                        help="Iteration from which plotting will start.")
+    parser.add_argument("--is-chain", type=common_lib.str_to_bool, default='false', metavar='BOOL',
+                        help="Set to 'true' if <exp_dir>s contain chain models.")
+    parser.add_argument("--is-rnnlm", type=common_lib.str_to_bool, default='false', metavar='BOOL',
+                        help="Set to 'true' if <exp_dir>s contain RNNLM.")
+    parser.add_argument("--output-nodes", type=str, metavar='NODES',
                         action=common_lib.NullstrToNoneAction,
-                        help="""List of space separated
-                        <output-node>:<objective-type> entities,
-                        one for each output node""")
+                        help="List of space separated <output-node>:<objective-type> entries, "
+                        "one for each output node")
+    parser.add_argument("--comparison-dir", type=str, metavar='DIR', action='append',
+                        help="[DEPRECATED] Experiment directories for comparison. "
+                        "These will only be used for plots, not tables.")
     parser.add_argument("exp_dir", nargs='+',
-                        help="the first dir is the experiment directory, "
-                        "e.g. exp/nnet3/tdnn, the rest dirs (if exist) "
-                        "are other experiment directories for comparison.")
+                        help="The first <exp_dir> is the current experiment directory, e.g. "
+                        "'exp/nnet3/tdnn'; the rest are up to 6 optional directories of other "
+                        "experiments to be graphed on same plots for comparison.")
     parser.add_argument("output_dir",
-                        help="experiment directory, "
-                        "e.g. exp/nnet3/tdnn/report")
+                        help="output directory for reports, e.g. 'exp/nnet3/tdnn/report'")
 
     args = parser.parse_args()
-    if (args.comparison_dir is not None and len(args.comparison_dir) > 6) or \
-    (args.exp_dir is not None and len(args.exp_dir) > 7):
+    if ((args.comparison_dir is not None and len(args.comparison_dir) > 6) or
+        (args.exp_dir is not None and len(args.exp_dir) > 7)):
         raise Exception(
-            """max 6 comparison directories can be specified.
-            If you want to compare with more comparison_dir, you would have to
-            carefully tune the plot_colors variable which specified colors used
-            for plotting.""")
+            "Up to 6 comparison directories may be specified. "
+            "If you want to compare with more experiments, you would have to carefully tune "
+            "the plot_colors variable which specified colors used for plotting.")
     assert args.start_iter >= 1
     if args.is_chain and args.is_rnnlm:
-        raise Exception("""is_chain and is_rnnlm is not compatible.""")
+        raise Exception("Options --is-chain and --is-rnnlm cannot be both true.")
     return args
 
 
@@ -104,23 +89,24 @@ class LatexReport(object):
     def __init__(self, pdf_file):
         self.pdf_file = pdf_file
         self.document = []
-        self.document.append("""
+        self.document.append(r"""
 \documentclass[prl,10pt,twocolumn]{revtex4}
 \usepackage{graphicx}    % Used to import the graphics
-\\begin{document}
+\begin{document}
 """)
 
     def add_figure(self, figure_pdf, title):
         """we will have keep extending this replacement list based on errors
         during compilation escaping underscores in the title"""
-        title = "\\texttt{"+re.sub("_", "\_", title)+"}"
-        fig_latex = """
+
+        title = r"\texttt{"+re.sub("_", "\_", title)+"}"
+        fig_latex = r"""
 %...
-\\newpage
-\\begin{figure}[h]
-  \\begin{center}
-    \caption{""" + title + """}
-    \includegraphics[width=\\textwidth]{""" + figure_pdf + """}
+\newpage
+\begin{figure}[h]
+  \begin{center}
+    \caption{""" + title + r"""}
+    \includegraphics[width=\textwidth]{""" + figure_pdf + r"""}
   \end{center}
 \end{figure}
 \clearpage
@@ -129,7 +115,7 @@ def add_figure(self, figure_pdf, title):
         self.document.append(fig_latex)
 
     def close(self):
-        self.document.append("\end{document}")
+        self.document.append(r"\end{document}")
         return self.compile()
 
     def compile(self):
@@ -139,14 +125,15 @@ def compile(self):
         lat_file = open(latex_file, "w")
         lat_file.write("\n".join(self.document))
         lat_file.close()
-        logger.info("Compiling the latex report.")
+        logger.info("Compiling the LaTeX report.")
         try:
             common_lib.execute_command(
                 "pdflatex -interaction=batchmode "
                 "-output-directory={0} {1}".format(dir_name, latex_file))
         except Exception as e:
-            logger.warning("There was an error compiling the latex file {0}, "
-                           "please do it manually: {1}".format(latex_file, e))
+            logger.warning("There was an error compiling LaTeX file %s. "
+                           "Check report.log generated by pdflatex in the same directory. %s",
+                           latex_file, e)
             return False
         return True
 
@@ -222,10 +209,11 @@ def generate_acc_logprob_plots(exp_dir, output_dir, plot, key='accuracy',
 # The name of five gates of lstmp
 g_lstm_gate = ['i_t_sigmoid', 'f_t_sigmoid', 'c_t_tanh', 'o_t_sigmoid', 'm_t_tanh']
 
-# The "extra" item looks like a placeholder. As each unit in python plot is
+# The "extra" item is a placeholder. As each unit in python plot is
 # composed by a legend_handle(linestyle) and a legend_label(description).
 # For the unit which doesn't have linestyle, we use the "extra" placeholder.
-extra = Rectangle((0, 0), 1, 1, facecolor="w", fill=False, edgecolor='none', linewidth=0)
+if g_plot:
+    extra = Rectangle((0, 0), 1, 1, facecolor="w", fill=False, edgecolor='none', linewidth=0)
 
 # This function is used to insert a column to the legend, the column_index is 1-based
 def insert_a_column_legend(legend_handle, legend_label, lp, mp, hp,
@@ -390,8 +378,7 @@ def generate_nonlin_stats_plots(exp_dir, output_dir, plot, comparison_dir=None,
             comp_data = stats_per_component_per_iter[component_name]
             comp_type = comp_data['type']
             comp_stats = comp_data['stats']
-            iters = comp_stats.keys()
-            iters.sort()
+            iters = sorted(comp_stats)
             iter_stats = []
             for iter in iters:
                 iter_stats.append([iter] + comp_stats[iter])
@@ -407,15 +394,16 @@ def generate_nonlin_stats_plots(exp_dir, output_dir, plot, comparison_dir=None,
                     dir=output_dir, comp_name=component_name), "w") as f:
             if with_oderiv:
                 # with oderiv-rms
-                f.write("Iteration\tValueMean\tValueStddev\tDerivMean\tDerivStddev\tOderivMean\tOderivStddev\t"
-                               "Value_5th\tValue_50th\tValue_95th\t"
-                               "Deriv_5th\tDeriv_50th\tDeriv_95th\t"
-                               "Oderiv_5th\tOderiv_50th\tOderiv_95th\n")
+                f.write("Iteration\tValueMean\tValueStddev\tDerivMean\tDerivStddev\t"
+                        "OderivMean\tOderivStddev\t"
+                        "Value_5th\tValue_50th\tValue_95th\t"
+                        "Deriv_5th\tDeriv_50th\tDeriv_95th\t"
+                        "Oderiv_5th\tOderiv_50th\tOderiv_95th\n")
             else:
                 # without oderiv-rms
                 f.write("Iteration\tValueMean\tValueStddev\tDerivMean\tDerivStddev\t"
-                               "Value_5th\tValue_50th\tValue_95th\t"
-                               "Deriv_5th\tDeriv_50th\tDeriv_95th\n")
+                        "Value_5th\tValue_50th\tValue_95th\t"
+                        "Deriv_5th\tDeriv_50th\tDeriv_95th\n")
             iter_stat_report = []
             iter_stats = main_stat_tables[component_name]
             for row in iter_stats:
@@ -423,21 +411,18 @@ def generate_nonlin_stats_plots(exp_dir, output_dir, plot, comparison_dir=None,
             f.write("\n".join(iter_stat_report))
             f.close()
     if plot:
-        main_component_names = list(main_stat_tables.keys())
-        main_component_names.sort()
-
+        main_component_names = sorted(main_stat_tables)
         plot_component_names = set(main_component_names)
         for dir in dirs:
             component_names = set(stats_per_dir[dir].keys())
             plot_component_names = plot_component_names.intersection(
                 component_names)
-        plot_component_names = list(plot_component_names)
-        plot_component_names.sort()
+        plot_component_names = sorted(plot_component_names)
         if plot_component_names != main_component_names:
-            logger.warning("""The components in all the neural networks in the
-            given experiment dirs are not the same, so comparison plots are
-            provided only for common component names. Make sure that these are
-            comparable experiments before analyzing these plots.""")
+            logger.warning("The components in all the neural networks in the "
+                           "given experiment dirs are not the same, so comparison plots are "
+                           "provided only for common component names. Make sure that these are "
+                           "comparable experiments before analyzing these plots.")
 
         fig = plt.figure()
 
@@ -510,9 +495,8 @@ def generate_clipped_proportion_plots(exp_dir, output_dir, plot,
         except log_parse.MalformedClippedProportionLineException as e:
             raise e
         except common_lib.KaldiCommandException as e:
-            warnings.warn("Could not extract the clipped proportions for {0},"
-                          " this might be because there are no "
-                          "ClipGradientComponents.".format(dir))
+            logger.warning("Could not extract the clipped proportions for %s, "
+                           "this might be because there are no ClipGradientComponents.", dir)
             continue
         if len(stats_per_dir[dir]) == 0:
             logger.warning("Couldn't find any rows for the"
@@ -520,9 +504,8 @@ def generate_clipped_proportion_plots(exp_dir, output_dir, plot,
     try:
         main_cp_stats = stats_per_dir[exp_dir]['table']
     except KeyError:
-        warnings.warn("The main experiment directory {0} does not have "
-                      "clipped proportions. So not generating clipped "
-                      "proportion plots.".format(exp_dir))
+        logger.warning("The main experiment directory %s does not have clipped proportions. "
+                       "Not generating clipped proportion plots.", exp_dir)
         return
 
     # this is the main experiment directory
@@ -534,26 +517,22 @@ def generate_clipped_proportion_plots(exp_dir, output_dir, plot,
     file.close()
 
     if plot:
-        main_component_names = (
-            list(stats_per_dir[exp_dir]['cp_per_iter_per_component'].keys()))
-        main_component_names.sort()
+        main_component_names = sorted(stats_per_dir[exp_dir]['cp_per_iter_per_component'])
         plot_component_names = set(main_component_names)
         for dir in dirs:
             try:
-                component_names = set(
-                    stats_per_dir[dir]['cp_per_iter_per_component'].keys())
+                component_names = set(stats_per_dir[dir]['cp_per_iter_per_component'])
                 plot_component_names = (
                     plot_component_names.intersection(component_names))
             except KeyError:
                 continue
-        plot_component_names = list(plot_component_names)
-        plot_component_names.sort()
+        plot_component_names = sorted(plot_component_names)
         if plot_component_names != main_component_names:
             logger.warning(
-                """The components in all the neural networks in the given
-                experiment dirs are not the same, so comparison plots are
-                provided only for common component names. Make sure that these
-                are comparable experiments before analyzing these plots.""")
+                "The components in all the neural networks in the given "
+                "experiment dirs are not the same, so comparison plots are "
+                "provided only for common component names. Make sure that these "
+                "are comparable experiments before analyzing these plots.")
 
         fig = plt.figure()
         for component_name in main_component_names:
@@ -638,32 +617,25 @@ def generate_parameter_diff_plots(exp_dir, output_dir, plot,
                         iter_data.append("NA")
                 if (float(total_missing_iterations)/len(component_names) > 20
                         and not gave_user_warning):
-                    logger.warning("There are more than {0} missing "
-                                   "iterations per component. "
-                                   "Something might be wrong.".format(
-                                       float(total_missing_iterations)/ len(component_names)))
+                    logger.warning("There are more than %.0f missing iterations per component. "
+                                   "Something might be wrong.",
+                                   float(total_missing_iterations)/ len(component_names))
                     gave_user_warning = True
 
-                f.write(" ".join(iter_data)+"\n")
+                f.write(" ".join(iter_data) + "\n")
 
     if plot:
         # get the component names
         diff_type = list(key_file.keys())[0]
-        main_component_names = list(stats_per_dir[exp_dir][diff_type][
-            'progress_per_component'].keys())
-        main_component_names.sort()
+        main_component_names = sorted(stats_per_dir[exp_dir][diff_type]['progress_per_component'])
         plot_component_names = set(main_component_names)
-
         for dir in dirs:
             try:
-                component_names = set(stats_per_dir[dir][diff_type][
-                    'progress_per_component'].keys())
-                plot_component_names = plot_component_names.intersection(
-                    component_names)
+                component_names = set(stats_per_dir[dir][diff_type]['progress_per_component'])
+                plot_component_names = plot_component_names.intersection(component_names)
             except KeyError:
                 continue
-        plot_component_names = list(plot_component_names)
-        plot_component_names.sort()
+        plot_component_names = sorted(plot_component_names)
         if plot_component_names != main_component_names:
             logger.warning("The components in all the neural networks in the "
                            "given experiment dirs are not the same, "
@@ -675,9 +647,8 @@ def generate_parameter_diff_plots(exp_dir, output_dir, plot,
         assert main_component_names
 
         fig = plt.figure()
-        logger.info("Generating parameter-difference plots for the "
-                    "following components:{0}".format(
-                        ', '.join(main_component_names)))
+        logger.info("Plotting parameter differences for components: " +
+                    ", ".join(main_component_names))
 
         for component_name in main_component_names:
             fig.clf()
@@ -698,12 +669,9 @@ def generate_parameter_diff_plots(exp_dir, output_dir, plot,
                     # this component is not available in this network so lets
                     # not just plot it
                     if dir == exp_dir:
-                        raise Exception("No parameter differences were "
-                                        "available even in the main "
-                                        "experiment dir for the component "
-                                        "{0}. Something went wrong: "
-                                        "{1}.".format(
-                                            component_name, str(e)))
+                        raise Exception("No parameter differences were available even in the main "
+                                        "experiment dir for the component {0}. Something went "
+                                        "wrong: {1}.".format(component_name, e))
                     continue
                 ax = plt.subplot(211)
                 mp, = ax.plot(iter_stats[0][:, 0], iter_stats[0][:, 1],
@@ -755,35 +723,35 @@ def generate_plots(exp_dir, output_dir, output_names, comparison_dir=None,
 
     for (output_name, objective_type) in output_names:
         if objective_type == "linear":
-            logger.info("Generating accuracy plots")
+            logger.info("Generating accuracy plots for '%s'", output_name)
             generate_acc_logprob_plots(
                 exp_dir, output_dir, g_plot, key='accuracy',
                 file_basename='accuracy', comparison_dir=comparison_dir,
                 start_iter=start_iter,
                 latex_report=latex_report, output_name=output_name)
 
-            logger.info("Generating log-likelihood plots")
+            logger.info("Generating log-likelihood plots for '%s'", output_name)
             generate_acc_logprob_plots(
                 exp_dir, output_dir, g_plot, key='log-likelihood',
                 file_basename='loglikelihood', comparison_dir=comparison_dir,
                 start_iter=start_iter,
                 latex_report=latex_report, output_name=output_name)
         elif objective_type == "chain":
-            logger.info("Generating log-probability plots")
+            logger.info("Generating log-probability plots for '%s'", output_name)
             generate_acc_logprob_plots(
                 exp_dir, output_dir, g_plot,
                 key='log-probability', file_basename='log_probability',
                 comparison_dir=comparison_dir, start_iter=start_iter,
                 latex_report=latex_report, output_name=output_name)
         elif objective_type == "rnnlm_objective":
-            logger.info("Generating RNNLM objective plots")
+            logger.info("Generating RNNLM objective plots for '%s'", output_name)
             generate_acc_logprob_plots(
                 exp_dir, output_dir, g_plot, key='rnnlm_objective',
                 file_basename='objective', comparison_dir=comparison_dir,
                 start_iter=start_iter,
                 latex_report=latex_report, output_name=output_name)
         else:
-            logger.info("Generating " + objective_type + " objective plots")
+            logger.info("Generating %s objective plots for '%s'", objective_type, output_name)
             generate_acc_logprob_plots(
                 exp_dir, output_dir, g_plot, key='objective',
                 file_basename='objective', comparison_dir=comparison_dir,
@@ -808,14 +776,19 @@ def generate_plots(exp_dir, output_dir, output_names, comparison_dir=None,
     if g_plot and latex_report is not None:
         has_compiled = latex_report.close()
         if has_compiled:
-            logger.info("Report has been generated. "
-                        "You can find it at the location "
-                        "{0}".format("{0}/report.pdf".format(output_dir)))
+            logger.info("Report file %s/report.pdf has been generated successfully.", output_dir)
 
 
 def main():
     args = get_args()
 
+    if not g_plot:
+        logger.warning(
+            "This script requires matplotlib and numpy.\n"
+            "... Install these packages to generate plots.\n"
+            "... If you are on a cluster where you do not have admin rights, use venv.\n"
+            "... Generating text data table files only.")
+
     output_nodes = []
 
     if args.output_nodes is not None:

From c9a1257ab568db4c1cc11447d61a8be905169d03 Mon Sep 17 00:00:00 2001
From: Sawyer X <xsawyerx@cpan.org>
Date: Wed, 1 May 2019 21:07:33 +0300
Subject: [PATCH 093/163] [scripts] Add --one-based option to split_scp.pl
 (#3279)

---
 egs/wsj/s5/utils/split_scp.pl | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/egs/wsj/s5/utils/split_scp.pl b/egs/wsj/s5/utils/split_scp.pl
index 994c62e7a2d..7eca0294262 100755
--- a/egs/wsj/s5/utils/split_scp.pl
+++ b/egs/wsj/s5/utils/split_scp.pl
@@ -44,8 +44,9 @@
 $num_jobs = 0;
 $job_id = 0;
 $utt2spk_file = "";
+$one_based = 0;
 
-for ($x = 1; $x <= 2 && @ARGV > 0; $x++) {
+for ($x = 1; $x <= 3 && @ARGV > 0; $x++) {
     if ($ARGV[0] eq "-j") {
         shift @ARGV;
         $num_jobs = shift @ARGV;
@@ -58,11 +59,18 @@
         $utt2spk_file=$1;
         shift;
     }
+    if ($ARGV[0] eq '--one-based') {
+        $one_based = 1;
+        shift @ARGV;
+    }
 }
 
+$one_based
+    and $job_id--;
+
 if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
     die "Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ... \n" .
-        " or: split_scp.pl -j num-jobs job-id [--utt2spk=<utt2spk_file>] in.scp [out.scp]\n" .
+        " or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]\n" .
         " ... where 0 <= job-id < num-jobs.";
 }
 

From aae8be4a07d12d443b27eeba5624f578bc23d668 Mon Sep 17 00:00:00 2001
From: rezame <36230722+rezame@users.noreply.github.com>
Date: Wed, 1 May 2019 23:23:30 +0430
Subject: [PATCH 094/163] [scripts] Allow UTF utterance-ids by removing
 unnecessary assert (#3283)

---
 egs/wsj/s5/steps/cleanup/internal/tf_idf.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/egs/wsj/s5/steps/cleanup/internal/tf_idf.py b/egs/wsj/s5/steps/cleanup/internal/tf_idf.py
index a098d9f2a44..15773d0977e 100644
--- a/egs/wsj/s5/steps/cleanup/internal/tf_idf.py
+++ b/egs/wsj/s5/steps/cleanup/internal/tf_idf.py
@@ -401,7 +401,6 @@ def read_key(fd):
     str += char
   str = str.strip()
   if str == '': return None # end of file,
-  assert(re.match('^[\.a-zA-Z0-9_:-]+$',str) != None) # check format,
   return str
 
 

From 803e3ee0142a23dbda2f9f534b33a55609e7831a Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 1 May 2019 19:13:53 -0700
Subject: [PATCH 095/163] [src] Keep nnet output in the [-30,30] range required
 by chain denominator (#3024)

Intended to help resolve occasional instabilities.  Not 100% clear that it was helping, as instabilities don't tend to be that reproducible.
---
 src/chain/chain-denominator.cc | 14 +++++++
 src/chain/chain-denominator.h  |  3 +-
 src/chain/chain-kernels-ansi.h |  6 +++
 src/chain/chain-kernels.cu     | 31 ++++++++++++++-
 src/chain/chain-training.cc    | 72 ++++++++++++++++++++++++++++++++++
 src/chain/chain-training.h     | 16 +++++++-
 6 files changed, 139 insertions(+), 3 deletions(-)

diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc
index b644e429b67..b9023f02f5e 100644
--- a/src/chain/chain-denominator.cc
+++ b/src/chain/chain-denominator.cc
@@ -24,6 +24,7 @@
 namespace kaldi {
 namespace chain {
 
+
 DenominatorComputation::DenominatorComputation(
     const ChainTrainingOptions &opts,
     const DenominatorGraph &den_graph,
@@ -54,6 +55,18 @@ DenominatorComputation::DenominatorComputation(
   // log-space.
   KALDI_ASSERT(opts_.leaky_hmm_coefficient > 0.0 &&
                opts_.leaky_hmm_coefficient < 1.0);
+
+  if (RandInt(0, 99) == 0) {
+    // Spot check (done on about 1 time in 100) that all values in nnet_output
+    // are in the range [-30, 30]; otherwise derivs will be wrong (search below for 30).
+    BaseFloat max_val = nnet_output.Max(), min_val = nnet_output.Min();
+    if (max_val > 30.0 || min_val < -30.0) {
+      KALDI_WARN << "Nnet outputs " << min_val << ", "
+                 << max_val <<
+          " outside the range [-30,30], derivs may be inaccurate.";
+    }
+  }
+
   // make sure the alpha sums and beta sums are zeroed.
   alpha_.ColRange(den_graph_.NumStates() * num_sequences_,
                   num_sequences_).SetZero();
@@ -294,6 +307,7 @@ bool DenominatorComputation::Backward(
         transposed_deriv_part.SetZero();
     }
   }
+
   return ok_;
 }
 
diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h
index 9960dfede0b..68e6e32682d 100644
--- a/src/chain/chain-denominator.h
+++ b/src/chain/chain-denominator.h
@@ -218,7 +218,8 @@ class DenominatorComputation {
   BaseFloat Forward();
 
   // this adds deriv_weight times (the derivative of the log-prob w.r.t. the
-  // nnet output), to 'nnet_output_deriv'.
+  // nnet output), to 'nnet_output_deriv'.  Note: normally, deriv_weight
+  // will be -1, or some other negative number if we are doing data weighting.
   // returns true if everything seemed OK, false if a failure was detected.
   bool Backward(BaseFloat deriv_weight,
                 CuMatrixBase<BaseFloat> *nnet_output_deriv);
diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h
index 388c78ab2ee..f5814d7c11c 100644
--- a/src/chain/chain-kernels-ansi.h
+++ b/src/chain/chain-kernels-ansi.h
@@ -48,6 +48,12 @@ extern "C" {
                               const BaseFloat *prev_alpha,
                               BaseFloat *this_alpha);
 
+  void cuda_penalize_out_of_range(dim3 Gr, dim3 Bl, BaseFloat limit,
+                                  BaseFloat scale, const BaseFloat *in_data,
+                                  MatrixDim dim, int out_stride,
+                                  BaseFloat *out_deriv);
+
+
 } // extern "C"
 
 #endif  // HAVE_CUDA
diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu
index f093f21a5a5..a63944f0012 100644
--- a/src/chain/chain-kernels.cu
+++ b/src/chain/chain-kernels.cu
@@ -1,6 +1,6 @@
 // chain/chain-kernels.cu
 
-// Copyright  2015  Johns Hopkins University (author: Daniel Povey)
+// Copyright  2015-2019  Johns Hopkins University (author: Daniel Povey)
 
 
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -287,3 +287,32 @@ void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl,
                                       this_beta, log_prob_deriv,
                                       log_prob_deriv_stride);
 }
+
+
+// See documentation for PenalizeOutOfRange() in chain-training.cc to see what
+// this is about.
+__global__
+static void _penalize_out_of_range(
+    BaseFloat limit, BaseFloat scale, const BaseFloat *in_data, MatrixDim dim,
+    int out_stride, BaseFloat *out_deriv) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  int j = blockIdx.y * blockDim.y + threadIdx.y;
+  int in_index = i + j * dim.stride,
+      out_index = i + j * out_stride;
+  if (i < dim.cols && j < dim.rows) {
+    BaseFloat val = in_data[in_index];
+    if (val < -limit) {
+      out_deriv[out_index] -= scale * (val + limit);
+    } else if (val > limit) {
+      out_deriv[out_index] -= scale * (val - limit);
+    }
+  }
+}
+
+void cuda_penalize_out_of_range(dim3 Gr, dim3 Bl, BaseFloat limit,
+                                BaseFloat scale, const BaseFloat *in_data,
+                                MatrixDim dim, int out_stride,
+                                BaseFloat *out_deriv) {
+  _penalize_out_of_range<<<Gr,Bl>>>(limit, scale, in_data,
+                                    dim, out_stride, out_deriv);
+}
diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc
index 6b4a7b593c2..d20ecfa4c1e 100644
--- a/src/chain/chain-training.cc
+++ b/src/chain/chain-training.cc
@@ -28,6 +28,62 @@ namespace kaldi {
 namespace chain {
 
 
+/**
+   This is a rather special-purpose function which adds something to
+   the derivative in order to encourage the value to stay within
+   a specified range.  This is something we use in chain training
+   in order to encourage the nnet outputs to stay within the
+   range [-30, 30] (needed because we don't do the forward-backward
+   denominator computation in log space).
+
+   It's very similar to l2 regularization but only applied once you depart
+   the range [-limit, limit].
+
+   Basically, this function does as follows:
+
+     (*out_deriv)(i,j) +=   0                                if   -limit <= in_value(i,j) <= limit
+                            (-limit - in_value(i,j)) * scale if  in_value(i,j) < -limit
+                            (limit - in_value(i,j)) * scale  if  in_value(i,j) > limit
+   If limit were zero, this would be the same as l2 regularization with scale 'scale'.
+ */
+static void PenalizeOutOfRange(const CuMatrixBase<BaseFloat> &in_value,
+                               BaseFloat limit,
+                               BaseFloat scale,
+                               CuMatrixBase<BaseFloat> *out_deriv) {
+  KALDI_ASSERT(SameDim(in_value, *out_deriv) && limit > 0 && scale >= 0);
+  if (scale == 0)
+    return;
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    CuTimer tim;
+    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
+    dim3 dimGrid(n_blocks(in_value.NumCols(), CU2DBLOCK),
+                 n_blocks(in_value.NumRows(), CU2DBLOCK));
+    cuda_penalize_out_of_range(dimGrid, dimBlock, limit, scale,
+                               in_value.Data(), in_value.Dim(),
+                               out_deriv->Stride(), out_deriv->Data());
+    CU_SAFE_CALL(cudaGetLastError());
+    CuDevice::Instantiate().AccuProfile(__func__, tim);
+  } else
+#endif
+  {
+    int32 num_rows = in_value.NumRows(),
+        num_cols = in_value.NumCols();
+    for (int32 r = 0; r < num_rows; r++) {
+      const BaseFloat *in_row_data =  in_value.RowData(r);
+      BaseFloat *out_row_data = out_deriv->RowData(r);
+      for (int32 c = 0; c < num_cols; c++) {
+        BaseFloat val = in_row_data[c];
+        if (val < -limit) {
+          out_row_data[c] -= scale * (val + limit);
+        } else if (val > limit) {
+          out_row_data[c] -= scale * (val - limit);
+        }
+      }
+    }
+  }
+}
+
 
 void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts,
                                  const DenominatorGraph &den_graph,
@@ -47,6 +103,14 @@ void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts,
   if (nnet_output_deriv != NULL)
     nnet_output_deriv->SetZero();
 
+  if (nnet_output_deriv != NULL && RandInt(0, 1) == 0) {
+    // Only do this on about half of the calls (chosen at random), for efficiency;
+    // we multiply the scale by 2 to compensate.  See PenalizeOutOfRange() docs.
+    PenalizeOutOfRange(nnet_output, 30.0,
+                       2.0 * opts.out_of_range_regularize,
+                       nnet_output_deriv);
+  }
+
   { // Doing the denominator first helps to reduce the maximum
     // memory use, as we can set 'xent_deriv' to nonempty after
     // we've freed the memory in this object.
@@ -172,6 +236,14 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts,
                                 nnet_output_deriv);
   }
 
+  if (nnet_output_deriv != NULL && RandInt(0, 1) == 0) {
+    // Only do this on about half of the calls (chosen at random), for efficiency;
+    // we multiply the scale by 2 to compensate.  See PenalizeOutOfRange() docs.
+    PenalizeOutOfRange(nnet_output, 30.0,
+                       2.0 * opts.out_of_range_regularize,
+                       nnet_output_deriv);
+  }
+
   if (xent_output_deriv != NULL) {
     // the reason for kStrideEqualNumCols is so that we can share the memory
     // block with the memory that was used for exp_nnet_output_transposed_ from
diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h
index 7dbc1a058c2..cd243ff06ba 100644
--- a/src/chain/chain-training.h
+++ b/src/chain/chain-training.h
@@ -45,6 +45,14 @@ struct ChainTrainingOptions {
   // (squared so it's additive across the dimensions).  e.g. try 0.0005.
   BaseFloat l2_regularize;
 
+
+  // This is similar to an l2 regularization constant (like l2-regularize) but
+  // applied on the part of the nnet output matrix that exceeds the range
+  // [-30,30]... this is necessary to avoid things regularly going out of the
+  // range that we can do exp() on, since the denominator computation is not in
+  // log space and to avoid NaNs we limit the outputs to the range [-30,30].
+  BaseFloat out_of_range_regularize;
+
   // Coefficient for 'leaky hmm'.  This means we have an epsilon-transition from
   // each state to a special state with probability one, and then another
   // epsilon-transition from that special state to each state, with probability
@@ -62,13 +70,19 @@ struct ChainTrainingOptions {
   // should have a softmax as its final nonlinearity.
   BaseFloat xent_regularize;
 
-  ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05),
+  ChainTrainingOptions(): l2_regularize(0.0), out_of_range_regularize(0.01),
+                          leaky_hmm_coefficient(1.0e-05),
                           xent_regularize(0.0) { }
 
   void Register(OptionsItf *opts) {
     opts->Register("l2-regularize", &l2_regularize, "l2 regularization "
                    "constant for 'chain' training, applied to the output "
                    "of the neural net.");
+    opts->Register("out-of-range-regularize", &out_of_range_regularize,
+                   "Constant that controls how much we penalize the nnet output "
+                   "being outside the range [-30,30].  This is needed because we "
+                   "limit it to that range in the denominator computation (which "
+                   "is to avoid NaNs because it is not done in log space.");
     opts->Register("leaky-hmm-coefficient", &leaky_hmm_coefficient, "Coefficient "
                    "that allows transitions from each HMM state to each other "
                    "HMM state, to ensure gradual forgetting of context (can "

From b44f7083a1c833499dbad87d2b2f1f1fa684345b Mon Sep 17 00:00:00 2001
From: Sawyer X <xsawyerx@cpan.org>
Date: Thu, 2 May 2019 20:58:26 +0300
Subject: [PATCH 096/163] [scripts] Clean up filehandle usage in split_scp.pl
 (#3285)

* Move to three-arg open() for security reasons:

With a two-arg open() call, the open mode is parsed out of the same string as
the filename, so special characters in the filename (in general, e.g. a leading
'>' or a trailing '|') can change how the file is opened. This is a security
flaw and can be very, *very* risky. If the input comes from the command line,
triggered by a system that uses information from the customer to affect
filenames, it could *easily* lead to serious security risks.

* Replace bareword filehandles with proper variables:

When using bareword filehandles, you are creating global variables. Instead,
we're using proper scalars for the filehandles. Much safer!

This commit also closes filehandles that were left open. Perl tries to
implicitly close them for you, but it's better to be explicit.
---
 egs/wsj/s5/utils/split_scp.pl | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/egs/wsj/s5/utils/split_scp.pl b/egs/wsj/s5/utils/split_scp.pl
index 7eca0294262..d46b7e20521 100755
--- a/egs/wsj/s5/utils/split_scp.pl
+++ b/egs/wsj/s5/utils/split_scp.pl
@@ -90,16 +90,17 @@
 }
 
 if ($utt2spk_file ne "") {  # We have the --utt2spk option...
-    open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file";
-    while(<U>) {
+    open($u_fh, '<', $utt2spk_file) || die "Failed to open utt2spk file $utt2spk_file";
+    while(<$u_fh>) {
         @A = split;
         @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file";
         ($u,$s) = @A;
         $utt2spk{$u} = $s;
     }
-    open(I, "<$inscp") || die "Opening input scp file $inscp";
+    close $u_fh;
+    open($i_fh, '<', $inscp) || die "Opening input scp file $inscp";
     @spkrs = ();
-    while(<I>) {
+    while(<$i_fh>) {
         @A = split;
         if(@A == 0) { die "Empty or space-only line in scp file $inscp"; }
         $u = $A[0];
@@ -113,6 +114,7 @@
         $spk_count{$s}++;
         push @{$spk_data{$s}}, $_;
     }
+    close $i_fh;
     # Now split as equally as possible ..
     # First allocate spks to files by allocating an approximately
     # equal number of speakers.
@@ -183,31 +185,32 @@
     # Now print out the files...
     for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
         $scpfn = $OUTPUTS[$scpidx];
-        open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing.";
+        open($f_fh, '>', $scpfn) || die "Could not open scp file $scpfn for writing.";
         $count = 0;
         if(@{$scparray[$scpidx]} == 0) {
             print STDERR "Error: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n";
             $error = 1;
         } else {
             foreach $spk ( @{$scparray[$scpidx]} ) {
-                print F @{$spk_data{$spk}};
+                print $f_fh @{$spk_data{$spk}};
                 $count += $spk_count{$spk};
             }
             if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; }
         }
-        close(F);
+        close($f_fh);
     }
 } else {
    # This block is the "normal" case where there is no --utt2spk
    # option and we just break into equal size chunks.
 
-    open(I, "<$inscp") || die "Opening input scp file $inscp";
+    open($i_fh, '<', $inscp) || die "Opening input scp file $inscp";
 
     $numscps = @OUTPUTS;  # size of array.
     @F = ();
-    while(<I>) {
+    while(<$i_fh>) {
         push @F, $_;
     }
+    close $i_fh;
     $numlines = @F;
     if($numlines == 0) {
         print STDERR "split_scp.pl: error: empty input scp file $inscp , ";
@@ -221,11 +224,11 @@
     $n = 0;
     for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
         $scpfile = $OUTPUTS[$scpidx];
-        open(O, ">$scpfile") || die "Opening output scp file $scpfile";
+        open($o_fh, '>', $scpfile) || die "Opening output scp file $scpfile";
         for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) {
-            print O $F[$n++];
+            print $o_fh $F[$n++];
         }
-        close(O) || die "Closing scp file $scpfile";
+        close($o_fh) || die "Closing scp file $scpfile";
     }
     $n == $numlines || die "split_scp.pl: code error., $n != $numlines";
 }

From 7055784a4d0be1bb998e41bb24a019c6991bf8c3 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Thu, 2 May 2019 13:20:25 -0700
Subject: [PATCH 097/163] [src] Fix to bug in online-feature.cc that caused
 crash at end of utterance. (#3286)

---
 src/feat/online-feature.cc | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/feat/online-feature.cc b/src/feat/online-feature.cc
index 3e8bf483694..b2c4799dacf 100644
--- a/src/feat/online-feature.cc
+++ b/src/feat/online-feature.cc
@@ -104,16 +104,22 @@ void OnlineGenericBaseFeature<C>::MaybeCreateResampler(
 template <class C>
 void OnlineGenericBaseFeature<C>::InputFinished() {
   if (resampler_ != nullptr) {
+    // There may be a few samples left once we flush the resampler_ object, telling it
+    // that the file has finished.  This should rarely make any difference.
     Vector<BaseFloat> appended_wave;
     Vector<BaseFloat> resampled_wave;
     resampler_->Resample(appended_wave, true, &resampled_wave);
 
-    if (waveform_remainder_.Dim() != 0)
-      appended_wave.Range(0, waveform_remainder_.Dim())
-          .CopyFromVec(waveform_remainder_);
-    appended_wave.Range(waveform_remainder_.Dim(), resampled_wave.Dim())
-        .CopyFromVec(resampled_wave);
-    waveform_remainder_.Swap(&appended_wave);
+    if (resampled_wave.Dim() != 0) {
+      appended_wave.Resize(waveform_remainder_.Dim() +
+                           resampled_wave.Dim());
+      if (waveform_remainder_.Dim() != 0)
+        appended_wave.Range(0, waveform_remainder_.Dim())
+            .CopyFromVec(waveform_remainder_);
+      appended_wave.Range(waveform_remainder_.Dim(), resampled_wave.Dim())
+          .CopyFromVec(resampled_wave);
+      waveform_remainder_.Swap(&appended_wave);
+    }
   }
   input_finished_ = true;
   ComputeFeatures();

From 1bcea238aedf284fae1bb00a2a7745bebff16b73 Mon Sep 17 00:00:00 2001
From: Sawyer X <xsawyerx@cpan.org>
Date: Fri, 3 May 2019 00:08:52 +0300
Subject: [PATCH 098/163] [scripts] Use correct compile-time regex syntax in
 split_scp.pl (#3287)

Regular expressions should be written with // to allow the language to do
compile-time checks for the regexp pattern.

This is from `perldoc perlop`:

    If the right argument is an expression rather than a search pattern,
    substitution, or transliteration, it is interpreted as a search pattern at
    run time. Note that this means that its contents will be interpolated
    twice, so

        '\\' =~ q'\\';

    is not ok, as the regex engine will end up trying to compile the pattern
    "\", which it will consider a syntax error.
---
 egs/wsj/s5/utils/split_scp.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/egs/wsj/s5/utils/split_scp.pl b/egs/wsj/s5/utils/split_scp.pl
index d46b7e20521..79c92faac5c 100755
--- a/egs/wsj/s5/utils/split_scp.pl
+++ b/egs/wsj/s5/utils/split_scp.pl
@@ -55,7 +55,7 @@
             die "Invalid num-jobs and job-id: $num_jobs and $job_id";
         }
     }
-    if ($ARGV[0] =~ "--utt2spk=(.+)") {
+    if ($ARGV[0] =~ /--utt2spk=(.+)/) {
         $utt2spk_file=$1;
         shift;
     }

From bfbe8615f36ed03edb99b285306fbd315ebd82b5 Mon Sep 17 00:00:00 2001
From: Xiaohui Zhang <samuelzhang1104@gmail.com>
Date: Thu, 2 May 2019 17:35:35 -0400
Subject: [PATCH 099/163] [scripts] Fix a typo in
 steps/dict/learn_lexicon_bayesian.sh (#3288)

---
 egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh b/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh
index 042f8f94da4..adff11dd1b4 100755
--- a/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh
+++ b/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh
@@ -329,7 +329,7 @@ if [ $stage -le 5 ]; then
   # Prune away pronunciations which have low acoustic evidence from the first pass of lattice alignment.
   $cmd $dir/lats_iter1/log/prune_pron_candidates.log steps/dict/internal/prune_pron_candidates.py \
     --variant-counts-ratio $variant_counts_ratio \
-    $dir/lats_iter1/pron_stats.txt $dir/lexicon_phonetic_decoding_pruned.txt $dir/lexiconp_g2p.txt $dir/ref_lexicon.txt \
+    $dir/lats_iter1/pron_stats.txt $dir/lexicon_phonetic_decoding.txt $dir/lexiconp_g2p.txt $dir/ref_lexicon.txt \
     $dir/lexicon_phonetic_decoding_pruned.txt $dir/lexicon_g2p_pruned.txt
 
   # Filter out words which don't appear in the acoustic training data

From 61b2347d421e383de9d7a6cf32f033012c625185 Mon Sep 17 00:00:00 2001
From: DongjiGao <dgao5@jhu.edu>
Date: Sun, 5 May 2019 16:52:51 -0400
Subject: [PATCH 100/163] [egs,scripts] Scripts and an example of BPE-based
 sub-word decoding (#3101)

---
 egs/gale_arabic/s5c/RESULT                    |   4 +
 egs/gale_arabic/s5c/cmd.sh                    |  15 +
 egs/gale_arabic/s5c/conf/decode.config        |   1 +
 egs/gale_arabic/s5c/conf/mfcc.conf            |   1 +
 egs/gale_arabic/s5c/conf/mfcc_hires.conf      |  10 +
 egs/gale_arabic/s5c/conf/online_cmvn.conf     |   1 +
 egs/gale_arabic/s5c/local/bad_segments        |  10 +
 .../s5c/local/chain/compare_wer.sh            |  72 +++
 .../s5c/local/chain/run_chain_common.sh       |  82 ++++
 egs/gale_arabic/s5c/local/chain/run_tdnn.sh   |   1 +
 .../s5c/local/chain/run_tdnn_lstm.sh          |   1 +
 .../s5c/local/chain/tuning/run_tdnn_1a.sh     | 220 +++++++++
 .../local/chain/tuning/run_tdnn_lstm_1a.sh    | 222 +++++++++
 .../s5c/local/nnet3/run_ivector_common.sh     | 182 ++++++++
 egs/gale_arabic/s5c/local/nnet3/run_lstm.sh   |   1 +
 egs/gale_arabic/s5c/local/nnet3/run_tdnn.sh   |   1 +
 .../s5c/local/nnet3/tuning/run_lstm_1a.sh     | 161 +++++++
 .../s5c/local/nnet3/tuning/run_tdnn_1a.sh     |  88 ++++
 .../s5c/local/normalize_transcript_BW.pl      | 111 +++++
 egs/gale_arabic/s5c/local/prepare_data.sh     | 104 +++++
 egs/gale_arabic/s5c/local/prepare_dict.sh     |  48 ++
 .../s5c/local/prepare_dict_subword.sh         |  64 +++
 egs/gale_arabic/s5c/local/prepare_lexicon.py  |  26 ++
 egs/gale_arabic/s5c/local/prepare_lm.sh       |  51 +++
 .../s5c/local/prepare_lm_subword.sh           |  53 +++
 egs/gale_arabic/s5c/local/score.sh            |   6 +
 egs/gale_arabic/s5c/local/split_wer.sh        |  72 +++
 egs/gale_arabic/s5c/local/test_list           |  11 +
 egs/gale_arabic/s5c/local/wer_output_filter   |   4 +
 egs/gale_arabic/s5c/path.sh                   |   5 +
 egs/gale_arabic/s5c/run.sh                    | 131 ++++++
 egs/gale_arabic/s5c/steps                     |   1 +
 egs/gale_arabic/s5c/utils                     |   1 +
 ...make_position_dependent_subword_lexicon.py | 107 +++++
 .../s5/utils/lang/make_subword_lexicon_fst.py | 301 +++++++++++++
 .../s5/utils/subword/prepare_lang_subword.sh  | 423 ++++++++++++++++++
 .../s5/utils/subword/prepare_subword_text.sh  |  48 ++
 egs/wsj/s5/utils/validate_lang.pl             | 144 ++++--
 38 files changed, 2753 insertions(+), 31 deletions(-)
 create mode 100644 egs/gale_arabic/s5c/RESULT
 create mode 100755 egs/gale_arabic/s5c/cmd.sh
 create mode 100644 egs/gale_arabic/s5c/conf/decode.config
 create mode 100644 egs/gale_arabic/s5c/conf/mfcc.conf
 create mode 100644 egs/gale_arabic/s5c/conf/mfcc_hires.conf
 create mode 100644 egs/gale_arabic/s5c/conf/online_cmvn.conf
 create mode 100644 egs/gale_arabic/s5c/local/bad_segments
 create mode 100755 egs/gale_arabic/s5c/local/chain/compare_wer.sh
 create mode 100755 egs/gale_arabic/s5c/local/chain/run_chain_common.sh
 create mode 120000 egs/gale_arabic/s5c/local/chain/run_tdnn.sh
 create mode 120000 egs/gale_arabic/s5c/local/chain/run_tdnn_lstm.sh
 create mode 100755 egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh
 create mode 100755 egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh
 create mode 100755 egs/gale_arabic/s5c/local/nnet3/run_ivector_common.sh
 create mode 120000 egs/gale_arabic/s5c/local/nnet3/run_lstm.sh
 create mode 120000 egs/gale_arabic/s5c/local/nnet3/run_tdnn.sh
 create mode 100755 egs/gale_arabic/s5c/local/nnet3/tuning/run_lstm_1a.sh
 create mode 100755 egs/gale_arabic/s5c/local/nnet3/tuning/run_tdnn_1a.sh
 create mode 100755 egs/gale_arabic/s5c/local/normalize_transcript_BW.pl
 create mode 100755 egs/gale_arabic/s5c/local/prepare_data.sh
 create mode 100755 egs/gale_arabic/s5c/local/prepare_dict.sh
 create mode 100755 egs/gale_arabic/s5c/local/prepare_dict_subword.sh
 create mode 100755 egs/gale_arabic/s5c/local/prepare_lexicon.py
 create mode 100755 egs/gale_arabic/s5c/local/prepare_lm.sh
 create mode 100755 egs/gale_arabic/s5c/local/prepare_lm_subword.sh
 create mode 100755 egs/gale_arabic/s5c/local/score.sh
 create mode 100755 egs/gale_arabic/s5c/local/split_wer.sh
 create mode 100644 egs/gale_arabic/s5c/local/test_list
 create mode 100755 egs/gale_arabic/s5c/local/wer_output_filter
 create mode 100755 egs/gale_arabic/s5c/path.sh
 create mode 100755 egs/gale_arabic/s5c/run.sh
 create mode 120000 egs/gale_arabic/s5c/steps
 create mode 120000 egs/gale_arabic/s5c/utils
 create mode 100755 egs/wsj/s5/utils/lang/make_position_dependent_subword_lexicon.py
 create mode 100755 egs/wsj/s5/utils/lang/make_subword_lexicon_fst.py
 create mode 100755 egs/wsj/s5/utils/subword/prepare_lang_subword.sh
 create mode 100755 egs/wsj/s5/utils/subword/prepare_subword_text.sh

diff --git a/egs/gale_arabic/s5c/RESULT b/egs/gale_arabic/s5c/RESULT
new file mode 100644
index 00000000000..d56c9e2dbc6
--- /dev/null
+++ b/egs/gale_arabic/s5c/RESULT
@@ -0,0 +1,4 @@
+%WER 41.98 [ 29249 / 69668, 2672 ins, 5990 del, 20587 sub ] exp/tri1_subword/decode/wer_15_0.0
+%WER 37.66 [ 26239 / 69668, 2660 ins, 5255 del, 18324 sub ] exp/tri2b_subword/decode/wer_17_0.0
+%WER 35.26 [ 24565 / 69668, 2879 ins, 4892 del, 16794 sub ] exp/tri3b_subword/decode/wer_17_0.5
+%WER 17.29 [ 12049 / 69668, 1244 ins, 2758 del, 8047 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_10_0.5
diff --git a/egs/gale_arabic/s5c/cmd.sh b/egs/gale_arabic/s5c/cmd.sh
new file mode 100755
index 00000000000..ea341c98d4a
--- /dev/null
+++ b/egs/gale_arabic/s5c/cmd.sh
@@ -0,0 +1,15 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd="retry.pl queue.pl --mem 2G"
+export decode_cmd="retry.pl queue.pl --mem 4G"
+export mkgraph_cmd="retry.pl queue.pl --mem 8G"
diff --git a/egs/gale_arabic/s5c/conf/decode.config b/egs/gale_arabic/s5c/conf/decode.config
new file mode 100644
index 00000000000..6f503eab35e
--- /dev/null
+++ b/egs/gale_arabic/s5c/conf/decode.config
@@ -0,0 +1 @@
+link decode_dnn.config
\ No newline at end of file
diff --git a/egs/gale_arabic/s5c/conf/mfcc.conf b/egs/gale_arabic/s5c/conf/mfcc.conf
new file mode 100644
index 00000000000..7361509099f
--- /dev/null
+++ b/egs/gale_arabic/s5c/conf/mfcc.conf
@@ -0,0 +1 @@
+--use-energy=false   # only non-default option.
diff --git a/egs/gale_arabic/s5c/conf/mfcc_hires.conf b/egs/gale_arabic/s5c/conf/mfcc_hires.conf
new file mode 100644
index 00000000000..c45f2b691a9
--- /dev/null
+++ b/egs/gale_arabic/s5c/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training.
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why
+# we prefer this method.
+--use-energy=false   # use average of log energy, not energy.
+--sample-frequency=16000 
+--num-mel-bins=40    
+--num-ceps=40   
+--low-freq=40    # low cutoff frequency for mel bins
+--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600)
diff --git a/egs/gale_arabic/s5c/conf/online_cmvn.conf b/egs/gale_arabic/s5c/conf/online_cmvn.conf
new file mode 100644
index 00000000000..cbdaf5f281c
--- /dev/null
+++ b/egs/gale_arabic/s5c/conf/online_cmvn.conf
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used in the script ../local/online/run_online_decoding_nnet2.sh
diff --git a/egs/gale_arabic/s5c/local/bad_segments b/egs/gale_arabic/s5c/local/bad_segments
new file mode 100644
index 00000000000..c3413f0714c
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/bad_segments
@@ -0,0 +1,10 @@
+ARABIYA_FROMIRAQ_ARB_20070302_175801_2326286_2327450
+ARABIYA_BILARABI_ARB_20061005_201400_221375_223694
+LBC_NAHAR_ARB_20060911_142800_3683267_3685290
+LBC_NAHAR_ARB_20070303_145800_3249800_3251128
+LBC_NAHAR_ARB_20070303_145800_3623646_3624152
+LBC_NAHAR_ARB_20070305_035800_481003_484069
+ALAM_WITHEVENT_ARB_20070227_205800_3141876_3144152
+ALAM_NEWSRPT_ARB_20070130_015801_2875054_2876396
+ALJZ_TODHARV_ARB_20060914_155800_2947717_2949041
+ALJZ_TODHARV_ARB_20070107_145800_2417848_2419238
diff --git a/egs/gale_arabic/s5c/local/chain/compare_wer.sh b/egs/gale_arabic/s5c/local/chain/compare_wer.sh
new file mode 100755
index 00000000000..1a40523355a
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/chain/compare_wer.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+# this script is used for comparing decoding results between systems.
+# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b}
+
+# ./local/chain/compare_wer.sh exp/chain/cnn1a
+# System                          cnn1a
+# WER                              0.61
+# CER                              0.15
+# Final train prob              -0.0377
+# Final valid prob              -0.0380
+# Final train prob (xent)       -0.0830
+# Final valid prob (xent)       -0.0838
+
+if [ $# == 0 ]; then
+  echo "Usage: $0: <dir1> [<dir2> ... ]"
+  echo "e.g.: $0 exp/chain/cnn{1a,1b}"
+  exit 1
+fi
+
+echo "# $0 $*"
+used_epochs=false
+
+echo -n "# System                     "
+for x in $*; do   printf "% 10s" " $(basename $x)";   done
+echo
+
+echo -n "# WER                        "
+for x in $*; do
+  wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}')
+  printf "% 10s" $wer
+done
+echo
+
+echo -n "# CER                        "
+for x in $*; do
+  cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}')
+  printf "% 10s" $cer
+done
+echo
+
+if $used_epochs; then
+  exit 0;  # the diagnostics aren't comparable between regular and discriminatively trained systems.
+fi
+
+echo -n "# Final train prob           "
+for x in $*; do
+  prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}')
+  printf "% 10s" $prob
+done
+echo
+
+echo -n "# Final valid prob           "
+for x in $*; do
+  prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}')
+  printf "% 10s" $prob
+done
+echo
+
+echo -n "# Final train prob (xent)    "
+for x in $*; do
+  prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}')
+  printf "% 10s" $prob
+done
+echo
+
+echo -n "# Final valid prob (xent)    "
+for x in $*; do
+  prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}')
+  printf "% 10s" $prob
+done
+echo
diff --git a/egs/gale_arabic/s5c/local/chain/run_chain_common.sh b/egs/gale_arabic/s5c/local/chain/run_chain_common.sh
new file mode 100755
index 00000000000..da37e148441
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/chain/run_chain_common.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+# This script contains the common stages shared across the chain recipes.
+# It generates a new topology in a new lang directory, gets the alignments as
+# lattices, and builds a tree for the new topology
+set -e
+
+stage=11
+
+# input directory names. These options are actually compulsory, and they have
+# been named for convenience
+gmm_dir=
+ali_dir=
+lores_train_data_dir=
+
+num_leaves=6000
+
+# output directory names. They are also compulsory.
+lang=
+lat_dir=
+tree_dir=
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+[ -z $lang ] && echo "Set --lang, this specifies the new lang directory which will have the new topology" && exit 1;
+[ -z $lat_dir ] && echo "Set --lat-dir, this specifies the experiment directory to store lattice" && exit 1;
+[ -z $tree_dir ] && echo "Set --tree-dir, this specifies the directory to store new tree " && exit 1;
+
+for f in $gmm_dir/final.mdl $ali_dir/ali.1.gz $lores_train_data_dir/feats.scp; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+if [ $stage -le 11 ]; then
+  echo "$0: creating lang directory with one state per phone."
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  if [ -d $lang ]; then
+    if [ $lang/L.fst -nt data/lang/L.fst ]; then
+      echo "$0: $lang already exists, not overwriting it; continuing"
+    else
+      echo "$0: $lang already exists and seems to be older than data/lang..."
+      echo " ... not sure what to do.  Exiting."
+      exit 1;
+    fi
+  else
+    cp -r data/lang $lang
+    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+    # Use our special topology... note that later on may have to tune this
+    # topology.
+    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+  fi
+fi
+
+if [ $stage -le 12 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat ${ali_dir}/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \
+    $lang $gmm_dir $lat_dir
+  rm $lat_dir/fsts.*.gz # save space
+fi
+
+if [ $stage -le 13 ]; then
+  # Build a tree using our new topology. We know we have alignments for the
+  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
+  # those.
+  if [ -f $tree_dir/final.mdl ]; then
+    echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+    exit 1;
+  fi
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --context-opts "--context-width=2 --central-position=1" \
+      --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir
+fi
+
+exit 0;
diff --git a/egs/gale_arabic/s5c/local/chain/run_tdnn.sh b/egs/gale_arabic/s5c/local/chain/run_tdnn.sh
new file mode 120000
index 00000000000..34499362831
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/chain/run_tdnn.sh
@@ -0,0 +1 @@
+tuning/run_tdnn_1a.sh
\ No newline at end of file
diff --git a/egs/gale_arabic/s5c/local/chain/run_tdnn_lstm.sh b/egs/gale_arabic/s5c/local/chain/run_tdnn_lstm.sh
new file mode 120000
index 00000000000..8e647598556
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/chain/run_tdnn_lstm.sh
@@ -0,0 +1 @@
+tuning/run_tdnn_lstm_1a.sh
\ No newline at end of file
diff --git a/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh
new file mode 100755
index 00000000000..bf2e45c9914
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh
@@ -0,0 +1,220 @@
+#!/bin/bash
+
+# ./local/chain/compare_wer.sh exp/chain/tdnn_1a_sp
+# System                      tdnn_1a_sp
+# WER                             16.47
+# CER                              6.68
+# Final train prob              -0.0652
+# Final valid prob              -0.0831
+# Final train prob (xent)       -0.8965
+# Final valid prob (xent)       -0.9964
+
+# steps/info/chain_dir_info.pl exp/chain/tdnn_1a_sp/
+# exp/chain/tdnn_1a_sp/: num-iters=441 nj=3..16 num-params=18.6M dim=40+100->5816 combine=-0.063->-0.062 (over 6) xent:train/valid[293,440,final]=(-1.22,-0.912,-0.896/-1.29,-1.01,-0.996) logprob:train/valid[293,440,final]=(-0.097,-0.066,-0.065/-0.108,-0.084,-0.083)
+
+
+set -e -o pipefail
+stage=0
+nj=30
+train_set=train
+test_set=test
+gmm=tri3b        # this is the source gmm-dir that we'll use for alignments; it
+                 # should have alignments for the specified training data.
+num_threads_ubm=32
+nnet3_affix=       # affix for exp dirs, e.g. it was _cleaned in tedlium.
+
+# Options which are not passed through to run_ivector_common.sh
+affix=_1a   # affix for TDNN directory, e.g. "_1a" or "_1b", in case we change the configuration.
+common_egs_dir=
+reporting_email=
+
+# LSTM/chain options
+train_stage=-10
+xent_regularize=0.1
+dropout_schedule='0,0@0.20,0.5@0.50,0'
+
+# training chunk-options
+chunk_width=150,110,100
+get_egs_stage=-10
+
+# training options
+srand=0
+remove_egs=true
+run_ivector_common=true
+run_chain_common=true
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+if $run_ivector_common; then
+  local/nnet3/run_ivector_common.sh \
+    --stage $stage --nj $nj \
+    --train-set $train_set --gmm $gmm \
+    --num-threads-ubm $num_threads_ubm \
+    --nnet3-affix "$nnet3_affix"
+fi
+
+gmm_dir=exp/${gmm}
+ali_dir=exp/${gmm}_ali_${train_set}_sp
+lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats
+dir=exp/chain${nnet3_affix}/tdnn${affix}_sp
+train_data_dir=data/${train_set}_sp_hires
+train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
+lores_train_data_dir=data/${train_set}_sp
+
+# note: you don't necessarily have to change the treedir name
+# each time you do a new experiment-- only if you change the
+# configuration in a way that affects the tree.
+tree_dir=exp/chain${nnet3_affix}/tree_a_sp
+# the 'lang' directory is created by this script.
+# If you create such a directory with a non-standard topology
+# you should probably name it differently.
+lang=data/lang_chain
+
+for f in $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
+    $lores_train_data_dir/feats.scp $gmm_dir/final.mdl \
+    $ali_dir/ali.1.gz $gmm_dir/final.mdl; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+# Please take this as a reference on how to specify all the options of
+# local/chain/run_chain_common.sh
+if $run_chain_common; then
+  local/chain/run_chain_common.sh --stage $stage \
+                                  --gmm-dir $gmm_dir \
+                                  --ali-dir $ali_dir \
+                                  --lores-train-data-dir ${lores_train_data_dir} \
+                                  --lang $lang \
+                                  --lat-dir $lat_dir \
+                                  --num-leaves 7000 \
+                                  --tree-dir $tree_dir || exit 1;
+fi
+
+if [ $stage -le 15 ]; then
+  mkdir -p $dir
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
+  affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true"
+  tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66"
+  linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0"
+  prefinal_opts="l2-regularize=0.01"
+  output_opts="l2-regularize=0.002"
+
+  mkdir -p $dir/configs
+
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536
+  tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
+  tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
+  tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
+  tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0
+  tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  linear-component name=prefinal-l dim=256 $linear_opts
+  prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256
+  output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
+  prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+
+if [ $stage -le 16 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd "$decode_cmd" \
+    --feat.online-ivector-dir $train_ivector_dir \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.0 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --trainer.dropout-schedule $dropout_schedule \
+    --trainer.srand=$srand \
+    --trainer.max-param-change=2.0 \
+    --trainer.num-epochs 6 \
+    --trainer.frames-per-iter 1500000 \
+    --trainer.optimization.num-jobs-initial 3 \
+    --trainer.optimization.num-jobs-final 16 \
+    --trainer.optimization.initial-effective-lrate 0.00025 \
+    --trainer.optimization.final-effective-lrate 0.000025 \
+    --trainer.num-chunk-per-minibatch=64,32 \
+    --trainer.add-option="--optimization.memory-compression-level=2" \
+    --egs.chunk-width=$chunk_width \
+    --egs.dir="$common_egs_dir" \
+    --egs.opts "--frames-overlap-per-eg 0 --constrained false" \
+    --egs.stage $get_egs_stage \
+    --reporting.email="$reporting_email" \
+    --cleanup.remove-egs=$remove_egs \
+    --feat-dir=$train_data_dir \
+    --tree-dir $tree_dir \
+    --lat-dir=$lat_dir \
+    --dir $dir  || exit 1;
+
+fi
+
+if [ $stage -le 17 ]; then
+  # The reason we are using data/lang_test here, instead of $lang, is just to
+  # emphasize that it's not actually important to give mkgraph.sh the
+  # lang directory with the matched topology (since it gets the
+  # topology file from the model).  So you could give it a different
+  # lang directory, one that contained a wordlist and LM of your choice,
+  # as long as phones.txt was compatible.
+
+  utils/lang/check_phones_compatible.sh \
+    data/lang_test/phones.txt $lang/phones.txt
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 data/lang_test \
+    $tree_dir $tree_dir/graph || exit 1;
+fi
+
+if [ $stage -le 18 ]; then
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  rm $dir/.error 2>/dev/null || true
+
+    steps/nnet3/decode.sh \
+      --acwt 1.0 --post-decode-acwt 10.0 \
+      --extra-left-context 0 --extra-right-context 0 \
+      --extra-left-context-initial 0 \
+      --extra-right-context-final 0 \
+      --frames-per-chunk $frames_per_chunk \
+      --nj $nj --cmd "$decode_cmd"  --num-threads 4 \
+      --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test_set}_hires \
+      $tree_dir/graph data/${test_set}_hires ${dir}/decode_${test_set} || exit 1
+fi
diff --git a/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh
new file mode 100755
index 00000000000..deebafc95e4
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -0,0 +1,222 @@
+#!/bin/bash
+
+#started from tedlium recipe with few edits
+
+
+set -e -o pipefail
+
+# First the options that are passed through to run_ivector_common.sh
+# (some of which are also used in this script directly).
+stage=17
+nj=30
+decode_nj=30
+min_seg_len=1.55
+chunk_left_context=40
+chunk_right_context=0
+label_delay=5
+xent_regularize=0.1
+train_set=train
+gmm=tri2b # the gmm for the target data
+num_threads_ubm=32
+nnet3_affix=_cleaned  # cleanup affix for nnet3 and chain dirs, e.g. _cleaned
+# decode options
+extra_left_context=50
+extra_right_context=0
+frames_per_chunk=150
+
+# The rest are configs specific to this script.  Most of the parameters
+# are just hardcoded at this level, in the commands below.
+train_stage=-10
+tree_affix=  # affix for tree directory, e.g. "a" or "b", in case we change the configuration.
+tdnn_lstm_affix=1a  #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration.
+common_egs_dir=  # you can set this to use previously dumped egs.
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+local/nnet3/run_ivector_common.sh --stage $stage \
+                                  --nj $nj \
+                                  --min-seg-len $min_seg_len \
+                                  --train-set $train_set \
+                                  --gmm $gmm \
+                                  --num-threads-ubm $num_threads_ubm \
+                                  --nnet3-affix "$nnet3_affix"
+
+gmm_dir=exp/$gmm
+ali_dir=exp/${gmm}_ali_${train_set}_sp_comb
+tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix}
+lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats
+dir=exp/chain${nnet3_affix}/tdnn_lstm${tdnn_lstm_affix}_sp_bi
+train_data_dir=data/${train_set}_sp_hires_comb
+lores_train_data_dir=data/${train_set}_sp_comb
+train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb
+
+
+for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
+    $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz $gmm_dir/final.mdl; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+if [ $stage -le 14 ]; then
+  echo "$0: creating lang directory with one state per phone."
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  if [ -d data/lang_chain ]; then
+    if [ data/lang_chain/L.fst -nt data/lang/L.fst ]; then
+      echo "$0: data/lang_chain already exists, not overwriting it; continuing"
+    else
+      echo "$0: data/lang_chain already exists and seems to be older than data/lang..."
+      echo " ... not sure what to do.  Exiting."
+      exit 1;
+    fi
+  else
+    cp -r data/lang data/lang_chain
+    silphonelist=$(cat data/lang_chain/phones/silence.csl) || exit 1;
+    nonsilphonelist=$(cat data/lang_chain/phones/nonsilence.csl) || exit 1;
+    # Use our special topology... note that later on may have to tune this
+    # topology.
+    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >data/lang_chain/topo
+  fi
+fi
+
+if [ $stage -le 15 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # note: the number of jobs is hard-coded to 100 here, not read from the alignment dir
+  steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \
+    data/lang $gmm_dir $lat_dir
+  rm $lat_dir/fsts.*.gz # save space
+fi
+
+if [ $stage -le 16 ]; then
+  # Build a tree using our new topology.  We know we have alignments for the
+  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
+  # those.
+  if [ -f $tree_dir/final.mdl ]; then
+    echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+    exit 1;
+  fi
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --context-opts "--context-width=2 --central-position=1" \
+      --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir
+fi
+
+
+if [ $stage -le 17 ]; then
+  mkdir -p $dir
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-renorm-layer name=tdnn1 dim=512
+  relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1)
+  fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3
+  relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3)
+  relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3)
+  fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3
+  relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3)
+  relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3)
+  fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3
+
+  ## adding the layers for chain branch
+  output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5
+
+  # adding the layers for xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+
+if [ $stage -le 18 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+  fi
+
+ steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd "$decode_cmd" \
+    --feat.online-ivector-dir $train_ivector_dir \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.00005 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --egs.dir "$common_egs_dir" \
+    --egs.opts "--frames-overlap-per-eg 0" \
+    --egs.chunk-width "$frames_per_chunk" \
+    --egs.chunk-left-context "$chunk_left_context" \
+    --egs.chunk-right-context "$chunk_right_context" \
+    --trainer.num-chunk-per-minibatch 128 \
+    --trainer.frames-per-iter 1500000 \
+    --trainer.max-param-change 2.0 \
+    --trainer.num-epochs 4 \
+    --trainer.deriv-truncate-margin 10 \
+    --trainer.optimization.shrink-value 0.99 \
+    --trainer.optimization.num-jobs-initial 2 \
+    --trainer.optimization.num-jobs-final 3 \
+    --trainer.optimization.initial-effective-lrate 0.001 \
+    --trainer.optimization.final-effective-lrate 0.0001 \
+    --trainer.optimization.momentum 0.0 \
+    --cleanup.remove-egs true \
+    --feat-dir $train_data_dir \
+    --tree-dir $tree_dir \
+    --lat-dir $lat_dir \
+    --dir $dir
+fi
+
+
+
+if [ $stage -le 19 ]; then
+  # Note: it might appear that this data/lang_test directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+fi
+
+if [ $stage -le 20 ]; then
+  steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \
+    --acwt 1.0 --post-decode-acwt 10.0 \
+    --extra-left-context $extra_left_context  \
+    --extra-right-context $extra_right_context  \
+    --frames-per-chunk "$frames_per_chunk" \
+    --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \
+    --scoring-opts "--min-lmwt 5 " \
+    $dir/graph data/test_hires $dir/decode || exit 1;
+fi
+exit 0
diff --git a/egs/gale_arabic/s5c/local/nnet3/run_ivector_common.sh b/egs/gale_arabic/s5c/local/nnet3/run_ivector_common.sh
new file mode 100755
index 00000000000..a03cc5b2fa3
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/nnet3/run_ivector_common.sh
@@ -0,0 +1,182 @@
+#!/bin/bash
+
+set -e -o pipefail
+
+# This script is called from scripts like local/nnet3/run_tdnn.sh and
+# local/chain/run_tdnn.sh (and may eventually be called by more scripts).  It
+# contains the common feature preparation and iVector-related parts of the
+# script.  See those scripts for examples of usage.
+
+
+stage=0
+nj=100
+train_set=train   # you might set this to e.g. train.
+test_sets="test"
+gmm=tri3b # This specifies a GMM-dir from the features of the type you're training the system on;
+                         # it should contain alignments for 'train_set'.
+
+num_threads_ubm=32
+nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff
+
+. ./cmd.sh
+. ./path.sh
+. utils/parse_options.sh
+
+
+gmm_dir=exp/${gmm}
+ali_dir=exp/${gmm}_ali_${train_set}_sp
+
+for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do
+  if [ ! -f $f ]; then
+    echo "$0: expected file $f to exist"
+    exit 1
+  fi
+done
+
+
+
+if [ $stage -le 2 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then
+  echo "$0: data/${train_set}_sp_hires/feats.scp already exists."
+  echo " ... Please either remove it, or rerun this script with stage > 2."
+  exit 1
+fi
+
+
+if [ $stage -le 1 ]; then
+  echo "$0: preparing directory for speed-perturbed data"
+  utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: creating high-resolution MFCC features"
+
+  # this shows how you can split across multiple file-systems.  we'll split the
+  # MFCC dir across multiple locations.  You might want to be careful here, if you
+  # have multiple copies of Kaldi checked out and run the same recipe, not to let
+  # them overwrite each other.
+  mfccdir=data/${train_set}_sp_hires/data
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
+    utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
+  fi
+
+  for datadir in ${train_set}_sp ${test_sets}; do
+    utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
+  done
+
+  # do volume-perturbation on the training data prior to extracting hires
+  # features; this helps make trained nnets more invariant to test data volume.
+  utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires
+
+  for datadir in ${train_set}_sp ${test_sets}; do
+    steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \
+      --cmd "$train_cmd" data/${datadir}_hires
+    steps/compute_cmvn_stats.sh data/${datadir}_hires
+    utils/fix_data_dir.sh data/${datadir}_hires
+  done
+fi
+
+if [ $stage -le 3 ]; then
+  echo "$0: computing a subset of data to train the diagonal UBM."
+  mkdir -p exp/nnet3${nnet3_affix}/diag_ubm
+  temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm
+
+  # train a diagonal UBM using a subset of about a quarter of the data
+  num_utts_total=$(wc -l <data/${train_set}_sp_hires/utt2spk)
+  num_utts=$[$num_utts_total/4]
+  utils/data/subset_data_dir.sh data/${train_set}_sp_hires \
+      $num_utts ${temp_data_root}/${train_set}_sp_hires_subset
+
+  echo "$0: computing a PCA transform from the hires data."
+  steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
+      --splice-opts "--left-context=3 --right-context=3" \
+      --max-utts 10000 --subsample 2 \
+       ${temp_data_root}/${train_set}_sp_hires_subset \
+       exp/nnet3${nnet3_affix}/pca_transform
+
+  echo "$0: training the diagonal UBM."
+  # Use 512 Gaussians in the UBM.
+  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \
+    --num-frames 700000 \
+    --num-threads $num_threads_ubm \
+    ${temp_data_root}/${train_set}_sp_hires_subset 512 \
+    exp/nnet3${nnet3_affix}/pca_transform exp/nnet3${nnet3_affix}/diag_ubm
+fi
+
+if [ $stage -le 4 ]; then
+  # Train the iVector extractor.  Use all of the speed-perturbed data since iVector extractors
+  # can be sensitive to the amount of data.  The script defaults to an iVector dimension of
+  # 100.
+  echo "$0: training the iVector extractor"
+  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
+    data/${train_set}_sp_hires exp/nnet3${nnet3_affix}/diag_ubm exp/nnet3${nnet3_affix}/extractor || exit 1;
+fi
+
+if [ $stage -le 5 ]; then
+  # note, we don't encode the 'max2' in the name of the ivectordir even though
+  # that's the data we extract the ivectors from, as it's still going to be
+  # valid for the non-'max2' data; the utterance list is the same.
+  ivectordir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then
+    utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/ivectors/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage
+  fi
+  # We extract iVectors on the speed-perturbed training data.  With
+  # --utts-per-spk-max 2, the script pairs the utterances into twos, and treats
+  # each of these pairs as one speaker; this gives more diversity in iVectors.
+  # Note that these are extracted 'online' (they vary within the utterance).
+
+  # Having a larger number of speakers is helpful for generalization, and to
+  # handle per-utterance decoding well (the iVector starts at zero at the beginning
+  # of each pseudo-speaker).
+  temp_data_root=${ivectordir}
+  utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \
+    data/${train_set}_sp_hires ${temp_data_root}/${train_set}_sp_hires_max2
+
+  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \
+    ${temp_data_root}/${train_set}_sp_hires_max2 \
+    exp/nnet3${nnet3_affix}/extractor $ivectordir
+
+  # Also extract iVectors for the test data, but in this case we don't need the speed
+  # perturbation (sp).
+  for data in ${test_sets}; do
+    steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \
+      data/${data}_hires exp/nnet3${nnet3_affix}/extractor \
+      exp/nnet3${nnet3_affix}/ivectors_${data}_hires
+  done
+fi
+
+if [ -f data/${train_set}_sp/feats.scp ] && [ $stage -le 7 ]; then
+  echo "$0: data/${train_set}_sp/feats.scp already exists.  Refusing to overwrite the features "
+  echo " to avoid wasting time.  Please remove the file and continue if you really mean this."
+  exit 1;
+fi
+
+
+if [ $stage -le 6 ]; then
+  echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)"
+  utils/data/perturb_data_dir_speed_3way.sh \
+    data/${train_set} data/${train_set}_sp
+fi
+
+if [ $stage -le 7 ]; then
+  echo "$0: making MFCC features for low-resolution speed-perturbed data (needed for alignments)"
+  steps/make_mfcc.sh --nj $nj \
+    --cmd "$train_cmd" data/${train_set}_sp
+  steps/compute_cmvn_stats.sh data/${train_set}_sp
+  echo "$0: fixing input data-dir to remove nonexistent features, in case some "
+  echo ".. speed-perturbed segments were too short."
+  utils/fix_data_dir.sh data/${train_set}_sp
+fi
+
+if [ $stage -le 8 ]; then
+  if [ -f $ali_dir/ali.1.gz ]; then
+    echo "$0: alignments in $ali_dir appear to already exist.  Please either remove them "
+    echo " ... or use a later --stage option."
+    exit 1
+  fi
+  echo "$0: aligning with the perturbed low-resolution data"
+  steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
+    data/${train_set}_sp data/lang $gmm_dir $ali_dir
+fi
+
+
+exit 0;
diff --git a/egs/gale_arabic/s5c/local/nnet3/run_lstm.sh b/egs/gale_arabic/s5c/local/nnet3/run_lstm.sh
new file mode 120000
index 00000000000..c53740399ce
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/nnet3/run_lstm.sh
@@ -0,0 +1 @@
+tuning/run_lstm_1a.sh
\ No newline at end of file
diff --git a/egs/gale_arabic/s5c/local/nnet3/run_tdnn.sh b/egs/gale_arabic/s5c/local/nnet3/run_tdnn.sh
new file mode 120000
index 00000000000..34499362831
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/nnet3/run_tdnn.sh
@@ -0,0 +1 @@
+tuning/run_tdnn_1a.sh
\ No newline at end of file
diff --git a/egs/gale_arabic/s5c/local/nnet3/tuning/run_lstm_1a.sh b/egs/gale_arabic/s5c/local/nnet3/tuning/run_lstm_1a.sh
new file mode 100755
index 00000000000..7f7b8b3ba56
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/nnet3/tuning/run_lstm_1a.sh
@@ -0,0 +1,161 @@
+#!/bin/bash
+
+#started from tedlium recipe with few edits
+
+
+set -e -o pipefail -u
+
+# First the options that are passed through to run_ivector_common.sh
+# (some of which are also used in this script directly).
+stage=0
+nj=30
+decode_nj=30
+min_seg_len=1.55
+train_set=train
+gmm=tri2b  # this is the source gmm-dir for the data-type of interest; it
+                  # should have alignments for the specified training data.
+num_threads_ubm=32
+nnet3_affix=_cleaned  # cleanup affix for exp dirs, e.g. _cleaned
+
+# Options which are not passed through to run_ivector_common.sh
+affix=
+common_egs_dir=
+reporting_email=
+
+# LSTM options
+train_stage=-10
+splice_indexes="-2,-1,0,1,2 0 0"
+lstm_delay=" -1 -2 -3 "
+label_delay=5
+num_lstm_layers=3
+cell_dim=1024
+hidden_dim=1024
+recurrent_projection_dim=256
+non_recurrent_projection_dim=256
+chunk_width=20
+chunk_left_context=40
+chunk_right_context=0
+max_param_change=2.0
+
+# training options
+srand=0
+num_epochs=6
+initial_effective_lrate=0.0003
+final_effective_lrate=0.00003
+num_jobs_initial=2
+num_jobs_final=3
+momentum=0.5
+num_chunk_per_minibatch=100
+samples_per_iter=20000
+remove_egs=true
+
+#decode options
+extra_left_context=
+extra_right_context=
+frames_per_chunk=
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# Shared feature/iVector preparation.  Only options that run_ivector_common.sh
+# itself declares are forwarded; parse_options.sh aborts on unknown ones.
+local/nnet3/run_ivector_common.sh --stage $stage \
+                                  --nj $nj \
+                                  --train-set $train_set \
+                                  --gmm $gmm \
+                                  --num-threads-ubm $num_threads_ubm \
+                                  --nnet3-affix "$nnet3_affix"
+
+# The directories below must match the ones written by run_ivector_common.sh.
+gmm_dir=exp/${gmm}
+graph_dir=$gmm_dir/graph
+ali_dir=exp/${gmm}_ali_${train_set}_sp
+dir=exp/nnet3${nnet3_affix}/lstm${affix:+_$affix}
+if [ $label_delay -gt 0 ]; then dir=${dir}_ld$label_delay; fi
+dir=${dir}_sp
+train_data_dir=data/${train_set}_sp_hires
+train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
+
+
+for f in $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
+     $graph_dir/HCLG.fst $ali_dir/ali.1.gz $gmm_dir/final.mdl; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+if [ $stage -le 12 ]; then
+  echo "$0: creating neural net configs"
+  config_extra_opts=()
+  [ ! -z "$lstm_delay" ] && config_extra_opts+=(--lstm-delay "$lstm_delay")
+  steps/nnet3/lstm/make_configs.py  "${config_extra_opts[@]}" \
+    --feat-dir $train_data_dir \
+    --ivector-dir $train_ivector_dir \
+    --ali-dir $ali_dir \
+    --num-lstm-layers $num_lstm_layers \
+    --splice-indexes "$splice_indexes " \
+    --cell-dim $cell_dim \
+    --hidden-dim $hidden_dim \
+    --recurrent-projection-dim $recurrent_projection_dim \
+    --non-recurrent-projection-dim $non_recurrent_projection_dim \
+    --label-delay $label_delay \
+    --self-repair-scale-nonlinearity 0.00001 \
+  $dir/configs || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage
+  fi
+  
+  steps/nnet3/train_rnn.py --stage=$train_stage \
+    --cmd="$decode_cmd" \
+    --feat.online-ivector-dir=$train_ivector_dir \
+    --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
+    --trainer.srand=$srand \
+    --trainer.num-epochs=$num_epochs \
+    --trainer.samples-per-iter=$samples_per_iter \
+    --trainer.optimization.num-jobs-initial=$num_jobs_initial \
+    --trainer.optimization.num-jobs-final=$num_jobs_final \
+    --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \
+    --trainer.optimization.final-effective-lrate=$final_effective_lrate \
+    --trainer.optimization.shrink-value 0.99 \
+    --trainer.rnn.num-chunk-per-minibatch=$num_chunk_per_minibatch \
+    --trainer.optimization.momentum=$momentum \
+    --egs.chunk-width=$chunk_width \
+    --egs.chunk-left-context=$chunk_left_context \
+    --egs.chunk-right-context=$chunk_right_context \
+    --egs.dir="$common_egs_dir" \
+    --cleanup.remove-egs=$remove_egs \
+    --cleanup.preserve-model-interval=1 \
+    --use-gpu=true \
+    --feat-dir=$train_data_dir \
+    --ali-dir=$ali_dir \
+    --lang=data/lang \
+    --reporting.email="$reporting_email" \
+    --dir=$dir  || exit 1;
+fi
+
+if [ $stage -le 14 ]; then
+  # Decode-time context / chunk size fall back to the values used in training.
+  [ -z "$extra_left_context" ] && extra_left_context=$chunk_left_context;
+  [ -z "$extra_right_context" ] && extra_right_context=$chunk_right_context;
+  [ -z "$frames_per_chunk" ] && frames_per_chunk=$chunk_width;
+  rm $dir/.error 2>/dev/null || true
+  steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd"  --num-threads 4 \
+    --extra-left-context $extra_left_context --extra-right-context $extra_right_context \
+    --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \
+    ${graph_dir} data/test_hires ${dir}/decode_test || exit 1
+  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \
+    data/test_hires ${dir}/decode_test ${dir}/decode_test_rescore || exit 1
+fi
+
+exit 0;
diff --git a/egs/gale_arabic/s5c/local/nnet3/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5c/local/nnet3/tuning/run_tdnn_1a.sh
new file mode 100755
index 00000000000..6619df668ef
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/nnet3/tuning/run_tdnn_1a.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+
+# started from tedlium recipe with few edits
+
+set -e -o pipefail -u
+
+# First the options that are passed through to run_ivector_common.sh
+# (some of which are also used in this script directly).
+stage=0
+nj=30
+decode_nj=30
+min_seg_len=1.55
+train_set=train
+gmm=tri2b  # this is the source gmm-dir for the data-type of interest; it
+                  # should have alignments for the specified training data.
+num_threads_ubm=32
+nnet3_affix=_cleaned  # cleanup affix for exp dirs, e.g. _cleaned
+tdnn_affix=  #affix for TDNN directory e.g. "a" or "b", in case we change the configuration.
+
+# Options which are not passed through to run_ivector_common.sh
+train_stage=-10
+splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 -3,3 0 0"
+remove_egs=true
+relu_dim=850
+num_epochs=3
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# Shared feature/iVector preparation.  Only options that run_ivector_common.sh
+# itself declares are forwarded; parse_options.sh aborts on unknown ones.
+local/nnet3/run_ivector_common.sh --stage $stage \
+                                  --nj $nj \
+                                  --train-set $train_set \
+                                  --gmm $gmm \
+                                  --num-threads-ubm $num_threads_ubm \
+                                  --nnet3-affix "$nnet3_affix"
+
+# The directories below must match the ones written by run_ivector_common.sh.
+gmm_dir=exp/${gmm}
+graph_dir=$gmm_dir/graph
+ali_dir=exp/${gmm}_ali_${train_set}_sp
+dir=exp/nnet3${nnet3_affix}/tdnn${tdnn_affix}_sp
+train_data_dir=data/${train_set}_sp_hires
+train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
+
+
+for f in $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
+     $graph_dir/HCLG.fst $ali_dir/ali.1.gz $gmm_dir/final.mdl; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage
+  fi
+   
+  steps/nnet3/tdnn/train.sh --stage $train_stage \
+    --num-epochs $num_epochs --num-jobs-initial 2 --num-jobs-final 2 \
+    --splice-indexes "$splice_indexes" \
+    --feat-type raw \
+    --online-ivector-dir ${train_ivector_dir} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate 0.0015 --final-effective-lrate 0.00015 \
+    --cmd "$decode_cmd" \
+    --relu-dim "$relu_dim" \
+    --remove-egs "$remove_egs" \
+    $train_data_dir data/lang $ali_dir $dir
+fi
+
+if [ $stage -le 13 ]; then
+  steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd"  --num-threads 4 \
+    --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \
+    ${graph_dir} data/test_hires ${dir}/decode || exit 1 
+fi
+
+exit 0;
diff --git a/egs/gale_arabic/s5c/local/normalize_transcript_BW.pl b/egs/gale_arabic/s5c/local/normalize_transcript_BW.pl
new file mode 100755
index 00000000000..df01c5d7b85
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/normalize_transcript_BW.pl
@@ -0,0 +1,111 @@
+#!/usr/bin/env perl
+
+# Copyright 2014 QCRI (author: Ahmed Ali)
+# Apache 2.0
+
+use warnings;
+use strict;
+use Encode;
+use utf8;
+
+
+
+if (@ARGV !=2 )
+    {#
+	print "usage: $0 <inFile> <onlyArabicFile>\n"; 
+	exit (1);   
+    }
+    
+# <\check usage>
+my $inFile = shift (@ARGV);
+my $ouFile = shift(@ARGV);
+
+
+# Use low-precedence 'or' so that die guards open() itself, not the filename string.
+open INFILE, "<$inFile" or die "unable to open the input file $inFile\n";
+binmode INFILE, ":encoding(utf8)";
+
+open OUTPUTFILE, ">$ouFile" or die "unable to open the output mlf file $ouFile\n";
+binmode OUTPUTFILE, ":encoding(utf8)";
+
+
+while (<INFILE>) {
+  s/[^اأإآبتثجحخدذرزسشصضطظعغفقكلمنهويىئءؤة0-9]+/ /g;  ## Removes non Arabic or numbers
+  my $BW = convertUTF8ToBuckwalter ($_);
+  print OUTPUTFILE "$BW"."\n";
+}
+close INFILE;
+close OUTPUTFILE;
+
+
+
+# this function is copied from MADATools.pm: MADA Tools
+ sub convertUTF8ToBuckwalter {
+
+    my ($line)= (@_);
+    #$line = $UTF8_ENCODING_OBJ->decode($line);  ## Same as Encode::decode("utf8",$line), but faster since object already created
+    $line =~ s/\x{0621}/\'/g;   ## HAMZA
+    $line =~ s/\x{0622}/\|/g;   ## ALEF WITH MADDA ABOVE
+    $line =~ s/\x{0623}/\>/g;   ## ALEF WITH HAMZA ABOVE
+    $line =~ s/\x{0624}/\&/g;   ## WAW WITH HAMZA ABOVE
+    $line =~ s/\x{0625}/\</g;   ## ALEF WITH HAMZA BELOW
+    $line =~ s/\x{0626}/\}/g;   ## YEH WITH HAMZA ABOVE
+    $line =~ s/\x{0627}/A/g;    ## ALEF
+    $line =~ s/\x{0628}/b/g;    ## BEH
+    $line =~ s/\x{0629}/p/g;    ## TEH MARBUTA
+    $line =~ s/\x{062A}/t/g;    ## TEH
+    $line =~ s/\x{062B}/v/g;    ## THEH
+    $line =~ s/\x{062C}/j/g;    ## JEEM
+    $line =~ s/\x{062D}/H/g;    ## HAH
+    $line =~ s/\x{062E}/x/g;    ## KHAH
+    $line =~ s/\x{062F}/d/g;    ## DAL
+    $line =~ s/\x{0630}/\*/g;   ## THAL
+    $line =~ s/\x{0631}/r/g;    ## REH
+    $line =~ s/\x{0632}/z/g;    ## ZAIN
+    $line =~ s/\x{0633}/s/g;    ## SEEN
+    $line =~ s/\x{0634}/\$/g;   ## SHEEN
+    $line =~ s/\x{0635}/S/g;    ## SAD
+    $line =~ s/\x{0636}/D/g;    ## DAD
+    $line =~ s/\x{0637}/T/g;    ## TAH
+    $line =~ s/\x{0638}/Z/g;    ## ZAH
+    $line =~ s/\x{0639}/E/g;    ## AIN
+    $line =~ s/\x{063A}/g/g;    ## GHAIN
+    $line =~ s/\x{0640}/_/g;    ## TATWEEL
+    $line =~ s/\x{0641}/f/g;    ## FEH
+    $line =~ s/\x{0642}/q/g;    ## QAF
+    $line =~ s/\x{0643}/k/g;    ## KAF
+    $line =~ s/\x{0644}/l/g;    ## LAM
+    $line =~ s/\x{0645}/m/g;    ## MEEM
+    $line =~ s/\x{0646}/n/g;    ## NOON
+    $line =~ s/\x{0647}/h/g;    ## HEH
+    $line =~ s/\x{0648}/w/g;    ## WAW
+    $line =~ s/\x{0649}/Y/g;    ## ALEF MAKSURA
+    $line =~ s/\x{064A}/y/g;    ## YEH
+
+    ## Diacritics
+    $line =~ s/\x{064B}/F/g;    ## FATHATAN
+    $line =~ s/\x{064C}/N/g;    ## DAMMATAN
+    $line =~ s/\x{064D}/K/g;    ## KASRATAN
+    $line =~ s/\x{064E}/a/g;    ## FATHA
+    $line =~ s/\x{064F}/u/g;    ## DAMMA
+    $line =~ s/\x{0650}/i/g;    ## KASRA
+    $line =~ s/\x{0651}/\~/g;   ## SHADDA
+    $line =~ s/\x{0652}/o/g;    ## SUKUN
+    $line =~ s/\x{0670}/\`/g;   ## SUPERSCRIPT ALEF
+
+    $line =~ s/\x{0671}/\{/g;   ## ALEF WASLA
+    $line =~ s/\x{067E}/P/g;    ## PEH
+    $line =~ s/\x{0686}/J/g;    ## TCHEH
+    $line =~ s/\x{06A4}/V/g;    ## VEH
+    $line =~ s/\x{06AF}/G/g;    ## GAF
+
+
+    ## Punctuation should really be handled by the utf8 cleaner or other method
+#   $line =~ s/\xa2/\,/g; # comma
+#    $line =~ s//\,/g; # comma
+#    $line =~ s//\,/g;
+#    $line =~ s//\;/g; # semicolon
+#    $line =~ s//\?/g; # questionmark
+
+    return $line;
+}
diff --git a/egs/gale_arabic/s5c/local/prepare_data.sh b/egs/gale_arabic/s5c/local/prepare_data.sh
new file mode 100755
index 00000000000..aea9ba2dc8e
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/prepare_data.sh
@@ -0,0 +1,104 @@
+#!/bin/bash 
+
+# Copyright 2014 QCRI (author: Ahmed Ali)
+# Apache 2.0
+
+dir1=/export/corpora/LDC/LDC2013S02/
+dir2=/export/corpora/LDC/LDC2013S07/
+dir3=/export/corpora/LDC/LDC2014S07/
+text1=/export/corpora/LDC/LDC2013T17/
+text2=/export/corpora/LDC/LDC2013T04/
+text3=/export/corpora/LDC/LDC2014T17/
+gale_data=GALE
+
+mkdir -p $gale_data 
+# check that sox is installed 
+which sox  &>/dev/null
+if [[ $? != 0 ]]; then 
+ echo "$0: sox is not installed"; exit 1
+fi
+
+for dvd in $dir1 $dir2 $dir3; do  # check here: inside the pipeline below 'exit' would only leave a subshell
+  dvd_full_path=$(utils/make_absolute.sh $dvd)
+  [[ -e $dvd_full_path ]] || { echo "$0: missing $dvd_full_path" >&2; exit 1; }
+done
+for dvd in $dir1 $dir2 $dir3; do
+  find $(utils/make_absolute.sh $dvd) \( -name "*.wav" -o -name "*.flac" \)  | while read -r file; do
+    id=$(basename $file | awk '{sub(/\.(wav|flac)$/,"");print}')  # strip only the audio extension
+    echo "$id sox $file -r 16000 -t wav - |"
+  done
+done | sort -u > $gale_data/wav.scp
+echo "$0:data prep audio succeded"
+
+gale_data=$(utils/make_absolute.sh "GALE" );
+top_pwd=`pwd`
+txtdir=$gale_data/txt
+mkdir -p $txtdir; cd $txtdir
+
+for cdx in $text1 $text2 $text3; do
+  echo "$0:Preparing $cdx"
+  if [[ $cdx  == *.tgz ]] ; then
+     tar -xvf $cdx
+  elif [  -d "$cdx" ]; then
+    ln -s $cdx `basename $cdx`
+  else
+    echo "$0:I don't really know what I shall do with $cdx " >&2
+  fi
+done
+
+find -L . -type f -name "*.tdf" | while read file; do
+sed '1,3d' $file  # delete the first 3 lines
+done >  all.tmp$$
+
+perl -e '
+    ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0];
+    open(IN, "$inFile");
+    open(ID, ">$idFile");
+    open(TXT, ">$txtFile");
+    while (<IN>) {
+      @arr= split /\t/,$_;
+      $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning
+      $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//;
+      if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";}
+      $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n";
+      next if ($rStart == $rEnd);
+      $id =~ s/.sph//g;
+      print ID $id;
+      print TXT "$arr[7]\n";
+ }' "all.tmp$$ allid.tmp$$ contentall.tmp$$"
+
+perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$
+paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}'  > all_1.tmp$$
+
+
+awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/all
+awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' >  $gale_data/report
+awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/conversational
+
+cd ..;
+rm -fr $txtdir
+cd $top_pwd
+echo "$0:dat a prep text succeeded"
+
+mkdir -p data
+dir=$(utils/make_absolute.sh data/)
+grep -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.test
+grep -v -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.train 
+
+for x in test train; do
+ outdir=data/$x
+ file=$gale_data/all.$x 
+ mkdir -p $outdir
+ awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk 
+ cp -pr $outdir/utt2spk $outdir/spk2utt
+ awk '{print $2 " " $1 " " $3 " " $4}' $file  | sort -u > $outdir/segments
+ awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text
+done 
+
+grep -f local/test_list $gale_data/wav.scp > $dir/test/wav.scp
+
+cat $gale_data/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline<seg) >0) {seen[$2]=1;}}
+ {if (seen[$1]) { print $0}}' > $dir/train/wav.scp
+ 
+echo "$0:data prep split succeeded"
+exit 0
diff --git a/egs/gale_arabic/s5c/local/prepare_dict.sh b/egs/gale_arabic/s5c/local/prepare_dict.sh
new file mode 100755
index 00000000000..47b5869fdf1
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/prepare_dict.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+# Copyright 2017 QCRI (author: Ahmed Ali)
+# Apache 2.0
+# This script prepares the dictionary.
+
+set -e
+dir=data/local/dict
+lexicon_url1="http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2";
+lexicon_url2="http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2";
+stage=0
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh || exit 1;
+mkdir -p $dir data/local/lexicon_data
+
+if [ $stage -le 0 ]; then
+  echo "$0: Downloading text for lexicon... $(date)."
+  wget -P data/local/lexicon_data $lexicon_url1
+  wget -P data/local/lexicon_data $lexicon_url2
+  bzcat data/local/lexicon_data/ar-ar_grapheme_lexicon_2016-02-09.bz2  | sed '1,3d' | awk '{print $1}'  >  data/local/lexicon_data/grapheme_lexicon
+  bzcat data/local/lexicon_data/ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >>  data/local/lexicon_data/grapheme_lexicon
+  cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> data/local/lexicon_data/grapheme_lexicon
+fi
+
+
+if [ $stage -le 0 ]; then
+  echo "$0: processing lexicon text and creating lexicon... $(date)."
+  # drop entries containing digits, strip the diacritic symbols (F N K a u i ~ o `) and map the rare alef wasla '{' to '}'
+  grep -v '[0-9]' data/local/lexicon_data/grapheme_lexicon |  sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon
+  local/prepare_lexicon.py
+fi
+
+cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1;
+
+sed -i '1i<UNK> UNK' $dir/lexicon.txt
+
+echo UNK >> $dir/nonsilence_phones.txt
+
+echo '<sil> SIL' >> $dir/lexicon.txt
+
+echo SIL > $dir/silence_phones.txt
+
+echo SIL >$dir/optional_silence.txt
+
+echo -n "" >$dir/extra_questions.txt
+
+echo "$0: Dictionary preparation succeeded"
diff --git a/egs/gale_arabic/s5c/local/prepare_dict_subword.sh b/egs/gale_arabic/s5c/local/prepare_dict_subword.sh
new file mode 100755
index 00000000000..330de664349
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/prepare_dict_subword.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+
+# Copyright 2017 QCRI (author: Ahmed Ali)
+#           2019 Dongji Gao
+# Apache 2.0
+# This script prepares the subword dictionary.
+
+set -e
+dir=data/local/dict
+lexicon_url1="http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2";
+lexicon_url2="http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2";
+num_merges=1000
+stage=0
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh || exit 1;
+mkdir -p $dir data/local/lexicon_data
+
+if [ $stage -le 0 ]; then
+  echo "$0: Downloading text for lexicon... $(date)."
+  wget -P data/local/lexicon_data $lexicon_url1
+  wget -P data/local/lexicon_data $lexicon_url2
+  bzcat data/local/lexicon_data/ar-ar_grapheme_lexicon_2016-02-09.bz2  | sed '1,3d' | awk '{print $1}'  >  data/local/lexicon_data/grapheme_lexicon
+  bzcat data/local/lexicon_data/ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >>  data/local/lexicon_data/grapheme_lexicon
+  cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> data/local/lexicon_data/grapheme_lexicon
+fi
+
+
+if [ $stage -le 0 ]; then
+  echo "$0: processing lexicon text and creating lexicon... $(date)."
+  # drop entries containing digits, strip the diacritic symbols (F N K a u i ~ o `) and map the rare alef wasla '{' to '}'
+  grep -v '[0-9]' data/local/lexicon_data/grapheme_lexicon |  sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon
+  local/prepare_lexicon.py
+fi
+
+cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1;
+
+echo UNK >> $dir/nonsilence_phones.txt
+
+echo SIL > $dir/silence_phones.txt
+
+echo SIL >$dir/optional_silence.txt
+
+echo -n "" >$dir/extra_questions.txt
+
+# Make a subword lexicon based on current word lexicon
+glossaries="<UNK> <sil>"
+if [ $stage -le 0 ]; then
+  echo "$0: making subword lexicon... $(date)."
+  # get pair_code file
+  cut -d ' ' -f2- data/train/text | sed 's/<[^>]*>//g' | utils/lang/bpe/learn_bpe.py -s $num_merges > data/local/pair_code.txt
+  mv $dir/lexicon.txt $dir/lexicon_word.txt
+  # get words
+  cut -d ' ' -f1 $dir/lexicon_word.txt > $dir/words.txt
+  utils/lang/bpe/apply_bpe.py -c data/local/pair_code.txt --glossaries $glossaries < $dir/words.txt | \
+  sed 's/ /\n/g' | sort -u > $dir/subwords.txt
+  sed 's/./& /g' $dir/subwords.txt | sed 's/@ @ //g' | sed 's/*/V/g' | paste -d ' ' $dir/subwords.txt - > $dir/lexicon.txt
+fi
+
+sed -i '1i<UNK> UNK' $dir/lexicon.txt
+
+echo '<sil> SIL' >> $dir/lexicon.txt
+
+echo "$0: Dictionary preparation succeeded"
diff --git a/egs/gale_arabic/s5c/local/prepare_lexicon.py b/egs/gale_arabic/s5c/local/prepare_lexicon.py
new file mode 100755
index 00000000000..215541585eb
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/prepare_lexicon.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+
+# Copyright      2018  Ashish Arora
+# Apache 2.0
+
+# This script prepares lexicon.
+
+import argparse
+import os
+
+parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""")
+args = parser.parse_args()
+
+### main ###
+lex = {}
+text_path = os.path.join('data','local', 'lexicon_data', 'processed_lexicon')
+with open(text_path, 'r', encoding='utf-8') as f:
+    for line in f:
+        line = line.strip()
+        if not line:  # a blank line would become an entry with no word and no phones
+            continue
+        lex[line] = " ".join('V' if char == '*' else char for char in line)  # '*' is written as phone 'V'
+
+with open(os.path.join('data','local','dict', 'lexicon.txt'), 'w', encoding='utf-8') as fp:
+    for key in sorted(lex):
+        fp.write(key + "  " + lex[key] + "\n")
diff --git a/egs/gale_arabic/s5c/local/prepare_lm.sh b/egs/gale_arabic/s5c/local/prepare_lm.sh
new file mode 100755
index 00000000000..6fdf35f471a
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/prepare_lm.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Copyright 2012  Vassil Panayotov
+#           2017  Ewald Enzinger
+# Apache 2.0
+
+. ./path.sh || exit 1
+
+echo "=== Building a language model ..."
+
+dir=data/local/lm/
+text=data/train/text
+lexicon=data/local/dict/lexicon.txt
+# Language model order
+order=3
+
+. utils/parse_options.sh
+
+# Prepare a LM training corpus from the transcripts
+mkdir -p $dir
+
+for f in "$text" "$lexicon"; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+loc=`which ngram-count`;
+if [ -z $loc ]; then
+  if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
+    sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 
+  else
+    sdir=$KALDI_ROOT/tools/srilm/bin/i686
+  fi
+  if [ -f $sdir/ngram-count ]; then
+    echo Using SRILM tools from $sdir
+    export PATH=$PATH:$sdir
+  else
+    echo You appear to not have SRILM tools installed, either on your path,
+    echo or installed in $sdir.  See tools/install_srilm.sh for installation
+    echo instructions.
+    exit 1
+  fi
+fi
+
+cat data/train/text | cut -d " " -f 2- >  $dir/text.txt
+cut -d' ' -f1 $lexicon > $dir/wordlist
+
+ngram-count -text $dir/text.txt -order $order -limit-vocab -vocab $dir/wordlist \
+  -unk -map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.gz
+
+#ngram -lm $dir/lm.gz -ppl $dir/dev.txt
+echo "*** Finished building the LM model!"
diff --git a/egs/gale_arabic/s5c/local/prepare_lm_subword.sh b/egs/gale_arabic/s5c/local/prepare_lm_subword.sh
new file mode 100755
index 00000000000..a5d5c1d1c94
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/prepare_lm_subword.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# Copyright 2012  Vassil Panayotov
+#           2017  Ewald Enzinger
+#           2019  Dongji Gao
+# Apache 2.0
+
+. ./path.sh || exit 1
+
+echo "=== Building a language model ..."
+
+dir=data/local/lm/
+text=data/train/text
+lexicon=data/local/dict/lexicon.txt
+# Language model order
+order=6
+
+. utils/parse_options.sh
+
+# Prepare a LM training corpus from the transcripts
+mkdir -p $dir
+
+for f in "$text" "$lexicon"; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+loc=$(which ngram-count)  # empty when ngram-count is not already on PATH
+if [ -z "$loc" ]; then    # quoted so the test stays well-formed for empty or multi-word output
+  if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
+    sdir="$KALDI_ROOT/tools/srilm/bin/i686-m64"
+  else
+    sdir="$KALDI_ROOT/tools/srilm/bin/i686"
+  fi
+  if [ -f "$sdir/ngram-count" ]; then
+    echo "Using SRILM tools from $sdir"
+    export PATH="$PATH:$sdir"
+  else
+    echo You appear to not have SRILM tools installed, either on your path,
+    echo or installed in $sdir.  See tools/install_srilm.sh for installation
+    echo instructions.
+    exit 1
+  fi
+fi
+
+cut -d " " -f 2- "$text" > $dir/text.txt  # drop field 1 of each line; reads $text so that --text is honoured
+cat data/test/text | cut -d ' ' -f2- > $dir/dev.txt
+cut -d' ' -f1 $lexicon > $dir/wordlist
+
+ngram-count -text $dir/text.txt -order $order -vocab $dir/wordlist \
+  -unk -map-unk "<UNK>" -wbdiscount1 -kndiscount2 -kndiscount3 -kndiscount4 -kndiscount5 -kndiscount6 -interpolate -lm $dir/lm.gz
+
+ngram -order $order -lm $dir/lm.gz -ppl $dir/dev.txt
+echo "*** Finished building the LM model!"
diff --git a/egs/gale_arabic/s5c/local/score.sh b/egs/gale_arabic/s5c/local/score.sh
new file mode 100755
index 00000000000..1d84815fc69
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/score.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# Run the WER scoring script, then the CER scoring script from its stage 2.
+
+steps/scoring/score_kaldi_wer.sh "$@" || exit 1
+steps/scoring/score_kaldi_cer.sh --stage 2 "$@"
diff --git a/egs/gale_arabic/s5c/local/split_wer.sh b/egs/gale_arabic/s5c/local/split_wer.sh
new file mode 100755
index 00000000000..d83a0f79e8c
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/split_wer.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+# Report WER for reports and conversational
+# Copyright 2014 QCRI (author: Ahmed Ali)
+# Apache 2.0
+
+if [ $# -ne 1 ]; then
+   echo "Arguments should be the gale folder, see ../run.sh for example."
+   exit 1;
+fi
+
+[ -f ./path.sh ] && . ./path.sh
+
+
+galeFolder=$(utils/make_absolute.sh $1)
+symtab=./data/lang/words.txt
+find exp/ -maxdepth 3 -type d -name decode\* > list_decode$$
+
+# Split the test set per type: field 2 of all.test, sorted and de-duplicated, goes to test_id$$.
+awk '{print $2}' $galeFolder/all.test | sort -u > $galeFolder/test_id$$
+
+# generate the report test set: lines common to test_id$$ and report_id$$ (comm -1 -2)
+awk '{print $2}' $galeFolder/report | sort -u  > $galeFolder/report_id$$
+comm -1 -2 $galeFolder/test_id$$ $galeFolder/report_id$$ > $galeFolder/report.test
+
+# generate the conversational test set: lines common to test_id$$ and conversational_id$$
+awk '{print $2}' $galeFolder/conversational | sort -u  > $galeFolder/conversational_id$$
+
+comm -1 -2 $galeFolder/test_id$$ $galeFolder/conversational_id$$ > $galeFolder/conversational.test
+
+rm -fr $galeFolder/test_id$$ $galeFolder/report_id$$ $galeFolder/conversational_id$$
+
+min_lmwt=7
+max_lmwt=20
+cat list_decode$$ | while read dir; do
+ for type in report conversational; do
+ #echo "Processing: $dir $type"
+  rm -fr $dir/scoring_$type
+  cp -pr $dir/scoring  $dir/scoring_$type
+  ( cd $dir/scoring_$type;
+    for x in *.tra test_filt.txt; do
+      sort -u $x > tmp$$
+      join tmp$$ $galeFolder/${type}.test > $x
+      rm -fr tmp$$
+    done
+   )
+# Run compute-wer once per LMWT in [min_lmwt, max_lmwt]; output is redirected to $dir/wer_${type}_LMWT.
+utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.LMWT.log \
+   cat $dir/scoring_${type}/LMWT.tra \| \
+    utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
+    compute-wer --text --mode=present \
+     ark:$dir/scoring_${type}/test_filt.txt  ark,p:- ">&" $dir/wer_${type}_LMWT
+done
+done
+
+
+time=$(date +"%Y-%m-%d-%H-%M-%S")
+echo "RESULTS generated by $USER at $time"
+
+echo "Report Results WER:"
+cat list_decode$$ | while read x; do [ -d $x ] && grep WER $x/wer_report_* | utils/best_wer.sh; done | sort -n -k2
+
+echo "Conversational Results WER:"
+cat list_decode$$ | while read x; do [ -d $x ] && grep WER $x/wer_conversational_* | utils/best_wer.sh; done | sort -n -k2
+
+echo "Combined Results for Reports and Conversational WER:"
+cat list_decode$$ | while read x; do [ -d $x ] && grep WER $x/wer_?? $x/wer_?| utils/best_wer.sh; done | sort -n -k2
+
+rm list_decode$$
+
+
+
diff --git a/egs/gale_arabic/s5c/local/test_list b/egs/gale_arabic/s5c/local/test_list
new file mode 100644
index 00000000000..d82cf498804
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/test_list
@@ -0,0 +1,11 @@
+ALAM_WITHEVENT_ARB_20070116_205800
+ALAM_WITHEVENT_ARB_20070130_205800
+ALAM_WITHEVENT_ARB_20070206_205801
+ALAM_WITHEVENT_ARB_20070213_205800
+ALAM_WITHEVENT_ARB_20070227_205800
+ALAM_WITHEVENT_ARB_20070306_205800
+ALAM_WITHEVENT_ARB_20070313_205800
+ARABIYA_FROMIRAQ_ARB_20070216_175800
+ARABIYA_FROMIRAQ_ARB_20070223_175801
+ARABIYA_FROMIRAQ_ARB_20070302_175801
+ARABIYA_FROMIRAQ_ARB_20070309_175800
diff --git a/egs/gale_arabic/s5c/local/wer_output_filter b/egs/gale_arabic/s5c/local/wer_output_filter
new file mode 100755
index 00000000000..fcd40539e7f
--- /dev/null
+++ b/egs/gale_arabic/s5c/local/wer_output_filter
@@ -0,0 +1,4 @@
+#!/bin/sed -f
+s/@@ //g
+s/<sil>//g
+s/<UNK>//g
diff --git a/egs/gale_arabic/s5c/path.sh b/egs/gale_arabic/s5c/path.sh
new file mode 100755
index 00000000000..be11b34cbc6
--- /dev/null
+++ b/egs/gale_arabic/s5c/path.sh
@@ -0,0 +1,5 @@
+export KALDI_ROOT=$(pwd)/../../..
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
diff --git a/egs/gale_arabic/s5c/run.sh b/egs/gale_arabic/s5c/run.sh
new file mode 100755
index 00000000000..3e363816812
--- /dev/null
+++ b/egs/gale_arabic/s5c/run.sh
@@ -0,0 +1,131 @@
+#!/bin/bash -e
+
+# Copyright 2014 QCRI (author: Ahmed Ali)
+#           2019 Dongji Gao
+# Apache 2.0
+
+# This is an example script for subword implementation
+
+num_jobs=120
+num_decode_jobs=40
+decode_gmm=true
+stage=0
+overwrite=false
+num_merges=1000
+
+dir1=/export/corpora/LDC/LDC2013S02/
+dir2=/export/corpora/LDC/LDC2013S07/
+dir3=/export/corpora/LDC/LDC2014S07/
+text1=/export/corpora/LDC/LDC2013T17/
+text2=/export/corpora/LDC/LDC2013T04/
+text3=/export/corpora/LDC/LDC2014T17/
+
+galeData=GALE
+. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
+           ## This relates to the queue.
+. ./path.sh
+. ./utils/parse_options.sh  # e.g. this parses the above options
+                            # if supplied.
+
+if [ $stage -le 0 ]; then
+
+  if [ -f data/train/text ] && ! $overwrite; then
+    echo "$0: Not processing, probably script have run from wrong stage"
+    echo "Exiting with status 1 to avoid data corruption"
+    exit 1;
+  fi
+
+  echo "$0: preparing data..."
+  local/prepare_data.sh --dir1 $dir1 --dir2 $dir2 --dir3 $dir3 \
+                        --text1 $text1 --text2 $text2 --text3 $text3
+
+  echo "$0: Preparing lexicon and LM..." 
+  local/prepare_dict_subword.sh --num_merges $num_merges
+
+  utils/subword/prepare_lang_subword.sh data/local/dict "<UNK>" data/local/lang data/lang
+
+  for set in train test; do
+    utils/subword/prepare_subword_text.sh data/$set/text data/local/pair_code.txt data/$set/text
+  done
+
+  local/prepare_lm_subword.sh
+
+  utils/format_lm.sh data/lang data/local/lm/lm.gz \
+                     data/local/dict/lexicon.txt data/lang_test
+fi
+
+mfccdir=mfcc
+if [ $stage -le 1 ]; then
+  echo "$0: Preparing the test and train feature files..."
+  for x in train test ; do
+    steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \
+      data/$x exp/make_mfcc/$x $mfccdir
+    utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons
+    steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
+  done
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: creating sub-set and training monophone system"
+  utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1;
+
+  steps/train_mono.sh --nj 40 --cmd "$train_cmd" \
+    data/train.10K data/lang exp/mono_subword || exit 1;
+fi
+
+if [ $stage -le 3 ]; then
+  echo "$0: Aligning data using monophone system"
+  steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \
+    data/train data/lang exp/mono_subword exp/mono_ali_subword || exit 1;
+
+  echo "$0: training triphone system with delta features"
+  steps/train_deltas.sh --cmd "$train_cmd" \
+    2500 30000 data/train data/lang exp/mono_ali_subword exp/tri1_subword || exit 1;
+fi
+
+if [ $stage -le 4 ] && $decode_gmm; then
+  utils/mkgraph.sh data/lang_test exp/tri1_subword exp/tri1_subword/graph
+  steps/decode.sh  --nj $num_decode_jobs --cmd "$decode_cmd" \
+    exp/tri1_subword/graph data/test exp/tri1_subword/decode
+fi
+
+if [ $stage -le 5 ]; then
+  echo "$0: Aligning data and retraining and realigning with lda_mllt"
+  steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \
+    data/train data/lang exp/tri1_subword exp/tri1_ali_subword || exit 1;
+
+  steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \
+    data/train data/lang exp/tri1_ali_subword exp/tri2b_subword || exit 1;
+fi
+
+if [ $stage -le 6 ] && $decode_gmm; then
+  utils/mkgraph.sh data/lang_test exp/tri2b_subword exp/tri2b_subword/graph
+  steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \
+    exp/tri2b_subword/graph data/test exp/tri2b_subword/decode
+fi
+
+if [ $stage -le 7 ]; then
+  echo "$0: Aligning data and retraining and realigning with sat_basis"
+  steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \
+    data/train data/lang exp/tri2b_subword exp/tri2b_ali_subword || exit 1;
+
+  steps/train_sat_basis.sh --cmd "$train_cmd" \
+    5000 100000 data/train data/lang exp/tri2b_ali_subword exp/tri3b_subword || exit 1;
+
+  steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \
+    data/train data/lang exp/tri3b_subword exp/tri3b_ali_subword || exit 1;
+fi
+
+if [ $stage -le 8 ] && $decode_gmm; then
+  utils/mkgraph.sh data/lang_test exp/tri3b_subword exp/tri3b_subword/graph
+  steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \
+    "$decode_cmd" exp/tri3b_subword/graph data/test exp/tri3b_subword/decode
+fi
+
+if [ $stage -le 9 ]; then
+  echo "$0: Training a chain model (local/chain/run_tdnn.sh) with --gmm tri3b_subword..."
+  local/chain/run_tdnn.sh --gmm tri3b_subword
+fi
+
+echo "$0: training succeed"
+exit 0
diff --git a/egs/gale_arabic/s5c/steps b/egs/gale_arabic/s5c/steps
new file mode 120000
index 00000000000..1b186770dd1
--- /dev/null
+++ b/egs/gale_arabic/s5c/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps/
\ No newline at end of file
diff --git a/egs/gale_arabic/s5c/utils b/egs/gale_arabic/s5c/utils
new file mode 120000
index 00000000000..a3279dc8679
--- /dev/null
+++ b/egs/gale_arabic/s5c/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils/
\ No newline at end of file
diff --git a/egs/wsj/s5/utils/lang/make_position_dependent_subword_lexicon.py b/egs/wsj/s5/utils/lang/make_position_dependent_subword_lexicon.py
new file mode 100755
index 00000000000..83aa145c946
--- /dev/null
+++ b/egs/wsj/s5/utils/lang/make_position_dependent_subword_lexicon.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+
+# 2019 Dongji Gao
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+from make_lexicon_fst import read_lexiconp
+import argparse
+import math
+
+def get_args():
+    parser = argparse.ArgumentParser(description="""This script creates a
+        position-dependent subword lexicon from a position-independent subword lexicon
+        by adding suffixes ("_B", "_I", "_E", "_S") to the related phones.
+        It assumes that the input lexicon does not contain disambiguation symbols.""")
+    parser.add_argument("--separator", type=str, default="@@", help="""Separator
+        indicates the position of a subword in a word. 
+        Subword ends with separator can only appear at the beginning or middle of a word. 
+        Subword without separator can only appear at the end of a word or is a word itself.
+        E.g. "international -> inter@@ nation@@ al";
+             "nation        -> nation"
+        The separator should match the separator used in the input lexicon.""")
+    parser.add_argument("lexiconp", type=str, help="""Filename of subword position-independent 
+        lexicon with pronunciation probabilities, with lines of the form 'subword prob p1 p2 ...'""")
+    args = parser.parse_args()
+    return args
+
+def is_end(subword, separator):
+    """Return true if the subword can appear at the end of a word (i.e., the subword 
+    does not end with separator). Return false otherwise."""
+    return not subword.endswith(separator)
+
+def write_position_dependent_lexicon(lexiconp, separator):
+    """Print a position-dependent lexicon for each subword from the input lexiconp by adding
+    appropriate suffixes ("_B", "_I", "_E", "_S") to the phone sequence related to the subword.
+    There are 4 types of position-dependent subword:
+    1) Beginning subword. It can only appear at the beginning of a word.
+       The first phone suffix should be "_B" and other suffixes should be "_I"s:
+        nation@@ 1.0 n_B ey_I sh_I ih_I n_I
+        n@@      1.0 n_B
+    2) Middle subword. It can only appear at the middle of a word.
+       All phone suffixes should be "_I"s:
+        nation@@ 1.0 n_I ey_I sh_I ih_I n_I
+    3) End subword. It can only appear at the end of a word.
+       The last phone suffix should be "_E" and other suffixes should be "_I"s:
+        nation   1.0 n_I ey_I sh_I ih_I n_E
+        n        1.0 n_E
+    4) Singleton subword (i.e., the subword is a word itself).
+       The first phone suffix should be "_B" and the last suffix should be "_E".
+       All other suffixes should be "_I"s. If there is only one phone, its suffix should be "_S":
+        nation   1.0 n_B ey_I sh_I ih_I n_E
+        n        1.0 n_S
+    In most cases (i.e., subwords have more than 1 phone), the suffixes of phones in the middle are "_I"s.
+    So the suffix_list is initialized with all _I and we only replace the first and last phone suffix when
+    dealing with different cases when necessary.
+    """
+    for (word, prob, phones) in lexiconp:
+        phones_length = len(phones)
+        # NOTE(review): an entry with no phones would fail below on suffix_list[-1] / suffix_list[0].
+        # suffix_list is initialized by all "_I"s.
+        suffix_list = ["_I" for i in range(phones_length)]
+
+        if is_end(word, separator):
+            # print end subword lexicon by replacing the last phone suffix by "_E"
+            suffix_list[-1] = "_E"
+            phones_list = [phone + suffix for (phone, suffix) in zip(phones, suffix_list)]
+            print("{} {} {}".format(word, prob, ' '.join(phones_list)))
+
+            # print singleton subword lexicon (suffix_list still carries the "_E" set above)
+            # the phone suffix is "_S" if there is only 1 phone.
+            if phones_length == 1:
+                suffix_list[0] = "_S"
+                phones_list = [phone + suffix for (phone, suffix) in zip(phones, suffix_list)]
+                print("{} {} {}".format(word, prob, ' '.join(phones_list)))
+            # the first phone suffix is "_B" if there is more than 1 phone.
+            else:
+                suffix_list[0] = "_B"
+                phones_list = [phone + suffix for (phone, suffix) in zip(phones, suffix_list)]
+                print("{} {} {}".format(word, prob, ' '.join(phones_list)))
+        else:
+            # print middle subword lexicon
+            phones_list = [phone + suffix for (phone, suffix) in zip(phones, suffix_list)]
+            print("{} {} {}".format(word, prob, ' '.join(phones_list)))
+
+            # print beginning subword lexicon by replacing the first phone suffix by "_B"
+            suffix_list[0] = "_B"
+            phones_list = [phone + suffix for (phone, suffix) in zip(phones, suffix_list)]
+            print("{} {} {}".format(word, prob, ' '.join(phones_list)))
+
+def main():
+    args = get_args()
+    lexiconp = read_lexiconp(args.lexiconp)
+    write_position_dependent_lexicon(lexiconp, args.separator)
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/wsj/s5/utils/lang/make_subword_lexicon_fst.py b/egs/wsj/s5/utils/lang/make_subword_lexicon_fst.py
new file mode 100755
index 00000000000..1beec500c13
--- /dev/null
+++ b/egs/wsj/s5/utils/lang/make_subword_lexicon_fst.py
@@ -0,0 +1,301 @@
+#!/usr/bin/env python3
+
+# 2019 Dongji Gao
+# Apache 2.0.
+
+from make_lexicon_fst import read_lexiconp
+import argparse
+import math
+import sys
+
+# see get_args() below for usage message
+def get_args():
+    parser = argparse.ArgumentParser(description="""This script creates the
+        text form of a subword lexicon FST to be compiled by fstcompile using
+        the appropriate symbol tables (phones.txt and words.txt). It will mostly
+        be invoked indirectly via utils/prepare_lang_subword.sh. The output
+        goes to the stdout. This script is the subword version of make_lexicon_fst.py.
+        It only allows optional silence to appear after end-subword or singleton-subword,
+        (i.e., subwords without separator). In this version we do not support
+        pronunciation probability. (i.e., pron-prob = 1.0)""")
+
+    parser.add_argument('--sil-phone', type=str, help="""Text form of
+        optional-silence phone, e.g. 'SIL'. See also the --sil-prob option.""")
+    parser.add_argument('--sil-prob', type=float, default=0.0, help="""Probability
+        of silence between words (including the beginning and end of word sequence).
+        Must be in range [0.0, 1.0). This refer to the optional silence inserted by
+        the lexicon; see the --sil-phone option.""")
+    parser.add_argument('--sil-disambig', type=str, help="""Disambiguation symbol
+        to disambiguate silence, e.g. #5. Will only be supplied if you are creating 
+        the version of L.fst with disambiguation symbols, intended for use with cyclic 
+        G.fst. This symbol was introduced to fix a rather obscure source of nondeterminism 
+        of CLG.fst, that has to do with reordering of disambiguation symbols and phone symbols.""")
+    parser.add_argument('--position-dependent', action="store_true", help="""Whether 
+        the input lexicon is position-dependent.""")
+    parser.add_argument("--separator", type=str, default="@@", help="""Separator
+        indicates the position of a subword in a word.
+        Subword followed by separator can only appear at the beginning or middle of a word.
+        Subword without separator can only appear at the end of a word or is a word itself.
+        E.g. "international -> inter@@ nation@@ al";
+             "nation        -> nation"
+    The separator should match the separator used in the input lexicon.""")
+    parser.add_argument('lexiconp', type=str, help="""Filename of lexicon with
+        pronunciation probabilities (normally lexiconp.txt), with lines of the
+        form 'subword prob p1 p2...', e.g. 'a, 1.0 ay'""")
+    args = parser.parse_args()
+    return args
+
+def contain_disambig_symbol(phones):
+    """Return True if the phone sequence ends with a disambiguation symbol
+    (a symbol starting with "#", e.g. #1, #2), and False otherwise, including
+    for an empty sequence. Only the last element is inspected, since there is
+    at most one disambiguation symbol per phone sequence and it comes last."""
+    return bool(phones) and phones[-1].startswith("#")
+
+def print_arc(src, dest, phone, word, cost):
+    print('{}\t{}\t{}\t{}\t{}'.format(src, dest, phone, word, cost))
+
+def is_end(word, separator):
+    """Return true if the subword can appear at the end of a word (i.e., the subword
+    does not end with separator). Return false otherwise."""
+    return not word.endswith(separator)
+
+def get_suffix(phone):
+    """Return the suffix of a phone. The suffix is in the form of '_B', '_I'..."""
+    if len(phone) < 3:
+        print("{}: invalid phone {} (please check if the phone is position-dependent)".format(
+              sys.argv[0], phone), file=sys.stderr)
+        sys.exit(1)
+    return phone[-2:]
+
+def write_fst_no_silence(lexicon, position_dependent, separator):
+    """Writes the text format of L.fst to the standard output.  This version is for
+    when --sil-prob=0.0, meaning there is no optional silence allowed.
+    loop_state here is the start and final state of the fst. It goes to word_start_state
+    via epsilon transition.
+    In position-independent case, there is no difference between beginning word and
+    middle word. So all subwords with separator would leave from and enter word_start_state.
+    All subword without separator would leave from word_start_state and enter loop_state.
+    This guarantees that the subword sequence can only terminate after a subword without separator.
+
+    In position-dependent case, there are 4 types of position-dependent subword:
+    1) Beginning subword. The first phone suffix should be "_B" and other suffixes should be "_I"s:
+        nation@@ 1.0 n_B ey_I sh_I ih_I n_I
+        n@@      1.0 n_B
+    2) Middle subword. All phone suffixes should be "_I"s:
+        nation@@ 1.0 n_I ey_I sh_I ih_I n_I
+    3) End subword. The last phone suffix should be "_E" and other suffixes should be "_I"s:
+        nation   1.0 n_I ey_I sh_I ih_I n_E
+        n        1.0 n_E
+    4) Singleton subword (i.e., the subword is a word itself).
+       The first phone suffix should be "_B" and the last suffix should be "_E".
+       All other suffixes should be "_I"s. If there is only one phone, its suffix should be "_S":
+        nation   1.0 n_B ey_I sh_I ih_I n_E
+        n        1.0 n_S
+
+    So we need an extra word_internal_state. The beginning word
+    would leave from word_start_state and enter word_internal_state and middle word
+    would leave from and enter word_internal_state. The rest part is same.
+
+      'lexicon' is a list of 3-tuples (subword, pron-prob, prons) as returned by read_lexiconp().
+      'position_dependent', which is true if the lexicon is position-dependent.
+      'separator' is a symbol which indicates the position of a subword in word.
+    """
+    # regular setting
+    loop_state = 0
+    word_start_state = 1
+    next_state = 2
+
+    print_arc(loop_state, word_start_state, "<eps>", "<eps>", 0.0)
+
+    # optional setting for word_internal_state
+    if position_dependent:
+        word_internal_state = next_state
+        next_state += 1
+
+    for (word, pron_prob, phones) in lexicon:
+        pron_cost = 0.0                # do not support pron_prob
+        phones_len = len(phones)
+
+        # set start and end state; NOTE(review): a suffix outside _S/_B/_I/_E assigns nothing here, so the previous entry's states are reused
+        if position_dependent:
+            first_phone_suffix = get_suffix(phones[0])
+            last_phone = phones[-2] if contain_disambig_symbol(phones) else phones[-1]
+            last_phone_suffix = get_suffix(last_phone)
+
+            # singleton word
+            if first_phone_suffix == "_S":
+                current_state = word_start_state
+                end_state = loop_state
+            # set the current_state
+            elif first_phone_suffix == "_B":
+                current_state = word_start_state
+            elif first_phone_suffix == "_I" or first_phone_suffix == "_E":
+                current_state = word_internal_state
+            # then set the end_state
+            if last_phone_suffix == "_B" or last_phone_suffix == "_I":
+                end_state = word_internal_state
+            elif last_phone_suffix == "_E":
+                end_state = loop_state
+        else:
+            current_state = word_start_state
+            end_state = loop_state if is_end(word, separator) else word_start_state
+
+        # print arcs (except the last one) for the subword; only the first arc carries the subword label
+        for i in range(phones_len - 1):
+            word = word if i == 0 else "<eps>"
+            cost = pron_cost if i == 0 else 0.0
+            print_arc(current_state, next_state, phones[i], word, cost)
+            current_state = next_state
+            next_state += 1
+
+        # print the last arc (with an empty phone list, i is -1 and the arc's phone is "<eps>")
+        i = phones_len - 1
+        phone = phones[i] if i >=0 else "<eps>"
+        word = word if i <= 0 else "<eps>"
+        cost = pron_cost if i <= 0 else 0.0
+        print_arc(current_state, end_state, phone, word, cost)
+
+    # set the final state
+    print("{state}\t{final_cost}".format(state=loop_state, final_cost=0.0))
+
+def write_fst_with_silence(lexicon, sil_phone, sil_prob, sil_disambig, position_dependent, separator):
+    """Writes the text format of L.fst to the standard output.  This version is for
+    when --sil-prob is nonzero, meaning optional silence is allowed.
+    start_state has epsilon arcs to loop_state (the final state) and to sil_state;
+    loop_state goes to word_start_state via epsilon transition.
+
+    In position-independent case, there is no difference between beginning word and
+    middle word. So all subwords with separator would leave from and enter word_start_state.
+    All subword without separator would leave from word_start_state and enter loop_state or sil_state.
+    This guarantees that optional silence can only follow a word-end subword and such subwords
+    must appear at the end of the whole subword sequence.
+
+    In position-dependent case, there are 4 types of position-dependent subword:
+    1) Beginning subword. The first phone suffix should be "_B" and other suffixes should be "_I"s:
+        nation@@ 1.0 n_B ey_I sh_I ih_I n_I
+        n@@      1.0 n_B
+    2) Middle subword. All phone suffixes should be "_I"s:
+        nation@@ 1.0 n_I ey_I sh_I ih_I n_I
+    3) End subword. The last phone suffix should be "_E" and other suffixes should be "_I"s:
+        nation   1.0 n_I ey_I sh_I ih_I n_E
+        n        1.0 n_E
+    4) Singleton subword (i.e., the subword is a word itself).
+       The first phone suffix should be "_B" and the last suffix should be "_E".
+       All other suffixes should be "_I"s. If there is only one phone, its suffix should be "_S":
+        nation   1.0 n_B ey_I sh_I ih_I n_E
+        n        1.0 n_S
+
+    So we need an extra word_internal_state. The beginning word
+    would leave from word_start_state and enter word_internal_state and middle word
+    would leave from and enter word_internal_state. The rest part is same.
+
+      'lexicon' is a list of 3-tuples (subword, pron-prob, prons)
+         as returned by read_lexiconp().
+      'sil_prob', which is expected to be strictly between 0.0 and 1.0, is the
+         probability of silence
+      'sil_phone' is the silence phone, e.g. "SIL".
+      'sil_disambig' is either None, or the silence disambiguation symbol, e.g. "#5".
+      'position_dependent', which is True if the lexicon is position-dependent.
+      'separator' is the symbol we use to indicate the position of a subword in word.
+    """
+
+    sil_cost = -math.log(sil_prob)
+    no_sil_cost = -math.log(1 - sil_prob)
+
+    # regular setting
+    start_state = 0
+    loop_state = 1         # also the final state
+    sil_state = 2          # words terminate here when followed by silence; this state
+                           # has a silence transition to loop_state
+    word_start_state = 3   # subword leave from here
+    next_state = 4         # the next un-allocated state, will be incremented as we go
+
+    print_arc(start_state, loop_state, "<eps>", "<eps>", no_sil_cost)
+    print_arc(start_state, sil_state, "<eps>", "<eps>", sil_cost)
+    print_arc(loop_state, word_start_state, "<eps>", "<eps>", 0.0)
+
+    # optional setting for disambig_state
+    if sil_disambig is None:
+        print_arc(sil_state, loop_state, sil_phone, "<eps>", 0.0)
+    else:
+        disambig_state = next_state
+        next_state += 1
+        print_arc(sil_state, disambig_state, sil_phone, "<eps>", 0.0)
+        print_arc(disambig_state, loop_state, sil_disambig, "<eps>", 0.0)
+
+    # optional setting for word_internal_state
+    if position_dependent:
+        word_internal_state = next_state
+        next_state += 1
+
+    for (word, pron_prob, phones) in lexicon:
+        pron_cost = 0.0           # do not support pron_prob
+        phones_len = len(phones)
+        
+        # set start and end state; NOTE(review): a suffix outside _S/_B/_I/_E assigns nothing here, so the previous entry's states are reused
+        if position_dependent:
+            first_phone_suffix = get_suffix(phones[0])
+            last_phone = phones[-2] if contain_disambig_symbol(phones) else phones[-1]
+            last_phone_suffix = get_suffix(last_phone)
+
+            # singleton subword
+            if first_phone_suffix == "_S":
+                current_state = word_start_state
+                end_state_list = [loop_state, sil_state]
+                end_cost_list = [no_sil_cost, sil_cost]
+            # first set the current_state
+            elif first_phone_suffix == "_B":
+                current_state = word_start_state
+            elif first_phone_suffix == "_I" or first_phone_suffix == "_E":
+                current_state = word_internal_state
+            # then set the end_state (end_state_list)
+            if last_phone_suffix == "_B" or last_phone_suffix == "_I":
+                end_state_list = [word_internal_state]
+                end_cost_list = [0.0]
+            elif last_phone_suffix == "_E":
+                end_state_list = [loop_state, sil_state]
+                end_cost_list = [no_sil_cost, sil_cost]
+        else:
+            current_state = word_start_state
+            if is_end(word, separator):
+                end_state_list = [loop_state, sil_state]
+                end_cost_list = [no_sil_cost, sil_cost]
+            else:
+                end_state_list = [word_start_state]
+                end_cost_list = [0.0]
+
+        # print arcs (except the last one) for the subword; only the first arc carries the subword label
+        for i in range(phones_len - 1):
+            word = word if i == 0 else "<eps>"
+            cost = pron_cost if i == 0 else 0.0
+            print_arc(current_state, next_state, phones[i], word, cost)
+            current_state = next_state
+            next_state += 1
+
+        # print the last arc, once per end state, adding that end state's cost
+        i = phones_len - 1
+        phone = phones[i] if i >= 0 else "<eps>"
+        word = word if i <= 0 else "<eps>"
+        cost = pron_cost if i <= 0 else 0.0
+        for (end_state, end_cost) in zip(end_state_list, end_cost_list):
+            print_arc(current_state, end_state, phone, word, cost + end_cost)
+
+    # set the final state
+    print("{state}\t{final_cost}".format(state=loop_state, final_cost=0.0))
+
+def main():
+    args = get_args()
+    if args.sil_prob < 0.0 or args.sil_prob >= 1.0:
+        print("{}: invalid value specified --sil-prob={}".format(
+              sys.argv[0], args.sil_prob), file=sys.stderr)
+        sys.exit(1)
+    lexicon = read_lexiconp(args.lexiconp)
+    if args.sil_prob == 0.0:
+        write_fst_no_silence(lexicon, args.position_dependent, args.separator)
+    else:
+        write_fst_with_silence(lexicon, args.sil_phone, args.sil_prob, 
+            args.sil_disambig, args.position_dependent, args.separator)
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/wsj/s5/utils/subword/prepare_lang_subword.sh b/egs/wsj/s5/utils/subword/prepare_lang_subword.sh
new file mode 100755
index 00000000000..f2432e91825
--- /dev/null
+++ b/egs/wsj/s5/utils/subword/prepare_lang_subword.sh
@@ -0,0 +1,423 @@
+#!/bin/bash
+# Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey);
+#                      Arnab Ghoshal
+#                2014  Guoguo Chen
+#                2015  Hainan Xu
+#                2016  FAU Erlangen (Author: Axel Horndasch)
+#                2019  Dongji Gao
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This script prepares a directory (for subword) such as data/lang_subword/, in the standard format,
+# given a source directory containing a subword dictionary lexicon.txt in a form like:
+# subword phone1 phone2 ... phoneN
+# per line (alternate prons would be separate lines), or a dictionary with probabilities
+# called lexiconp.txt in a form:
+# subword pron-prob phone1 phone2 ... phoneN
+# (with 0.0 < pron-prob <= 1.0); note: if lexiconp.txt exists, we use it even if
+# lexicon.txt exists.
+# and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt
+# and extra_questions.txt
+# Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and
+# non-silence phones respectively (where silence includes various kinds of
+# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the
+# "real" phones.)
+# In each line of those files is a list of phones, and the phones on each line
+# are assumed to correspond to the same "base phone", i.e. they will be
+# different stress or tone variations of the same basic phone.
+# The file "optional_silence.txt" contains just a single phone (typically SIL)
+# which is used for optional silence in the lexicon.
+# extra_questions.txt might be empty; typically will consist of lists of phones,
+# all members of each list with the same stress or tone; and also possibly a
+# list for the silence phones.  This will augment the automatically generated
+# questions (note: the automatically generated ones will treat all the
+# stress/tone versions of a phone the same, so will not "get to ask" about
+# stress or tone).
+#
+
+# This script adds word-position-dependent phones and constructs a host of other
+# derived files, that go in data/lang_subword/.
+
+# Currently it only supports the most basic functions.
+# Begin configuration section.
+num_sil_states=5
+num_nonsil_states=3
+position_dependent_phones=true
+# position_dependent_phones is false also when position dependent phones and word_boundary.txt
+# have been generated by another source
+share_silence_phones=false  # if true, then share pdfs of different silence
+                            # phones together.
+sil_prob=0.5
+num_extra_phone_disambig_syms=1 # Standard one phone disambiguation symbol is used for optional silence.
+                                # Increasing this number does not harm, but is only useful if you later
+                                # want to introduce these labels to L_disambig.fst
+separator="@@"   # Separator is a suffix or prefix of subword indicating the position of this subword in word.
+                 # By default, subword which is not at the end of word would have separator as suffix.
+                 # For example: international -> inter@@ nation@@ al
+
+# end configuration sections
+
+echo "$0 $@"  # Print the command line for logging
+
+. utils/parse_options.sh
+
+if [ $# -ne 4 ]; then
+  echo "Usage: $0 [options] <dict-src-dir> <oov-dict-entry> <tmp-dir> <lang-dir>"
+  echo "e.g.: $0 data/local/dict <SPOKEN_NOISE> data/local/lang_subword data/lang_subword"
+  echo "<dict-src-dir> should contain the following files:"
+  echo " extra_questions.txt  lexicon.txt nonsilence_phones.txt  optional_silence.txt  silence_phones.txt"
+  echo "See http://kaldi-asr.org/doc/data_prep.html#data_prep_lang_creating for more info."
+  echo "Note: lexiconp_silprob.txt (word-dependent silence probabilities) and nonterminals.txt"
+  echo "(grammar decoding) are not supported yet; the script exits if <dict-src-dir> contains them."
+  echo "options: "
+  echo "     --num-sil-states <number of states>             # default: 5, #states in silence models."
+  echo "     --num-nonsil-states <number of states>          # default: 3, #states in non-silence models."
+  echo "     --position-dependent-phones (true|false)        # default: true; if true, use _B, _E, _S & _I"
+  echo "                                                     # markers on phones to indicate word-internal positions. "
+  echo "     --share-silence-phones (true|false)             # default: false; if true, share pdfs of "
+  echo "                                                     # all silence phones. "
+  echo "     --sil-prob <probability of silence>             # default: 0.5 [must have 0 <= silprob < 1]"
+  echo "     --separator <separator>                         # default: @@"
+  exit 1;
+fi
+
+srcdir=$1
+oov_word=$2
+tmpdir=$3
+dir=$4
+mkdir -p $dir $tmpdir $dir/phones
+
+silprob=false
+[ -f $srcdir/lexiconp_silprob.txt ] && echo "$0: Currently we do not support word-dependent silence probability." && exit 1;
+
+if [ -f $srcdir/nonterminals.txt ]; then
+  echo "$0: Currently we do not support nonterminals" && exit 1;
+else
+  grammar_opts=
+fi
+
+[ -f path.sh ] && . ./path.sh
+
+# Validate dict directory
+! utils/validate_dict_dir.pl $srcdir && \
+  echo "*Error validating directory $srcdir*" && exit 1;
+
+if [[ ! -f $srcdir/lexicon.txt ]]; then
+  echo "**Creating $srcdir/lexicon.txt from $srcdir/lexiconp.txt"
+  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdir/lexiconp.txt > $srcdir/lexicon.txt || exit 1;
+fi
+if [[ ! -f $srcdir/lexiconp.txt ]]; then
+  echo "**Creating $srcdir/lexiconp.txt from $srcdir/lexicon.txt"
+  perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $srcdir/lexiconp.txt || exit 1;
+fi
+
+# Currently the lexicon in the dict directory has to be a subword lexicon.
+# If the lexicon is for words and is not phonemic, we cannot get a subword lexicon without knowing the alignment.
+! grep -qF -- "$separator" $srcdir/lexiconp.txt && \
+echo "$0: Error, this lexicon contains no separator \"$separator\" and may not be a subword lexicon." && exit 1;
+
+# Write the separator into file for future use (quoted so that it is stored literally).
+echo "$separator" > $dir/subword_separator.txt
+
+if ! utils/validate_dict_dir.pl $srcdir >&/dev/null; then
+  utils/validate_dict_dir.pl $srcdir  # show the output.
+  echo "Validation failed (second time)"
+  exit 1;
+fi
+
+if $position_dependent_phones; then
+  # Create $tmpdir/lexiconp.txt from $srcdir/lexiconp.txt (or
+  # $tmpdir/lexiconp_silprob.txt from $srcdir/lexiconp_silprob.txt) by
+  # adding the markers _B, _E, _S, _I depending on word position.
+  # In this recipe, these markers apply to silence also.
+  # Do this starting from lexiconp.txt only.
+  if "$silprob"; then
+    echo "$0: Currently we do not support word-dependent silence probability" && exit 1;
+  else
+    utils/lang/make_position_dependent_subword_lexicon.py $srcdir/lexiconp.txt > $tmpdir/lexiconp.txt || exit 1;
+  fi
+
+  # create $tmpdir/phone_map.txt
+  # this has the format (on each line)
+  # <original phone> <version 1 of original phone> <version 2> ...
+  # where the versions depend on the position of the phone within a word.
+  # For instance, we'd have:
+  # AA AA_B AA_E AA_I AA_S
+  # for (B)egin, (E)nd, (I)nternal and (S)ingleton
+  # and in the case of silence
+  # SIL SIL SIL_B SIL_E SIL_I SIL_S
+  # [because SIL on its own is one of the variants; this is for when it doesn't
+  #  occur inside a word but as an option in the lexicon.]
+
+  # This phone map expands the phone lists into all the word-position-dependent
+  # versions of the phone lists.
+  cat <(set -f; for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
+    <(set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
+    > $tmpdir/phone_map.txt
+else
+  if "$silprob"; then
+    echo "$0: Currently we do not support word-dependent silence probability" && exit 1;
+  else
+    cp $srcdir/lexiconp.txt $tmpdir/lexiconp.txt
+  fi
+
+  cat $srcdir/silence_phones.txt $srcdir/nonsilence_phones.txt | \
+    awk '{for(n=1;n<=NF;n++) print $n; }' > $tmpdir/phones
+  paste -d' ' $tmpdir/phones $tmpdir/phones > $tmpdir/phone_map.txt
+fi
+
+mkdir -p $dir/phones  # various sets of phones...
+
+# Sets of phones for use in clustering, and making monophone systems.
+
+if $share_silence_phones; then
+  # build a roots file that will force all the silence phones to share the
+  # same pdf's. [three distinct states, only the transitions will differ.]
+  # 'shared'/'not-shared' means, do we share the 3 states of the HMM
+  # in the same tree-root?
+  # Sharing across models(phones) is achieved by writing several phones
+  # into one line of roots.txt (shared/not-shared doesn't affect this).
+  # 'not-shared not-split' means we have separate tree roots for the 3 states,
+  # but we never split the tree so they remain stumps,
+  # so all phones in the line correspond to the same model.
+
+  cat $srcdir/silence_phones.txt | awk '{printf("%s ", $0); } END{printf("\n");}' | cat - $srcdir/nonsilence_phones.txt | \
+    utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
+  cat $dir/phones/sets.txt | \
+    awk '{if(NR==1) print "not-shared", "not-split", $0; else print "shared", "split", $0;}' > $dir/phones/roots.txt
+else
+  # different silence phones will have different GMMs.  [note: here, all "shared split" means
+  # is that we may have one GMM for all the states, or we can split on states.  because they're
+  # context-independent phones, they don't see the context.]
+  cat $srcdir/{,non}silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
+  cat $dir/phones/sets.txt | awk '{print "shared", "split", $0;}' > $dir/phones/roots.txt
+fi
+
+cat $srcdir/silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
+  awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/silence.txt
+cat $srcdir/nonsilence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
+  awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/nonsilence.txt
+cp $srcdir/optional_silence.txt $dir/phones/optional_silence.txt
+cp $dir/phones/silence.txt $dir/phones/context_indep.txt
+
+# if extra_questions.txt is empty, it's OK.
+cat $srcdir/extra_questions.txt 2>/dev/null | utils/apply_map.pl $tmpdir/phone_map.txt \
+  >$dir/phones/extra_questions.txt
+
+# Want extra questions about the word-start/word-end stuff. Make it separate for
+# silence and non-silence. Probably doesn't matter, as silence will rarely
+# be inside a word.
+if $position_dependent_phones; then
+  for suffix in _B _E _I _S; do
+    (set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
+  done
+  for suffix in "" _B _E _I _S; do
+    (set -f; for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
+  done
+fi
+
+# add_lex_disambig.pl is responsible for adding disambiguation symbols to
+# the lexicon, for telling us how many disambiguation symbols it used,
+# and also for modifying the unknown-word's pronunciation (if the
+# --unk-fst was provided) to the sequence "#1 #2 #3", and reserving those
+# disambig symbols for that purpose.
+# The #2 will later be replaced with the actual unk model.  The reason
+# for the #1 and the #3 is for disambiguation and also to keep the
+# FST compact.  If we didn't have the #1, we might have a different copy of
+# the unk-model FST, or at least some of its arcs, for each start-state from
+# which an <unk> transition comes (instead of per end-state, which is more compact);
+# and adding the #3 prevents us from potentially having 2 copies of the unk-model
+# FST due to the optional-silence [the last phone of any word gets 2 arcs].
+
+if "$silprob"; then
+  echo "$0: Currently we do not support word-dependent silence probability" && exit 1;
+else
+  ndisambig=$(utils/add_lex_disambig.pl --pron-probs $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt) || exit 1;
+fi
+ndisambig=$((ndisambig + num_extra_phone_disambig_syms)) # add (at least) one disambig symbol for silence in lexicon FST.
+echo $ndisambig > $tmpdir/lex_ndisambig
+
+# Format of lexiconp_disambig.txt:
+# !SIL	1.0   SIL_S
+# <SPOKEN_NOISE>	1.0   SPN_S #1
+# <UNK>	1.0  SPN_S #2
+# <NOISE>	1.0  NSN_S
+# !EXCLAMATION-POINT	1.0  EH2_B K_I S_I K_I L_I AH0_I M_I EY1_I SH_I AH0_I N_I P_I OY2_I N_I T_E
+
+( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) >$dir/phones/disambig.txt
+
+# Create phone symbol table.
+echo "<eps>" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \
+  awk '{n=NR-1; print $1, n;}' > $dir/phones.txt
+
+# Create a file that describes the word-boundary information for
+# each phone.  5 categories.
+if $position_dependent_phones; then
+  cat $dir/phones/{silence,nonsilence}.txt | \
+    awk '/_I$/{print $1, "internal"; next;} /_B$/{print $1, "begin"; next; }
+         /_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; }
+         {print $1, "nonword";} ' > $dir/phones/word_boundary_moved.txt
+else
+  # word_boundary.txt might have been generated by another source
+  [ -f $srcdir/word_boundary.txt ] && cp $srcdir/word_boundary.txt $dir/phones/word_boundary_moved.txt
+fi
+
+# Create word symbol table.
+# <s> and </s> are only needed due to the need to rescore lattices with
+# ConstArpaLm format language model. They do not normally appear in G.fst or
+# L.fst.
+
+if "$silprob"; then
+  echo "$0: Currently we do not support word-dependent silence probability" && exit 1;
+fi
+
+cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq  | awk '
+  BEGIN {
+    print "<eps> 0";
+  }
+  {
+    if ($1 == "<s>") {
+      print "<s> is in the vocabulary!" | "cat 1>&2"
+      exit 1;
+    }
+    if ($1 == "</s>") {
+      print "</s> is in the vocabulary!" | "cat 1>&2"
+      exit 1;
+    }
+    printf("%s %d\n", $1, NR);
+  }
+  END {
+    printf("#0 %d\n", NR+1);
+    printf("<s> %d\n", NR+2);
+    printf("</s> %d\n", NR+3);
+  }' > $dir/words.txt || exit 1;
+
+# In case there are extra word-level disambiguation symbols they also
+# need to be added to words.txt
+
+# format of $dir/words.txt:
+# <eps> 0
+# a 1
+# aa 2
+# aarvark 3
+# ...
+
+silphone=`cat $srcdir/optional_silence.txt` || exit 1;
+[ -z "$silphone" ] && \
+  ( echo "You have no optional-silence phone; it is required in the current scripts"
+    echo "but you may use the option --sil-prob 0.0 to stop it being used." ) && \
+   exit 1;
+
+# create $dir/phones/align_lexicon.{txt,int}.
+# This is the method we use for lattice word alignment if we are not
+# using word-position-dependent phones.
+
+# First remove pron-probs from the lexicon.
+perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$tmpdir/lexiconp.txt >$tmpdir/align_lexicon.txt
+
+# Note: here, $silphone will have no suffix e.g. _S because it occurs as optional-silence,
+# and is not part of a word.
+[ ! -z "$silphone" ] && echo "<eps> $silphone" >> $tmpdir/align_lexicon.txt
+
+cat $tmpdir/align_lexicon.txt | \
+  perl -ane '@A = split; print $A[0], " ", join(" ", @A), "\n";' | sort | uniq > $dir/phones/align_lexicon.txt
+
+# create phones/align_lexicon.int from phones/align_lexicon.txt
+cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \
+  utils/sym2int.pl -f 1-2 $dir/words.txt > $dir/phones/align_lexicon.int
+
+# Create the basic L.fst without disambiguation symbols, for use
+# in training.
+
+if "$silprob"; then
+  # Word-dependent silence probabilities (the probability of silence before and
+  # after each word; see utils/dict_dir_add_pronprobs.sh) are not implemented
+  # for subword lexicons, so this branch only reports the error.
+  echo "$0: Currently we do not support word-dependent silence probability" && exit 1;
+else  # NOTE(review): --position-dependent is passed even if position_dependent_phones=false; confirm.
+  utils/lang/make_subword_lexicon_fst.py $grammar_opts --sil-prob=$sil_prob --sil-phone=$silphone \
+    --position-dependent --separator="$separator" $tmpdir/lexiconp.txt | \
+    fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
+      --keep_isymbols=false --keep_osymbols=false | \
+    fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
+fi
+
+# The file oov.txt contains a word that we will map any OOVs to during
+# training.
+echo "$oov_word" > $dir/oov.txt || exit 1;
+cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1;
+# integer version of oov symbol, used in some scripts.
+
+# the file wdisambig.txt contains a (line-by-line) list of the text-form of the
+# disambiguation symbols that are used in the grammar and passed through by the
+# lexicon.  At this stage it's hardcoded as '#0', but we're laying the groundwork
+# for more generality (which probably would be added by another script).
+# wdisambig_words.int contains the corresponding list interpreted by the
+# symbol table words.txt, and wdisambig_phones.int contains the corresponding
+# list interpreted by the symbol table phones.txt.
+echo '#0' >$dir/phones/wdisambig.txt
+
+utils/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_phones.int
+utils/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_words.int
+
+# Create these lists of phones in colon-separated integer list form too,
+# for purposes of being given to programs as command-line options.
+for f in silence nonsilence optional_silence disambig context_indep; do
+  utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int || exit 1;
+  # Build the colon-separated list from the .int file just written instead of mapping the symbols twice.
+  awk '{printf(":%d", $1);} END{printf "\n"}' <$dir/phones/$f.int | sed s/:// > $dir/phones/$f.csl || exit 1;
+done
+
+for x in sets extra_questions; do
+  utils/sym2int.pl $dir/phones.txt <$dir/phones/$x.txt > $dir/phones/$x.int || exit 1;
+done
+
+utils/sym2int.pl -f 3- $dir/phones.txt <$dir/phones/roots.txt \
+   > $dir/phones/roots.int || exit 1;
+
+if [ -f $dir/phones/word_boundary_moved.txt ]; then
+  utils/sym2int.pl -f 1 $dir/phones.txt <$dir/phones/word_boundary_moved.txt \
+    > $dir/phones/word_boundary_moved.int || exit 1;
+fi
+
+silphonelist=`cat $dir/phones/silence.csl`
+nonsilphonelist=`cat $dir/phones/nonsilence.csl`
+
+# Note: it's OK, after generating the 'lang' directory, to overwrite the topo file
+# with another one of your choice if the 'topo' file you want can't be generated by
+# utils/gen_topo.pl.  We do this in the 'chain' recipes.  Of course, the 'topo' file
+# should cover all the phones.  Try running utils/validate_lang.pl to check that
+# everything is OK after modifying the topo file.
+utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonelist >$dir/topo
+
+# Create the lexicon FST with disambiguation symbols, and put it in lang_test.
+# There is an extra step where we create a loop to "pass through" the
+# disambiguation symbols from G.fst.
+
+if "$silprob"; then
+  echo "$0: Currently we do not support word-dependent silence probability" && exit 1;
+else
+  utils/lang/make_subword_lexicon_fst.py $grammar_opts \
+       --sil-prob=$sil_prob --sil-phone=$silphone --sil-disambig='#'$ndisambig --position-dependent \
+       --separator="$separator" $tmpdir/lexiconp_disambig.txt | \
+     fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
+       --keep_isymbols=false --keep_osymbols=false |   \
+     fstaddselfloops  $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \
+     fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1;
+fi
+
+echo "$(basename $0): validating output directory"
+! utils/validate_lang.pl $dir && echo "$(basename $0): error validating output" &&  exit 1;
+
+exit 0;
diff --git a/egs/wsj/s5/utils/subword/prepare_subword_text.sh b/egs/wsj/s5/utils/subword/prepare_subword_text.sh
new file mode 100755
index 00000000000..0f0ce68c44f
--- /dev/null
+++ b/egs/wsj/s5/utils/subword/prepare_subword_text.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# 2019 Dongji Gao
+
+# This script generates subword text from word text.
+# For example, <noise> international -> <noise> inter@@ nation@@ al
+# @@ here is the separator indicating the position of the subword in the word.
+# A subword directly followed by the separator can only appear at the beginning or middle of a word.
+# "<noise>" here can be reserved if added to the option "--glossaries"
+
+# Begin configuration section
+separator="@@"
+glossaries=
+# End configuration section
+
+. utils/parse_options.sh
+
+echo "$0 $@"
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [options] <word-text> <pair_code> <subword-text>"
+  echo "e.g.: $0 data/train/text data/local/pair_code.txt data/train/text_subword"
+  echo "    --separator <separator>         # default: @@"
+  echo "    --glossaries <reserved-words>   # glossaries are words reserved"
+  exit 1;
+fi
+
+word_text=$1
+pair_code=$2
+subword_text=$3
+
+[ ! -f "$word_text" ] && echo "Word text $word_text does not exist." && exit 1;
+
+grep -qF -- "$separator" "$word_text" && echo "$0: Error, word text file contains separator $separator. This might be a subword text file or you need to choose a different separator" && exit 1;
+
+glossaries_opt=
+[ -n "$glossaries" ] && glossaries_opt="--glossaries $glossaries"  # only pass --glossaries when some were given
+cut -d ' ' -f2- "$word_text" | \
+  utils/lang/bpe/apply_bpe.py -c "$pair_code" --separator "$separator" $glossaries_opt > "${word_text}.sub" || exit 1;
+if [ "$word_text" == "$subword_text" ]; then
+  mv "$word_text" "${word_text}.old"  # in-place update: keep the original text as <word-text>.old
+  cut -d ' ' -f1 "${word_text}.old" | paste -d ' ' - "${word_text}.sub" > "$subword_text"
+else
+  cut -d ' ' -f1 "$word_text" | paste -d ' ' - "${word_text}.sub" > "$subword_text"
+fi
+
+rm "${word_text}.sub"
+echo "Subword text created."
diff --git a/egs/wsj/s5/utils/validate_lang.pl b/egs/wsj/s5/utils/validate_lang.pl
index ea2272f3cda..8dba2a0ca69 100755
--- a/egs/wsj/s5/utils/validate_lang.pl
+++ b/egs/wsj/s5/utils/validate_lang.pl
@@ -4,6 +4,7 @@
 # Copyright  2012   Guoguo Chen
 #            2014   Neil Nelson
 #            2017   Johns Hopkins University (Jan "Yenda" Trmal <jtrmal@gmail.com>)
+#            2019   Dongji Gao
 #
 # Validation script for data/lang
 
@@ -101,6 +102,7 @@ sub check_allowed_whitespace {
 $skip_det_check = 0;
 $skip_disambig_check = 0;
 $skip_generate_words_check = 0;
+$subword_check = 0;
 
 for ($x=0; $x <= 3; $x++) {
   if (@ARGV > 0 && $ARGV[0] eq "--skip-determinization-check") {
@@ -121,6 +123,7 @@ sub check_allowed_whitespace {
   print "Usage: $0 [options] <lang_directory>\n";
   print "e.g.:  $0 data/lang\n";
   print "Options:\n";
+  print " --skip-det-check                         (this flag causes it to skip a deterministic fst check).\n";
   print " --skip-determinization-check             (this flag causes it to skip a time consuming check).\n";
   print " --skip-disambig-check                    (this flag causes it to skip a disambig check in phone bigram models).\n";
   exit(1);
@@ -131,6 +134,40 @@ sub check_allowed_whitespace {
 $lang = shift @ARGV;
 $exit = 0;
 $warning = 0;
+
+# Checking existence of separator file ------------------
+print "Checking existence of separator file\n";
+if (!-e "$lang/subword_separator.txt") {
+  print "separator file $lang/subword_separator.txt is empty or does not exist, deal in word case.\n";
+} else {
+  if (!open(S, "<$lang/subword_separator.txt")) {
+    print "--> ERROR: fail to open $lang/subword_separator.txt\n"; exit 1;
+  } else {
+    # Count lines in Perl instead of running "wc -l": no shell, and a last line without "\n" still counts.
+    my @separator_lines = <S>;
+    close(S);
+    if (@separator_lines != 1) {
+      print "--> ERROR, $lang/subword_separator.txt should only contain one line.\n"; exit 1;
+    } else {
+      chomp(my $separator_line = $separator_lines[0]);
+      my @col = split(" ", $separator_line);
+      if (@col != 1) {
+        print "--> ERROR, invalid separator.\n"; exit 1;
+      } else {
+        $separator = shift @col;
+        $separator_length = length $separator;
+        $subword_check = 1;
+      }
+    }
+  }
+}
+
+if (!$subword_check) {
+  $word_boundary = "word_boundary";
+} else {
+  $word_boundary = "word_boundary_moved";
+}
+
 # Checking phones.txt -------------------------------
 print "Checking $lang/phones.txt ...\n";
 if (-z "$lang/phones.txt") {
@@ -492,7 +529,7 @@ sub check_summation {
   my $ok = 1;
   foreach $p (keys %psymtab) {
     if (! defined $sum{$p} && $p !~ m/^#nonterm/) {
-      $exit = 1;  $ok = 0;  print("--> ERROR: phone $p is not in silence.txt, nonsilence.txt or disambig.txt...");
+      $exit = 1;  $ok = 0;  print("--> ERROR: phone $p is not in silence.txt, nonsilence.txt or disambig.txt...\n");
     }
   }
 
@@ -530,8 +567,8 @@ sub check_summation {
     $exit = 1;
   }
 }
-if (-e "$lang/phones/word_boundary.txt") {
-  check_txt_int("$lang/phones/word_boundary", \%psymtab, 0); print "\n";
+if (-e "$lang/phones/$word_boundary.txt") {
+  check_txt_int("$lang/phones/$word_boundary", \%psymtab, 0); print "\n";
 }
 
 # Checking optional_silence.txt -------------------------------
@@ -634,10 +671,10 @@ sub check_summation {
 $end       = "";
 $internal  = "";
 $singleton = "";
-if (-s "$lang/phones/word_boundary.txt") {
-  print "Checking word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n";
-  if (!open (W, "<$lang/phones/word_boundary.txt")) {
-    $exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.txt\n";
+if (-s "$lang/phones/$word_boundary.txt") {
+  print "Checking $word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n";
+  if (!open (W, "<$lang/phones/$word_boundary.txt")) {
+    $exit = 1; print "--> ERROR: fail to open $lang/phones/$word_boundary.txt\n";
   }
   $idx = 1;
   %wb = ();
@@ -660,7 +697,7 @@ sub check_summation {
       s/ singleton$//g; @col = split(" ", $_); if (@col == 1) {$singleton .= "$col[0] ";}
     }
     if (@col != 1) {
-      $exit = 1; print "--> ERROR: expect 1 column in $lang/phones/word_boundary.txt (line $idx)\n";
+      $exit = 1; print "--> ERROR: expect 1 column in $lang/phones/$word_boundary.txt (line $idx)\n";
     }
     $wb{shift @col} = 1;
     $idx ++;
@@ -671,13 +708,13 @@ sub check_summation {
   $success1 = 1;
   if (@itset != 0) {
     $success1 = 0;
-    $exit = 1; print "--> ERROR: $lang/phones/word_boundary.txt has disambiguation symbols -- ";
+    $exit = 1; print "--> ERROR: $lang/phones/$word_boundary.txt has disambiguation symbols -- ";
     foreach (@itset) {
       print "$_ ";
     }
     print "\n";
   }
-  $success1 == 0 || print "--> $lang/phones/word_boundary.txt doesn't include disambiguation symbols\n";
+  $success1 == 0 || print "--> $lang/phones/$word_boundary.txt doesn't include disambiguation symbols\n";
 
   %sum = (%silence, %nonsilence);
   @itset = intersect(\%sum, \%wb);
@@ -685,7 +722,7 @@ sub check_summation {
   $success2 = 1;
   if (@itset < scalar(keys %sum)) {
     $success2 = 0;
-    $exit = 1; print "--> ERROR: phones in nonsilence.txt and silence.txt but not in word_boundary.txt -- ";
+    $exit = 1; print "--> ERROR: phones in nonsilence.txt and silence.txt but not in $word_boundary.txt -- ";
     foreach (keys %sum) {
       if (!$itset{$_}) {
         print "$_ ";
@@ -695,7 +732,7 @@ sub check_summation {
   }
   if (@itset < scalar(keys %wb)) {
     $success2 = 0;
-    $exit = 1; print "--> ERROR: phones in word_boundary.txt but not in nonsilence.txt or silence.txt -- ";
+    $exit = 1; print "--> ERROR: phones in $word_boundary.txt but not in nonsilence.txt or silence.txt -- ";
     foreach (keys %wb) {
       if (!$itset{$_}) {
         print "$_ ";
@@ -703,8 +740,8 @@ sub check_summation {
     }
     print "\n";
   }
-  $success2 == 0 || print "--> $lang/phones/word_boundary.txt is the union of nonsilence.txt and silence.txt\n";
-  $success1 != 1 or $success2 != 1 || print "--> $lang/phones/word_boundary.txt is OK\n";
+  $success2 == 0 || print "--> $lang/phones/$word_boundary.txt is the union of nonsilence.txt and silence.txt\n";
+  $success1 != 1 or $success2 != 1 || print "--> $lang/phones/$word_boundary.txt is OK\n";
   print "\n";
 }
 
@@ -750,11 +787,11 @@ sub check_summation {
     close(P);
     my $len = @wdisambig, $len2;
     if (($len2 = @wdisambig_words) != $len) {
-      print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int have different lengths";
+      print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int have different lengths\n";
       $exit = 1; return;
     }
     if (($len2 = @wdisambig_phones) != $len) {
-      print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int have different lengths";
+      print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int have different lengths\n";
       $exit = 1; return;
     }
     for (my $i = 0; $i < $len; $i++) {
@@ -777,16 +814,23 @@ sub check_summation {
   }
 }
 
-
-if (-s "$lang/phones/word_boundary.int") {
-  print "Checking word_boundary.int and disambig.int\n";
-  if (!open (W, "<$lang/phones/word_boundary.int")) {
-    $exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.int\n";
+# Check validity of L.fst, L_disambig.fst, and word_boundary.int.
+# First we generate a random word/subword sequence. We then compile it into fst and compose it with L.fst/L_disambig.fst.
+# For subword case the last subword of the sequence must be an end-subword
+# (i.e. the subword can only be at the end of word or is a single word itself)
+# to guarantee the composition would not fail.
+# We then get the corresponding phone sequence and apply a transition matrix on it to get the number of valid boundaries.
+# In word case, the number of valid boundaries should be equal to the number of words.
+# In subword case, the number of valid boundaries should be equal to the number of end-subwords.
+if (-s "$lang/phones/$word_boundary.int") {
+  print "Checking $word_boundary.int and disambig.int\n";
+  if (!open (W, "<$lang/phones/$word_boundary.int")) {
+    $exit = 1; print "--> ERROR: fail to open $lang/phones/$word_boundary.int\n";
   }
   while (<W>) {
     @A = split;
     if (@A != 2) {
-      $exit = 1; print "--> ERROR: bad line $_ in $lang/phones/word_boundary.int\n";
+      $exit = 1; print "--> ERROR: bad line $_ in $lang/phones/$word_boundary.int\n";
     }
     $wbtype{$A[0]} = $A[1];
   }
@@ -814,23 +858,58 @@ sub check_summation {
       next;
     }
     $wlen = int(rand(100)) + 1;
-    print "--> generating a $wlen word sequence\n";
+    $end_subword = 0;
+    print "--> generating a $wlen word/subword sequence\n";
     $wordseq = "";
     $sid = 0;
     $wordseq_syms = "";
-    foreach (1 .. $wlen) {
+    # exclude disambiguation symbols, BOS and EOS, epsilon, and
+    # grammar-related symbols from the word sequence.
+    while ($sid < ($wlen - 1)) {
       $id = int(rand(scalar(keys %wint2sym)));
-      # exclude disambiguation symbols, BOS and EOS, epsilon, and
-      # grammar-related symbols from the word sequence.
       while (defined $wdisambig_words_hash{$id} or
-             $wint2sym{$id} eq "<s>" or $wint2sym{$id} eq "</s>" or
-             $wint2sym{$id} =~ m/^#nonterm/ or $id == 0) {
+           $wint2sym{$id} eq "<s>" or $wint2sym{$id} eq "</s>" or
+           $wint2sym{$id} =~ m/^#nonterm/ or $id == 0) {
         $id = int(rand(scalar(keys %wint2sym)));
       }
       $wordseq_syms = $wordseq_syms . $wint2sym{$id} . " ";
       $wordseq = $wordseq . "$sid ". ($sid + 1) . " $id $id 0\n";
       $sid ++;
+
+      if ($subword_check) {
+        $subword = $wint2sym{$id};
+        $suffix = substr($subword, -$separator_length, $separator_length);
+        if ($suffix ne $separator) {
+          $end_subword ++;
+        }
+      }
+    } 
+
+    # generate the last word (subword)
+    $id = int(rand(scalar(keys %wint2sym)));
+    if ($subword_check) {
+      $subword = $wint2sym{$id};
+      $suffix = substr($subword, -$separator_length, $separator_length);
+      # the last subword cannot be followed by the separator
+      while (defined $wdisambig_words_hash{$id} or
+           $wint2sym{$id} eq "<s>" or $wint2sym{$id} eq "</s>" or
+           $wint2sym{$id} =~ m/^#nonterm/ or $id == 0 or $suffix eq $separator) {
+        $id = int(rand(scalar(keys %wint2sym)));
+        $subword = $wint2sym{$id};
+        $suffix = substr($subword, -$separator_length, $separator_length);
+      }
+      $end_subword ++;
+    } else {
+      while (defined $wdisambig_words_hash{$id} or
+           $wint2sym{$id} eq "<s>" or $wint2sym{$id} eq "</s>" or
+           $wint2sym{$id} =~ m/^#nonterm/ or $id == 0) {
+       $id = int(rand(scalar(keys %wint2sym)));
+      }
     }
+    $wordseq_syms = $wordseq_syms . $wint2sym{$id} . " ";
+    $wordseq = $wordseq . "$sid ". ($sid + 1) . " $id $id 0\n";
+    $sid ++;
+
     $wordseq = $wordseq . "$sid 0";
     $phoneseq = `. ./path.sh; echo \"$wordseq" | fstcompile | fstcompose $lang/$fst - | fstproject | fstrandgen | fstrmepsilon | fsttopsort | fstprint | awk '{if (NF > 2) {print \$3}}';`;
     $transition = { }; # empty assoc. array of allowed transitions between phone types.  1 means we count a word,
@@ -861,10 +940,10 @@ sub check_summation {
           $state = $wbtype{$phone};
         }
         if (!defined $state) {
-          $exit = 1; print "--> ERROR: phone $phone is not specified in $lang/phones/word_boundary.int\n";
+          $exit = 1; print "--> ERROR: phone $phone is not specified in $lang/phones/$word_boundary.int\n";
           last;
         } elsif (!defined $transition{$cur_state, $state}) {
-          $exit = 1; print "--> ERROR: transition from state $cur_state to $state indicates error in word_boundary.int or L.fst\n";
+          $exit = 1; print "--> ERROR: transition from state $cur_state to $state indicates error in $word_boundary.int or L.fst\n";
           last;
         } else {
           $num_words += $transition{$cur_state, $state};
@@ -873,10 +952,13 @@ sub check_summation {
       }
     }
     if (!$exit) {
+      if ($subword_check) { 
+        $wlen = $end_subword;
+      }
       if ($num_words != $wlen) {
         $phoneseq_syms = "";
         foreach my $id (split(" ", $phoneseq)) { $phoneseq_syms = $phoneseq_syms . " " . $pint2sym{$id}; }
-        $exit = 1; print "--> ERROR: number of reconstructed words $num_words does not match real number of words $wlen; indicates problem in $fst or word_boundary.int.  phoneseq = $phoneseq_syms, wordseq = $wordseq_syms\n";
+        $exit = 1; print "--> ERROR: number of reconstructed words $num_words does not match real number of words $wlen; indicates problem in $fst or $word_boundary.int.  phoneseq = $phoneseq_syms, wordseq = $wordseq_syms\n";
       } else {
         print "--> resulting phone sequence from $fst corresponds to the word sequence\n";
         print "--> $fst is OK\n";

From 49bccbba07e94164a426adc491ad4e215068eee8 Mon Sep 17 00:00:00 2001
From: "kkm (aka Kirill Katsnelson)" <kkm@smartaction.com>
Date: Mon, 6 May 2019 22:35:57 -0700
Subject: [PATCH 101/163] [scripts] Add trainer option
 --trainer.optimization.num-jobs-step (#3205)

---
 egs/wsj/s5/steps/libs/nnet3/train/common.py   | 69 ++++++++++++++++---
 egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py | 29 ++++----
 egs/wsj/s5/steps/nnet3/chain/train.py         | 25 ++++---
 egs/wsj/s5/steps/nnet3/train_dnn.py           | 25 ++++---
 egs/wsj/s5/steps/nnet3/train_raw_dnn.py       | 25 ++++---
 egs/wsj/s5/steps/nnet3/train_raw_rnn.py       | 25 ++++---
 egs/wsj/s5/steps/nnet3/train_rnn.py           | 28 ++++----
 7 files changed, 145 insertions(+), 81 deletions(-)

diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py
index 1a038cc23f2..f230e12e96f 100644
--- a/egs/wsj/s5/steps/libs/nnet3/train/common.py
+++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py
@@ -269,8 +269,7 @@ def validate_minibatch_size_str(minibatch_size_str):
                 return False
         # check that the thing before the '=' sign is a positive integer
         try:
-            i = b[0]
-            if i <= 0:
+            if int(b[0]) <= 0:
                 return False
         except:
             return False  # not an integer at all.
@@ -602,6 +601,16 @@ def get_model_combine_iters(num_iters, num_epochs,
     return models_to_combine
 
 
+def get_current_num_jobs(it, num_it, start, step, end):
+    """Return the job count for iteration 'it' of range('num_it'): interpolate linearly
+    between 'start' (it=0) and 'end' (it=num_it); use multiples of 'step' once >= 'step'."""
+    ideal = float(start) + (end - start) * float(it) / num_it
+    if ideal < step:  # below one full step: round to the nearest integer
+        return int(0.5 + ideal)
+    else:  # otherwise round to the nearest multiple of 'step'
+        return int(0.5 + ideal / step) * step
+
+
 def get_learning_rate(iter, num_jobs, num_iters, num_archives_processed,
                       num_archives_to_process,
                       initial_effective_lrate, final_effective_lrate):
@@ -682,13 +691,11 @@ def remove_model(nnet_dir, iter, num_iters, models_to_combine=None,
         os.remove(file_name)
 
 
-def self_test():
-    assert halve_minibatch_size_str('64') == '32'
-    assert halve_minibatch_size_str('64,16:32') == '32,8:16'
-    assert halve_minibatch_size_str('1') == '1'
-    assert halve_minibatch_size_str('128=64/256=40,80:100') == '128=32/256=20,40:50'
-    assert validate_chunk_width('64')
-    assert validate_chunk_width('64,25,128')
+def positive_int(arg):  # argparse 'type=' converter that accepts only integers > 0
+   val = int(arg)  # convert the raw option value to int
+   if (val <= 0):  # zero and negative values are rejected
+      raise argparse.ArgumentTypeError("must be positive int: '%s'" % arg)
+   return val
 
 
 class CommonParser(object):
@@ -845,6 +852,10 @@ def __init__(self,
                                  type=int, dest='num_jobs_final', default=8,
                                  help="Number of neural net jobs to run in "
                                  "parallel at the end of training")
+        self.parser.add_argument("--trainer.optimization.num-jobs-step",
+            type=positive_int,  metavar='N', dest='num_jobs_step', default=1,
+            help="""Number of jobs increment, when exceeds this number. For
+            example, if N=3, the number of jobs may progress as 1, 2, 3, 6, 9...""")
         self.parser.add_argument("--trainer.optimization.max-models-combine",
                                  "--trainer.max-models-combine",
                                  type=int, dest='max_models_combine',
@@ -983,5 +994,43 @@ def __init__(self,
                                  then only failure notifications are sent""")
 
 
+import unittest
+
+class SelfTest(unittest.TestCase):
+    """Unit tests for helper functions defined in this module."""
+
+    def test_halve_minibatch_size_str(self):
+        self.assertEqual('32', halve_minibatch_size_str('64'))
+        self.assertEqual('32,8:16', halve_minibatch_size_str('64,16:32'))
+        self.assertEqual('1', halve_minibatch_size_str('1'))
+        self.assertEqual('128=32/256=20,40:50', halve_minibatch_size_str('128=64/256=40,80:100'))
+
+    def test_validate_chunk_width(self):
+        for s in [ '64', '64,25,128' ]:
+            self.assertTrue(validate_chunk_width(s), s)
+
+    def test_validate_minibatch_size_str(self):
+        # Good descriptors.
+        for s in [ '32', '32,64', '1:32', '1:32,64', '64,1:32', '1:5,10:15',
+                   '128=64:128/256=32,64', '1=2/3=4', '1=1/2=2/3=3/4=4' ]:
+            self.assertTrue(validate_minibatch_size_str(s), s)
+        # Bad descriptors: non-strings, empty strings and malformed specs.
+        # Each entry is a separate list element, so each is checked on its own.
+        for s in [ None, 42, (43,), '', '1:', ':2', '3,', ',4', '5:6,', ',7:8',
+                   '9=', '10=10/', '11=11/11', '12=1:2//13=1:3', '14=/15=15',
+                   '16/17=17', '/18=18', '/18', '//19', '/' ]:
+            self.assertFalse(validate_minibatch_size_str(s), s)
+
+    def test_get_current_num_jobs(self):
+        niters = 12
+        # Arguments after niters are (start, step, end): step=1, then step=3.
+        self.assertEqual([2, 3, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8],
+                         [get_current_num_jobs(i, niters, 2, 1, 9)
+                              for i in range(niters)])
+        self.assertEqual([2, 3, 3, 3, 3, 6, 6, 6, 6, 6, 9, 9],
+                         [get_current_num_jobs(i, niters, 2, 3, 9)
+                              for i in range(niters)])
+
+
 if __name__ == '__main__':
-    _self_test()
+    unittest.main()
diff --git a/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py b/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py
index e96f2a10820..d5fa89f3ce0 100755
--- a/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py
+++ b/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py
@@ -202,11 +202,10 @@ def process_args(args):
             "--trainer.deriv-truncate-margin.".format(
                 args.deriv_truncate_margin))
 
-    if (not os.path.exists(args.dir)
-            or not os.path.exists(args.dir+"/configs")):
-        raise Exception("This scripts expects {0} to exist and have a configs "
-                        "directory which is the output of "
-                        "make_configs.py script")
+    if (not os.path.exists(args.dir + "/configs")):
+        raise Exception("This scripts expects the directory specified with "
+                        "--dir={0} to exist and have a configs/ directory which "
+                        "is the output of make_configs.py script".format(args.dir))
 
     # set the options corresponding to args.use_gpu
     run_opts = common_train_lib.RunOpts()
@@ -423,9 +422,10 @@ def train(args, run_opts):
         if (args.exit_stage is not None) and (iter == args.exit_stage):
             logger.info("Exiting early due to --exit-stage {0}".format(iter))
             return
-        current_num_jobs = int(0.5 + args.num_jobs_initial
-                               + (args.num_jobs_final - args.num_jobs_initial)
-                               * float(iter) / num_iters)
+
+        current_num_jobs = common_train_lib.get_current_num_jobs(
+            iter, num_iters,
+            args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final)
 
         if args.stage <= iter:
             model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter)
@@ -451,12 +451,13 @@ def train(args, run_opts):
             shrink_info_str = ''
             if shrinkage_value != 1.0:
                 shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value)
-            logger.info("Iter: {0}/{1}    "
-                        "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete)    "
-                        "lr: {5:0.6f}    {6}".format(iter, num_iters - 1,
-                                                     epoch, args.num_epochs,
-                                                     percent,
-                                                     lrate, shrink_info_str))
+            logger.info("Iter: {0}/{1}   Jobs: {2}   "
+                        "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete)   "
+                        "lr: {6:0.6f}   {7}".format(iter, num_iters - 1,
+                                                    current_num_jobs,
+                                                    epoch, args.num_epochs,
+                                                    percent,
+                                                    lrate, shrink_info_str))
 
             chain_lib.train_one_iteration(
                 dir=args.dir,
diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py
index 40b65afe273..67cb9f90620 100755
--- a/egs/wsj/s5/steps/nnet3/chain/train.py
+++ b/egs/wsj/s5/steps/nnet3/chain/train.py
@@ -219,8 +219,9 @@ def process_args(args):
                 args.deriv_truncate_margin))
 
     if (not os.path.exists(args.dir)):
-        raise Exception("This script expects --dir={0} to exist.")
-    if (not os.path.exists(args.dir+"/configs") and
+        raise Exception("Directory specified with --dir={0} "
+                        "does not exist.".format(args.dir))
+    if (not os.path.exists(args.dir + "/configs") and
         (args.input_model is None or not os.path.exists(args.input_model))):
         raise Exception("Either --trainer.input-model option should be supplied, "
                         "and exist; or the {0}/configs directory should exist."
@@ -470,9 +471,10 @@ def train(args, run_opts):
         if (args.exit_stage is not None) and (iter == args.exit_stage):
             logger.info("Exiting early due to --exit-stage {0}".format(iter))
             return
-        current_num_jobs = int(0.5 + args.num_jobs_initial
-                               + (args.num_jobs_final - args.num_jobs_initial)
-                               * float(iter) / num_iters)
+
+        current_num_jobs = common_train_lib.get_current_num_jobs(
+            iter, num_iters,
+            args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final)
 
         if args.stage <= iter:
             model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter)
@@ -501,12 +503,13 @@ def train(args, run_opts):
             shrink_info_str = ''
             if shrinkage_value != 1.0:
                 shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value)
-            logger.info("Iter: {0}/{1}    "
-                        "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete)    "
-                        "lr: {5:0.6f}    {6}".format(iter, num_iters - 1,
-                                                     epoch, args.num_epochs,
-                                                     percent,
-                                                     lrate, shrink_info_str))
+            logger.info("Iter: {0}/{1}   Jobs: {2}   "
+                        "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete)   "
+                        "lr: {6:0.6f}   {7}".format(iter, num_iters - 1,
+                                                    current_num_jobs,
+                                                    epoch, args.num_epochs,
+                                                    percent,
+                                                    lrate, shrink_info_str))
 
             chain_lib.train_one_iteration(
                 dir=args.dir,
diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py
index e72b29297a4..84817446b6e 100755
--- a/egs/wsj/s5/steps/nnet3/train_dnn.py
+++ b/egs/wsj/s5/steps/nnet3/train_dnn.py
@@ -117,8 +117,9 @@ def process_args(args):
         raise Exception("--trainer.rnn.num-chunk-per-minibatch has an invalid value")
 
     if (not os.path.exists(args.dir)):
-        raise Exception("This script expects --dir={0} to exist.")
-    if (not os.path.exists(args.dir+"/configs") and
+        raise Exception("Directory specified with --dir={0} "
+                        "does not exist.".format(args.dir))
+    if (not os.path.exists(args.dir + "/configs") and
         (args.input_model is None or not os.path.exists(args.input_model))):
         raise Exception("Either --trainer.input-model option should be supplied, "
                         "and exist; or the {0}/configs directory should exist."
@@ -321,9 +322,10 @@ def train(args, run_opts):
         if (args.exit_stage is not None) and (iter == args.exit_stage):
             logger.info("Exiting early due to --exit-stage {0}".format(iter))
             return
-        current_num_jobs = int(0.5 + args.num_jobs_initial
-                               + (args.num_jobs_final - args.num_jobs_initial)
-                               * float(iter) / num_iters)
+
+        current_num_jobs = common_train_lib.get_current_num_jobs(
+            iter, num_iters,
+            args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final)
 
         if args.stage <= iter:
             lrate = common_train_lib.get_learning_rate(iter, current_num_jobs,
@@ -344,12 +346,13 @@ def train(args, run_opts):
             shrink_info_str = ''
             if shrinkage_value != 1.0:
                 shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value)
-            logger.info("Iter: {0}/{1}    "
-                        "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete)    "
-                        "lr: {5:0.6f}    {6}".format(iter, num_iters - 1,
-                                                     epoch, args.num_epochs,
-                                                     percent,
-                                                     lrate, shrink_info_str))
+            logger.info("Iter: {0}/{1}   Jobs: {2}   "
+                        "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete)   "
+                        "lr: {6:0.6f}   {7}".format(iter, num_iters - 1,
+                                                    current_num_jobs,
+                                                    epoch, args.num_epochs,
+                                                    percent,
+                                                    lrate, shrink_info_str))
 
             train_lib.common.train_one_iteration(
                 dir=args.dir,
diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py
index ffccf443b99..af921048bb5 100755
--- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py
+++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py
@@ -135,8 +135,9 @@ def process_args(args):
         raise Exception("--trainer.optimization.minibatch-size has an invalid value")
 
     if (not os.path.exists(args.dir)):
-        raise Exception("This script expects --dir={0} to exist.")
-    if (not os.path.exists(args.dir+"/configs") and
+        raise Exception("Directory specified with --dir={0} "
+                        "does not exist.".format(args.dir))
+    if (not os.path.exists(args.dir + "/configs") and
         (args.input_model is None or not os.path.exists(args.input_model))):
         raise Exception("Either --trainer.input-model option should be supplied, "
                         "and exist; or the {0}/configs directory should exist."
@@ -356,9 +357,10 @@ def train(args, run_opts):
         if (args.exit_stage is not None) and (iter == args.exit_stage):
             logger.info("Exiting early due to --exit-stage {0}".format(iter))
             return
-        current_num_jobs = int(0.5 + args.num_jobs_initial
-                               + (args.num_jobs_final - args.num_jobs_initial)
-                               * float(iter) / num_iters)
+
+        current_num_jobs = common_train_lib.get_current_num_jobs(
+            iter, num_iters,
+            args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final)
 
         if args.stage <= iter:
             lrate = common_train_lib.get_learning_rate(iter, current_num_jobs,
@@ -380,12 +382,13 @@ def train(args, run_opts):
             shrink_info_str = ''
             if shrinkage_value != 1.0:
                 shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value)
-            logger.info("Iter: {0}/{1}    "
-                        "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete)    "
-                        "lr: {5:0.6f}    {6}".format(iter, num_iters - 1,
-                                                     epoch, args.num_epochs,
-                                                     percent,
-                                                     lrate, shrink_info_str))
+            logger.info("Iter: {0}/{1}   Jobs: {2}   "
+                        "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete)   "
+                        "lr: {6:0.6f}   {7}".format(iter, num_iters - 1,
+                                                    current_num_jobs,
+                                                    epoch, args.num_epochs,
+                                                    percent,
+                                                    lrate, shrink_info_str))
 
             train_lib.common.train_one_iteration(
                 dir=args.dir,
diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py
index c704b0725d3..b2d55ac20e7 100755
--- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py
+++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py
@@ -181,8 +181,9 @@ def process_args(args):
         raise Exception("--egs.chunk-right-context should be non-negative")
 
     if (not os.path.exists(args.dir)):
-        raise Exception("This script expects --dir={0} to exist.")
-    if (not os.path.exists(args.dir+"/configs") and
+        raise Exception("Directory specified with --dir={0} "
+                        "does not exist.".format(args.dir))
+    if (not os.path.exists(args.dir + "/configs") and
         (args.input_model is None or not os.path.exists(args.input_model))):
         raise Exception("Either --trainer.input-model option should be supplied, "
                         "and exist; or the {0}/configs directory should exist."
@@ -411,9 +412,10 @@ def train(args, run_opts):
         if (args.exit_stage is not None) and (iter == args.exit_stage):
             logger.info("Exiting early due to --exit-stage {0}".format(iter))
             return
-        current_num_jobs = int(0.5 + args.num_jobs_initial
-                               + (args.num_jobs_final - args.num_jobs_initial)
-                               * float(iter) / num_iters)
+
+        current_num_jobs = common_train_lib.get_current_num_jobs(
+            iter, num_iters,
+            args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final)
 
         if args.stage <= iter:
             model_file = "{dir}/{iter}.raw".format(dir=args.dir, iter=iter)
@@ -445,12 +447,13 @@ def train(args, run_opts):
             shrink_info_str = ''
             if shrinkage_value != 1.0:
                 shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value)
-            logger.info("Iter: {0}/{1}    "
-                        "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete)    "
-                        "lr: {5:0.6f}    {6}".format(iter, num_iters - 1,
-                                                     epoch, args.num_epochs,
-                                                     percent,
-                                                     lrate, shrink_info_str))
+            logger.info("Iter: {0}/{1}   Jobs: {2}   "
+                        "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete)   "
+                        "lr: {6:0.6f}   {7}".format(iter, num_iters - 1,
+                                                    current_num_jobs,
+                                                    epoch, args.num_epochs,
+                                                    percent,
+                                                    lrate, shrink_info_str))
 
             train_lib.common.train_one_iteration(
                 dir=args.dir,
diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py
index ab2aa0c4d8d..6ed7197c22b 100755
--- a/egs/wsj/s5/steps/nnet3/train_rnn.py
+++ b/egs/wsj/s5/steps/nnet3/train_rnn.py
@@ -172,12 +172,12 @@ def process_args(args):
         raise Exception("--egs.chunk-right-context should be non-negative")
 
     if (not os.path.exists(args.dir)):
-        raise Exception("This script expects --dir={0} to exist.")
-
-    if (not os.path.exists(args.dir+"/configs") and
+        raise Exception("Directory specified with --dir={0} "
+                        "does not exist.".format(args.dir))
+    if (not os.path.exists(args.dir + "/configs") and
         (args.input_model is None or not os.path.exists(args.input_model))):
         raise Exception("Either --trainer.input-model option should be supplied, "
-                        "and exist; or the {0}/configs directory should exist."
+                        "and exist; or the {0}/configs directory should exist. "
                         "{0}/configs is the output of make_configs.py"
                         "".format(args.dir))
 
@@ -396,9 +396,10 @@ def train(args, run_opts):
         if (args.exit_stage is not None) and (iter == args.exit_stage):
             logger.info("Exiting early due to --exit-stage {0}".format(iter))
             return
-        current_num_jobs = int(0.5 + args.num_jobs_initial
-                               + (args.num_jobs_final - args.num_jobs_initial)
-                               * float(iter) / num_iters)
+
+        current_num_jobs = common_train_lib.get_current_num_jobs(
+            iter, num_iters,
+            args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final)
 
         if args.stage <= iter:
             model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter)
@@ -428,12 +429,13 @@ def train(args, run_opts):
             shrink_info_str = ''
             if shrinkage_value != 1.0:
                 shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value)
-            logger.info("Iter: {0}/{1}    "
-                        "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete)    "
-                        "lr: {5:0.6f}    {6}".format(iter, num_iters - 1,
-                                                     epoch, args.num_epochs,
-                                                     percent,
-                                                     lrate, shrink_info_str))
+            logger.info("Iter: {0}/{1}   Jobs: {2}   "
+                        "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete)   "
+                        "lr: {6:0.6f}   {7}".format(iter, num_iters - 1,
+                                                    current_num_jobs,
+                                                    epoch, args.num_epochs,
+                                                    percent,
+                                                    lrate, shrink_info_str))
 
             train_lib.common.train_one_iteration(
                 dir=args.dir,

From 8209d1894696e1e8230cb23c18e6587b94a4ac34 Mon Sep 17 00:00:00 2001
From: Ahmed <amali@qf.org.qa>
Date: Thu, 9 May 2019 02:05:52 +0300
Subject: [PATCH 102/163] [egs] Add MGB-5 recipe; https://arabicspeech.org/mgb5
 (#3299)

---
 egs/mgb5/README                               |  18 ++
 egs/mgb5/s5/RESULTS                           |  14 +
 egs/mgb5/s5/cmd.sh                            |  10 +
 egs/mgb5/s5/conf/decode.config                |   4 +
 egs/mgb5/s5/conf/decode_dnn.config            |   0
 egs/mgb5/s5/conf/mfcc.conf                    |   1 +
 egs/mgb5/s5/conf/mfcc_hires.conf              |  10 +
 egs/mgb5/s5/conf/online_cmvn.conf             |   1 +
 egs/mgb5/s5/conf/slurm.conf                   |  10 +
 egs/mgb5/s5/local/chain/run_tdnn.sh           |   1 +
 egs/mgb5/s5/local/chain/tuning/run_tdnn_1a.sh | 264 ++++++++++++++++++
 egs/mgb5/s5/local/nnet3/run_ivector_common.sh | 126 +++++++++
 egs/mgb5/s5/local/prepare_data.sh             |  39 +++
 egs/mgb5/s5/local/prepare_dict.sh             |  61 ++++
 egs/mgb5/s5/local/prepare_lexicon.py          |  26 ++
 egs/mgb5/s5/local/prepare_lm.sh               |  25 ++
 egs/mgb5/s5/local/score.sh                    | 149 ++++++++++
 egs/mgb5/s5/local/train_lms_srilm.sh          | 233 ++++++++++++++++
 egs/mgb5/s5/path.sh                           |   8 +
 egs/mgb5/s5/run.sh                            | 190 +++++++++++++
 egs/mgb5/s5/steps                             |   1 +
 egs/mgb5/s5/utils                             |   1 +
 22 files changed, 1192 insertions(+)
 create mode 100644 egs/mgb5/README
 create mode 100644 egs/mgb5/s5/RESULTS
 create mode 100644 egs/mgb5/s5/cmd.sh
 create mode 100644 egs/mgb5/s5/conf/decode.config
 create mode 100644 egs/mgb5/s5/conf/decode_dnn.config
 create mode 100644 egs/mgb5/s5/conf/mfcc.conf
 create mode 100644 egs/mgb5/s5/conf/mfcc_hires.conf
 create mode 100644 egs/mgb5/s5/conf/online_cmvn.conf
 create mode 100644 egs/mgb5/s5/conf/slurm.conf
 create mode 120000 egs/mgb5/s5/local/chain/run_tdnn.sh
 create mode 100644 egs/mgb5/s5/local/chain/tuning/run_tdnn_1a.sh
 create mode 100644 egs/mgb5/s5/local/nnet3/run_ivector_common.sh
 create mode 100755 egs/mgb5/s5/local/prepare_data.sh
 create mode 100755 egs/mgb5/s5/local/prepare_dict.sh
 create mode 100755 egs/mgb5/s5/local/prepare_lexicon.py
 create mode 100755 egs/mgb5/s5/local/prepare_lm.sh
 create mode 100755 egs/mgb5/s5/local/score.sh
 create mode 100755 egs/mgb5/s5/local/train_lms_srilm.sh
 create mode 100644 egs/mgb5/s5/path.sh
 create mode 100755 egs/mgb5/s5/run.sh
 create mode 120000 egs/mgb5/s5/steps
 create mode 120000 egs/mgb5/s5/utils

diff --git a/egs/mgb5/README b/egs/mgb5/README
new file mode 100644
index 00000000000..5114b278b71
--- /dev/null
+++ b/egs/mgb5/README
@@ -0,0 +1,18 @@
+###
+# MGB-5 corpus: Moroccan Arabic Automatic Speech Recognition
+# Created in collaboration between QCRI and ELRA
+# More details can be found here: https://arabicspeech.org/mgb5
+###
+
+
+## INTRODUCTION ##
+Training data: 10.2 hours from 69 programs
+Development data: 1.8 hours from 10 programs
+Testing data: 2.0 hours from 14 programs
+
+## KNOWN ISSUES ##
+1- The dev data does not have the same alignment across the four annotators 
+2- Once alignment is consistent, we can include multi-reference word error rate
+3- Use MGB-2 as background model
+
+
diff --git a/egs/mgb5/s5/RESULTS b/egs/mgb5/s5/RESULTS
new file mode 100644
index 00000000000..5ac2daf0d49
--- /dev/null
+++ b/egs/mgb5/s5/RESULTS
@@ -0,0 +1,14 @@
+%WER 75.59 [ 51973 / 68755, 1993 ins, 19098 del, 30882 sub ] exp/chain/tdnn_1a/decode_dev/wer_10_0.0
+%WER 84.47 [ 58079 / 68755, 1679 ins, 18637 del, 37763 sub ] exp/sgmm2_5b2/decode_dev.rescored/wer_9_0.0
+%WER 84.48 [ 58087 / 68755, 1720 ins, 18518 del, 37849 sub ] exp/sgmm2_5b2/decode_dev.big/wer_9_0.0
+%WER 84.62 [ 58180 / 68755, 1746 ins, 18289 del, 38145 sub ] exp/sgmm2_5b2/decode_dev/wer_9_0.0
+%WER 86.93 [ 59766 / 68755, 1634 ins, 19636 del, 38496 sub ] exp/tri3b/decode_dev.rescored/wer_10_0.0
+%WER 87.01 [ 59822 / 68755, 1508 ins, 20885 del, 37429 sub ] exp/tri3b/decode_dev/wer_11_0.0
+%WER 87.23 [ 59974 / 68755, 1686 ins, 18873 del, 39415 sub ] exp/tri3b/decode_dev.si/wer_10_0.0
+%WER 87.57 [ 60209 / 68755, 1325 ins, 21282 del, 37602 sub ] exp/tri2b/decode_dev.rescored/wer_11_0.0
+%WER 87.59 [ 60225 / 68755, 1133 ins, 21631 del, 37461 sub ] exp/tri2b/decode_dev/wer_10_0.5
+%WER 88.35 [ 60745 / 68755, 1359 ins, 20030 del, 39356 sub ] exp/tri2a/decode_dev.rescored/wer_11_0.0
+%WER 88.50 [ 60849 / 68755, 1469 ins, 18597 del, 40783 sub ] exp/tri1/decode_dev.rescored/wer_10_0.0
+%WER 88.53 [ 60866 / 68755, 1229 ins, 20752 del, 38885 sub ] exp/tri2a/decode_dev/wer_10_0.5
+%WER 88.59 [ 60909 / 68755, 1567 ins, 17986 del, 41356 sub ] exp/tri1/decode_dev/wer_10_0.0
+%WER 94.78 [ 65167 / 68755, 664 ins, 23336 del, 41167 sub ] exp/mono/decode_dev/wer_7_0.0
diff --git a/egs/mgb5/s5/cmd.sh b/egs/mgb5/s5/cmd.sh
new file mode 100644
index 00000000000..86240967f67
--- /dev/null
+++ b/egs/mgb5/s5/cmd.sh
@@ -0,0 +1,10 @@
+# "queue.pl" uses qsub.  The options to it are
+# options to qsub.  If you have GridEngine installed,
+# change this to a queue you have access to.
+# Otherwise, use "run.pl", which will run jobs locally
+# (make sure your --num-jobs options are no more than
+# the number of cpus on your machine).
+
+export train_cmd="slurm.pl --mem 6G --config conf/slurm.conf"
+export decode_cmd="slurm.pl  --config conf/slurm.conf"
+export cuda_cmd="slurm.pl --mem 6G --gpu 2 --config conf/slurm.conf"
diff --git a/egs/mgb5/s5/conf/decode.config b/egs/mgb5/s5/conf/decode.config
new file mode 100644
index 00000000000..10b0eee900b
--- /dev/null
+++ b/egs/mgb5/s5/conf/decode.config
@@ -0,0 +1,4 @@
+# Use wider-than-normal decoding beams (comment and values appear to be inherited from the RM recipe).
+first_beam=16.0
+beam=20.0
+lattice_beam=10.0
diff --git a/egs/mgb5/s5/conf/decode_dnn.config b/egs/mgb5/s5/conf/decode_dnn.config
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/egs/mgb5/s5/conf/mfcc.conf b/egs/mgb5/s5/conf/mfcc.conf
new file mode 100644
index 00000000000..7361509099f
--- /dev/null
+++ b/egs/mgb5/s5/conf/mfcc.conf
@@ -0,0 +1 @@
+--use-energy=false   # only non-default option.
diff --git a/egs/mgb5/s5/conf/mfcc_hires.conf b/egs/mgb5/s5/conf/mfcc_hires.conf
new file mode 100644
index 00000000000..434834a6725
--- /dev/null
+++ b/egs/mgb5/s5/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why 
+# we prefer this method.
+--use-energy=false   # use average of log energy, not energy.
+--num-mel-bins=40     # similar to Google's setup.
+--num-ceps=40     # there is no dimensionality reduction.
+--low-freq=20     # low cutoff frequency for mel bins... this is high-bandwidth data, so
+                  # there might be some information at the low end.
+--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600)
diff --git a/egs/mgb5/s5/conf/online_cmvn.conf b/egs/mgb5/s5/conf/online_cmvn.conf
new file mode 100644
index 00000000000..7748a4a4dd3
--- /dev/null
+++ b/egs/mgb5/s5/conf/online_cmvn.conf
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
diff --git a/egs/mgb5/s5/conf/slurm.conf b/egs/mgb5/s5/conf/slurm.conf
new file mode 100644
index 00000000000..2cc4052a0a9
--- /dev/null
+++ b/egs/mgb5/s5/conf/slurm.conf
@@ -0,0 +1,10 @@
+command sbatch --export=PATH --ntasks-per-node=1 --partition=cpu 
+option mem=* --mem-per-cpu=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* --cpus-per-task=$0 --ntasks-per-node=1
+option num_threads=1 --cpus-per-task=1  --ntasks-per-node=1 # Do not add anything to qsub_opts
+option max_jobs_run=*     # Do nothing
+option gpu=* -N1 -n1 -p gpu --mem=4GB --gres=gpu:$0 --cpus-per-task=6 --time=72:0:0  # in reality, we probably should have --cpus-per-task=$((6*$0))
+option gpu=0
+
+
diff --git a/egs/mgb5/s5/local/chain/run_tdnn.sh b/egs/mgb5/s5/local/chain/run_tdnn.sh
new file mode 120000
index 00000000000..34499362831
--- /dev/null
+++ b/egs/mgb5/s5/local/chain/run_tdnn.sh
@@ -0,0 +1 @@
+tuning/run_tdnn_1a.sh
\ No newline at end of file
diff --git a/egs/mgb5/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/mgb5/s5/local/chain/tuning/run_tdnn_1a.sh
new file mode 100644
index 00000000000..6300511e817
--- /dev/null
+++ b/egs/mgb5/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -0,0 +1,264 @@
+#!/bin/bash
+
+
+# Copyright 2017-2018  Johns Hopkins University (author: Daniel Povey)
+#           2017-2018  Yiming Wang
+
+# The script is copied from egs/iban
+# 1a is trying an architecture with factored parameter matrices with dropout.
+
+# grep WER exp/chain/tdnn_1a/decode_dev/wer_10_0.0
+# %WER 75.59 [ 51973 / 68755, 1993 ins, 19098 del, 30882 sub ] 
+
+
+# steps/info/chain_dir_info.pl exp/chain/tdnn_1a
+# exp/chain/tdnn_1a: num-iters=38 nj=2..5 num-params=12.6M dim=40+50->1592 combine=-0.069->-0.067 (over 2) xent:train/valid[24,37,final]=(-1.41,-1.18,-1.12/-1.68,-1.54,-1.47) logprob:train/valid[24,37,final]=(-0.071,-0.057,-0.053/-0.124,-0.122,-0.121)
+
+set -e -o pipefail
+
+# First the options that are passed through to run_ivector_common.sh
+# (some of which are also used in this script directly).
+stage=0
+nj=30  # NOTE(review): not referenced again in this script; job counts below are set explicitly.
+train_set=train
+test_sets="dev"
+gmm=tri3b
+
+# Options which are not passed through to run_ivector_common.sh
+affix=1a   # affix for the TDNN directory (exp/chain/tdnn_$affix), e.g. "1a" or "1b", in case we change the configuration.
+common_egs_dir=
+reporting_email=
+
+# chain training options
+train_stage=-10
+get_egs_stage=-10  # NOTE(review): not referenced again in this script.
+xent_regularize=0.1
+
+# training chunk-options
+chunk_width=140,100,160
+# we don't need extra left/right context for TDNN systems.
+chunk_left_context=0  # NOTE(review): the train/decode commands below hard-code 0 rather than reading this.
+chunk_right_context=0  # NOTE(review): the train/decode commands below hard-code 0 rather than reading this.
+dropout_schedule='0,0@0.20,0.3@0.50,0'
+num_epochs=15
+
+# training options
+srand=0
+remove_egs=true
+
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+local/nnet3/run_ivector_common.sh --stage $stage \
+                                  --train-set $train_set --test-sets "$test_sets" \
+                                  --gmm $gmm || exit 1;
+
+
+gmm_dir=exp/$gmm
+ali_dir=exp/${gmm}_ali_${train_set}_sp
+tree_dir=exp/chain/tree_sp
+lang=data/lang_chain
+lat_dir=exp/chain/${gmm}_${train_set}_sp_lats
+dir=exp/chain/tdnn_${affix}
+train_data_dir=data/${train_set}_sp_hires
+train_ivector_dir=exp/nnet3/ivectors_${train_set}_sp_hires
+lores_train_data_dir=data/${train_set}_sp
+
+for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
+    $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+if [ $stage -le 9 ]; then
+  echo "$0: creating lang directory $lang with chain-type topology"
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  if [ -d $lang ]; then
+    if [ $lang/L.fst -nt data/lang_test/L.fst ]; then
+      echo "$0: $lang already exists, not overwriting it; continuing"
+    else
+      echo "$0: $lang already exists and seems to be older than data/lang_test ..."
+      echo " ... not sure what to do.  Exiting."
+      exit 1;
+    fi
+  else
+    cp -r data/lang_test $lang
+    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+    # Use our special topology... note that later on may have to tune this
+    # topology.
+    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+  fi
+fi
+
+if [ $stage -le 10 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # NOTE(review): --nj is hard-coded to 50 here; the nj option above is not used.
+  steps/align_fmllr_lats.sh --nj 50 --cmd "$train_cmd" ${lores_train_data_dir} \
+    data/lang $gmm_dir $lat_dir
+  rm $lat_dir/fsts.*.gz # save space
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.  We know we have alignments for the
+  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
+  # those.  The num-leaves is always somewhat less than the num-leaves from
+  # the GMM baseline.
+   if [ -f $tree_dir/final.mdl ]; then
+     echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+     exit 1;
+  fi
+  steps/nnet3/chain/build_tree.sh \
+    --frame-subsampling-factor 3 \
+    --context-opts "--context-width=2 --central-position=1" \
+    --cmd "$train_cmd" 3500 ${lores_train_data_dir} \
+    $lang $ali_dir $tree_dir
+fi
+
+
+if [ $stage -le 12 ]; then
+  mkdir -p $dir
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
+  opts="l2-regularize=0.08 dropout-per-dim-continuous=true"
+  output_opts="l2-regularize=0.02 bottleneck-dim=256"
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=50 name=ivector
+  input dim=40 name=input
+
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-dropout-layer name=tdnn1 $opts dim=768
+  relu-batchnorm-dropout-layer name=tdnn2 $opts dim=768 input=Append(-1,0,1)
+  relu-batchnorm-dropout-layer name=tdnn3 $opts dim=768
+  relu-batchnorm-dropout-layer name=tdnn4 $opts dim=768 input=Append(-1,0,1)
+  relu-batchnorm-dropout-layer name=tdnn5 $opts dim=768
+  relu-batchnorm-dropout-layer name=tdnn6 $opts dim=768 input=Append(-3,0,3)
+  relu-batchnorm-dropout-layer name=tdnn7 $opts dim=768 input=Append(-3,0,3)
+  relu-batchnorm-dropout-layer name=tdnn8 $opts dim=768 input=Append(-6,-3,0)
+
+  ## adding the layers for chain branch
+  relu-batchnorm-layer name=prefinal-chain $opts dim=768
+  output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5
+
+  # adding the layers for xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=768
+  output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+
+if [ $stage -le 13 ]; then
+
+  steps/nnet3/chain/train.py --stage=$train_stage \
+    --cmd="$decode_cmd" \
+    --feat.online-ivector-dir=$train_ivector_dir \
+    --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient=0.1 \
+    --chain.l2-regularize=0.00005 \
+    --chain.apply-deriv-weights=false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --trainer.dropout-schedule $dropout_schedule \
+    --trainer.add-option="--optimization.memory-compression-level=2" \
+    --trainer.srand=$srand \
+    --trainer.max-param-change=2.0 \
+    --trainer.num-epochs=$num_epochs \
+    --trainer.frames-per-iter=3000000 \
+    --trainer.optimization.num-jobs-initial=2 \
+    --trainer.optimization.num-jobs-final=5 \
+    --trainer.optimization.initial-effective-lrate=0.001 \
+    --trainer.optimization.final-effective-lrate=0.0001 \
+    --trainer.num-chunk-per-minibatch=256,128,64 \
+    --trainer.optimization.momentum=0.0 \
+    --egs.chunk-width=$chunk_width \
+    --egs.chunk-left-context=0 \
+    --egs.chunk-right-context=0 \
+    --egs.chunk-left-context-initial=0 \
+    --egs.chunk-right-context-final=0 \
+    --egs.dir="$common_egs_dir" \
+    --egs.opts="--frames-overlap-per-eg 0" \
+    --cleanup.remove-egs=$remove_egs \
+    --use-gpu=true \
+    --reporting.email="$reporting_email" \
+    --feat-dir=$train_data_dir \
+    --tree-dir=$tree_dir \
+    --lat-dir=$lat_dir \
+    --dir=$dir  || exit 1;
+fi
+
+if [ $stage -le 14 ]; then
+  # Note: it's not important to give mkgraph.sh the lang directory with the
+  # matched topology (since it gets the topology file from the model).
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 data/lang_test \
+    $tree_dir $tree_dir/graph || exit 1;
+fi
+
+if [ $stage -le 15 ]; then
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  rm $dir/.error 2>/dev/null || true
+
+  for data in $test_sets; do
+    (
+      nspk=$(wc -l <data/${data}_hires/spk2utt)  
+      steps/nnet3/decode.sh \
+          --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 0 --extra-right-context 0 \
+          --extra-left-context-initial 0 \
+          --extra-right-context-final 0 \
+          --frames-per-chunk $frames_per_chunk \
+          --nj $nspk --cmd "$decode_cmd"  --num-threads 4 \
+          --online-ivector-dir exp/nnet3/ivectors_${data}_hires \
+          $tree_dir/graph data/${data}_hires ${dir}/decode_${data} || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+if [ $stage -le 16 ]; then
+  for data in $test_sets; do
+    (
+      steps/lmrescore_const_arpa.sh  --cmd "$decode_cmd" \
+        data/lang_test/ data/lang_big/ data/${data} \
+        ${dir}/decode_${data} ${dir}/decode_${data}.rescored
+    )
+  done
+  wait
+fi
+
+exit 0;
diff --git a/egs/mgb5/s5/local/nnet3/run_ivector_common.sh b/egs/mgb5/s5/local/nnet3/run_ivector_common.sh
new file mode 100644
index 00000000000..b909ed04cde
--- /dev/null
+++ b/egs/mgb5/s5/local/nnet3/run_ivector_common.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+
+set -euo pipefail
+
+# This script is called from local/nnet3/run_tdnn.sh and
+# local/chain/run_tdnn.sh (and may eventually be called by more
+# scripts).  It contains the common feature preparation and
+# iVector-related parts of the script.  See those scripts for examples
+# of usage.
+
+stage=0
+train_set=train
+test_sets="dev"
+gmm=tri3b
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+gmm_dir=exp/${gmm}
+ali_dir=exp/${gmm}_ali_${train_set}_sp
+
+for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do
+  if [ ! -f $f ]; then
+    echo "$0: expected file $f to exist"
+    exit 1
+  fi
+done
+
+if [ $stage -le 1 ]; then
+  # Although the nnet will be trained on high-resolution data, we still have to
+  # perturb the normal data to get the alignments.  _sp stands for speed-perturbed.
+  echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)"
+  utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp
+  echo "$0: making MFCC features for low-resolution speed-perturbed data"
+  steps/make_mfcc.sh --cmd "$train_cmd" --nj 17 data/${train_set}_sp || exit 1;
+  steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1;
+  utils/fix_data_dir.sh data/${train_set}_sp
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: aligning with the perturbed low-resolution data"
+  steps/align_fmllr.sh --nj 16 --cmd "$train_cmd" \
+    data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1
+fi
+
+if [ $stage -le 3 ]; then
+  # Create high-resolution MFCC features (with 40 cepstra instead of 13).
+  # this shows how you can split across multiple file-systems.
+  echo "$0: creating high-resolution MFCC features"
+  mfccdir=data/${train_set}_sp_hires/data
+
+  for datadir in ${train_set}_sp ${test_sets}; do
+    utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
+  done
+
+  # do volume-perturbation on the training data prior to extracting hires
+  # features; this helps make trained nnets more invariant to test data volume.
+  #utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1;
+
+  for datadir in ${train_set}_sp ${test_sets}; do
+    steps/make_mfcc.sh --nj 16 --mfcc-config conf/mfcc_hires.conf \
+     --cmd "$train_cmd" data/${datadir}_hires || exit 1;
+    steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1;
+    utils/fix_data_dir.sh data/${datadir}_hires || exit 1;
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  # Train a small system just for its LDA+MLLT transform.  We use --num-iters 13
+  # because after we get the transform (12th iter is the last), any further
+  # training is pointless.
+  steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
+    --realign-iters "" --splice-opts "--left-context=3 --right-context=3" \
+    5000 10000 data/${train_set}_sp_hires data/lang \
+     $ali_dir exp/nnet3/tri5b || exit 1
+fi
+
+if [ $stage -le 5 ]; then
+  echo "$0: training the diagonal UBM."
+  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 16  --num-frames 200000 \
+     data/${train_set}_sp_hires 256 exp/nnet3/tri5b exp/nnet3/diag_ubm || exit 1
+fi
+
+if [ $stage -le 6 ]; then
+  # Train the iVector extractor.  Use all of the speed-perturbed data since iVector extractors
+  # can be sensitive to the amount of data.  The iVector dimension is 50.
+  # Even though --nj is just 10, each job uses multiple processes and threads.
+  echo "$0: training the iVector extractor"
+  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" \
+    --nj 10 --num-processes 1 --num-threads 2 --ivector-dim 50 \
+    data/${train_set}_sp_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1;
+fi
+
+if [ $stage -le 7 ]; then
+  # We extract iVectors on the speed-perturbed training data after combining
+  # short segments, which will be what we train the system on.  With
+  # --utts-per-spk-max 2, the script pairs the utterances into twos, and treats
+  # each of these pairs as one speaker; this gives more diversity in iVectors..
+  # Note that these are extracted 'online'.
+
+  # note, we don't encode the 'max2' in the name of the ivectordir even though
+  # that's the data we extract the ivectors from, as it's still going to be
+  # valid for the non-'max2' data, the utterance list is the same.
+
+  ivectordir=exp/nnet3/ivectors_${train_set}_sp_hires
+
+  # having a larger number of speakers is helpful for generalization, and to
+  # handle per-utterance decoding well (iVector starts at zero).
+  temp_data_root=${ivectordir}
+  utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \
+    data/${train_set}_sp_hires ${temp_data_root}/${train_set}_sp_hires_max2
+
+  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 16 \
+    ${temp_data_root}/${train_set}_sp_hires_max2 \
+    exp/nnet3/extractor $ivectordir
+
+  # Also extract iVectors for the test data, but in this case we don't need the speed
+  # perturbation (sp).
+  for data in $test_sets; do
+    steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 6 \
+      data/${data}_hires exp/nnet3/extractor exp/nnet3/ivectors_${data}_hires
+  done
+fi
+
+exit 0;
diff --git a/egs/mgb5/s5/local/prepare_data.sh b/egs/mgb5/s5/local/prepare_data.sh
new file mode 100755
index 00000000000..36cb4d8fa3f
--- /dev/null
+++ b/egs/mgb5/s5/local/prepare_data.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Copyright 2019 QCRI (Author: Ahmed Ali)
+# Apache 2.0
+
+set -e -o pipefail
+
+
+###
+# The script assumes you have downloaded the MGB-5 corpus (https://arabicspeech.org/mgb5)
+# and placed the archives in DB/{dev.tar.gz,train.tar.gz}
+###
+echo "Preparing train and dev data"
+
+if [[ ! -e "DB/train.tar.gz" || ! -e "DB/dev.tar.gz" ]]; then
+  echo "You need to download the MGB-5 first and copy dev.tar.gz and train.tar.gz to DB folder"
+  echo "check: https://arabicspeech.org/mgb5"
+  exit 1
+fi
+
+# We will extract data again even if you did this before.
+(cd DB; rm -fr train dev;for x in *; do tar -xvf $x; done)
+
+mkdir -p data/local data/train data/dev
+
+for x in train dev; do
+    sed -e 's:UNK: :g' -e 's:  : :g' DB/$x/$x.txt.bw > data/$x/text #removing words that annotators couldn't understand
+    cp DB/$x/$x.segments.bw data/$x/segments
+    awk '{print $1 " " $1}' DB/$x/$x.segments.bw > data/$x/spk2utt
+    cp data/$x/spk2utt data/$x/utt2spk 
+    find $PWD/DB/$x/ -name \*.wav | while read wav; do
+        id=$(basename $wav | sed 's:.wav::')
+        echo $id $wav
+    done | sort -u > data/$x/wav.scp
+    utils/fix_data_dir.sh data/$x
+done
+
+
+echo "Data preparation completed."
+
diff --git a/egs/mgb5/s5/local/prepare_dict.sh b/egs/mgb5/s5/local/prepare_dict.sh
new file mode 100755
index 00000000000..5ea0938af90
--- /dev/null
+++ b/egs/mgb5/s5/local/prepare_dict.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+# Copyright 2019 QCRI (author: Ahmed Ali)
+# Apache 2.0
+# This script prepares the grapheme dictionary
+
+set -e
+dir=data/local/dict
+lexicon_url1="https://arabicspeech.org/static/data_resources/ar-ar_grapheme_lexicon_20160209.bz2";
+lexicon_url2="https://arabicspeech.org/static/data_resources/ar-ar_phoneme_lexicon_20140317.bz2";
+stage=0
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh || exit 1;
+mkdir -p $dir data/local/lexicon_data
+
+if [ $stage -le 0 ]; then
+  echo "$0: Downloading text for lexicon... $(date)."
+  if [ ! -f data/local/lexicon_data/ar-ar_grapheme_lexicon_20160209.bz2 ]; then
+    wget -P data/local/lexicon_data $lexicon_url1
+  else
+    echo "data/local/lexicon_data/ar-ar_grapheme_lexicon_20160209.bz2 already exist on disk"
+  fi 
+  
+  if [ ! -f data/local/lexicon_data/ar-ar_phoneme_lexicon_20140317.bz2 ]; then
+    wget -P data/local/lexicon_data $lexicon_url2
+  else
+    echo "data/local/lexicon_data/ar-ar_phoneme_lexicon_20140317.bz2 already exist on disk"
+  fi 
+  
+  rm -fr data/local/lexicon_data/grapheme_lexicon
+  for dict in ar-ar_grapheme_lexicon_20160209.bz2 ar-ar_phoneme_lexicon_20140317.bz2; do
+    bzcat data/local/lexicon_data/$dict | sed '1,3d' | \
+    awk '{print $1}'  >>  data/local/lexicon_data/grapheme_lexicon
+  done
+  cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | grep -v UNK |  sort -u >> data/local/lexicon_data/grapheme_lexicon
+fi
+
+
+if [ $stage -le 1 ]; then
+  echo "$0: processing lexicon text and creating lexicon... $(date)."
+  # drop entries containing digits, then remove vowels and rare alef wasla
+  grep -v '[0-9]' data/local/lexicon_data/grapheme_lexicon |  sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon
+  local/prepare_lexicon.py
+fi
+
+cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1;
+
+sed -i '1i<UNK> UNK' $dir/lexicon.txt
+
+echo UNK >> $dir/nonsilence_phones.txt
+
+echo '<sil> SIL' >> $dir/lexicon.txt
+
+echo SIL > $dir/silence_phones.txt
+
+echo SIL >$dir/optional_silence.txt
+
+echo -n "" >$dir/extra_questions.txt
+
+echo "$0: Dictionary preparation succeeded"
diff --git a/egs/mgb5/s5/local/prepare_lexicon.py b/egs/mgb5/s5/local/prepare_lexicon.py
new file mode 100755
index 00000000000..215541585eb
--- /dev/null
+++ b/egs/mgb5/s5/local/prepare_lexicon.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+
+# Copyright      2018  Ashish Arora
+# Apache 2.0
+
+# This script prepares lexicon.
+
+import argparse
+import os
+
+parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""")
+args = parser.parse_args()
+
+### main ###
+lex = {}
+text_path = os.path.join('data','local', 'lexicon_data', 'processed_lexicon')
+with open(text_path, 'r', encoding='utf-8') as f:
+    for line in f:
+        line = line.strip()
+        characters = list(line)
+        characters = " ".join(['V' if char == '*' else char for char in characters])
+        lex[line] = characters
+
+with open(os.path.join('data','local','dict', 'lexicon.txt'), 'w', encoding='utf-8') as fp:
+    for key in sorted(lex):
+        fp.write(key + "  " + lex[key] + "\n")
diff --git a/egs/mgb5/s5/local/prepare_lm.sh b/egs/mgb5/s5/local/prepare_lm.sh
new file mode 100755
index 00000000000..02fb59aba87
--- /dev/null
+++ b/egs/mgb5/s5/local/prepare_lm.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright 2019  QCRI (Author: Ahmed Ali)
+# Apache 2.0
+
+set -e -o pipefail
+
+# To create G.fst from ARPA language model
+. ./path.sh || die "path.sh expected";
+
+local/train_lms_srilm.sh --train-text data/train/text data/ data/srilm
+
+# for basic decoding, let's use only a trigram LM
+[ -d data/lang_test/ ] && rm -rf data/lang_test
+cp -R data/lang data/lang_test
+lm=data/srilm/3gram.me.gz
+utils/format_lm.sh data/lang_test $lm data/local/dict/lexicon.txt data/lang_test
+
+# for decoding with a bigger LM, we build a 4-gram from the same transcription text
+[ -d data/lang_big ] && rm -rf data/lang_big
+cp -R data/lang data/lang_big
+lm=data/srilm/4gram.me.gz
+utils/format_lm.sh data/lang_big $lm data/local/dict/lexicon.txt data/lang_big
+
+utils/build_const_arpa_lm.sh $lm data/lang_big data/lang_big
+exit 0;
diff --git a/egs/mgb5/s5/local/score.sh b/egs/mgb5/s5/local/score.sh
new file mode 100755
index 00000000000..9988c941441
--- /dev/null
+++ b/egs/mgb5/s5/local/score.sh
@@ -0,0 +1,149 @@
+#!/bin/bash
+# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey, Yenda Trmal)
+# Apache 2.0
+
+# See the script steps/scoring/score_kaldi_cer.sh in case you need to evaluate CER
+
+[ -f ./path.sh ] && . ./path.sh
+
+# begin configuration section.
+cmd=run.pl
+stage=0
+decode_mbr=false
+stats=true
+beam=6
+word_ins_penalty=0.0,0.5,1.0
+min_lmwt=7
+max_lmwt=17
+iter=final
+#end configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+[ -f ./path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
+  echo " Options:"
+  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
+  echo "    --stage (0|1|2)                 # start scoring script from part-way through."
+  echo "    --decode_mbr (true/false)       # maximum bayes risk decoding (confusion network)."
+  echo "    --min_lmwt <int>                # minumum LM-weight for lattice rescoring "
+  echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
+  exit 1;
+fi
+
+data=$1
+lang_or_graph=$2
+dir=$3
+
+symtab=$lang_or_graph/words.txt
+
+for f in $symtab $dir/lat.1.gz $data/text; do
+  [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
+done
+
+
+ref_filtering_cmd="cat"
+[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter"
+[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter"
+hyp_filtering_cmd="cat"
+[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter"
+[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter"
+
+
+if $decode_mbr ; then
+  echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty"
+else
+  echo "$0: scoring with word insertion penalty=$word_ins_penalty"
+fi
+
+
+mkdir -p $dir/scoring_kaldi
+cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1;
+if [ $stage -le 0 ]; then
+
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    mkdir -p $dir/scoring_kaldi/penalty_$wip/log
+
+    if $decode_mbr ; then
+      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
+        acwt=\`perl -e \"print 1.0/LMWT\"\`\; \
+        lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+        lattice-prune --beam=$beam ark:- ark:- \| \
+        lattice-mbr-decode  --word-symbol-table=$symtab \
+        ark:- ark,t:- \| \
+        utils/int2sym.pl -f 2- $symtab \| \
+        $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;
+
+    else
+      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
+        lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+        lattice-best-path --word-symbol-table=$symtab ark:- ark,t:- \| \
+        utils/int2sym.pl -f 2- $symtab \| \
+        $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;
+    fi
+
+    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \
+      cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \
+      compute-wer --text --mode=present \
+      ark:$dir/scoring_kaldi/test_filt.txt  ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1;
+
+  done
+fi
+
+
+
+if [ $stage -le 1 ]; then
+
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    for lmwt in $(seq $min_lmwt $max_lmwt); do
+      # adding /dev/null to the command list below forces grep to output the filename
+      grep WER $dir/wer_${lmwt}_${wip} /dev/null
+    done
+  done | utils/best_wer.sh  >& $dir/scoring_kaldi/best_wer || exit 1
+
+  best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer)
+  best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}')
+  best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}')
+
+  if [ -z "$best_lmwt" ]; then
+    echo "$0: we could not get the details of the best WER from the file $dir/wer_*.  Probably something went wrong."
+    exit 1;
+  fi
+
+  if $stats; then
+    mkdir -p $dir/scoring_kaldi/wer_details
+    echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight
+    echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty
+
+    $cmd $dir/scoring_kaldi/log/stats1.log \
+      cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \
+      align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \|  \
+      utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\
+       utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1;
+
+    $cmd $dir/scoring_kaldi/log/stats2.log \
+      cat $dir/scoring_kaldi/wer_details/per_utt \| \
+      utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \
+      sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1;
+
+    $cmd $dir/scoring_kaldi/log/wer_bootci.log \
+      compute-wer-bootci --mode=present \
+        ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \
+        '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1;
+
+  fi
+fi
+
+# If we got here, the scoring was successful.
+# As a small aid to prevent confusion, we remove all wer_{?,??} files;
+# these originate from the previous version of the scoring files.
+# Both rm statements are kept because leftover files could cause confusion about
+# the capabilities of the script (we don't compute CER in this script).
+rm $dir/wer_{?,??} 2>/dev/null
+rm $dir/cer_{?,??} 2>/dev/null
+
+exit 0;
diff --git a/egs/mgb5/s5/local/train_lms_srilm.sh b/egs/mgb5/s5/local/train_lms_srilm.sh
new file mode 100755
index 00000000000..6af13921511
--- /dev/null
+++ b/egs/mgb5/s5/local/train_lms_srilm.sh
@@ -0,0 +1,233 @@
+#!/bin/bash
+
+#      2019 QCRI (Ahmed Ali)
+
+export LC_ALL=C
+
+words_file=
+train_text=
+dev_text=
+oov_symbol="<UNK>"
+
+echo "$0 $@"
+
+[ -f path.sh ]  && . ./path.sh
+. ./utils/parse_options.sh || exit 1
+
+echo "-------------------------------------"
+echo "Building an SRILM language model     "
+echo "-------------------------------------"
+
+if [ $# -ne 2 ] ; then  # exactly two positional arguments are required: <datadir> <tgtdir>
+  echo "Incorrect number of parameters. "
+  echo "Script has to be called like this:"
+  echo "  $0 [switches] <datadir> <tgtdir>"
+  echo "For example: "
+  echo "  $0 data data/srilm"
+  echo "The allowed switches are: "
+  echo "    words_file=<word_file|>        word list file -- <datadir>/lang/words.txt by default"
+  echo "    train_text=<train_text|>       <datadir>/train/text is used in case when not specified"
+  echo "    dev_text=<dev_text|>           last 10 % of train_text if that is given, else <datadir>/dev2h/text"
+  echo "    oov_symbol=<unk_symbol|<UNK>>  symbol to use for oov modeling -- <UNK> by default"
+  exit 1
+fi
+
+datadir=$1
+tgtdir=$2
+outlm=lm.gz
+
+
+##End of configuration
+loc=`which ngram-count`;
+if [ -z $loc ]; then
+  if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
+    sdir=`pwd`/../../../tools/srilm/bin/i686-m64
+  else
+    sdir=`pwd`/../../../tools/srilm/bin/i686
+  fi
+  if [ -f $sdir/ngram-count ]; then
+    echo Using SRILM tools from $sdir
+    export PATH=$PATH:$sdir
+  else
+    echo You appear to not have SRILM tools installed, either on your path,
+    echo or installed in $sdir.  See tools/install_srilm.sh for installation
+    echo instructions.
+    exit 1
+  fi
+fi
+
+# Prepare the destination directory
+mkdir -p $tgtdir
+
+for f in $words_file $train_text $dev_text; do
+  [ ! -s $f ] && echo "No such file $f" && exit 1;
+done
+
+[ -z $words_file ] && words_file=$datadir/lang/words.txt
+if [ ! -z "$train_text" ] && [ -z "$dev_text" ] ; then
+  nr=`cat  $train_text | wc -l`
+  nr_dev=$(($nr / 10 ))
+  nr_train=$(( $nr - $nr_dev ))
+  orig_train_text=$train_text
+  head -n $nr_train $train_text > $tgtdir/train_text
+  tail -n $nr_dev $train_text > $tgtdir/dev_text
+
+  train_text=$tgtdir/train_text
+  dev_text=$tgtdir/dev_text
+  echo "Using words file: $words_file"
+  echo "Using train text: 9/10 of $orig_train_text"
+  echo "Using dev text  : 1/10 of $orig_train_text"
+elif [ ! -z "$train_text" ] && [ ! -z "$dev_text" ] ; then
+  echo "Using words file: $words_file"
+  echo "Using train text: $train_text"
+  echo "Using dev text  : $dev_text"
+  train_text=$train_text
+  dev_text=$dev_text
+else
+  train_text=$datadir/train/text
+  dev_text=$datadir/dev2h/text
+  echo "Using words file: $words_file"
+  echo "Using train text: $train_text"
+  echo "Using dev text  : $dev_text"
+fi
+
+
+
+# Extract the word list from the training dictionary; exclude special symbols
+sort $words_file | awk '{print $1}' | grep -v '\#0' | grep -v '<eps>' | grep -v -F "$oov_symbol" > $tgtdir/vocab
+if (($?)); then
+  echo "Failed to create vocab from $words_file"
+  exit 1
+else
+  # wc vocab # doesn't work due to some encoding issues
+  echo vocab contains `cat $tgtdir/vocab | perl -ne 'BEGIN{$l=$w=0;}{split; $w+=$#_; $w++; $l++;}END{print "$l lines, $w words\n";}'`
+fi
+
+# Kaldi transcript files contain Utterance_ID as the first word; remove it
+cat $train_text | cut -f2- -d' ' > $tgtdir/train.txt
+if (($?)); then
+    echo "Failed to create $tgtdir/train.txt from $train_text"
+    exit 1
+else
+    echo "Removed first word (uid) from every line of $train_text"
+    # wc text.train train.txt # doesn't work due to some encoding issues
+    echo $train_text contains `cat $train_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'`
+    echo train.txt contains `cat $tgtdir/train.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'`
+fi
+
+# Kaldi transcript files contain Utterance_ID as the first word; remove it
+cat $dev_text | cut -f2- -d' ' > $tgtdir/dev.txt
+if (($?)); then
+    echo "Failed to create $tgtdir/dev.txt from $dev_text"
+    exit 1
+else
+    echo "Removed first word (uid) from every line of $dev_text"
+    # wc text.train train.txt # doesn't work due to some encoding issues
+    echo $dev_text contains `cat $dev_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'`
+    echo $tgtdir/dev.txt contains `cat $tgtdir/dev.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F;  $s++;}END{print "$w words, $s sentences\n";}'`
+fi
+
+echo "-------------------"
+echo "Good-Turing 2grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/2gram.gt01.gz -gt1min 0 -gt2min 1 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/2gram.gt02.gz -gt1min 0 -gt2min 2 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+echo "-------------------"
+echo "Kneser-Ney 2grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/2gram.kn01.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/2gram.kn02.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+echo "-------------------"
+echo "Good-Turing 3grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/3gram.gt011.gz -gt1min 0 -gt2min 1 -gt3min 1 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.gt012.gz -gt1min 0 -gt2min 1 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.gt022.gz -gt1min 0 -gt2min 2 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.gt023.gz -gt1min 0 -gt2min 2 -gt3min 3 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+echo "-------------------"
+echo "Kneser-Ney 3grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/3gram.kn011.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.kn012.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.kn022.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.kn023.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+
+echo "-------------------"
+echo "Good-Turing 4grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/4gram.gt0111.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.gt0112.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.gt0122.gz -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.gt0123.gz -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.gt0113.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.gt0222.gz -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.gt0223.gz -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+echo "-------------------"
+echo "Kneser-Ney 4grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/4gram.kn0111.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 1 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0112.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0113.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0122.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0123.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0222.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0223.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+if [ ! -z ${LIBLBFGS} ]; then
+  #Please note that if the switch -map-unk "$oov_symbol" is used with -maxent-convert-to-arpa, ngram-count will segfault.
+  #Instead, we use sed to rewrite the oov symbol to <unk> in the training text and to map it back in the ARPA output.
+  echo "-------------------"
+  echo "Maxent 2grams"
+  echo "-------------------"
+  sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \
+    ngram-count -lm - -order 2 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\
+    sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/2gram.me.gz || exit 1
+
+  echo "-------------------"
+  echo "Maxent 3grams"
+  echo "-------------------"
+  sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \
+    ngram-count -lm - -order 3 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\
+    sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/3gram.me.gz || exit 1
+
+  echo "-------------------"
+  echo "Maxent 4grams"
+  echo "-------------------"
+  sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \
+    ngram-count -lm - -order 4 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\
+    sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/4gram.me.gz || exit 1
+
+fi
+
+
+echo "--------------------"
+echo "Computing perplexity"
+echo "--------------------"
+(
+  for f in $tgtdir/2gram* ; do ( echo $f; ngram -order 2 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' - ; done
+  for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' - ; done
+  for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' - ; done
+)  | sort  -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt
+
+echo "The perplexity scores report is stored in $tgtdir/perplexities.txt "
+
+#This will link the lowest perplexity LM as the output LM.
+#ln -sf $tgtdir/`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '` $outlm
+
+#A slight modification of the previous approach:
+#We look at the two lowest-perplexity LMs and use a 3gram LM if it is one of the two, even if the other LM (e.g. a 4gram) has lower ppl
+nof_trigram_lm=`head -n 2 $tgtdir/perplexities.txt | grep 3gram | wc -l`
+if [[ $nof_trigram_lm -eq 0 ]] ; then
+  lmfilename=`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '`
+elif [[ $nof_trigram_lm -eq 2 ]] ; then
+  lmfilename=`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '`
+else  #exactly one 3gram LM
+  lmfilename=`head -n 2 $tgtdir/perplexities.txt | grep 3gram | cut -f 1 -d ' '`
+fi
+(cd $tgtdir; ln -sf `basename $lmfilename` $outlm )
+
diff --git a/egs/mgb5/s5/path.sh b/egs/mgb5/s5/path.sh
new file mode 100644
index 00000000000..ebc3e1f4ee0
--- /dev/null
+++ b/egs/mgb5/s5/path.sh
@@ -0,0 +1,8 @@
+export KALDI_ROOT=`pwd`/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
+export CUDA_CACHE_DISABLE=1
+
diff --git a/egs/mgb5/s5/run.sh b/egs/mgb5/s5/run.sh
new file mode 100755
index 00000000000..6fc21629f0f
--- /dev/null
+++ b/egs/mgb5/s5/run.sh
@@ -0,0 +1,190 @@
+#!/bin/bash
+
+# Copyright 2019 QCRI (Author:Ahmed Ali)
+# Apache 2.0
+
+
+stage=0
+
+# initialization PATH; abort with a message if path.sh cannot be sourced
+. ./path.sh || { echo "$0: path.sh expected" >&2; exit 1; }
+# initialization commands
+. ./cmd.sh
+. ./utils/parse_options.sh
+
+set -e -o pipefail
+
+
+nj=16
+dev_nj=16
+
+if [ $stage -le 1 ]; then
+  echo "Preparing data and training language models"
+  local/prepare_data.sh 
+  local/prepare_dict.sh 
+  utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang data/lang
+  local/prepare_lm.sh
+fi
+
+
+if [ $stage -le 2 ]; then
+  # Feature extraction
+  for x in train dev; do
+      steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/$x exp/make_mfcc/$x mfcc
+      steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x mfcc
+  done
+fi
+
+if [ $stage -le 3 ]; then
+  ### Monophone
+  echo "Starting monophone training."
+  utils/subset_data_dir.sh data/train 1000 data/train.1k
+  steps/train_mono.sh --nj $nj --cmd "$train_cmd" data/train.1k data/lang exp/mono
+  echo "Mono training done."
+
+  (
+  echo "Decoding the dev set using monophone models."
+  utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph
+
+  steps/decode.sh --config conf/decode.config --nj $dev_nj --cmd "$decode_cmd" \
+    exp/mono/graph data/dev exp/mono/decode_dev
+  echo "Monophone decoding done."
+  ) &
+fi
+
+
+if [ $stage -le 4 ]; then
+  ### Triphone
+  echo "Starting triphone training."
+  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
+      data/train data/lang exp/mono exp/mono_ali
+  steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd"  \
+      3200 30000 data/train data/lang exp/mono_ali exp/tri1
+  echo "Triphone training done."
+
+  (
+  echo "Decoding the dev set using triphone models."
+  utils/mkgraph.sh data/lang_test  exp/tri1 exp/tri1/graph
+  steps/decode.sh --nj $dev_nj --cmd "$decode_cmd"  \
+      exp/tri1/graph  data/dev exp/tri1/decode_dev
+
+  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+      data/lang_test/ data/lang_big/ data/dev \
+      exp/tri1/decode_dev exp/tri1/decode_dev.rescored
+  echo "Triphone decoding done."
+  ) &
+fi
+
+if [ $stage -le 5 ]; then
+  ## Triphones + delta delta
+  # Training
+  echo "Starting (larger) triphone training."
+  steps/align_si.sh --nj $nj --cmd "$train_cmd" --use-graphs true \
+       data/train data/lang exp/tri1 exp/tri1_ali
+  steps/train_deltas.sh --cmd "$train_cmd"  \
+      4200 40000 data/train data/lang exp/tri1_ali exp/tri2a
+  echo "Triphone (large) training done."
+
+  (
+  echo "Decoding the dev set using triphone(large) models."
+  utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph
+  steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \
+      exp/tri2a/graph data/dev exp/tri2a/decode_dev
+
+  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+      data/lang_test/ data/lang_big/ data/dev \
+      exp/tri2a/decode_dev exp/tri2a/decode_dev.rescored
+  echo "Triphone(large) decoding done."
+  ) &
+fi
+
+if [ $stage -le 6 ]; then
+  ### Triphone + LDA and MLLT
+  # Training
+  echo "Starting LDA+MLLT training."
+  steps/align_si.sh --nj $nj --cmd "$train_cmd"  \
+      data/train data/lang exp/tri2a exp/tri2a_ali
+
+  steps/train_lda_mllt.sh --cmd "$train_cmd"  \
+    --splice-opts "--left-context=3 --right-context=3" \
+    4200 40000 data/train data/lang exp/tri2a_ali exp/tri2b
+  echo "LDA+MLLT training done."
+
+  (
+  echo "Decoding the dev set using LDA+MLLT models."
+  utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph
+  steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \
+      exp/tri2b/graph data/dev exp/tri2b/decode_dev
+
+  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+      data/lang_test/ data/lang_big/ data/dev \
+      exp/tri2b/decode_dev exp/tri2b/decode_dev.rescored
+  echo "LDA+MLLT decoding done."
+  ) &
+fi
+
+
+if [ $stage -le 7 ]; then
+  ### Triphone + LDA and MLLT + SAT and FMLLR
+  # Training
+  echo "Starting SAT+FMLLR training."
+  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
+      --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali
+  steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
+      data/train data/lang exp/tri2b_ali exp/tri3b
+  echo "SAT+FMLLR training done."
+
+  (
+  echo "Decoding the dev set using SAT+FMLLR models."
+  utils/mkgraph.sh data/lang_test  exp/tri3b exp/tri3b/graph
+  steps/decode_fmllr.sh --nj $dev_nj --cmd "$decode_cmd" \
+      exp/tri3b/graph  data/dev exp/tri3b/decode_dev
+
+  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+      data/lang_test/ data/lang_big/ data/dev \
+      exp/tri3b/decode_dev exp/tri3b/decode_dev.rescored
+  echo "SAT+FMLLR decoding done."
+  ) &
+fi
+
+
+if [ $stage -le 8 ]; then
+  echo "Starting SGMM training."
+  steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
+      data/train data/lang exp/tri3b exp/tri3b_ali
+
+  steps/train_ubm.sh --cmd "$train_cmd"  \
+      600 data/train data/lang exp/tri3b_ali exp/ubm5b2
+
+  steps/train_sgmm2.sh --cmd "$train_cmd"  \
+       5200 12000 data/train data/lang exp/tri3b_ali exp/ubm5b2/final.ubm exp/sgmm2_5b2
+  echo "SGMM training done."
+
+  (
+  echo "Decoding the dev set using SGMM models"
+  # Graph compilation
+  utils/mkgraph.sh data/lang_test exp/sgmm2_5b2 exp/sgmm2_5b2/graph
+  utils/mkgraph.sh data/lang_big/ exp/sgmm2_5b2 exp/sgmm2_5b2/graph_big
+
+  steps/decode_sgmm2.sh --nj $dev_nj --cmd "$decode_cmd" \
+      --transform-dir exp/tri3b/decode_dev \
+      exp/sgmm2_5b2/graph data/dev exp/sgmm2_5b2/decode_dev
+
+  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+      data/lang_test/ data/lang_big/ data/dev \
+      exp/sgmm2_5b2/decode_dev exp/sgmm2_5b2/decode_dev.rescored
+
+  steps/decode_sgmm2.sh --nj $dev_nj --cmd "$decode_cmd" \
+      --transform-dir exp/tri3b/decode_dev \
+      exp/sgmm2_5b2/graph_big data/dev exp/sgmm2_5b2/decode_dev.big
+  echo "SGMM decoding done."
+  ) &
+fi
+
+wait;
+
+time bash -x ./local/chain/run_tdnn.sh &> chain_run_tdnn.log
+#score
+for x in exp/chain/*/decode* exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done | sort -k2 -n > RESULTS
+
+
diff --git a/egs/mgb5/s5/steps b/egs/mgb5/s5/steps
new file mode 120000
index 00000000000..1b186770dd1
--- /dev/null
+++ b/egs/mgb5/s5/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps/
\ No newline at end of file
diff --git a/egs/mgb5/s5/utils b/egs/mgb5/s5/utils
new file mode 120000
index 00000000000..a3279dc8679
--- /dev/null
+++ b/egs/mgb5/s5/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils/
\ No newline at end of file

From 5fbc9eb5ff13aec4d3aaf94f13d2c77ad37bb978 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Thu, 9 May 2019 12:50:20 -0400
Subject: [PATCH 103/163] Revert "[scripts] Clean up filehandle usage in
 split_scp.pl (#3285)" (#3307)

This reverts commit b17fc84472b9526b518c907148e74958f622135f.
---
 egs/wsj/s5/utils/split_scp.pl | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/egs/wsj/s5/utils/split_scp.pl b/egs/wsj/s5/utils/split_scp.pl
index 79c92faac5c..f60219938a0 100755
--- a/egs/wsj/s5/utils/split_scp.pl
+++ b/egs/wsj/s5/utils/split_scp.pl
@@ -90,17 +90,16 @@
 }
 
 if ($utt2spk_file ne "") {  # We have the --utt2spk option...
-    open($u_fh, '<', $utt2spk_file) || die "Failed to open utt2spk file $utt2spk_file";
-    while(<$u_fh>) {
+    open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file";
+    while(<U>) {
         @A = split;
         @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file";
         ($u,$s) = @A;
         $utt2spk{$u} = $s;
     }
-    close $u_fh;
-    open($i_fh, '<', $inscp) || die "Opening input scp file $inscp";
+    open(I, "<$inscp") || die "Opening input scp file $inscp";
     @spkrs = ();
-    while(<$i_fh>) {
+    while(<I>) {
         @A = split;
         if(@A == 0) { die "Empty or space-only line in scp file $inscp"; }
         $u = $A[0];
@@ -114,7 +113,6 @@
         $spk_count{$s}++;
         push @{$spk_data{$s}}, $_;
     }
-    close $i_fh;
     # Now split as equally as possible ..
     # First allocate spks to files by allocating an approximately
     # equal number of speakers.
@@ -185,32 +183,31 @@
     # Now print out the files...
     for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
         $scpfn = $OUTPUTS[$scpidx];
-        open($f_fh, '>', $scpfn) || die "Could not open scp file $scpfn for writing.";
+        open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing.";
         $count = 0;
         if(@{$scparray[$scpidx]} == 0) {
             print STDERR "Error: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n";
             $error = 1;
         } else {
             foreach $spk ( @{$scparray[$scpidx]} ) {
-                print $f_fh @{$spk_data{$spk}};
+                print F @{$spk_data{$spk}};
                 $count += $spk_count{$spk};
             }
             if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; }
         }
-        close($f_fh);
+        close(F);
     }
 } else {
    # This block is the "normal" case where there is no --utt2spk
    # option and we just break into equal size chunks.
 
-    open($i_fh, '<', $inscp) || die "Opening input scp file $inscp";
+    open(I, "<$inscp") || die "Opening input scp file $inscp";
 
     $numscps = @OUTPUTS;  # size of array.
     @F = ();
-    while(<$i_fh>) {
+    while(<I>) {
         push @F, $_;
     }
-    close $i_fh;
     $numlines = @F;
     if($numlines == 0) {
         print STDERR "split_scp.pl: error: empty input scp file $inscp , ";
@@ -224,11 +221,11 @@
     $n = 0;
     for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
         $scpfile = $OUTPUTS[$scpidx];
-        open($o_fh, '>', $scpfile) || die "Opening output scp file $scpfile";
+        open(O, ">$scpfile") || die "Opening output scp file $scpfile";
         for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) {
-            print $o_fh $F[$n++];
+            print O $F[$n++];
         }
-        close($o_fh) || die "Closing scp file $scpfile";
+        close(O) || die "Closing scp file $scpfile";
     }
     $n == $numlines || die "split_scp.pl: code error., $n != $numlines";
 }

From b78d92e6da5f10a16d4d8e766021ccd5d8313656 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=BB=84=E5=9F=B9=E6=9D=BE?= <bringtree@qq.com>
Date: Fri, 10 May 2019 05:43:57 +0800
Subject: [PATCH 104/163] [src] Fix bug in GeneralMatrix::Uncompress() (#3304)

---
 src/matrix/sparse-matrix.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/matrix/sparse-matrix.cc b/src/matrix/sparse-matrix.cc
index 896d70ae799..0e854d1ff9a 100644
--- a/src/matrix/sparse-matrix.cc
+++ b/src/matrix/sparse-matrix.cc
@@ -810,6 +810,7 @@ void GeneralMatrix::Compress() {
 
 void GeneralMatrix::Uncompress() {
   if (cmat_.NumRows() != 0) {
+    mat_.Resize(cmat_.NumRows(), cmat_.NumCols(), kUndefined);
     cmat_.CopyToMat(&mat_);
     cmat_.Clear();
   }

From fee2acd27f661888d3db4d6a7730f6eb94e875fc Mon Sep 17 00:00:00 2001
From: ChunhuiW <tiantang0809@163.com>
Date: Fri, 10 May 2019 11:06:45 +0800
Subject: [PATCH 105/163] [doc] add an omission in Doxyfile (#3309)

---
 src/Doxyfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Doxyfile b/src/Doxyfile
index a6c0b434ff2..0032214f803 100644
--- a/src/Doxyfile
+++ b/src/Doxyfile
@@ -457,7 +457,7 @@ INPUT    = doc itf \
           fstext hmm lm decoder lat cudamatrix nnet \
           bin fstbin gmmbin fgmmbin featbin \
           nnetbin latbin sgmm2 sgmm2bin nnet2 nnet2bin nnet3 nnet3bin \
-          kwsbin ivector ivectorbin
+          kws kwsbin ivector ivectorbin online onlinebin online2 online2bin
 
 # If the value of the INPUT tag contains directories, you can use the
 # FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp

From de81d0c7f62d55c5cd2d1a3d7e333775280a55f8 Mon Sep 17 00:00:00 2001
From: "kkm (aka Kirill Katsnelson)" <kkm@smartaction.com>
Date: Fri, 10 May 2019 15:53:34 -0700
Subject: [PATCH 106/163] [scripts] Fix utils/split_scp.pl breakage (#3308)

---
 egs/wsj/s5/utils/split_scp.pl | 77 ++++++++++++++++++++---------------
 1 file changed, 45 insertions(+), 32 deletions(-)

diff --git a/egs/wsj/s5/utils/split_scp.pl b/egs/wsj/s5/utils/split_scp.pl
index f60219938a0..dc798282f79 100755
--- a/egs/wsj/s5/utils/split_scp.pl
+++ b/egs/wsj/s5/utils/split_scp.pl
@@ -1,7 +1,9 @@
 #!/usr/bin/env perl
-use warnings; #sed replacement for -w perl parameter
+
 # Copyright 2010-2011 Microsoft Corporation
 
+# See ../../COPYING for clarification regarding multiple authors
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -16,7 +18,6 @@
 # limitations under the License.
 
 
-
 # This program splits up any kind of .scp or archive-type file.
 # If there is no utt2spk option it will work on any text  file and
 # will split it up with an approximately equal number of lines in
@@ -41,6 +42,8 @@
 # [note: with this option, it assumes zero-based indexing of the split parts,
 # i.e. the second number must be 0 <= n < num-jobs.]
 
+use warnings;
+
 $num_jobs = 0;
 $job_id = 0;
 $utt2spk_file = "";
@@ -51,9 +54,6 @@
         shift @ARGV;
         $num_jobs = shift @ARGV;
         $job_id = shift @ARGV;
-        if ($num_jobs <= 0 || $job_id < 0 || $job_id >= $num_jobs) {
-            die "Invalid num-jobs and job-id: $num_jobs and $job_id";
-        }
     }
     if ($ARGV[0] =~ /--utt2spk=(.+)/) {
         $utt2spk_file=$1;
@@ -65,13 +65,20 @@
     }
 }
 
+if ($num_jobs != 0 && ($num_jobs < 0 || $job_id - $one_based < 0 ||
+                       $job_id - $one_based >= $num_jobs)) {
+  die "$0: Invalid job number/index values for '-j $num_jobs $job_id" .
+      ($one_based ? " --one-based" : "") . "'\n"
+}
+
 $one_based
     and $job_id--;
 
 if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
-    die "Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ... \n" .
-        " or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]\n" .
-        " ... where 0 <= job-id < num-jobs.";
+    die
+"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...
+   or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]
+ ... where 0 <= job-id < num-jobs, or 1 <= job-id <= num-jobs if --one-based.\n";
 }
 
 $error = 0;
@@ -90,21 +97,22 @@
 }
 
 if ($utt2spk_file ne "") {  # We have the --utt2spk option...
-    open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file";
-    while(<U>) {
+    open($u_fh, '<', $utt2spk_file) || die "$0: Error opening utt2spk file $utt2spk_file: $!\n";
+    while(<$u_fh>) {
         @A = split;
-        @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file";
+        @A == 2 || die "$0: Bad line $_ in utt2spk file $utt2spk_file\n";
         ($u,$s) = @A;
         $utt2spk{$u} = $s;
     }
-    open(I, "<$inscp") || die "Opening input scp file $inscp";
+    close $u_fh;
+    open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
     @spkrs = ();
-    while(<I>) {
+    while(<$i_fh>) {
         @A = split;
-        if(@A == 0) { die "Empty or space-only line in scp file $inscp"; }
+        if(@A == 0) { die "$0: Empty or space-only line in scp file $inscp\n"; }
         $u = $A[0];
         $s = $utt2spk{$u};
-        if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; }
+        defined $s || die "$0: No utterance $u in utt2spk file $utt2spk_file\n";
         if(!defined $spk_count{$s}) {
             push @spkrs, $s;
             $spk_count{$s} = 0;
@@ -119,8 +127,8 @@
     $numspks = @spkrs;  # number of speakers.
     $numscps = @OUTPUTS; # number of output files.
     if ($numspks < $numscps) {
-      die "Refusing to split data because number of speakers $numspks is less " .
-          "than the number of output .scp files $numscps";
+      die "$0: Refusing to split data because number of speakers $numspks " .
+          "is less than the number of output .scp files $numscps\n";
     }
     for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
         $scparray[$scpidx] = []; # [] is array reference.
@@ -182,52 +190,57 @@
     }
     # Now print out the files...
     for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
-        $scpfn = $OUTPUTS[$scpidx];
-        open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing.";
+        $scpfile = $OUTPUTS[$scpidx];
+        ($scpfile ne '-' ? open($f_fh, '>', $scpfile)
+                         : open($f_fh, '>&', \*STDOUT)) ||
+            die "$0: Could not open scp file $scpfile for writing: $!\n";
         $count = 0;
         if(@{$scparray[$scpidx]} == 0) {
-            print STDERR "Error: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n";
+            print STDERR "$0: Error: split_scp.pl producing empty .scp file " .
+                         "$scpfile (too many splits and too few speakers?)\n";
             $error = 1;
         } else {
             foreach $spk ( @{$scparray[$scpidx]} ) {
-                print F @{$spk_data{$spk}};
+                print $f_fh @{$spk_data{$spk}};
                 $count += $spk_count{$spk};
             }
-            if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; }
+            $count == $scpcount[$scpidx] || die "Count mismatch [code error]";
         }
-        close(F);
+        close($f_fh);
     }
 } else {
    # This block is the "normal" case where there is no --utt2spk
    # option and we just break into equal size chunks.
 
-    open(I, "<$inscp") || die "Opening input scp file $inscp";
+    open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
 
     $numscps = @OUTPUTS;  # size of array.
     @F = ();
-    while(<I>) {
+    while(<$i_fh>) {
         push @F, $_;
     }
     $numlines = @F;
     if($numlines == 0) {
-        print STDERR "split_scp.pl: error: empty input scp file $inscp , ";
+        print STDERR "$0: error: empty input scp file $inscp\n";
         $error = 1;
     }
     $linesperscp = int( $numlines / $numscps); # the "whole part"..
-    $linesperscp >= 1 || die "You are splitting into too many pieces! [reduce \$nj]";
+    $linesperscp >= 1 || die "$0: You are splitting into too many pieces! [reduce \$nj]\n";
     $remainder = $numlines - ($linesperscp * $numscps);
     ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder";
     # [just doing int() rounds down].
     $n = 0;
     for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
         $scpfile = $OUTPUTS[$scpidx];
-        open(O, ">$scpfile") || die "Opening output scp file $scpfile";
+        ($scpfile ne '-' ? open($o_fh, '>', $scpfile)
+                         : open($o_fh, '>&', \*STDOUT)) ||
+            die "$0: Could not open scp file $scpfile for writing: $!\n";
         for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) {
-            print O $F[$n++];
+            print $o_fh $F[$n++];
         }
-        close(O) || die "Closing scp file $scpfile";
+        close($o_fh) || die "$0: Error closing scp file $scpfile: $!\n";
     }
-    $n == $numlines || die "split_scp.pl: code error., $n != $numlines";
+    $n == $numlines || die "$n != $numlines [code error]";
 }
 
-exit ($error ? 1 : 0);
+exit ($error);

From 3453b5ac94a06147c7bb8b05f0e1011f0baeba70 Mon Sep 17 00:00:00 2001
From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com>
Date: Sat, 11 May 2019 23:50:19 +0530
Subject: [PATCH 107/163] [egs] Bug-fix to shebang in fisher_callhome_spanish
 (#3312)

---
 egs/fisher_callhome_spanish/s5/local/merge_lexicons.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py
index b42eb52d20a..c7aa6affb11 100755
--- a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py
+++ b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py
@@ -1,8 +1,8 @@
-# Copyright 2014  Gaurav Kumar.   Apache 2.0
-#    2018  Saikiran Valluri, GoVivace inc., Avaaya
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #
+# Copyright 2014  Gaurav Kumar.   Apache 2.0
+#    2018  Nagendra Kumar Goel, Saikiran Valluri, GoVivace inc., Avaaya
 # Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon
 from __future__ import print_function
 import sys, re

From 5ca7f589f173c0a29a441eacc53597923d65ed7b Mon Sep 17 00:00:00 2001
From: "kkm (aka Kirill Katsnelson)" <kkm@smartaction.com>
Date: Sat, 11 May 2019 12:41:58 -0700
Subject: [PATCH 108/163] [scripts] Fix error messages in run.pl (#3314)

---
 egs/wsj/s5/utils/parallel/run.pl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/egs/wsj/s5/utils/parallel/run.pl b/egs/wsj/s5/utils/parallel/run.pl
index f23bb8dc0b0..d648abd2382 100755
--- a/egs/wsj/s5/utils/parallel/run.pl
+++ b/egs/wsj/s5/utils/parallel/run.pl
@@ -72,13 +72,13 @@
     $jobname = $1;
     $jobstart = $2;
     $jobend = $3;
-    shift;
     if ($jobstart > $jobend) {
       die "run.pl: invalid job range $ARGV[0]";
     }
     if ($jobstart <= 0) {
       die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility).";
     }
+    shift;
   } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1.
     $jobname = $1;
     $jobstart = $2;
@@ -181,7 +181,7 @@
         delete $active_pids{$r};
         # print STDERR "Finished: $r/$jid " .  Dumper(\%active_pids) . "\n";
     } else {
-        die "run.pl: Cannot find the PID of the chold process that just finished.";
+        die "run.pl: Cannot find the PID of the child process that just finished.";
     }
 
     # In theory we could do a non-blocking waitpid over all jobs running just
@@ -243,7 +243,7 @@
 # Some sanity checks:
 # The $fail array should not contain undefined codes
 # The number of non-zeros in that array  should be equal to $numfail
-# We cannot do foreach() here, as the JOB ids do not necessarily start by zero
+# We cannot do foreach() here, as the JOB ids do not start at zero
 $failed_jids=0;
 for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
   $job_return = $fail[$jobid];

From e2dc9c33ff070c03e5adbf958622c88a56fd0b54 Mon Sep 17 00:00:00 2001
From: Vimal Manohar <vimal.manohar91@gmail.com>
Date: Sat, 11 May 2019 20:37:02 -0400
Subject: [PATCH 109/163] [egs] New chime-5 recipe (#2893)

---
 egs/chime5/s5/cmd.sh                          |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1a.sh      |  15 -
 .../s5/local/nnet3/run_ivector_common.sh      |   2 +-
 egs/chime5/s5/local/run_wpe.py                |  54 +++
 egs/chime5/s5/local/run_wpe.sh                |  85 ++++
 egs/chime5/s5b/RESULTS                        |  33 ++
 egs/chime5/s5b/cmd.sh                         |  15 +
 egs/chime5/s5b/conf/beamformit.cfg            |  50 +++
 egs/chime5/s5b/conf/mfcc.conf                 |   2 +
 egs/chime5/s5b/conf/mfcc_hires.conf           |  10 +
 egs/chime5/s5b/conf/online_cmvn.conf          |   1 +
 egs/chime5/s5b/local/chain/run_tdnn.sh        |   1 +
 .../chain/tuning/run_cnn_tdnn_lstm_1a.sh      | 304 ++++++++++++++
 .../s5b/local/chain/tuning/run_tdnn_1a.sh     | 270 +++++++++++++
 .../s5b/local/chain/tuning/run_tdnn_1b.sh     | 249 ++++++++++++
 .../local/chain/tuning/run_tdnn_lstm_1a.sh    | 297 ++++++++++++++
 egs/chime5/s5b/local/check_tools.sh           |  71 ++++
 egs/chime5/s5b/local/copy_lat_dir_parallel.sh |  97 +++++
 egs/chime5/s5b/local/distant_audio_list       | 376 ++++++++++++++++++
 egs/chime5/s5b/local/extract_noises.py        |  83 ++++
 egs/chime5/s5b/local/extract_vad_weights.sh   |  86 ++++
 egs/chime5/s5b/local/json2text.py             |  84 ++++
 egs/chime5/s5b/local/make_noise_list.py       |  17 +
 .../chain => s5b/local/nnet3}/compare_wer.sh  |  15 +-
 egs/chime5/s5b/local/nnet3/decode.sh          | 162 ++++++++
 .../s5b/local/nnet3/run_ivector_common.sh     | 151 +++++++
 egs/chime5/s5b/local/prepare_data.sh          | 136 +++++++
 egs/chime5/s5b/local/prepare_dict.sh          | 124 ++++++
 egs/chime5/s5b/local/reverberate_lat_dir.sh   |  93 +++++
 egs/chime5/s5b/local/run_beamformit.sh        |  87 ++++
 egs/chime5/s5b/local/run_recog.sh             | 164 ++++++++
 egs/chime5/s5b/local/run_wpe.py               |  59 +++
 egs/chime5/s5b/local/run_wpe.sh               |  85 ++++
 egs/chime5/s5b/local/score.sh                 |   1 +
 egs/chime5/s5b/local/score_for_submit.sh      | 119 ++++++
 egs/chime5/s5b/local/train_lms_srilm.sh       | 261 ++++++++++++
 egs/chime5/s5b/local/wer_output_filter        |  25 ++
 egs/chime5/s5b/local/worn_audio_list          |  64 +++
 egs/chime5/s5b/path.sh                        |   7 +
 egs/chime5/s5b/run.sh                         | 297 ++++++++++++++
 egs/chime5/s5b/steps                          |   1 +
 egs/chime5/s5b/utils                          |   1 +
 egs/wsj/s5/steps/conf/get_ctm_conf.sh         |  17 +-
 .../s5/steps/online/nnet2/extract_ivectors.sh |  42 +-
 egs/wsj/s5/utils/perturb_data_dir_speed.sh    |   2 +-
 egs/wsj/s5/utils/subset_data_dir.sh           |   4 +
 46 files changed, 4072 insertions(+), 49 deletions(-)
 create mode 100644 egs/chime5/s5/local/run_wpe.py
 create mode 100755 egs/chime5/s5/local/run_wpe.sh
 create mode 100644 egs/chime5/s5b/RESULTS
 create mode 100644 egs/chime5/s5b/cmd.sh
 create mode 100755 egs/chime5/s5b/conf/beamformit.cfg
 create mode 100644 egs/chime5/s5b/conf/mfcc.conf
 create mode 100644 egs/chime5/s5b/conf/mfcc_hires.conf
 create mode 100644 egs/chime5/s5b/conf/online_cmvn.conf
 create mode 120000 egs/chime5/s5b/local/chain/run_tdnn.sh
 create mode 100755 egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh
 create mode 100755 egs/chime5/s5b/local/chain/tuning/run_tdnn_1a.sh
 create mode 100755 egs/chime5/s5b/local/chain/tuning/run_tdnn_1b.sh
 create mode 100755 egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh
 create mode 100755 egs/chime5/s5b/local/check_tools.sh
 create mode 100755 egs/chime5/s5b/local/copy_lat_dir_parallel.sh
 create mode 100644 egs/chime5/s5b/local/distant_audio_list
 create mode 100755 egs/chime5/s5b/local/extract_noises.py
 create mode 100755 egs/chime5/s5b/local/extract_vad_weights.sh
 create mode 100755 egs/chime5/s5b/local/json2text.py
 create mode 100755 egs/chime5/s5b/local/make_noise_list.py
 rename egs/chime5/{s5/local/chain => s5b/local/nnet3}/compare_wer.sh (84%)
 create mode 100755 egs/chime5/s5b/local/nnet3/decode.sh
 create mode 100755 egs/chime5/s5b/local/nnet3/run_ivector_common.sh
 create mode 100755 egs/chime5/s5b/local/prepare_data.sh
 create mode 100755 egs/chime5/s5b/local/prepare_dict.sh
 create mode 100755 egs/chime5/s5b/local/reverberate_lat_dir.sh
 create mode 100755 egs/chime5/s5b/local/run_beamformit.sh
 create mode 100755 egs/chime5/s5b/local/run_recog.sh
 create mode 100755 egs/chime5/s5b/local/run_wpe.py
 create mode 100755 egs/chime5/s5b/local/run_wpe.sh
 create mode 120000 egs/chime5/s5b/local/score.sh
 create mode 100755 egs/chime5/s5b/local/score_for_submit.sh
 create mode 100755 egs/chime5/s5b/local/train_lms_srilm.sh
 create mode 100755 egs/chime5/s5b/local/wer_output_filter
 create mode 100644 egs/chime5/s5b/local/worn_audio_list
 create mode 100644 egs/chime5/s5b/path.sh
 create mode 100755 egs/chime5/s5b/run.sh
 create mode 120000 egs/chime5/s5b/steps
 create mode 120000 egs/chime5/s5b/utils

diff --git a/egs/chime5/s5/cmd.sh b/egs/chime5/s5/cmd.sh
index a697a22cda3..9702501f1a7 100644
--- a/egs/chime5/s5/cmd.sh
+++ b/egs/chime5/s5/cmd.sh
@@ -10,6 +10,6 @@
 # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
 # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
 
-export train_cmd="queue.pl --mem 2G"
+export train_cmd="retry.pl queue.pl --mem 2G"
 export decode_cmd="queue.pl --mem 4G"
 
diff --git a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh
index 5418ecf2b4f..d60e6a4aa04 100755
--- a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -24,21 +24,16 @@ decode_iter=
 # training options
 # training chunk-options
 chunk_width=140,100,160
-# we don't need extra left/right context for TDNN systems.
-chunk_left_context=0
-chunk_right_context=0
 common_egs_dir=
 xent_regularize=0.1
 
 # training options
 srand=0
 remove_egs=true
-reporting_email=
 
 #decode options
 test_online_decoding=false  # if true, it will run the last decoding stage.
 
-
 # End configuration section.
 echo "$0 $@"  # Print the command line for logging
 
@@ -176,7 +171,6 @@ EOF
   steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
 fi
 
-
 if [ $stage -le 14 ]; then
   if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
     utils/create_split_dir.pl \
@@ -204,15 +198,10 @@ if [ $stage -le 14 ]; then
     --trainer.num-chunk-per-minibatch=256,128,64 \
     --trainer.optimization.momentum=0.0 \
     --egs.chunk-width=$chunk_width \
-    --egs.chunk-left-context=$chunk_left_context \
-    --egs.chunk-right-context=$chunk_right_context \
-    --egs.chunk-left-context-initial=0 \
-    --egs.chunk-right-context-final=0 \
     --egs.dir="$common_egs_dir" \
     --egs.opts="--frames-overlap-per-eg 0" \
     --cleanup.remove-egs=$remove_egs \
     --use-gpu=true \
-    --reporting.email="$reporting_email" \
     --feat-dir=$train_data_dir \
     --tree-dir=$tree_dir \
     --lat-dir=$lat_dir \
@@ -235,10 +224,6 @@ if [ $stage -le 16 ]; then
     (
       steps/nnet3/decode.sh \
           --acwt 1.0 --post-decode-acwt 10.0 \
-          --extra-left-context $chunk_left_context \
-          --extra-right-context $chunk_right_context \
-          --extra-left-context-initial 0 \
-          --extra-right-context-final 0 \
           --frames-per-chunk $frames_per_chunk \
           --nj 8 --cmd "$decode_cmd"  --num-threads 4 \
           --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \
diff --git a/egs/chime5/s5/local/nnet3/run_ivector_common.sh b/egs/chime5/s5/local/nnet3/run_ivector_common.sh
index e28e5ce996d..2b672063be7 100755
--- a/egs/chime5/s5/local/nnet3/run_ivector_common.sh
+++ b/egs/chime5/s5/local/nnet3/run_ivector_common.sh
@@ -23,7 +23,7 @@ nnet3_affix=_train_worn_u100k
 gmm_dir=exp/${gmm}
 ali_dir=exp/${gmm}_ali_${train_set}_sp
 
-for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do
+for f in data/${train_set}/utt2spk ${gmm_dir}/final.mdl; do
   if [ ! -f $f ]; then
     echo "$0: expected file $f to exist"
     exit 1
diff --git a/egs/chime5/s5/local/run_wpe.py b/egs/chime5/s5/local/run_wpe.py
new file mode 100644
index 00000000000..cc9cd41927a
--- /dev/null
+++ b/egs/chime5/s5/local/run_wpe.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian)
+# Apache 2.0
+# Works with both python2 and python3
+
+import numpy as np
+import soundfile as sf
+import time
+import os, errno
+from tqdm import tqdm
+import argparse
+
+from nara_wpe.wpe import wpe
+from nara_wpe.utils import stft, istft 
+from nara_wpe import project_root
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--files', '-f', nargs='+')
+args = parser.parse_args()
+
+input_files = args.files[:len(args.files)//2]
+output_files = args.files[len(args.files)//2:]
+out_dir = os.path.dirname(output_files[0])
+try: 
+    os.makedirs(out_dir)
+except OSError as e:
+    if e.errno != errno.EEXIST:
+        raise
+
+stft_options = dict(
+    size=512,
+    shift=128,
+    window_length=None,
+    fading=True,
+    pad=True,
+    symmetric_window=False
+)
+
+sampling_rate = 16000
+delay = 3
+iterations = 5
+taps = 10
+
+signal_list = [
+    sf.read(f)[0]
+    for f in input_files
+]
+y = np.stack(signal_list, axis=0)
+Y = stft(y, **stft_options).transpose(2, 0, 1)
+Z = wpe(Y, iterations=iterations, statistics_mode='full').transpose(1, 2, 0)
+z = istft(Z, size=stft_options['size'], shift=stft_options['shift'])
+
+for d in range(len(signal_list)):
+    sf.write(output_files[d], z[d,:], sampling_rate)
diff --git a/egs/chime5/s5/local/run_wpe.sh b/egs/chime5/s5/local/run_wpe.sh
new file mode 100755
index 00000000000..8ecbbd6182a
--- /dev/null
+++ b/egs/chime5/s5/local/run_wpe.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian)
+# Apache 2.0
+
+. ./cmd.sh
+. ./path.sh
+
+# Config:
+nj=4
+cmd=run.pl
+
+. utils/parse_options.sh || exit 1;
+
+if [ $# != 3 ]; then
+   echo "Wrong #arguments ($#, expected 3)"
+   echo "Usage: local/run_wpe.sh [options] <wav-in-dir> <wav-out-dir> <array-id>"
+   echo "main options (for others, see top of script file)"
+   echo "  --cmd <cmd>                              # Command to run in parallel with"
+   echo "  --nj 50                        # number of jobs for parallel processing"
+   exit 1;
+fi
+
+sdir=$1
+odir=$2
+array=$3
+task=`basename $sdir`
+expdir=exp/wpe/${task}_${array}
+# Set bash to 'strict' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'.
+set -e
+set -u
+set -o pipefail
+
+miniconda_dir=$HOME/miniconda3/
+if [ ! -d $miniconda_dir ]; then
+    echo "$miniconda_dir does not exist. Please run '../../../tools/extras/install_miniconda.sh' and '../../../tools/extras/install_wpe.sh';"; exit 1  # stop here instead of failing on the python call below
+fi
+
+# check if WPE is installed
+result=`$HOME/miniconda3/bin/python -c "\
+try:
+    import nara_wpe
+    print('1')
+except ImportError:
+    print('0')"`
+
+if [ "$result" == "1" ]; then
+    echo "WPE is installed"
+else
+    echo "WPE is not installed. Please run ../../../tools/extras/install_wpe.sh"
+    exit 1
+fi
+
+mkdir -p $odir
+mkdir -p $expdir/log
+
+# wavfiles.list can be used as the name of the output files
+output_wavfiles=$expdir/wavfiles.list
+find -L ${sdir} | grep -i ${array} > $expdir/channels_input
+cat $expdir/channels_input | awk -F '/' '{print $NF}' | sed "s@S@$odir\/S@g" > $expdir/channels_output
+paste -d" " $expdir/channels_input $expdir/channels_output > $output_wavfiles
+
+# split the list for parallel processing
+split_wavfiles=""
+for n in `seq $nj`; do
+  split_wavfiles="$split_wavfiles $output_wavfiles.$n"
+done
+utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1;
+
+echo -e "Dereverberation - $task - $array\n"
+# making a shell script for each job
+for n in `seq $nj`; do
+cat <<-EOF > $expdir/log/wpe.$n.sh
+while read line; do
+  $HOME/miniconda3/bin/python local/run_wpe.py \
+    --files \$line
+done < $output_wavfiles.$n
+EOF
+done
+
+chmod a+x $expdir/log/wpe.*.sh
+$cmd JOB=1:$nj $expdir/log/wpe.JOB.log \
+  $expdir/log/wpe.JOB.sh
+
+echo "`basename $0` Done."
diff --git a/egs/chime5/s5b/RESULTS b/egs/chime5/s5b/RESULTS
new file mode 100644
index 00000000000..0dcea1f0031
--- /dev/null
+++ b/egs/chime5/s5b/RESULTS
@@ -0,0 +1,33 @@
+
+# tri2
+%WER 76.40 [ 44985 / 58881, 3496 ins, 17652 del, 23837 sub ] exp/tri2/decode_dev_worn/wer_13_1.0
+%WER 93.56 [ 55091 / 58881, 2132 ins, 35555 del, 17404 sub ] exp/tri2/decode_dev_beamformit_ref/wer_17_1.0
+
+# tri3
+%WER 72.81 [ 42869 / 58881, 3629 ins, 15998 del, 23242 sub ] exp/tri3/decode_dev_worn/wer_15_1.0
+%WER 91.73 [ 54013 / 58881, 3519 ins, 27098 del, 23396 sub ] exp/tri3/decode_dev_beamformit_ref/wer_17_1.0
+
+# nnet3 tdnn+chain
+%WER 47.91 [ 28212 / 58881, 2843 ins, 8957 del, 16412 sub ] exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_worn/wer_9_0.0
+%WER 81.28 [ 47859 / 58881, 4210 ins, 27511 del, 16138 sub ] exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_beamformit_ref/wer_9_0.5
+
+# result with the challenge submission format (July 9, 2018)
+# before the fix of speaker ID across arrays
+session S02 room DINING: #words 8288, #errors 6593, wer 79.54 %
+session S02 room KITCHEN: #words 12696, #errors 11096, wer 87.39 %
+session S02 room LIVING: #words 15460, #errors 12219, wer 79.03 %
+session S09 room DINING: #words 5766, #errors 4651, wer 80.66 %
+session S09 room KITCHEN: #words 8911, #errors 7277, wer 81.66 %
+session S09 room LIVING: #words 7760, #errors 6023, wer 77.61 %
+overall: #words 58881, #errors 47859, wer 81.28 %
+
+# result with the challenge submission format (July 9, 2018)
+# after the fix of speaker ID across arrays
+==== development set ====
+session S02 room DINING: #words 8288, #errors 6556, wer 79.10 %
+session S02 room KITCHEN: #words 12696, #errors 11096, wer 87.39 %
+session S02 room LIVING: #words 15460, #errors 12182, wer 78.79 %
+session S09 room DINING: #words 5766, #errors 4648, wer 80.61 %
+session S09 room KITCHEN: #words 8911, #errors 7277, wer 81.66 %
+session S09 room LIVING: #words 7760, #errors 6022, wer 77.60 %
+overall: #words 58881, #errors 47781, wer 81.14 %
diff --git a/egs/chime5/s5b/cmd.sh b/egs/chime5/s5b/cmd.sh
new file mode 100644
index 00000000000..9702501f1a7
--- /dev/null
+++ b/egs/chime5/s5b/cmd.sh
@@ -0,0 +1,15 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd="retry.pl queue.pl --mem 2G"
+export decode_cmd="queue.pl --mem 4G"
+
diff --git a/egs/chime5/s5b/conf/beamformit.cfg b/egs/chime5/s5b/conf/beamformit.cfg
new file mode 100755
index 00000000000..70fdd858651
--- /dev/null
+++ b/egs/chime5/s5b/conf/beamformit.cfg
@@ -0,0 +1,50 @@
+#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/)
+
+# scrolling size to compute the delays
+scroll_size = 250
+
+# cross correlation computation window size
+window_size = 500
+
+#amount of maximum points for the xcorrelation taken into account
+nbest_amount = 4
+
+#flag whether to apply an automatic noise thresholding
+do_noise_threshold = 1
+
+#Percentage of frames with lower xcorr taken as noisy
+noise_percent = 10
+
+######## acoustic modelling parameters
+
+#transition probabilities weight for multichannel decoding
+trans_weight_multi = 25
+trans_weight_nbest = 25
+
+###
+
+#flag whether to print the features after setting them, or not
+print_features = 1
+
+#flag whether to use the bad frames in the sum process
+do_avoid_bad_frames = 1
+
+#flag to use the best channel (SNR) as a reference
+#defined from command line
+do_compute_reference = 1
+
+#flag whether to use a uem file or not (process the whole file)
+do_use_uem_file = 0
+
+#flag whether to use an adaptive weights scheme or fixed weights
+do_adapt_weights = 1
+
+#flag whether to output the sph files or just run the system to create the auxiliary files
+do_write_sph_files = 1
+
+####directories where to store/retrieve info####
+#channels_file = ./cfg-files/channels
+
+#show needs to be passed as argument normally, here a default one is given just in case
+#show_id = Ttmp
+
diff --git a/egs/chime5/s5b/conf/mfcc.conf b/egs/chime5/s5b/conf/mfcc.conf
new file mode 100644
index 00000000000..32988403b00
--- /dev/null
+++ b/egs/chime5/s5b/conf/mfcc.conf
@@ -0,0 +1,2 @@
+--use-energy=false
+--sample-frequency=16000
diff --git a/egs/chime5/s5b/conf/mfcc_hires.conf b/egs/chime5/s5b/conf/mfcc_hires.conf
new file mode 100644
index 00000000000..fd64b62eb16
--- /dev/null
+++ b/egs/chime5/s5b/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training.
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why
+# we prefer this method.
+--use-energy=false   # use average of log energy, not energy.
+--sample-frequency=16000 
+--num-mel-bins=40
+--num-ceps=40
+--low-freq=40
+--high-freq=-400
diff --git a/egs/chime5/s5b/conf/online_cmvn.conf b/egs/chime5/s5b/conf/online_cmvn.conf
new file mode 100644
index 00000000000..7748a4a4dd3
--- /dev/null
+++ b/egs/chime5/s5b/conf/online_cmvn.conf
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
diff --git a/egs/chime5/s5b/local/chain/run_tdnn.sh b/egs/chime5/s5b/local/chain/run_tdnn.sh
new file mode 120000
index 00000000000..34499362831
--- /dev/null
+++ b/egs/chime5/s5b/local/chain/run_tdnn.sh
@@ -0,0 +1 @@
+tuning/run_tdnn_1a.sh
\ No newline at end of file
diff --git a/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh b/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh
new file mode 100755
index 00000000000..95e9d934bd3
--- /dev/null
+++ b/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh
@@ -0,0 +1,304 @@
+#!/bin/bash
+
+# Set -e here so that we catch if any executable fails immediately
+set -euo pipefail
+
+# First the options that are passed through to run_ivector_common.sh
+# (some of which are also used in this script directly).
+stage=0
+nj=96
+train_set=train_worn_u400k_cleaned
+test_sets="dev_beamformit_ref"
+gmm=tri3_cleaned
+nnet3_affix=_train_worn_u400k_cleaned
+lm_suffix=
+
+# The rest are configs specific to this script.  Most of the parameters
+# are just hardcoded at this level, in the commands below.
+affix=_1a   # affix for the TDNN directory name
+tree_affix=
+train_stage=-10
+get_egs_stage=-10
+decode_iter=
+
+common_egs_dir=
+
+hidden_dim=1024
+cell_dim=1024
+projection_dim=256
+
+# training options
+num_epochs=2  # 2 works better than 4
+chunk_width=140,100,160
+chunk_left_context=40
+chunk_right_context=0
+dropout_schedule='0,0@0.20,0.3@0.50,0'
+xent_regularize=0.025
+label_delay=5
+
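+# decode options (NOTE(review): these two are not referenced below; stage 16 passes $chunk_left_context / $chunk_right_context instead)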
+extra_left_context=50
+extra_right_context=0
+
+# training options
+srand=0
+remove_egs=true
+
+#decode options
+test_online_decoding=false  # if true, it will run the last decoding stage.
+
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 11" if you have already
+# run those things.
+local/nnet3/run_ivector_common.sh --stage $stage \
+                                  --train-set $train_set \
+				  --test-sets "$test_sets" \
+                                  --gmm $gmm \
+                                  --nnet3-affix "$nnet3_affix" || exit 1;
+
+# Problem: We have removed the "train_" prefix of our training set in
+# the alignment directory names! Bad!
+gmm_dir=exp/$gmm
+ali_dir=exp/${gmm}_ali_${train_set}_sp
+tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix}
+lang=data/lang_chain
+lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats
+dir=exp/chain${nnet3_affix}/cnn_tdnn_lstm${affix}_sp
+train_data_dir=data/${train_set}_sp_hires
+lores_train_data_dir=data/${train_set}_sp
+train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
+
+for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
+    $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+if [ $stage -le 10 ]; then
+  echo "$0: creating lang directory $lang with chain-type topology"
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  if [ -d $lang ]; then
+    if [ $lang/L.fst -nt data/lang/L.fst ]; then
+      echo "$0: $lang already exists, not overwriting it; continuing"
+    else
+      echo "$0: $lang already exists and seems to be older than data/lang..."
+      echo " ... not sure what to do.  Exiting."
+      exit 1;
+    fi
+  else
+    cp -r data/lang $lang
+    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+    # Use our special topology... note that later on may have to tune this
+    # topology.
+    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+  fi
+fi
+
+if [ $stage -le 11 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \
+    data/lang $gmm_dir $lat_dir
+  rm $lat_dir/fsts.*.gz # save space
+fi
+
+if [ $stage -le 12 ]; then
+  # Build a tree using our new topology.  We know we have alignments for the
+  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
+  # those.  The num-leaves is always somewhat less than the num-leaves from
+  # the GMM baseline.
+   if [ -f $tree_dir/final.mdl ]; then
+     echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+     exit 1;
+  fi
+  steps/nnet3/chain/build_tree.sh \
+    --frame-subsampling-factor 3 \
+    --context-opts "--context-width=2 --central-position=1" \
+    --cmd "$train_cmd" 3500 ${lores_train_data_dir} \
+    $lang $ali_dir $tree_dir
+fi
+
+if [ $stage -le 13 ]; then
+  mkdir -p $dir
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python)  # print() form runs under both python2 and python3
+
+  lstm_opts="decay-time=40"
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+  idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat
+      
+  conv-relu-batchnorm-layer name=cnn1 input=idct height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=256 learning-rate-factor=0.333 max-change=0.25
+  conv-relu-batchnorm-layer name=cnn2 input=cnn1 height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128
+
+  relu-batchnorm-layer name=affine1 input=lda dim=512
+ 
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-layer name=tdnn1 input=cnn2 dim=1024
+  relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,affine1) dim=1024
+  relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024
+
+  # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults
+  fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 $lstm_opts
+  relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024
+  relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024
+  relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024
+  fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 $lstm_opts
+  relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024
+  relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024
+  relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024
+  fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 $lstm_opts
+
+  ## adding the layers for chain branch
+  output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5
+
+  # adding the layers for xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+EOF
+
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 14 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+  fi
+
+  mkdir -p $dir/egs
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+  steps/nnet3/chain/train.py --stage=$train_stage \
+    --cmd="$train_cmd --mem 4G" \
+    --feat.online-ivector-dir=$train_ivector_dir \
+    --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient=0.1 \
+    --chain.l2-regularize=0.00005 \
+    --chain.apply-deriv-weights=false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --trainer.dropout-schedule $dropout_schedule \
+    --trainer.num-chunk-per-minibatch 64,32 \
+    --trainer.frames-per-iter 1500000 \
+    --trainer.max-param-change 2.0 \
+    --trainer.num-epochs $num_epochs \
+    --trainer.srand=$srand \
+    --trainer.optimization.shrink-value 0.99 \
+    --trainer.optimization.num-jobs-initial=3 \
+    --trainer.optimization.num-jobs-final=16 \
+    --trainer.optimization.initial-effective-lrate=0.001 \
+    --trainer.optimization.final-effective-lrate=0.0001 \
+    --trainer.optimization.momentum=0.0 \
+    --trainer.deriv-truncate-margin 8 \
+    --egs.stage $get_egs_stage \
+    --egs.opts="--frames-overlap-per-eg 0" \
+    --egs.chunk-width=$chunk_width \
+    --egs.chunk-left-context=$chunk_left_context \
+    --egs.chunk-right-context=$chunk_right_context \
+    --egs.chunk-left-context-initial=0 \
+    --egs.chunk-right-context-final=0 \
+    --egs.dir="$common_egs_dir" \
+    --cleanup.remove-egs=$remove_egs \
+    --feat-dir=$train_data_dir \
+    --tree-dir=$tree_dir \
+    --lat-dir=$lat_dir \
+    --dir=$dir  || exit 1;
+fi
+
+if [ $stage -le 15 ]; then
+  # Note: it's not important to give mkgraph.sh the lang directory with the
+  # matched topology (since it gets the topology file from the model).
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 data/lang${lm_suffix}/ \
+    $tree_dir $tree_dir/graph${lm_suffix} || exit 1;
+fi
+
+if [ $stage -le 16 ]; then
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  rm $dir/.error 2>/dev/null || true
+
+  for data in $test_sets; do
+    (
+      steps/nnet3/decode.sh \
+          --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context $chunk_left_context \
+          --extra-right-context $chunk_right_context \
+          --extra-left-context-initial 0 \
+          --extra-right-context-final 0 \
+          --frames-per-chunk $frames_per_chunk \
+          --nj 8 --cmd "$decode_cmd"  --num-threads 4 \
+          --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \
+          $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+# Not testing the 'looped' decoding separately.
+# NOTE(review): this script trains a CNN-TDNN-LSTM model, so the usual "same
+# results as normal decoding" argument for pure TDNN systems may not apply.
+
+if $test_online_decoding && [ $stage -le 17 ]; then
+  # note: if the features change (e.g. you add pitch features), you will have to
+  # change the options of the following command line.
+  steps/online/nnet3/prepare_online_decoding.sh \
+    --mfcc-config conf/mfcc_hires.conf \
+    $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online
+
+  rm $dir/.error 2>/dev/null || true
+
+  for data in $test_sets; do
+    (
+      nspk=$(wc -l <data/${data}_hires/spk2utt)
+      # note: we just give it "data/${data}" as it only uses the wav.scp, the
+      # feature type does not matter.
+      steps/online/nnet3/decode.sh \
+        --acwt 1.0 --post-decode-acwt 10.0 \
+        --nj 8 --cmd "$decode_cmd" \
+        $tree_dir/graph${lm_suffix} data/${data} ${dir}_online/decode${lm_suffix}_${data} || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+
+exit 0;
diff --git a/egs/chime5/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/chime5/s5b/local/chain/tuning/run_tdnn_1a.sh
new file mode 100755
index 00000000000..daad37e2cd7
--- /dev/null
+++ b/egs/chime5/s5b/local/chain/tuning/run_tdnn_1a.sh
@@ -0,0 +1,270 @@
+#!/bin/bash
+
+# Set -e here so that we catch if any executable fails immediately
+set -euo pipefail
+
+# First the options that are passed through to run_ivector_common.sh
+# (some of which are also used in this script directly).
+stage=0
+nj=96
+train_set=train_worn_u100k
+test_sets="dev_worn dev_beamformit_ref"
+gmm=tri3
+nnet3_affix=_train_worn_u100k
+lm_suffix=
+
+# The rest are configs specific to this script.  Most of the parameters
+# are just hardcoded at this level, in the commands below.
+affix=1a   # affix for the TDNN directory name
+tree_affix=
+train_stage=-10
+get_egs_stage=-10
+decode_iter=
+
+# training options
+# training chunk-options
+chunk_width=140,100,160
+common_egs_dir=
+xent_regularize=0.1
+
+# training options
+srand=0
+remove_egs=true
+
+#decode options
+test_online_decoding=false  # if true, it will run the last decoding stage.
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 11" if you have already
+# run those things.
+local/nnet3/run_ivector_common.sh --stage $stage \
+                                  --train-set $train_set \
+				  --test-sets "$test_sets" \
+                                  --gmm $gmm \
+                                  --nnet3-affix "$nnet3_affix" || exit 1;
+
+# Problem: We have removed the "train_" prefix of our training set in
+# the alignment directory names! Bad!
+gmm_dir=exp/$gmm
+ali_dir=exp/${gmm}_ali_${train_set}_sp
+tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix}
+lang=data/lang_chain
+lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats
+dir=exp/chain${nnet3_affix}/tdnn${affix}_sp
+train_data_dir=data/${train_set}_sp_hires
+lores_train_data_dir=data/${train_set}_sp
+train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
+
+for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
+    $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+if [ $stage -le 10 ]; then
+  echo "$0: creating lang directory $lang with chain-type topology"
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  if [ -d $lang ]; then
+    if [ $lang/L.fst -nt data/lang/L.fst ]; then
+      echo "$0: $lang already exists, not overwriting it; continuing"
+    else
+      echo "$0: $lang already exists and seems to be older than data/lang..."
+      echo " ... not sure what to do.  Exiting."
+      exit 1;
+    fi
+  else
+    cp -r data/lang $lang
+    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+    # Use our special topology... note that later on may have to tune this
+    # topology.
+    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+  fi
+fi
+
+if [ $stage -le 11 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \
+    data/lang $gmm_dir $lat_dir
+  rm $lat_dir/fsts.*.gz # save space
+fi
+
+if [ $stage -le 12 ]; then
+  # Build a tree using our new topology.  We know we have alignments for the
+  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
+  # those.  The num-leaves is always somewhat less than the num-leaves from
+  # the GMM baseline.
+   if [ -f $tree_dir/final.mdl ]; then
+     echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+     exit 1;
+  fi
+  steps/nnet3/chain/build_tree.sh \
+    --frame-subsampling-factor 3 \
+    --context-opts "--context-width=2 --central-position=1" \
+    --cmd "$train_cmd" 3500 ${lores_train_data_dir} \
+    $lang $ali_dir $tree_dir
+fi
+
+
+if [ $stage -le 13 ]; then
+  mkdir -p $dir
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(python -c "print(0.5/$xent_regularize)")  # print() call form works under Python 2 and 3
+  opts="l2-regularize=0.05"
+  output_opts="l2-regularize=0.01 bottleneck-dim=320"
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-layer name=tdnn1 $opts dim=512
+  relu-batchnorm-layer name=tdnn2 $opts dim=512 input=Append(-1,0,1)
+  relu-batchnorm-layer name=tdnn3 $opts dim=512
+  relu-batchnorm-layer name=tdnn4 $opts dim=512 input=Append(-1,0,1)
+  relu-batchnorm-layer name=tdnn5 $opts dim=512
+  relu-batchnorm-layer name=tdnn6 $opts dim=512 input=Append(-3,0,3)
+  relu-batchnorm-layer name=tdnn7 $opts dim=512 input=Append(-3,0,3)
+  relu-batchnorm-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0)
+
+  ## adding the layers for chain branch
+  relu-batchnorm-layer name=prefinal-chain $opts dim=512 target-rms=0.5
+  output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5
+
+  # adding the layers for xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=512 target-rms=0.5
+  output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 14 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+  fi
+  # chunk_{left,right}_context have no default in this script's config section; fall back to 0 so 'set -u' does not abort.
+  steps/nnet3/chain/train.py --stage=$train_stage \
+    --cmd="$decode_cmd" \
+    --feat.online-ivector-dir=$train_ivector_dir \
+    --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient=0.1 \
+    --chain.l2-regularize=0.00005 \
+    --chain.apply-deriv-weights=false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --trainer.srand=$srand \
+    --trainer.max-param-change=2.0 \
+    --trainer.num-epochs=10 \
+    --trainer.frames-per-iter=3000000 \
+    --trainer.optimization.num-jobs-initial=2 \
+    --trainer.optimization.num-jobs-final=4 \
+    --trainer.optimization.initial-effective-lrate=0.001 \
+    --trainer.optimization.final-effective-lrate=0.0001 \
+    --trainer.optimization.shrink-value=1.0 \
+    --trainer.num-chunk-per-minibatch=256,128,64 \
+    --trainer.optimization.momentum=0.0 \
+    --egs.chunk-width=$chunk_width \
+    --egs.chunk-left-context=${chunk_left_context:-0} \
+    --egs.chunk-right-context=${chunk_right_context:-0} \
+    --egs.chunk-left-context-initial=0 \
+    --egs.chunk-right-context-final=0 \
+    --egs.dir="$common_egs_dir" \
+    --egs.opts="--frames-overlap-per-eg 0" \
+    --cleanup.remove-egs=$remove_egs \
+    --use-gpu=true \
+    --feat-dir=$train_data_dir \
+    --tree-dir=$tree_dir \
+    --lat-dir=$lat_dir \
+    --dir=$dir  || exit 1;
+fi
+
+if [ $stage -le 15 ]; then
+  # Note: it's not important to give mkgraph.sh the lang directory with the
+  # matched topology (since it gets the topology file from the model).
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 data/lang${lm_suffix}/ \
+    $tree_dir $tree_dir/graph${lm_suffix} || exit 1;
+fi
+
+if [ $stage -le 16 ]; then
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  rm $dir/.error 2>/dev/null || true
+
+  for data in $test_sets; do
+    (
+      steps/nnet3/decode.sh \
+          --acwt 1.0 --post-decode-acwt 10.0 \
+          --frames-per-chunk $frames_per_chunk \
+          --nj 8 --cmd "$decode_cmd"  --num-threads 4 \
+          --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \
+          $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+# Not testing the 'looped' decoding separately, because for
+# TDNN systems it would give exactly the same results as the
+# normal decoding.
+
+if $test_online_decoding && [ $stage -le 17 ]; then
+  # note: if the features change (e.g. you add pitch features), you will have to
+  # change the options of the following command line.
+  steps/online/nnet3/prepare_online_decoding.sh \
+    --mfcc-config conf/mfcc_hires.conf \
+    $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online
+
+  rm $dir/.error 2>/dev/null || true
+
+  for data in $test_sets; do
+    (
+      nspk=$(wc -l <data/${data}_hires/spk2utt)
+      # note: we just give it "data/${data}" as it only uses the wav.scp, the
+      # feature type does not matter.
+      steps/online/nnet3/decode.sh \
+        --acwt 1.0 --post-decode-acwt 10.0 \
+        --nj 8 --cmd "$decode_cmd" \
+        $tree_dir/graph${lm_suffix} data/${data} ${dir}_online/decode${lm_suffix}_${data} || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+
+exit 0;
diff --git a/egs/chime5/s5b/local/chain/tuning/run_tdnn_1b.sh b/egs/chime5/s5b/local/chain/tuning/run_tdnn_1b.sh
new file mode 100755
index 00000000000..e033715d884
--- /dev/null
+++ b/egs/chime5/s5b/local/chain/tuning/run_tdnn_1b.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+
+# This factorized TDNN (TDNN-F) script is adapted from SWBD recipe 7q.
+# It uses resnet-style skip connections.
+# For details, refer to the paper:
+# "Semi-Orthogonal Low-Rank Matrix Factorization for Deep Neural Networks", Daniel Povey, Gaofeng Cheng, Yiming Wang, Ke Li, Hainan Xu, Mahsa Yarmohamadi, Sanjeev Khudanpur, Interspeech 2018
+
+# %WER 70.27 [ 41375 / 58881, 3487 ins, 22831 del, 15057 sub ] exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_beamformit_dereverb_ref_2stage/wer_12_0.0
+# %WER 70.28 [ 41383 / 58881, 4486 ins, 19616 del, 17281 sub ] exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_beamformit_ref_2stage/wer_11_0.0
+# %WER 72.62 [ 42761 / 58881, 4545 ins, 21618 del, 16598 sub ] exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_beamformit_ref/wer_11_0.0
+# %WER 72.64 [ 42772 / 58881, 4556 ins, 21618 del, 16598 sub ] exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_beamformit_dereverb_ref/wer_11_0.0
+
+# steps/info/chain_dir_info.pl exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp
+# exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/: num-iters=317 nj=3..16 num-params=17.0M dim=40+100->2792 combine=-0.149->-0.149 (over 2) xent:train/valid[210,316,final]=(-2.50,-1.99,-2.00/-2.36,-1.95,-1.95) logprob:train/valid[210,316,final]=(-0.228,-0.136,-0.136/-0.223,-0.156,-0.155)
+
+set -e
+
+# configs for 'chain'
+stage=0
+nj=96
+train_set=train_worn_u400k
+test_sets="dev_worn dev_beamformit_ref"
+gmm=tri3
+nnet3_affix=_train_worn_u400k
+lm_suffix=
+
+# The rest are configs specific to this script.  Most of the parameters
+# are just hardcoded at this level, in the commands below.
+affix=1b   # affix for the TDNN directory name
+tree_affix=
+train_stage=-10
+get_egs_stage=-10
+decode_iter=
+
+num_epochs=4
+common_egs_dir=
+# training options
+# training chunk-options
+chunk_width=140,100,160
+xent_regularize=0.1
+dropout_schedule='0,0@0.20,0.5@0.50,0'
+
+# training options
+srand=0
+remove_egs=true
+
+#decode options
+test_online_decoding=false  # if true, it will run the last decoding stage.
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 11" if you have already
+# run those things.
+local/nnet3/run_ivector_common.sh --stage $stage \
+                                  --train-set $train_set \
+                                  --test-sets "$test_sets" \
+                                  --gmm $gmm \
+                                  --nnet3-affix "$nnet3_affix" || exit 1;
+
+# Problem: We have removed the "train_" prefix of our training set in
+# the alignment directory names! Bad!
+gmm_dir=exp/$gmm
+tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix}
+lang=data/lang_chain
+lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats
+dir=exp/chain${nnet3_affix}/tdnn${affix}_sp
+train_data_dir=data/${train_set}_sp_hires
+lores_train_data_dir=data/${train_set}_sp
+train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
+
+for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
+    $lores_train_data_dir/feats.scp; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+if [ $stage -le 10 ]; then
+  echo "$0: creating lang directory $lang with chain-type topology"
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  if [ -d $lang ]; then
+    if [ $lang/L.fst -nt data/lang/L.fst ]; then
+      echo "$0: $lang already exists, not overwriting it; continuing"
+    else
+      echo "$0: $lang already exists and seems to be older than data/lang..."
+      echo " ... not sure what to do.  Exiting."
+      exit 1;
+    fi
+  else
+    cp -r data/lang $lang
+    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+    # Use our special topology... note that later on may have to tune this
+    # topology.
+    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+  fi
+fi
+
+if [ $stage -le 11 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" --generate-ali-from-lats true \
+    ${lores_train_data_dir} \
+    data/lang $gmm_dir $lat_dir
+  rm $lat_dir/fsts.*.gz # save space
+fi
+
+if [ $stage -le 12 ]; then
+  # Build a tree using our new topology.  The alignments are read from
+  # $lat_dir, where the previous stage ran align_fmllr_lats.sh with
+  # --generate-ali-from-lats true.  The num-leaves is always somewhat less
+  # than the num-leaves from the GMM baseline.
+  if [ -f $tree_dir/final.mdl ]; then
+     echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+     exit 1;
+  fi
+  steps/nnet3/chain/build_tree.sh \
+    --frame-subsampling-factor 3 \
+    --cmd "$train_cmd" 3500 ${lores_train_data_dir} \
+    $lang $lat_dir $tree_dir
+fi
+
+if [ $stage -le 13 ]; then
+  mkdir -p $dir
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(python -c "print(0.5/$xent_regularize)")  # print() call form works under Python 2 and 3
+  affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true"
+  tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66"
+  linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0"
+  prefinal_opts="l2-regularize=0.01"
+  output_opts="l2-regularize=0.002"
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536
+  tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
+  tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
+  tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
+  tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0
+  tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  linear-component name=prefinal-l dim=256 $linear_opts
+
+  prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256
+  output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
+
+  prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 14 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd "$train_cmd --mem 4G" \
+    --feat.online-ivector-dir=$train_ivector_dir \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.0 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --trainer.dropout-schedule "$dropout_schedule" \
+    --trainer.add-option="--optimization.memory-compression-level=2" \
+    --egs.dir "$common_egs_dir" \
+    --egs.stage $get_egs_stage \
+    --egs.opts "--frames-overlap-per-eg 0" \
+    --egs.chunk-width $chunk_width \
+    --trainer.num-chunk-per-minibatch 64 \
+    --trainer.frames-per-iter 1500000 \
+    --trainer.num-epochs $num_epochs \
+    --trainer.optimization.num-jobs-initial 3 \
+    --trainer.optimization.num-jobs-final 16 \
+    --trainer.optimization.initial-effective-lrate 0.00025 \
+    --trainer.optimization.final-effective-lrate 0.000025 \
+    --trainer.max-param-change 2.0 \
+    --cleanup.remove-egs $remove_egs \
+    --feat-dir=$train_data_dir \
+    --tree-dir=$tree_dir \
+    --lat-dir=$lat_dir \
+    --dir $dir  || exit 1;
+
+fi
+
+if [ $stage -le 15 ]; then
+  # Note: it's not important to give mkgraph.sh the lang directory with the
+  # matched topology (since it gets the topology file from the model).
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 data/lang${lm_suffix}/ \
+    $tree_dir $tree_dir/graph${lm_suffix} || exit 1;
+fi
+
+if [ $stage -le 16 ]; then
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  rm $dir/.error 2>/dev/null || true
+
+  for data in $test_sets; do
+    (
+      steps/nnet3/decode.sh \
+          --acwt 1.0 --post-decode-acwt 10.0 \
+          --frames-per-chunk $frames_per_chunk \
+          --nj 8 --cmd "$decode_cmd"  --num-threads 4 \
+          --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \
+          $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+exit 0;
diff --git a/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh
new file mode 100755
index 00000000000..e3d8e6ac4dc
--- /dev/null
+++ b/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -0,0 +1,297 @@
+#!/bin/bash
+
+# Set -e here so that we catch if any executable fails immediately
+set -euo pipefail
+
+# First the options that are passed through to run_ivector_common.sh
+# (some of which are also used in this script directly).
+stage=0
+nj=96
+train_set=train_worn_u400k_cleaned
+test_sets="dev_worn dev_beamformit_ref"
+gmm=tri3_cleaned
+nnet3_affix=_train_worn_u400k_cleaned
+lm_suffix=
+
+# The rest are configs specific to this script.  Most of the parameters
+# are just hardcoded at this level, in the commands below.
+affix=_1a   # affix for the TDNN directory name
+tree_affix=
+train_stage=-10
+get_egs_stage=-10
+decode_iter=
+
+common_egs_dir=
+
+hidden_dim=1024
+cell_dim=1024
+projection_dim=256
+
+# training options
+num_epochs=2  # 2 works better than 4
+chunk_width=140,100,160
+chunk_left_context=40
+chunk_right_context=0
+dropout_schedule='0,0@0.20,0.3@0.50,0'
+xent_regularize=0.025
+label_delay=5
+
+# decode options
+extra_left_context=50
+extra_right_context=0
+
+# training options
+srand=0
+remove_egs=true
+
+#decode options
+test_online_decoding=false  # if true, it will run the last decoding stage.
+
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 11" if you have already
+# run those things.
+local/nnet3/run_ivector_common.sh --stage $stage \
+                                  --train-set $train_set \
+				  --test-sets "$test_sets" \
+                                  --gmm $gmm \
+                                  --nnet3-affix "$nnet3_affix" || exit 1;
+
+# Problem: We have removed the "train_" prefix of our training set in
+# the alignment directory names! Bad!
+gmm_dir=exp/$gmm
+ali_dir=exp/${gmm}_ali_${train_set}_sp
+tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix}
+lang=data/lang_chain
+lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats
+dir=exp/chain${nnet3_affix}/tdnn_lstm${affix}_sp
+train_data_dir=data/${train_set}_sp_hires
+lores_train_data_dir=data/${train_set}_sp
+train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
+
+for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
+    $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+if [ $stage -le 10 ]; then
+  echo "$0: creating lang directory $lang with chain-type topology"
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  if [ -d $lang ]; then
+    if [ $lang/L.fst -nt data/lang/L.fst ]; then
+      echo "$0: $lang already exists, not overwriting it; continuing"
+    else
+      echo "$0: $lang already exists and seems to be older than data/lang..."
+      echo " ... not sure what to do.  Exiting."
+      exit 1;
+    fi
+  else
+    cp -r data/lang $lang
+    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+    # Use our special topology... note that later on may have to tune this
+    # topology.
+    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+  fi
+fi
+
+if [ $stage -le 11 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \
+    data/lang $gmm_dir $lat_dir
+  rm $lat_dir/fsts.*.gz # save space
+fi
+
+if [ $stage -le 12 ]; then
+  # Build a tree using our new topology.  We know we have alignments for the
+  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
+  # those.  The num-leaves is always somewhat less than the num-leaves from
+  # the GMM baseline.
+   if [ -f $tree_dir/final.mdl ]; then
+     echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+     exit 1;
+  fi
+  steps/nnet3/chain/build_tree.sh \
+    --frame-subsampling-factor 3 \
+    --context-opts "--context-width=2 --central-position=1" \
+    --cmd "$train_cmd" 3500 ${lores_train_data_dir} \
+    $lang $ali_dir $tree_dir
+fi
+
+if [ $stage -le 13 ]; then
+  mkdir -p $dir
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(python -c "print(0.5/$xent_regularize)")  # print() call form works under Python 2 and 3
+
+  lstm_opts="decay-time=40"
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-layer name=tdnn1 dim=$hidden_dim
+  relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim
+  relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim
+
+  fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts
+  relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim
+  relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim
+  fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts
+  relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim
+  relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim
+  fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts
+  relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim
+  relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim
+  fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts
+
+  ## adding the layers for chain branch
+  output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5
+
+  # adding the layers for xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 14 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+  fi
+
+  mkdir -p $dir/egs
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+  steps/nnet3/chain/train.py --stage=$train_stage \
+    --cmd="$train_cmd --mem 4G" \
+    --feat.online-ivector-dir=$train_ivector_dir \
+    --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient=0.1 \
+    --chain.l2-regularize=0.00005 \
+    --chain.apply-deriv-weights=false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --trainer.dropout-schedule $dropout_schedule \
+    --trainer.num-chunk-per-minibatch 64,32 \
+    --trainer.frames-per-iter 1500000 \
+    --trainer.max-param-change 2.0 \
+    --trainer.num-epochs $num_epochs \
+    --trainer.srand=$srand \
+    --trainer.optimization.shrink-value 0.99 \
+    --trainer.optimization.num-jobs-initial=3 \
+    --trainer.optimization.num-jobs-final=16 \
+    --trainer.optimization.initial-effective-lrate=0.001 \
+    --trainer.optimization.final-effective-lrate=0.0001 \
+    --trainer.optimization.momentum=0.0 \
+    --trainer.deriv-truncate-margin 8 \
+    --egs.stage $get_egs_stage \
+    --egs.opts="--frames-overlap-per-eg 0" \
+    --egs.chunk-width=$chunk_width \
+    --egs.chunk-left-context=$chunk_left_context \
+    --egs.chunk-right-context=$chunk_right_context \
+    --egs.chunk-left-context-initial=0 \
+    --egs.chunk-right-context-final=0 \
+    --egs.dir="$common_egs_dir" \
+    --cleanup.remove-egs=$remove_egs \
+    --feat-dir=$train_data_dir \
+    --tree-dir=$tree_dir \
+    --lat-dir=$lat_dir \
+    --dir=$dir  || exit 1;
+fi
+
+if [ $stage -le 15 ]; then
+  # Note: it's not important to give mkgraph.sh the lang directory with the
+  # matched topology (since it gets the topology file from the model).
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 data/lang${lm_suffix}/ \
+    $tree_dir $tree_dir/graph${lm_suffix} || exit 1;
+fi
+
+if [ $stage -le 16 ]; then
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  rm $dir/.error 2>/dev/null || true
+  # Decode with the extra_{left,right}_context decode options from the config section, not the training chunk context.
+  for data in $test_sets; do
+    (
+      steps/nnet3/decode.sh \
+          --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context $extra_left_context \
+          --extra-right-context $extra_right_context \
+          --extra-left-context-initial 0 \
+          --extra-right-context-final 0 \
+          --frames-per-chunk $frames_per_chunk \
+          --nj 8 --cmd "$decode_cmd"  --num-threads 4 \
+          --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \
+          $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+# Note: 'looped' decoding is not tested in this script.  (The remark in the
+# TDNN recipes, that it gives exactly the same results as normal decoding,
+# is about TDNN systems and should not be assumed for this TDNN-LSTM model.)
+
+if $test_online_decoding && [ $stage -le 17 ]; then
+  # note: if the features change (e.g. you add pitch features), you will have to
+  # change the options of the following command line.
+  steps/online/nnet3/prepare_online_decoding.sh \
+    --mfcc-config conf/mfcc_hires.conf \
+    $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online
+
+  rm $dir/.error 2>/dev/null || true
+
+  for data in $test_sets; do
+    (
+      nspk=$(wc -l <data/${data}_hires/spk2utt)
+      # note: we just give it "data/${data}" as it only uses the wav.scp, the
+      # feature type does not matter.
+      steps/online/nnet3/decode.sh \
+        --acwt 1.0 --post-decode-acwt 10.0 \
+        --nj 8 --cmd "$decode_cmd" \
+        $tree_dir/graph${lm_suffix} data/${data} ${dir}_online/decode${lm_suffix}_${data} || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+
+exit 0;
diff --git a/egs/chime5/s5b/local/check_tools.sh b/egs/chime5/s5b/local/check_tools.sh
new file mode 100755
index 00000000000..9c0f9290a75
--- /dev/null
+++ b/egs/chime5/s5b/local/check_tools.sh
@@ -0,0 +1,71 @@
+#!/bin/bash -u
+
+# Copyright 2015 (c) Johns Hopkins University (Jan Trmal <jtrmal@gmail.com>)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+[ -f ./path.sh ] && . ./path.sh
+
+command -v uconv &>/dev/null \
+  || { echo  >&2 "uconv not found on PATH. You will have to install ICU4C"; exit 1; }
+
+command -v ngram &>/dev/null \
+  || { echo  >&2 "srilm not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_srilm.sh to install it"; exit 1; }
+
+if [ -z "${LIBLBFGS:-}" ]; then  # ':-' default: the script runs with 'bash -u', a bare unset variable would abort before the message below
+  echo >&2  "SRILM is not compiled with the support of MaxEnt models."
+  echo >&2  "You should use the script in \$KALDI_ROOT/tools/install_srilm.sh"
+  echo >&2  "which will take care of compiling the SRILM with MaxEnt support"
+  exit 1;
+fi
+
+sox=`command -v sox 2>/dev/null` \
+  || { echo  >&2 "sox not found on PATH. Please install it manually (you will need version 14.4.0 and higher)."; exit 1; }
+
+# If sox is found on path, check if the version is correct
+if [ ! -z "$sox" ]; then
+  sox_version=`$sox --version 2>&1| head -1 | sed -e 's?.*: ??' -e 's?.* ??'`
+  if [[ ! $sox_version =~ v14.4.* ]]; then
+    echo "Unsupported sox version $sox_version found on path. You will need version v14.4.0 and higher."
+    exit 1
+  fi
+fi
+
+command -v phonetisaurus-align &>/dev/null \
+  || { echo  >&2 "Phonetisaurus not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_phonetisaurus.sh to install it"; exit 1; }
+
+command -v BeamformIt &>/dev/null \
+  || { echo  >&2 "BeamformIt not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_beamformit.sh to install it"; exit 1; }
+
+miniconda_dir=$HOME/miniconda3/
+if [ ! -d $miniconda_dir ]; then
+    echo "$miniconda_dir does not exist. Please run '../../../tools/extras/install_miniconda.sh'"
+fi
+
+# check if WPE is installed
+result=`$miniconda_dir/bin/python -c "\
+try:
+    import nara_wpe
+    print('1')
+except ImportError:
+    print('0')"`
+
+if [ "$result" == "1" ]; then
+    echo "WPE is installed"
+else
+    echo "WPE is not installed. Please run ../../../tools/extras/install_wpe.sh"
+    exit 1
+fi
+
+exit  0
diff --git a/egs/chime5/s5b/local/copy_lat_dir_parallel.sh b/egs/chime5/s5b/local/copy_lat_dir_parallel.sh
new file mode 100755
index 00000000000..82839604c9e
--- /dev/null
+++ b/egs/chime5/s5b/local/copy_lat_dir_parallel.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+cmd=queue.pl
+nj=40
+stage=0
+speed_perturb=true
+
+. ./path.sh
+. utils/parse_options.sh
+
+if [ $# -ne 4 ]; then
+  echo "Usage: $0 <utt-map> <data-dir> <src-lat-dir> <out-lat-dir>"
+  exit 1
+fi
+
+utt_map=$1
+data=$2
+srcdir=$3
+dir=$4
+
+mkdir -p $dir
+
+cp $srcdir/{phones.txt,tree,final.mdl} $dir || exit 1  # mandatory files: fail if any is missing
+cp $srcdir/{final.alimdl,final.occs,splice_opts,cmvn_opts,delta_opts,final.mat,full.mat} $dir 2>/dev/null || true  # optional files: best-effort copy into $dir
+
+nj_src=$(cat $srcdir/num_jobs) || exit 1
+
+if [ $stage -le 1 ]; then
+  $cmd JOB=1:$nj_src $dir/log/copy_lats_orig.JOB.log \
+    lattice-copy "ark:gunzip -c $srcdir/lat.JOB.gz |" \
+    ark,scp:$dir/lat_orig.JOB.ark,$dir/lat_orig.JOB.scp || exit 1
+fi
+
+for n in $(seq $nj_src); do
+  cat $dir/lat_orig.$n.scp
+done > $dir/lat_orig.scp || exit 1
+
+if $speed_perturb; then
+  for s in 0.9 1.1; do
+    awk -v s=$s '{print "sp"s"-"$1" sp"s"-"$2}' $utt_map
+  done | cat - $utt_map | sort -k1,1 > $dir/utt_map
+  utt_map=$dir/utt_map
+fi
+
+if [ $stage -le 2 ]; then
+  utils/filter_scp.pl -f 2 $dir/lat_orig.scp < $utt_map | \
+    utils/apply_map.pl -f 2 $dir/lat_orig.scp > \
+    $dir/lat.scp || exit 1
+
+  if [ ! -s $dir/lat.scp ]; then
+    echo "$0: $dir/lat.scp is empty. Something went wrong!"
+    exit 1
+  fi
+fi
+
+utils/split_data.sh $data $nj
+
+if [ $stage -le 3 ]; then
+  $cmd JOB=1:$nj $dir/log/copy_lats.JOB.log \
+    lattice-copy "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/lat.scp |" \
+    "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1
+fi
+
+echo $nj > $dir/num_jobs
+
+if [ -f $srcdir/ali.1.gz ]; then
+  if [ $stage -le 4 ]; then
+    $cmd JOB=1:$nj_src $dir/log/copy_ali_orig.JOB.log \
+      copy-int-vector "ark:gunzip -c $srcdir/ali.JOB.gz |" \
+      ark,scp:$dir/ali_orig.JOB.ark,$dir/ali_orig.JOB.scp || exit 1
+  fi
+
+  for n in $(seq $nj_src); do
+    cat $dir/ali_orig.$n.scp
+  done > $dir/ali_orig.scp || exit 1
+
+  if [ $stage -le 5 ]; then
+    utils/filter_scp.pl -f 2 $dir/ali_orig.scp < $utt_map | \
+      utils/apply_map.pl -f 2 $dir/ali_orig.scp > \
+      $dir/ali.scp || exit 1
+  
+    if [ ! -s $dir/ali.scp ]; then
+      echo "$0: $dir/ali.scp is empty. Something went wrong!"
+      exit 1
+    fi
+  fi
+
+  utils/split_data.sh $data $nj
+
+  if [ $stage -le 6 ]; then
+    $cmd JOB=1:$nj $dir/log/copy_ali.JOB.log \
+      copy-int-vector "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/ali.scp |" \
+      "ark:|gzip -c > $dir/ali.JOB.gz" || exit 1
+  fi
+fi
+
+rm $dir/lat_orig.*.{ark,scp} $dir/ali_orig.*.{ark,scp} 2>/dev/null || true
diff --git a/egs/chime5/s5b/local/distant_audio_list b/egs/chime5/s5b/local/distant_audio_list
new file mode 100644
index 00000000000..fc7aff15cd0
--- /dev/null
+++ b/egs/chime5/s5b/local/distant_audio_list
@@ -0,0 +1,376 @@
+S03_U01.CH1
+S03_U01.CH2
+S03_U01.CH3
+S03_U01.CH4
+S03_U02.CH1
+S03_U02.CH2
+S03_U02.CH3
+S03_U02.CH4
+S03_U03.CH1
+S03_U03.CH2
+S03_U03.CH3
+S03_U03.CH4
+S03_U04.CH1
+S03_U04.CH2
+S03_U04.CH3
+S03_U04.CH4
+S03_U05.CH1
+S03_U05.CH2
+S03_U05.CH3
+S03_U05.CH4
+S03_U06.CH1
+S03_U06.CH2
+S03_U06.CH3
+S03_U06.CH4
+S04_U01.CH1
+S04_U01.CH2
+S04_U01.CH3
+S04_U01.CH4
+S04_U02.CH1
+S04_U02.CH2
+S04_U02.CH3
+S04_U02.CH4
+S04_U03.CH1
+S04_U03.CH2
+S04_U03.CH3
+S04_U03.CH4
+S04_U04.CH1
+S04_U04.CH2
+S04_U04.CH3
+S04_U04.CH4
+S04_U05.CH1
+S04_U05.CH2
+S04_U05.CH3
+S04_U05.CH4
+S04_U06.CH1
+S04_U06.CH2
+S04_U06.CH3
+S04_U06.CH4
+S05_U01.CH1
+S05_U01.CH2
+S05_U01.CH3
+S05_U01.CH4
+S05_U02.CH1
+S05_U02.CH2
+S05_U02.CH3
+S05_U02.CH4
+S05_U04.CH1
+S05_U04.CH2
+S05_U04.CH3
+S05_U04.CH4
+S05_U05.CH1
+S05_U05.CH2
+S05_U05.CH3
+S05_U05.CH4
+S05_U06.CH1
+S05_U06.CH2
+S05_U06.CH3
+S05_U06.CH4
+S06_U01.CH1
+S06_U01.CH2
+S06_U01.CH3
+S06_U01.CH4
+S06_U02.CH1
+S06_U02.CH2
+S06_U02.CH3
+S06_U02.CH4
+S06_U03.CH1
+S06_U03.CH2
+S06_U03.CH3
+S06_U03.CH4
+S06_U04.CH1
+S06_U04.CH2
+S06_U04.CH3
+S06_U04.CH4
+S06_U05.CH1
+S06_U05.CH2
+S06_U05.CH3
+S06_U05.CH4
+S06_U06.CH1
+S06_U06.CH2
+S06_U06.CH3
+S06_U06.CH4
+S07_U01.CH1
+S07_U01.CH2
+S07_U01.CH3
+S07_U01.CH4
+S07_U02.CH1
+S07_U02.CH2
+S07_U02.CH3
+S07_U02.CH4
+S07_U03.CH1
+S07_U03.CH2
+S07_U03.CH3
+S07_U03.CH4
+S07_U04.CH1
+S07_U04.CH2
+S07_U04.CH3
+S07_U04.CH4
+S07_U05.CH1
+S07_U05.CH2
+S07_U05.CH3
+S07_U05.CH4
+S07_U06.CH1
+S07_U06.CH2
+S07_U06.CH3
+S07_U06.CH4
+S08_U01.CH1
+S08_U01.CH2
+S08_U01.CH3
+S08_U01.CH4
+S08_U02.CH1
+S08_U02.CH2
+S08_U02.CH3
+S08_U02.CH4
+S08_U03.CH1
+S08_U03.CH2
+S08_U03.CH3
+S08_U03.CH4
+S08_U04.CH1
+S08_U04.CH2
+S08_U04.CH3
+S08_U04.CH4
+S08_U05.CH1
+S08_U05.CH2
+S08_U05.CH3
+S08_U05.CH4
+S08_U06.CH1
+S08_U06.CH2
+S08_U06.CH3
+S08_U06.CH4
+S12_U01.CH1
+S12_U01.CH2
+S12_U01.CH3
+S12_U01.CH4
+S12_U02.CH1
+S12_U02.CH2
+S12_U02.CH3
+S12_U02.CH4
+S12_U03.CH1
+S12_U03.CH2
+S12_U03.CH3
+S12_U03.CH4
+S12_U04.CH1
+S12_U04.CH2
+S12_U04.CH3
+S12_U04.CH4
+S12_U05.CH1
+S12_U05.CH2
+S12_U05.CH3
+S12_U05.CH4
+S12_U06.CH1
+S12_U06.CH2
+S12_U06.CH3
+S12_U06.CH4
+S13_U01.CH1
+S13_U01.CH2
+S13_U01.CH3
+S13_U01.CH4
+S13_U02.CH1
+S13_U02.CH2
+S13_U02.CH3
+S13_U02.CH4
+S13_U03.CH1
+S13_U03.CH2
+S13_U03.CH3
+S13_U03.CH4
+S13_U04.CH1
+S13_U04.CH2
+S13_U04.CH3
+S13_U04.CH4
+S13_U05.CH1
+S13_U05.CH2
+S13_U05.CH3
+S13_U05.CH4
+S13_U06.CH1
+S13_U06.CH2
+S13_U06.CH3
+S13_U06.CH4
+S16_U01.CH1
+S16_U01.CH2
+S16_U01.CH3
+S16_U01.CH4
+S16_U02.CH1
+S16_U02.CH2
+S16_U02.CH3
+S16_U02.CH4
+S16_U03.CH1
+S16_U03.CH2
+S16_U03.CH3
+S16_U03.CH4
+S16_U04.CH1
+S16_U04.CH2
+S16_U04.CH3
+S16_U04.CH4
+S16_U05.CH1
+S16_U05.CH2
+S16_U05.CH3
+S16_U05.CH4
+S16_U06.CH1
+S16_U06.CH2
+S16_U06.CH3
+S16_U06.CH4
+S17_U01.CH1
+S17_U01.CH2
+S17_U01.CH3
+S17_U01.CH4
+S17_U02.CH1
+S17_U02.CH2
+S17_U02.CH3
+S17_U02.CH4
+S17_U03.CH1
+S17_U03.CH2
+S17_U03.CH3
+S17_U03.CH4
+S17_U04.CH1
+S17_U04.CH2
+S17_U04.CH3
+S17_U04.CH4
+S17_U05.CH1
+S17_U05.CH2
+S17_U05.CH3
+S17_U05.CH4
+S17_U06.CH1
+S17_U06.CH2
+S17_U06.CH3
+S17_U06.CH4
+S18_U01.CH1
+S18_U01.CH2
+S18_U01.CH3
+S18_U01.CH4
+S18_U02.CH1
+S18_U02.CH2
+S18_U02.CH3
+S18_U02.CH4
+S18_U03.CH1
+S18_U03.CH2
+S18_U03.CH3
+S18_U03.CH4
+S18_U04.CH1
+S18_U04.CH2
+S18_U04.CH3
+S18_U04.CH4
+S18_U05.CH1
+S18_U05.CH2
+S18_U05.CH3
+S18_U05.CH4
+S18_U06.CH1
+S18_U06.CH2
+S18_U06.CH3
+S18_U06.CH4
+S19_U01.CH1
+S19_U01.CH2
+S19_U01.CH3
+S19_U01.CH4
+S19_U02.CH1
+S19_U02.CH2
+S19_U02.CH3
+S19_U02.CH4
+S19_U03.CH1
+S19_U03.CH2
+S19_U03.CH3
+S19_U03.CH4
+S19_U04.CH1
+S19_U04.CH2
+S19_U04.CH3
+S19_U04.CH4
+S19_U05.CH1
+S19_U05.CH2
+S19_U05.CH3
+S19_U05.CH4
+S19_U06.CH1
+S19_U06.CH2
+S19_U06.CH3
+S19_U06.CH4
+S20_U01.CH1
+S20_U01.CH2
+S20_U01.CH3
+S20_U01.CH4
+S20_U02.CH1
+S20_U02.CH2
+S20_U02.CH3
+S20_U02.CH4
+S20_U03.CH1
+S20_U03.CH2
+S20_U03.CH3
+S20_U03.CH4
+S20_U04.CH1
+S20_U04.CH2
+S20_U04.CH3
+S20_U04.CH4
+S20_U05.CH1
+S20_U05.CH2
+S20_U05.CH3
+S20_U05.CH4
+S20_U06.CH1
+S20_U06.CH2
+S20_U06.CH3
+S20_U06.CH4
+S22_U01.CH1
+S22_U01.CH2
+S22_U01.CH3
+S22_U01.CH4
+S22_U02.CH1
+S22_U02.CH2
+S22_U02.CH3
+S22_U02.CH4
+S22_U04.CH1
+S22_U04.CH2
+S22_U04.CH3
+S22_U04.CH4
+S22_U05.CH1
+S22_U05.CH2
+S22_U05.CH3
+S22_U05.CH4
+S22_U06.CH1
+S22_U06.CH2
+S22_U06.CH3
+S22_U06.CH4
+S23_U01.CH1
+S23_U01.CH2
+S23_U01.CH3
+S23_U01.CH4
+S23_U02.CH1
+S23_U02.CH2
+S23_U02.CH3
+S23_U02.CH4
+S23_U03.CH1
+S23_U03.CH2
+S23_U03.CH3
+S23_U03.CH4
+S23_U04.CH1
+S23_U04.CH2
+S23_U04.CH3
+S23_U04.CH4
+S23_U05.CH1
+S23_U05.CH2
+S23_U05.CH3
+S23_U05.CH4
+S23_U06.CH1
+S23_U06.CH2
+S23_U06.CH3
+S23_U06.CH4
+S24_U01.CH1
+S24_U01.CH2
+S24_U01.CH3
+S24_U01.CH4
+S24_U02.CH1
+S24_U02.CH2
+S24_U02.CH3
+S24_U02.CH4
+S24_U03.CH1
+S24_U03.CH2
+S24_U03.CH3
+S24_U03.CH4
+S24_U04.CH1
+S24_U04.CH2
+S24_U04.CH3
+S24_U04.CH4
+S24_U05.CH1
+S24_U05.CH2
+S24_U05.CH3
+S24_U05.CH4
+S24_U06.CH1
+S24_U06.CH2
+S24_U06.CH3
+S24_U06.CH4
diff --git a/egs/chime5/s5b/local/extract_noises.py b/egs/chime5/s5b/local/extract_noises.py
new file mode 100755
index 00000000000..f7b7f752d9e
--- /dev/null
+++ b/egs/chime5/s5b/local/extract_noises.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import logging
+import os
+import sys
+import scipy.io.wavfile as siw
+import math
+import numpy as np
+
+
+def get_args():  # build the command-line parser and return the parsed arguments
+    parser = argparse.ArgumentParser(
+        description="""Extract noises from the corpus based on the non-speech regions.
+        e.g. {} /export/corpora4/CHiME5/audio/train/ \\
+                /export/corpora4/CHiME5/transcriptions/train/ \\
+                /export/b05/zhiqiw/noise/""".format(sys.argv[0]))
+
+    parser.add_argument("--segment-length", type=int, default=20, help="Length of each output noise segment, in seconds")  # int: multiplied by the sample rate and used as a slice bound
+    parser.add_argument("audio_dir", help="""Location of the CHiME5 Audio files. e.g. /export/corpora4/CHiME5/audio/train/""")
+    parser.add_argument("trans_dir", help="""Location of the CHiME5 Transcriptions. e.g. /export/corpora4/CHiME5/transcriptions/train/""")
+    parser.add_argument("audio_list", help="""List of ids of the CHiME5 recordings from which noise is extracted. e.g. local/distant_audio_list""")
+    parser.add_argument("out_dir", help="Output directory to write noise files. e.g. /export/b05/zhiqiw/noise/")
+
+    args = parser.parse_args()
+    return args
+
+
+def Trans_time(time, fs):
+    units = time.split(':')
+    time_second = float(units[0]) * 3600 + float(units[1]) * 60 + float(units[2])
+    return int(time_second*fs)
+
+
+def Get_time(conf, tag, mic, fs):
+    for i in conf:
+        st = Trans_time(i['start_time'][mic], fs)
+        ed = Trans_time(i['end_time'][mic], fs)
+        tag[st:ed] = 0
+    return tag
+
+
+def write_noise(out_dir, seg, audio, sig, tag, fs, cnt):
+    sig_noise = sig[np.nonzero(tag)]
+    for i in range(math.floor(len(sig_noise)/(seg*fs))):
+        siw.write(out_dir +'/noise'+str(cnt)+'.wav', fs, sig_noise[i*seg*fs:(i+1)*seg*fs])
+        cnt += 1
+    return cnt
+
+
+def main():
+    args = get_args()
+
+    if not os.path.exists(args.out_dir):
+        os.makedirs(args.out_dir)
+
+    wav_list = open(args.audio_list).readlines()
+
+    cnt = 1
+    for i, audio in enumerate(wav_list):
+        parts = audio.strip().split('.')
+        if len(parts) == 2:
+            # Assuming distant mic with name like S03_U01.CH1
+            session, mic = parts[0].split('_')
+            channel = parts[1]
+            base_name = session + "_" + mic + "." + channel
+        else:
+            # Assuming close talk mic with name like S03_P09
+            session, mic = audio.strip().split('_')
+            base_name = session + "_" + mic
+        fs, sig = siw.read(args.audio_dir + "/" + base_name + '.wav')
+        tag = np.ones(len(sig))
+        if i == 0 or session != session_p:
+            with open(args.trans_dir + "/" + session + '.json') as f:
+                conf = json.load(f)
+        tag = Get_time(conf, tag, mic, fs)
+        cnt = write_noise(args.out_dir, args.segment_length, audio, sig, tag, fs, cnt)
+        session_p = session
+
+
+if __name__ == '__main__':
+    main()
diff --git a/egs/chime5/s5b/local/extract_vad_weights.sh b/egs/chime5/s5b/local/extract_vad_weights.sh
new file mode 100755
index 00000000000..250b021bd8f
--- /dev/null
+++ b/egs/chime5/s5b/local/extract_vad_weights.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti)
+#           2019 Vimal Manohar
+# Apache 2.0.
+
+# This script converts lattices available from a first pass decode into a per-frame weights file
+# The ctms generated from the lattices are filtered. Silence frames are assigned a low weight (e.g.0.00001)
+# and voiced frames have a weight of 1.
+
+set -e
+
+stage=1
+cmd=run.pl
+silence_weight=0.00001
+#end configuration section.
+
+. ./cmd.sh
+
+[ -f ./path.sh ] && . ./path.sh
+. utils/parse_options.sh || exit 1;
+if [ $# -ne 4 ]; then
+  echo "Usage: $0 [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <input-decode-dir> <output-wts-file-gzipped>"
+  echo " Options:"
+  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
+  exit 1;
+fi
+
+data_dir=$1
+lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
+decode_dir=$3
+output_wts_file_gz=$4
+
+if [ $stage -le 1 ]; then
+  echo "$0: generating CTM from input lattices"
+  steps/get_ctm_conf.sh --cmd "$cmd" \
+    --use-segments false \
+    $data_dir \
+    $lang \
+    $decode_dir
+fi
+
+if [ $stage -le 2 ]; then
+  name=`basename $data_dir`
+  # we just take the ctm from LMWT 10, it doesn't seem to affect the results a lot
+  ctm=$decode_dir/score_10/$name.ctm
+  echo "$0: generating weights file from ctm $ctm"
+
+  pad_frames=0  # this did not seem to be helpful but leaving it as an option.
+  feat-to-len scp:$data_dir/feats.scp ark,t:- >$decode_dir/utt.lengths
+  if [ ! -f $ctm ]; then  echo "$0: expected ctm to exist: $ctm"; exit 1; fi
+
+  cat $ctm | awk '$6 == 1.0 && $4 < 1.0' | \
+  grep -v -w mm | grep -v -w mhm | grep -v -F '[noise]' | \
+  grep -v -F '[laughter]' | grep -v -F '<unk>' | \
+  perl -e ' $lengths=shift @ARGV;  $pad_frames=shift @ARGV; $silence_weight=shift @ARGV;
+   $pad_frames >= 0 || die "bad pad-frames value $pad_frames";
+   open(L, "<$lengths") || die "opening lengths file";
+   @all_utts = ();
+   $utt2ref = { };
+   while (<L>) {
+     ($utt, $len) = split(" ", $_);
+     push @all_utts, $utt;
+     $array_ref = [ ];
+     for ($n = 0; $n < $len; $n++) { ${$array_ref}[$n] = $silence_weight; }
+     $utt2ref{$utt} = $array_ref;
+   }
+   while (<STDIN>) {
+     @A = split(" ", $_);
+     @A == 6 || die "bad ctm line $_";
+     $utt = $A[0]; $beg = $A[2]; $len = $A[3];
+     $beg_int = int($beg * 100) - $pad_frames;
+     $len_int = int($len * 100) + 2*$pad_frames;
+     $array_ref = $utt2ref{$utt};
+     !defined $array_ref  && die "No length info for utterance $utt";
+     for ($t = $beg_int; $t < $beg_int + $len_int; $t++) {
+       if ($t >= 0 && $t < @$array_ref) {
+         ${$array_ref}[$t] = 1;
+        }
+      }
+    }
+    foreach $utt (@all_utts) {  $array_ref = $utt2ref{$utt};
+      print $utt, " [ ", join(" ", @$array_ref), " ]\n";
+      } ' $decode_dir/utt.lengths $pad_frames $silence_weight | \
+        gzip -c > $output_wts_file_gz
+fi
diff --git a/egs/chime5/s5b/local/json2text.py b/egs/chime5/s5b/local/json2text.py
new file mode 100755
index 00000000000..4df0160efb6
--- /dev/null
+++ b/egs/chime5/s5b/local/json2text.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+
+# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+import json
+import argparse
+import logging
+import sys
+
+
+def hms_to_seconds(hms):
+    hour = hms.split(':')[0]
+    minute = hms.split(':')[1]
+    second = hms.split(':')[2].split('.')[0]
+
+    # .xx (10 ms order)
+    ms10 = hms.split(':')[2].split('.')[1]
+
+    # total seconds
+    seconds = int(hour) * 3600 + int(minute) * 60 + int(second)
+
+    return '{:07d}'.format(int(str(seconds) + ms10))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('json', type=str, help='JSON transcription file')
+    parser.add_argument('--mictype', type=str,
+                        choices=['ref', 'worn', 'u01', 'u02', 'u03', 'u04', 'u05', 'u06'],
+                        help='Type of microphones')
+    args = parser.parse_args()
+
+    # logging info
+    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s:%(message)s"
+    logging.basicConfig(level=logging.INFO, format=log_format)
+
+    logging.debug("reading %s", args.json)
+    with open(args.json, 'rt', encoding="utf-8") as f:
+        j = json.load(f)
+
+    for x in j:
+        if '[redacted]' not in x['words']:
+            session_id = x['session_id']
+            speaker_id = x['speaker']
+            if args.mictype == 'ref':
+                mictype = x['ref']
+            elif args.mictype == 'worn':
+                mictype = 'original'
+            else:
+                mictype = args.mictype.upper() # convert from u01 to U01
+
+            # add location tag for scoring (only for dev and eval sets)
+            if 'location' in x.keys():
+                location = x['location'].upper()
+            else:
+                location = 'NOLOCATION'
+
+            start_time = x['start_time'][mictype]
+            end_time = x['end_time'][mictype]
+        
+            # remove meta chars and convert to lower
+            words = x['words'].replace('"', '')\
+                              .replace('.', '')\
+                              .replace('?', '')\
+                              .replace(',', '')\
+                              .replace(':', '')\
+                              .replace(';', '')\
+                              .replace('!', '').lower()
+
+            # remove multiple spaces
+            words = " ".join(words.split())
+
+            # convert to a zero-padded centisecond string, e.g., 1:10:05.55 -> (3600 + 600 + 5) s followed by 55 -> '0420555'
+            start_time = hms_to_seconds(start_time)
+            end_time = hms_to_seconds(end_time)
+
+            uttid = speaker_id + '_' + session_id
+            if not args.mictype == 'worn':
+                uttid += '_' + mictype
+            uttid += '_' + location + '-' + start_time + '-' + end_time
+
+            if end_time > start_time:
+                sys.stdout.buffer.write((uttid + ' ' + words + '\n').encode("utf-8"))
diff --git a/egs/chime5/s5b/local/make_noise_list.py b/egs/chime5/s5b/local/make_noise_list.py
new file mode 100755
index 00000000000..5aaf7fa4062
--- /dev/null
+++ b/egs/chime5/s5b/local/make_noise_list.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+
+import glob
+import os
+import sys
+
+
+if len(sys.argv) != 2:
+    print ("Usage: {} <noises-dir>".format(sys.argv[0]))
+    raise SystemExit(1)
+
+
+for line in glob.glob("{}/*.wav".format(sys.argv[1])):
+    fname = os.path.basename(line.strip())
+
+    print ("--noise-id {} --noise-type point-source "
+           "--bg-fg-type foreground {}".format(fname, line.strip()))
diff --git a/egs/chime5/s5/local/chain/compare_wer.sh b/egs/chime5/s5b/local/nnet3/compare_wer.sh
similarity index 84%
rename from egs/chime5/s5/local/chain/compare_wer.sh
rename to egs/chime5/s5b/local/nnet3/compare_wer.sh
index cd6be14ed88..095e85cc338 100755
--- a/egs/chime5/s5/local/chain/compare_wer.sh
+++ b/egs/chime5/s5b/local/nnet3/compare_wer.sh
@@ -101,31 +101,32 @@ if $used_epochs; then
   exit 0;  # the diagnostics aren't comparable between regular and discriminatively trained systems.
 fi
 
-
 echo -n "# Final train prob     "
 for x in $*; do
-  prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}')
+  prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}')
   printf "% 10s" $prob
 done
 echo
 
 echo -n "# Final valid prob     "
 for x in $*; do
-  prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}')
+  prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}')
   printf "% 10s" $prob
 done
 echo
 
-echo -n "# Final train prob (xent)"
+echo -n "# Final train acc      "
 for x in $*; do
-  prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}')
+  prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}')
   printf "% 10s" $prob
 done
 echo
 
-echo -n "# Final valid prob (xent)"
+echo -n "# Final valid acc      "
 for x in $*; do
-  prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}')
+  prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}')
   printf "% 10s" $prob
 done
 echo
+
+echo
diff --git a/egs/chime5/s5b/local/nnet3/decode.sh b/egs/chime5/s5b/local/nnet3/decode.sh
new file mode 100755
index 00000000000..7af09f36a13
--- /dev/null
+++ b/egs/chime5/s5b/local/nnet3/decode.sh
@@ -0,0 +1,162 @@
+#!/bin/bash
+
+# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti)
+#           2019 Vimal Manohar 
+# Apache 2.0.
+
+# This script does 2-stage decoding where the first stage is used to get 
+# reliable frames for i-vector extraction.
+
+set -e
+
+# general opts
+iter=
+stage=0
+nj=30
+affix=  # affix for decode directory
+
+# ivector opts
+max_count=75  # parameter for extract_ivectors.sh
+sub_speaker_frames=6000
+ivector_scale=0.75
+get_weights_from_ctm=true
+weights_file=   # use weights from this archive (must be gzip-compressed)
+silence_weight=0.00001   # apply this weight to silence frames during i-vector extraction
+ivector_dir=exp/nnet3
+
+# decode opts
+pass2_decode_opts="--min-active 1000"
+lattice_beam=8
+extra_left_context=0 # change for (B)LSTM
+extra_right_context=0 # change for BLSTM
+frames_per_chunk=50 # change for (B)LSTM
+acwt=0.1 # important to change this when using chain models
+post_decode_acwt=1.0 # important to change this when using chain models
+extra_left_context_initial=0
+extra_right_context_final=0
+
+score_opts="--min-lmwt 6 --max-lmwt 13"
+
+. ./cmd.sh
+[ -f ./path.sh ] && . ./path.sh
+. utils/parse_options.sh || exit 1;
+
+if [ $# -ne 4 ]; then
+  echo "Usage: $0 [options] <data-dir> <lang-dir> <graph-dir> <model-dir>"
+  echo " Options:"
+  echo "    --stage (0|1|2)   # start scoring script from part-way through."
+  echo "e.g.:"
+  echo "$0 data/dev data/lang exp/tri5a/graph_pp exp/nnet3/tdnn"
+  exit 1;
+fi
+
+data=$1 # data directory 
+lang=$2 # data/lang
+graph=$3 #exp/tri5a/graph_pp
+dir=$4 # exp/nnet3/tdnn
+
+model_affix=`basename $dir`
+ivector_affix=${affix:+_$affix}_chain_${model_affix}${iter:+_iter$iter}
+affix=${affix:+_${affix}}${iter:+_iter${iter}}
+
+if [ $stage -le 1 ]; then
+  if [ ! -s ${data}_hires/feats.scp ]; then
+    utils/copy_data_dir.sh $data ${data}_hires
+    steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$train_cmd" ${data}_hires
+    steps/compute_cmvn_stats.sh ${data}_hires
+    utils/fix_data_dir.sh ${data}_hires
+  fi
+fi
+
+data_set=$(basename $data)
+# float comparisons are hard in bash.  Done outside the stage-2 check because stage 3 also needs this affix when resuming.
+if [ `bc <<< "$ivector_scale != 1"` -eq 1 ]; then
+  ivector_scale_affix=_scale$ivector_scale
+else
+  ivector_scale_affix=
+fi
+
+if [ $stage -le 2 ]; then
+  echo "Extracting i-vectors, stage 1"
+  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \
+    --max-count $max_count \
+    ${data}_hires $ivector_dir/extractor \
+    $ivector_dir/ivectors_${data_set}${ivector_affix}_stage1;
+  if [ ! -z "$ivector_scale_affix" ]; then
+    echo "$0: Scaling iVectors, stage 1"
+    srcdir=$ivector_dir/ivectors_${data_set}${ivector_affix}_stage1
+    outdir=$ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1
+    mkdir -p $outdir
+    $train_cmd $outdir/log/scale_ivectors.log \
+      copy-matrix --scale=$ivector_scale scp:$srcdir/ivector_online.scp ark:- \| \
+      copy-feats --compress=true ark:-  ark,scp:$outdir/ivector_online.ark,$outdir/ivector_online.scp;
+    cp $srcdir/ivector_period $outdir/ivector_period
+  fi
+fi
+
+decode_dir=$dir/decode_${data_set}${affix}
+# generate the lattices
+if [ $stage -le 3 ]; then
+  echo "Generating lattices, stage 1"
+  steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" \
+    --acwt $acwt --post-decode-acwt $post_decode_acwt \
+    --extra-left-context $extra_left_context  \
+    --extra-right-context $extra_right_context  \
+    --extra-left-context-initial $extra_left_context_initial \
+    --extra-right-context-final $extra_right_context_final \
+    --frames-per-chunk "$frames_per_chunk" \
+    --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1 \
+    --skip-scoring true ${iter:+--iter $iter} \
+    $graph ${data}_hires ${decode_dir}_stage1;
+fi
+
+if $get_weights_from_ctm; then  # the weights location is always set, so resuming at stage 5 still works
+  if [ ! -z "$weights_file" ]; then
+    echo "$0: Using provided vad weights file $weights_file"
+    ivector_extractor_weights=$weights_file
+  else
+    ivector_extractor_weights=${decode_dir}_stage1/weights${affix}.gz
+    if [ $stage -le 4 ]; then
+      echo "$0 : Generating vad weights file"
+      local/extract_vad_weights.sh --silence-weight $silence_weight \
+        --cmd "$decode_cmd" \
+        ${data}_hires $lang \
+        ${decode_dir}_stage1 $ivector_extractor_weights
+    fi
+  fi
+else
+  # get weights from best path decoding
+  ivector_extractor_weights=${decode_dir}_stage1
+fi
+
+if [ $stage -le 5 ]; then
+  echo "Extracting i-vectors, stage 2 with weights from $ivector_extractor_weights"
+  # this does offline decoding, except we estimate the iVectors per
+  # speaker, excluding silence (based on alignments from a DNN decoding), with a
+  # different script.  This is just to demonstrate that script.
+  # the --sub-speaker-frames is optional; if provided, it will divide each speaker
+  # up into "sub-speakers" of at least that many frames... can be useful if
+  # acoustic conditions drift over time within the speaker's data.
+  steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $nj \
+    --silence-weight $silence_weight \
+    --sub-speaker-frames $sub_speaker_frames --max-count $max_count \
+    ${data}_hires $lang $ivector_dir/extractor \
+    $ivector_extractor_weights $ivector_dir/ivectors_${data_set}${ivector_affix};
+fi
+
+if [ $stage -le 6 ]; then
+  echo "Generating lattices, stage 2 with --acwt $acwt"
+  rm -f ${decode_dir}/.error
+  steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" $pass2_decode_opts \
+      --acwt $acwt --post-decode-acwt $post_decode_acwt \
+      --extra-left-context $extra_left_context  \
+      --extra-right-context $extra_right_context  \
+      --extra-left-context-initial $extra_left_context_initial \
+      --extra-right-context-final $extra_right_context_final \
+      --frames-per-chunk "$frames_per_chunk" \
+      --skip-scoring false ${iter:+--iter $iter} --lattice-beam $lattice_beam \
+      --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix} \
+     $graph ${data}_hires ${decode_dir} || touch ${decode_dir}/.error
+  [ -f ${decode_dir}/.error ] && echo "$0: Error decoding" && exit 1;
+fi
+exit 0
diff --git a/egs/chime5/s5b/local/nnet3/run_ivector_common.sh b/egs/chime5/s5b/local/nnet3/run_ivector_common.sh
new file mode 100755
index 00000000000..3910e1812a3
--- /dev/null
+++ b/egs/chime5/s5b/local/nnet3/run_ivector_common.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+
+set -euo pipefail
+
+# This script is called from local/nnet3/run_tdnn.sh and
+# local/chain/run_tdnn.sh (and may eventually be called by more
+# scripts).  It contains the common feature preparation and
+# iVector-related parts of the script.  See those scripts for examples
+# of usage.
+
+stage=0
+train_set=train_worn_u100k
+test_sets="dev_worn dev_beamformit_ref"
+gmm=tri3
+nj=96
+
+nnet3_affix=_train_worn_u100k
+
+. ./cmd.sh
+. ./path.sh
+. utils/parse_options.sh
+
+gmm_dir=exp/${gmm}
+ali_dir=exp/${gmm}_ali_${train_set}_sp
+
+for f in ${gmm_dir}/final.mdl; do
+  if [ ! -f $f ]; then
+    echo "$0: expected file $f to exist"
+    exit 1
+  fi
+done
+
+if [ $stage -le 1 ]; then
+  # Although the nnet will be trained on high-resolution data, we still have to
+  # perturb the normal data to get the alignments; _sp stands for speed-perturbed.
+  echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)"
+  utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp
+  echo "$0: making MFCC features for low-resolution speed-perturbed data"
+  steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/${train_set}_sp || exit 1;
+  steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1;
+  utils/fix_data_dir.sh data/${train_set}_sp
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: aligning with the perturbed low-resolution data"
+  steps/align_fmllr.sh --nj ${nj} --cmd "$train_cmd" \
+    data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1
+fi
+
+if [ $stage -le 3 ]; then
+  # Create high-resolution MFCC features (with 40 cepstra instead of 13).
+  # this shows how you can split across multiple file-systems.
+  echo "$0: creating high-resolution MFCC features"
+  mfccdir=data/${train_set}_sp_hires/data
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
+    utils/create_split_dir.pl /export/b1{5,6,8,9}/$USER/kaldi-data/mfcc/chime5-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
+  fi
+
+  for datadir in ${train_set}_sp ${test_sets}; do
+    utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
+  done
+
+  # do volume-perturbation on the training data prior to extracting hires
+  # features; this helps make trained nnets more invariant to test data volume.
+  utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1;
+
+  for datadir in ${train_set}_sp ${test_sets}; do
+    steps/make_mfcc.sh --nj 20 --mfcc-config conf/mfcc_hires.conf \
+      --cmd "$train_cmd" data/${datadir}_hires || exit 1;
+    steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1;
+    utils/fix_data_dir.sh data/${datadir}_hires || exit 1;
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  echo "$0: computing a subset of data to train the diagonal UBM."
+  # We'll use about a quarter of the data.
+  mkdir -p exp/nnet3${nnet3_affix}/diag_ubm
+  temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm
+
+  num_utts_total=$(wc -l <data/${train_set}_sp_hires/utt2spk)
+  num_utts=$((num_utts_total/4))
+  utils/data/subset_data_dir.sh data/${train_set}_sp_hires \
+     $num_utts ${temp_data_root}/${train_set}_sp_hires_subset
+
+  echo "$0: computing a PCA transform from the hires data."
+  steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
+      --splice-opts "--left-context=3 --right-context=3" \
+      --max-utts 10000 --subsample 2 \
+       ${temp_data_root}/${train_set}_sp_hires_subset \
+       exp/nnet3${nnet3_affix}/pca_transform
+
+  echo "$0: training the diagonal UBM."
+  # Use 512 Gaussians in the UBM.
+  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \
+    --num-frames 700000 \
+    --num-threads 8 \
+    ${temp_data_root}/${train_set}_sp_hires_subset 512 \
+    exp/nnet3${nnet3_affix}/pca_transform exp/nnet3${nnet3_affix}/diag_ubm
+fi
+
+if [ $stage -le 5 ]; then
+  # Train the iVector extractor.  Use all of the speed-perturbed data since iVector extractors
+  # can be sensitive to the amount of data.  The script defaults to an iVector dimension of
+  # 100.
+  echo "$0: training the iVector extractor"
+  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 20 \
+     data/${train_set}_sp_hires exp/nnet3${nnet3_affix}/diag_ubm \
+     exp/nnet3${nnet3_affix}/extractor || exit 1;
+fi
+
+
+if [ $stage -le 6 ]; then
+  # We extract iVectors on the speed-perturbed training data after combining
+  # short segments, which will be what we train the system on.  With
+  # --utts-per-spk-max 2, the script pairs the utterances into twos, and treats
+  # each of these pairs as one speaker; this gives more diversity in iVectors..
+  # Note that these are extracted 'online'.
+
+  # note, we don't encode the 'max2' in the name of the ivectordir even though
+  # that's the data we extract the ivectors from, as it's still going to be
+  # valid for the non-'max2' data, the utterance list is the same.
+
+  ivectordir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then
+    utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/ivectors/chime5-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage
+  fi
+
+
+  # having a larger number of speakers is helpful for generalization, and to
+  # handle per-utterance decoding well (iVector starts at zero).
+  temp_data_root=${ivectordir}
+  utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \
+    data/${train_set}_sp_hires ${temp_data_root}/${train_set}_sp_hires_max2
+
+  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj ${nj} \
+    ${temp_data_root}/${train_set}_sp_hires_max2 \
+    exp/nnet3${nnet3_affix}/extractor $ivectordir
+fi
+
+if [ $stage -le 7 ]; then
+  # Also extract iVectors for the test data, but in this case we don't need the speed
+  # perturbation (sp).
+  for data in $test_sets; do
+    steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \
+      data/${data}_hires exp/nnet3${nnet3_affix}/extractor \
+      exp/nnet3${nnet3_affix}/ivectors_${data}_hires
+  done
+fi
+
+exit 0
diff --git a/egs/chime5/s5b/local/prepare_data.sh b/egs/chime5/s5b/local/prepare_data.sh
new file mode 100755
index 00000000000..98087322c38
--- /dev/null
+++ b/egs/chime5/s5b/local/prepare_data.sh
@@ -0,0 +1,136 @@
+#!/bin/bash
+#
+# Copyright  2017  Johns Hopkins University (Author: Shinji Watanabe, Yenda Trmal)
+# Apache 2.0
+
+# Begin configuration section.
+mictype=worn # worn, ref or others
+cleanup=true
+# End configuration section
+. ./utils/parse_options.sh  # accept command-line options such as --mictype (see the configuration section above)
+
+. ./path.sh
+
+echo >&2 "$0" "$@"
+if [ $# -ne 3 ] ; then
+  echo >&2 "$0" "$@"
+  echo >&2 "$0: Error: wrong number of arguments"
+  echo -e >&2 "Usage:\n  $0 [opts] <audio-dir> <json-transcript-dir> <output-dir>"
+  echo -e >&2 "eg:\n  $0 /corpora/chime5/audio/train /corpora/chime5/transcriptions/train data/train"
+  exit 1
+fi
+
+set -e -o pipefail
+
+adir=$1
+jdir=$2
+dir=$3
+
+json_count=$(find -L $jdir -name "*.json" | wc -l)
+wav_count=$(find -L $adir -name "*.wav" | wc -l)
+
+if [ "$json_count" -eq 0 ]; then
+  echo >&2 "We expect that the directory $jdir will contain json files."
+  echo >&2 "That implies you have supplied a wrong path to the data."
+  exit 1
+fi
+if [ "$wav_count" -eq 0 ]; then
+  echo >&2 "We expect that the directory $adir will contain wav files."
+  echo >&2 "That implies you have supplied a wrong path to the data."
+  exit 1
+fi
+
+echo "$0: Converting transcription to text"
+
+mkdir -p $dir
+for file in $jdir/*json; do
+  ./local/json2text.py --mictype $mictype $file
+done | \
+  sed -e "s/\[inaudible[- 0-9]*\]/[inaudible]/g" |\
+  sed -e 's/ - / /g' |\
+  sed -e 's/mm-/mm/g' > $dir/text.orig
+
+echo "$0: Creating datadir $dir for type=\"$mictype\""
+
+if [ $mictype == "worn" ]; then
+  # convert the filenames to wav.scp format, use the basename of the file
+  # as the wav.scp key, add .L and .R for left and right channel
+  # i.e. each file will have two entries (left and right channel)
+  find -L $adir -name  "S[0-9]*_P[0-9]*.wav" | \
+    perl -ne '{
+      chomp;
+      $path = $_;
+      next unless $path;
+      @F = split "/", $path;
+      ($f = $F[@F-1]) =~ s/\.wav$//;
+      @F = split "_", $f;
+      print "${F[1]}_${F[0]}.L sox $path -t wav - remix 1 |\n";
+      print "${F[1]}_${F[0]}.R sox $path -t wav - remix 2 |\n";
+    }' | sort > $dir/wav.scp
+
+  # generate the transcripts for both left and right channel
+  # from the original transcript in the form
+  # P09_S03-0006072-0006147 gimme the baker
+  # create left and right channel transcript
+  # P09_S03.L-0006072-0006147 gimme the baker
+  # P09_S03.R-0006072-0006147 gimme the baker
+  sed -n 's/  *$//; h; s/-/\.L-/p; g; s/-/\.R-/p' $dir/text.orig | sort > $dir/text
+elif [ $mictype == "ref" ]; then
+  # fixed reference array
+
+  # first get a text, which will be used to extract reference arrays
+  perl -ne 's/-/.ENH-/;print;' $dir/text.orig | sort > $dir/text
+
+  find -L $adir | grep "\.wav" | sort > $dir/wav.flist
+  # following command provide the argument for grep to extract only reference arrays
+  grep `cut -f 1 -d"-" $dir/text | awk -F"_" '{print $2 "_" $3}' | sed -e "s/\.ENH//" | sort | uniq | sed -e "s/^/ -e /" | tr "\n" " "` $dir/wav.flist > $dir/wav.flist2
+  paste -d" " \
+	<(awk -F "/" '{print $NF}' $dir/wav.flist2 | sed -e "s/\.wav/.ENH/") \
+	$dir/wav.flist2 | sort > $dir/wav.scp
+else
+  # array mic case
+  # convert the filenames to wav.scp format, use the basename of the file
+  # as a the wav.scp key
+  find -L $adir -name "*.wav" -ipath "*${mictype}*" |\
+    perl -ne '$p=$_;chomp $_;@F=split "/";$F[$#F]=~s/\.wav//;print "$F[$#F] $p";' |\
+    sort -u > $dir/wav.scp
+
+  # convert the transcripts from
+  # P09_S03-0006072-0006147 gimme the baker
+  # to the per-channel transcripts
+  # P09_S03_U01_NOLOCATION.CH1-0006072-0006147 gimme the baker
+  # P09_S03_U01_NOLOCATION.CH2-0006072-0006147 gimme the baker
+  # P09_S03_U01_NOLOCATION.CH3-0006072-0006147 gimme the baker
+  # P09_S03_U01_NOLOCATION.CH4-0006072-0006147 gimme the baker
+  perl -ne '$l=$_;
+    for($i=1; $i<=4; $i++) {
+      ($x=$l)=~ s/-/.CH\Q$i\E-/;
+      print $x;}' $dir/text.orig | sort > $dir/text
+
+fi
+$cleanup && rm -f $dir/text.* $dir/wav.scp.* $dir/wav.flist
+
+# Prepare 'segments', 'utt2spk', 'spk2utt'
+if [ $mictype == "worn" ]; then
+  cut -d" " -f 1 $dir/text | \
+    awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\
+    sed -e "s/_[A-Z]*\././2" \
+    > $dir/segments
+elif [ $mictype == "ref" ]; then
+  cut -d" " -f 1 $dir/text | \
+    awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\
+    sed -e "s/_[A-Z]*\././2" |\
+    sed -e "s/ P.._/ /" > $dir/segments
+else
+  cut -d" " -f 1 $dir/text | \
+    awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\
+    sed -e "s/_[A-Z]*\././2" |\
+    sed -e 's/ P.._/ /' > $dir/segments
+fi
+cut -f 1 -d ' ' $dir/segments | \
+  perl -ne 'chomp;$utt=$_;s/_.*//;print "$utt $_\n";' > $dir/utt2spk
+
+utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
+
+# Check that data dirs are okay!
+utils/validate_data_dir.sh --no-feats $dir || exit 1
diff --git a/egs/chime5/s5b/local/prepare_dict.sh b/egs/chime5/s5b/local/prepare_dict.sh
new file mode 100755
index 00000000000..09083d0e795
--- /dev/null
+++ b/egs/chime5/s5b/local/prepare_dict.sh
@@ -0,0 +1,124 @@
+#!/bin/bash
+# Copyright (c) 2018, Johns Hopkins University (Jan "Yenda" Trmal<jtrmal@gmail.com>)
+# License: Apache 2.0
+
+# Begin configuration section.
+# End configuration section
+. ./utils/parse_options.sh
+
+. ./path.sh
+
+set -e -o pipefail
+set -o nounset                              # Treat unset variables as an error
+
+
+# The parts of the output of this that will be needed are
+# [in data/local/dict/ ]
+# lexicon.txt
+# extra_questions.txt
+# nonsilence_phones.txt
+# optional_silence.txt
+# silence_phones.txt
+
+
+# check existing directories
+[ $# != 0 ] && echo "Usage: $0" && exit 1;
+
+dir=data/local/dict
+
+mkdir -p $dir
+echo "$0: Getting CMU dictionary"
+if [ ! -f $dir/cmudict.done ]; then
+  [ -d $dir/cmudict ] && rm -rf $dir/cmudict
+  svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $dir/cmudict
+  touch $dir/cmudict.done
+fi
+
+# silence phones, one per line.
+for w in sil spn inaudible laughs noise; do
+  echo $w;
+done > $dir/silence_phones.txt
+echo sil > $dir/optional_silence.txt
+
+# For this setup we're discarding stress.
+cat $dir/cmudict/cmudict-0.7b.symbols | \
+  perl -ne 's:[0-9]::g; s:\r::; print lc($_)' | \
+  sort -u > $dir/nonsilence_phones.txt
+
+# An extra question will be added by including the silence phones in one class.
+paste -d ' ' -s $dir/silence_phones.txt > $dir/extra_questions.txt
+
+grep -v ';;;' $dir/cmudict/cmudict-0.7b |\
+  uconv -f latin1 -t utf-8 -x Any-Lower |\
+  perl -ne 's:(\S+)\(\d+\) :$1 :; s:  : :; print;' |\
+  perl -ne '@F = split " ",$_,2; $F[1] =~ s/[0-9]//g; print "$F[0] $F[1]";' \
+  > $dir/lexicon1_raw_nosil.txt || exit 1;
+
+# Add prons for laughter, noise, oov
+for w in `grep -v sil $dir/silence_phones.txt`; do
+  echo "[$w] $w"
+done | cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1;
+
+# we keep all words from the cmudict in the lexicon
+# might reduce OOV rate on dev and eval
+cat $dir/lexicon2_raw.txt  \
+   <( echo "mm m"
+      echo "<unk> spn"
+      echo "cuz k aa z"
+      echo "cuz k ah z"
+      echo "cuz k ao z"
+      echo "mmm m"; \
+      echo "hmm hh m"; \
+    ) | sort -u | sed 's/[\t ]/\t/' > $dir/iv_lexicon.txt
+
+
+cat data/train*/text  | \
+  awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \
+  sort -nr > $dir/word_counts
+
+cat $dir/word_counts | awk '{print $2}' > $dir/word_list
+
+awk '{print $1}' $dir/iv_lexicon.txt | \
+  perl -e '($word_counts)=@ARGV;
+   open(W, "<$word_counts")||die "opening word-counts $word_counts";
+   while(<STDIN>) { chop; $seen{$_}=1; }
+   while(<W>) {
+     ($c,$w) = split;
+     if (!defined $seen{$w}) { print; }
+   } ' $dir/word_counts > $dir/oov_counts.txt
+
+echo "*Highest-count OOVs (including fragments) are:"
+head -n 10 $dir/oov_counts.txt
+echo "*Highest-count OOVs (excluding fragments) are:"
+grep -v -E '^-|-$' $dir/oov_counts.txt | head -n 10 || true
+
+echo "*Training a G2P and generating missing pronunciations"
+mkdir -p $dir/g2p/
+phonetisaurus-align --input=$dir/iv_lexicon.txt --ofile=$dir/g2p/aligned_lexicon.corpus
+ngram-count -order 4 -kn-modify-counts-at-end -ukndiscount\
+  -gt1min 0 -gt2min 0 -gt3min 0 -gt4min 0 \
+  -text $dir/g2p/aligned_lexicon.corpus -lm $dir/g2p/aligned_lexicon.arpa
+phonetisaurus-arpa2wfst --lm=$dir/g2p/aligned_lexicon.arpa --ofile=$dir/g2p/g2p.fst
+awk '{print $2}' $dir/oov_counts.txt > $dir/oov_words.txt
+phonetisaurus-apply --nbest 2 --model $dir/g2p/g2p.fst --thresh 5 --accumulate \
+  --word_list $dir/oov_words.txt > $dir/oov_lexicon.txt
+
+## The next section is again just for debug purposes
+## to show words for which the G2P failed
+cat $dir/oov_lexicon.txt $dir/iv_lexicon.txt | sort -u > $dir/lexicon.txt
+rm -f $dir/lexiconp.txt 2>/dev/null; # can confuse later script if this exists.
+awk '{print $1}' $dir/lexicon.txt | \
+  perl -e '($word_counts)=@ARGV;
+   open(W, "<$word_counts")||die "opening word-counts $word_counts";
+   while(<STDIN>) { chop; $seen{$_}=1; }
+   while(<W>) {
+     ($c,$w) = split;
+     if (!defined $seen{$w}) { print; }
+   } ' $dir/word_counts > $dir/oov_counts.g2p.txt
+
+echo "*Highest-count OOVs (including fragments) after G2P are:"
+head -n 10 $dir/oov_counts.g2p.txt
+
+utils/validate_dict_dir.pl $dir
+exit 0;
+
diff --git a/egs/chime5/s5b/local/reverberate_lat_dir.sh b/egs/chime5/s5b/local/reverberate_lat_dir.sh
new file mode 100755
index 00000000000..f601a37c0e1
--- /dev/null
+++ b/egs/chime5/s5b/local/reverberate_lat_dir.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+
+# Copyright 2018  Vimal Manohar
+# Apache 2.0
+
+num_data_reps=1
+cmd=run.pl
+nj=20
+include_clean=false
+
+. utils/parse_options.sh
+. ./path.sh
+
+if [ $# -ne 4 ]; then
+  echo "Usage: $0 <train-data-dir> <noisy-latdir> <clean-latdir> <output-latdir>"
+  exit 1
+fi
+
+train_data_dir=$1
+noisy_latdir=$2
+clean_latdir=$3
+dir=$4
+
+clean_nj=$(cat $clean_latdir/num_jobs)
+
+$cmd JOB=1:$clean_nj $dir/copy_clean_lattices.JOB.log \
+  lattice-copy "ark:gunzip -c $clean_latdir/lat.JOB.gz |" \
+  ark,scp:$dir/lats_clean.JOB.ark,$dir/lats_clean.JOB.scp || exit 1
+  
+for n in $(seq $clean_nj); do
+  cat $dir/lats_clean.$n.scp 
+done > $dir/lats_clean.scp
+
+for i in $(seq $num_data_reps); do
+  cat $dir/lats_clean.scp | awk -vi=$i '{print "rev"i"_"$0}'
+done > $dir/lats_rvb.scp
+
+noisy_nj=$(cat $noisy_latdir/num_jobs)
+$cmd JOB=1:$noisy_nj $dir/copy_noisy_lattices.JOB.log \
+  lattice-copy "ark:gunzip -c $noisy_latdir/lat.JOB.gz |" \
+  ark,scp:$dir/lats_noisy.JOB.ark,$dir/lats_noisy.JOB.scp || exit 1
+
+optional_clean=
+if $include_clean; then
+  optional_clean=$dir/lats_clean.scp
+fi
+
+for n in $(seq $noisy_nj); do
+  cat $dir/lats_noisy.$n.scp
+done | cat - $dir/lats_rvb.scp ${optional_clean} | sort -k1,1 > $dir/lats.scp
+
+utils/split_data.sh $train_data_dir $nj || exit 1
+$cmd JOB=1:$nj $dir/copy_lattices.JOB.log \
+  lattice-copy "scp:utils/filter_scp.pl $train_data_dir/split$nj/JOB/utt2spk $dir/lats.scp |" \
+  "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1
+
+echo $nj > $dir/num_jobs
+
+if [ -f $clean_latdir/ali.1.gz ]; then
+  $cmd JOB=1:$clean_nj $dir/copy_clean_alignments.JOB.log \
+    copy-int-vector "ark:gunzip -c $clean_latdir/ali.JOB.gz |" \
+    ark,scp:$dir/ali_clean.JOB.ark,$dir/ali_clean.JOB.scp || exit 1
+
+  for n in $(seq $clean_nj); do
+    cat $dir/ali_clean.$n.scp
+  done > $dir/ali_clean.scp
+
+  for i in $(seq $num_data_reps); do
+    cat $dir/ali_clean.scp | awk -vi=$i '{print "rev"i"_"$0}'
+  done > $dir/ali_rvb.scp
+
+  optional_clean=
+  if $include_clean; then
+    optional_clean=$dir/ali_clean.scp
+  fi
+
+  $cmd JOB=1:$noisy_nj $dir/copy_noisy_alignments.JOB.log \
+    copy-int-vector "ark:gunzip -c $noisy_latdir/ali.JOB.gz |" \
+    ark,scp:$dir/ali_noisy.JOB.ark,$dir/ali_noisy.JOB.scp || exit 1
+
+  for n in $(seq $noisy_nj); do
+    cat $dir/ali_noisy.$n.scp
+  done | cat - $dir/ali_rvb.scp $optional_clean | sort -k1,1 > $dir/ali.scp
+
+  utils/split_data.sh $train_data_dir $nj || exit 1
+  $cmd JOB=1:$nj $dir/copy_rvb_alignments.JOB.log \
+    copy-int-vector "scp:utils/filter_scp.pl $train_data_dir/split$nj/JOB/utt2spk $dir/ali.scp |" \
+    "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1
+fi
+
+cp $clean_latdir/{final.*,tree,*.mat,*opts,*.txt} $dir || true
+
+rm $dir/lats_{clean,noisy}.*.{ark,scp} $dir/ali_{clean,noisy}.*.{ark,scp} || true # save space
diff --git a/egs/chime5/s5b/local/run_beamformit.sh b/egs/chime5/s5b/local/run_beamformit.sh
new file mode 100755
index 00000000000..aa3badd90d8
--- /dev/null
+++ b/egs/chime5/s5b/local/run_beamformit.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+
+# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Shinji Watanabe)
+
+. ./cmd.sh
+. ./path.sh
+
+# Config:
+cmd=run.pl
+bmf="1 2 3 4"
+
+. utils/parse_options.sh || exit 1;
+
+if [ $# != 3 ]; then
+   echo "Wrong #arguments ($#, expected 3)"
+   echo "Usage: local/run_beamformit.sh [options] <wav-in-dir> <wav-out-dir> <array-id>"
+   echo "main options (for others, see top of script file)"
+   echo "  --cmd <cmd>                              # Command to run in parallel with"
+   echo "  --bmf \"1 2 3 4\"                        # microphones used for beamforming"
+   exit 1;
+fi
+
+sdir=$1
+odir=$2
+array=$3
+expdir=exp/enhan/`echo $odir | awk -F '/' '{print $NF}'`_`echo $bmf | tr ' ' '_'`
+
+if ! command  -v BeamformIt &>/dev/null ; then
+  echo "Missing BeamformIt, run 'cd $KALDI_ROOT/tools/; ./extras/install_beamformit.sh; cd -;'" && exit 1
+fi
+
+# Set bash to strict mode; the script will exit on:
+# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'
+set -e
+set -u
+set -o pipefail
+
+mkdir -p $odir
+mkdir -p $expdir/log
+
+echo "Will use the following channels: $bmf"
+# number of channels
+numch=`echo $bmf | tr ' ' '\n' | wc -l`
+echo "the number of channels: $numch"
+
+# wavfiles.list can be used as the name of the output files
+output_wavfiles=$expdir/wavfiles.list
+find -L ${sdir} | grep -i ${array} | awk -F "/" '{print $NF}' | sed -e "s/\.CH.\.wav//" | sort | uniq > $expdir/wavfiles.list
+
+# this is an input file list of the microphones
+# format: 1st_wav 2nd_wav ... nth_wav
+input_arrays=$expdir/channels_$numch
+for x in `cat $output_wavfiles`; do
+  echo -n "$x"
+  for ch in $bmf; do
+    echo -n " $x.CH$ch.wav"
+  done
+  echo ""
+done > $input_arrays
+
+# split the list for parallel processing
+# number of jobs are set by the number of WAV files
+nj=`wc -l $expdir/wavfiles.list | awk '{print $1}'`
+split_wavfiles=""
+for n in `seq $nj`; do
+  split_wavfiles="$split_wavfiles $output_wavfiles.$n"
+done
+utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1;
+
+echo -e "Beamforming\n"
+# making a shell script for each job
+for n in `seq $nj`; do
+cat << EOF > $expdir/log/beamform.$n.sh
+while read line; do
+  $BEAMFORMIT/BeamformIt -s \$line -c $input_arrays \
+    --config_file `pwd`/conf/beamformit.cfg \
+    --source_dir $sdir \
+    --result_dir $odir
+done < $output_wavfiles.$n
+EOF
+done
+
+chmod a+x $expdir/log/beamform.*.sh
+$cmd JOB=1:$nj $expdir/log/beamform.JOB.log \
+  $expdir/log/beamform.JOB.sh
+
+echo "`basename $0` Done."
diff --git a/egs/chime5/s5b/local/run_recog.sh b/egs/chime5/s5b/local/run_recog.sh
new file mode 100755
index 00000000000..5c74c9ff242
--- /dev/null
+++ b/egs/chime5/s5b/local/run_recog.sh
@@ -0,0 +1,164 @@
+#!/bin/bash
+#
+# Based mostly on the TED-LIUM and Switchboard recipe
+#
+# Copyright  2017  Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal)
+# Apache 2.0
+#
+# This is a subset of run.sh to only perform recognition experiments with evaluation data
+
+# Begin configuration section.
+decode_nj=20
+stage=0
+enhancement=beamformit # for a new enhancement method,
+                       # change this variable and stage 4
+# End configuration section
+. ./utils/parse_options.sh
+
+. ./cmd.sh
+. ./path.sh
+
+
+set -e # exit on error
+
+# chime5 main directory path
+# please change the path accordingly
+chime5_corpus=/export/corpora4/CHiME5
+json_dir=${chime5_corpus}/transcriptions
+audio_dir=${chime5_corpus}/audio
+
+# training and test data
+train_set=train_worn_u100k
+test_sets="eval_${enhancement}_ref"
+
+# This script also needs the phonetisaurus g2p, srilm, beamformit
+./local/check_tools.sh || exit 1
+
+if [ $stage -le 4 ]; then
+  # Beamforming using reference arrays
+  # enhanced WAV directory
+  enhandir=enhan
+  for dset in eval; do
+    for mictype in u01 u02 u03 u04 u05 u06; do
+      local/run_beamformit.sh --cmd "$train_cmd" \
+			      ${audio_dir}/${dset} \
+			      ${enhandir}/${dset}_${enhancement}_${mictype} \
+			      ${mictype}
+    done
+  done
+  
+  for dset in eval; do
+    local/prepare_data.sh --mictype ref "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \
+			  ${json_dir}/${dset} data/${dset}_${enhancement}_ref
+  done
+fi
+
+if [ $stage -le 6 ]; then
+  # fix speaker ID issue (thanks to Dr. Naoyuki Kanda)
+  # add array ID to the speaker ID to avoid the use of other array information to meet regulations
+  # Before this fix
+  # $ head -n 2 data/eval_beamformit_ref_nosplit/utt2spk
+  # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01
+  # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01
+  # After this fix
+  # $ head -n 2 data/eval_beamformit_ref_nosplit_fix/utt2spk
+  # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02
+  # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01_U02
+  for dset in ${test_sets}; do
+    utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit
+    mkdir -p data/${dset}_nosplit_fix
+    cp data/${dset}_nosplit/{segments,text,wav.scp} data/${dset}_nosplit_fix/
+    awk -F "_" '{print $0 "_" $3}' data/${dset}_nosplit/utt2spk > data/${dset}_nosplit_fix/utt2spk
+    utils/utt2spk_to_spk2utt.pl data/${dset}_nosplit_fix/utt2spk > data/${dset}_nosplit_fix/spk2utt
+  done
+
+  # Split speakers up into 3-minute chunks.  This doesn't hurt adaptation, and
+  # lets us use more jobs for decoding etc.
+  for dset in ${test_sets}; do
+    utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit_fix data/${dset}
+  done
+fi
+
+if [ $stage -le 7 ]; then
+  # Now make MFCC features.
+  # mfccdir should be some place with a largish disk where you
+  # want to store MFCC features.
+  mfccdir=mfcc
+  for x in ${test_sets}; do
+    steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" \
+		       data/$x exp/make_mfcc/$x $mfccdir
+    steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
+    utils/fix_data_dir.sh data/$x
+  done
+fi
+
+if [ $stage -le 17 ]; then
+  nnet3_affix=_${train_set}_cleaned
+  for datadir in ${test_sets}; do
+    utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
+  done
+  for datadir in ${test_sets}; do
+    steps/make_mfcc.sh --nj 20 --mfcc-config conf/mfcc_hires.conf \
+      --cmd "$train_cmd" data/${datadir}_hires || exit 1;
+    steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1;
+    utils/fix_data_dir.sh data/${datadir}_hires || exit 1;
+  done
+  for data in $test_sets; do
+    steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \
+      data/${data}_hires exp/nnet3${nnet3_affix}/extractor \
+      exp/nnet3${nnet3_affix}/ivectors_${data}_hires
+  done
+fi
+
+if [ $stage -le 18 ]; then
+  # Repeat nnet3_affix (otherwise only set in stage 17) so --stage 18 works alone.
+  nnet3_affix=_${train_set}_cleaned
+  lm_suffix=
+
+  # The rest are configs specific to this script.  Most of the parameters
+  # are just hardcoded at this level, in the commands below.
+  affix=1a   # affix for the TDNN directory name
+  tree_affix=
+  tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix}
+  dir=exp/chain${nnet3_affix}/tdnn${affix}_sp
+
+  # training options
+  # training chunk-options
+  chunk_width=140,100,160
+  # we don't need extra left/right context for TDNN systems.
+  chunk_left_context=0
+  chunk_right_context=0
+
+  utils/mkgraph.sh \
+      --self-loop-scale 1.0 data/lang${lm_suffix}/ \
+      $tree_dir $tree_dir/graph${lm_suffix} || exit 1;
+
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  rm $dir/.error 2>/dev/null || true
+
+  for data in $test_sets; do
+    (
+      steps/nnet3/decode.sh \
+          --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context $chunk_left_context \
+          --extra-right-context $chunk_right_context \
+          --extra-left-context-initial 0 \
+          --extra-right-context-final 0 \
+          --frames-per-chunk $frames_per_chunk \
+          --nj 8 --cmd "$decode_cmd"  --num-threads 4 \
+          --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \
+          $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+if [ $stage -le 20 ]; then
+  # final scoring to get the official challenge result
+  # please specify both dev and eval set directories so that the search parameters
+  # (insertion penalty and language model weight) will be tuned using the dev set
+  local/score_for_submit.sh \
+      --dev exp/chain_${train_set}_cleaned/tdnn1a_sp/decode_dev_${enhancement}_ref \
+      --eval exp/chain_${train_set}_cleaned/tdnn1a_sp/decode_eval_${enhancement}_ref
+fi
diff --git a/egs/chime5/s5b/local/run_wpe.py b/egs/chime5/s5b/local/run_wpe.py
new file mode 100755
index 00000000000..2f3818f9c42
--- /dev/null
+++ b/egs/chime5/s5b/local/run_wpe.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian)
+# Apache 2.0
+# Works with both python2 and python3
+# This script assumes that WPE (nara_wpe) is installed locally using miniconda.
+# ../../../tools/extras/install_miniconda.sh and ../../../tools/extras/install_wpe.sh
+# need to be run, and this script needs to be launched with that version of
+# python.
+# See local/run_wpe.sh for example.
+
+import numpy as np
+import soundfile as sf
+import time
+import os, errno
+from tqdm import tqdm
+import argparse
+
+from nara_wpe.wpe import wpe
+from nara_wpe.utils import stft, istft
+from nara_wpe import project_root
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--files', '-f', nargs='+')
+args = parser.parse_args()
+
+input_files = args.files[:len(args.files)//2]
+output_files = args.files[len(args.files)//2:]
+out_dir = os.path.dirname(output_files[0])
+try:
+    os.makedirs(out_dir)
+except OSError as e:
+    if e.errno != errno.EEXIST:
+        raise
+
+stft_options = dict(
+    size=512,
+    shift=128,
+    window_length=None,
+    fading=True,
+    pad=True,
+    symmetric_window=False
+)
+
+sampling_rate = 16000
+delay = 3
+iterations = 5
+taps = 10
+
+signal_list = [
+    sf.read(f)[0]
+    for f in input_files
+]
+y = np.stack(signal_list, axis=0)
+Y = stft(y, **stft_options).transpose(2, 0, 1)
+Z = wpe(Y, taps=taps, delay=delay, iterations=iterations, statistics_mode='full').transpose(1, 2, 0)
+z = istft(Z, size=stft_options['size'], shift=stft_options['shift'])
+
+for d in range(len(signal_list)):
+    sf.write(output_files[d], z[d,:], sampling_rate)
diff --git a/egs/chime5/s5b/local/run_wpe.sh b/egs/chime5/s5b/local/run_wpe.sh
new file mode 100755
index 00000000000..1c4b1c80291
--- /dev/null
+++ b/egs/chime5/s5b/local/run_wpe.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian)
+# Apache 2.0
+
+. ./cmd.sh
+. ./path.sh
+
+# Config:
+nj=4
+cmd=run.pl
+
+. utils/parse_options.sh || exit 1;
+
+if [ $# != 3 ]; then
+   echo "Wrong #arguments ($#, expected 3)"
+   echo "Usage: local/run_wpe.sh [options] <wav-in-dir> <wav-out-dir> <array-id>"
+   echo "main options (for others, see top of script file)"
+   echo "  --cmd <cmd>                              # Command to run in parallel with"
+   echo "  --nj 50                        # number of jobs for parallel processing"
+   exit 1;
+fi
+
+sdir=$1
+odir=$2
+array=$3
+task=`basename $sdir`
+expdir=exp/wpe/${task}_${array}
+# Set bash to strict mode; the script will exit on:
+# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'
+set -e
+set -u
+set -o pipefail
+
+miniconda_dir=$HOME/miniconda3/
+if [ ! -d $miniconda_dir ]; then
+    echo "$miniconda_dir does not exist. Please run '../../../tools/extras/install_miniconda.sh' and '../../../tools/extras/install_wpe.sh';" && exit 1
+fi
+
+# check if WPE is installed
+result=`$miniconda_dir/bin/python -c "\
+try:
+    import nara_wpe
+    print('1')
+except ImportError:
+    print('0')"`
+
+if [ "$result" == "1" ]; then
+    echo "WPE is installed"
+else
+    echo "WPE is not installed. Please run ../../../tools/extras/install_wpe.sh"
+    exit 1
+fi
+
+mkdir -p $odir
+mkdir -p $expdir/log
+
+# wavfiles.list can be used as the name of the output files
+output_wavfiles=$expdir/wavfiles.list
+find -L ${sdir} | grep -i ${array} > $expdir/channels_input
+cat $expdir/channels_input | awk -F '/' '{print $NF}' | sed "s@S@$odir\/S@g" > $expdir/channels_output
+paste -d" " $expdir/channels_input $expdir/channels_output > $output_wavfiles
+
+# split the list for parallel processing
+split_wavfiles=""
+for n in `seq $nj`; do
+  split_wavfiles="$split_wavfiles $output_wavfiles.$n"
+done
+utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1;
+
+echo -e "Dereverberation - $task - $array\n"
+# making a shell script for each job
+for n in `seq $nj`; do
+cat <<-EOF > $expdir/log/wpe.$n.sh
+while read line; do
+  $miniconda_dir/bin/python local/run_wpe.py \
+    --files \$line
+done < $output_wavfiles.$n
+EOF
+done
+
+chmod a+x $expdir/log/wpe.*.sh
+$cmd JOB=1:$nj $expdir/log/wpe.JOB.log \
+  $expdir/log/wpe.JOB.sh
+
+echo "`basename $0` Done."
diff --git a/egs/chime5/s5b/local/score.sh b/egs/chime5/s5b/local/score.sh
new file mode 120000
index 00000000000..6a200b42ed3
--- /dev/null
+++ b/egs/chime5/s5b/local/score.sh
@@ -0,0 +1 @@
+../steps/scoring/score_kaldi_wer.sh
\ No newline at end of file
diff --git a/egs/chime5/s5b/local/score_for_submit.sh b/egs/chime5/s5b/local/score_for_submit.sh
new file mode 100755
index 00000000000..23121d68b93
--- /dev/null
+++ b/egs/chime5/s5b/local/score_for_submit.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey, Yenda Trmal)
+# Apache 2.0
+#
+# This script reports the official CHiME-5 challenge submission scores per room and session.
+# It reads the best search parameters (LM weight and word insertion penalty) found on the
+# dev set and names the dev and eval transcriptions that are to be submitted.
+# The default setup does not calculate scores of the evaluation set since
+# the evaluation transcription is not distributed (July 9 2018)
+
+cmd=run.pl
+dev=exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_beamformit_ref
+eval=exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_eval_beamformit_ref
+do_eval=false
+
+echo "$0 $@"  # Print the command line for logging
+[ -f ./path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;
+
+if [ $# -ne 0 ]; then
+    echo "Usage: $0 [--cmd (run.pl|queue.pl...)]"
+    echo "This script provides official CHiME-5 challenge submission scores"
+    echo " Options:"
+    echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
+    echo "    --dev <dev-decode-dir>          # dev set decoding directory"
+    echo "    --eval <eval-decode-dir>        # eval set decoding directory"
+    exit 1;
+fi
+
+# get language model weight and word insertion penalty from the dev set
+best_lmwt=`cat $dev/scoring_kaldi/wer_details/lmwt`
+best_wip=`cat $dev/scoring_kaldi/wer_details/wip`
+
+echo "best LM weight: $best_lmwt"
+echo "insertion penalty weight: $best_wip"
+
+echo "==== development set ===="
+# development set
+# get the scoring result per utterance
+score_result=$dev/scoring_kaldi/wer_details/per_utt
+for session in S02 S09; do
+    for room in DINING KITCHEN LIVING; do
+	# get nerror
+	nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'`
+	# get nwords in the references (correct + substitutions + deletions of the #csid line)
+	nwrd=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$3+$4+$6} END {print sum}'`
+	# compute wer with scale=2
+	wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc`
+	
+	# report the results
+	echo -n "session $session "
+	echo -n "room $room: "
+	echo -n "#words $nwrd, "
+	echo -n "#errors $nerr, "
+	echo "wer $wer %"
+    done
+done
+echo -n "overall: "
+# get nerror
+nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'`
+# get nwords in the references (correct + substitutions + deletions of the #csid line)
+nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'`
+# compute wer with scale=2
+wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc`
+echo -n "#words $nwrd, "
+echo -n "#errors $nerr, "
+echo "wer $wer %"
+
+echo "==== evaluation set ===="
+# evaluation set
+# get the scoring result per utterance. Copied from local/score.sh
+mkdir -p $eval/scoring_kaldi/wer_details_devbest
+$cmd $eval/scoring_kaldi/log/stats1.log \
+     cat $eval/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \
+     align-text --special-symbol="'***'" ark:$eval/scoring_kaldi/test_filt.txt ark:- ark,t:- \|  \
+     utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \> $eval/scoring_kaldi/wer_details_devbest/per_utt
+score_result=$eval/scoring_kaldi/wer_details_devbest/per_utt
+for session in S01 S21; do
+    for room in DINING KITCHEN LIVING; do
+	if $do_eval; then
+	    # get nerror
+	    nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'`
+	    # get nwords in the references (correct + substitutions + deletions of the #csid line)
+	    nwrd=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$3+$4+$6} END {print sum}'`
+	    # compute wer with scale=2
+	    wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc`
+	
+	    # report the results
+	    echo -n "session $session "
+	    echo -n "room $room: "
+	    echo -n "#words $nwrd, "
+	    echo -n "#errors $nerr, "
+	    echo "wer $wer %"
+	fi
+    done
+done
+if $do_eval; then
+    # get nerror
+    nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'`
+    # get nwords in the references (correct + substitutions + deletions of the #csid line)
+    nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'`
+    # compute wer with scale=2
+    wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc`
+    echo -n "overall: "
+    echo -n "#words $nwrd, "
+    echo -n "#errors $nerr, "
+    echo "wer $wer %"
+else
+    echo "skip evaluation scoring"
+    echo ""
+    echo "==== when you submit your result to the CHiME-5 challenge ===="
+    echo "Please rename your recognition results of "
+    echo "$dev/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt"
+    echo "$eval/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt"
+    echo "with {dev,eval}_<last name>_<affiliation>.txt, e.g., dev_watanabe_jhu.txt and eval_watanabe_jhu.txt, "
+    echo "and submit both of them as your final challenge result"
+    echo "=================================================================="    
+fi
+
diff --git a/egs/chime5/s5b/local/train_lms_srilm.sh b/egs/chime5/s5b/local/train_lms_srilm.sh
new file mode 100755
index 00000000000..5a1d56d24b3
--- /dev/null
+++ b/egs/chime5/s5b/local/train_lms_srilm.sh
@@ -0,0 +1,261 @@
+#!/bin/bash
+# Copyright (c) 2017  Johns Hopkins University (Author: Yenda Trmal, Shinji Watanabe)
+# Apache 2.0
+
+export LC_ALL=C
+
+# Begin configuration section.
+words_file=
+train_text=
+dev_text=
+oov_symbol="<UNK>"
+# End configuration section
+
+echo "$0 $@"
+
+[ -f path.sh ]  && . ./path.sh
+. ./utils/parse_options.sh || exit 1
+
+echo "-------------------------------------"
+echo "Building an SRILM language model     "
+echo "-------------------------------------"
+
+if [ $# -ne 2 ] ; then
+  echo "Incorrect number of parameters. "
+  echo "Script has to be called like this:"
+  echo "  $0 [switches] <datadir> <tgtdir>"
+  echo "For example: "
+  echo "  $0 data data/srilm"
+  echo "The allowed switches are: "
+  echo "    --words-file <words_file>    word list file -- <datadir>/lang/words.txt by default"
+  echo "    --train-text <train_text>    <datadir>/train/text is used in case when not specified"
+  echo "    --dev-text <dev_text>        last 10 % of --train-text if that is given, else <datadir>/dev2h/text"
+  echo "    --oov-symbol <unk_symbol>    symbol to use for oov modeling -- <UNK> by default"
+  exit 1
+fi
+
+datadir=$1
+tgtdir=$2
+
+##End of configuration
+loc=`which ngram-count`;
+if [ -z "$loc" ]; then
+  echo >&2 "You appear to not have SRILM tools installed, or they are not on your path."
+  echo >&2 "Use the script \$KALDI_ROOT/tools/install_srilm.sh to install it."
+  exit 1
+fi
+
+# Prepare the destination directory
+mkdir -p $tgtdir
+
+for f in $words_file $train_text $dev_text; do
+  [ ! -s $f ] && echo "No such file $f" && exit 1;
+done
+
+[ -z "$words_file" ] && words_file=$datadir/lang/words.txt
+if [ -n "$train_text" ] && [ -z "$dev_text" ] ; then
+  # Only the training text was given: hold out its last 10 % as the dev text.
+  nr=`cat  $train_text | wc -l`
+  nr_dev=$(($nr / 10 ))
+  nr_train=$(( $nr - $nr_dev ))
+  orig_train_text=$train_text
+  head -n $nr_train $train_text > $tgtdir/train_text
+  tail -n $nr_dev $train_text > $tgtdir/dev_text
+
+  train_text=$tgtdir/train_text
+  dev_text=$tgtdir/dev_text
+  echo "Using words file: $words_file"
+  echo "Using train text: 9/10 of $orig_train_text"
+  echo "Using dev text  : 1/10 of $orig_train_text"
+elif [ -n "$train_text" ] && [ -n "$dev_text" ] ; then
+  # Both texts were given explicitly: use them as they are.
+  echo "Using words file: $words_file"
+  echo "Using train text: $train_text"
+  echo "Using dev text  : $dev_text"
+else
+  # No --train-text: use the default, but keep an explicitly given --dev-text
+  train_text=$datadir/train/text
+  dev_text=${dev_text:-$datadir/dev2h/text}
+  echo "Using words file: $words_file"
+  echo "Using train text: $train_text"
+  echo "Using dev text  : $dev_text"
+fi
+
+[ ! -f $words_file ] && echo >&2 "File $words_file must exist!" && exit 1
+[ ! -f $train_text ] && echo >&2 "File $train_text must exist!" && exit 1
+[ ! -f $dev_text ] && echo >&2 "File $dev_text must exist!" && exit 1
+
+
+# Extract the word list from the training dictionary; exclude special symbols
+sort $words_file | awk '{print $1}' | grep -v '\#0' | grep -v '<eps>' | grep -v -F "$oov_symbol" > $tgtdir/vocab
+if (($?)); then
+  echo "Failed to create vocab from $words_file"
+  exit 1
+else
+  # wc vocab # doesn't work due to some encoding issues
+  echo vocab contains `cat $tgtdir/vocab | perl -ne 'BEGIN{$l=$w=0;}{split; $w+=$#_; $w++; $l++;}END{print "$l lines, $w words\n";}'`
+fi
+
+# Kaldi transcript files contain Utterance_ID as the first word; remove it
+# We also have to avoid skewing the LM by incorporating  the same sentences
+# from different channels
+sed -e "s/\.CH.//" -e "s/_.\-./_/" -e "s/NOLOCATION\(\.[LR]\)*-//" -e "s/U[0-9][0-9]_//" $train_text | sort -u | \
+  perl -ane 'print join(" ", @F[1..$#F]) . "\n" if @F > 1' > $tgtdir/train.txt
+if (($?)); then
+    echo "Failed to create $tgtdir/train.txt from $train_text"
+    exit 1
+else
+    echo "Removed first word (uid) from every line of $train_text"
+    # wc text.train train.txt # doesn't work due to some encoding issues
+    echo $train_text contains `cat $train_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'`
+    echo train.txt contains `cat $tgtdir/train.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'`
+fi
+
+# Kaldi transcript files contain Utterance_ID as the first word; remove it
+sed -e "s/\.CH.//" -e "s/_.\-./_/" $dev_text | sort -u | \
+  perl -ane 'print join(" ", @F[1..$#F]) . "\n" if @F > 1' > $tgtdir/dev.txt
+if (($?)); then
+    echo "Failed to create $tgtdir/dev.txt from $dev_text"
+    exit 1
+else
+    echo "Removed first word (uid) from every line of $dev_text"
+    # wc text.train train.txt # doesn't work due to some encoding issues
+    echo $dev_text contains `cat $dev_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'`
+    echo $tgtdir/dev.txt contains `cat $tgtdir/dev.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F;  $s++;}END{print "$w words, $s sentences\n";}'`
+fi
+
+
+echo "-------------------"
+echo "Good-Turing 3grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/3gram.gt011.gz -gt1min 0 -gt2min 1 -gt3min 1 -order 3 \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.gt012.gz -gt1min 0 -gt2min 1 -gt3min 2 -order 3 \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.gt022.gz -gt1min 0 -gt2min 2 -gt3min 2 -order 3 \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.gt023.gz -gt1min 0 -gt2min 2 -gt3min 3 -order 3 \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+echo "-------------------"
+echo "Kneser-Ney 3grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/3gram.kn011.gz -kndiscount1 -gt1min 0 \
+  -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -interpolate \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.kn012.gz -kndiscount1 -gt1min 0 \
+  -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -interpolate \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.kn022.gz -kndiscount1 -gt1min 0 \
+  -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -interpolate \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.kn023.gz -kndiscount1 -gt1min 0 \
+  -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -interpolate \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.kn111.gz -kndiscount1 -gt1min 1 \
+  -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -interpolate \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.kn112.gz -kndiscount1 -gt1min 1 \
+  -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -interpolate \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.kn122.gz -kndiscount1 -gt1min 1 \
+  -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -interpolate \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.kn123.gz -kndiscount1 -gt1min 1 \
+  -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -interpolate \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+
+echo "-------------------"
+echo "Good-Turing 4grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/4gram.gt0111.gz \
+  -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -order 4 \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.gt0112.gz \
+  -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 2 -order 4 \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.gt0122.gz \
+  -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 2 -order 4 \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.gt0123.gz \
+  -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 3 -order 4 \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.gt0113.gz \
+  -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 3 -order 4 \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.gt0222.gz \
+  -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 2 -order 4 \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.gt0223.gz \
+  -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 3 -order 4 \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+echo "-------------------"
+echo "Kneser-Ney 4grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/4gram.kn0111.gz \
+  -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 1 -order 4 \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0112.gz \
+  -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 2 -order 4 \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0113.gz \
+  -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 3 -order 4 \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0122.gz \
+  -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0123.gz \
+  -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0222.gz \
+  -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0223.gz \
+  -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 \
+  -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+if [ ! -z ${LIBLBFGS} ]; then
+  #please note that if the switch -map-unk "$oov_symbol" is used with -maxent-convert-to-arpa, ngram-count will segfault
+  #instead of that, we simply output the model in the maxent format and convert it using the "ngram"
+  echo "-------------------"
+  echo "Maxent 3grams"
+  echo "-------------------"
+  sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \
+    ngram-count -lm - -order 3 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\
+    ngram -lm - -order 3 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\
+    sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/3gram.me.gz || exit 1
+
+  echo "-------------------"
+  echo "Maxent 4grams"
+  echo "-------------------"
+  sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \
+    ngram-count -lm - -order 4 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\
+    ngram -lm - -order 4 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\
+    sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/4gram.me.gz || exit 1
+else
+  echo >&2  "SRILM is not compiled with the support of MaxEnt models."
+  echo >&2  "You should use the script in \$KALDI_ROOT/tools/install_srilm.sh"
+  echo >&2  "which will take care of compiling the SRILM with MaxEnt support"
+  exit 1;
+fi
+
+
+echo "--------------------"
+echo "Computing perplexity"
+echo "--------------------"
+(
+  for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -prune-lowprobs -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done
+  for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -prune-lowprobs -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done
+)  | sort  -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt
+
+echo "The perplexity scores report is stored in $tgtdir/perplexities.txt "
+echo ""
+
+for best_ngram in {3,4}gram ; do
+  outlm=best_${best_ngram}.gz
+  lmfilename=$(grep "${best_ngram}" $tgtdir/perplexities.txt | head -n 1 | cut -f 1 -d ' ')
+  echo "$outlm -> $lmfilename"
+  (cd $tgtdir; rm -f $outlm; ln -sf $(basename $lmfilename) $outlm )
+done
diff --git a/egs/chime5/s5b/local/wer_output_filter b/egs/chime5/s5b/local/wer_output_filter
new file mode 100755
index 00000000000..6f4b6400716
--- /dev/null
+++ b/egs/chime5/s5b/local/wer_output_filter
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright (c) 2017  Johns Hopkins University (Author: Yenda Trmal <jtrmal@gmail.com>)
+# Apache 2.0
+
+
+## Filter for scoring of the STT results. Convert everything to lowercase, drop
+## bracketed tokens (e.g. [noise]) and add some ad-hoc fixes for the hesitations
+
+perl -e '
+   while(<STDIN>) {
+     @A  = split(" ", $_);
+     $id = shift @A; print "$id ";
+     foreach $a (@A) {
+       print lc($a) . " " unless $a =~ /\[.*\]/;
+     }
+     print "\n";
+    }' | \
+sed -e '
+    s/\<mhm\>/hmm/g;
+    s/\<mm\>/hmm/g;
+    s/\<mmm\>/hmm/g;
+'
+
+#| uconv -f  utf-8  -t utf-8 -x Latin-ASCII
+
diff --git a/egs/chime5/s5b/local/worn_audio_list b/egs/chime5/s5b/local/worn_audio_list
new file mode 100644
index 00000000000..fc7a44ad77d
--- /dev/null
+++ b/egs/chime5/s5b/local/worn_audio_list
@@ -0,0 +1,64 @@
+/export/corpora4/CHiME5/audio/train/S03_P09.wav
+/export/corpora4/CHiME5/audio/train/S03_P10.wav
+/export/corpora4/CHiME5/audio/train/S03_P11.wav
+/export/corpora4/CHiME5/audio/train/S03_P12.wav
+/export/corpora4/CHiME5/audio/train/S04_P09.wav
+/export/corpora4/CHiME5/audio/train/S04_P10.wav
+/export/corpora4/CHiME5/audio/train/S04_P11.wav
+/export/corpora4/CHiME5/audio/train/S04_P12.wav
+/export/corpora4/CHiME5/audio/train/S05_P13.wav
+/export/corpora4/CHiME5/audio/train/S05_P14.wav
+/export/corpora4/CHiME5/audio/train/S05_P15.wav
+/export/corpora4/CHiME5/audio/train/S05_P16.wav
+/export/corpora4/CHiME5/audio/train/S06_P13.wav
+/export/corpora4/CHiME5/audio/train/S06_P14.wav
+/export/corpora4/CHiME5/audio/train/S06_P15.wav
+/export/corpora4/CHiME5/audio/train/S06_P16.wav
+/export/corpora4/CHiME5/audio/train/S07_P17.wav
+/export/corpora4/CHiME5/audio/train/S07_P18.wav
+/export/corpora4/CHiME5/audio/train/S07_P19.wav
+/export/corpora4/CHiME5/audio/train/S07_P20.wav
+/export/corpora4/CHiME5/audio/train/S08_P21.wav
+/export/corpora4/CHiME5/audio/train/S08_P22.wav
+/export/corpora4/CHiME5/audio/train/S08_P23.wav
+/export/corpora4/CHiME5/audio/train/S08_P24.wav
+/export/corpora4/CHiME5/audio/train/S12_P33.wav
+/export/corpora4/CHiME5/audio/train/S12_P34.wav
+/export/corpora4/CHiME5/audio/train/S12_P35.wav
+/export/corpora4/CHiME5/audio/train/S12_P36.wav
+/export/corpora4/CHiME5/audio/train/S13_P33.wav
+/export/corpora4/CHiME5/audio/train/S13_P34.wav
+/export/corpora4/CHiME5/audio/train/S13_P35.wav
+/export/corpora4/CHiME5/audio/train/S13_P36.wav
+/export/corpora4/CHiME5/audio/train/S16_P21.wav
+/export/corpora4/CHiME5/audio/train/S16_P22.wav
+/export/corpora4/CHiME5/audio/train/S16_P23.wav
+/export/corpora4/CHiME5/audio/train/S16_P24.wav
+/export/corpora4/CHiME5/audio/train/S17_P17.wav
+/export/corpora4/CHiME5/audio/train/S17_P18.wav
+/export/corpora4/CHiME5/audio/train/S17_P19.wav
+/export/corpora4/CHiME5/audio/train/S17_P20.wav
+/export/corpora4/CHiME5/audio/train/S18_P41.wav
+/export/corpora4/CHiME5/audio/train/S18_P42.wav
+/export/corpora4/CHiME5/audio/train/S18_P43.wav
+/export/corpora4/CHiME5/audio/train/S18_P44.wav
+/export/corpora4/CHiME5/audio/train/S19_P49.wav
+/export/corpora4/CHiME5/audio/train/S19_P50.wav
+/export/corpora4/CHiME5/audio/train/S19_P51.wav
+/export/corpora4/CHiME5/audio/train/S19_P52.wav
+/export/corpora4/CHiME5/audio/train/S20_P49.wav
+/export/corpora4/CHiME5/audio/train/S20_P50.wav
+/export/corpora4/CHiME5/audio/train/S20_P51.wav
+/export/corpora4/CHiME5/audio/train/S20_P52.wav
+/export/corpora4/CHiME5/audio/train/S22_P41.wav
+/export/corpora4/CHiME5/audio/train/S22_P42.wav
+/export/corpora4/CHiME5/audio/train/S22_P43.wav
+/export/corpora4/CHiME5/audio/train/S22_P44.wav
+/export/corpora4/CHiME5/audio/train/S23_P53.wav
+/export/corpora4/CHiME5/audio/train/S23_P54.wav
+/export/corpora4/CHiME5/audio/train/S23_P55.wav
+/export/corpora4/CHiME5/audio/train/S23_P56.wav
+/export/corpora4/CHiME5/audio/train/S24_P53.wav
+/export/corpora4/CHiME5/audio/train/S24_P54.wav
+/export/corpora4/CHiME5/audio/train/S24_P55.wav
+/export/corpora4/CHiME5/audio/train/S24_P56.wav
diff --git a/egs/chime5/s5b/path.sh b/egs/chime5/s5b/path.sh
new file mode 100644
index 00000000000..fb1c0489386
--- /dev/null
+++ b/egs/chime5/s5b/path.sh
@@ -0,0 +1,7 @@
+export KALDI_ROOT=`pwd`/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
+
diff --git a/egs/chime5/s5b/run.sh b/egs/chime5/s5b/run.sh
new file mode 100755
index 00000000000..37bc5c2c94e
--- /dev/null
+++ b/egs/chime5/s5b/run.sh
@@ -0,0 +1,297 @@
+#!/bin/bash
+#
+# Based mostly on the TED-LIUM and Switchboard recipe
+#
+# Copyright  2017  Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal)
+# Apache 2.0
+#
+
+# Begin configuration section.
+nj=96
+decode_nj=20
+stage=0
+nnet_stage=-10
+num_data_reps=4
+snrs="20:10:15:5:0"
+foreground_snrs="20:10:15:5:0"
+background_snrs="20:10:15:5:0"
+enhancement=beamformit # for a new enhancement method,
+                       # change this variable and stage 4
+# End configuration section
+. ./utils/parse_options.sh
+
+. ./cmd.sh
+. ./path.sh
+
+
+set -e # exit on error
+
+# chime5 main directory path
+# please change the path accordingly
+chime5_corpus=/export/corpora4/CHiME5
+json_dir=${chime5_corpus}/transcriptions
+audio_dir=${chime5_corpus}/audio
+
+# training and test data
+train_set=train_worn_simu_u400k
+test_sets="dev_${enhancement}_dereverb_ref" #"dev_worn dev_addition_dereverb_ref"
+#test_sets="dev_${enhancement}_ref" #"dev_worn dev_addition_dereverb_ref"
+
+# This script also needs the phonetisaurus g2p, srilm, beamformit
+./local/check_tools.sh || exit 1
+
+if [ $stage -le 1 ]; then
+  # skip u03 as they are missing
+  for mictype in worn u01 u02 u04 u05 u06; do
+    local/prepare_data.sh --mictype ${mictype} \
+			  ${audio_dir}/train ${json_dir}/train data/train_${mictype}
+  done
+  for dataset in dev; do
+    for mictype in worn; do
+      local/prepare_data.sh --mictype ${mictype} \
+			    ${audio_dir}/${dataset} ${json_dir}/${dataset} \
+			    data/${dataset}_${mictype}
+    done
+  done
+fi
+
+if [ $stage -le 2 ]; then
+  local/prepare_dict.sh
+
+  utils/prepare_lang.sh \
+    data/local/dict "<unk>" data/local/lang data/lang
+
+  local/train_lms_srilm.sh \
+    --train-text data/train_worn/text --dev-text data/dev_worn/text \
+    --oov-symbol "<unk>" --words-file data/lang/words.txt \
+    data/ data/srilm
+fi
+
+LM=data/srilm/best_3gram.gz
+if [ $stage -le 3 ]; then
+  # Compiles G for chime5 trigram LM
+  utils/format_lm.sh \
+		data/lang $LM data/local/dict/lexicon.txt data/lang
+
+fi
+
+if [ $stage -le 4 ]; then
+  # WPE dereverberation followed by beamforming using reference arrays
+  # enhanced WAV directory
+  enhandir=enhan
+  dereverb_dir=${PWD}/wav/wpe/
+  for dset in dev eval; do
+    for mictype in u01 u02 u03 u04 u06; do
+      local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 120G" \
+			      ${audio_dir}/${dset} \
+			      ${dereverb_dir}/${dset} \
+			      ${mictype}
+    done
+  done
+
+  for dset in dev eval; do
+    for mictype in u01 u02 u03 u04 u06; do
+      local/run_beamformit.sh --cmd "$train_cmd" \
+			      ${dereverb_dir}/${dset} \
+			      ${enhandir}/${dset}_${enhancement}_${mictype} \
+			      ${mictype}
+    done
+  done
+
+  for dset in dev eval; do
+    local/prepare_data.sh --mictype ref "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \
+			  ${json_dir}/${dset} data/${dset}_${enhancement}_dereverb_ref
+  done
+fi
+
+if [ $stage -le 5 ]; then
+  # remove possibly bad sessions (P11_S03, P52_S19, P53_S24, P54_S24)
+  # see http://spandh.dcs.shef.ac.uk/chime_challenge/data.html for more details
+  utils/copy_data_dir.sh data/train_worn data/train_worn_org # back up
+  grep -v -e "^P11_S03" -e "^P52_S19" -e "^P53_S24" -e "^P54_S24" data/train_worn_org/text > data/train_worn/text
+  utils/fix_data_dir.sh data/train_worn
+fi
+
+if [ $stage -le 6 ]; then
+  local/extract_noises.py $chime5_corpus/audio/train $chime5_corpus/transcriptions/train \
+    local/distant_audio_list distant_noises
+  local/make_noise_list.py distant_noises > distant_noise_list
+
+  noise_list=distant_noise_list
+  
+  if [ ! -d RIRS_NOISES/ ]; then
+    # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
+    wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
+    unzip rirs_noises.zip
+  fi
+
+  # This is the config for the system using simulated RIRs and point-source noises
+  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
+  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
+  rvb_opts+=(--noise-set-parameters $noise_list)
+
+  steps/data/reverberate_data_dir.py \
+    "${rvb_opts[@]}" \
+    --prefix "rev" \
+    --foreground-snrs $foreground_snrs \
+    --background-snrs $background_snrs \
+    --speech-rvb-probability 1 \
+    --pointsource-noise-addition-probability 1 \
+    --isotropic-noise-addition-probability 1 \
+    --num-replications $num_data_reps \
+    --max-noises-per-minute 1 \
+    --source-sampling-rate 16000 \
+    data/train_worn data/train_worn_rvb
+fi
+
+if [ $stage -le 7 ]; then
+  # combine mix array and worn mics
+  # extract a 400k-utterance subset of the combined array-mic data
+  # if you want to include more training data, you can increase the number of array mic utterances
+  utils/combine_data.sh data/train_uall data/train_u01 data/train_u02 data/train_u04 data/train_u05 data/train_u06
+  utils/subset_data_dir.sh data/train_uall 400000 data/train_u400k
+  utils/combine_data.sh data/${train_set} data/train_worn data/train_worn_rvb data/train_u400k
+
+  # only use left channel for worn mic recognition
+  # you can use both left and right channels for training
+  for dset in train dev; do
+    utils/copy_data_dir.sh data/${dset}_worn data/${dset}_worn_stereo
+    grep "\.L-" data/${dset}_worn_stereo/text > data/${dset}_worn/text
+    utils/fix_data_dir.sh data/${dset}_worn
+  done
+fi
+
+if [ $stage -le 8 ]; then
+  # fix speaker ID issue (thanks to Dr. Naoyuki Kanda)
+  # add array ID to the speaker ID to avoid the use of other array information to meet regulations
+  # Before this fix
+  # $ head -n 2 data/eval_beamformit_ref_nosplit/utt2spk
+  # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01
+  # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01
+  # After this fix
+  # $ head -n 2 data/eval_beamformit_ref_nosplit_fix/utt2spk
+  # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02
+  # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01_U02
+  for dset in dev_${enhancement}_dereverb_ref eval_${enhancement}_dereverb_ref; do
+    utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit
+    mkdir -p data/${dset}_nosplit_fix
+    cp data/${dset}_nosplit/{segments,text,wav.scp} data/${dset}_nosplit_fix/
+    awk -F "_" '{print $0 "_" $3}' data/${dset}_nosplit/utt2spk > data/${dset}_nosplit_fix/utt2spk
+    utils/utt2spk_to_spk2utt.pl data/${dset}_nosplit_fix/utt2spk > data/${dset}_nosplit_fix/spk2utt
+  done
+
+  # Split speakers up into 3-minute chunks.  This doesn't hurt adaptation, and
+  # lets us use more jobs for decoding etc.
+  for dset in ${train_set} dev_worn; do
+    utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit
+    utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit data/${dset}
+  done
+  for dset in dev_${enhancement}_dereverb_ref eval_${enhancement}_dereverb_ref; do
+    utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit_fix data/${dset}
+  done
+fi
+
+if [ $stage -le 8 ]; then
+  # Now make MFCC features.
+  # mfccdir should be some place with a largish disk where you
+  # want to store MFCC features.
+  mfccdir=mfcc
+  for x in ${train_set} ${test_sets}; do
+    steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" \
+		       data/$x exp/make_mfcc/$x $mfccdir
+    steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
+    utils/fix_data_dir.sh data/$x
+  done
+fi
+
+if [ $stage -le 9 ]; then
+  # make a subset for monophone training
+  utils/subset_data_dir.sh --shortest data/${train_set} 100000 data/${train_set}_100kshort
+  utils/subset_data_dir.sh data/${train_set}_100kshort 30000 data/${train_set}_30kshort
+fi
+
+if [ $stage -le 10 ]; then
+  # Starting basic training on MFCC features
+  steps/train_mono.sh --nj $nj --cmd "$train_cmd" \
+		      data/${train_set}_30kshort data/lang exp/mono
+fi
+
+if [ $stage -le 11 ]; then
+  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
+		    data/${train_set} data/lang exp/mono exp/mono_ali
+
+  steps/train_deltas.sh --cmd "$train_cmd" \
+			2500 30000 data/${train_set} data/lang exp/mono_ali exp/tri1
+fi
+
+if [ $stage -le 12 ]; then
+  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
+		    data/${train_set} data/lang exp/tri1 exp/tri1_ali
+
+  steps/train_lda_mllt.sh --cmd "$train_cmd" \
+			  4000 50000 data/${train_set} data/lang exp/tri1_ali exp/tri2
+fi
+
+if [ $stage -le 13 ]; then
+  utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph
+  for dset in ${test_sets}; do
+    steps/decode.sh --nj $decode_nj --cmd "$decode_cmd"  --num-threads 4 \
+		    exp/tri2/graph data/${dset} exp/tri2/decode_${dset} &
+  done
+  wait
+fi
+
+if [ $stage -le 14 ]; then
+  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
+		    data/${train_set} data/lang exp/tri2 exp/tri2_ali
+
+  steps/train_sat.sh --cmd "$train_cmd" \
+		     5000 100000 data/${train_set} data/lang exp/tri2_ali exp/tri3
+fi
+
+if [ $stage -le 15 ]; then
+  utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph
+  for dset in ${test_sets}; do
+    steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd"  --num-threads 4 \
+			  exp/tri3/graph data/${dset} exp/tri3/decode_${dset} &
+  done
+  wait
+fi
+
+if [ $stage -le 16 ]; then
+  # The following script cleans the data and produces cleaned data
+  steps/cleanup/clean_and_segment_data.sh --nj ${nj} --cmd "$train_cmd" \
+    --segmentation-opts "--min-segment-length 0.3 --min-new-segment-length 0.6" \
+    data/${train_set} data/lang exp/tri3 exp/tri3_cleaned data/${train_set}_cleaned
+fi
+
+if [ $stage -le 17 ]; then
+  # chain TDNN
+  local/chain/tuning/run_tdnn_1b.sh --nj ${nj} \
+    --stage $nnet_stage \
+    --train-set ${train_set}_cleaned \
+    --test-sets "$test_sets" \
+    --gmm tri3_cleaned --nnet3-affix _${train_set}_cleaned_rvb
+fi
+
+if [ $stage -le 18 ]; then
+  # 2-stage decoding
+  for test_set in $test_sets; do
+    local/nnet3/decode.sh --affix 2stage --pass2-decode-opts "--min-active 1000" \
+      --acwt 1.0 --post-decode-acwt 10.0 \
+      --frames-per-chunk 150 --nj $decode_nj \
+      --ivector-dir exp/nnet3_${train_set}_cleaned_rvb \
+      data/${test_set} data/lang_chain \
+      exp/chain_${train_set}_cleaned_rvb/tree_sp/graph \
+      exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp 
+  done
+fi
+
+if [ $stage -le 19 ]; then
+  # final scoring to get the official challenge result
+  # please specify both dev and eval set directories so that the search parameters
+  # (insertion penalty and language model weight) will be tuned using the dev set
+  local/score_for_submit.sh \
+      --dev exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_dev_${enhancement}_dereverb_ref \
+      --eval exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_eval_${enhancement}_dereverb_ref
+fi
diff --git a/egs/chime5/s5b/steps b/egs/chime5/s5b/steps
new file mode 120000
index 00000000000..1b186770dd1
--- /dev/null
+++ b/egs/chime5/s5b/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps/
\ No newline at end of file
diff --git a/egs/chime5/s5b/utils b/egs/chime5/s5b/utils
new file mode 120000
index 00000000000..a3279dc8679
--- /dev/null
+++ b/egs/chime5/s5b/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils/
\ No newline at end of file
diff --git a/egs/wsj/s5/steps/conf/get_ctm_conf.sh b/egs/wsj/s5/steps/conf/get_ctm_conf.sh
index 8dbc9f449cd..5ce39b1ddb6 100755
--- a/egs/wsj/s5/steps/conf/get_ctm_conf.sh
+++ b/egs/wsj/s5/steps/conf/get_ctm_conf.sh
@@ -2,7 +2,8 @@
 # Copyright Johns Hopkins University (Author: Daniel Povey) 2012.  Apache 2.0.
 
 # This script produces CTM files from a decoding directory that has lattices
-# present. This version gives you confidence scores. See also steps/get_ctm.sh
+# present.  This version gives you confidence scores using MBR decoding.
+# See also steps/get_ctm.sh
 
 
 # begin configuration section.
@@ -13,6 +14,7 @@ max_lmwt=20
 use_segments=true # if we have a segments file, use it to convert
                   # the segments to be relative to the original files.
 iter=final
+beam=5  # pruning beam before MBR decoding
 #end configuration section.
 
 echo "$0 $@"  # Print the command line for logging
@@ -21,6 +23,8 @@ echo "$0 $@"  # Print the command line for logging
 . parse_options.sh || exit 1;
 
 if [ $# -ne 3 ]; then
+  echo "This script produces CTM files from a decoding directory that has lattices "
+  echo "present.  This version gives you confidence scores using MBR decoding."
   echo "Usage: $0 [options] <data-dir> <lang-dir|graph-dir> <decode-dir>"
   echo " Options:"
   echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
@@ -50,6 +54,7 @@ name=`basename $data`; # e.g. eval2000
 
 mkdir -p $dir/scoring/log
 
+frame_shift_opt=
 if [ -f $dir/../frame_shift ]; then
   frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)"
   echo "$0: $dir/../frame_shift exists, using $frame_shift_opt"
@@ -68,10 +73,12 @@ if [ $stage -le 0 ]; then
     filter_cmd=cat
   fi
 
+  nj=$(cat $dir/num_jobs) || exit 1;  # fail early if the decode dir has no num_jobs file
+  lats=$(for n in $(seq $nj); do echo -n "$dir/lat.$n.gz "; done)  # explicit list lat.1.gz .. lat.$nj.gz
   if [ -f $lang/phones/word_boundary.int ]; then
     $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.log \
-      mkdir -p $dir/score_LMWT/ '&&' \
-      lattice-prune --inv-acoustic-scale=LMWT --beam=5 "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+      set -o pipefail '&&' mkdir -p $dir/score_LMWT/ '&&' \
+      lattice-prune --inv-acoustic-scale=LMWT --beam=$beam "ark:gunzip -c $lats|" ark:- \| \
       lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
       lattice-to-ctm-conf $frame_shift_opt --decode-mbr=true --inv-acoustic-scale=LMWT ark:- - \| \
       utils/int2sym.pl -f 5 $lang/words.txt \| \
@@ -82,8 +89,8 @@ if [ $stage -le 0 ]; then
       exit 1;
     fi
     $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.log \
-      mkdir -p $dir/score_LMWT/ '&&' \
-      lattice-prune --inv-acoustic-scale=LMWT --beam=5 "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+      set -o pipefail '&&' mkdir -p $dir/score_LMWT/ '&&' \
+      lattice-prune --inv-acoustic-scale=LMWT --beam=$beam "ark:gunzip -c $lats|" ark:- \| \
       lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \
       lattice-to-ctm-conf $frame_shift_opt --decode-mbr=true --inv-acoustic-scale=LMWT ark:- - \| \
       utils/int2sym.pl -f 5 $lang/words.txt \| \
diff --git a/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh b/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh
index a423be7aa20..858dd4b6730 100755
--- a/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh
+++ b/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh
@@ -64,6 +64,10 @@ if [ -f path.sh ]; then . ./path.sh; fi
 if [ $# != 4 ] && [ $# != 5 ]; then
   echo "Usage: $0 [options] <data> <lang> <extractor-dir> [<alignment-dir>|<decode-dir>|<weights-archive>] <ivector-dir>"
   echo " e.g.: $0 data/test data/lang exp/nnet2_online/extractor exp/tri3/decode_test exp/nnet2_online/ivectors_test"
+  echo "If <alignment-dir|decode-dir> is provided, it is converted to frame-weights "
+  echo "giving silence frames a weight of --silence-weight (default: 0.0). "
+  echo "If <weights-archive> is provided, it must be a single archive file compressed "
+  echo "(using gzip) containing per-frame weights for each utterance."
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
@@ -90,7 +94,7 @@ else # 5 arguments
   data=$1
   lang=$2
   srcdir=$3
-  ali_or_decode_dir=$4
+  ali_or_decode_dir_or_weights=$4
   dir=$5
 fi
 
@@ -102,23 +106,23 @@ done
 mkdir -p $dir/log
 silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
 
-if [ ! -z "$ali_or_decode_dir" ]; then
+if [ ! -z "$ali_or_decode_dir_or_weights" ]; then
 
 
-  if [ -f $ali_or_decode_dir/ali.1.gz ]; then
-    if [ ! -f $ali_or_decode_dir/${mdl}.mdl ]; then
-      echo "$0: expected $ali_or_decode_dir/${mdl}.mdl to exist."
+  if [ -f $ali_or_decode_dir_or_weights/ali.1.gz ]; then
+    if [ ! -f $ali_or_decode_dir_or_weights/${mdl}.mdl ]; then
+      echo "$0: expected $ali_or_decode_dir_or_weights/${mdl}.mdl to exist."
       exit 1;
     fi
-    nj_orig=$(cat $ali_or_decode_dir/num_jobs) || exit 1;
+    nj_orig=$(cat $ali_or_decode_dir_or_weights/num_jobs) || exit 1;
 
     if [ $stage -le 0 ]; then
       rm $dir/weights.*.gz 2>/dev/null
 
       $cmd JOB=1:$nj_orig  $dir/log/ali_to_post.JOB.log \
-        gunzip -c $ali_or_decode_dir/ali.JOB.gz \| \
+        gunzip -c $ali_or_decode_dir_or_weights/ali.JOB.gz \| \
         ali-to-post ark:- ark:- \| \
-        weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir/final.mdl ark:- ark:- \| \
+        weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir_or_weights/final.mdl ark:- ark:- \| \
         post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1;
 
       # put all the weights in one archive.
@@ -126,10 +130,10 @@ if [ ! -z "$ali_or_decode_dir" ]; then
       rm $dir/weights.*.gz || exit 1;
     fi
 
-  elif [ -f $ali_or_decode_dir/lat.1.gz ]; then
-    nj_orig=$(cat $ali_or_decode_dir/num_jobs) || exit 1;
-    if [ ! -f $ali_or_decode_dir/../${mdl}.mdl ]; then
-      echo "$0: expected $ali_or_decode_dir/../${mdl}.mdl to exist."
+  elif [ -f $ali_or_decode_dir_or_weights/lat.1.gz ]; then
+    nj_orig=$(cat $ali_or_decode_dir_or_weights/num_jobs) || exit 1;
+    if [ ! -f $ali_or_decode_dir_or_weights/../${mdl}.mdl ]; then
+      echo "$0: expected $ali_or_decode_dir_or_weights/../${mdl}.mdl to exist."
       exit 1;
     fi
 
@@ -138,19 +142,19 @@ if [ ! -z "$ali_or_decode_dir" ]; then
       rm $dir/weights.*.gz 2>/dev/null
 
       $cmd JOB=1:$nj_orig  $dir/log/lat_to_post.JOB.log \
-        lattice-best-path --acoustic-scale=$acwt "ark:gunzip -c $ali_or_decode_dir/lat.JOB.gz|" ark:/dev/null ark:- \| \
+        lattice-best-path --acoustic-scale=$acwt "ark:gunzip -c $ali_or_decode_dir_or_weights/lat.JOB.gz|" ark:/dev/null ark:- \| \
         ali-to-post ark:- ark:- \| \
-        weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir/../${mdl}.mdl ark:- ark:- \| \
+        weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir_or_weights/../${mdl}.mdl ark:- ark:- \| \
         post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1;
 
       # put all the weights in one archive.
       for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1;
       rm $dir/weights.*.gz || exit 1;
     fi
-  elif [ -f $ali_or_decode_dir ] && gunzip -c $ali_or_decode_dir >/dev/null; then
-    cp $ali_or_decode_dir $dir/weights.gz || exit 1;
+  elif [ -f $ali_or_decode_dir_or_weights ] && gunzip -c $ali_or_decode_dir_or_weights >/dev/null; then
+    cp $ali_or_decode_dir_or_weights $dir/weights.gz || exit 1;
   else
-    echo "$0: expected ali.1.gz or lat.1.gz to exist in $ali_or_decode_dir";
+    echo "$0: expected ali.1.gz or lat.1.gz to exist in $ali_or_decode_dir_or_weights";
     exit 1;
   fi
 fi
@@ -169,7 +173,7 @@ if [ $sub_speaker_frames -gt 0 ]; then
 
   if [ $stage -le 1 ]; then
   # We work out 'fake' spk2utt files that possibly split each speaker into multiple pieces.
-    if [ ! -z "$ali_or_decode_dir" ]; then
+    if [ ! -z "$ali_or_decode_dir_or_weights" ]; then
       gunzip -c $dir/weights.gz | copy-vector ark:- ark,t:- | \
         awk '{ sum=0; for (n=3;n<NF;n++) sum += $n; print $1, sum; }' > $dir/utt_counts || exit 1;
     else
@@ -230,7 +234,7 @@ else
 fi
 
 if [ $stage -le 2 ]; then
-  if [ ! -z "$ali_or_decode_dir" ]; then
+  if [ ! -z "$ali_or_decode_dir_or_weights" ]; then
     $cmd --num-threads $num_threads JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
       gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \
       weight-post ark:- "ark,s,cs:gunzip -c $dir/weights.gz|" ark:- \| \
diff --git a/egs/wsj/s5/utils/perturb_data_dir_speed.sh b/egs/wsj/s5/utils/perturb_data_dir_speed.sh
index 99c9cbdb1f0..924ebdc3473 100755
--- a/egs/wsj/s5/utils/perturb_data_dir_speed.sh
+++ b/egs/wsj/s5/utils/perturb_data_dir_speed.sh
@@ -73,7 +73,7 @@ if [ -f $srcdir/segments ]; then
   utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments | \
     utils/apply_map.pl -f 2 $destdir/reco_map | \
       awk -v factor=$factor \
-        '{printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);}' >$destdir/segments
+        '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' >$destdir/segments
 
   utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/wav.scp | sed 's/| *$/ |/' | \
     # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" 
diff --git a/egs/wsj/s5/utils/subset_data_dir.sh b/egs/wsj/s5/utils/subset_data_dir.sh
index 93ee0971b88..4cd3f9b7711 100755
--- a/egs/wsj/s5/utils/subset_data_dir.sh
+++ b/egs/wsj/s5/utils/subset_data_dir.sh
@@ -123,6 +123,8 @@ function do_filtering {
      [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp
      [ -f $srcdir/reco2file_and_channel ] && \
        utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
+     [ -f $srcdir/reco2dur ] && \
+       utils/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur
 
      # Filter the STM file for proper sclite scoring
      # Copy over the comments from STM file
@@ -134,6 +136,8 @@ function do_filtering {
      awk '{print $1;}' $destdir/wav.scp | sort | uniq > $destdir/reco
      [ -f $srcdir/reco2file_and_channel ] && \
        utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
+     [ -f $srcdir/reco2dur ] && \
+       utils/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur
      
      rm $destdir/reco
   fi

From e3303207f4b90636137233cd25fef51e2085dcaf Mon Sep 17 00:00:00 2001
From: phanisankar-nidadavolu
 <32964714+phanisankar-nidadavolu@users.noreply.github.com>
Date: Mon, 13 May 2019 14:57:48 -0400
Subject: [PATCH 110/163] [scripts,egs] Made changes to the augmentation script
 to make it work for ASR and speaker ID (#3119)

Now multi-style training with noise and reverberation is an option (instead of speed augmentation).
Multi-style training seems to be more robust to unseen/noisy conditions.
---
 .../chain/multi_condition/run_tdnn_aug_1a.sh  | 281 ++++++++++++++++++
 egs/swbd/s5c/local/chain/run_tdnn_aug.sh      |   1 +
 .../nnet3/multi_condition/run_aug_common.sh   | 242 +++++++++++++++
 egs/wsj/s5/steps/copy_ali_dir.sh              |  75 +++++
 egs/wsj/s5/steps/copy_lat_dir.sh              |  74 +++++
 egs/wsj/s5/steps/data/augment_data_dir.py     | 187 +++++++++---
 egs/wsj/s5/steps/data/make_musan.py           | 178 +++++++++++
 egs/wsj/s5/steps/data/make_musan.sh           |  71 +++++
 egs/wsj/s5/steps/data/reverberate_data_dir.py | 279 +++++++++--------
 9 files changed, 1216 insertions(+), 172 deletions(-)
 create mode 100755 egs/swbd/s5c/local/chain/multi_condition/run_tdnn_aug_1a.sh
 create mode 120000 egs/swbd/s5c/local/chain/run_tdnn_aug.sh
 create mode 100755 egs/swbd/s5c/local/nnet3/multi_condition/run_aug_common.sh
 create mode 100755 egs/wsj/s5/steps/copy_ali_dir.sh
 create mode 100755 egs/wsj/s5/steps/copy_lat_dir.sh
 create mode 100755 egs/wsj/s5/steps/data/make_musan.py
 create mode 100755 egs/wsj/s5/steps/data/make_musan.sh

diff --git a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_aug_1a.sh b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_aug_1a.sh
new file mode 100755
index 00000000000..8762430ee7f
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_aug_1a.sh
@@ -0,0 +1,281 @@
+#!/bin/bash
+
+# This recipe does multi-style training of TDNN model
+
+# local/chain/compare_wer_general.sh --rt03 tdnn7q_sp tdnn1a_aug
+# System                tdnn7q_sp tdnn1a_aug
+# WER on train_dev(tg)      11.91     12.06
+# WER on train_dev(fg)      10.99     10.92
+# WER on eval2000(tg)        14.3      14.4
+# WER on eval2000(fg)        12.8      12.9
+# WER on rt03(tg)            17.2      17.1
+# WER on rt03(fg)            15.1      14.8
+# Final train prob         -0.062    -0.087
+# Final valid prob         -0.074    -0.105
+# Final train prob (xent)        -0.933    -1.164
+# Final valid prob (xent)       -0.9027   -1.2246
+# Num-parameters               18693376  18483664
+
+set -e
+
+# configs for 'chain'
+stage=0
+train_stage=-10
+get_egs_stage=-10
+num_epochs=3
+
+# Augmentation options
+aug_list="reverb babble music noise clean" # Original train dir is referred to as `clean`
+num_reverb_copies=1
+use_ivectors=true
+
+affix=1a
+suffix="_aug"
+if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi
+
+decode_iter=
+decode_nj=50
+
+# training options
+frames_per_eg=150,110,100
+remove_egs=false
+common_egs_dir=
+xent_regularize=0.1
+dropout_schedule='0,0@0.20,0.5@0.50,0'
+
+test_online_decoding=false  # if true, it will run the last decoding stage.
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+dir=exp/chain/tdnn${affix}${suffix}
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+clean_set=train_nodup
+clean_ali=tri4_ali_nodup
+train_set=$clean_set$suffix # Will be prepared by the script local/nnet3/multi_condition/run_aug_common.sh
+ali_dir=$clean_ali$suffix
+treedir=exp/chain/tri5_7d_tree$suffix
+lang=data/lang_chain_2y
+
+# First create the augmented data and then extract features for it.
+# The script also creates alignments for aug data by copying clean alignments
+local/nnet3/multi_condition/run_aug_common.sh --stage $stage \
+  --aug-list "$aug_list" --num-reverb-copies $num_reverb_copies \
+  --use-ivectors "$use_ivectors" \
+  --train-set $clean_set --clean-ali $clean_ali || exit 1;
+
+if [ $stage -le 11 ]; then
+  # Get the alignments as lattices (gives the LF-MMI training more freedom).
+  # use the same num-jobs as the alignments
+  prefixes=""
+  include_original=false
+  for n in $aug_list; do
+    if [ "$n" == "reverb" ]; then
+      for i in `seq 1 $num_reverb_copies`; do
+        prefixes="$prefixes "reverb$i
+      done
+    elif [ "$n" != "clean" ]; then
+      prefixes="$prefixes "$n
+    else
+      # The original train directory will not have any prefix
+      # include_original flag will take care of copying the original lattices
+      include_original=true
+    fi
+  done
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/${clean_set} \
+    data/lang exp/tri4 exp/tri4_lats_nodup${suffix}_clean
+  rm exp/tri4_lats_nodup${suffix}_clean/fsts.*.gz # save space
+  steps/copy_lat_dir.sh --nj $nj --cmd "$train_cmd" \
+    --include-original "$include_original" --prefixes "$prefixes" \
+    data/${train_set} exp/tri4_lats_nodup${suffix}_clean exp/tri4_lats_nodup${suffix} || exit 1;
+fi
+
+if [ $stage -le 12 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 13 ]; then
+  # Build a tree using our new topology. This is the critically different
+  # step compared with other recipes.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --context-opts "--context-width=2 --central-position=1" \
+      --cmd "$train_cmd" 7000 data/$train_set $lang exp/$ali_dir $treedir
+fi
+
+if [ $stage -le 14 ]; then
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)  # parenthesised print works on Python 2 and 3
+  affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true"
+  tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66"
+  linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0"
+  prefinal_opts="l2-regularize=0.01"
+  output_opts="l2-regularize=0.002"
+
+  mkdir -p $dir/configs
+
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536
+  tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
+  tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
+  tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
+  tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0
+  tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
+  linear-component name=prefinal-l dim=256 $linear_opts
+
+  prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256
+  output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
+
+  prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 15 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd "$train_cmd" \
+    --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.0 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --trainer.dropout-schedule $dropout_schedule \
+    --trainer.add-option="--optimization.memory-compression-level=2" \
+    --egs.dir "$common_egs_dir" \
+    --egs.stage $get_egs_stage \
+    --egs.opts "--frames-overlap-per-eg 0 --constrained false" \
+    --egs.chunk-width $frames_per_eg \
+    --trainer.num-chunk-per-minibatch 64 \
+    --trainer.frames-per-iter 1500000 \
+    --trainer.num-epochs $num_epochs \
+    --trainer.optimization.num-jobs-initial 3 \
+    --trainer.optimization.num-jobs-final 16 \
+    --trainer.optimization.initial-effective-lrate 0.00025 \
+    --trainer.optimization.final-effective-lrate 0.000025 \
+    --trainer.max-param-change 2.0 \
+    --cleanup.remove-egs $remove_egs \
+    --feat-dir data/${train_set}_hires \
+    --tree-dir $treedir \
+    --lat-dir exp/tri4_lats_nodup$suffix \
+    --dir $dir  || exit 1;
+
+fi
+
+if [ $stage -le 16 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+
+graph_dir=$dir/graph_sw1_tg
+iter_opts=
+if [ ! -z $decode_iter ]; then
+  iter_opts=" --iter $decode_iter "
+fi
+if [ $stage -le 17 ]; then
+  rm $dir/.error 2>/dev/null || true
+  for decode_set in train_dev eval2000 $maybe_rt03; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --nj $decode_nj --cmd "$decode_cmd" $iter_opts \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+          $graph_dir data/${decode_set}_hires \
+          $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is not set in this script; default to rescoring
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) || touch $dir/.error &
+  done
+  wait
+  if [ -f $dir/.error ]; then
+    echo "$0: something went wrong in decoding"
+    exit 1
+  fi
+fi
+
+if $test_online_decoding && [ $stage -le 18 ]; then  # stage 18: follows mkgraph (16) and decoding (17)
+  # note: if the features change (e.g. you add pitch features), you will have to
+  # change the options of the following command line.
+  steps/online/nnet3/prepare_online_decoding.sh \
+       --mfcc-config conf/mfcc_hires.conf \
+       $lang exp/nnet3/extractor $dir ${dir}_online
+
+  rm $dir/.error 2>/dev/null || true
+  for decode_set in train_dev eval2000 $maybe_rt03; do
+    (
+      # note: we just give it "$decode_set" as it only uses the wav.scp, the
+      # feature type does not matter.
+
+      steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \
+          --acwt 1.0 --post-decode-acwt 10.0 \
+         $graph_dir data/${decode_set}_hires \
+         ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is not set in this script; default to rescoring
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+    ) || touch $dir/.error &
+  done
+  wait
+  if [ -f $dir/.error ]; then
+    echo "$0: something went wrong in decoding"
+    exit 1
+  fi
+fi
+
+
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_aug.sh b/egs/swbd/s5c/local/chain/run_tdnn_aug.sh
new file mode 120000
index 00000000000..390ed99f5cc
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_aug.sh
@@ -0,0 +1 @@
+multi_condition/run_tdnn_aug_1a.sh
\ No newline at end of file
diff --git a/egs/swbd/s5c/local/nnet3/multi_condition/run_aug_common.sh b/egs/swbd/s5c/local/nnet3/multi_condition/run_aug_common.sh
new file mode 100755
index 00000000000..3603dd59d79
--- /dev/null
+++ b/egs/swbd/s5c/local/nnet3/multi_condition/run_aug_common.sh
@@ -0,0 +1,242 @@
+#!/bin/bash
+# Copyright 2019   Phani Sankar Nidadavolu
+# Apache 2.0.
+
+. ./cmd.sh
+
+set -e
+stage=0
+aug_list="reverb music noise babble clean"  #clean refers to the original train dir
+use_ivectors=true
+num_reverb_copies=1
+
+# Alignment directories
+lda_mllt_ali=tri2_ali_100k_nodup
+clean_ali=tri4_ali_nodup
+
+# train directories for ivectors and TDNNs
+ivector_trainset=train_100k_nodup
+train_set=train_nodup
+
+. ./path.sh
+. ./utils/parse_options.sh
+
+if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi
+
+if [ $stage -le 0 ]; then
+  # Adding simulated RIRs to the original data directory
+  echo "$0: Preparing data/${train_set}_reverb directory"
+
+  if [ ! -d "RIRS_NOISES" ]; then
+    # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
+    wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
+    unzip rirs_noises.zip
+  fi
+
+  if [ ! -f data/$train_set/reco2dur ]; then
+    utils/data/get_reco2dur.sh --nj 6 --cmd "$train_cmd" data/$train_set || exit 1;
+  fi
+
+  # Make a version with reverberated speech
+  rvb_opts=()
+  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
+  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
+
+  # Make a reverberated version of the SWBD train_nodup.
+  # Note that we don't add any additive noise here.
+  steps/data/reverberate_data_dir.py \
+    "${rvb_opts[@]}" \
+    --speech-rvb-probability 1 \
+    --prefix "reverb" \
+    --pointsource-noise-addition-probability 0 \
+    --isotropic-noise-addition-probability 0 \
+    --num-replications $num_reverb_copies \
+    --source-sampling-rate 8000 \
+    data/$train_set data/${train_set}_reverb
+fi
+
+if [ $stage -le 1 ]; then
+  # Prepare the MUSAN corpus, which consists of music, speech, and noise
+  # We will use them as additive noises for data augmentation.
+  steps/data/make_musan.sh --sampling-rate 8000 --use-vocals "true" \
+        /export/corpora/JHU/musan data
+
+  # Augment with musan_noise
+  steps/data/augment_data_dir.py --utt-prefix "noise" --modify-spk-id "true" \
+    --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" \
+    data/${train_set} data/${train_set}_noise
+
+  # Augment with musan_music
+  steps/data/augment_data_dir.py --utt-prefix "music" --modify-spk-id "true" \
+    --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" \
+    data/${train_set} data/${train_set}_music
+
+  # Augment with musan_speech
+  steps/data/augment_data_dir.py --utt-prefix "babble" --modify-spk-id "true" \
+    --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" \
+    --bg-noise-dir "data/musan_speech" \
+    data/${train_set} data/${train_set}_babble
+
+  # Combine all the augmentation dirs
+  # This part can be simplified once we know what noise types we will add
+  combine_str=""
+  for n in $aug_list; do
+    if [ "$n" == "clean" ]; then
+      # clean refers to original of training directory
+      combine_str+="data/$train_set "
+    else
+      combine_str+="data/${train_set}_${n} "
+    fi
+  done
+  utils/combine_data.sh data/${train_set}_aug $combine_str
+fi
+
+if [ $stage -le 2 ]; then
+  # Extract low-resolution MFCCs for the augmented data
+  # To be used later to generate alignments for augmented data
+  echo "$0: Extracting low-resolution MFCCs for the augmented data. Useful for generating alignments"
+  mfccdir=mfcc_aug
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
+    date=$(date +'%m_%d_%H_%M')
+    utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/mfcc/swbd-$date/s5c/$mfccdir/storage $mfccdir/storage
+  fi
+  steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \
+                     data/${train_set}_aug exp/make_mfcc/${train_set}_aug $mfccdir
+  steps/compute_cmvn_stats.sh data/${train_set}_aug exp/make_mfcc/${train_set}_aug $mfccdir
+  utils/fix_data_dir.sh data/${train_set}_aug || exit 1;
+fi
+
+if [ $stage -le 3 ] && ${generate_alignments:-true}; then
+  # Obtain the alignments of the augmented data from the clean data (generate_alignments is not set in this script, so it defaults to true)
+  include_original=false
+  prefixes=""
+  for n in $aug_list; do
+    if [ "$n" == "reverb" ]; then
+      for i in `seq 1 $num_reverb_copies`; do
+        prefixes="$prefixes "reverb$i
+      done
+    elif [ "$n" != "clean" ]; then
+      prefixes="$prefixes "$n
+    else
+      # The original train directory will not have any prefix
+      # include_original flag will take care of copying the original alignments
+      include_original=true
+    fi
+  done
+  echo "$0: Creating alignments of aug data by copying alignments of clean data"
+  steps/copy_ali_dir.sh --nj 40 --cmd "$train_cmd" \
+    --include-original "$include_original" --prefixes "$prefixes" \
+    data/${train_set}_aug exp/${clean_ali} exp/${clean_ali}_aug
+fi
+
+if [ $stage -le 4 ]; then
+  mfccdir=mfcc_hires
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
+    date=$(date +'%m_%d_%H_%M')
+    utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/mfcc/swbd-$date/s5c/$mfccdir/storage $mfccdir/storage
+  fi
+
+  for dataset in ${train_set}_aug; do
+    echo "$0: Creating hi resolution MFCCs for dir data/$dataset"
+    utils/copy_data_dir.sh data/$dataset data/${dataset}_hires
+    utils/data/perturb_data_dir_volume.sh data/${dataset}_hires
+
+    steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
+        --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir;
+    steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir;
+
+    # Remove the small number of utterances that couldn't be extracted for some
+    # reason (e.g. too short; no such file).
+    utils/fix_data_dir.sh data/${dataset}_hires;
+  done
+fi
+
+if [ $stage -le 5 ]; then
+  mfccdir=mfcc_hires
+  for dataset in eval2000 train_dev $maybe_rt03; do
+    echo "$0: Creating hi resolution MFCCs for data/$dataset"
+    # Create MFCCs for the eval set
+    utils/copy_data_dir.sh data/$dataset data/${dataset}_hires
+    steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \
+        data/${dataset}_hires exp/make_hires/$dataset $mfccdir;
+    steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir;
+    utils/fix_data_dir.sh data/${dataset}_hires  # remove segments with problems
+  done
+fi
+
+if [ "$use_ivectors" == "true" ]; then
+  if [ $stage -le 6 ]; then
+    # Take 30k utterances from the multi-style (MS) data; this subset will be used for the diag-UBM training.
+    utils/subset_data_dir.sh data/${train_set}_aug_hires 30000 data/${train_set}_aug_30k_hires
+    utils/data/remove_dup_utts.sh 200 data/${train_set}_aug_30k_hires data/${train_set}_aug_30k_nodup_hires  # 33hr
+
+    # Make a 140 hr subset of augmented data to train i-vector extractor
+    # we don't extract hi res features again for ivector training data
+    # we take it from the ms features extracted on the entire training set
+    # First augment the train_100k_nodup directory which is used to train the i-vector extractor in baseline
+    utils/copy_data_dir.sh data/${train_set}_aug_hires data/${ivector_trainset}_aug_hires
+    utils/filter_scp.pl -f 2 data/${ivector_trainset}/utt2spk data/${train_set}_aug_hires/utt2uniq | \
+        utils/filter_scp.pl - data/${train_set}_aug_hires/utt2spk > data/${ivector_trainset}_aug_hires/utt2spk
+    utils/fix_data_dir.sh data/${ivector_trainset}_aug_hires
+
+    # Since the data size is now increased make a subset of it to bring the duration back to required size (140hr)
+    utils/subset_data_dir.sh data/${ivector_trainset}_aug_hires 100000 data/${ivector_trainset}_aug_hires_subset
+    utils/data/remove_dup_utts.sh 200 data/${ivector_trainset}_aug_hires_subset data/${ivector_trainset}_aug_hires
+    steps/compute_cmvn_stats.sh data/${ivector_trainset}_aug_hires exp/make_hires/${ivector_trainset} $mfccdir;
+    utils/fix_data_dir.sh data/${ivector_trainset}_aug_hires
+  fi
+
+  # ivector extractor training
+  if [ $stage -le 7 ]; then
+    # First copy the clean alignments to augmented alignments to train LDA+MLLT transform
+    # Since the alignments are created using  low-res mfcc features make a copy of ivector training directory
+    utils/copy_data_dir.sh data/${ivector_trainset}_aug_hires data/${ivector_trainset}_aug
+    utils/filter_scp.pl data/${ivector_trainset}_aug/utt2spk data/${train_set}_aug/feats.scp > data/${ivector_trainset}_aug/feats.scp
+    utils/fix_data_dir.sh data/${ivector_trainset}_aug
+    echo "$0: Creating alignments of aug data by copying alignments of clean data"
+    local/copy_ali_dir.sh --nj 40 --cmd "$train_cmd" \
+        data/${ivector_trainset}_aug exp/${lda_mllt_ali} exp/${lda_mllt_ali}_aug
+
+    # We need to build a small system just because we need the LDA+MLLT transform
+    # to train the diag-UBM on top of.  We use --num-iters 13 because after we get
+    # the transform (12th iter is the last), any further training is pointless.
+    # this decision is based on fisher_english
+    steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
+      --splice-opts "--left-context=3 --right-context=3" \
+      5500 90000 data/${ivector_trainset}_aug_hires \
+      data/lang exp/${lda_mllt_ali}_aug exp/nnet3/tri3b
+  fi
+
+  if [ $stage -le 8 ]; then
+    # To train a diagonal UBM we don't need very much data, so use the smallest subset.
+    echo "$0: Training diagonal UBM for i-vector extractor"
+    steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \
+      data/${train_set}_aug_30k_nodup_hires 512 exp/nnet3/tri3b exp/nnet3/diag_ubm
+  fi
+
+  if [ $stage -le 9 ]; then
+    # iVector extractors can be sensitive to the amount of data, but this one has a
+    # fairly small dim (defaults to 100) so we don't use all of it, we use just the
+    # 100k subset (just under half the data).
+    echo "$0: Training i-vector extractor for speaker adaptation"
+    steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
+      data/${ivector_trainset}_aug_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1;
+  fi
+
+  if [ $stage -le 10 ]; then
+    # We extract iVectors on all the train_nodup data, which will be what we
+    # train the system on.
+    # having a larger number of speakers is helpful for generalization, and to
+    # handle per-utterance decoding well (iVector starts at zero).
+    echo "$0: Extracting ivectors for train and eval directories"
+    utils/data/modify_speaker_info.sh --utts-per-spk-max 2 data/${train_set}_aug_hires data/${train_set}_aug_max2_hires
+
+    steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
+      data/${train_set}_aug_max2_hires exp/nnet3/extractor exp/nnet3/ivectors_${train_set}_aug || exit 1;
+
+    for dataset in eval2000 train_dev $maybe_rt03; do
+      steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
+        data/${dataset}_hires exp/nnet3/extractor exp/nnet3/ivectors_$dataset || exit 1;
+    done
+  fi
+fi
diff --git a/egs/wsj/s5/steps/copy_ali_dir.sh b/egs/wsj/s5/steps/copy_ali_dir.sh
new file mode 100755
index 00000000000..60618a2f4bf
--- /dev/null
+++ b/egs/wsj/s5/steps/copy_ali_dir.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+# Copyright 2019   Phani Sankar Nidadavolu
+# Apache 2.0.
+
+prefixes="reverb1 babble music noise"
+include_original=true
+max_jobs_run=50
+nj=100
+cmd=queue.pl
+write_binary=true
+
+. ./path.sh
+. utils/parse_options.sh
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [options] <out-data> <src-ali-dir> <out-ali-dir>"
+  echo "This script creates alignments for the aug dirs by copying "
+  echo " the alignments of original train dir"
+  echo "While copying it adds prefix to the utterances specified by prefixes option"
+  echo "Note that the original train dir does not have any prefix"
+  echo "To include the original training directory in the copied "
+  echo "version set the --include-original option to true"
+  echo "main options (for others, see top of script file)"
+  echo "  --prefixes <string of prefixes to add>    # All the prefixes of aug data to be included"
+  echo "  --include-original <true/false>           # If true, will copy the alignments of original dir"
+  echo "  --write-binary <true/false>               # Write alignments in binary mode"
+  exit 1
+fi
+
+data=$1
+src_dir=$2
+dir=$3
+
+mkdir -p $dir
+
+num_jobs=$(cat $src_dir/num_jobs)
+
+rm -f $dir/ali_tmp.*.{ark,scp} 2>/dev/null
+
+# Copy the alignments temporarily
+echo "creating temporary alignments in $dir"
+$cmd --max-jobs-run $max_jobs_run JOB=1:$num_jobs $dir/log/copy_ali_temp.JOB.log \
+  copy-int-vector --binary=$write_binary \
+  "ark:gunzip -c $src_dir/ali.JOB.gz |" \
+  ark,scp:$dir/ali_tmp.JOB.ark,$dir/ali_tmp.JOB.scp || exit 1
+
+# Make copies of utterances for perturbed data
+for p in $prefixes; do
+  cat $dir/ali_tmp.*.scp | awk -v p=$p '{print p"-"$0}'
+done | sort -k1,1 > $dir/ali_out.scp.aug
+
+if [ "$include_original" == "true" ]; then
+  cat $dir/ali_tmp.*.scp | awk '{print $0}' | sort -k1,1 > $dir/ali_out.scp.clean
+  cat $dir/ali_out.scp.clean $dir/ali_out.scp.aug | sort -k1,1 > $dir/ali_out.scp
+else
+  cat $dir/ali_out.scp.aug | sort -k1,1 > $dir/ali_out.scp
+fi
+
+utils/split_data.sh ${data} $nj
+
+# Copy and dump the alignments for perturbed data
+echo Creating alignments for augmented data by copying alignments from clean data
+$cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/copy_out_ali.JOB.log \
+  copy-int-vector --binary=$write_binary \
+  "scp:utils/filter_scp.pl ${data}/split$nj/JOB/utt2spk $dir/ali_out.scp |" \
+  "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1
+
+rm -f $dir/ali_out.scp.{aug,clean} $dir/ali_out.scp  # .clean is absent when --include-original is false
+rm -f $dir/ali_tmp.*
+
+echo $nj > $dir/num_jobs
+
+for f in cmvn_opts tree splice_opts phones.txt final.mdl frame_subsampling_factor; do
+  if [ -f $src_dir/$f ]; then cp $src_dir/$f $dir/$f; fi
+done
diff --git a/egs/wsj/s5/steps/copy_lat_dir.sh b/egs/wsj/s5/steps/copy_lat_dir.sh
new file mode 100755
index 00000000000..dd1e10fb307
--- /dev/null
+++ b/egs/wsj/s5/steps/copy_lat_dir.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+# Copyright 2019   Phani Sankar Nidadavolu
+# Apache 2.0.
+
+prefixes="reverb1 babble music noise"
+include_original=true
+max_jobs_run=50
+nj=100
+cmd=queue.pl
+write_compact=true
+
+. ./path.sh
+. utils/parse_options.sh
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [options] <out-data> <src-lat-dir> <out-lat-dir>"
+  echo "This script creates lattices for the aug dirs by copying the lattices of original train dir"
+  echo "While copying it adds prefix to the utterances specified by prefixes option"
+  echo "Note that the original train dir does not have any prefix"
+  echo "To include the original training directory in the copied "
+  echo "version set the --include-original option to true"
+  echo "main options (for others, see top of script file)"
+  echo "  --prefixes <string of prefixes to add>             # All the prefixes of aug data to be included"
+  echo "  --include-original <true/false>                    # If true, will copy the lattices of original dir"
+  echo "  --write-compact <true/false>                       # Write lattices in compact mode"
+  exit 1
+fi
+
+data=$1
+src_dir=$2
+dir=$3
+
+mkdir -p $dir
+
+num_jobs=$(cat $src_dir/num_jobs)
+
+rm -f $dir/lat_tmp.*.{ark,scp} 2>/dev/null
+
+# Copy the lattices temporarily
+echo "creating temporary lattices in $dir"
+$cmd --max-jobs-run $max_jobs_run JOB=1:$num_jobs $dir/log/copy_lat_temp.JOB.log \
+  lattice-copy --write-compact=$write_compact \
+  "ark:gunzip -c $src_dir/lat.JOB.gz |" \
+  ark,scp:$dir/lat_tmp.JOB.ark,$dir/lat_tmp.JOB.scp || exit 1
+
+# Make copies of utterances for perturbed data
+for p in $prefixes; do
+  cat $dir/lat_tmp.*.scp | awk -v p=$p '{print p"-"$0}'
+done | sort -k1,1 > $dir/lat_out.scp.aug
+
+if [ "$include_original" == "true" ]; then
+  cat $dir/lat_tmp.*.scp | awk '{print $0}' | sort -k1,1 > $dir/lat_out.scp.clean
+  cat $dir/lat_out.scp.clean $dir/lat_out.scp.aug | sort -k1,1 > $dir/lat_out.scp
+else
+  cat $dir/lat_out.scp.aug | sort -k1,1 > $dir/lat_out.scp
+fi
+
+utils/split_data.sh ${data} $nj
+
+# Copy and dump the lattices for perturbed data
+echo Creating lattices for augmented data by copying lattices from clean data
+$cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/copy_out_lat.JOB.log \
+  lattice-copy --write-compact=$write_compact \
+  "scp:utils/filter_scp.pl ${data}/split$nj/JOB/utt2spk $dir/lat_out.scp |" \
+  "ark:| gzip -c > $dir/lat.JOB.gz" || exit 1
+
+rm -f $dir/lat_out.scp.{aug,clean} $dir/lat_out.scp  # .clean is absent when --include-original is false
+rm -f $dir/lat_tmp.*
+
+echo $nj > $dir/num_jobs
+
+for f in cmvn_opts splice_opts final.mdl tree frame_subsampling_factor; do
+  if [ -f $src_dir/$f ]; then cp $src_dir/$f $dir/$f; fi
+done
diff --git a/egs/wsj/s5/steps/data/augment_data_dir.py b/egs/wsj/s5/steps/data/augment_data_dir.py
index 7edcdda2636..f9aaaf40d59 100755
--- a/egs/wsj/s5/steps/data/augment_data_dir.py
+++ b/egs/wsj/s5/steps/data/augment_data_dir.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 # Copyright 2017  David Snyder
 #           2017  Ye Bai
+#           2019  Phani Sankar Nidadavolu
 # Apache 2.0
 #
 # This script generates augmented data.  It is based on
@@ -10,11 +11,14 @@
 from __future__ import print_function
 import sys, random, argparse, os, imp
 sys.path.append("steps/data/")
-from reverberate_data_dir import ParseFileToDict
-from reverberate_data_dir import WriteDictToFile
+sys.path.insert(0, 'steps/')
+
+from reverberate_data_dir import parse_file_to_dict
+from reverberate_data_dir import write_dict_to_file
+import libs.common as common_lib
 data_lib = imp.load_source('dml', 'steps/data/data_dir_manipulation_lib.py')
 
-def GetArgs():
+def get_args():
     parser = argparse.ArgumentParser(description="Augment the data directory with additive noises. "
         "Noises are separated into background and foreground noises which are added together or "
         "separately.  Background noises are added to the entire recording, and repeated as necessary "
@@ -29,13 +33,29 @@ def GetArgs():
                         help='When foreground noises are being added, the script will iterate through these SNRs.')
     parser.add_argument('--bg-snrs', type=str, dest = "bg_snr_str", default = '20:10:0',
                         help='When background noises are being added, the script will iterate through these SNRs.')
-    parser.add_argument('--num-bg-noises', type=str, dest = "num_bg_noises", default = '1',
-                        help='Number of overlapping background noises that we iterate over. For example, if the input is "1:2:3" then the output wavs will have either 1, 2, or 3 randomly chosen background noises overlapping the entire recording')
-    parser.add_argument('--fg-interval', type=int, dest = "fg_interval", default = 0,
-                        help='Number of seconds between the end of one foreground noise and the beginning of the next.')
-    parser.add_argument('--utt-suffix', type=str, dest = "utt_suffix", default = "aug", help='Suffix added to utterance IDs.')
-    parser.add_argument('--random-seed', type=int, dest = "random_seed", default = 123, help='Random seed.')
-
+    parser.add_argument('--num-bg-noises', type=str,
+                        dest = "num_bg_noises", default = '1',
+                        help='Number of overlapping background noises that we iterate over.'
+                            ' For example, if the input is "1:2:3" then the output wavs will have either '
+                            '1, 2, or 3 randomly chosen background noises overlapping the entire recording')
+    parser.add_argument('--fg-interval', type=int,
+                        dest = "fg_interval", default = 0,
+                        help='Number of seconds between the end of one '
+                            'foreground noise and the beginning of the next.')
+    parser.add_argument('--utt-suffix', type=str,
+                        dest = "utt_suffix", default = None,
+                        help='Suffix added to utterance IDs.')
+    parser.add_argument('--utt-prefix', type=str,
+                        dest = "utt_prefix", default = None,
+                        help='Prefix added to utterance IDs.')
+    parser.add_argument('--random-seed', type=int, dest = "random_seed",
+                        default = 123, help='Random seed.')
+    parser.add_argument("--modify-spk-id", type=str,
+                        dest='modify_spk_id', default=True,
+                        action=common_lib.StrToBoolAction,
+                        choices=["true", "false"],
+                        help='Utt prefix or suffix would be added to the spk id '
+                            'also (used in ASR), in speaker id it is left unmodifed')
     parser.add_argument("--bg-noise-dir", type=str, dest="bg_noise_dir",
                         help="Background noise data directory")
     parser.add_argument("--fg-noise-dir", type=str, dest="fg_noise_dir",
@@ -45,10 +65,23 @@ def GetArgs():
 
     print(' '.join(sys.argv))
     args = parser.parse_args()
-    args = CheckArgs(args)
+    args = check_args(args)
     return args
 
-def CheckArgs(args):
+def check_args(args):
+    # Check args
+    if args.utt_suffix is None and args.utt_prefix is None:
+        args.utt_modifier_type = None
+        args.utt_modifier = ""
+    elif args.utt_suffix is None and args.utt_prefix is not None:
+        args.utt_modifier_type = "prefix"
+        args.utt_modifier = args.utt_prefix
+    elif args.utt_suffix is not None and args.utt_prefix is None:
+        args.utt_modifier_type = "suffix"
+        args.utt_modifier = args.utt_suffix
+    else:
+        raise Exception("Trying to add both prefix and suffix. Choose either of them")
+
     if not os.path.exists(args.output_dir):
         os.makedirs(args.output_dir)
     if not args.fg_interval >= 0:
@@ -57,7 +90,7 @@ def CheckArgs(args):
         raise Exception("Either --fg-noise-dir or --bg-noise-dir must be specified")
     return args
 
-def GetNoiseList(noise_wav_scp_filename):
+def get_noise_list(noise_wav_scp_filename):
     noise_wav_scp_file = open(noise_wav_scp_filename, 'r').readlines()
     noise_wavs = {}
     noise_utts = []
@@ -68,7 +101,7 @@ def GetNoiseList(noise_wav_scp_filename):
         noise_wavs[toks[0]] = wav.rstrip()
     return noise_utts, noise_wavs
 
-def AugmentWav(utt, wav, dur, fg_snr_opts, bg_snr_opts, fg_noise_utts, \
+def augment_wav(utt, wav, dur, fg_snr_opts, bg_snr_opts, fg_noise_utts, \
     bg_noise_utts, noise_wavs, noise2dur, interval, num_opts):
     # This section is common to both foreground and background noises
     new_wav = ""
@@ -117,25 +150,59 @@ def AugmentWav(utt, wav, dur, fg_snr_opts, bg_snr_opts, fg_noise_utts, \
             + start_times_str + " " + snrs_str + " - - |"
     return new_wav
 
-def CopyFileIfExists(utt_suffix, filename, input_dir, output_dir):
-    if os.path.isfile(input_dir + "/" + filename):
-        dict = ParseFileToDict(input_dir + "/" + filename,
+def get_new_id(utt, utt_modifier_type, utt_modifier):
+    """ Return utt with utt_modifier attached: "<utt_modifier>-<utt>" for type "prefix",
+        "<utt>-<utt_modifier>" for type "suffix"; otherwise (or if the modifier is empty) utt unchanged.
+        E.g. get_new_id("swb0035", "prefix", "rvb") returns the string "rvb-swb0035"
+    """
+    if utt_modifier_type == "suffix" and len(utt_modifier) > 0:
+        new_utt = utt + "-" + utt_modifier
+    elif utt_modifier_type == "prefix" and len(utt_modifier) > 0:
+        new_utt = utt_modifier + "-" + utt
+    else:
+        new_utt = utt
+
+    return new_utt
+
+def copy_file_if_exists(input_file, output_file, utt_modifier_type,
+                        utt_modifier, fields=[0]):
+    if os.path.isfile(input_file):
+        clean_dict = parse_file_to_dict(input_file,
             value_processor = lambda x: " ".join(x))
-        if len(utt_suffix) > 0:
-            new_dict = {}
-            for key in dict.keys():
-                new_dict[key + "-" + utt_suffix] = dict[key]
-            dict = new_dict
-        WriteDictToFile(dict, output_dir + "/" + filename)
+        new_dict = {}
+        for key in clean_dict.keys():
+            modified_key = get_new_id(key, utt_modifier_type, utt_modifier)
+            if len(fields) > 1:
+                values = clean_dict[key].split(" ")
+                modified_values = values
+                for idx in range(1, len(fields)):
+                    modified_values[idx-1] = get_new_id(values[idx-1],
+                                            utt_modifier_type, utt_modifier)
+                new_dict[modified_key] = " ".join(modified_values)
+            else:
+                new_dict[modified_key] = clean_dict[key]
+        write_dict_to_file(new_dict, output_file)
+
+def create_augmented_utt2uniq(input_dir, output_dir,
+                            utt_modifier_type, utt_modifier):
+    clean_utt2spk_file = input_dir + "/utt2spk"
+    clean_utt2spk_dict = parse_file_to_dict(clean_utt2spk_file,
+                            value_processor = lambda x: " ".join(x))
+    augmented_utt2uniq_dict = {}
+    for key in clean_utt2spk_dict.keys():
+        modified_key = get_new_id(key, utt_modifier_type, utt_modifier)
+        augmented_utt2uniq_dict[modified_key] = key
+    write_dict_to_file(augmented_utt2uniq_dict, output_dir + "/utt2uniq")
 
 def main():
-    args = GetArgs()
-    fg_snrs = [int(i) for i in args.fg_snr_str.split(":")]
-    bg_snrs = [int(i) for i in args.bg_snr_str.split(":")]
+    args = get_args()
     input_dir = args.input_dir
     output_dir = args.output_dir
+
+    fg_snrs = [int(i) for i in args.fg_snr_str.split(":")]
+    bg_snrs = [int(i) for i in args.bg_snr_str.split(":")]
     num_bg_noises = [int(i) for i in args.num_bg_noises.split(":")]
-    reco2dur = ParseFileToDict(input_dir + "/reco2dur",
+    reco2dur = parse_file_to_dict(input_dir + "/reco2dur",
         value_processor = lambda x: float(x[0]))
     wav_scp_file = open(input_dir + "/wav.scp", 'r').readlines()
 
@@ -147,18 +214,18 @@ def main():
     # Load background noises
     if args.bg_noise_dir:
         bg_noise_wav_filename = args.bg_noise_dir + "/wav.scp"
-        bg_noise_utts, bg_noise_wavs = GetNoiseList(bg_noise_wav_filename)
-        bg_noise_reco2dur = ParseFileToDict(args.bg_noise_dir + "/reco2dur",
+        bg_noise_utts, bg_noise_wavs = get_noise_list(bg_noise_wav_filename)
+        bg_noise_reco2dur = parse_file_to_dict(args.bg_noise_dir + "/reco2dur",
             value_processor = lambda x: float(x[0]))
         noise_wavs.update(bg_noise_wavs)
         noise_reco2dur.update(bg_noise_reco2dur)
 
-    # Load background noises
+    # Load foreground noises
     if args.fg_noise_dir:
         fg_noise_wav_filename = args.fg_noise_dir + "/wav.scp"
         fg_noise_reco2dur_filename = args.fg_noise_dir + "/reco2dur"
-        fg_noise_utts, fg_noise_wavs = GetNoiseList(fg_noise_wav_filename)
-        fg_noise_reco2dur = ParseFileToDict(args.fg_noise_dir + "/reco2dur",
+        fg_noise_utts, fg_noise_wavs = get_noise_list(fg_noise_wav_filename)
+        fg_noise_reco2dur = parse_file_to_dict(args.fg_noise_dir + "/reco2dur",
             value_processor = lambda x: float(x[0]))
         noise_wavs.update(fg_noise_wavs)
         noise_reco2dur.update(fg_noise_reco2dur)
@@ -173,24 +240,58 @@ def main():
         utt = toks[0]
         wav = " ".join(toks[1:])
         dur = reco2dur[utt]
-        new_wav = AugmentWav(utt, wav, dur, fg_snrs, bg_snrs, fg_noise_utts,
+        new_wav = augment_wav(utt, wav, dur, fg_snrs, bg_snrs, fg_noise_utts,
             bg_noise_utts, noise_wavs, noise_reco2dur, args.fg_interval,
             num_bg_noises)
-        new_utt = utt + "-" + args.utt_suffix
+
+        new_utt = get_new_id(utt, args.utt_modifier_type, args.utt_modifier)
+
         new_utt2wav[new_utt] = new_wav
 
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
 
-    WriteDictToFile(new_utt2wav, output_dir + "/wav.scp")
-    CopyFileIfExists(args.utt_suffix, "reco2dur", input_dir, output_dir)
-    CopyFileIfExists(args.utt_suffix, "utt2dur", input_dir, output_dir)
-    CopyFileIfExists(args.utt_suffix, "utt2spk", input_dir, output_dir)
-    CopyFileIfExists(args.utt_suffix, "utt2lang", input_dir, output_dir)
-    CopyFileIfExists(args.utt_suffix, "text", input_dir, output_dir)
-    CopyFileIfExists(args.utt_suffix, "utt2spk", input_dir, output_dir)
-    CopyFileIfExists(args.utt_suffix, "vad.scp", input_dir, output_dir)
-    CopyFileIfExists("", "spk2gender", input_dir, output_dir)
+    write_dict_to_file(new_utt2wav, output_dir + "/wav.scp")
+    copy_file_if_exists(input_dir + "/reco2dur", output_dir + "/reco2dur",
+                                args.utt_modifier_type, args.utt_modifier)
+    copy_file_if_exists(input_dir + "/utt2dur", output_dir + "/utt2dur",
+                                args.utt_modifier_type, args.utt_modifier)
+
+    # Check whether to modify the speaker id or not while creating utt2spk file
+    fields = ([0, 1] if args.modify_spk_id else [0])
+    copy_file_if_exists(input_dir + "/utt2spk", output_dir + "/utt2spk",
+                        args.utt_modifier_type, args.utt_modifier, fields=fields)
+    copy_file_if_exists(input_dir + "/utt2lang", output_dir + "/utt2lang",
+                        args.utt_modifier_type, args.utt_modifier)
+    copy_file_if_exists(input_dir + "/utt2num_frames", output_dir + "/utt2num_frames",
+                        args.utt_modifier_type, args.utt_modifier)
+    copy_file_if_exists(input_dir + "/text", output_dir + "/text", args.utt_modifier_type,
+                        args.utt_modifier)
+    copy_file_if_exists(input_dir + "/segments", output_dir + "/segments",
+                        args.utt_modifier_type, args.utt_modifier, fields=[0, 1])
+    copy_file_if_exists(input_dir + "/vad.scp", output_dir + "/vad.scp",
+                        args.utt_modifier_type, args.utt_modifier)
+    copy_file_if_exists(input_dir + "/reco2file_and_channel",
+                        output_dir + "/reco2file_and_channel",
+                        args.utt_modifier_type, args.utt_modifier, fields=[0, 1])
+
+    if args.modify_spk_id:
+        copy_file_if_exists(input_dir + "/spk2gender", output_dir + "/spk2gender",
+                        args.utt_modifier_type, args.utt_modifier)
+    else:
+        copy_file_if_exists(input_dir + "/spk2gender", output_dir + "/spk2gender", None, "")
+
+    # Create utt2uniq file
+    if os.path.isfile(input_dir + "/utt2uniq"):
+        copy_file_if_exists(input_dir + "/utt2uniq", output_dir + "/utt2uniq",
+                        args.utt_modifier_type, args.utt_modifier, fields=[0])
+    else:
+        create_augmented_utt2uniq(input_dir, output_dir,
+                        args.utt_modifier_type, args.utt_modifier)
+
+    data_lib.RunKaldiCommand("utils/utt2spk_to_spk2utt.pl <{output_dir}/utt2spk >{output_dir}/spk2utt"
+                    .format(output_dir = output_dir))
+
     data_lib.RunKaldiCommand("utils/fix_data_dir.sh {output_dir}".format(output_dir = output_dir))
 
 if __name__ == "__main__":
diff --git a/egs/wsj/s5/steps/data/make_musan.py b/egs/wsj/s5/steps/data/make_musan.py
new file mode 100755
index 00000000000..2a7bed453cb
--- /dev/null
+++ b/egs/wsj/s5/steps/data/make_musan.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+# Copyright 2015   David Snyder
+#           2019   Phani Sankar Nidadavolu
+# Apache 2.0.
+#
+# This file is meant to be invoked by make_musan.sh.
+
+import os, sys, argparse
+sys.path.append("steps/data/")
+sys.path.insert(0, 'steps/')
+import libs.common as common_lib
+
+def get_args():
+    parser = argparse.ArgumentParser(description="Create MUSAN corpus",
+                        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("--use-vocals", type=str,
+                        dest='use_vocals', default=True,
+                        action=common_lib.StrToBoolAction,
+                        choices=["true", "false"],
+                        help='use vocals from the music corpus')
+    parser.add_argument('--sampling-rate', type=int, default=None,
+                        help="Sampling rate of the source data. If a positive integer is specified with this option, "
+                        "the MUSAN corpus will be resampled to the rate of the source data."
+                        "Original MUSAN corpus is sampled at 16KHz")
+    parser.add_argument("in_dir", help="Input data directory")
+    parser.add_argument("out_dir", help="Output data directory")
+
+    print(' '.join(sys.argv))
+    args = parser.parse_args()
+    args = check_args(args)
+
+    return args
+
+def check_args(args):
+    # Validate the input dir and create the output dir when it does not exist yet.
+    if not os.path.exists(args.in_dir):
+        raise Exception('input dir {0} does not exist'.format(args.in_dir))
+    if not os.path.exists(args.out_dir):
+        print("Preparing {0}/musan...".format(args.out_dir))
+        os.makedirs(args.out_dir)  # the positional argument is out_dir; output_dir does not exist
+    return args
+
+def process_music_annotations(path):
+    # Parse a music ANNOTATIONS file into utt->spk and utt->has_vocals dicts.
+    utt2spk, utt2vocals = {}, {}
+    with open(path, 'r') as annotations:  # close the handle deterministically
+        for line in annotations:
+            utt, genres, vocals, musician = line.rstrip().split()[:4]
+            # For this application, the musician ID isn't important
+            utt2spk[utt] = utt
+            utt2vocals[utt] = vocals == "Y"
+    return utt2spk, utt2vocals
+
+def prepare_music(root_dir, use_vocals, sampling_rate):
+    # Build the utt2spk and wav.scp text for the music part, optionally skipping vocal tracks.
+    utt2vocals = {}
+    utt2spk = {}
+    utt2wav = {}
+    num_good_files, num_bad_files = 0, 0
+    music_dir = os.path.join(root_dir, "music")
+    for root, dirs, files in os.walk(music_dir):
+        for file in files:
+            file_path = os.path.join(root, file)
+            if file.endswith(".wav"):
+                utt = str(file).replace(".wav", "")
+                utt2wav[utt] = file_path
+            elif str(file) == "ANNOTATIONS":
+                utt2spk_part, utt2vocals_part = process_music_annotations(file_path)
+                utt2spk.update(utt2spk_part)
+                utt2vocals.update(utt2vocals_part)
+
+    utt2spk_str = ""
+    utt2wav_str = ""
+    for utt in utt2vocals:
+        if utt in utt2wav:
+            if use_vocals or not utt2vocals[utt]:
+                utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
+                if sampling_rate is not None and sampling_rate != 16000:  # resample only when a non-native rate is given
+                    utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r" \
+                                    " {fs} -t wav - |\n".format(fs=sampling_rate)
+                else:
+                    utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
+            num_good_files += 1
+        else:
+            print("Missing file {}".format(utt))
+            num_bad_files += 1
+    print("In music directory, processed {} files; {} had missing wav data".format(
+                                                    num_good_files, num_bad_files))
+    return utt2spk_str, utt2wav_str
+
+
+def prepare_speech(root_dir, sampling_rate):
+    # Build the utt2spk and wav.scp text for the speech part; each file is its own speaker.
+    utt2spk = {}
+    utt2wav = {}
+    num_good_files, num_bad_files = 0, 0
+    speech_dir = os.path.join(root_dir, "speech")
+    for root, dirs, files in os.walk(speech_dir):
+        for file in files:
+            file_path = os.path.join(root, file)
+            if file.endswith(".wav"):
+                utt = str(file).replace(".wav", "")
+                utt2wav[utt] = file_path
+                utt2spk[utt] = utt
+
+    utt2spk_str = ""
+    utt2wav_str = ""
+    for utt in utt2spk:
+        if utt in utt2wav:
+            utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
+            if sampling_rate is not None and sampling_rate != 16000:  # resample only when a non-native rate is given
+                utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r" \
+                        " {fs} -t wav - |\n".format(fs=sampling_rate)
+            else:
+                utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
+            num_good_files += 1
+        else:
+            print("Missing file {}".format(utt))
+            num_bad_files += 1
+    print("In speech directory, processed {} files; {} had missing wav data".format(
+                                                    num_good_files, num_bad_files))
+    return utt2spk_str, utt2wav_str
+
+
+def prepare_noise(root_dir, sampling_rate):
+    # Build the utt2spk and wav.scp text for the noise part; each file is its own speaker.
+    utt2spk = {}
+    utt2wav = {}
+    num_good_files, num_bad_files = 0, 0
+    noise_dir = os.path.join(root_dir, "noise")
+    for root, dirs, files in os.walk(noise_dir):
+        for file in files:
+            file_path = os.path.join(root, file)
+            if file.endswith(".wav"):
+                utt = str(file).replace(".wav", "")
+                utt2wav[utt] = file_path
+                utt2spk[utt] = utt
+
+    utt2spk_str = ""
+    utt2wav_str = ""
+    for utt in utt2spk:
+        if utt in utt2wav:
+            utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
+            if sampling_rate is not None and sampling_rate != 16000:  # resample only when a non-native rate is given
+                utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r " \
+                                    "{fs} -t wav - |\n".format(fs=sampling_rate)
+            else:
+                utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
+            num_good_files += 1
+        else:
+            print("Missing file {}".format(utt))
+            num_bad_files += 1
+    print("In noise directory, processed {} files; {} had missing wav data".format(
+                                    num_good_files, num_bad_files))
+    return utt2spk_str, utt2wav_str
+
+
+def main():
+    args = get_args()
+    in_dir = args.in_dir
+    out_dir = args.out_dir
+    use_vocals = args.use_vocals
+    sampling_rate = args.sampling_rate
+
+    utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals, sampling_rate)
+    utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, sampling_rate)
+    utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, sampling_rate)
+
+    utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise
+    utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise
+    with open(os.path.join(out_dir, "wav.scp"), 'w') as wav_fi:  # flushed and closed on exit
+        wav_fi.write(utt2wav)
+    with open(os.path.join(out_dir, "utt2spk"), 'w') as utt2spk_fi:
+        utt2spk_fi.write(utt2spk)
+
+
+if __name__=="__main__":
+    main()
diff --git a/egs/wsj/s5/steps/data/make_musan.sh b/egs/wsj/s5/steps/data/make_musan.sh
new file mode 100755
index 00000000000..40ec9b9a279
--- /dev/null
+++ b/egs/wsj/s5/steps/data/make_musan.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+# Copyright 2015   David Snyder
+#           2019   Phani Sankar Nidadavolu
+# Apache 2.0.
+#
+# This script creates the MUSAN data directory <output_dir>/musan and splits it
+# into <output_dir>/musan_music, musan_speech and musan_noise, which are
+# used to create augmented data.
+# The required dataset is freely available at http://www.openslr.org/17/
+
+# The corpus can be cited as follows:
+# @misc{musan2015,
+#  author = {David Snyder and Guoguo Chen and Daniel Povey},
+#  title = {{MUSAN}: {A} {M}usic, {S}peech, and {N}oise {C}orpus},
+#  year = {2015},
+#  eprint = {1510.08484},
+#  note = {arXiv:1510.08484v1}
+# }
+
+set -e
+use_vocals=true
+sampling_rate=16000
+stage=0
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# -ne 2 ]; then
+    echo USAGE: $0 input_dir output_dir
+    echo input_dir is the path where the MUSAN corpus is located
+    echo e.g: $0 /export/corpora/JHU/musan data
+    echo "main options (for others, see top of script file)"
+    echo "  --sampling-rate <sampling frequency>        # Sampling frequency of source dir"
+    echo "  --use-vocals <true/false>        # Use vocals from music portion of MUSAN corpus"
+    exit 1;
+fi
+
+in_dir=$1
+data_dir=$2
+
+mkdir -p local/musan.tmp
+# Remove the scratch directory on every exit, including failures under set -e.
+trap 'rm -rf local/musan.tmp' EXIT
+
+# The below script will create the musan corpus
+steps/data/make_musan.py --use-vocals "${use_vocals}" \
+                        --sampling-rate "${sampling_rate}" \
+                        "${in_dir}" "${data_dir}/musan" || exit 1;
+
+utils/fix_data_dir.sh "${data_dir}/musan"
+
+grep "music" "${data_dir}/musan/utt2spk" > local/musan.tmp/utt2spk_music
+grep "speech" "${data_dir}/musan/utt2spk" > local/musan.tmp/utt2spk_speech
+grep "noise" "${data_dir}/musan/utt2spk" > local/musan.tmp/utt2spk_noise
+
+utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \
+        "${data_dir}/musan" "${data_dir}/musan_music"
+utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \
+        "${data_dir}/musan" "${data_dir}/musan_speech"
+utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \
+        "${data_dir}/musan" "${data_dir}/musan_noise"
+
+utils/fix_data_dir.sh "${data_dir}/musan_music"
+utils/fix_data_dir.sh "${data_dir}/musan_speech"
+utils/fix_data_dir.sh "${data_dir}/musan_noise"
+
+for name in speech noise music; do
+    utils/data/get_reco2dur.sh "${data_dir}/musan_${name}"
+done
diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py
index b1745a4b723..1fc6bf7055c 100755
--- a/egs/wsj/s5/steps/data/reverberate_data_dir.py
+++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 # Copyright 2016  Tom Ko
 #           2018  David Snyder
+#           2019  Phani Sankar Nidadavolu
 # Apache 2.0
 # script to generate reverberated data
 
@@ -9,7 +10,7 @@
 
 data_lib = imp.load_source('dml', 'steps/data/data_dir_manipulation_lib.py')
 
-def GetArgs():
+def get_args():
     # we add required arguments as named arguments for readability
     parser = argparse.ArgumentParser(description="Reverberate the data directory with an option "
                                                  "to add isotropic and point source noises. "
@@ -79,15 +80,11 @@ def GetArgs():
     print(' '.join(sys.argv))
 
     args = parser.parse_args()
-    args = CheckArgs(args)
+    args = check_args(args)
 
     return args
 
-def CheckArgs(args):
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
-
-    ## Check arguments
+def check_args(args):
     if args.prefix is None:
         if args.num_replicas > 1 or args.include_original_data == "true":
             args.prefix = "rvb"
@@ -121,39 +118,42 @@ def CheckArgs(args):
 
 
 class list_cyclic_iterator(object):
-  def __init__(self, list):
-    self.list_index = 0
-    self.list = list
-    random.shuffle(self.list)
-
-  def __next__(self):
-    item = self.list[self.list_index]
-    self.list_index = (self.list_index + 1) % len(self.list)
-    return item
-
-  next = __next__  # for Python 2
-
-# This functions picks an item from the collection according to the associated probability distribution.
-# The probability estimate of each item in the collection is stored in the "probability" field of
-# the particular item. x : a collection (list or dictionary) where the values contain a field called probability
-def PickItemWithProbability(x):
-   if isinstance(x, dict):
-     plist = list(set(x.values()))
-   else:
-     plist = x
-   total_p = sum(item.probability for item in plist)
-   p = random.uniform(0, total_p)
-   accumulate_p = 0
-   for item in plist:
-      if accumulate_p + item.probability >= p:
-         return item
-      accumulate_p += item.probability
-   assert False, "Shouldn't get here as the accumulated probability should always equal to 1"
-
-
-# This function parses a file and pack the data into a dictionary
-# It is useful for parsing file like wav.scp, utt2spk, text...etc
-def ParseFileToDict(file, assert2fields = False, value_processor = None):
+    def __init__(self, list):
+        self.list_index = 0  # index of the item the next call will return
+        self.list = list  # stores a reference to the caller's list, not a copy
+        random.shuffle(self.list)  # in-place shuffle, so the caller's list is reordered too
+
+    def __next__(self):
+        item = self.list[self.list_index]
+        self.list_index = (self.list_index + 1) % len(self.list)  # wrap to the start after the last item
+        return item
+
+    next = __next__  # for Python 2
+
+def pick_item_with_probability(x):
+    """ Pick one item from x at random, weighted by each item's "probability" field.
+        x : a list, or a dictionary whose de-duplicated values are the candidates.
+    """
+    if isinstance(x, dict):
+        # De-duplicate in insertion order (Python 3.7+): set() iteration order follows
+        # object hashes and may differ between runs even with a fixed random seed.
+        plist = list(dict.fromkeys(x.values()))
+    else:
+        plist = x
+    total_p = sum(item.probability for item in plist)
+    p = random.uniform(0, total_p)
+    accumulate_p = 0
+    for item in plist:
+        accumulate_p += item.probability
+        if accumulate_p >= p:
+            return item
+    raise AssertionError("No item could be picked: the collection is empty or a probability is invalid")
+
+
+def parse_file_to_dict(file, assert2fields = False, value_processor = None):
+    """ This function parses a file and packs the data into a dictionary
+        It is useful for parsing files like wav.scp, utt2spk, text, etc.
+    """
     if value_processor is None:
         value_processor = lambda x: x[0]
     dict = {}
@@ -165,8 +165,9 @@ def ParseFileToDict(file, assert2fields = False, value_processor = None):
         dict[parts[0]] = value_processor(parts[1:])
     return dict
 
-# This function creates a file and write the content of a dictionary into it
-def WriteDictToFile(dict, file_name):
+def write_dict_to_file(dict, file_name):
+    """ This function creates a file and writes the content of a dictionary into it
+    """
     file = open(file_name, 'w')
     keys = sorted(dict.keys())
     for key in keys:
@@ -180,11 +181,12 @@ def WriteDictToFile(dict, file_name):
     file.close()
 
 
-# This function creates the utt2uniq file from the utterance id in utt2spk file
-def CreateCorruptedUtt2uniq(input_dir, output_dir, num_replicas, include_original, prefix):
+def create_corrupted_utt2uniq(input_dir, output_dir, num_replicas, include_original, prefix):
+    """This function creates the utt2uniq file from the utterance id in utt2spk file
+    """
     corrupted_utt2uniq = {}
     # Parse the utt2spk to get the utterance id
-    utt2spk = ParseFileToDict(input_dir + "/utt2spk", value_processor = lambda x: " ".join(x))
+    utt2spk = parse_file_to_dict(input_dir + "/utt2spk", value_processor = lambda x: " ".join(x))
     keys = sorted(utt2spk.keys())
     if include_original:
         start_index = 0
@@ -193,13 +195,13 @@ def CreateCorruptedUtt2uniq(input_dir, output_dir, num_replicas, include_origina
 
     for i in range(start_index, num_replicas+1):
         for utt_id in keys:
-            new_utt_id = GetNewId(utt_id, prefix, i)
+            new_utt_id = get_new_id(utt_id, prefix, i)
             corrupted_utt2uniq[new_utt_id] = utt_id
 
-    WriteDictToFile(corrupted_utt2uniq, output_dir + "/utt2uniq")
+    write_dict_to_file(corrupted_utt2uniq, output_dir + "/utt2uniq")
 
 
-def AddPointSourceNoise(noise_addition_descriptor,  # descriptor to store the information of the noise added
+def add_point_source_noise(noise_addition_descriptor,  # descriptor to store the information of the noise added
                         room,  # the room selected
                         pointsource_noise_list, # the point source noise list
                         pointsource_noise_addition_probability, # Probability of adding point-source noises
@@ -211,8 +213,8 @@ def AddPointSourceNoise(noise_addition_descriptor,  # descriptor to store the in
     if len(pointsource_noise_list) > 0 and random.random() < pointsource_noise_addition_probability and max_noises_recording >= 1:
         for k in range(random.randint(1, max_noises_recording)):
             # pick the RIR to reverberate the point-source noise
-            noise = PickItemWithProbability(pointsource_noise_list)
-            noise_rir = PickItemWithProbability(room.rir_list)
+            noise = pick_item_with_probability(pointsource_noise_list)
+            noise_rir = pick_item_with_probability(room.rir_list)
             # If it is a background noise, the noise will be extended and be added to the whole speech
             # if it is a foreground noise, the noise will not extended and be added at a random time of the speech
             if noise.bg_fg_type == "background":
@@ -233,10 +235,7 @@ def AddPointSourceNoise(noise_addition_descriptor,  # descriptor to store the in
     return noise_addition_descriptor
 
 
-# This function randomly decides whether to reverberate, and sample a RIR if it does
-# It also decides whether to add the appropriate noises
-# This function return the string of options to the binary wav-reverberate
-def GenerateReverberationOpts(room_dict,  # the room dictionary, please refer to MakeRoomDict() for the format
+def generate_reverberation_opts(room_dict,  # the room dictionary, please refer to make_room_dict() for the format
                               pointsource_noise_list, # the point source noise list
                               iso_noise_dict, # the isotropic noise dictionary
                               foreground_snrs, # the SNR for adding the foreground noises
@@ -247,15 +246,19 @@ def GenerateReverberationOpts(room_dict,  # the room dictionary, please refer to
                               speech_dur,  # duration of the recording
                               max_noises_recording  # Maximum number of point-source noises that can be added
                               ):
+    """ This function randomly decides whether to reverberate, and samples a RIR if it does
+        It also decides whether to add the appropriate noises
+        This function returns the string of options to the binary wav-reverberate
+    """
     reverberate_opts = ""
     noise_addition_descriptor = {'noise_io': [],
                                  'start_times': [],
                                  'snrs': []}
     # Randomly select the room
     # Here the room probability is a sum of the probabilities of the RIRs recorded in the room.
-    room = PickItemWithProbability(room_dict)
+    room = pick_item_with_probability(room_dict)
     # Randomly select the RIR in the room
-    speech_rir = PickItemWithProbability(room.rir_list)
+    speech_rir = pick_item_with_probability(room.rir_list)
     if random.random() < speech_rvb_probability:
         # pick the RIR to reverberate the speech
         reverberate_opts += """--impulse-response="{0}" """.format(speech_rir.rir_rspecifier)
@@ -265,7 +268,7 @@ def GenerateReverberationOpts(room_dict,  # the room dictionary, please refer to
         rir_iso_noise_list = iso_noise_dict[speech_rir.room_id]
     # Add the corresponding isotropic noise associated with the selected RIR
     if len(rir_iso_noise_list) > 0 and random.random() < isotropic_noise_addition_probability:
-        isotropic_noise = PickItemWithProbability(rir_iso_noise_list)
+        isotropic_noise = pick_item_with_probability(rir_iso_noise_list)
         # extend the isotropic noise to the length of the speech waveform
         # check if the rspecifier is a pipe or not
         if len(isotropic_noise.noise_rspecifier.split()) == 1:
@@ -275,7 +278,7 @@ def GenerateReverberationOpts(room_dict,  # the room dictionary, please refer to
         noise_addition_descriptor['start_times'].append(0)
         noise_addition_descriptor['snrs'].append(next(background_snrs))
 
-    noise_addition_descriptor = AddPointSourceNoise(noise_addition_descriptor,  # descriptor to store the information of the noise added
+    noise_addition_descriptor = add_point_source_noise(noise_addition_descriptor,  # descriptor to store the information of the noise added
                                                     room,  # the room selected
                                                     pointsource_noise_list, # the point source noise list
                                                     pointsource_noise_addition_probability, # Probability of adding point-source noises
@@ -294,26 +297,23 @@ def GenerateReverberationOpts(room_dict,  # the room dictionary, please refer to
 
     return reverberate_opts
 
-# This function generates a new id from the input id
-# This is needed when we have to create multiple copies of the original data
-# E.g. GetNewId("swb0035", prefix="rvb", copy=1) returns a string "rvb1_swb0035"
-def GetNewId(id, prefix=None, copy=0):
+def get_new_id(id, prefix=None, copy=0):
+    """ This function generates a new id from the input id
+        This is needed when we have to create multiple copies of the original data
+        E.g. get_new_id("swb0035", prefix="rvb", copy=1) returns a string "rvb1-swb0035"
+    """
     if prefix is not None:
-        new_id = prefix + str(copy) + "_" + id
+        new_id = prefix + str(copy) + "-" + id
     else:
         new_id = id
 
     return new_id
 
 
-# This is the main function to generate pipeline command for the corruption
-# The generic command of wav-reverberate will be like:
-# wav-reverberate --duration=t --impulse-response=rir.wav
-# --additive-signals='noise1.wav,noise2.wav' --snrs='snr1,snr2' --start-times='s1,s2' input.wav output.wav
-def GenerateReverberatedWavScp(wav_scp,  # a dictionary whose values are the Kaldi-IO strings of the speech recordings
+def generate_reverberated_wav_scp(wav_scp,  # a dictionary whose values are the Kaldi-IO strings of the speech recordings
                                durations, # a dictionary whose values are the duration (in sec) of the speech recordings
                                output_dir, # output directory to write the corrupted wav.scp
-                               room_dict,  # the room dictionary, please refer to MakeRoomDict() for the format
+                               room_dict,  # the room dictionary, please refer to make_room_dict() for the format
                                pointsource_noise_list, # the point source noise list
                                iso_noise_dict, # the isotropic noise dictionary
                                foreground_snr_array, # the SNR for adding the foreground noises
@@ -327,6 +327,11 @@ def GenerateReverberatedWavScp(wav_scp,  # a dictionary whose values are the Kal
                                pointsource_noise_addition_probability, # Probability of adding point-source noises
                                max_noises_per_minute # maximum number of point-source noises that can be added to a recording according to its duration
                                ):
+    """ This is the main function to generate pipeline command for the corruption
+        The generic command of wav-reverberate will be like:
+        wav-reverberate --duration=t --impulse-response=rir.wav
+        --additive-signals='noise1.wav,noise2.wav' --snrs='snr1,snr2' --start-times='s1,s2' input.wav output.wav
+    """
     foreground_snrs = list_cyclic_iterator(foreground_snr_array)
     background_snrs = list_cyclic_iterator(background_snr_array)
     corrupted_wav_scp = {}
@@ -345,7 +350,7 @@ def GenerateReverberatedWavScp(wav_scp,  # a dictionary whose values are the Kal
             speech_dur = durations[recording_id]
             max_noises_recording = math.floor(max_noises_per_minute * speech_dur / 60)
 
-            reverberate_opts = GenerateReverberationOpts(room_dict,  # the room dictionary, please refer to MakeRoomDict() for the format
+            reverberate_opts = generate_reverberation_opts(room_dict,  # the room dictionary, please refer to make_room_dict() for the format
                                                          pointsource_noise_list, # the point source noise list
                                                          iso_noise_dict, # the isotropic noise dictionary
                                                          foreground_snrs, # the SNR for adding the foreground noises
@@ -363,14 +368,15 @@ def GenerateReverberatedWavScp(wav_scp,  # a dictionary whose values are the Kal
             else:
                 wav_corrupted_pipe = "{0} wav-reverberate --shift-output={1} {2} - - |".format(wav_original_pipe, shift_output, reverberate_opts)
 
-            new_recording_id = GetNewId(recording_id, prefix, i)
+            new_recording_id = get_new_id(recording_id, prefix, i)
             corrupted_wav_scp[new_recording_id] = wav_corrupted_pipe
 
-    WriteDictToFile(corrupted_wav_scp, output_dir + "/wav.scp")
+    write_dict_to_file(corrupted_wav_scp, output_dir + "/wav.scp")
 
 
-# This function replicate the entries in files like segments, utt2spk, text
-def AddPrefixToFields(input_file, output_file, num_replicas, include_original, prefix, field = [0]):
+def add_prefix_to_fields(input_file, output_file, num_replicas, include_original, prefix, field = [0]):
+    """ This function replicates the entries in files like segments, utt2spk, text
+    """
     list = [x.strip() for x in open(input_file, encoding='utf-8')]
     f = open(output_file, "w" ,encoding='utf-8')
     if include_original:
@@ -383,17 +389,16 @@ def AddPrefixToFields(input_file, output_file, num_replicas, include_original, p
             if len(line) > 0 and line[0] != ';':
                 split1 = line.split()
                 for j in field:
-                    split1[j] = GetNewId(split1[j], prefix, i)
+                    split1[j] = get_new_id(split1[j], prefix, i)
                 print(" ".join(split1), file=f)
             else:
                 print(line, file=f)
     f.close()
 
 
-# This function creates multiple copies of the necessary files, e.g. utt2spk, wav.scp ...
-def CreateReverberatedCopy(input_dir,
+def create_reverberated_copy(input_dir,
                            output_dir,
-                           room_dict,  # the room dictionary, please refer to MakeRoomDict() for the format
+                           room_dict,  # the room dictionary, please refer to make_room_dict() for the format
                            pointsource_noise_list, # the point source noise list
                            iso_noise_dict, # the isotropic noise dictionary
                            foreground_snr_string, # the SNR for adding the foreground noises
@@ -407,43 +412,48 @@ def CreateReverberatedCopy(input_dir,
                            pointsource_noise_addition_probability, # Probability of adding point-source noises
                            max_noises_per_minute  # maximum number of point-source noises that can be added to a recording according to its duration
                            ):
-
-    wav_scp = ParseFileToDict(input_dir + "/wav.scp", value_processor = lambda x: " ".join(x))
+    """ This function creates multiple copies of the necessary files,
+        e.g. utt2spk, wav.scp ...
+    """
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    wav_scp = parse_file_to_dict(input_dir + "/wav.scp", value_processor = lambda x: " ".join(x))
     if not os.path.isfile(input_dir + "/reco2dur"):
         print("Getting the duration of the recordings...");
         data_lib.RunKaldiCommand("utils/data/get_reco2dur.sh {}".format(input_dir))
-    durations = ParseFileToDict(input_dir + "/reco2dur", value_processor = lambda x: float(x[0]))
+    durations = parse_file_to_dict(input_dir + "/reco2dur", value_processor = lambda x: float(x[0]))
     foreground_snr_array = [float(x) for x in foreground_snr_string.split(':')]
     background_snr_array = [float(x) for x in background_snr_string.split(':')]
 
-    GenerateReverberatedWavScp(wav_scp, durations, output_dir, room_dict, pointsource_noise_list, iso_noise_dict,
+    generate_reverberated_wav_scp(wav_scp, durations, output_dir, room_dict, pointsource_noise_list, iso_noise_dict,
                foreground_snr_array, background_snr_array, num_replicas, include_original, prefix,
                speech_rvb_probability, shift_output, isotropic_noise_addition_probability,
                pointsource_noise_addition_probability, max_noises_per_minute)
 
-    AddPrefixToFields(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replicas, include_original, prefix, field = [0,1])
+    add_prefix_to_fields(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replicas, include_original, prefix, field = [0,1])
     data_lib.RunKaldiCommand("utils/utt2spk_to_spk2utt.pl <{output_dir}/utt2spk >{output_dir}/spk2utt"
                     .format(output_dir = output_dir))
 
     if os.path.isfile(input_dir + "/utt2uniq"):
-        AddPrefixToFields(input_dir + "/utt2uniq", output_dir + "/utt2uniq", num_replicas, include_original, prefix, field =[0])
+        add_prefix_to_fields(input_dir + "/utt2uniq", output_dir + "/utt2uniq", num_replicas, include_original, prefix, field =[0])
     else:
         # Create the utt2uniq file
-        CreateCorruptedUtt2uniq(input_dir, output_dir, num_replicas, include_original, prefix)
+        create_corrupted_utt2uniq(input_dir, output_dir, num_replicas, include_original, prefix)
 
     if os.path.isfile(input_dir + "/text"):
-        AddPrefixToFields(input_dir + "/text", output_dir + "/text", num_replicas, include_original, prefix, field =[0])
+        add_prefix_to_fields(input_dir + "/text", output_dir + "/text", num_replicas, include_original, prefix, field =[0])
     if os.path.isfile(input_dir + "/segments"):
-        AddPrefixToFields(input_dir + "/segments", output_dir + "/segments", num_replicas, include_original, prefix, field = [0,1])
+        add_prefix_to_fields(input_dir + "/segments", output_dir + "/segments", num_replicas, include_original, prefix, field = [0,1])
     if os.path.isfile(input_dir + "/reco2file_and_channel"):
-        AddPrefixToFields(input_dir + "/reco2file_and_channel", output_dir + "/reco2file_and_channel", num_replicas, include_original, prefix, field = [0,1])
+        add_prefix_to_fields(input_dir + "/reco2file_and_channel", output_dir + "/reco2file_and_channel", num_replicas, include_original, prefix, field = [0,1])
 
     data_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats --no-text {output_dir}"
                     .format(output_dir = output_dir))
 
 
-# This function smooths the probability distribution in the list
-def SmoothProbabilityDistribution(set_list, smoothing_weight=0.0, target_sum=1.0):
+def smooth_probability_distribution(set_list, smoothing_weight=0.0, target_sum=1.0):
+    """ This function smooths the probability distribution in the list
+    """
     if len(list(set_list)) > 0:
       num_unspecified = 0
       accumulated_prob = 0
@@ -476,10 +486,11 @@ def SmoothProbabilityDistribution(set_list, smoothing_weight=0.0, target_sum=1.0
     return set_list
 
 
-# This function parse the array of rir set parameter strings.
-# It will assign probabilities to those rir sets which don't have a probability
-# It will also check the existence of the rir list files.
-def ParseSetParameterStrings(set_para_array):
+def parse_set_parameter_strings(set_para_array):
+    """ This function parses the array of rir set parameter strings.
+        It will assign probabilities to those rir sets which don't have a probability
+        It will also check the existence of the rir list files.
+    """
     set_list = []
     for set_para in set_para_array:
         set = lambda: None
@@ -495,14 +506,15 @@ def ParseSetParameterStrings(set_para_array):
             raise Exception(set.filename + " not found")
         set_list.append(set)
 
-    return SmoothProbabilityDistribution(set_list)
+    return smooth_probability_distribution(set_list)
 
 
-# This function creates the RIR list
-# Each rir object in the list contains the following attributes:
-# rir_id, room_id, receiver_position_id, source_position_id, rt60, drr, probability
-# Please refer to the help messages in the parser for the meaning of these attributes
-def ParseRirList(rir_set_para_array, smoothing_weight, sampling_rate = None):
+def parse_rir_list(rir_set_para_array, smoothing_weight, sampling_rate = None):
+    """ This function creates the RIR list
+        Each rir object in the list contains the following attributes:
+        rir_id, room_id, receiver_position_id, source_position_id, rt60, drr, probability
+        Please refer to the help messages in the parser for the meaning of these attributes
+    """
     rir_parser = argparse.ArgumentParser()
     rir_parser.add_argument('--rir-id', type=str, required=True, help='This id is unique for each RIR and the noise may associate with a particular RIR by refering to this id')
     rir_parser.add_argument('--room-id', type=str, required=True, help='This is the room that where the RIR is generated')
@@ -515,7 +527,7 @@ def ParseRirList(rir_set_para_array, smoothing_weight, sampling_rate = None):
     rir_parser.add_argument('rir_rspecifier', type=str, help="""rir rspecifier, it can be either a filename or a piped command.
                             E.g. data/impulses/Room001-00001.wav or "sox data/impulses/Room001-00001.wav -t wav - |" """)
 
-    set_list = ParseSetParameterStrings(rir_set_para_array)
+    set_list = parse_set_parameter_strings(rir_set_para_array)
 
     rir_list = []
     for rir_set in set_list:
@@ -528,20 +540,23 @@ def ParseRirList(rir_set_para_array, smoothing_weight, sampling_rate = None):
                 else:
                     rir.rir_rspecifier = "{0} sox -t wav - -r {1} -t wav - |".format(rir.rir_rspecifier, sampling_rate)
 
-        rir_list += SmoothProbabilityDistribution(current_rir_list, smoothing_weight, rir_set.probability)
+        rir_list += smooth_probability_distribution(current_rir_list, smoothing_weight, rir_set.probability)
 
     return rir_list
 
 
-# This dunction checks if the inputs are approximately equal assuming they are floats.
 def almost_equal(value_1, value_2, accuracy = 10**-8):
+    """ Return True when value_1 and value_2 differ by strictly less than accuracy
+        (an absolute tolerance, 1e-8 by default); intended for float comparisons. """
     return abs(value_1 - value_2) < accuracy
 
-# This function converts a list of RIRs into a dictionary of RIRs indexed by the room-id.
-# Its values are objects with two attributes: a local RIR list
-# and the probability of the corresponding room
-# Please look at the comments at ParseRirList() for the attributes that a RIR object contains
-def MakeRoomDict(rir_list):
+
+def make_room_dict(rir_list):
+    """ This function converts a list of RIRs into a dictionary of RIRs indexed by the room-id.
+        Its values are objects with two attributes: a local RIR list
+        and the probability of the corresponding room
+        Please look at the comments at parse_rir_list() for the attributes that a RIR object contains
+    """
     room_dict = {}
     for rir in rir_list:
         if rir.room_id not in room_dict:
@@ -559,15 +574,15 @@ def MakeRoomDict(rir_list):
 
     return room_dict
 
-
-# This function creates the point-source noise list
-# and the isotropic noise dictionary from the noise information file
-# The isotropic noise dictionary is indexed by the room
-# and its value is the corrresponding isotropic noise list
-# Each noise object in the list contains the following attributes:
-# noise_id, noise_type, bg_fg_type, room_linkage, probability, noise_rspecifier
-# Please refer to the help messages in the parser for the meaning of these attributes
-def ParseNoiseList(noise_set_para_array, smoothing_weight, sampling_rate = None):
+def parse_noise_list(noise_set_para_array, smoothing_weight, sampling_rate = None):
+    """ This function creates the point-source noise list
+         and the isotropic noise dictionary from the noise information file
+         The isotropic noise dictionary is indexed by the room
+         and its value is the corresponding isotropic noise list
+         Each noise object in the list contains the following attributes:
+         noise_id, noise_type, bg_fg_type, room_linkage, probability, noise_rspecifier
+         Please refer to the help messages in the parser for the meaning of these attributes
+    """
     noise_parser = argparse.ArgumentParser()
     noise_parser.add_argument('--noise-id', type=str, required=True, help='noise id')
     noise_parser.add_argument('--noise-type', type=str, required=True, help='the type of noise; i.e. isotropic or point-source', choices = ["isotropic", "point-source"])
@@ -579,7 +594,7 @@ def ParseNoiseList(noise_set_para_array, smoothing_weight, sampling_rate = None)
     noise_parser.add_argument('noise_rspecifier', type=str, help="""noise rspecifier, it can be either a filename or a piped command.
                               E.g. type5_noise_cirline_ofc_ambient1.wav or "sox type5_noise_cirline_ofc_ambient1.wav -t wav - |" """)
 
-    set_list = ParseSetParameterStrings(noise_set_para_array)
+    set_list = parse_set_parameter_strings(noise_set_para_array)
 
     pointsource_noise_list = []
     iso_noise_dict = {}
@@ -604,40 +619,42 @@ def ParseNoiseList(noise_set_para_array, smoothing_weight, sampling_rate = None)
             else:
                 current_pointsource_noise_list.append(noise)
 
-        pointsource_noise_list += SmoothProbabilityDistribution(current_pointsource_noise_list, smoothing_weight, noise_set.probability)
+        pointsource_noise_list += smooth_probability_distribution(current_pointsource_noise_list, smoothing_weight, noise_set.probability)
 
     # ensure the point-source noise probabilities sum to 1
-    pointsource_noise_list = SmoothProbabilityDistribution(pointsource_noise_list, smoothing_weight, 1.0)
+    pointsource_noise_list = smooth_probability_distribution(pointsource_noise_list, smoothing_weight, 1.0)
     if len(pointsource_noise_list) > 0:
         assert almost_equal(sum(noise.probability for noise in pointsource_noise_list), 1.0)
 
     # ensure the isotropic noise source probabilities for a given room sum to 1
     for key in iso_noise_dict.keys():
-        iso_noise_dict[key] = SmoothProbabilityDistribution(iso_noise_dict[key])
+        iso_noise_dict[key] = smooth_probability_distribution(iso_noise_dict[key])
         assert almost_equal(sum(noise.probability for noise in iso_noise_dict[key]), 1.0)
 
     return (pointsource_noise_list, iso_noise_dict)
 
 
-def Main():
-    args = GetArgs()
+def main():
+    args = get_args()
+
     random.seed(args.random_seed)
-    rir_list = ParseRirList(args.rir_set_para_array, args.rir_smoothing_weight, args.source_sampling_rate)
+    rir_list = parse_rir_list(args.rir_set_para_array, args.rir_smoothing_weight, args.source_sampling_rate)
     print("Number of RIRs is {0}".format(len(rir_list)))
     pointsource_noise_list = []
     iso_noise_dict = {}
     if args.noise_set_para_array is not None:
-        pointsource_noise_list, iso_noise_dict = ParseNoiseList(args.noise_set_para_array, args.noise_smoothing_weight, args.source_sampling_rate)
+        pointsource_noise_list, iso_noise_dict = parse_noise_list(args.noise_set_para_array,
+                                                                args.noise_smoothing_weight,
+                                                                args.source_sampling_rate)
         print("Number of point-source noises is {0}".format(len(pointsource_noise_list)))
         print("Number of isotropic noises is {0}".format(sum(len(iso_noise_dict[key]) for key in iso_noise_dict.keys())))
-    room_dict = MakeRoomDict(rir_list)
+    room_dict = make_room_dict(rir_list)
 
     if args.include_original_data == "true":
         include_original = True
     else:
         include_original = False
-
-    CreateReverberatedCopy(input_dir = args.input_dir,
+    create_reverberated_copy(input_dir = args.input_dir,
                            output_dir = args.output_dir,
                            room_dict = room_dict,
                            pointsource_noise_list = pointsource_noise_list,
@@ -653,6 +670,10 @@ def Main():
                            pointsource_noise_addition_probability = args.pointsource_noise_addition_probability,
                            max_noises_per_minute = args.max_noises_per_minute)
 
+
+    data_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats --no-text {output_dir}"
+                    .format(output_dir = args.output_dir))
+
 if __name__ == "__main__":
-    Main()
+    main()
 

From 2826b358572d7a8d54a8c73ae7c1e4194821c0da Mon Sep 17 00:00:00 2001
From: phanisankar-nidadavolu
 <32964714+phanisankar-nidadavolu@users.noreply.github.com>
Date: Mon, 13 May 2019 19:43:32 -0400
Subject: [PATCH 111/163] [egs] updated local/musan.sh to
 steps/data/make_musan.sh in speaker id scripts (#3320)

---
 egs/bn_music_speech/v1/local/make_musan.py    | 119 -----------------
 egs/bn_music_speech/v1/local/make_musan.sh    |  37 ------
 egs/bn_music_speech/v1/run.sh                 |   2 +-
 .../v1/local/make_musan.py                    | 119 -----------------
 .../v1/local/make_musan.sh                    |  37 ------
 egs/callhome_diarization/v2/run.sh            |   2 +-
 egs/dihard_2018/v2/local/make_musan.py        | 123 ------------------
 egs/dihard_2018/v2/local/make_musan.sh        |  39 ------
 egs/dihard_2018/v2/run.sh                     |  14 +-
 egs/sitw/v1/local/make_musan.py               | 123 ------------------
 egs/sitw/v1/local/make_musan.sh               |  39 ------
 egs/sitw/v1/run.sh                            |   2 +-
 egs/sitw/v2/run.sh                            |   2 +-
 egs/sre16/v1/local/make_musan.py              | 119 -----------------
 egs/sre16/v1/local/make_musan.sh              |  37 ------
 egs/sre16/v1/run.sh                           |   2 +-
 egs/sre16/v2/run.sh                           |   3 +-
 egs/voxceleb/v1/local/make_musan.py           | 123 ------------------
 egs/voxceleb/v1/local/make_musan.sh           |  39 ------
 egs/voxceleb/v2/run.sh                        |   4 +-
 egs/wsj/s5/steps/data/make_musan.py           |  26 ++--
 21 files changed, 29 insertions(+), 982 deletions(-)
 delete mode 100755 egs/bn_music_speech/v1/local/make_musan.py
 delete mode 100755 egs/bn_music_speech/v1/local/make_musan.sh
 delete mode 100755 egs/callhome_diarization/v1/local/make_musan.py
 delete mode 100755 egs/callhome_diarization/v1/local/make_musan.sh
 delete mode 100755 egs/dihard_2018/v2/local/make_musan.py
 delete mode 100755 egs/dihard_2018/v2/local/make_musan.sh
 delete mode 100755 egs/sitw/v1/local/make_musan.py
 delete mode 100755 egs/sitw/v1/local/make_musan.sh
 delete mode 100755 egs/sre16/v1/local/make_musan.py
 delete mode 100755 egs/sre16/v1/local/make_musan.sh
 delete mode 100755 egs/voxceleb/v1/local/make_musan.py
 delete mode 100755 egs/voxceleb/v1/local/make_musan.sh

diff --git a/egs/bn_music_speech/v1/local/make_musan.py b/egs/bn_music_speech/v1/local/make_musan.py
deleted file mode 100755
index eb739b68180..00000000000
--- a/egs/bn_music_speech/v1/local/make_musan.py
+++ /dev/null
@@ -1,119 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2015   David Snyder
-# Apache 2.0.
-#
-# This file is meant to be invoked by make_musan.sh.
-
-import os, sys
-
-def process_music_annotations(path):
-  utt2spk = {}
-  utt2vocals = {}
-  lines = open(path, 'r').readlines()
-  for line in lines:
-    utt, genres, vocals, musician = line.rstrip().split()[:4]
-    # For this application, the musican ID isn't important
-    utt2spk[utt] = utt
-    utt2vocals[utt] = vocals == "Y"
-  return utt2spk, utt2vocals
-
-def prepare_music(root_dir, use_vocals):
-  utt2vocals = {}
-  utt2spk = {}
-  utt2wav = {}
-  num_good_files = 0
-  num_bad_files = 0
-  music_dir = os.path.join(root_dir, "music")
-  for root, dirs, files in os.walk(music_dir):
-    for file in files:
-      file_path = os.path.join(root, file)
-      if file.endswith(".wav"):
-        utt = str(file).replace(".wav", "")
-        utt2wav[utt] = file_path
-      elif str(file) == "ANNOTATIONS":
-        utt2spk_part, utt2vocals_part = process_music_annotations(file_path)
-        utt2spk.update(utt2spk_part)
-        utt2vocals.update(utt2vocals_part)
-  utt2spk_str = ""
-  utt2wav_str = ""
-  for utt in utt2vocals:
-    if utt in utt2wav:
-      if use_vocals or not utt2vocals[utt]:
-        utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-        utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
-      num_good_files += 1
-    else:
-      print("Missing file {}".format(utt))
-      num_bad_files += 1
-  print("In music directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files))
-  return utt2spk_str, utt2wav_str
-
-def prepare_speech(root_dir):
-  utt2spk = {}
-  utt2wav = {}
-  num_good_files = 0
-  num_bad_files = 0
-  speech_dir = os.path.join(root_dir, "speech")
-  for root, dirs, files in os.walk(speech_dir):
-    for file in files:
-      file_path = os.path.join(root, file)
-      if file.endswith(".wav"):
-        utt = str(file).replace(".wav", "")
-        utt2wav[utt] = file_path
-        utt2spk[utt] = utt
-  utt2spk_str = ""
-  utt2wav_str = ""
-  for utt in utt2spk:
-    if utt in utt2wav:
-      utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-      utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
-      num_good_files += 1
-    else:
-      print("Missing file {}".format(utt))
-      num_bad_files += 1
-  print("In speech directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files))
-  return utt2spk_str, utt2wav_str
-
-def prepare_noise(root_dir):
-  utt2spk = {}
-  utt2wav = {}
-  num_good_files = 0
-  num_bad_files = 0
-  noise_dir = os.path.join(root_dir, "noise")
-  for root, dirs, files in os.walk(noise_dir):
-    for file in files:
-      file_path = os.path.join(root, file)
-      if file.endswith(".wav"):
-        utt = str(file).replace(".wav", "")
-        utt2wav[utt] = file_path
-        utt2spk[utt] = utt
-  utt2spk_str = ""
-  utt2wav_str = ""
-  for utt in utt2spk:
-    if utt in utt2wav:
-      utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-      utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
-      num_good_files += 1
-    else:
-      print("Missing file {}".format(utt))
-      num_bad_files += 1
-  print("In noise directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files))
-  return utt2spk_str, utt2wav_str
-
-def main():
-  in_dir = sys.argv[1]
-  out_dir = sys.argv[2]
-  use_vocals = sys.argv[3] == "Y"
-  utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals)
-  utt2spk_speech, utt2wav_speech = prepare_speech(in_dir)
-  utt2spk_noise, utt2wav_noise = prepare_noise(in_dir)
-  utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise
-  utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise
-  wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w')
-  wav_fi.write(utt2wav)
-  utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w')
-  utt2spk_fi.write(utt2spk)
-
-
-if __name__=="__main__":
-  main()
diff --git a/egs/bn_music_speech/v1/local/make_musan.sh b/egs/bn_music_speech/v1/local/make_musan.sh
deleted file mode 100755
index 694940ad70f..00000000000
--- a/egs/bn_music_speech/v1/local/make_musan.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-# Copyright 2015   David Snyder
-# Apache 2.0.
-#
-# This script, called by ../run.sh, creates the MUSAN
-# data directory. The required dataset is freely available at
-#   http://www.openslr.org/17/
-
-set -e
-in_dir=$1
-data_dir=$2
-use_vocals='Y'
-
-mkdir -p local/musan.tmp
-
-echo "Preparing ${data_dir}/musan..."
-mkdir -p ${data_dir}/musan
-local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals}
-
-utils/fix_data_dir.sh ${data_dir}/musan
-
-grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music
-grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech
-grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise
-utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \
-  ${data_dir}/musan ${data_dir}/musan_music
-utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \
-  ${data_dir}/musan ${data_dir}/musan_speech
-utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \
-  ${data_dir}/musan ${data_dir}/musan_noise
-
-utils/fix_data_dir.sh ${data_dir}/musan_music
-utils/fix_data_dir.sh ${data_dir}/musan_speech
-utils/fix_data_dir.sh ${data_dir}/musan_noise
-
-rm -rf local/musan.tmp
-
diff --git a/egs/bn_music_speech/v1/run.sh b/egs/bn_music_speech/v1/run.sh
index 6cc0531e9d7..08d5c022a9d 100755
--- a/egs/bn_music_speech/v1/run.sh
+++ b/egs/bn_music_speech/v1/run.sh
@@ -20,7 +20,7 @@ vaddir=`pwd`/mfcc
 local/make_bn.sh /export/corpora5/LDC/LDC97S44 \
                  /export/corpora/LDC/LDC97T22 data
 
-local/make_musan.sh /export/corpora/JHU/musan data
+steps/data/make_musan.sh --sampling-rate 16000 /export/corpora/JHU/musan data
 
 steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 30 --cmd "$train_cmd" \
     data/musan_speech exp/make_mfcc $mfccdir
diff --git a/egs/callhome_diarization/v1/local/make_musan.py b/egs/callhome_diarization/v1/local/make_musan.py
deleted file mode 100755
index 7c50adf7c83..00000000000
--- a/egs/callhome_diarization/v1/local/make_musan.py
+++ /dev/null
@@ -1,119 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2015   David Snyder
-# Apache 2.0.
-#
-# This file is meant to be invoked by make_musan.sh.
-
-import os, sys
-
-def process_music_annotations(path):
-  utt2spk = {}
-  utt2vocals = {}
-  lines = open(path, 'r').readlines()
-  for line in lines:
-    utt, genres, vocals, musician = line.rstrip().split()[:4]
-    # For this application, the musican ID isn't important
-    utt2spk[utt] = utt
-    utt2vocals[utt] = vocals == "Y"
-  return utt2spk, utt2vocals
-
-def prepare_music(root_dir, use_vocals):
-  utt2vocals = {}
-  utt2spk = {}
-  utt2wav = {}
-  num_good_files = 0
-  num_bad_files = 0
-  music_dir = os.path.join(root_dir, "music")
-  for root, dirs, files in os.walk(music_dir):
-    for file in files:
-      file_path = os.path.join(root, file)
-      if file.endswith(".wav"):
-        utt = str(file).replace(".wav", "")
-        utt2wav[utt] = file_path
-      elif str(file) == "ANNOTATIONS":
-        utt2spk_part, utt2vocals_part = process_music_annotations(file_path)
-        utt2spk.update(utt2spk_part)
-        utt2vocals.update(utt2vocals_part)
-  utt2spk_str = ""
-  utt2wav_str = ""
-  for utt in utt2vocals:
-    if utt in utt2wav:
-      if use_vocals or not utt2vocals[utt]:
-        utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-        utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n"
-      num_good_files += 1
-    else:
-      print("Missing file: {}".format(utt))
-      num_bad_files += 1
-  print("In music directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files))
-  return utt2spk_str, utt2wav_str
-
-def prepare_speech(root_dir):
-  utt2spk = {}
-  utt2wav = {}
-  num_good_files = 0
-  num_bad_files = 0
-  speech_dir = os.path.join(root_dir, "speech")
-  for root, dirs, files in os.walk(speech_dir):
-    for file in files:
-      file_path = os.path.join(root, file)
-      if file.endswith(".wav"):
-        utt = str(file).replace(".wav", "")
-        utt2wav[utt] = file_path
-        utt2spk[utt] = utt
-  utt2spk_str = ""
-  utt2wav_str = ""
-  for utt in utt2spk:
-    if utt in utt2wav:
-      utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-      utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n"
-      num_good_files += 1
-    else:
-      print("Missing file: {}".format(utt))
-      num_bad_files += 1
-  print("In speech directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files))
-  return utt2spk_str, utt2wav_str
-
-def prepare_noise(root_dir):
-  utt2spk = {}
-  utt2wav = {}
-  num_good_files = 0
-  num_bad_files = 0
-  noise_dir = os.path.join(root_dir, "noise")
-  for root, dirs, files in os.walk(noise_dir):
-    for file in files:
-      file_path = os.path.join(root, file)
-      if file.endswith(".wav"):
-        utt = str(file).replace(".wav", "")
-        utt2wav[utt] = file_path
-        utt2spk[utt] = utt
-  utt2spk_str = ""
-  utt2wav_str = ""
-  for utt in utt2spk:
-    if utt in utt2wav:
-      utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-      utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n"
-      num_good_files += 1
-    else:
-      print("Missing file: {}".format(utt))
-      num_bad_files += 1
-  print("In noise directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files))
-  return utt2spk_str, utt2wav_str
-
-def main():
-  in_dir = sys.argv[1]
-  out_dir = sys.argv[2]
-  use_vocals = sys.argv[3] == "Y"
-  utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals)
-  utt2spk_speech, utt2wav_speech = prepare_speech(in_dir)
-  utt2spk_noise, utt2wav_noise = prepare_noise(in_dir)
-  utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise
-  utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise
-  wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w')
-  wav_fi.write(utt2wav)
-  utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w')
-  utt2spk_fi.write(utt2spk)
-
-
-if __name__=="__main__":
-  main()
diff --git a/egs/callhome_diarization/v1/local/make_musan.sh b/egs/callhome_diarization/v1/local/make_musan.sh
deleted file mode 100755
index 694940ad70f..00000000000
--- a/egs/callhome_diarization/v1/local/make_musan.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-# Copyright 2015   David Snyder
-# Apache 2.0.
-#
-# This script, called by ../run.sh, creates the MUSAN
-# data directory. The required dataset is freely available at
-#   http://www.openslr.org/17/
-
-set -e
-in_dir=$1
-data_dir=$2
-use_vocals='Y'
-
-mkdir -p local/musan.tmp
-
-echo "Preparing ${data_dir}/musan..."
-mkdir -p ${data_dir}/musan
-local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals}
-
-utils/fix_data_dir.sh ${data_dir}/musan
-
-grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music
-grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech
-grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise
-utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \
-  ${data_dir}/musan ${data_dir}/musan_music
-utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \
-  ${data_dir}/musan ${data_dir}/musan_speech
-utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \
-  ${data_dir}/musan ${data_dir}/musan_noise
-
-utils/fix_data_dir.sh ${data_dir}/musan_music
-utils/fix_data_dir.sh ${data_dir}/musan_speech
-utils/fix_data_dir.sh ${data_dir}/musan_noise
-
-rm -rf local/musan.tmp
-
diff --git a/egs/callhome_diarization/v2/run.sh b/egs/callhome_diarization/v2/run.sh
index b79717e2348..e85baa50691 100755
--- a/egs/callhome_diarization/v2/run.sh
+++ b/egs/callhome_diarization/v2/run.sh
@@ -130,7 +130,7 @@ if [ $stage -le 2 ]; then
 
   # Prepare the MUSAN corpus, which consists of music, speech, and noise
   # suitable for augmentation.
-  local/make_musan.sh /export/corpora/JHU/musan data
+  steps/data/make_musan.sh --sampling-rate 8000 /export/corpora/JHU/musan data
 
   # Get the duration of the MUSAN recordings.  This will be used by the
   # script augment_data_dir.py.
diff --git a/egs/dihard_2018/v2/local/make_musan.py b/egs/dihard_2018/v2/local/make_musan.py
deleted file mode 100755
index c4b5c9359b4..00000000000
--- a/egs/dihard_2018/v2/local/make_musan.py
+++ /dev/null
@@ -1,123 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2015   David Snyder
-#           2018   Ewald Enzinger
-# Apache 2.0.
-#
-# Modified version of egs/sre16/v1/local/make_musan.py (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8).
-# This version uses the raw MUSAN audio files (16 kHz) and does not use sox to resample at 8 kHz.
-#
-# This file is meant to be invoked by make_musan.sh.
-
-import os, sys
-
-def process_music_annotations(path):
-  utt2spk = {}
-  utt2vocals = {}
-  lines = open(path, 'r').readlines()
-  for line in lines:
-    utt, genres, vocals, musician = line.rstrip().split()[:4]
-    # For this application, the musican ID isn't important
-    utt2spk[utt] = utt
-    utt2vocals[utt] = vocals == "Y"
-  return utt2spk, utt2vocals
-
-def prepare_music(root_dir, use_vocals):
-  utt2vocals = {}
-  utt2spk = {}
-  utt2wav = {}
-  num_good_files = 0
-  num_bad_files = 0
-  music_dir = os.path.join(root_dir, "music")
-  for root, dirs, files in os.walk(music_dir):
-    for file in files:
-      file_path = os.path.join(root, file)
-      if file.endswith(".wav"):
-        utt = str(file).replace(".wav", "")
-        utt2wav[utt] = file_path
-      elif str(file) == "ANNOTATIONS":
-        utt2spk_part, utt2vocals_part = process_music_annotations(file_path)
-        utt2spk.update(utt2spk_part)
-        utt2vocals.update(utt2vocals_part)
-  utt2spk_str = ""
-  utt2wav_str = ""
-  for utt in utt2vocals:
-    if utt in utt2wav:
-      if use_vocals or not utt2vocals[utt]:
-        utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-        utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
-      num_good_files += 1
-    else:
-      print("Missing file {}".format(utt))
-      num_bad_files += 1
-  print(("In music directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files))
-  return utt2spk_str, utt2wav_str
-
-def prepare_speech(root_dir):
-  utt2spk = {}
-  utt2wav = {}
-  num_good_files = 0
-  num_bad_files = 0
-  speech_dir = os.path.join(root_dir, "speech")
-  for root, dirs, files in os.walk(speech_dir):
-    for file in files:
-      file_path = os.path.join(root, file)
-      if file.endswith(".wav"):
-        utt = str(file).replace(".wav", "")
-        utt2wav[utt] = file_path
-        utt2spk[utt] = utt
-  utt2spk_str = ""
-  utt2wav_str = ""
-  for utt in utt2spk:
-    if utt in utt2wav:
-      utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-      utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
-      num_good_files += 1
-    else:
-      print("Missing file {}".format(utt))
-      num_bad_files += 1
-  print(("In speech directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files))
-  return utt2spk_str, utt2wav_str
-
-def prepare_noise(root_dir):
-  utt2spk = {}
-  utt2wav = {}
-  num_good_files = 0
-  num_bad_files = 0
-  noise_dir = os.path.join(root_dir, "noise")
-  for root, dirs, files in os.walk(noise_dir):
-    for file in files:
-      file_path = os.path.join(root, file)
-      if file.endswith(".wav"):
-        utt = str(file).replace(".wav", "")
-        utt2wav[utt] = file_path
-        utt2spk[utt] = utt
-  utt2spk_str = ""
-  utt2wav_str = ""
-  for utt in utt2spk:
-    if utt in utt2wav:
-      utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-      utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
-      num_good_files += 1
-    else:
-      print("Missing file {}".format(utt))
-      num_bad_files += 1
-  print(("In noise directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files))
-  return utt2spk_str, utt2wav_str
-
-def main():
-  in_dir = sys.argv[1]
-  out_dir = sys.argv[2]
-  use_vocals = sys.argv[3] == "Y"
-  utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals)
-  utt2spk_speech, utt2wav_speech = prepare_speech(in_dir)
-  utt2spk_noise, utt2wav_noise = prepare_noise(in_dir)
-  utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise
-  utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise
-  wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w')
-  wav_fi.write(utt2wav)
-  utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w')
-  utt2spk_fi.write(utt2spk)
-
-
-if __name__=="__main__":
-  main()
diff --git a/egs/dihard_2018/v2/local/make_musan.sh b/egs/dihard_2018/v2/local/make_musan.sh
deleted file mode 100755
index 1565ef0d85c..00000000000
--- a/egs/dihard_2018/v2/local/make_musan.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-# Copyright 2015   David Snyder
-# Apache 2.0.
-#
-# Copy of egs/sre16/v1/local/make_musan.sh (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8).
-#
-# This script, called by ../run.sh, creates the MUSAN
-# data directory. The required dataset is freely available at
-#   http://www.openslr.org/17/
-
-set -e
-in_dir=$1
-data_dir=$2
-use_vocals='Y'
-
-mkdir -p local/musan.tmp
-
-echo "Preparing ${data_dir}/musan..."
-mkdir -p ${data_dir}/musan
-local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals}
-
-utils/fix_data_dir.sh ${data_dir}/musan
-
-grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music
-grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech
-grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise
-utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \
-  ${data_dir}/musan ${data_dir}/musan_music
-utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \
-  ${data_dir}/musan ${data_dir}/musan_speech
-utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \
-  ${data_dir}/musan ${data_dir}/musan_noise
-
-utils/fix_data_dir.sh ${data_dir}/musan_music
-utils/fix_data_dir.sh ${data_dir}/musan_speech
-utils/fix_data_dir.sh ${data_dir}/musan_noise
-
-rm -rf local/musan.tmp
-
diff --git a/egs/dihard_2018/v2/run.sh b/egs/dihard_2018/v2/run.sh
index d330322a5e8..b0fbdcc968c 100755
--- a/egs/dihard_2018/v2/run.sh
+++ b/egs/dihard_2018/v2/run.sh
@@ -118,7 +118,7 @@ if [ $stage -le 2 ]; then
 
   # Prepare the MUSAN corpus, which consists of music, speech, and noise
   # suitable for augmentation.
-  local/make_musan.sh $musan_root data
+  steps/data/make_musan.sh --sampling-rate 16000 $musan_root data
 
   # Get the duration of the MUSAN recordings.  This will be used by the
   # script augment_data_dir.py.
@@ -251,7 +251,7 @@ if [ $stage -le 12 ]; then
 
   # The threshold is in terms of the log likelihood ratio provided by the
   # PLDA scores.  In a perfectly calibrated system, the threshold is 0.
-  # In the following loop, we evaluate DER performance on DIHARD 2018 development 
+  # In the following loop, we evaluate DER performance on DIHARD 2018 development
   # set using some reasonable thresholds for a well-calibrated system.
   for threshold in -0.5 -0.4 -0.3 -0.2 -0.1 -0.05 0 0.05 0.1 0.2 0.3 0.4 0.5; do
     diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \
@@ -276,16 +276,16 @@ if [ $stage -le 12 ]; then
     --threshold $(cat $nnet_dir/tuning/dihard_2018_dev_best) --rttm-channel 1 \
     $nnet_dir/xvectors_dihard_2018_dev/plda_scores $nnet_dir/xvectors_dihard_2018_dev/plda_scores
 
-  # Cluster DIHARD 2018 evaluation set using the best threshold found for the DIHARD 
-  # 2018 development set. The DIHARD 2018 development set is used as the validation 
-  # set to tune the parameters. 
+  # Cluster DIHARD 2018 evaluation set using the best threshold found for the DIHARD
+  # 2018 development set. The DIHARD 2018 development set is used as the validation
+  # set to tune the parameters.
   diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \
     --threshold $(cat $nnet_dir/tuning/dihard_2018_dev_best) --rttm-channel 1 \
     $nnet_dir/xvectors_dihard_2018_eval/plda_scores $nnet_dir/xvectors_dihard_2018_eval/plda_scores
 
   mkdir -p $nnet_dir/results
-  # Compute the DER on the DIHARD 2018 evaluation set. We use the official metrics of   
-  # the DIHARD challenge. The DER is calculated with no unscored collars and including  
+  # Compute the DER on the DIHARD 2018 evaluation set. We use the official metrics of
+  # the DIHARD challenge. The DER is calculated with no unscored collars and including
   # overlapping speech.
   md-eval.pl -r data/dihard_2018_eval/rttm \
     -s $nnet_dir/xvectors_dihard_2018_eval/plda_scores/rttm 2> $nnet_dir/results/threshold.log \
diff --git a/egs/sitw/v1/local/make_musan.py b/egs/sitw/v1/local/make_musan.py
deleted file mode 100755
index 833da0619c9..00000000000
--- a/egs/sitw/v1/local/make_musan.py
+++ /dev/null
@@ -1,123 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2015   David Snyder
-#           2018   Ewald Enzinger
-# Apache 2.0.
-#
-# Modified version of egs/sre16/v1/local/make_musan.py (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8).
-# This version uses the raw MUSAN audio files (16 kHz) and does not use sox to resample at 8 kHz.
-#
-# This file is meant to be invoked by make_musan.sh.
-
-import os, sys
-
-def process_music_annotations(path):
-  utt2spk = {}
-  utt2vocals = {}
-  lines = open(path, 'r').readlines()
-  for line in lines:
-    utt, genres, vocals, musician = line.rstrip().split()[:4]
-    # For this application, the musican ID isn't important
-    utt2spk[utt] = utt
-    utt2vocals[utt] = vocals == "Y"
-  return utt2spk, utt2vocals
-
-def prepare_music(root_dir, use_vocals):
-  utt2vocals = {}
-  utt2spk = {}
-  utt2wav = {}
-  num_good_files = 0
-  num_bad_files = 0
-  music_dir = os.path.join(root_dir, "music")
-  for root, dirs, files in os.walk(music_dir):
-    for file in files:
-      file_path = os.path.join(root, file)
-      if file.endswith(".wav"):
-        utt = str(file).replace(".wav", "")
-        utt2wav[utt] = file_path
-      elif str(file) == "ANNOTATIONS":
-        utt2spk_part, utt2vocals_part = process_music_annotations(file_path)
-        utt2spk.update(utt2spk_part)
-        utt2vocals.update(utt2vocals_part)
-  utt2spk_str = ""
-  utt2wav_str = ""
-  for utt in utt2vocals:
-    if utt in utt2wav:
-      if use_vocals or not utt2vocals[utt]:
-        utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-        utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
-      num_good_files += 1
-    else:
-      print("Missing file {}".format(utt))
-      num_bad_files += 1
-  print("In music directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files))
-  return utt2spk_str, utt2wav_str
-
-def prepare_speech(root_dir):
-  utt2spk = {}
-  utt2wav = {}
-  num_good_files = 0
-  num_bad_files = 0
-  speech_dir = os.path.join(root_dir, "speech")
-  for root, dirs, files in os.walk(speech_dir):
-    for file in files:
-      file_path = os.path.join(root, file)
-      if file.endswith(".wav"):
-        utt = str(file).replace(".wav", "")
-        utt2wav[utt] = file_path
-        utt2spk[utt] = utt
-  utt2spk_str = ""
-  utt2wav_str = ""
-  for utt in utt2spk:
-    if utt in utt2wav:
-      utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-      utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
-      num_good_files += 1
-    else:
-      print("Missing file {}".format(utt))
-      num_bad_files += 1
-  print("In speech directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files))
-  return utt2spk_str, utt2wav_str
-
-def prepare_noise(root_dir):
-  utt2spk = {}
-  utt2wav = {}
-  num_good_files = 0
-  num_bad_files = 0
-  noise_dir = os.path.join(root_dir, "noise")
-  for root, dirs, files in os.walk(noise_dir):
-    for file in files:
-      file_path = os.path.join(root, file)
-      if file.endswith(".wav"):
-        utt = str(file).replace(".wav", "")
-        utt2wav[utt] = file_path
-        utt2spk[utt] = utt
-  utt2spk_str = ""
-  utt2wav_str = ""
-  for utt in utt2spk:
-    if utt in utt2wav:
-      utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-      utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
-      num_good_files += 1
-    else:
-      print("Missing file {}".format(utt))
-      num_bad_files += 1
-  print("In noise directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files))
-  return utt2spk_str, utt2wav_str
-
-def main():
-  in_dir = sys.argv[1]
-  out_dir = sys.argv[2]
-  use_vocals = sys.argv[3] == "Y"
-  utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals)
-  utt2spk_speech, utt2wav_speech = prepare_speech(in_dir)
-  utt2spk_noise, utt2wav_noise = prepare_noise(in_dir)
-  utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise
-  utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise
-  wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w')
-  wav_fi.write(utt2wav)
-  utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w')
-  utt2spk_fi.write(utt2spk)
-
-
-if __name__=="__main__":
-  main()
diff --git a/egs/sitw/v1/local/make_musan.sh b/egs/sitw/v1/local/make_musan.sh
deleted file mode 100755
index 1565ef0d85c..00000000000
--- a/egs/sitw/v1/local/make_musan.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-# Copyright 2015   David Snyder
-# Apache 2.0.
-#
-# Copy of egs/sre16/v1/local/make_musan.sh (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8).
-#
-# This script, called by ../run.sh, creates the MUSAN
-# data directory. The required dataset is freely available at
-#   http://www.openslr.org/17/
-
-set -e
-in_dir=$1
-data_dir=$2
-use_vocals='Y'
-
-mkdir -p local/musan.tmp
-
-echo "Preparing ${data_dir}/musan..."
-mkdir -p ${data_dir}/musan
-local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals}
-
-utils/fix_data_dir.sh ${data_dir}/musan
-
-grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music
-grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech
-grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise
-utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \
-  ${data_dir}/musan ${data_dir}/musan_music
-utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \
-  ${data_dir}/musan ${data_dir}/musan_speech
-utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \
-  ${data_dir}/musan ${data_dir}/musan_noise
-
-utils/fix_data_dir.sh ${data_dir}/musan_music
-utils/fix_data_dir.sh ${data_dir}/musan_speech
-utils/fix_data_dir.sh ${data_dir}/musan_noise
-
-rm -rf local/musan.tmp
-
diff --git a/egs/sitw/v1/run.sh b/egs/sitw/v1/run.sh
index e016f8a4752..797451df263 100755
--- a/egs/sitw/v1/run.sh
+++ b/egs/sitw/v1/run.sh
@@ -137,7 +137,7 @@ if [ $stage -le 4 ]; then
 
   # Prepare the MUSAN corpus, which consists of music, speech, and noise
   # suitable for augmentation.
-  local/make_musan.sh $musan_root data
+  steps/data/make_musan.sh --sampling-rate 16000 $musan_root data
 
   # Get the duration of the MUSAN recordings.  This will be used by the
   # script augment_data_dir.py.
diff --git a/egs/sitw/v2/run.sh b/egs/sitw/v2/run.sh
index 8aeecc18b3f..aad58e4a853 100755
--- a/egs/sitw/v2/run.sh
+++ b/egs/sitw/v2/run.sh
@@ -103,7 +103,7 @@ if [ $stage -le 2 ]; then
 
   # Prepare the MUSAN corpus, which consists of music, speech, and noise
   # suitable for augmentation.
-  local/make_musan.sh $musan_root data
+  steps/data/make_musan.sh --sampling-rate 16000 $musan_root data
 
   # Get the duration of the MUSAN recordings.  This will be used by the
   # script augment_data_dir.py.
diff --git a/egs/sre16/v1/local/make_musan.py b/egs/sre16/v1/local/make_musan.py
deleted file mode 100755
index 7735bd28818..00000000000
--- a/egs/sre16/v1/local/make_musan.py
+++ /dev/null
@@ -1,119 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2015   David Snyder
-# Apache 2.0.
-#
-# This file is meant to be invoked by make_musan.sh.
-
-import os, sys
-
-def process_music_annotations(path):
-  utt2spk = {}
-  utt2vocals = {}
-  lines = open(path, 'r').readlines()
-  for line in lines:
-    utt, genres, vocals, musician = line.rstrip().split()[:4]
-    # For this application, the musican ID isn't important
-    utt2spk[utt] = utt
-    utt2vocals[utt] = vocals == "Y"
-  return utt2spk, utt2vocals
-
-def prepare_music(root_dir, use_vocals):
-  utt2vocals = {}
-  utt2spk = {}
-  utt2wav = {}
-  num_good_files = 0
-  num_bad_files = 0
-  music_dir = os.path.join(root_dir, "music")
-  for root, dirs, files in os.walk(music_dir):
-    for file in files:
-      file_path = os.path.join(root, file)
-      if file.endswith(".wav"):
-        utt = str(file).replace(".wav", "")
-        utt2wav[utt] = file_path
-      elif str(file) == "ANNOTATIONS":
-        utt2spk_part, utt2vocals_part = process_music_annotations(file_path)
-        utt2spk.update(utt2spk_part)
-        utt2vocals.update(utt2vocals_part)
-  utt2spk_str = ""
-  utt2wav_str = ""
-  for utt in utt2vocals:
-    if utt in utt2wav:
-      if use_vocals or not utt2vocals[utt]:
-        utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-        utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n"
-      num_good_files += 1
-    else:
-      print("Missing file {}".format(utt))
-      num_bad_files += 1
-  print("In music directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files))
-  return utt2spk_str, utt2wav_str
-
-def prepare_speech(root_dir):
-  utt2spk = {}
-  utt2wav = {}
-  num_good_files = 0
-  num_bad_files = 0
-  speech_dir = os.path.join(root_dir, "speech")
-  for root, dirs, files in os.walk(speech_dir):
-    for file in files:
-      file_path = os.path.join(root, file)
-      if file.endswith(".wav"):
-        utt = str(file).replace(".wav", "")
-        utt2wav[utt] = file_path
-        utt2spk[utt] = utt
-  utt2spk_str = ""
-  utt2wav_str = ""
-  for utt in utt2spk:
-    if utt in utt2wav:
-      utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-      utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n"
-      num_good_files += 1
-    else:
-      print("Missing file {}".format(utt))
-      num_bad_files += 1
-  print("In speech directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files))
-  return utt2spk_str, utt2wav_str
-
-def prepare_noise(root_dir):
-  utt2spk = {}
-  utt2wav = {}
-  num_good_files = 0
-  num_bad_files = 0
-  noise_dir = os.path.join(root_dir, "noise")
-  for root, dirs, files in os.walk(noise_dir):
-    for file in files:
-      file_path = os.path.join(root, file)
-      if file.endswith(".wav"):
-        utt = str(file).replace(".wav", "")
-        utt2wav[utt] = file_path
-        utt2spk[utt] = utt
-  utt2spk_str = ""
-  utt2wav_str = ""
-  for utt in utt2spk:
-    if utt in utt2wav:
-      utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-      utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n"
-      num_good_files += 1
-    else:
-      print("Missing file {}".format(utt))
-      num_bad_files += 1
-  print("In noise directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files))
-  return utt2spk_str, utt2wav_str
-
-def main():
-  in_dir = sys.argv[1]
-  out_dir = sys.argv[2]
-  use_vocals = sys.argv[3] == "Y"
-  utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals)
-  utt2spk_speech, utt2wav_speech = prepare_speech(in_dir)
-  utt2spk_noise, utt2wav_noise = prepare_noise(in_dir)
-  utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise
-  utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise
-  wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w')
-  wav_fi.write(utt2wav)
-  utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w')
-  utt2spk_fi.write(utt2spk)
-
-
-if __name__=="__main__":
-  main()
diff --git a/egs/sre16/v1/local/make_musan.sh b/egs/sre16/v1/local/make_musan.sh
deleted file mode 100755
index 694940ad70f..00000000000
--- a/egs/sre16/v1/local/make_musan.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-# Copyright 2015   David Snyder
-# Apache 2.0.
-#
-# This script, called by ../run.sh, creates the MUSAN
-# data directory. The required dataset is freely available at
-#   http://www.openslr.org/17/
-
-set -e
-in_dir=$1
-data_dir=$2
-use_vocals='Y'
-
-mkdir -p local/musan.tmp
-
-echo "Preparing ${data_dir}/musan..."
-mkdir -p ${data_dir}/musan
-local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals}
-
-utils/fix_data_dir.sh ${data_dir}/musan
-
-grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music
-grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech
-grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise
-utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \
-  ${data_dir}/musan ${data_dir}/musan_music
-utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \
-  ${data_dir}/musan ${data_dir}/musan_speech
-utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \
-  ${data_dir}/musan ${data_dir}/musan_noise
-
-utils/fix_data_dir.sh ${data_dir}/musan_music
-utils/fix_data_dir.sh ${data_dir}/musan_speech
-utils/fix_data_dir.sh ${data_dir}/musan_noise
-
-rm -rf local/musan.tmp
-
diff --git a/egs/sre16/v1/run.sh b/egs/sre16/v1/run.sh
index 28481e27c3a..2315d7ac78a 100755
--- a/egs/sre16/v1/run.sh
+++ b/egs/sre16/v1/run.sh
@@ -145,7 +145,7 @@ if [ $stage -le 4 ]; then
 
   # Prepare the MUSAN corpus, which consists of music, speech, and noise
   # suitable for augmentation.
-  local/make_musan.sh /export/corpora/JHU/musan data
+  steps/data/make_musan.sh --sampling-rate 8000 /export/corpora/JHU/musan data
 
   # Get the duration of the MUSAN recordings.  This will be used by the
   # script augment_data_dir.py.
diff --git a/egs/sre16/v2/run.sh b/egs/sre16/v2/run.sh
index b2072dfd69d..7780c30560b 100755
--- a/egs/sre16/v2/run.sh
+++ b/egs/sre16/v2/run.sh
@@ -135,7 +135,7 @@ if [ $stage -le 2 ]; then
 
   # Prepare the MUSAN corpus, which consists of music, speech, and noise
   # suitable for augmentation.
-  local/make_musan.sh /export/corpora/JHU/musan data
+  steps/data/make_musan.sh --sampling-rate 8000 /export/corpora/JHU/musan data
 
   # Get the duration of the MUSAN recordings.  This will be used by the
   # script augment_data_dir.py.
@@ -174,6 +174,7 @@ if [ $stage -le 2 ]; then
   utils/copy_data_dir.sh data/swbd_sre_combined data/sre_combined
   utils/filter_scp.pl data/sre/spk2utt data/swbd_sre_combined/spk2utt | utils/spk2utt_to_utt2spk.pl > data/sre_combined/utt2spk
   utils/fix_data_dir.sh data/sre_combined
+
 fi
 
 # Now we prepare the features to generate examples for xvector training.
diff --git a/egs/voxceleb/v1/local/make_musan.py b/egs/voxceleb/v1/local/make_musan.py
deleted file mode 100755
index 565bfce0cc9..00000000000
--- a/egs/voxceleb/v1/local/make_musan.py
+++ /dev/null
@@ -1,123 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2015   David Snyder
-#           2018   Ewald Enzinger
-# Apache 2.0.
-#
-# Modified version of egs/sre16/v1/local/make_musan.py (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8).
-# This version uses the raw MUSAN audio files (16 kHz) and does not use sox to resample at 8 kHz.
-#
-# This file is meant to be invoked by make_musan.sh.
-
-import os, sys
-
-def process_music_annotations(path):
-  utt2spk = {}
-  utt2vocals = {}
-  lines = open(path, 'r').readlines()
-  for line in lines:
-    utt, genres, vocals, musician = line.rstrip().split()[:4]
-    # For this application, the musican ID isn't important
-    utt2spk[utt] = utt
-    utt2vocals[utt] = vocals == "Y"
-  return utt2spk, utt2vocals
-
-def prepare_music(root_dir, use_vocals):
-  utt2vocals = {}
-  utt2spk = {}
-  utt2wav = {}
-  num_good_files = 0
-  num_bad_files = 0
-  music_dir = os.path.join(root_dir, "music")
-  for root, dirs, files in os.walk(music_dir):
-    for file in files:
-      file_path = os.path.join(root, file)
-      if file.endswith(".wav"):
-        utt = str(file).replace(".wav", "")
-        utt2wav[utt] = file_path
-      elif str(file) == "ANNOTATIONS":
-        utt2spk_part, utt2vocals_part = process_music_annotations(file_path)
-        utt2spk.update(utt2spk_part)
-        utt2vocals.update(utt2vocals_part)
-  utt2spk_str = ""
-  utt2wav_str = ""
-  for utt in utt2vocals:
-    if utt in utt2wav:
-      if use_vocals or not utt2vocals[utt]:
-        utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-        utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
-      num_good_files += 1
-    else:
-      print("Missing file {}".format(utt))
-      num_bad_files += 1
-  print("In music directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files))
-  return utt2spk_str, utt2wav_str
-
-def prepare_speech(root_dir):
-  utt2spk = {}
-  utt2wav = {}
-  num_good_files = 0
-  num_bad_files = 0
-  speech_dir = os.path.join(root_dir, "speech")
-  for root, dirs, files in os.walk(speech_dir):
-    for file in files:
-      file_path = os.path.join(root, file)
-      if file.endswith(".wav"):
-        utt = str(file).replace(".wav", "")
-        utt2wav[utt] = file_path
-        utt2spk[utt] = utt
-  utt2spk_str = ""
-  utt2wav_str = ""
-  for utt in utt2spk:
-    if utt in utt2wav:
-      utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-      utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
-      num_good_files += 1
-    else:
-      print("Missing file {}".format(utt))
-      num_bad_files += 1
-  print("In speech directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files))
-  return utt2spk_str, utt2wav_str
-
-def prepare_noise(root_dir):
-  utt2spk = {}
-  utt2wav = {}
-  num_good_files = 0
-  num_bad_files = 0
-  noise_dir = os.path.join(root_dir, "noise")
-  for root, dirs, files in os.walk(noise_dir):
-    for file in files:
-      file_path = os.path.join(root, file)
-      if file.endswith(".wav"):
-        utt = str(file).replace(".wav", "")
-        utt2wav[utt] = file_path
-        utt2spk[utt] = utt
-  utt2spk_str = ""
-  utt2wav_str = ""
-  for utt in utt2spk:
-    if utt in utt2wav:
-      utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-      utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
-      num_good_files += 1
-    else:
-      print("Missing file {}".format(utt))
-      num_bad_files += 1
-  print("In noise directory, processed {} files; {} had missing wav data".format(num_good_files, num_bad_files))
-  return utt2spk_str, utt2wav_str
-
-def main():
-  in_dir = sys.argv[1]
-  out_dir = sys.argv[2]
-  use_vocals = sys.argv[3] == "Y"
-  utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals)
-  utt2spk_speech, utt2wav_speech = prepare_speech(in_dir)
-  utt2spk_noise, utt2wav_noise = prepare_noise(in_dir)
-  utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise
-  utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise
-  wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w')
-  wav_fi.write(utt2wav)
-  utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w')
-  utt2spk_fi.write(utt2spk)
-
-
-if __name__=="__main__":
-  main()
diff --git a/egs/voxceleb/v1/local/make_musan.sh b/egs/voxceleb/v1/local/make_musan.sh
deleted file mode 100755
index 1565ef0d85c..00000000000
--- a/egs/voxceleb/v1/local/make_musan.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-# Copyright 2015   David Snyder
-# Apache 2.0.
-#
-# Copy of egs/sre16/v1/local/make_musan.sh (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8).
-#
-# This script, called by ../run.sh, creates the MUSAN
-# data directory. The required dataset is freely available at
-#   http://www.openslr.org/17/
-
-set -e
-in_dir=$1
-data_dir=$2
-use_vocals='Y'
-
-mkdir -p local/musan.tmp
-
-echo "Preparing ${data_dir}/musan..."
-mkdir -p ${data_dir}/musan
-local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals}
-
-utils/fix_data_dir.sh ${data_dir}/musan
-
-grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music
-grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech
-grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise
-utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \
-  ${data_dir}/musan ${data_dir}/musan_music
-utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \
-  ${data_dir}/musan ${data_dir}/musan_speech
-utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \
-  ${data_dir}/musan ${data_dir}/musan_noise
-
-utils/fix_data_dir.sh ${data_dir}/musan_music
-utils/fix_data_dir.sh ${data_dir}/musan_speech
-utils/fix_data_dir.sh ${data_dir}/musan_noise
-
-rm -rf local/musan.tmp
-
diff --git a/egs/voxceleb/v2/run.sh b/egs/voxceleb/v2/run.sh
index 44340873a80..7c70e4a42c1 100755
--- a/egs/voxceleb/v2/run.sh
+++ b/egs/voxceleb/v2/run.sh
@@ -30,7 +30,7 @@ if [ $stage -le 0 ]; then
   # This script creates data/voxceleb1_test and data/voxceleb1_train for latest version of VoxCeleb1.
   # Our evaluation set is the test portion of VoxCeleb1.
   local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train
-  local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test 
+  local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test
   # if you downloaded the dataset soon after it was released, you will want to use the make_voxceleb1.pl script instead.
   # local/make_voxceleb1.pl $voxceleb1_root data
   # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1.
@@ -84,7 +84,7 @@ if [ $stage -le 2 ]; then
 
   # Prepare the MUSAN corpus, which consists of music, speech, and noise
   # suitable for augmentation.
-  local/make_musan.sh $musan_root data
+  steps/data/make_musan.sh --sampling-rate 16000 $musan_root data
 
   # Get the duration of the MUSAN recordings.  This will be used by the
   # script augment_data_dir.py.
diff --git a/egs/wsj/s5/steps/data/make_musan.py b/egs/wsj/s5/steps/data/make_musan.py
index 2a7bed453cb..80b9d7cf6d4 100755
--- a/egs/wsj/s5/steps/data/make_musan.py
+++ b/egs/wsj/s5/steps/data/make_musan.py
@@ -18,10 +18,10 @@ def get_args():
                         action=common_lib.StrToBoolAction,
                         choices=["true", "false"],
                         help='use vocals from the music corpus')
-    parser.add_argument('--sampling-rate', type=int, default=None,
+    parser.add_argument('--sampling-rate', type=int, default=16000,
                         help="Sampling rate of the source data. If a positive integer is specified with this option, "
                         "the MUSAN corpus will be resampled to the rate of the source data."
-                        "Original MUSAN corpus is sampled at 16KHz")
+                        "Original MUSAN corpus is sampled at 16KHz. Defaults to 16000 Hz")
     parser.add_argument("in_dir", help="Input data directory")
     parser.add_argument("out_dir", help="Output data directory")
 
@@ -75,11 +75,11 @@ def prepare_music(root_dir, use_vocals, sampling_rate):
         if utt in utt2wav:
             if use_vocals or not utt2vocals[utt]:
                 utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-                if sampling_rate is not None or sampling_rate != 16000:
+                if sampling_rate == 16000:
+                    utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
+                else:
                     utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r" \
                                     " {fs} -t wav - |\n".format(fs=sampling_rate)
-                else:
-                    utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
             num_good_files += 1
         else:
             print("Missing file {}".format(utt))
@@ -108,11 +108,11 @@ def prepare_speech(root_dir, sampling_rate):
     for utt in utt2spk:
         if utt in utt2wav:
             utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-            if sampling_rate is not None or sampling_rate != 16000:
-                utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r" \
-                        " {fs} -t wav - |\n".format(fs=sampling_rate)
-            else:
+            if sampling_rate == 16000:
                 utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
+            else:
+                utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r" \
+                                    " {fs} -t wav - |\n".format(fs=sampling_rate)
             num_good_files += 1
         else:
             print("Missing file {}".format(utt))
@@ -141,11 +141,11 @@ def prepare_noise(root_dir, sampling_rate):
     for utt in utt2spk:
         if utt in utt2wav:
             utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
-            if sampling_rate is not None or sampling_rate != 16000:
-                utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r " \
-                                    "{fs} -t wav - |\n".format(fs=sampling_rate)
-            else:
+            if sampling_rate == 16000:
                 utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
+            else:
+                utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r" \
+                                    " {fs} -t wav - |\n".format(fs=sampling_rate)
             num_good_files += 1
         else:
             print("Missing file {}".format(utt))

From c695bbc884fee398d612e8494dfcc1831d79a52c Mon Sep 17 00:00:00 2001
From: "kkm (aka Kirill Katsnelson)" <kkm@smartaction.com>
Date: Tue, 14 May 2019 14:37:40 -0700
Subject: [PATCH 112/163] [src] Fix sample rounding errors in extract-segments
 (#3321)

With a segments file constructed from exact wave file durations,
some segments came out one sample short. The reason is that the
multiplication of the float sample frequency by the double audio
time point is inexact. For example, float 8000.0 multiplied
by double 2.03 yields 16239.99999999999, one LSB short of the
correct sample number 16240.

Also changed all endpoint calculations so that they are performed
in seconds, not sample numbers, as this avoids a conversion in
nearly every comparison; positions in diagnostic messages are now
also reported in seconds, not sample numbers.
---
 src/featbin/extract-segments.cc | 118 +++++++++++++++-----------------
 1 file changed, 57 insertions(+), 61 deletions(-)

diff --git a/src/featbin/extract-segments.cc b/src/featbin/extract-segments.cc
index dd4f5fbb32c..7791fb4307b 100644
--- a/src/featbin/extract-segments.cc
+++ b/src/featbin/extract-segments.cc
@@ -70,18 +70,18 @@ int main(int argc, char *argv[]) {
 
     RandomAccessTableReader<WaveHolder> reader(wav_rspecifier);
     TableWriter<WaveHolder> writer(wav_wspecifier);
-    Input ki(segments_rxfilename);  // no binary argment: never binary.
+    Input ki(segments_rxfilename);  // No binary argument: never binary.
 
     int32 num_lines = 0, num_success = 0;
 
     std::string line;
-    /* read each line from segments file */
+    // Read each line from the segments file.
     while (std::getline(ki.Stream(), line)) {
       num_lines++;
       std::vector<std::string> split_line;
-      // Split the line by space or tab and check the number of fields in each
-      // line. There must be 4 fields--segment name , reacording wav file name,
-      // start time, end time; 5th field (channel info) is optional.
+      // Split the line into whitespace-separated fields and verify their
+      // number. There must be 4 or 5 fields: segment name, recording ID, start
+      // time, end time, and the optional channel number.
       SplitStringToVector(line, " \t\r", true, &split_line);
       if (split_line.size() != 4 && split_line.size() != 5) {
         KALDI_WARN << "Invalid line in segments file: " << line;
@@ -92,8 +92,8 @@ int main(int argc, char *argv[]) {
           start_str = split_line[2],
           end_str = split_line[3];
 
-      // Convert the start time and endtime to real from string. Segment is
-      // ignored if start or end time cannot be converted to real.
+      // Parse the start and end times as float values. Segment is ignored if
+      // either the start or the end time is malformed.
       double start, end;
       if (!ConvertStringToReal(start_str, &start)) {
         KALDI_WARN << "Invalid line in segments file [bad start]: " << line;
@@ -103,24 +103,24 @@ int main(int argc, char *argv[]) {
         KALDI_WARN << "Invalid line in segments file [bad end]: " << line;
         continue;
       }
-      // start time must not be negative; start time must not be greater than
-      // end time, except if end time is -1
-      if (start < 0 || (end != -1.0 && end <= 0) || ((start >= end) && (end > 0))) {
-        KALDI_WARN << "Invalid line in segments file [empty or invalid segment]: "
-                   << line;
+      // Start time must be non-negative and strictly less than the end time,
+      // except if the end time is -1.
+      if (start < 0 || (end != -1.0 && end <= 0) ||
+          ((start >= end) && (end > 0))) {
+        KALDI_WARN << ("Invalid line in segments file "
+                       "[empty or invalid segment]: ") << line;
         continue;
       }
-      int32 channel = -1;  // means channel info is unspecified.
-      // if each line has 5 elements then 5th element must be channel identifier
+      int32 channel = -1;  // -1 means channel is unspecified.
+      // If the line has 5 elements, then the 5th element is the channel number.
       if (split_line.size() == 5) {
         if (!ConvertStringToInteger(split_line[4], &channel) || channel < 0) {
           KALDI_WARN << "Invalid line in segments file [bad channel]: " << line;
           continue;
         }
       }
-      /* check whether a segment start time and end time exists in recording
-       * if fails , skips the segment.
-       */
+
+      // Check whether the recording ID is in wav.scp; if not, skip the segment.
       if (!reader.HasKey(recording)) {
         KALDI_WARN << "Could not find recording " << recording
                    << ", skipping segment " << segment;
@@ -129,74 +129,70 @@ int main(int argc, char *argv[]) {
 
       const WaveData &wave = reader.Value(recording);
       const Matrix<BaseFloat> &wave_data = wave.Data();
-      BaseFloat samp_freq = wave.SampFreq();  // read sampling fequency
-      int32 num_samp = wave_data.NumCols(),  // number of samples in recording
-        num_chan = wave_data.NumRows();  // number of channels in recording
-
-      // Convert starting time of the segment to corresponding sample number.
-      // If end time is -1 then use the whole file starting from start time.
-      int32 start_samp = start * samp_freq,
-          end_samp = (end != -1)? (end * samp_freq) : num_samp;
-      KALDI_ASSERT(start_samp >= 0 && end_samp > 0 && "Invalid start or end.");
-
-      // start sample must be less than total number of samples,
-      // otherwise skip the segment
-      if (start_samp < 0 || start_samp >= num_samp) {
-        KALDI_WARN << "Start sample out of range " << start_samp << " [length:] "
-                   << num_samp << ", skipping segment " << segment;
+      BaseFloat samp_freq = wave.SampFreq();  // Sampling frequency.
+      int32 num_samp = wave_data.NumCols(),  // Number of samples in recording.
+        num_chan = wave_data.NumRows();  // Number of channels in recording.
+      BaseFloat file_length = num_samp / samp_freq;  // In seconds.
+
+      // Start must be within the wave data, otherwise skip the segment.
+      if (start < 0 || start > file_length) {
+        KALDI_WARN << "Segment start is out of file data range [0, "
+                   << file_length << "s]; skipping segment '" << line << "'";
         continue;
       }
-      /* end sample must be less than total number samples
-       * otherwise skip the segment
-       */
-      if (end_samp > num_samp) {
-        if ((end_samp >=
-             num_samp + static_cast<int32>(max_overshoot * samp_freq))) {
-          KALDI_WARN << "End sample too far out of range " << end_samp
-                     << " [length:] " << num_samp << ", skipping segment "
-                     << segment;
-          continue;
-        }
-        end_samp = num_samp;  // for small differences, just truncate.
+
+      // End must be less than the file length adjusted for possible overshoot;
+      // otherwise skip the segment. end == -1 passes the check.
+      if (end > file_length + max_overshoot) {
+        KALDI_WARN << "Segment end is too far out of file data range [0,"
+                   << file_length << "s]; skipping segment '" << line << "'";
+        continue;
       }
-      // Skip if segment size is less than minimum segment length (default 0.1s)
-      if (end_samp <=
-          start_samp + static_cast<int32>(min_segment_length * samp_freq)) {
+
+      // Otherwise ensure the end is not beyond the end of data, and default
+      // end == -1 to the end of file data.
+      if (end < 0 || end > file_length) end = file_length;
+
+      // Skip if segment size is less than the minimum allowed.
+      if (end - start < min_segment_length) {
         KALDI_WARN << "Segment " << segment << " too short, skipping it.";
         continue;
       }
-      /* check whether the wav file has more than one channel
-       * if yes, specify the channel info in segments file
-       * otherwise skips the segment
-       */
+
+      // Check that the channel is specified in the segments file for a multi-
+      // channel file, and that the channel actually exists in the wave data.
       if (channel == -1) {
         if (num_chan == 1) channel = 0;
         else {
-          KALDI_ERR << "If your data has multiple channels, you must specify the"
-              " channel in the segments file.  Processing segment " << segment;
+          KALDI_ERR << ("Your data has multiple channels. You must "
+                        "specify the channel in the segments file. "
+                        "Skipping segment ") << segment;
         }
       } else {
         if (channel >= num_chan) {
           KALDI_WARN << "Invalid channel " << channel << " >= " << num_chan
-                     << ", processing segment " << segment;
+                     << ". Skipping segment " << segment;
           continue;
         }
       }
-      /*
-       * This function  return a portion of a wav data from the orignial wav data matrix
-       */
-      SubMatrix<BaseFloat> segment_matrix(wave_data, channel, 1, start_samp, end_samp-start_samp);
+
+      // Convert endpoints of the segment to sample numbers. Note that the
+      // conversion requires a proper rounding.
+      int32 start_samp = static_cast<int32>(start * samp_freq + 0.5f),
+          end_samp = static_cast<int32>(end * samp_freq + 0.5f);
+
+      // Get the range of data from the original wave_data matrix.
+      SubMatrix<BaseFloat> segment_matrix(wave_data, channel, 1,
+                                          start_samp, end_samp - start_samp);
       WaveData segment_wave(samp_freq, segment_matrix);
-      writer.Write(segment, segment_wave); // write segment in wave format.
+      writer.Write(segment, segment_wave);  // Write the range in wave format.
       num_success++;
     }
     KALDI_LOG << "Successfully processed " << num_success << " lines out of "
               << num_lines << " in the segments file. ";
-    /* prints number of segments processed */
     return 0;
   } catch(const std::exception &e) {
     std::cerr << e.what();
     return -1;
   }
 }
-

From cfa48ebca2cc20cc98fbcc465133b8a898e7c8b8 Mon Sep 17 00:00:00 2001
From: "kkm (aka Kirill Katsnelson)" <kkm@smartaction.com>
Date: Tue, 14 May 2019 14:39:55 -0700
Subject: [PATCH 113/163] [src,scripts] Store frame_shift, utt2{dur,num_frames},
 .conf with features (#3316)

Generate utt2dur and utt2num_frames during feature extraction,
and store frame period in frame_shift file in feature directory.

Copy relevant .conf files used in feature extraction into
the conf/ subdirectory with features.

Add missing validations and options in some extraction scripts.
---
 egs/wsj/s5/steps/make_fbank.sh             |  86 ++++++++++-----
 egs/wsj/s5/steps/make_fbank_pitch.sh       | 117 +++++++++++++-------
 egs/wsj/s5/steps/make_mfcc.sh              |  86 ++++++++++-----
 egs/wsj/s5/steps/make_mfcc_pitch.sh        | 113 ++++++++++++-------
 egs/wsj/s5/steps/make_mfcc_pitch_online.sh | 121 ++++++++++++++-------
 egs/wsj/s5/steps/make_plp.sh               |  84 +++++++++-----
 egs/wsj/s5/steps/make_plp_pitch.sh         | 114 ++++++++++++-------
 src/featbin/compute-fbank-feats.cc         |  70 +++++++-----
 src/featbin/compute-mfcc-feats.cc          |  40 ++++---
 9 files changed, 545 insertions(+), 286 deletions(-)

diff --git a/egs/wsj/s5/steps/make_fbank.sh b/egs/wsj/s5/steps/make_fbank.sh
index 77c48be2e90..29153458f9b 100755
--- a/egs/wsj/s5/steps/make_fbank.sh
+++ b/egs/wsj/s5/steps/make_fbank.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 
-# Copyright 2012-2016  Karel Vesely  Johns Hopkins University (Author: Daniel Povey)
+# Copyright 2012-2016  Karel Vesely
+# Copyright 2012-2016  Johns Hopkins University (Author: Daniel Povey)
 # Apache 2.0
 # To be run from .. (one directory up from here)
 # see ../run.sh for example
@@ -10,23 +11,28 @@ nj=4
 cmd=run.pl
 fbank_config=conf/fbank.conf
 compress=true
-write_utt2num_frames=false  # if true writes utt2num_frames
+write_utt2num_frames=true  # If true writes utt2num_frames.
+write_utt2dur=true
 # End configuration section.
 
-echo "$0 $@"  # Print the command line for logging
+echo "$0 $@"  # Print the command line for logging.
 
 if [ -f path.sh ]; then . ./path.sh; fi
 . parse_options.sh || exit 1;
 
 if [ $# -lt 1 ] || [ $# -gt 3 ]; then
-   echo "Usage: $0 [options] <data-dir> [<log-dir> [<fbank-dir>] ]";
-   echo "e.g.: $0 data/train exp/make_fbank/train mfcc"
-   echo "Note: <log-dir> defaults to <data-dir>/log, and <fbank-dir> defaults to <data-dir>/data"
-   echo "Options: "
-   echo "  --fbank-config <config-file>                     # config passed to compute-fbank-feats "
-   echo "  --nj <nj>                                        # number of parallel jobs"
-   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
-   echo "  --write-utt2num-frames <true|false>     # If true, write utt2num_frames file."
+  cat >&2 <<EOF
+Usage: $0 [options] <data-dir> [<log-dir> [<fbank-dir>] ]
+ e.g.: $0 data/train
+Note: <log-dir> defaults to <data-dir>/log, and
+      <fbank-dir> defaults to <data-dir>/data
+Options:
+  --fbank-config <config-file>         # config passed to compute-fbank-feats.
+  --nj <nj>                            # number of parallel jobs.
+  --cmd <run.pl|queue.pl <queue opts>> # how to run jobs.
+  --write-utt2num-frames <true|false>  # If true, write utt2num_frames file.
+  --write-utt2dur <true|false>         # If true, write utt2dur file.
+EOF
    exit 1;
 fi
 
@@ -64,7 +70,7 @@ required="$scp $fbank_config"
 
 for f in $required; do
   if [ ! -f $f ]; then
-    echo "make_fbank.sh: no such file $f"
+    echo "$0: no such file $f"
     exit 1;
   fi
 done
@@ -91,9 +97,15 @@ else
   write_num_frames_opt=
 fi
 
+if $write_utt2dur; then
+  write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB"
+else
+  write_utt2dur_opt=
+fi
+
 if [ -f $data/segments ]; then
   echo "$0 [info]: segments file exists: using that."
-  split_segments=""
+  split_segments=
   for n in $(seq $nj); do
     split_segments="$split_segments $logdir/segments.$n"
   done
@@ -103,11 +115,11 @@ if [ -f $data/segments ]; then
 
   $cmd JOB=1:$nj $logdir/make_fbank_${name}.JOB.log \
     extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \
-    compute-fbank-feats $vtln_opts --verbose=2 --config=$fbank_config ark:- ark:- \| \
+    compute-fbank-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
+      --config=$fbank_config ark:- ark:- \| \
     copy-feats --compress=$compress $write_num_frames_opt ark:- \
      ark,scp:$fbankdir/raw_fbank_$name.JOB.ark,$fbankdir/raw_fbank_$name.JOB.scp \
      || exit 1;
-
 else
   echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
   split_scps=""
@@ -118,39 +130,57 @@ else
   utils/split_scp.pl $scp $split_scps || exit 1;
 
   $cmd JOB=1:$nj $logdir/make_fbank_${name}.JOB.log \
-    compute-fbank-feats $vtln_opts --verbose=2 --config=$fbank_config scp,p:$logdir/wav.JOB.scp ark:- \| \
+    compute-fbank-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
+     --config=$fbank_config scp,p:$logdir/wav.JOB.scp ark:- \| \
     copy-feats --compress=$compress $write_num_frames_opt ark:- \
      ark,scp:$fbankdir/raw_fbank_$name.JOB.ark,$fbankdir/raw_fbank_$name.JOB.scp \
      || exit 1;
-
 fi
 
 
 if [ -f $logdir/.error.$name ]; then
-  echo "Error producing fbank features for $name:"
+  echo "$0: Error producing filterbank features for $name:"
   tail $logdir/make_fbank_${name}.1.log
   exit 1;
 fi
 
 # concatenate the .scp files together.
 for n in $(seq $nj); do
-  cat $fbankdir/raw_fbank_$name.$n.scp || exit 1;
-done > $data/feats.scp
+  cat $fbankdir/raw_fbank_$name.$n.scp || exit 1
+done > $data/feats.scp || exit 1
 
 if $write_utt2num_frames; then
   for n in $(seq $nj); do
-    cat $logdir/utt2num_frames.$n || exit 1;
+    cat $logdir/utt2num_frames.$n || exit 1
   done > $data/utt2num_frames || exit 1
-  rm $logdir/utt2num_frames.*
 fi
 
-rm $logdir/wav.*.scp  $logdir/segments.* 2>/dev/null
+if $write_utt2dur; then
+  for n in $(seq $nj); do
+    cat $logdir/utt2dur.$n || exit 1
+  done > $data/utt2dur || exit 1
+fi
+
+# Store frame_shift and fbank_config along with features.
+frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) {
+                          printf "%.3f", 0.001 * $1; exit; }' $fbank_config)
+echo ${frame_shift:-'0.01'} > $data/frame_shift
+mkdir -p $data/conf && cp $fbank_config $data/conf/fbank.conf || exit 1
+
+rm $logdir/wav.*.scp  $logdir/segments.* \
+   $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null
 
-nf=`cat $data/feats.scp | wc -l`
-nu=`cat $data/utt2spk | wc -l`
+nf=$(wc -l < $data/feats.scp)
+nu=$(wc -l < $data/utt2spk)
 if [ $nf -ne $nu ]; then
-  echo "It seems not all of the feature files were successfully ($nf != $nu);"
-  echo "consider using utils/fix_data_dir.sh $data"
+  echo "$0: It seems not all of the feature files were successfully processed" \
+       "($nf != $nu); consider using utils/fix_data_dir.sh $data"
+fi
+
+if (( nf < nu - nu/20 )); then
+  echo "$0: Less than 95% of the features were successfully generated."\
+       "Probably a serious error."
+  exit 1
 fi
 
-echo "Succeeded creating filterbank features for $name"
+echo "$0: Succeeded creating filterbank features for $name"
diff --git a/egs/wsj/s5/steps/make_fbank_pitch.sh b/egs/wsj/s5/steps/make_fbank_pitch.sh
index b250128fd03..7f971df54ae 100755
--- a/egs/wsj/s5/steps/make_fbank_pitch.sh
+++ b/egs/wsj/s5/steps/make_fbank_pitch.sh
@@ -2,7 +2,7 @@
 
 # Copyright 2013  The Shenzhen Key Laboratory of Intelligent Media and Speech,
 #                 PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi)
-#           2016  Johns Hopkins University (author: Daniel Povey)
+#           2016  Johns Hopkins University (Author: Daniel Povey)
 # Apache 2.0
 # Combine filterbank and pitch features together
 # Note: This file is based on make_fbank.sh and make_pitch_kaldi.sh
@@ -15,26 +15,31 @@ pitch_config=conf/pitch.conf
 pitch_postprocess_config=
 paste_length_tolerance=2
 compress=true
-write_utt2num_frames=false  # if true writes utt2num_frames
+write_utt2num_frames=true  # If true writes utt2num_frames.
+write_utt2dur=true
 # End configuration section.
 
-echo "$0 $@"  # Print the command line for logging
+echo "$0 $@"  # Print the command line for logging.
 
 if [ -f path.sh ]; then . ./path.sh; fi
 . parse_options.sh || exit 1;
 
 if [ $# -lt 1 ] || [ $# -gt 3 ]; then
-   echo "Usage: $0 [options] <data-dir> [<log-dir> [<fbank-dir>] ]";
-   echo "e.g.: $0 data/train exp/make_fbank/train mfcc"
-   echo "Note: <log-dir> defaults to <data-dir>/log, and <fbank-dir> defaults to <data-dir>/data"
-   echo "Options: "
-   echo "  --fbank-config             <config-file>             # config passed to compute-fbank-feats "
-   echo "  --pitch-config             <pitch-config-file>       # config passed to compute-kaldi-pitch-feats "
-   echo "  --pitch-postprocess-config <postprocess-config-file> # config passed to process-kaldi-pitch-feats "
-   echo "  --paste-length-tolerance   <tolerance>               # length tolerance passed to paste-feats"
-   echo "  --nj                       <nj>                      # number of parallel jobs"
-   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>)     # how to run jobs."
-   echo "  --write-utt2num-frames <true|false>     # If true, write utt2num_frames file."
+  cat >&2 <<EOF
+Usage: $0 [options] <data-dir> [<log-dir> [<fbank-dir>] ]
+ e.g.: $0 data/train
+Note: <log-dir> defaults to <data-dir>/log, and
+      <fbank-dir> defaults to <data-dir>/data
+Options:
+  --fbank-config <fbank-config-file>   # config passed to compute-fbank-feats.
+  --pitch-config <pitch-config-file>   # config passed to compute-kaldi-pitch-feats.
+  --pitch-postprocess-config <postprocess-config-file> # config passed to process-kaldi-pitch-feats.
+  --paste-length-tolerance <tolerance> # length tolerance passed to paste-feats.
+  --nj <nj>                            # number of parallel jobs.
+  --cmd <run.pl|queue.pl <queue opts>> # how to run jobs.
+  --write-utt2num-frames <true|false>  # If true, write utt2num_frames file.
+  --write-utt2dur <true|false>         # If true, write utt2dur file.
+EOF
    exit 1;
 fi
 
@@ -72,19 +77,19 @@ required="$scp $fbank_config $pitch_config"
 
 for f in $required; do
   if [ ! -f $f ]; then
-    echo "make_fbank_pitch.sh: no such file $f"
+    echo "$0: no such file $f"
     exit 1;
   fi
 done
 
+utils/validate_data_dir.sh --no-text --no-feats $data || exit 1;
+
 if [ ! -z "$pitch_postprocess_config" ]; then
   postprocess_config_opt="--config=$pitch_postprocess_config";
 else
   postprocess_config_opt=
 fi
 
-utils/validate_data_dir.sh --no-text --no-feats $data || exit 1;
-
 if [ -f $data/spk2warp ]; then
   echo "$0 [info]: using VTLN warp factors from $data/spk2warp"
   vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk"
@@ -105,9 +110,15 @@ else
   write_num_frames_opt=
 fi
 
+if $write_utt2dur; then
+  write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB"
+else
+  write_utt2dur_opt=
+fi
+
 if [ -f $data/segments ]; then
   echo "$0 [info]: segments file exists: using that."
-  split_segments=""
+  split_segments=
   for n in $(seq $nj); do
     split_segments="$split_segments $logdir/segments.$n"
   done
@@ -115,61 +126,89 @@ if [ -f $data/segments ]; then
   utils/split_scp.pl $data/segments $split_segments || exit 1;
   rm $logdir/.error 2>/dev/null
 
-  fbank_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-fbank-feats $vtln_opts --verbose=2 --config=$fbank_config ark:- ark:- |"
-  pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
+  fbank_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- |\
+    compute-fbank-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
+      --config=$fbank_config ark:- ark:- |"
+  pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \
+    compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | \
+    process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
 
   $cmd JOB=1:$nj $logdir/make_fbank_pitch_${name}.JOB.log \
-    paste-feats --length-tolerance=$paste_length_tolerance "$fbank_feats" "$pitch_feats" ark:- \| \
+    paste-feats --length-tolerance=$paste_length_tolerance \
+      "$fbank_feats" "$pitch_feats" ark:- \| \
     copy-feats --compress=$compress $write_num_frames_opt ark:- \
       ark,scp:$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.ark,$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.scp \
      || exit 1;
 
 else
   echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
-  split_scps=""
+  split_scps=
   for n in $(seq $nj); do
-    split_scps="$split_scps $logdir/wav.$n.scp"
+    split_scps="$split_scps $logdir/wav_${name}.$n.scp"
   done
 
   utils/split_scp.pl $scp $split_scps || exit 1;
 
-  fbank_feats="ark:compute-fbank-feats $vtln_opts --verbose=2 --config=$fbank_config scp,p:$logdir/wav.JOB.scp ark:- |"
-  pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp,p:$logdir/wav.JOB.scp ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
+  fbank_feats="ark:compute-fbank-feats $vtln_opts $write_utt2dur_opt \
+   --verbose=2 --config=$fbank_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |"
+  pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 \
+      --config=$pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- | \
+    process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
 
   $cmd JOB=1:$nj $logdir/make_fbank_pitch_${name}.JOB.log \
-    paste-feats --length-tolerance=$paste_length_tolerance "$fbank_feats" "$pitch_feats" ark:- \| \
+    paste-feats --length-tolerance=$paste_length_tolerance \
+      "$fbank_feats" "$pitch_feats" ark:- \| \
     copy-feats --compress=$compress $write_num_frames_opt ark:- \
       ark,scp:$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.ark,$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.scp \
       || exit 1;
-
 fi
 
 
 if [ -f $logdir/.error.$name ]; then
-  echo "Error producing fbank & pitch features for $name:"
+  echo "$0: Error producing filterbank and pitch features for $name:"
   tail $logdir/make_fbank_pitch_${name}.1.log
   exit 1;
 fi
 
-# concatenate the .scp files together.
+# Concatenate the .scp files together.
 for n in $(seq $nj); do
-  cat $fbank_pitch_dir/raw_fbank_pitch_$name.$n.scp || exit 1;
-done > $data/feats.scp
+  cat $fbank_pitch_dir/raw_fbank_pitch_$name.$n.scp || exit 1
+done > $data/feats.scp || exit 1
 
 if $write_utt2num_frames; then
   for n in $(seq $nj); do
-    cat $logdir/utt2num_frames.$n || exit 1;
+    cat $logdir/utt2num_frames.$n || exit 1
   done > $data/utt2num_frames || exit 1
-  rm $logdir/utt2num_frames.*
 fi
 
-rm $logdir/wav.*.scp  $logdir/segments.* 2>/dev/null
+if $write_utt2dur; then
+  for n in $(seq $nj); do
+    cat $logdir/utt2dur.$n || exit 1
+  done > $data/utt2dur || exit 1
+fi
+
+# Store frame_shift, fbank_config and pitch_config along with features.
+frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) {
+                          printf "%.3f", 0.001 * $1; exit; }' $fbank_config)
+echo ${frame_shift:-'0.01'} > $data/frame_shift
+mkdir -p $data/conf &&
+  cp $fbank_config $data/conf/fbank.conf &&
+  cp $pitch_config $data/conf/pitch.conf || exit 1
 
-nf=`cat $data/feats.scp | wc -l`
-nu=`cat $data/utt2spk | wc -l`
+rm $logdir/wav_${name}.*.scp  $logdir/segments.* \
+   $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null
+
+nf=$(wc -l < $data/feats.scp)
+nu=$(wc -l < $data/utt2spk)
 if [ $nf -ne $nu ]; then
-  echo "It seems not all of the feature files were successfully processed ($nf != $nu);"
-  echo "consider using utils/fix_data_dir.sh $data"
+  echo "$0: It seems not all of the feature files were successfully processed" \
+       "($nf != $nu); consider using utils/fix_data_dir.sh $data"
+fi
+
+if (( nf < nu - nu/20 )); then
+  echo "$0: Less than 95% of the features were successfully generated."\
+       "Probably a serious error."
+  exit 1
 fi
 
-echo "Succeeded creating filterbank & pitch features for $name"
+echo "$0: Succeeded creating filterbank and pitch features for $name"
diff --git a/egs/wsj/s5/steps/make_mfcc.sh b/egs/wsj/s5/steps/make_mfcc.sh
index 8514ce4e38d..37433f87dcd 100755
--- a/egs/wsj/s5/steps/make_mfcc.sh
+++ b/egs/wsj/s5/steps/make_mfcc.sh
@@ -10,23 +10,28 @@ nj=4
 cmd=run.pl
 mfcc_config=conf/mfcc.conf
 compress=true
-write_utt2num_frames=false  # if true writes utt2num_frames
+write_utt2num_frames=true  # If true writes utt2num_frames.
+write_utt2dur=true
 # End configuration section.
 
-echo "$0 $@"  # Print the command line for logging
+echo "$0 $@"  # Print the command line for logging.
 
 if [ -f path.sh ]; then . ./path.sh; fi
 . parse_options.sh || exit 1;
 
 if [ $# -lt 1 ] || [ $# -gt 3 ]; then
-   echo "Usage: $0 [options] <data-dir> [<log-dir> [<mfcc-dir>] ]";
-   echo "e.g.: $0 data/train exp/make_mfcc/train mfcc"
-   echo "Note: <log-dir> defaults to <data-dir>/log, and <mfccdir> defaults to <data-dir>/data"
-   echo "Options: "
-   echo "  --mfcc-config <config-file>                      # config passed to compute-mfcc-feats "
-   echo "  --nj <nj>                                        # number of parallel jobs"
-   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
-   echo "  --write-utt2num-frames <true|false>     # If true, write utt2num_frames file."
+  cat >&2 <<EOF
+Usage: $0 [options] <data-dir> [<log-dir> [<mfcc-dir>] ]
+ e.g.: $0 data/train
+Note: <log-dir> defaults to <data-dir>/log, and
+      <mfcc-dir> defaults to <data-dir>/data.
+Options:
+  --mfcc-config <config-file>          # config passed to compute-mfcc-feats.
+  --nj <nj>                            # number of parallel jobs.
+  --cmd <run.pl|queue.pl <queue opts>> # how to run jobs.
+  --write-utt2num-frames <true|false>  # If true, write utt2num_frames file.
+  --write-utt2dur <true|false>         # If true, write utt2dur file.
+EOF
    exit 1;
 fi
 
@@ -63,10 +68,11 @@ required="$scp $mfcc_config"
 
 for f in $required; do
   if [ ! -f $f ]; then
-    echo "make_mfcc.sh: no such file $f"
+    echo "$0: no such file $f"
     exit 1;
   fi
 done
+
 utils/validate_data_dir.sh --no-text --no-feats $data || exit 1;
 
 if [ -f $data/spk2warp ]; then
@@ -92,11 +98,16 @@ else
   write_num_frames_opt=
 fi
 
+if $write_utt2dur; then
+  write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB"
+else
+  write_utt2dur_opt=
+fi
 
 if [ -f $data/segments ]; then
   echo "$0 [info]: segments file exists: using that."
 
-  split_segments=""
+  split_segments=
   for n in $(seq $nj); do
     split_segments="$split_segments $logdir/segments.$n"
   done
@@ -106,14 +117,15 @@ if [ -f $data/segments ]; then
 
   $cmd JOB=1:$nj $logdir/make_mfcc_${name}.JOB.log \
     extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \
-    compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config ark:- ark:- \| \
+    compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
+      --config=$mfcc_config ark:- ark:- \| \
     copy-feats --compress=$compress $write_num_frames_opt ark:- \
       ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \
      || exit 1;
 
 else
   echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
-  split_scps=""
+  split_scps=
   for n in $(seq $nj); do
     split_scps="$split_scps $logdir/wav_${name}.$n.scp"
   done
@@ -125,44 +137,58 @@ else
   # utterances that have bad wave data.
 
   $cmd JOB=1:$nj $logdir/make_mfcc_${name}.JOB.log \
-    compute-mfcc-feats  $vtln_opts --verbose=2 --config=$mfcc_config \
-     scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \
-      copy-feats $write_num_frames_opt --compress=$compress ark:- \
+    compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
+      --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \
+    copy-feats $write_num_frames_opt --compress=$compress ark:- \
       ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \
       || exit 1;
 fi
 
 
 if [ -f $logdir/.error.$name ]; then
-  echo "Error producing mfcc features for $name:"
+  echo "$0: Error producing MFCC features for $name:"
   tail $logdir/make_mfcc_${name}.1.log
   exit 1;
 fi
 
 # concatenate the .scp files together.
 for n in $(seq $nj); do
-  cat $mfccdir/raw_mfcc_$name.$n.scp || exit 1;
+  cat $mfccdir/raw_mfcc_$name.$n.scp || exit 1
 done > $data/feats.scp || exit 1
 
 if $write_utt2num_frames; then
   for n in $(seq $nj); do
-    cat $logdir/utt2num_frames.$n || exit 1;
+    cat $logdir/utt2num_frames.$n || exit 1
   done > $data/utt2num_frames || exit 1
-  rm $logdir/utt2num_frames.*
 fi
 
-rm $logdir/wav_${name}.*.scp  $logdir/segments.* 2>/dev/null
+if $write_utt2dur; then
+  for n in $(seq $nj); do
+    cat $logdir/utt2dur.$n || exit 1
+  done > $data/utt2dur || exit 1
+fi
 
-nf=`cat $data/feats.scp | wc -l`
-nu=`cat $data/utt2spk | wc -l`
+# Store frame_shift and mfcc_config along with features.
+frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) {
+                          printf "%.3f", 0.001 * $1; exit; }' $mfcc_config)
+echo ${frame_shift:-'0.01'} > $data/frame_shift
+mkdir -p $data/conf && cp $mfcc_config $data/conf/mfcc.conf || exit 1
+
+rm $logdir/wav_${name}.*.scp  $logdir/segments.* \
+   $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null
+
+nf=$(wc -l < $data/feats.scp)
+nu=$(wc -l < $data/utt2spk)
 if [ $nf -ne $nu ]; then
-  echo "It seems not all of the feature files were successfully processed ($nf != $nu);"
-  echo "consider using utils/fix_data_dir.sh $data"
+  echo "$0: It seems not all of the feature files were successfully processed" \
+       "($nf != $nu); consider using utils/fix_data_dir.sh $data"
 fi
 
-if [ $nf -lt $[$nu - ($nu/20)] ]; then
-  echo "Less than 95% the features were successfully generated.  Probably a serious error."
-  exit 1;
+if (( nf < nu - nu/20 )); then
+  echo "$0: Less than 95% of the features were successfully generated."\
+       "Probably a serious error."
+  exit 1
 fi
 
-echo "Succeeded creating MFCC features for $name"
+
+echo "$0: Succeeded creating MFCC features for $name"
diff --git a/egs/wsj/s5/steps/make_mfcc_pitch.sh b/egs/wsj/s5/steps/make_mfcc_pitch.sh
index 98b670b82ae..dda31667d6a 100755
--- a/egs/wsj/s5/steps/make_mfcc_pitch.sh
+++ b/egs/wsj/s5/steps/make_mfcc_pitch.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-# Copyright 2013 The Shenzhen Key Laboratory of Intelligent Media and Speech,
-#                PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi)
+# Copyright 2013  The Shenzhen Key Laboratory of Intelligent Media and Speech,
+#                 PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi)
 #           2016  Johns Hopkins University (Author: Daniel Povey)
 # Apache 2.0
 # Combine MFCC and pitch features together
@@ -15,26 +15,31 @@ pitch_config=conf/pitch.conf
 pitch_postprocess_config=
 paste_length_tolerance=2
 compress=true
-write_utt2num_frames=false  # if true writes utt2num_frames
+write_utt2num_frames=true  # If true writes utt2num_frames.
+write_utt2dur=true
 # End configuration section.
 
-echo "$0 $@"  # Print the command line for logging
+echo "$0 $@"  # Print the command line for logging.
 
 if [ -f path.sh ]; then . ./path.sh; fi
 . parse_options.sh || exit 1;
 
 if [ $# -lt 1 ] || [ $# -gt 3 ]; then
-   echo "Usage: $0 [options] <data-dir> [<log-dir> [<mfcc-dir>] ]";
-   echo "e.g.: $0 data/train exp/make_mfcc/train mfcc"
-   echo "Note: <log-dir> defaults to <data-dir>/log, and <mfcc-dir> defaults to <data-dir>/data"
-   echo "Options: "
-   echo "  --mfcc-config              <mfcc-config-file>        # config passed to compute-mfcc-feats "
-   echo "  --pitch-config             <pitch-config-file>       # config passed to compute-kaldi-pitch-feats "
-   echo "  --pitch-postprocess-config <postprocess-config-file>  # config passed to process-kaldi-pitch-feats "
-   echo "  --paste-length-tolerance   <tolerance>               # length tolerance passed to paste-feats"
-   echo "  --nj                       <nj>                      # number of parallel jobs"
-   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>)     # how to run jobs."
-   echo "  --write-utt2num-frames <true|false>     # If true, write utt2num_frames file."
+  cat >&2 <<EOF
+Usage: $0 [options] <data-dir> [<log-dir> [<mfcc-dir>] ]
+ e.g.: $0 data/train
+Note: <log-dir> defaults to <data-dir>/log, and
+      <mfcc-dir> defaults to <data-dir>/data
+Options:
+  --mfcc-config <mfcc-config-file>     # config passed to compute-mfcc-feats.
+  --pitch-config <pitch-config-file>   # config passed to compute-kaldi-pitch-feats.
+  --pitch-postprocess-config <postprocess-config-file> # config passed to process-kaldi-pitch-feats.
+  --paste-length-tolerance <tolerance> # length tolerance passed to paste-feats.
+  --nj <nj>                            # number of parallel jobs.
+  --cmd <run.pl|queue.pl <queue opts>> # how to run jobs.
+  --write-utt2num-frames <true|false>  # If true, write utt2num_frames file.
+  --write-utt2dur <true|false>         # If true, write utt2dur file.
+EOF
    exit 1;
 fi
 
@@ -72,10 +77,11 @@ required="$scp $mfcc_config $pitch_config"
 
 for f in $required; do
   if [ ! -f $f ]; then
-    echo "make_mfcc_pitch.sh: no such file $f"
+    echo "$0: no such file $f"
     exit 1;
   fi
 done
+
 utils/validate_data_dir.sh --no-text --no-feats $data || exit 1;
 
 if [ ! -z "$pitch_postprocess_config" ]; then
@@ -104,9 +110,15 @@ else
   write_num_frames_opt=
 fi
 
+if $write_utt2dur; then
+  write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB"
+else
+  write_utt2dur_opt=
+fi
+
 if [ -f $data/segments ]; then
   echo "$0 [info]: segments file exists: using that."
-  split_segments=""
+  split_segments=
   for n in $(seq $nj); do
     split_segments="$split_segments $logdir/segments.$n"
   done
@@ -114,66 +126,89 @@ if [ -f $data/segments ]; then
   utils/split_scp.pl $data/segments $split_segments || exit 1;
   rm $logdir/.error 2>/dev/null
 
-  mfcc_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config ark:- ark:- |"
-  pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
+  mfcc_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \
+    compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
+      --config=$mfcc_config ark:- ark:- |"
+  pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \
+    compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | \
+    process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
 
   $cmd JOB=1:$nj $logdir/make_mfcc_pitch_${name}.JOB.log \
-    paste-feats --length-tolerance=$paste_length_tolerance "$mfcc_feats" "$pitch_feats" ark:- \| \
+    paste-feats --length-tolerance=$paste_length_tolerance \
+      "$mfcc_feats" "$pitch_feats" ark:- \| \
     copy-feats --compress=$compress $write_num_frames_opt ark:- \
       ark,scp:$mfcc_pitch_dir/raw_mfcc_pitch_$name.JOB.ark,$mfcc_pitch_dir/raw_mfcc_pitch_$name.JOB.scp \
      || exit 1;
 
 else
   echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
-  split_scps=""
+  split_scps=
   for n in $(seq $nj); do
     split_scps="$split_scps $logdir/wav_${name}.$n.scp"
   done
 
   utils/split_scp.pl $scp $split_scps || exit 1;
 
-  mfcc_feats="ark:compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |"
-  pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
+  mfcc_feats="ark:compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
+    --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |"
+  pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 \
+      --config=$pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- | \
+    process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
 
   $cmd JOB=1:$nj $logdir/make_mfcc_pitch_${name}.JOB.log \
-    paste-feats --length-tolerance=$paste_length_tolerance "$mfcc_feats" "$pitch_feats" ark:- \| \
+    paste-feats --length-tolerance=$paste_length_tolerance \
+      "$mfcc_feats" "$pitch_feats" ark:- \| \
     copy-feats --compress=$compress $write_num_frames_opt ark:- \
       ark,scp:$mfcc_pitch_dir/raw_mfcc_pitch_$name.JOB.ark,$mfcc_pitch_dir/raw_mfcc_pitch_$name.JOB.scp \
       || exit 1;
-
 fi
 
 
 if [ -f $logdir/.error.$name ]; then
-  echo "Error producing mfcc & pitch features for $name:"
+  echo "$0: Error producing MFCC and pitch features for $name:"
   tail $logdir/make_mfcc_pitch_${name}.1.log
   exit 1;
 fi
 
-# concatenate the .scp files together.
+# Concatenate the .scp files together.
 for n in $(seq $nj); do
   cat $mfcc_pitch_dir/raw_mfcc_pitch_$name.$n.scp || exit 1;
-done > $data/feats.scp
+done > $data/feats.scp || exit 1
 
 if $write_utt2num_frames; then
   for n in $(seq $nj); do
-    cat $logdir/utt2num_frames.$n || exit 1;
+    cat $logdir/utt2num_frames.$n || exit 1
   done > $data/utt2num_frames || exit 1
-  rm $logdir/utt2num_frames.*
 fi
 
-rm $logdir/wav_${name}.*.scp  $logdir/segments.* 2>/dev/null
+if $write_utt2dur; then
+  for n in $(seq $nj); do
+    cat $logdir/utt2dur.$n || exit 1
+  done > $data/utt2dur || exit 1
+fi
+
+# Store frame_shift, mfcc_config and pitch_config along with features.
+frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) {
+                          printf "%.3f", 0.001 * $1; exit; }' $mfcc_config)
+echo ${frame_shift:-'0.01'} > $data/frame_shift
+mkdir -p $data/conf &&
+  cp $mfcc_config $data/conf/mfcc.conf &&
+  cp $pitch_config $data/conf/pitch.conf || exit 1
 
-nf=`cat $data/feats.scp | wc -l`
-nu=`cat $data/utt2spk | wc -l`
+rm $logdir/wav_${name}.*.scp  $logdir/segments.* \
+   $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null
+
+nf=$(wc -l < $data/feats.scp)
+nu=$(wc -l < $data/utt2spk)
 if [ $nf -ne $nu ]; then
-  echo "It seems not all of the feature files were successfully processed ($nf != $nu);"
-  echo "consider using utils/fix_data_dir.sh $data"
+  echo "$0: It seems not all of the feature files were successfully processed" \
+       "($nf != $nu); consider using utils/fix_data_dir.sh $data"
 fi
 
-if [ $nf -lt $[$nu - ($nu/20)] ]; then
-  echo "Less than 95% the features were successfully generated.  Probably a serious error."
-  exit 1;
+if (( nf < nu - nu/20 )); then
+  echo "$0: Less than 95% of the features were successfully generated."\
+       "Probably a serious error."
+  exit 1
 fi
 
-echo "Succeeded creating MFCC & Pitch features for $name"
+echo "$0: Succeeded creating MFCC and pitch features for $name"
diff --git a/egs/wsj/s5/steps/make_mfcc_pitch_online.sh b/egs/wsj/s5/steps/make_mfcc_pitch_online.sh
index df51057a00b..001c1e4c6f4 100755
--- a/egs/wsj/s5/steps/make_mfcc_pitch_online.sh
+++ b/egs/wsj/s5/steps/make_mfcc_pitch_online.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-# Copyright 2013 The Shenzhen Key Laboratory of Intelligent Media and Speech,
-#                PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi)
+# Copyright 2013  The Shenzhen Key Laboratory of Intelligent Media and Speech,
+#                 PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi)
 #           2014-2016  Johns Hopkins University (Author: Daniel Povey)
 # Apache 2.0
 # Combine MFCC and online-pitch features together
@@ -14,25 +14,30 @@ mfcc_config=conf/mfcc.conf
 online_pitch_config=conf/online_pitch.conf
 paste_length_tolerance=2
 compress=true
+write_utt2num_frames=true  # If true writes utt2num_frames.
+write_utt2dur=true
 # End configuration section.
 
-echo "$0 $@"  # Print the command line for logging
+echo "$0 $@"  # Print the command line for logging.
 
 if [ -f path.sh ]; then . ./path.sh; fi
 . parse_options.sh || exit 1;
 
 if [ $# -lt 1 ] || [ $# -gt 3 ]; then
-   echo "Usage: $0 [options] <data-dir> [<log-dir> [<mfcc-dir>] ]";
-   echo "e.g.: $0 data/train exp/make_mfcc/train mfcc"
-   echo "Note: <log-dir> defaults to <data-dir>/log, and <mfcc-dir> defaults to <data-dir>/data"
-   echo "Options: "
-   echo "  --mfcc-config              <mfcc-config-file>        # config passed to compute-mfcc-feats, default "
-   echo "                                                       # is conf/mfcc.conf"
-   echo "  --online-pitch-config <online-pitch-config-file>     # config passed to compute-and-process-kaldi-pitch-feats, "
-   echo "                                                       # default is conf/online_pitch.conf"
-   echo "  --paste-length-tolerance   <tolerance>               # length tolerance passed to paste-feats"
-   echo "  --nj                       <nj>                      # number of parallel jobs"
-   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>)     # how to run jobs."
+  cat >&2 <<EOF
+Usage: $0 [options] <data-dir> [<log-dir> [<mfcc-dir>] ]
+ e.g.: $0 data/train
+Note: <log-dir> defaults to <data-dir>/log, and
+      <mfcc-dir> defaults to <data-dir>/data
+Options:
+  --mfcc-config <mfcc-config-file>     # config passed to compute-mfcc-feats [conf/mfcc.conf]
+  --online-pitch-config <online-pitch-config-file> # config passed to compute-and-process-kaldi-pitch-feats [conf/online_pitch.conf]
+  --paste-length-tolerance <tolerance> # length tolerance passed to paste-feats.
+  --nj <nj>                            # number of parallel jobs.
+  --cmd <run.pl|queue.pl <queue opts>> # how to run jobs.
+  --write-utt2num-frames <true|false>  # If true, write utt2num_frames file.
+  --write-utt2dur <true|false>         # If true, write utt2dur file.
+EOF
    exit 1;
 fi
 
@@ -90,9 +95,21 @@ for n in $(seq $nj); do
   utils/create_data_link.pl $mfcc_pitch_dir/raw_mfcc_online_pitch_$name.$n.ark
 done
 
+if $write_utt2num_frames; then
+  write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB"
+else
+  write_num_frames_opt=
+fi
+
+if $write_utt2dur; then
+  write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB"
+else
+  write_utt2dur_opt=
+fi
+
 if [ -f $data/segments ]; then
   echo "$0 [info]: segments file exists: using that."
-  split_segments=""
+  split_segments=
   for n in $(seq $nj); do
     split_segments="$split_segments $logdir/segments.$n"
   done
@@ -100,58 +117,88 @@ if [ -f $data/segments ]; then
   utils/split_scp.pl $data/segments $split_segments || exit 1;
   rm $logdir/.error 2>/dev/null
 
-  mfcc_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config ark:- ark:- |"
-  pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-and-process-kaldi-pitch-feats --verbose=2 --config=$online_pitch_config ark:- ark:- |"
+  mfcc_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \
+    compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
+      --config=$mfcc_config ark:- ark:- |"
+  pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \
+    compute-and-process-kaldi-pitch-feats --verbose=2 \
+      --config=$online_pitch_config ark:- ark:- |"
 
   $cmd JOB=1:$nj $logdir/make_mfcc_pitch_${name}.JOB.log \
-    paste-feats --length-tolerance=$paste_length_tolerance "$mfcc_feats" "$pitch_feats" ark:- \| \
-    copy-feats --compress=$compress ark:- \
+    paste-feats --length-tolerance=$paste_length_tolerance \
+      "$mfcc_feats" "$pitch_feats" ark:- \| \
+    copy-feats --compress=$compress $write_num_frames_opt ark:- \
       ark,scp:$mfcc_pitch_dir/raw_mfcc_online_pitch_$name.JOB.ark,$mfcc_pitch_dir/raw_mfcc_online_pitch_$name.JOB.scp \
      || exit 1;
 
 else
   echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
-  split_scps=""
+  split_scps=
   for n in $(seq $nj); do
     split_scps="$split_scps $logdir/wav_${name}.$n.scp"
   done
 
   utils/split_scp.pl $scp $split_scps || exit 1;
 
-  mfcc_feats="ark:compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |"
-  pitch_feats="ark,s,cs:compute-and-process-kaldi-pitch-feats --verbose=2 --config=$online_pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |"
+  mfcc_feats="ark:compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
+    --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |"
+  pitch_feats="ark,s,cs:compute-and-process-kaldi-pitch-feats --verbose=2 \
+    --config=$online_pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |"
 
   $cmd JOB=1:$nj $logdir/make_mfcc_pitch_${name}.JOB.log \
-    paste-feats --length-tolerance=$paste_length_tolerance "$mfcc_feats" "$pitch_feats" ark:- \| \
-    copy-feats --compress=$compress ark:- \
+    paste-feats --length-tolerance=$paste_length_tolerance \
+      "$mfcc_feats" "$pitch_feats" ark:- \| \
+    copy-feats --compress=$compress $write_num_frames_opt ark:- \
       ark,scp:$mfcc_pitch_dir/raw_mfcc_online_pitch_$name.JOB.ark,$mfcc_pitch_dir/raw_mfcc_online_pitch_$name.JOB.scp \
       || exit 1;
 fi
 
 
 if [ -f $logdir/.error.$name ]; then
-  echo "Error producing mfcc & pitch features for $name:"
+  echo "$0: Error producing MFCC and online-pitch features for $name:"
   tail $logdir/make_mfcc_pitch_${name}.1.log
   exit 1;
 fi
 
-# concatenate the .scp files together.
+# Concatenate the .scp files together.
 for n in $(seq $nj); do
-  cat $mfcc_pitch_dir/raw_mfcc_online_pitch_$name.$n.scp || exit 1;
-done > $data/feats.scp
+  cat $mfcc_pitch_dir/raw_mfcc_online_pitch_$name.$n.scp || exit 1
+done > $data/feats.scp || exit 1
+
+if $write_utt2num_frames; then
+  for n in $(seq $nj); do
+    cat $logdir/utt2num_frames.$n || exit 1
+  done > $data/utt2num_frames || exit 1
+fi
 
-rm $logdir/wav_${name}.*.scp  $logdir/segments.* 2>/dev/null
+if $write_utt2dur; then
+  for n in $(seq $nj); do
+    cat $logdir/utt2dur.$n || exit 1
+  done > $data/utt2dur || exit 1
+fi
 
-nf=`cat $data/feats.scp | wc -l`
-nu=`cat $data/utt2spk | wc -l`
+# Store frame_shift, mfcc_config and pitch_config_online along with features.
+frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) {
+                          printf "%.3f", 0.001 * $1; exit; }' $mfcc_config)
+echo ${frame_shift:-'0.01'} > $data/frame_shift
+mkdir -p $data/conf &&
+  cp $mfcc_config $data/conf/mfcc.conf &&
+  cp $online_pitch_config $data/conf/online_pitch.conf || exit 1
+
+rm $logdir/wav_${name}.*.scp  $logdir/segments.* \
+   $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null
+
+nf=$(wc -l < $data/feats.scp)
+nu=$(wc -l < $data/utt2spk)
 if [ $nf -ne $nu ]; then
-  echo "It seems not all of the feature files were successfully processed ($nf != $nu);"
-  echo "consider using utils/fix_data_dir.sh $data"
+  echo "$0: It seems not all of the feature files were successfully processed" \
+       "($nf != $nu); consider using utils/fix_data_dir.sh $data"
 fi
 
-if [ $nf -lt $[$nu - ($nu/20)] ]; then
-  echo "Less than 95% the features were successfully generated.  Probably a serious error."
-  exit 1;
+if (( nf < nu - nu/20 )); then
+  echo "$0: Less than 95% the features were successfully generated."\
+       "Probably a serious error."
+  exit 1
 fi
 
-echo "Succeeded creating MFCC & online-pitch features for $name"
+echo "$0: Succeeded creating MFCC and online-pitch features for $name"
diff --git a/egs/wsj/s5/steps/make_plp.sh b/egs/wsj/s5/steps/make_plp.sh
index 85b4a02fbb6..c4a987aaeeb 100755
--- a/egs/wsj/s5/steps/make_plp.sh
+++ b/egs/wsj/s5/steps/make_plp.sh
@@ -10,22 +10,28 @@ nj=4
 cmd=run.pl
 plp_config=conf/plp.conf
 compress=true
-write_utt2num_frames=false  # if true writes utt2num_frames
+write_utt2num_frames=true  # If true writes utt2num_frames.
+write_utt2dur=true
 # End configuration section.
 
-echo "$0 $@"  # Print the command line for logging
+echo "$0 $@"  # Print the command line for logging.
 
 if [ -f path.sh ]; then . ./path.sh; fi
 . parse_options.sh || exit 1;
 
 if [ $# -lt 1 ] || [ $# -gt 3 ]; then
-   echo "Usage: $0 [options] <data-dir> [<log-dir> [<plp-dir>] ]";
-   echo "e.g.: $0 data/train exp/make_plp/train mfcc"
-   echo "Note: <log-dir> defaults to <data-dir>/log, and <plp-dir> defaults to <data-dir>/data"
-   echo "Options: "
-   echo "  --plp-config <config-file>                      # config passed to compute-plp-feats "
-   echo "  --nj <nj>                                        # number of parallel jobs"
-   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  cat >&2 <<EOF
+Usage: $0 [options] <data-dir> [<log-dir> [<plp-dir>] ]
+ e.g.: $0 data/train
+Note: <log-dir> defaults to <data-dir>/log, and
+      <plp-dir> defaults to <data-dir>/data
+Options:
+  --plp-config <config-file>           # config passed to compute-plp-feats.
+  --nj <nj>                            # number of parallel jobs.
+  --cmd <run.pl|queue.pl <queue opts>> # how to run jobs.
+  --write-utt2num-frames <true|false>  # If true, write utt2num_frames file.
+  --write-utt2dur <true|false>         # If true, write utt2dur file.
+EOF
    exit 1;
 fi
 
@@ -62,7 +68,7 @@ required="$scp $plp_config"
 
 for f in $required; do
   if [ ! -f $f ]; then
-    echo "make_plp.sh: no such file $f"
+    echo "$0: no such file $f"
     exit 1;
   fi
 done
@@ -74,6 +80,8 @@ if [ -f $data/spk2warp ]; then
 elif [ -f $data/utt2warp ]; then
   echo "$0 [info]: using VTLN warp factors from $data/utt2warp"
   vtln_opts="--vtln-map=ark:$data/utt2warp"
+else
+  vtln_opts=
 fi
 
 for n in $(seq $nj); do
@@ -88,9 +96,15 @@ else
   write_num_frames_opt=
 fi
 
+if $write_utt2dur; then
+  write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB"
+else
+  write_utt2dur_opt=
+fi
+
 if [ -f $data/segments ]; then
   echo "$0 [info]: segments file exists: using that."
-  split_segments=""
+  split_segments=
   for n in $(seq $nj); do
     split_segments="$split_segments $logdir/segments.$n"
   done
@@ -100,14 +114,15 @@ if [ -f $data/segments ]; then
 
   $cmd JOB=1:$nj $logdir/make_plp_${name}.JOB.log \
     extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \
-    compute-plp-feats $vtln_opts --verbose=2 --config=$plp_config ark:- ark:- \| \
+    compute-plp-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
+      --config=$plp_config ark:- ark:- \| \
     copy-feats --compress=$compress $write_num_frames_opt ark:- \
       ark,scp:$plpdir/raw_plp_$name.JOB.ark,$plpdir/raw_plp_$name.JOB.scp \
      || exit 1;
 
 else
   echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
-  split_scps=""
+  split_scps=
   for n in $(seq $nj); do
     split_scps="$split_scps $logdir/wav_${name}.$n.scp"
   done
@@ -115,7 +130,8 @@ else
   utils/split_scp.pl $scp $split_scps || exit 1;
 
   $cmd JOB=1:$nj $logdir/make_plp_${name}.JOB.log \
-    compute-plp-feats  $vtln_opts --verbose=2 --config=$plp_config scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \
+    compute-plp-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
+      --config=$plp_config scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \
     copy-feats --compress=$compress $write_num_frames_opt ark:- \
       ark,scp:$plpdir/raw_plp_$name.JOB.ark,$plpdir/raw_plp_$name.JOB.scp \
       || exit 1;
@@ -124,34 +140,48 @@ fi
 
 
 if [ -f $logdir/.error.$name ]; then
-  echo "Error producing plp features for $name:"
+  echo "$0: Error producing PLP features for $name:"
   tail $logdir/make_plp_${name}.1.log
   exit 1;
 fi
 
 # concatenate the .scp files together.
 for n in $(seq $nj); do
-  cat $plpdir/raw_plp_$name.$n.scp || exit 1;
+  cat $plpdir/raw_plp_$name.$n.scp || exit 1
 done > $data/feats.scp
 
 if $write_utt2num_frames; then
   for n in $(seq $nj); do
-    cat $logdir/utt2num_frames.$n || exit 1;
+    cat $logdir/utt2num_frames.$n || exit 1
   done > $data/utt2num_frames || exit 1
-  rm $logdir/utt2num_frames.*
 fi
 
-rm $logdir/wav_${name}.*.scp  $logdir/segments.* 2>/dev/null
+if $write_utt2dur; then
+  for n in $(seq $nj); do
+    cat $logdir/utt2dur.$n || exit 1
+  done > $data/utt2dur || exit 1
+fi
+
+# Store frame_shift and plp_config along with features.
+frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) {
+                          printf "%.3f", 0.001 * $1; exit; }' $plp_config)
+echo ${frame_shift:-'0.01'} > $data/frame_shift
+mkdir -p $data/conf && cp $plp_config $data/conf/plp.conf || exit 1
+
+rm $logdir/wav_${name}.*.scp  $logdir/segments.* \
+   $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null
 
-nf=`cat $data/feats.scp | wc -l`
-nu=`cat $data/utt2spk | wc -l`
+nf=$(wc -l < $data/feats.scp)
+nu=$(wc -l < $data/utt2spk)
 if [ $nf -ne $nu ]; then
-  echo "It seems not all of the feature files were successfully ($nf != $nu);"
-  echo "consider using utils/fix_data_dir.sh $data"
+  echo "$0: It seems not all of the feature files were successfully processed" \
+       "($nf != $nu); consider using utils/fix_data_dir.sh $data"
 fi
-if [ $nf -lt $[$nu - ($nu/20)] ]; then
-  echo "Less than 95% the features were successfully generated.  Probably a serious error."
-  exit 1;
+
+if (( nf < nu - nu/20 )); then
+  echo "$0: Less than 95% the features were successfully generated."\
+       "Probably a serious error."
+  exit 1
 fi
 
-echo "Succeeded creating PLP features for $name"
+echo "$0: Succeeded creating PLP features for $name"
diff --git a/egs/wsj/s5/steps/make_plp_pitch.sh b/egs/wsj/s5/steps/make_plp_pitch.sh
index 40ddd314f6c..9f565d8a5bf 100755
--- a/egs/wsj/s5/steps/make_plp_pitch.sh
+++ b/egs/wsj/s5/steps/make_plp_pitch.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-# Copyright 2013 The Shenzhen Key Laboratory of Intelligent Media and Speech,
-#                PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi)
+# Copyright 2013  The Shenzhen Key Laboratory of Intelligent Media and Speech,
+#                 PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi)
 #           2016  Johns Hopkins University (Author: Daniel Povey)
 # Apache 2.0
 # Combine PLP and pitch features together
@@ -15,25 +15,31 @@ pitch_config=conf/pitch.conf
 pitch_postprocess_config=
 paste_length_tolerance=2
 compress=true
-write_utt2num_frames=false  # if true writes utt2num_frames
+write_utt2num_frames=true  # If true writes utt2num_frames.
+write_utt2dur=true
 # End configuration section.
 
-echo "$0 $@"  # Print the command line for logging
+echo "$0 $@"  # Print the command line for logging.
 
 if [ -f ./path.sh ]; then . ./path.sh;  fi
 . parse_options.sh || exit 1;
 
 if [ $# -lt 1 ] || [ $# -gt 3 ]; then
-   echo "Usage: $0 [options] <data-dir> [<log-dir> [<plp-dir>] ]";
-   echo "e.g.: $0 data/train exp/make_plp/train mfcc"
-   echo "Note: <log-dir> defaults to <data-dir>/log, and <plp-dir> defaults to <data-dir>/data"
-   echo "Options: "
-   echo "  --plp-config               <config-file>             # config passed to compute-plp-feats "
-   echo "  --pitch-config             <pitch-config-file>       # config passed to compute-kaldi-pitch-feats "
-   echo "  --pitch-postprocess-config <postprocess-config-file> # config passed to process-kaldi-pitch-feats "
-   echo "  --paste-length-tolerance   <tolerance>               # length tolerance passed to paste-feats"
-   echo "  --nj                       <nj>                      # number of parallel jobs"
-   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>)     # how to run jobs."
+  cat >&2 <<EOF
+Usage: $0 [options] <data-dir> [<log-dir> [<plp-dir>] ]
+ e.g.: $0 data/train
+Note: <log-dir> defaults to <data-dir>/log, and
+      <plp-dir> defaults to <data-dir>/data
+Options:
+  --plp-config <plp-config-file>       # config passed to compute-plp-feats.
+  --pitch-config <pitch-config-file>   # config passed to compute-kaldi-pitch-feats.
+  --pitch-postprocess-config <postprocess-config-file> # config passed to process-kaldi-pitch-feats.
+  --paste-length-tolerance <tolerance> # length tolerance passed to paste-feats.
+  --nj <nj>                            # number of parallel jobs.
+  --cmd <run.pl|queue.pl <queue opts>> # how to run jobs.
+  --write-utt2num-frames <true|false>  # If true, write utt2num_frames file.
+  --write-utt2dur <true|false>         # If true, write utt2dur file.
+EOF
    exit 1;
 fi
 
@@ -70,7 +76,7 @@ required="$scp $plp_config $pitch_config"
 
 for f in $required; do
   if [ ! -f $f ]; then
-    echo "make_plp_pitch.sh: no such file $f"
+    echo "$0: no such file $f"
     exit 1;
   fi
 done
@@ -102,9 +108,15 @@ else
   write_num_frames_opt=
 fi
 
+if $write_utt2dur; then
+  write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB"
+else
+  write_utt2dur_opt=
+fi
+
 if [ -f $data/segments ]; then
   echo "$0 [info]: segments file exists: using that."
-  split_segments=""
+  split_segments=
   for n in $(seq $nj); do
     split_segments="$split_segments $logdir/segments.$n"
   done
@@ -112,67 +124,89 @@ if [ -f $data/segments ]; then
   utils/split_scp.pl $data/segments $split_segments || exit 1;
   rm $logdir/.error 2>/dev/null
 
-  plp_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-plp-feats $vtln_opts --verbose=2 --config=$plp_config ark:- ark:- |"
-  pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
+  plp_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \
+    compute-plp-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
+      --config=$plp_config ark:- ark:- |"
+  pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \
+    compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | \
+    process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
 
   $cmd JOB=1:$nj $logdir/make_plp_pitch_${name}.JOB.log \
-    paste-feats --length-tolerance=$paste_length_tolerance "$plp_feats" "$pitch_feats" ark:- \| \
+    paste-feats --length-tolerance=$paste_length_tolerance \
+      "$plp_feats" "$pitch_feats" ark:- \| \
     copy-feats --compress=$compress $write_num_frames_opt ark:- \
       ark,scp:$plp_pitch_dir/raw_plp_pitch_$name.JOB.ark,$plp_pitch_dir/raw_plp_pitch_$name.JOB.scp \
      || exit 1;
 
 else
   echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
-  split_scps=""
+  split_scps=
   for n in $(seq $nj); do
     split_scps="$split_scps $logdir/wav_${name}.$n.scp"
   done
 
   utils/split_scp.pl $scp $split_scps || exit 1;
 
-
-  plp_feats="ark:compute-plp-feats $vtln_opts --verbose=2 --config=$plp_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |"
-  pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
+  plp_feats="ark:compute-plp-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
+    --config=$plp_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |"
+  pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 \
+      --config=$pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- | \
+    process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
 
   $cmd JOB=1:$nj $logdir/make_plp_pitch_${name}.JOB.log \
-    paste-feats --length-tolerance=$paste_length_tolerance "$plp_feats" "$pitch_feats" ark:- \| \
+    paste-feats --length-tolerance=$paste_length_tolerance \
+      "$plp_feats" "$pitch_feats" ark:- \| \
     copy-feats --compress=$compress $write_num_frames_opt ark:- \
       ark,scp:$plp_pitch_dir/raw_plp_pitch_$name.JOB.ark,$plp_pitch_dir/raw_plp_pitch_$name.JOB.scp \
       || exit 1;
-
 fi
 
 
 if [ -f $logdir/.error.$name ]; then
-  echo "Error producing plp & pitch features for $name:"
+  echo "$0: Error producing PLP and pitch features for $name:"
   tail $logdir/make_plp_pitch_${name}.1.log
   exit 1;
 fi
 
-# concatenate the .scp files together.
+# Concatenate the .scp files together.
 for n in $(seq $nj); do
-  cat $plp_pitch_dir/raw_plp_pitch_$name.$n.scp || exit 1;
-done > $data/feats.scp
+  cat $plp_pitch_dir/raw_plp_pitch_$name.$n.scp || exit 1
+done > $data/feats.scp || exit 1
 
 if $write_utt2num_frames; then
   for n in $(seq $nj); do
-    cat $logdir/utt2num_frames.$n || exit 1;
+    cat $logdir/utt2num_frames.$n || exit 1
   done > $data/utt2num_frames || exit 1
-  rm $logdir/utt2num_frames.*
 fi
 
-rm $logdir/wav_${name}.*.scp  $logdir/segments.* 2>/dev/null
+if $write_utt2dur; then
+  for n in $(seq $nj); do
+    cat $logdir/utt2dur.$n || exit 1
+  done > $data/utt2dur || exit 1
+fi
+
+# Store frame_shift, plp_config and pitch_config along with features.
+frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) {
+                          printf "%.3f", 0.001 * $1; exit; }' $plp_config)
+echo ${frame_shift:-'0.01'} > $data/frame_shift
+mkdir -p $data/conf &&
+  cp $plp_config $data/conf/plp.conf &&
+  cp $pitch_config $data/conf/pitch.conf || exit 1
 
-nf=`cat $data/feats.scp | wc -l`
-nu=`cat $data/utt2spk | wc -l`
+rm $logdir/wav_${name}.*.scp  $logdir/segments.* \
+   $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null
+
+nf=$(wc -l < $data/feats.scp)
+nu=$(wc -l < $data/utt2spk)
 if [ $nf -ne $nu ]; then
-  echo "It seems not all of the feature files were successfully processed ($nf != $nu);"
-  echo "consider using utils/fix_data_dir.sh $data"
+  echo "$0: It seems not all of the feature files were successfully processed" \
+       "($nf != $nu); consider using utils/fix_data_dir.sh $data"
 fi
 
-if [ $nf -lt $[$nu - ($nu/20)] ]; then
-  echo "Less than 95% the features were successfully generated.  Probably a serious error."
-  exit 1;
+if (( nf < nu - nu/20 )); then
+  echo "$0: Less than 95% the features were successfully generated."\
+       "Probably a serious error."
+  exit 1
 fi
 
-echo "Succeeded creating PLP & Pitch features for $name"
+echo "$0: Succeeded creating PLP and pitch features for $name"
diff --git a/src/featbin/compute-fbank-feats.cc b/src/featbin/compute-fbank-feats.cc
index 41df621d62d..e52b30bafb6 100644
--- a/src/featbin/compute-fbank-feats.cc
+++ b/src/featbin/compute-fbank-feats.cc
@@ -19,9 +19,9 @@
 // limitations under the License.
 
 #include "base/kaldi-common.h"
-#include "util/common-utils.h"
 #include "feat/feature-fbank.h"
 #include "feat/wave-reader.h"
+#include "util/common-utils.h"
 
 
 int main(int argc, char *argv[]) {
@@ -29,35 +29,42 @@ int main(int argc, char *argv[]) {
     using namespace kaldi;
     const char *usage =
         "Create Mel-filter bank (FBANK) feature files.\n"
-        "Usage:  compute-fbank-feats [options...] <wav-rspecifier> <feats-wspecifier>\n";
+        "Usage:  compute-fbank-feats [options...] <wav-rspecifier> "
+        "<feats-wspecifier>\n";
 
-    // construct all the global objects
+    // Construct all the global objects.
     ParseOptions po(usage);
     FbankOptions fbank_opts;
+    // Define defaults for global options.
     bool subtract_mean = false;
     BaseFloat vtln_warp = 1.0;
     std::string vtln_map_rspecifier;
     std::string utt2spk_rspecifier;
     int32 channel = -1;
     BaseFloat min_duration = 0.0;
-    // Define defaults for gobal options
     std::string output_format = "kaldi";
+    std::string utt2dur_wspecifier;
 
-    // Register the option struct
+    // Register the option struct.
     fbank_opts.Register(&po);
-    // Register the options
-    po.Register("output-format", &output_format, "Format of the output files [kaldi, htk]");
-    po.Register("subtract-mean", &subtract_mean, "Subtract mean of each feature file [CMS]; not recommended to do it this way. ");
-    po.Register("vtln-warp", &vtln_warp, "Vtln warp factor (only applicable if vtln-map not specified)");
-    po.Register("vtln-map", &vtln_map_rspecifier, "Map from utterance or speaker-id to vtln warp factor (rspecifier)");
-    po.Register("utt2spk", &utt2spk_rspecifier, "Utterance to speaker-id map (if doing VTLN and you have warps per speaker)");
-    po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right)");
-    po.Register("min-duration", &min_duration, "Minimum duration of segments to process (in seconds).");
-
-    // OPTION PARSING ..........................................................
-    //
-
-    // parse options (+filling the registered variables)
+    // Register the options.
+    po.Register("output-format", &output_format,
+                "Format of the output files [kaldi, htk]");
+    po.Register("subtract-mean", &subtract_mean, "Subtract mean of each "
+                "feature file [CMS]; not recommended to do it this way. ");
+    po.Register("vtln-warp", &vtln_warp,
+                "Vtln warp factor (only applicable if vtln-map not specified)");
+    po.Register("vtln-map", &vtln_map_rspecifier,"Map from utterance or "
+                "speaker-id to vtln warp factor (rspecifier)");
+    po.Register("utt2spk", &utt2spk_rspecifier, "Utterance to speaker-id map "
+                "(if doing VTLN and you have warps per speaker)");
+    po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, "
+                "0 -> left, 1 -> right)");
+    po.Register("min-duration", &min_duration, "Minimum duration of segments "
+                "to process (in seconds).");
+    po.Register("write-utt2dur", &utt2dur_wspecifier, "Wspecifier to write "
+                "duration of each utterance in seconds, e.g. 'ark,t:utt2dur'.");
+
     po.Read(argc, argv);
 
     if (po.NumArgs() != 2) {
@@ -71,16 +78,16 @@ int main(int argc, char *argv[]) {
 
     Fbank fbank(fbank_opts);
 
+    if (utt2spk_rspecifier != "" && vtln_map_rspecifier == "")
+      KALDI_ERR << ("The --utt2spk option is only needed if "
+                    "the --vtln-map option is used.");
+    RandomAccessBaseFloatReaderMapped vtln_map_reader(vtln_map_rspecifier,
+                                                      utt2spk_rspecifier);
+
     SequentialTableReader<WaveHolder> reader(wav_rspecifier);
     BaseFloatMatrixWriter kaldi_writer;  // typedef to TableWriter<something>.
     TableWriter<HtkMatrixHolder> htk_writer;
 
-    if (utt2spk_rspecifier != "")
-      KALDI_ASSERT(vtln_map_rspecifier != "" && "the utt2spk option is only "
-                   "needed if the vtln-map option is used.");
-    RandomAccessBaseFloatReaderMapped vtln_map_reader(vtln_map_rspecifier,
-                                                      utt2spk_rspecifier);
-
     if (output_format == "kaldi") {
       if (!kaldi_writer.Open(output_wspecifier))
         KALDI_ERR << "Could not initialize output with wspecifier "
@@ -93,6 +100,8 @@ int main(int argc, char *argv[]) {
       KALDI_ERR << "Invalid output_format string " << output_format;
     }
 
+    DoubleWriter utt2dur_writer(utt2dur_wspecifier);
+
     int32 num_utts = 0, num_success = 0;
     for (; !reader.Done(); reader.Next()) {
       num_utts++;
@@ -105,7 +114,7 @@ int main(int argc, char *argv[]) {
       }
       int32 num_chan = wave_data.Data().NumRows(), this_chan = channel;
       {  // This block works out the channel (0=left, 1=right...)
-        KALDI_ASSERT(num_chan > 0);  // should have been caught in
+        KALDI_ASSERT(num_chan > 0);  // This should have been caught in
         // reading code if no channels.
         if (channel == -1) {
           this_chan = 0;
@@ -136,10 +145,10 @@ int main(int argc, char *argv[]) {
       SubVector<BaseFloat> waveform(wave_data.Data(), this_chan);
       Matrix<BaseFloat> features;
       try {
-        fbank.ComputeFeatures(waveform, wave_data.SampFreq(), vtln_warp_local, &features);
+        fbank.ComputeFeatures(waveform, wave_data.SampFreq(),
+                              vtln_warp_local, &features);
       } catch (...) {
-        KALDI_WARN << "Failed to compute features for utterance "
-                   << utt;
+        KALDI_WARN << "Failed to compute features for utterance " << utt;
         continue;
       }
       if (subtract_mean) {
@@ -165,6 +174,9 @@ int main(int argc, char *argv[]) {
         p.second = header;
         htk_writer.Write(utt, p);
       }
+      if (utt2dur_writer.IsOpen()) {
+        utt2dur_writer.Write(utt, wave_data.Duration());
+      }
       if (num_utts % 10 == 0)
         KALDI_LOG << "Processed " << num_utts << " utterances";
       KALDI_VLOG(2) << "Processed features for key " << utt;
@@ -177,6 +189,4 @@ int main(int argc, char *argv[]) {
     std::cerr << e.what();
     return -1;
   }
-  return 0;
 }
-
diff --git a/src/featbin/compute-mfcc-feats.cc b/src/featbin/compute-mfcc-feats.cc
index 09efcd38dd0..0827d0a9360 100644
--- a/src/featbin/compute-mfcc-feats.cc
+++ b/src/featbin/compute-mfcc-feats.cc
@@ -19,33 +19,35 @@
 // limitations under the License.
 
 #include "base/kaldi-common.h"
-#include "util/common-utils.h"
 #include "feat/feature-mfcc.h"
 #include "feat/wave-reader.h"
+#include "util/common-utils.h"
 
 int main(int argc, char *argv[]) {
   try {
     using namespace kaldi;
     const char *usage =
         "Create MFCC feature files.\n"
-        "Usage:  compute-mfcc-feats [options...] <wav-rspecifier> <feats-wspecifier>\n";
+        "Usage:  compute-mfcc-feats [options...] <wav-rspecifier> "
+        "<feats-wspecifier>\n";
 
-    // construct all the global objects
+    // Construct all the global objects.
     ParseOptions po(usage);
     MfccOptions mfcc_opts;
+    // Define defaults for global options.
     bool subtract_mean = false;
     BaseFloat vtln_warp = 1.0;
     std::string vtln_map_rspecifier;
     std::string utt2spk_rspecifier;
     int32 channel = -1;
     BaseFloat min_duration = 0.0;
-    // Define defaults for gobal options
     std::string output_format = "kaldi";
+    std::string utt2dur_wspecifier;
 
-    // Register the MFCC option struct
+    // Register the MFCC option struct.
     mfcc_opts.Register(&po);
 
-    // Register the options
+    // Register the options.
     po.Register("output-format", &output_format, "Format of the output "
                 "files [kaldi, htk]");
     po.Register("subtract-mean", &subtract_mean, "Subtract mean of each "
@@ -60,6 +62,8 @@ int main(int argc, char *argv[]) {
                 "0 -> left, 1 -> right)");
     po.Register("min-duration", &min_duration, "Minimum duration of segments "
                 "to process (in seconds).");
+    po.Register("write-utt2dur", &utt2dur_wspecifier, "Wspecifier to write "
+                "duration of each utterance in seconds, e.g. 'ark,t:utt2dur'.");
 
     po.Read(argc, argv);
 
@@ -74,16 +78,16 @@ int main(int argc, char *argv[]) {
 
     Mfcc mfcc(mfcc_opts);
 
+    if (utt2spk_rspecifier != "" && vtln_map_rspecifier == "")
+      KALDI_ERR << ("The --utt2spk option is only needed if "
+                    "the --vtln-map option is used.");
+    RandomAccessBaseFloatReaderMapped vtln_map_reader(vtln_map_rspecifier,
+                                                      utt2spk_rspecifier);
+
     SequentialTableReader<WaveHolder> reader(wav_rspecifier);
     BaseFloatMatrixWriter kaldi_writer;  // typedef to TableWriter<something>.
     TableWriter<HtkMatrixHolder> htk_writer;
 
-    if (utt2spk_rspecifier != "")
-      KALDI_ASSERT(vtln_map_rspecifier != "" && "the utt2spk option is only "
-                   "needed if the vtln-map option is used.");
-    RandomAccessBaseFloatReaderMapped vtln_map_reader(vtln_map_rspecifier,
-                                                      utt2spk_rspecifier);
-    
     if (output_format == "kaldi") {
       if (!kaldi_writer.Open(output_wspecifier))
         KALDI_ERR << "Could not initialize output with wspecifier "
@@ -96,6 +100,8 @@ int main(int argc, char *argv[]) {
       KALDI_ERR << "Invalid output_format string " << output_format;
     }
 
+    DoubleWriter utt2dur_writer(utt2dur_wspecifier);
+
     int32 num_utts = 0, num_success = 0;
     for (; !reader.Done(); reader.Next()) {
       num_utts++;
@@ -139,10 +145,10 @@ int main(int argc, char *argv[]) {
       SubVector<BaseFloat> waveform(wave_data.Data(), this_chan);
       Matrix<BaseFloat> features;
       try {
-        mfcc.ComputeFeatures(waveform, wave_data.SampFreq(), vtln_warp_local, &features);
+        mfcc.ComputeFeatures(waveform, wave_data.SampFreq(),
+                             vtln_warp_local, &features);
       } catch (...) {
-        KALDI_WARN << "Failed to compute features for utterance "
-                   << utt;
+        KALDI_WARN << "Failed to compute features for utterance " << utt;
         continue;
       }
       if (subtract_mean) {
@@ -168,6 +174,9 @@ int main(int argc, char *argv[]) {
         p.second = header;
         htk_writer.Write(utt, p);
       }
+      if (utt2dur_writer.IsOpen()) {
+        utt2dur_writer.Write(utt, wave_data.Duration());
+      }
       if (num_utts % 10 == 0)
         KALDI_LOG << "Processed " << num_utts << " utterances";
       KALDI_VLOG(2) << "Processed features for key " << utt;
@@ -181,4 +190,3 @@ int main(int argc, char *argv[]) {
     return -1;
   }
 }
-

From a1343bd851722af16ae7415db8f0c2e3aac7df77 Mon Sep 17 00:00:00 2001
From: "Mortaza (Morrie) Doulaty" <mortaza.doulaty@gmail.com>
Date: Wed, 15 May 2019 20:29:48 +0100
Subject: [PATCH 114/163] [build] Initial version of Docker images for (CPU and
 GPU versions) (#3322)

---
 docker/README.md                  | 14 +++++++++++
 docker/debian9.8-cpu/Dockerfile   | 39 +++++++++++++++++++++++++++++++
 docker/ubuntu16.04-gpu/Dockerfile | 38 ++++++++++++++++++++++++++++++
 3 files changed, 91 insertions(+)
 create mode 100644 docker/README.md
 create mode 100644 docker/debian9.8-cpu/Dockerfile
 create mode 100644 docker/ubuntu16.04-gpu/Dockerfile

diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 00000000000..f4950376f52
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,14 @@
+# Kaldi Docker Images
+
+Sample usage of the CPU-based images:
+```
+docker run -it mdoulaty/kaldi:latest bash
+```
+
+Sample usage of the GPU-based images:
+
+Note: use [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) to run the GPU images.
+
+```
+docker run -it --runtime=nvidia mdoulaty/kaldi:gpu-latest bash
+```
diff --git a/docker/debian9.8-cpu/Dockerfile b/docker/debian9.8-cpu/Dockerfile
new file mode 100644
index 00000000000..f080677738b
--- /dev/null
+++ b/docker/debian9.8-cpu/Dockerfile
@@ -0,0 +1,39 @@
+
+FROM debian:9.8
+LABEL maintainer="mdoulaty@gmail.com"
+
+# Build dependencies and utilities; apt lists are deleted in the same layer.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        g++ \
+        make \
+        automake \
+        autoconf \
+        bzip2 \
+        unzip \
+        wget \
+        sox \
+        libtool \
+        git \
+        subversion \
+        python2.7 \
+        python3 \
+        zlib1g-dev \
+        ca-certificates \
+        patch \
+        vim && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN ln -s /usr/bin/python2.7 /usr/bin/python
+
+# Clone Kaldi, run tools/extras/install_mkl.sh, then build tools/ and src/.
+RUN git clone https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \
+    cd /opt/kaldi/tools && \
+    ./extras/install_mkl.sh && \
+    make -j $(nproc) && \
+    cd /opt/kaldi/src && \
+    ./configure --shared && \
+    make depend -j $(nproc) && \
+    make -j $(nproc)
+
+WORKDIR /opt/kaldi/
diff --git a/docker/ubuntu16.04-gpu/Dockerfile b/docker/ubuntu16.04-gpu/Dockerfile
new file mode 100644
index 00000000000..388c3639f24
--- /dev/null
+++ b/docker/ubuntu16.04-gpu/Dockerfile
@@ -0,0 +1,38 @@
+
+FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04
+LABEL maintainer="mdoulaty@gmail.com"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        g++ \
+        make \
+        automake \
+        autoconf \
+        bzip2 \
+        unzip \
+        wget \
+        sox \
+        libtool \
+        git \
+        subversion \
+        python2.7 \
+        python3 \
+        zlib1g-dev \
+        ca-certificates \
+        patch \
+        vim && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN ln -s /usr/bin/python2.7 /usr/bin/python
+
+# Clone Kaldi, run tools/extras/install_mkl.sh, then build tools/ and src/ (configured with --use-cuda).
+RUN git clone https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \
+    cd /opt/kaldi/tools && \
+    ./extras/install_mkl.sh && \
+    make -j $(nproc) && \
+    cd /opt/kaldi/src && \
+    ./configure --shared --use-cuda && \
+    make depend -j $(nproc) && \
+    make -j $(nproc)
+
+WORKDIR /opt/kaldi/

From 956938408b43262fa197c3081fcd98bebf0d0ccd Mon Sep 17 00:00:00 2001
From: Wonkyum <wonkyum@gridspace.com>
Date: Wed, 15 May 2019 12:39:58 -0700
Subject: [PATCH 115/163] [scripts] fix typo/bug in make_musan.py (#3327)

---
 egs/wsj/s5/steps/data/make_musan.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/egs/wsj/s5/steps/data/make_musan.py b/egs/wsj/s5/steps/data/make_musan.py
index 80b9d7cf6d4..9165fd7e522 100755
--- a/egs/wsj/s5/steps/data/make_musan.py
+++ b/egs/wsj/s5/steps/data/make_musan.py
@@ -36,7 +36,7 @@ def check_args(args):
         raise Exception('input dir {0} does not exist'.format(args.in_dir))
     if not os.path.exists(args.out_dir):
         print("Preparing {0}/musan...".format(args.out_dir))
-        os.makedirs(args.output_dir)
+        os.makedirs(args.out_dir)
 
     return args
 

From 94aef8d540410fda3f811dfaa4d092d640236a16 Mon Sep 17 00:00:00 2001
From: "kkm (aka Kirill Katsnelson)" <kkm@smartaction.com>
Date: Wed, 15 May 2019 17:06:18 -0700
Subject: [PATCH 116/163] [scripts] Trust frame_shift and utt2num_frames if
 found (#3313)

Getting utt2dur involves accessing wave files, and potentially
running full pipelines in wav.scp, which may take hours for a
large data set. If utt2num_frames exists, use it instead if
frame rate is known.

Issue: #3303
Fixes: #3297 "cat: broken pipe"
---
 egs/wsj/s5/utils/data/get_frame_shift.sh | 63 +++++++++++++-----------
 egs/wsj/s5/utils/data/get_utt2dur.sh     | 18 +++++--
 2 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/egs/wsj/s5/utils/data/get_frame_shift.sh b/egs/wsj/s5/utils/data/get_frame_shift.sh
index eaf21b37ea6..c836bde1b18 100755
--- a/egs/wsj/s5/utils/data/get_frame_shift.sh
+++ b/egs/wsj/s5/utils/data/get_frame_shift.sh
@@ -14,12 +14,16 @@
 . ./path.sh
 
 if [ $# != 1 ]; then
-  echo "Usage: $0 <datadir>"
-  echo "e.g.:"
-  echo " $0 data/train"
-  echo "This script prints the frame-shift (e.g. 0.01) to the standard out."
-  echo "If <datadir> does not contain utt2dur, this script may call utils/data/get_utt2dur.sh,"
-  echo "which will require write permission to <datadir>"
+  cat >&2 <<EOF
+Usage: frame_shift=\$($0 <datadir>)
+e.g.:  frame_shift=\$($0 data/train)
+
+This script prints the frame-shift in seconds (e.g. 0.01) to the standard out.
+Its output is intended to be captured in a shell variable.
+
+If <datadir> does not contain the file utt2dur, this script may invoke
+utils/data/get_utt2dur.sh, which will require write permission to <datadir>.
+EOF
   exit 1
 fi
 
@@ -27,6 +31,15 @@ export LC_ALL=C
 
 dir=$1
 
+if [[ -s $dir/frame_shift ]]; then
+  cat $dir/frame_shift
+  exit
+fi
+
+if [ ! -f $dir/feats.scp ]; then
+  echo "$0: $dir/feats.scp does not exist" 1>&2
+  exit 1
+fi
 
 if [ ! -s $dir/utt2dur ]; then
   if [ ! -e $dir/wav.scp ] && [ ! -s $dir/segments ]; then
@@ -35,37 +48,27 @@ if [ ! -s $dir/utt2dur ]; then
     exit 0
   fi
   echo "$0: $dir/utt2dur does not exist: creating it" 1>&2
-  utils/data/get_utt2dur.sh $dir 1>&2
+  utils/data/get_utt2dur.sh 1>&2 $dir || exit 1
 fi
 
-if [ ! -s $dir/frame_shift ]; then
-  if [ ! -f $dir/feats.scp ]; then
-    echo "$0: $dir/feats.scp does not exist" 1>&2
-    exit 1
-  fi
-
-  temp=$(mktemp /tmp/tmp.XXXX)
+temp=$(mktemp /tmp/tmp.XXXX) || exit 1
 
-  feat-to-len "scp:head -n 10 $dir/feats.scp|" ark,t:- > $temp
+feat-to-len --print-args=false "scp:head -n 10 $dir/feats.scp|" ark,t:- > $temp
 
-  if [ -z $temp ]; then
-    echo "$0: error running feat-to-len" 1>&2
-    exit 1
-  fi
-
-  frame_shift=$(head -n 10 $dir/utt2dur | paste - $temp | \
-    awk '{ dur += $2; frames += $4; } END { shift = dur / frames; if (shift > 0.01 && shift < 0.0102) shift = 0.01; print shift; }') || exit 1;
-
-  echo $frame_shift > $dir/frame_shift
+if [[ ! -s $temp ]]; then
   rm $temp
-fi
-
-frame_shift=$(cat $dir/frame_shift)
-if [ -z "$frame_shift" ]; then
-  echo "$0: Could not read get frame shift from directory $dir" 1>&2
+  echo "$0: error running feat-to-len" 1>&2
   exit 1
 fi
 
-echo $frame_shift
+frame_shift=$(head -n 10 $dir/utt2dur | paste - $temp | awk '
+      { dur += $2; frames += $4; }
+  END { shift = dur / frames;
+        if (shift > 0.01 && shift < 0.0102) shift = 0.01;
+        print shift; }') || exit 1;
 
+rm $temp
+
+echo $frame_shift > $dir/frame_shift
+echo $frame_shift
 exit 0
diff --git a/egs/wsj/s5/utils/data/get_utt2dur.sh b/egs/wsj/s5/utils/data/get_utt2dur.sh
index 995136a5575..a760981d198 100755
--- a/egs/wsj/s5/utils/data/get_utt2dur.sh
+++ b/egs/wsj/s5/utils/data/get_utt2dur.sh
@@ -23,7 +23,8 @@ if [ $# != 1 ]; then
   echo " $0 data/train"
   echo " Options:"
   echo " --frame-shift      # frame shift in seconds. Only relevant when we are"
-  echo "                    # getting duration from feats.scp (default: 0.01). "
+  echo "                    # getting duration from feats.scp, and only if the "
+  echo "                    # file frame_shift does not exist (default: 0.01). "
   exit 1
 fi
 
@@ -40,12 +41,17 @@ fi
 if [ -s $data/segments ]; then
   echo "$0: working out $data/utt2dur from $data/segments"
   awk '{len=$4-$3; print $1, len;}' < $data/segments  > $data/utt2dur
+elif [[ -s $data/frame_shift && -f $data/utt2num_frames ]]; then
+  echo "$0: computing $data/utt2dur from $data/{frame_shift,utt2num_frames}."
+  frame_shift=$(cat $data/frame_shift) || exit 1
+  # The 1.5 correction is the typical value of (frame_length-frame_shift)/frame_shift.
+  awk -v fs=$frame_shift '{ $2=($2+1.5)*fs; print }' <$data/utt2num_frames  >$data/utt2dur
 elif [ -f $data/wav.scp ]; then
   echo "$0: segments file does not exist so getting durations from wave files"
 
   # if the wav.scp contains only lines of the form
   # utt1  /foo/bar/sph2pipe -f wav /baz/foo.sph |
-  if cat $data/wav.scp | perl -e '
+  if perl <$data/wav.scp -e '
      while (<>) { s/\|\s*$/ |/;  # make sure final | is preceded by space.
              @A = split; if (!($#A == 5 && $A[1] =~ m/sph2pipe$/ &&
                                $A[2] eq "-f" && $A[3] eq "wav" && $A[5] eq "|")) { exit(1); }
@@ -102,7 +108,13 @@ elif [ -f $data/wav.scp ]; then
   fi
 elif [ -f $data/feats.scp ]; then
   echo "$0: wave file does not exist so getting durations from feats files"
-  feat-to-len scp:$data/feats.scp ark,t:- | awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' >$data/utt2dur
+  if [[ -s $data/frame_shift ]]; then
+    frame_shift=$(cat $data/frame_shift) || exit 1
+    echo "$0: using frame_shift=$frame_shift from file $data/frame_shift"
+  fi
+  # The 1.5 correction is the typical value of (frame_length-frame_shift)/frame_shift.
+  feat-to-len scp:$data/feats.scp ark,t:- |
+    awk -v frame_shift=$frame_shift '{print $1, ($2+1.5)*frame_shift}' >$data/utt2dur
 else
   echo "$0: Expected $data/wav.scp, $data/segments or $data/feats.scp to exist"
   exit 1

From 9ae4a5ca530a4ef25d19a5bb4abf4806fd825164 Mon Sep 17 00:00:00 2001
From: Wonkyum <wonkyum@gridspace.com>
Date: Thu, 16 May 2019 11:10:02 -0700
Subject: [PATCH 117/163] [scripts] typo fix in augmentation script (#3329)

Fixes typo in #3119
---
 egs/swbd/s5c/local/nnet3/multi_condition/run_aug_common.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/egs/swbd/s5c/local/nnet3/multi_condition/run_aug_common.sh b/egs/swbd/s5c/local/nnet3/multi_condition/run_aug_common.sh
index 3603dd59d79..7d36cdfaac9 100755
--- a/egs/swbd/s5c/local/nnet3/multi_condition/run_aug_common.sh
+++ b/egs/swbd/s5c/local/nnet3/multi_condition/run_aug_common.sh
@@ -194,7 +194,7 @@ if [ "$use_ivectors" == "true" ]; then
     utils/filter_scp.pl data/${ivector_trainset}_aug/utt2spk data/${train_set}_aug/feats.scp > data/${ivector_trainset}_aug/feats.scp
     utils/fix_data_dir.sh data/${ivector_trainset}_aug
     echo "$0: Creating alignments of aug data by copying alignments of clean data"
-    local/copy_ali_dir.sh --nj 40 --cmd "$train_cmd" \
+    steps/copy_ali_dir.sh --nj 40 --cmd "$train_cmd" \
         data/${ivector_trainset}_aug exp/${lda_mllt_ali} exp/${lda_mllt_ali}_aug
 
     # We need to build a small system just because we need the LDA+MLLT transform

From 74ebdeecd2de0eba704ab91bd242a31647568f19 Mon Sep 17 00:00:00 2001
From: "kkm (aka Kirill Katsnelson)" <kkm@smartaction.com>
Date: Thu, 16 May 2019 12:38:21 -0700
Subject: [PATCH 118/163] [scripts] handle frame_shift and utt2num_frames in
 utils/ (#3323)

subset_data_dir.sh has been refactored thoroughly so that its
logic is easier to follow. It has been well tested and
dogfooded.

All changes here are necessary to subset, combine and verify
utt2num_frames, and copy frame_shift to new directories where
necessary.
---
 egs/wsj/s5/utils/combine_data.sh      |  20 ++-
 egs/wsj/s5/utils/copy_data_dir.sh     |   7 +-
 egs/wsj/s5/utils/subset_data_dir.sh   | 226 +++++++++++++-------------
 egs/wsj/s5/utils/validate_data_dir.sh |  20 ++-
 4 files changed, 149 insertions(+), 124 deletions(-)

diff --git a/egs/wsj/s5/utils/combine_data.sh b/egs/wsj/s5/utils/combine_data.sh
index a43cf9d77f3..8daffcea8c5 100755
--- a/egs/wsj/s5/utils/combine_data.sh
+++ b/egs/wsj/s5/utils/combine_data.sh
@@ -42,6 +42,20 @@ for dir in $*; do
   fi
 done
 
+# Check that frame_shift values are compatible, where present together with features.
+dir_with_frame_shift=
+for dir in $*; do
+  if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then
+    if [[ $dir_with_frame_shift ]] &&
+       ! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then
+      echo "$0:error: different frame_shift in directories $dir and " \
+           "$dir_with_frame_shift. Cannot combine features."
+      exit 1;
+    fi
+    dir_with_frame_shift=$dir
+  fi
+done
+
 # W.r.t. utt2uniq file the script has different behavior compared to other files
 # it is not compulsary for it to exist in src directories, but if it exists in
 # even one it should exist in all. We will create the files where necessary
@@ -94,7 +108,7 @@ else
   echo "$0 [info]: not combining segments as it does not exist"
 fi
 
-for file in utt2spk utt2lang utt2dur reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
+for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
   exists_somewhere=false
   absent_somewhere=false
   for d in $*; do
@@ -121,6 +135,10 @@ done
 
 utils/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt
 
+if [[ $dir_with_frame_shift ]]; then
+  cp $dir_with_frame_shift/frame_shift $dest
+fi
+
 if ! $skip_fix ; then
   utils/fix_data_dir.sh $dest || exit 1;
 fi
diff --git a/egs/wsj/s5/utils/copy_data_dir.sh b/egs/wsj/s5/utils/copy_data_dir.sh
index f3b885c5e79..fbd31203e34 100755
--- a/egs/wsj/s5/utils/copy_data_dir.sh
+++ b/egs/wsj/s5/utils/copy_data_dir.sh
@@ -103,6 +103,9 @@ fi
 if [ -f $srcdir/utt2dur ]; then
   utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur
 fi
+if [ -f $srcdir/utt2num_frames ]; then
+  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames
+fi
 if [ -f $srcdir/reco2dur ]; then
   if [ -f $srcdir/segments ]; then
     cp $srcdir/reco2dur $destdir/reco2dur
@@ -116,7 +119,7 @@ fi
 if [ -f $srcdir/cmvn.scp ]; then
   utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp
 fi
-for f in stm glm ctm; do
+for f in frame_shift stm glm ctm; do
   if [ -f $srcdir/$f ]; then
     cp $srcdir/$f $destdir
   fi
@@ -126,7 +129,7 @@ rm $destdir/spk_map $destdir/utt_map
 
 echo "$0: copied data from $srcdir to $destdir"
 
-for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel stm glm ctm; do
+for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do
   if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then
     echo "$0: file $f exists in dest $destdir but not in src $srcdir.  Moving it to"
     echo " ... $destdir/.backup/$f"
diff --git a/egs/wsj/s5/utils/subset_data_dir.sh b/egs/wsj/s5/utils/subset_data_dir.sh
index 4cd3f9b7711..c05ca458221 100755
--- a/egs/wsj/s5/utils/subset_data_dir.sh
+++ b/egs/wsj/s5/utils/subset_data_dir.sh
@@ -34,42 +34,27 @@
 
 shortest=false
 perspk=false
-first_opt=""
 speakers=false
-spk_list_specified=false
-utt_list_specified=false
-
-if [ "$1" == "--per-spk" ]; then
-  perspk=true;
-  shift;
-elif [ "$1" == "--shortest" ]; then
-  shortest=true;
-  shift;
-elif [ "$1" == "--first" ]; then
-  first_opt="--first";
-  shift;
-elif [ "$1" == "--speakers" ]; then
-  speakers=true
-  shift;
-elif [ "$1" == "--last" ]; then
-  first_opt="--last";
-  shift;
-elif [ "$1" == "--spk-list" ]; then
-  spk_list_specified=true
-  shift;
-elif [ "$1" == "--utt-list" ]; then
-  utt_list_specified=true
-  shift;
-fi
-
-
-
-
-if [ $# != 3 ]; then
-  echo "Usage: "
+first_opt=
+spk_list=
+utt_list=
+
+expect_args=3
+case $1 in
+  --first|--last) first_opt=$1; shift ;;
+  --per-spk)  perspk=true; shift ;;
+  --shortest) shortest=true; shift ;;
+  --speakers) speakers=true; shift ;;
+  --spk-list) shift; spk_list=$1; shift; expect_args=2 ;;
+  --utt-list) shift; utt_list=$1; shift; expect_args=2 ;;
+  --*) echo "$0: invalid option '$1'"; exit 1
+esac
+
+if [ $# != $expect_args ]; then
+  echo "Usage:"
   echo "  subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] <srcdir> <num-utt> <destdir>"
   echo "  subset_data_dir.sh [--spk-list <speaker-list-file>] <srcdir> <destdir>"
-  echo "  subset_data_dir.sh [--utt-list <utterance-list-file>] <srcdir> <destdir>"
+  echo "  subset_data_dir.sh [--utt-list <utt-list-file>] <srcdir> <destdir>"
   echo "By default, randomly selects <num-utt> utterances from the data directory."
   echo "With --speakers, randomly selects enough speakers that we have <num-utt> utterances"
   echo "With --per-spk, selects <num-utt> utterances per speaker, if available."
@@ -77,124 +62,131 @@ if [ $# != 3 ]; then
   echo "With --last, selects the last <num-utt> utterances"
   echo "With --shortest, selects the shortest <num-utt> utterances."
   echo "With --spk-list, reads the speakers to keep from <speaker-list-file>"
+  echo "With --utt-list, reads the utterances to keep from <utt-list-file>"
   exit 1;
 fi
 
-if $spk_list_specified; then
-  spk_list=$1
-  srcdir=$2
-  destdir=$3
-elif $utt_list_specified; then
-  utt_list=$1
-  srcdir=$2
-  destdir=$3
+srcdir=$1
+if [[ $spk_list || $utt_list ]]; then
+  numutt=
+  destdir=$2
 else
-  srcdir=$1
   numutt=$2
   destdir=$3
 fi
 
-
 export LC_ALL=C
 
 if [ ! -f $srcdir/utt2spk ]; then
-  echo "subset_data_dir.sh: no such file $srcdir/utt2spk"
-  exit 1;
+  echo "$0: no such file $srcdir/utt2spk"
+  exit 1
 fi
 
-function do_filtering {
-  # assumes the utt2spk and spk2utt files already exist.
-  [ -f $srcdir/feats.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp
-  [ -f $srcdir/vad.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp
-  [ -f $srcdir/utt2lang ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang
-  [ -f $srcdir/utt2dur ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur
-  [ -f $srcdir/utt2num_frames ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames
-  [ -f $srcdir/utt2uniq ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq
-  [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp
-  [ -f $srcdir/spk2warp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp
-  [ -f $srcdir/utt2warp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp
-  [ -f $srcdir/text ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text
-  [ -f $srcdir/spk2gender ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender
-  [ -f $srcdir/cmvn.scp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp
-  if [ -f $srcdir/segments ]; then
-     utils/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments
-     awk '{print $2;}' $destdir/segments | sort | uniq > $destdir/reco # recordings.
-     # The next line would override the command above for wav.scp, which would be incorrect.
-     [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp
-     [ -f $srcdir/reco2file_and_channel ] && \
-       utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
-     [ -f $srcdir/reco2dur ] && \
-       utils/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur
-
-     # Filter the STM file for proper sclite scoring
-     # Copy over the comments from STM file
-     [ -f $srcdir/stm ] && grep "^;;" $srcdir/stm > $destdir/stm
-     [ -f $srcdir/stm ] && utils/filter_scp.pl $destdir/reco < $srcdir/stm >> $destdir/stm
-
-     rm $destdir/reco
-  else
-     awk '{print $1;}' $destdir/wav.scp | sort | uniq > $destdir/reco
-     [ -f $srcdir/reco2file_and_channel ] && \
-       utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
-     [ -f $srcdir/reco2dur ] && \
-       utils/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur
-     
-     rm $destdir/reco
-  fi
-  srcutts=`cat $srcdir/utt2spk | wc -l`
-  destutts=`cat $destdir/utt2spk | wc -l`
-  echo "$0: reducing #utt from $srcutts to $destutts"
-}
+if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then
+  echo "$0: cannot subset to more utterances than you originally had."
+  exit 1
+fi
 
+if $shortest && [ ! -f $srcdir/feats.scp ]; then
+  echo "$0: you selected --shortest but no feats.scp exist."
+  exit 1
+fi
+
+mkdir -p $destdir || exit 1
 
-if $spk_list_specified; then
-  mkdir -p $destdir
+if [[ $spk_list ]]; then
   utils/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1;
   utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1;
-  do_filtering; # bash function.
-  exit 0;
-elif $utt_list_specified; then
-  mkdir -p $destdir
+elif [[ $utt_list ]]; then
   utils/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1;
   utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1;
-  do_filtering; # bash function.
-  exit 0;
 elif $speakers; then
-  mkdir -p $destdir
-  utils/shuffle_list.pl < $srcdir/spk2utt | awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | \
+  utils/shuffle_list.pl < $srcdir/spk2utt |
+    awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' |
     sort > $destdir/spk2utt
   utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
-  do_filtering; # bash function.
-  exit 0;
 elif $perspk; then
-  mkdir -p $destdir
-  awk '{ n='$numutt'; printf("%s ",$1); skip=1; while(n*(skip+1) <= NF-1) { skip++; }
+  awk '{ n='$numutt'; printf("%s ",$1);
+         skip=1; while(n*(skip+1) <= NF-1) { skip++; }
          for(x=2; x<=NF && x <= n*skip; x += skip) { printf("%s ", $x); }
          printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt
   utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
-  do_filtering; # bash function.
-  exit 0;
 else
-  if [ $numutt -gt `cat $srcdir/utt2spk | wc -l` ]; then
-    echo "subset_data_dir.sh: cannot subset to more utterances than you originally had."
-    exit 1;
-  fi
-  mkdir -p $destdir || exit 1;
-
-  ## scripting note: $shortest evaluates to true or false
-  ## so this becomes the command true or false.
   if $shortest; then
-    # select the n shortest utterances.
+    # Select $numutt shortest utterances.
     . ./path.sh
-    [ ! -f $srcdir/feats.scp ] && echo "$0: you selected --shortest but no feats.scp exist." && exit 1;
     feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1;
-    sort -n -k2 $destdir/tmp.len | awk '{print $1}' | head -$numutt >$destdir/tmp.uttlist
+    sort -n -k2 $destdir/tmp.len |
+      awk '{print $1}' |
+      head -$numutt >$destdir/tmp.uttlist
     utils/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk
     rm $destdir/tmp.uttlist $destdir/tmp.len
   else
+    # Select $numutt random utterances.
     utils/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1;
   fi
   utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt
-  do_filtering;
-  exit 0;
 fi
+
+# Perform filtering. utt2spk and spk2utt files already exist by this point.
+# Filter by utterance.
+[ -f $srcdir/feats.scp ] &&
+  utils/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp
+[ -f $srcdir/vad.scp ] &&
+  utils/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp
+[ -f $srcdir/utt2lang ] &&
+  utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang
+[ -f $srcdir/utt2dur ] &&
+  utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur
+[ -f $srcdir/utt2num_frames ] &&
+  utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames
+[ -f $srcdir/utt2uniq ] &&
+  utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq
+[ -f $srcdir/wav.scp ] &&
+  utils/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp
+[ -f $srcdir/utt2warp ] &&
+  utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp
+[ -f $srcdir/text ] &&
+  utils/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text
+
+# Filter by speaker.
+[ -f $srcdir/spk2warp ] &&
+  utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp
+[ -f $srcdir/spk2gender ] &&
+  utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender
+[ -f $srcdir/cmvn.scp ] &&
+  utils/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp
+
+# Filter by recording-id.
+if [ -f $srcdir/segments ]; then
+  utils/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments
+  # Recording-ids are in segments.
+  awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco
+  # With segments, wav.scp is keyed by recording-id, so the filtering by utterance above is incorrect; redo it by recording-id.
+  [ -f $srcdir/wav.scp ] &&
+    utils/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp
+else
+  # No segments; recording-ids are in wav.scp.
+  awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco
+fi
+
+[ -f $srcdir/reco2file_and_channel ] &&
+  utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
+[ -f $srcdir/reco2dur ] &&
+  utils/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur
+
+# Filter the STM file for proper sclite scoring.
+# Copy over the comments from STM file.
+[ -f $srcdir/stm ] &&
+  (grep "^;;" $srcdir/stm
+   utils/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm
+
+rm $destdir/reco
+
+# Copy frame_shift if present.
+[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir
+
+srcutts=$(wc -l <$srcdir/utt2spk)
+destutts=$(wc -l <$destdir/utt2spk)
+echo "$0: reducing #utt from $srcutts to $destutts"
+exit 0
diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh
index dc06b6fa59e..c7e633ab57b 100755
--- a/egs/wsj/s5/utils/validate_data_dir.sh
+++ b/egs/wsj/s5/utils/validate_data_dir.sh
@@ -85,9 +85,7 @@ function check_sorted_and_uniq {
 }
 
 function partial_diff {
-  diff $1 $2 | head -n 6
-  echo "..."
-  diff $1 $2 | tail -n 6
+  diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6)
   n1=`cat $1 | wc -l`
   n2=`cat $2 | wc -l`
   echo "[Lengths are $1=$n1 versus $2=$n2]"
@@ -341,9 +339,23 @@ if [ -f $data/utt2dur ]; then
     exit 1;
   fi
   cat $data/utt2dur | \
-    awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1
+    awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1
 fi
 
+if [ -f $data/utt2num_frames ]; then
+  check_sorted_and_uniq $data/utt2num_frames
+  cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames
+  if ! cmp -s $tmpdir/utts{,.utt2num_frames}; then
+    echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file"
+    echo "$0: differ, partial diff is:"
+    partial_diff $tmpdir/utts{,.utt2num_frames}
+    exit 1
+  fi
+  awk <$data/utt2num_frames '{
+    if (NF != 2 || !($2 > 0) || $2 != int($2)) {
+      print "Bad line utt2num_frames:" NR ":" $0
+      exit 1 } }' || exit 1
+fi
 
 if [ -f $data/reco2dur ]; then
   check_sorted_and_uniq $data/reco2dur

From d1c49bfd5ec45a8f5e953bdbabaa9ed40c463b44 Mon Sep 17 00:00:00 2001
From: "kkm (aka Kirill Katsnelson)" <kkm@smartaction.com>
Date: Fri, 17 May 2019 09:51:28 -0700
Subject: [PATCH 119/163] [scripts] Extend combine_ali_dirs.sh to combine
 alignment lattices (#3315)

Relevant discussion:
https://groups.google.com/forum/#!topic/kaldi-help/2uxfByEAmfw
---
 egs/wsj/s5/steps/combine_ali_dirs.sh | 254 +++++++++++++++++++--------
 egs/wsj/s5/steps/combine_lat_dirs.sh |   1 +
 2 files changed, 180 insertions(+), 75 deletions(-)
 create mode 120000 egs/wsj/s5/steps/combine_lat_dirs.sh

diff --git a/egs/wsj/s5/steps/combine_ali_dirs.sh b/egs/wsj/s5/steps/combine_ali_dirs.sh
index fe704da3dc6..d2cd5d8de2a 100755
--- a/egs/wsj/s5/steps/combine_ali_dirs.sh
+++ b/egs/wsj/s5/steps/combine_ali_dirs.sh
@@ -1,105 +1,209 @@
 #!/bin/bash
 # Copyright 2016  Xiaohui Zhang  Apache 2.0.
+# Copyright 2019  SmartAction (kkm)
 
-# This srcipt operates on alignment directories, such as exp/tri4a_ali
-# the output is a new ali dir which has alignments from all the input ali dirs
+# This script combines alignment directories, such as exp/tri4a_ali, and
+# validates matching of the utterances and alignments after combining.
 
 # Begin configuration section.
 cmd=run.pl
-extra_files=
-num_jobs=4
+nj=4
+combine_lat=true
+combine_ali=true
+tolerance=10
 # End configuration section.
-echo "$0 $@"  # Print the command line for logging
+echo "$0 $@"  # Print the command line for logging.
 
-if [ -f path.sh ]; then . ./path.sh; fi
-. parse_options.sh || exit 1;
+[[ -f path.sh ]] && . ./path.sh
+. parse_options.sh || exit 1
+
+export LC_ALL=C
 
 if [[ $# -lt 3 ]]; then
-  echo "Usage: $0 [options] <data> <dest-ali-dir> <src-ali-dir1> <src-ali-dir2> ..."
-  echo "e.g.: $0 --num-jobs 32 data/train exp/tri3_ali_combined exp/tri3_ali_1 exp_tri3_ali_2"
-  echo "Options:"
-  echo " --extra-files <file1 file2...>   # specify addtional files in 'src-ali-dir1' to copy"
-  echo " --num-jobs <nj>                  # number of jobs used to split the data directory."
-  echo " Note, files that don't appear in the first source dir will not be added even if they appear in later ones."
-  echo " Other than alignments, only files from the first src ali dir are copied."
+  cat >&2 <<EOF
+Usage: $0 [options] <data> <dest-dir> <src-dir1> <src-dir2> ...
+ e.g.: $0 --nj 32 data/train exp/tri3_ali_combined exp/tri3_ali_1 exp_tri3_ali_2
+Options:
+ --nj <nj>              # number of jobs to split combined archives [4]
+ --combine_ali false    # merge ali.*.gz if present [true]
+ --combine_lat false    # merge lat.*.gz if present [true]
+ --tolerance <int,%>    # maximum percentage of missing alignments or lattices
+                        # w.r.t. total utterances in <data> before error is
+                        # reported [10]
+
+The script checks that certain important files are present and compatible in all
+source directories (phones.txt, tree); other are copied from the first source
+(cmvn_opts, final.mdl) without much checking.
+
+Both --combine_ali and --combine_lat are true by default, but the script
+proceeds with a warning if directories do not contain either alignments or
+alignment lattices. Check for files ali.1.gz and/or lat.1.gz in the <dest-dir>
+after the script completes if additional programmatic check is required.
+EOF
   exit 1;
 fi
 
-data=$1;
-shift;
-dest=$1;
-shift;
-first_src=$1;
-
-mkdir -p $dest;
-rm $dest/{ali.*.gz,num_jobs} 2>/dev/null
-
-cp $first_src/phones.txt $dest 2>/dev/null
-
-export LC_ALL=C
+if [[ $combine_lat != true && $combine_ali != true ]]; then  # "false" is a non-empty string, so compare explicitly.
+  echo "$0: at least one of --combine_lat and --combine_ali must be true"
+  exit 1
+fi
 
-for dir in $*; do
-  if [ ! -f $dir/ali.1.gz ]; then
-    echo "$0: check if alignments (ali.*.gz) are present in $dir."
-    exit 1;
+data=$1
+dest=$2
+shift 2
+first_src=$1
+
+do_ali=$combine_ali
+do_lat=$combine_lat
+
+# Check if alignments and/or lattices are present. Since we combine whichever
+# of the two is present, a missing kind only produces a warning. Also verify
+# that the target is different from every source; we cannot combine in-place,
+# and a lot of damage could result.
+for src in $@; do
+  if [[ "$(cd 2>/dev/null -P -- "$src" && pwd)" = \
+        "$(cd 2>/dev/null -P -- "$dest" && pwd)" ]]; then
+    echo "$0: error: Source $src is same as target $dest."
+    exit 1
+  fi
+  if $do_ali && [[ ! -f $src/ali.1.gz ]]; then
+    echo "$0: warning: Alignments (ali.*.gz) are not present in $src, not" \
+         "combining. Consider '--combine_ali false' to suppress this warning."
+    do_ali=false
+  fi
+  if $do_lat && [[ ! -f $src/lat.1.gz ]]; then
+    echo "$0: warning: Alignment lattices (lat.*.gz) are not present in $src,"\
+      "not combining. Consider '--combine_lat false' to suppress this warning."
+    do_lat=false
   fi
 done
 
-for dir in $*; do
-  for f in tree; do
-    diff $first_src/$f $dir/$f 1>/dev/null 2>&1
-    if [ $? -ne 0 ]; then
-      echo "$0: Cannot combine alignment directories with different $f files."
-    fi
-  done
-done
+if ! $do_ali && ! $do_lat; then
+  echo "$0: error: Cannot combine directories."
+  exit 1
+fi
 
-for f in final.mdl tree cmvn_opts num_jobs $extra_files; do
+# Verify that required files are present in the first directory.
+for f in cmvn_opts final.mdl num_jobs phones.txt tree; do
   if [ ! -f $first_src/$f ]; then
-    echo "combine_ali_dir.sh: no such file $first_src/$f"
-    exit 1;
+    echo "$0: error: Required source file $first_src/$f is missing."
+    exit 1
   fi
-  cp $first_src/$f $dest/
 done
 
-src_id=0
-temp_dir=$dest/temp
-[ -d $temp_dir ] && rm -r $temp_dir;
-mkdir -p $temp_dir
-echo "$0: dumping alignments in each source directory as single archive and index."
-for dir in $*; do
-  src_id=$((src_id + 1))
-  cur_num_jobs=$(cat $dir/num_jobs) || exit 1;
-  alis=$(for n in $(seq $cur_num_jobs); do echo -n "$dir/ali.$n.gz "; done)
-  $cmd $dir/log/copy_alignments.log \
-    copy-int-vector "ark:gunzip -c $alis|" \
-    ark,scp:$temp_dir/ali.$src_id.ark,$temp_dir/ali.$src_id.scp || exit 1;
+# Verify that phones and trees are compatible in all directories, and that
+# num_jobs files are present, too.
+for src in $@; do
+  if [[ $src != $first_src ]]; then
+    if [[ ! -f $src/num_jobs ]]; then
+      echo "$0: error: Required source file $src/num_jobs is missing."
+      exit 1
+    fi
+    if ! cmp -s $first_src/tree $src/tree; then
+      echo "$0: error: tree $src/tree is either missing or not the" \
+           "same as $first_src/tree."
+      exit 1
+    fi
+    if [[ ! -f $src/phones.txt ]]; then
+      echo "$0: error: Required source file $src/phones.txt is missing."
+      exit 1
+    fi
+    utils/lang/check_phones_compatible.sh $first_src/phones.txt \
+                                          $src/phones.txt || exit 1
+  fi
 done
-sort -m $temp_dir/ali.*.scp > $temp_dir/ali.scp || exit 1;
 
-echo "$0: splitting data to get reference utt2spk for individual ali.JOB.gz files."
-utils/split_data.sh $data $num_jobs || exit 1;
+# All checks passed, ok to prepare directory. Copy model and other files from
+# the first source, as they are either checked to be compatible, or we do not
+# care if they are.
+mkdir -p $dest || exit 1
+rm -f $dest/{cmvn_opts,final.mdl,num_jobs,phones.txt,tree}
+$do_ali && rm -f $dest/ali.*.{gz,scp}
+$do_lat && rm -f $dest/lat.*.{gz,scp}
+cp $first_src/{cmvn_opts,final.mdl,phones.txt,tree} $dest/ || exit 1
+cp $first_src/frame_subsampling_factor $dest/ 2>/dev/null  # If present.
+echo $nj > $dest/num_jobs || exit 1
+
+# Make temporary directory, delete on signal, but not on 'exit 1'.
+temp_dir=$(mktemp -d $dest/temp.XXXXXX) || exit 1
+cleanup() { rm -rf "$temp_dir"; }
+trap cleanup HUP INT TERM
+echo "$0: note: Temporary directory $temp_dir will not be deleted in case of" \
+     "script failure, so you could examine it for troubleshooting."
+
+
+# This function may be called twice, once to combine alignments and the second
+# time to combine lattices. The two invocations are as follows:
+#   do_combine ali alignments copy-int-vector $@
+#   do_combine lat lattices   lattice-copy $@
+# where 'ali'/'lat' is a prefix to archive name, 'alignments'/'lattices' go into
+# log messages and logfile names, and 'copy-int-vector'/'lattice-copy' is the
+# program used to copy corresponding objects.
+do_combine() {
+  local ark=$1 entities=$2 copy_program=$3
+  shift 3
+
+  echo "$0: Gathering $entities from each source directory."
+  # Assign all source gzipped archive names to an exported variable, one each
+  # per source directory, so that we can copy archives in a job per source.
+  src_id=0
+  for src in $@; do
+    src_id=$((src_id + 1))
+    nj_src=$(cat $src/num_jobs) || exit 1
+    # Create and export variable src_arks_${src_id} for the job runner.
+    # Each numbered variable will contain the list of archives, e. g.:
+    # src_arks_1="exp/tri3_ali/ali.1.gz exp/tri3_ali/ali.2.gz ..."
+    # ('printf' repeats its format as long as there are more arguments).
+    printf -v src_arks_${src_id} "$src/$ark.%d.gz " $(seq $nj_src)
+    export src_arks_${src_id}
+  done
 
-echo "$0: splitting the alignments to appropriate chunks according to the reference utt2spk files."
-utils/filter_scps.pl JOB=1:$num_jobs \
-  $data/split$num_jobs/JOB/utt2spk $temp_dir/ali.scp $temp_dir/ali.JOB.scp
+  # Gather archives in parallel jobs.
+  $cmd JOB=1:$src_id $dest/log/gather_$entities.JOB.log \
+    $copy_program \
+      "ark:gunzip -c \${src_arks_JOB} |" \
+      "ark,scp:$temp_dir/$ark.JOB.ark,$temp_dir/$ark.JOB.scp" || exit 1
+
+  # Merge (presumed already sorted) scp's into a single script.
+  sort -m $temp_dir/$ark.*.scp > $temp_dir/$ark.scp || exit 1
+
+  echo "$0: Splitting combined $entities into $nj archives on speaker boundary."
+  $cmd JOB=1:$nj $dest/log/chop_combined_$entities.JOB.log \
+    $copy_program \
+      "scp:utils/split_scp.pl --utt2spk=$data/utt2spk --one-based -j $nj JOB $temp_dir/$ark.scp |" \
+      "ark:| gzip -c > $dest/$ark.JOB.gz" || exit 1
+
+  # Get some interesting stats, and signal an error if error threshold exceeded.
+  n_utt=$(wc -l <$data/utt2spk)
+  n_ali=$(wc -l <$temp_dir/$ark.scp)
+  n_ali_no_utt=$(join -j1 -v2 $data/utt2spk $temp_dir/$ark.scp | wc -l)
+  n_utt_no_ali=$(join -j1 -v1 $data/utt2spk $temp_dir/$ark.scp | wc -l)
+  n_utt_no_ali_pct=$(perl -e "print int($n_utt_no_ali/$n_utt * 100 + .5);")
+  echo "$0: Combined $n_ali $entities for $n_utt utterances." \
+       "There were $n_utt_no_ali utterances (${n_utt_no_ali_pct}%) without" \
+       "$entities, and $n_ali_no_utt $entities not matching any utterance."
+
+  if (( $n_utt_no_ali_pct >= $tolerance )); then
+    echo "$0: error: Percentage of utterances missing $entities," \
+         "${n_utt_no_ali_pct}%, is at or above error tolerance ${tolerance}%."
+    exit 1
+  fi
 
-for i in `seq 1 $num_jobs`; do
-    copy-int-vector scp:$temp_dir/ali.${i}.scp "ark:|gzip -c >$dest/ali.$i.gz" || exit 1;
-done
+  return 0
+}
 
-echo $num_jobs > $dest/num_jobs  || exit 1
+# Do the actual combining. Do not check returned exit code, as
+# the function always calls 'exit 1' on failure.
+$do_ali && do_combine ali 'alignments' copy-int-vector "$@"
+$do_lat && do_combine lat 'lattices' lattice-copy "$@"
 
-echo "$0: checking the alignment files generated have at least 90% of the utterances."
-for i in `seq 1 $num_jobs`; do
-  num_lines=`cat $temp_dir/ali.$i.scp | wc -l` || exit 1;
-  num_lines_tot=`cat $data/split$num_jobs/$i/utt2spk | wc -l` || exit 1;
-  python -c "import sys;
-percent = 100.0 * float($num_lines) / $num_lines_tot
-if percent < 90 :
-  print ('$dest/ali.$i.gz {0}% utterances missing.'.format(percent))"  || exit 1;
-done
-rm -r $temp_dir 2>/dev/null
+# Delete the temporary directory on success.
+cleanup
 
-echo "Combined alignments and stored in $dest"
+what=
+$do_ali && what+='alignments '
+$do_ali && $do_lat && what+='and '
+$do_lat && what+='lattices '
+echo "$0: Stored combined ${what}in $dest"  # No period, interferes with
+                                            # copy/paste from tty emulator.
 exit 0
diff --git a/egs/wsj/s5/steps/combine_lat_dirs.sh b/egs/wsj/s5/steps/combine_lat_dirs.sh
new file mode 120000
index 00000000000..9cc58c3a616
--- /dev/null
+++ b/egs/wsj/s5/steps/combine_lat_dirs.sh
@@ -0,0 +1 @@
+combine_ali_dirs.sh
\ No newline at end of file

From bcfcad7a6dfaabeaec98e3a4b0c095f1eef7ed3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tanel=20Alum=C3=A4e?= <alumae@gmail.com>
Date: Fri, 17 May 2019 21:59:39 +0300
Subject: [PATCH 120/163] [src] Fix rare case when segment end rounding
 overshoots file end in extract-segments (#3331)

---
 src/featbin/extract-segments.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/featbin/extract-segments.cc b/src/featbin/extract-segments.cc
index 7791fb4307b..bda79879483 100644
--- a/src/featbin/extract-segments.cc
+++ b/src/featbin/extract-segments.cc
@@ -180,7 +180,10 @@ int main(int argc, char *argv[]) {
       // conversion requires a proper rounding.
       int32 start_samp = static_cast<int32>(start * samp_freq + 0.5f),
           end_samp = static_cast<int32>(end * samp_freq + 0.5f);
-
+    
+      if (end_samp > num_samp) 
+        end_samp = num_samp;
+     
       // Get the range of data from the orignial wave_data matrix.
       SubMatrix<BaseFloat> segment_matrix(wave_data, channel, 1,
                                           start_samp, end_samp - start_samp);

From 264372c8254db337f7ed45d0791ab9b1b323d4a3 Mon Sep 17 00:00:00 2001
From: phanisankar-nidadavolu
 <32964714+phanisankar-nidadavolu@users.noreply.github.com>
Date: Mon, 20 May 2019 12:42:04 -0400
Subject: [PATCH 121/163] [scripts] Change --modify-spk-id default to False;
 back-compatibility fix for #3119 (#3334)

---
 egs/wsj/s5/steps/data/augment_data_dir.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/egs/wsj/s5/steps/data/augment_data_dir.py b/egs/wsj/s5/steps/data/augment_data_dir.py
index f9aaaf40d59..0274350e133 100755
--- a/egs/wsj/s5/steps/data/augment_data_dir.py
+++ b/egs/wsj/s5/steps/data/augment_data_dir.py
@@ -51,7 +51,7 @@ def get_args():
     parser.add_argument('--random-seed', type=int, dest = "random_seed",
                         default = 123, help='Random seed.')
     parser.add_argument("--modify-spk-id", type=str,
-                        dest='modify_spk_id', default=True,
+                        dest='modify_spk_id', default=False,
                         action=common_lib.StrToBoolAction,
                         choices=["true", "false"],
                         help='Utt prefix or suffix would be added to the spk id '

From 485c2486051324c69d1fc53cca6457312c3adca2 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Mon, 20 May 2019 14:32:47 -0400
Subject: [PATCH 122/163] [build] Add easier configure option in failure
 message of configure (#3335)

---
 src/configure | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/configure b/src/configure
index 3fb298ea240..0c8cc710a94 100755
--- a/src/configure
+++ b/src/configure
@@ -1179,7 +1179,7 @@ elif [ "`uname`" == "Linux" ]; then
       echo -n "Configuring MKL library directory: "
       MKLLIBDIR=`linux_configure_mkllibdir $MKLROOT`
       if [ $? -ne 0 ]; then
-        failure "MKL libraries could not be found. Please use the switch --mkl-libdir "
+        failure "MKL libraries could not be found. Please use the switch --mkl-libdir or try another math library, e.g. --mathlib=ATLAS (would be slower)"
       else
         echo "Found: $MKLLIBDIR"
       fi

From e3ece34dc55a9bd31c999f0cce2f184a491e1ef2 Mon Sep 17 00:00:00 2001
From: Shujian2015 <Shujian2015@users.noreply.github.com>
Date: Wed, 22 May 2019 14:31:26 -0400
Subject: [PATCH 123/163] [scripts,minor] Fix typo in comment (#3338)

---
 scripts/rnnlm/get_unigram_probs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/rnnlm/get_unigram_probs.py b/scripts/rnnlm/get_unigram_probs.py
index e3189b26a92..52e637a0e2d 100755
--- a/scripts/rnnlm/get_unigram_probs.py
+++ b/scripts/rnnlm/get_unigram_probs.py
@@ -27,7 +27,7 @@
                     help="File that specifies multiplicities and weights for each data source: "
                     "e.g. if <text_dir> contains foo.txt and bar.txt, then should have lines "
                     "like 'foo 1 0.5' and 'bar 5 1.5'.  These "
-                    "don't have to sum to on.")
+                    "don't have to sum to one.")
 parser.add_argument("--smooth-unigram-counts", type=float, default=1.0,
                     help="Specify the constant for smoothing. We will add "
                          "(smooth_unigram_counts * num_words_with_non_zero_counts / vocab_size) "

From d03c16e1fa506f87a89cdb921f5439e5d95b0c63 Mon Sep 17 00:00:00 2001
From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com>
Date: Thu, 23 May 2019 23:26:37 +0530
Subject: [PATCH 124/163] [src,egs] Add option for applying SVD on trained
 models (#3272)

---
 .../s5/local/chain/run_tdnn_lstm_1a_svd.sh    | 448 ++++++++++++++++++
 src/nnet3/nnet-utils.cc                       | 181 +++++--
 src/nnet3/nnet-utils.h                        |   9 +-
 3 files changed, 588 insertions(+), 50 deletions(-)
 create mode 100644 egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh

diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh
new file mode 100644
index 00000000000..5beb2e74a9a
--- /dev/null
+++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh
@@ -0,0 +1,448 @@
+#!/bin/bash
+#
+# Copyright 2018  Nagendra Kumar Goel,
+#            Saikiran Valluri, Govivace.Inc -  Apache 2.0
+
+# The script is organized as follows.
+# First we train the baseline LSTMP-TDNN config chain model for a few epochs on the (Fisher+swbd)-english data.
+# Then we perform SVD-based refactoring of all the Affine components in this baseline final.mdl,
+# in order to reduce the overall model parameter size,
+# as determined by the bottleneck dim value or the Energy and Shrinkage threshold values.
+# Finally, we fine-tune the weight parameters of the refactored model on the entire Fisher + Switchboard data for a single epoch.
+
+# Command used for comparing  WERs of decoding on different testsets using pre-SVD and SVD models:
+#  ./local/chain/compare_wer_general.sh --looped tdnn_lstm_1a_sp tdnn_lstm_1a_svd_sp
+#
+# Please run this entire script till the end before running the above WER compare command...
+
+
+# System                tdnn_lstm_1a_sp
+# WER on eval2000(tg)        12.3
+#           [looped:]        12.2
+# WER on eval2000(fg)        12.1
+#           [looped:]        12.1
+# WER on eval2000(fg)
+#  [SVD retrained + looped]  12.1
+# WER on rt03(tg)            11.6
+#           [looped:]        11.6
+# WER on rt03(tg)
+#  [SVD retrained]           12
+# WER on rt03(fg)            11.3
+#           [looped:]        11.3
+# Final train prob         -0.074
+# Final valid prob         -0.084
+# Final train prob (xent)        -0.882
+# Final valid prob (xent)       -0.9393
+
+# WER stats for eval2000 using tdnn_lstm_1a_sp
+#           | #Snt #Wrd  | Corr Sub Del Ins Err  S.Err |
+# %WER 16.0 | 2628 21594 | 86.3 9.0 4.7 2.3 16.0 54.4 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys
+# %WER 12.3 | 4459 42989 | 89.4 7.1 3.5 1.7 12.3 49.8 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys
+# %WER 8.4 | 1831 21395 | 92.7 5.1 2.2 1.1 8.4 42.3 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys
+# %WER 15.9 | 2628 21594 | 86.4 8.9 4.7 2.3 15.9 54.3 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys
+# %WER 12.1 | 4459 42989 | 89.6 6.9 3.5 1.7 12.1 49.2 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys
+# %WER 8.2 | 1831 21395 | 93.1 5.1 1.8 1.3 8.2 41.7 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.swbd.filt.sys
+
+# WER stats for rt03 using tdnn_lstm_1a_sp
+# %WER 9.6 | 3970 36721 | 91.5 5.5 3.0 1.1 9.6 41.2 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys
+# %WER 11.6 | 8420 76157 | 89.7 6.8 3.4 1.4 11.6 43.0 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.filt.sys
+# %WER 13.3 | 4450 39436 | 88.0 7.4 4.6 1.3 13.3 44.5 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys
+# %WER 9.4 | 3970 36721 | 91.8 5.3 2.9 1.1 9.4 40.3 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_fg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys
+# %WER 11.3 | 8420 76157 | 89.9 6.4 3.7 1.2 11.3 42.4 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.filt.sys
+# %WER 13.1 | 4450 39436 | 88.3 7.5 4.2 1.4 13.1 44.0 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys
+
+# WER stats for rt03 using tdnn_lstm_1a_svd_sp
+# %WER 9.7 | 3970 36721 | 91.3 5.9 2.8 1.0 9.7 40.0 | exp/chain/tdnn_lstm_1a_svd_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys
+# %WER 12  | 8420 76157 | 89.3 7.3 3.4 1.3 12.0 42.0 | exp/chain/tdnn_lstm_1a_svd_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.filt.sys
+# %WER 14.1 | 4450 39436 | 87.4 8.2 4.3 1.5 14.1 44.6 | exp/chain/tdnn_lstm_1a_svd_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys      
+
+
+set -e
+
+# configs for 'chain'
+stage=0
+train_stage=-20
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_lstm_1a # Note: _sp will get added to this if $speed_perturb == true.
+svd_dir=${dir}_svd # Note: _sp will get added to this if $speed_perturb == true.
+decode_iter=
+decode_dir_affix=
+
+# training options
+leftmost_questions_truncate=-1
+chunk_width=150
+chunk_left_context=40
+chunk_right_context=0
+xent_regularize=0.025
+self_repair_scale=0.00001
+label_delay=5
+# decode options
+extra_left_context=50
+extra_right_context=0
+frames_per_chunk=
+
+remove_egs=false
+common_egs_dir=
+
+affix=
+
+# config for svd
+apply_svd=true
+energy_threshold=0.81
+shrinkage_threshold=0.64
+primary_lr_factor=0.25
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix
+svd_dir=${svd_dir}$suffix
+build_tree_train_set=train_nodup
+train_set=train_nodup_sp
+build_tree_ali_dir=exp/tri5a_ali
+treedir=exp/chain/tri6_tree
+lang=data/lang_chain
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+if [ $stage -le 9 ]; then
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat $build_tree_ali_dir/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri5a exp/tri5a_lats_nodup$suffix
+  rm exp/tri5a_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+if [ $stage -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --context-opts "--context-width=2 --central-position=1" \
+      --cmd "$train_cmd" 11000 data/$build_tree_train_set $lang $build_tree_ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+  lstm_opts="decay-time=20"
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-2,-1,0,1,2, ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-renorm-layer name=tdnn1 dim=1024
+  relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024
+  relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024
+
+  # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults
+  lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts 
+  relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024
+  relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024
+  lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts
+  relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024
+  relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024
+  lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts
+
+  ## adding the layers for chain branch
+  output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5
+
+  # adding the layers for xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 13 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd "$decode_cmd" \
+    --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.00005 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --trainer.num-chunk-per-minibatch 64 \
+    --trainer.frames-per-iter 1200000 \
+    --trainer.max-param-change 2.0 \
+    --trainer.num-epochs 4 \
+    --trainer.optimization.shrink-value 0.99 \
+    --trainer.optimization.num-jobs-initial 3 \
+    --trainer.optimization.num-jobs-final 16 \
+    --trainer.optimization.initial-effective-lrate 0.001 \
+    --trainer.optimization.final-effective-lrate 0.0001 \
+    --trainer.optimization.momentum 0.0 \
+    --trainer.deriv-truncate-margin 8 \
+    --egs.stage $get_egs_stage \
+    --egs.opts "--frames-overlap-per-eg 0" \
+    --egs.chunk-width $chunk_width \
+    --egs.chunk-left-context $chunk_left_context \
+    --egs.chunk-right-context $chunk_right_context \
+    --egs.chunk-left-context-initial 0 \
+    --egs.chunk-right-context-final 0 \
+    --egs.dir "$common_egs_dir" \
+    --cleanup.remove-egs $remove_egs \
+    --feat-dir data/${train_set}_hires \
+    --tree-dir $treedir \
+    --lat-dir exp/tri5a_lats_nodup$suffix \
+    --dir $dir  || exit 1;
+fi
+
+src_mdl=${dir}/final.mdl
+if $apply_svd && [ $stage -le 14 ]; then
+  # model compression using SVD
+
+  # threshold configs for tdnn layers
+  mkdir -p $svd_dir/configs
+  edits_config=$svd_dir/configs/final.config
+  common_egs_dir=$dir/egs
+  cat <<EOF > ${edits_config}
+  set-learning-rate-factor learning-rate-factor=$primary_lr_factor
+  apply-svd name=* energy-threshold=$energy_threshold shrinkage-threshold=$shrinkage_threshold
+EOF
+
+  # Copy files / directories from source directory
+  cp ${dir}/{cmvn_opts,tree,frame_subsampling_factor,0.trans_mdl,normalization.fst,den.fst} $svd_dir/.
+
+  # Generate initial model from trained model
+  $train_cmd $svd_dir/log/generate_input_mdl.log \
+    nnet3-am-copy --edits-config=$edits_config $src_mdl $svd_dir/input.raw
+
+  # Retrain the model for 1 epoch
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd "$decode_cmd" \
+    --trainer.input-model $svd_dir/input.raw \
+    --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.00005 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --trainer.num-chunk-per-minibatch 64 \
+    --trainer.frames-per-iter 1200000 \
+    --trainer.max-param-change 2.0 \
+    --trainer.num-epochs 1 \
+    --trainer.optimization.shrink-value 0.99 \
+    --trainer.optimization.num-jobs-initial 3 \
+    --trainer.optimization.num-jobs-final 16 \
+    --trainer.optimization.initial-effective-lrate 0.001 \
+    --trainer.optimization.final-effective-lrate 0.0001 \
+    --trainer.optimization.momentum 0.0 \
+    --trainer.deriv-truncate-margin 8 \
+    --egs.stage $get_egs_stage \
+    --egs.opts "--frames-overlap-per-eg 0" \
+    --egs.chunk-width $chunk_width \
+    --egs.chunk-left-context $chunk_left_context \
+    --egs.chunk-right-context $chunk_right_context \
+    --egs.chunk-left-context-initial 0 \
+    --egs.chunk-right-context-final 0 \
+    --egs.dir "$common_egs_dir" \
+    --cleanup.remove-egs $remove_egs \
+    --feat-dir data/${train_set}_hires \
+    --tree-dir $treedir \
+    --lat-dir exp/tri5a_lats_nodup$suffix \
+    --dir ${svd_dir}  || exit 1;
+fi
+
+if [ $stage -le 15 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
+fi
+
+decode_suff=fsh_sw1_tg
+graph_dir=$dir/graph_fsh_sw1_tg
+
+if [ $stage -le 16 ]; then
+  [ -z $extra_left_context ] && extra_left_context=$chunk_left_context;
+  [ -z $extra_right_context ] && extra_right_context=$chunk_right_context;
+  [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width;
+  if [ ! -z $decode_iter ]; then
+    iter_opts=" --iter $decode_iter "
+  fi
+  for decode_set in rt03 eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --nj 50 --cmd "$decode_cmd" $iter_opts \
+          --extra-left-context $extra_left_context  \
+          --extra-right-context $extra_right_context  \
+          --extra-left-context-initial 0 \
+          --extra-right-context-final 0 \
+          --frames-per-chunk "$frames_per_chunk" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires \
+         $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+
+test_online_decoding=true
+lang=data/lang_fsh_sw1_tg
+if $test_online_decoding && [ $stage -le 17 ]; then
+  # note: if the features change (e.g. you add pitch features), you will have to
+  # change the options of the following command line.
+  steps/online/nnet3/prepare_online_decoding.sh \
+       --mfcc-config conf/mfcc_hires.conf \
+       $lang exp/nnet3/extractor $dir ${dir}_online
+
+  rm $dir/.error 2>/dev/null || true
+  for decode_set in rt03 eval2000; do
+    (
+      # note: we just give it "$decode_set" as it only uses the wav.scp, the
+      # feature type does not matter.
+      steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \
+          --acwt 1.0 --post-decode-acwt 10.0 \
+         $graph_dir data/${decode_set}_hires \
+         ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1;
+      # Rescoring reads the lattices written just above, so it must use the same directory name.
+      if $has_fisher; then
+        steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+          data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \
+          ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_fsh_sw1_{tg,fg} || exit 1;
+      fi
+    ) || touch $dir/.error &
+  done
+  wait
+  if [ -f $dir/.error ]; then
+    echo "$0: something went wrong in online decoding"
+    exit 1
+  fi
+fi
+
+if $apply_svd; then
+  # Decoding the svd retrained model.
+  dir=$svd_dir
+fi
+
+if [ $stage -le 18 ]; then
+  [ -z $extra_left_context ] && extra_left_context=$chunk_left_context;
+  [ -z $extra_right_context ] && extra_right_context=$chunk_right_context;
+  [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width;
+  if [ ! -z $decode_iter ]; then
+    iter_opts=" --iter $decode_iter "
+  fi
+  for decode_set in rt03 eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --nj 50 --cmd "$decode_cmd" $iter_opts \
+          --extra-left-context $extra_left_context  \
+          --extra-right-context $extra_right_context  \
+          --extra-left-context-initial 0 \
+          --extra-right-context-final 0 \
+          --frames-per-chunk "$frames_per_chunk" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires \
+         $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+
+test_online_decoding=true
+lang=data/lang_fsh_sw1_tg
+if $test_online_decoding && [ $stage -le 19 ]; then
+  # note: if the features change (e.g. you add pitch features), you will have to
+  # change the options of the following command line.
+  steps/online/nnet3/prepare_online_decoding.sh \
+       --mfcc-config conf/mfcc_hires.conf \
+       $lang exp/nnet3/extractor $dir ${dir}_online
+
+  rm $dir/.error 2>/dev/null || true
+  for decode_set in rt03 eval2000; do
+    (
+      # note: we just give it "$decode_set" as it only uses the wav.scp, the
+      # feature type does not matter.
+      steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \
+          --acwt 1.0 --post-decode-acwt 10.0 \
+         $graph_dir data/${decode_set}_hires \
+         ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1;
+      # Rescoring reads the lattices written just above, so it must use the same directory name.
+      if $has_fisher; then
+        steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+          data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \
+          ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_fsh_sw1_{tg,fg} || exit 1;
+      fi
+    ) || touch $dir/.error &
+  done
+  wait
+  if [ -f $dir/.error ]; then
+    echo "$0: something went wrong in online decoding"
+    exit 1
+  fi
+fi
+
+exit 0;
diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc
index 541d2735529..5ab9126f0b5 100644
--- a/src/nnet3/nnet-utils.cc
+++ b/src/nnet3/nnet-utils.cc
@@ -630,13 +630,37 @@ void FindOrphanNodes(const Nnet &nnet, std::vector<int32> *nodes) {
 }
 
 
+// Parameters used in applying SVD:
+// 1. Energy threshold : For each affine weights layer in the original baseline nnet3 model,
+//    we perform SVD-based factoring of the weights matrix of the layer
+//    into a diagonal matrix of singular values and two matrices of singular vectors.
+//
+//    SVD : W = U E V^T, where U and V hold the singular vectors and E is the
+//    diagonal matrix of singular values.
+//    We take the center matrix E, and keep only the singular values which contribute
+//    (energy-threshold) times the total energy of the singular values.
+//    The singular values are sorted in descending order and the smallest
+//    values are pruned away, keeping the shortest prefix whose energy (sum of squares)
+//    is at least (energy-threshold * total initial energy). The values which
+//    are pruned away are dropped from the diagonal matrix,
+//    and the weights matrix after SVD is derived with reduced dimensions.
+//
+// 2. Shrinkage-threshold : If the shrinkage ratio (number of weights after SVD
+//    divided by the number before) is higher than shrinkage-threshold for a layer,
+//    the SVD process is skipped for that particular affine weights layer.
+//
+
 // this class implements the internals of the edit directive 'apply-svd'.
 class SvdApplier {
  public:
   SvdApplier(const std::string component_name_pattern,
              int32 bottleneck_dim,
+             BaseFloat energy_threshold,
+             BaseFloat shrinkage_threshold,
              Nnet *nnet): nnet_(nnet),
                           bottleneck_dim_(bottleneck_dim),
+        		  energy_threshold_(energy_threshold),
+          		  shrinkage_threshold_(shrinkage_threshold),
                           component_name_pattern_(component_name_pattern) { }
   void ApplySvd() {
     DecomposeComponents();
@@ -673,43 +697,70 @@ class SvdApplier {
                      << " -> " << output_dim;
           continue;
         }
-        size_t n = modified_component_info_.size();
-        modification_index_[c] = n;
-        modified_component_info_.resize(n + 1);
-        ModifiedComponentInfo &info = modified_component_info_[n];
-        info.component_index = c;
-        info.component_name = component_name;
         Component *component_a = NULL, *component_b = NULL;
-        info.component_name_a = component_name + "_a";
-        info.component_name_b = component_name + "_b";
-        if (nnet_->GetComponentIndex(info.component_name_a) >= 0)
-          KALDI_ERR << "Neural network already has a component named "
-                    << info.component_name_a;
-        if (nnet_->GetComponentIndex(info.component_name_b) >= 0)
-          KALDI_ERR << "Neural network already has a component named "
-                    << info.component_name_b;
-        DecomposeComponent(component_name, *affine, &component_a, &component_b);
-        info.component_a_index = nnet_->AddComponent(info.component_name_a,
-                                                     component_a);
-        info.component_b_index = nnet_->AddComponent(info.component_name_b,
-                                                     component_b);
+	if (DecomposeComponent(component_name, *affine, &component_a, &component_b)) {
+	  size_t n = modified_component_info_.size();
+	  modification_index_[c] = n;
+	  modified_component_info_.resize(n + 1);
+	  ModifiedComponentInfo &info = modified_component_info_[n];
+	  info.component_index = c;
+	  info.component_name = component_name;
+	  info.component_name_a = component_name + "_a";
+	  info.component_name_b = component_name + "_b";
+	  if (nnet_->GetComponentIndex(info.component_name_a) >= 0)
+	    KALDI_ERR << "Neural network already has a component named "
+		      << info.component_name_a;
+	  if (nnet_->GetComponentIndex(info.component_name_b) >= 0)
+	    KALDI_ERR << "Neural network already has a component named "
+		      << info.component_name_b;
+	  info.component_a_index = nnet_->AddComponent(info.component_name_a,
+						       component_a);
+	  info.component_b_index = nnet_->AddComponent(info.component_name_b,
+						       component_b);
+	}
       }
     }
     KALDI_LOG << "Converted " << modified_component_info_.size()
               << " components to FixedAffineComponent.";
   }
 
-  void DecomposeComponent(const std::string &component_name,
+  // This function scans [input_vector] (assumed sorted in descending order)
+  // over the indices [lower] to [upper] inclusive, and finds the first index
+  // at which the running sum of the scanned elements reaches [min_val].
+  // It returns that index plus one, i.e. the reduced dimension when [lower]
+  // is zero.  If the running sum never reaches [min_val] (possible through
+  // floating-point rounding), it returns upper + 1 so the result stays in range.
+
+  int32 GetReducedDimension(const Vector<BaseFloat> &input_vector,
+                            int32 lower,
+                            int32 upper,
+                            BaseFloat min_val) {
+    BaseFloat sum = 0;
+    for (int32 i = lower; i <= upper; i++) {
+      sum += input_vector(i);
+      if (sum >= min_val) return i + 1;
+    }
+    // Threshold never reached: keep every element rather than overshoot.
+    return upper + 1;
+  }
+ 
+// Here we perform SVD-based refactoring of an input affine component.
+// After applying SVD, we sort the singular values in descending order,
+// and take the subset of values which contribute energy_threshold times the
+// total original sum of squared singular values, and then refactor the affine
+// component using only these selected singular values, thus making the bottleneck
+// dim of the refactored affine layer equal to the no. of singular values selected.
+// This function returns false if the shrinkage ratio of the total no. of parameters,
+// after the above SVD-based refactoring, is greater than the shrinkage threshold.
+//
+  bool DecomposeComponent(const std::string &component_name,
                           const AffineComponent &affine,
                           Component **component_a_out,
                           Component **component_b_out) {
     int32 input_dim = affine.InputDim(), output_dim = affine.OutputDim();
     Matrix<BaseFloat> linear_params(affine.LinearParams());
     Vector<BaseFloat> bias_params(affine.BiasParams());
-
-    int32 bottleneck_dim = bottleneck_dim_,
-        middle_dim = std::min<int32>(input_dim, output_dim);
-    KALDI_ASSERT(bottleneck_dim < middle_dim);
+    int32 middle_dim = std::min<int32>(input_dim, output_dim);
 
     // note: 'linear_params' is of dimension output_dim by input_dim.
     Vector<BaseFloat> s(middle_dim);
@@ -718,15 +769,40 @@ class SvdApplier {
     linear_params.Svd(&s, &B, &A);
     // make sure the singular values are sorted from greatest to least value.
     SortSvd(&s, &B, &A);
-    BaseFloat s_sum_orig = s.Sum();
-    s.Resize(bottleneck_dim, kCopyData);
-    A.Resize(bottleneck_dim, input_dim, kCopyData);
-    B.Resize(output_dim, bottleneck_dim, kCopyData);
-    BaseFloat s_sum_reduced = s.Sum();
+    Vector<BaseFloat> s2(s.Dim());  // squared singular values ("energy" terms)
+    s2.AddVec2(1.0, s);
+    BaseFloat s2_sum_orig = s2.Sum();
+    KALDI_ASSERT(energy_threshold_ < 1);
+    KALDI_ASSERT(shrinkage_threshold_ <= 1);  // 1.0 is the apply-svd default
+    if (energy_threshold_ > 0) {
+      // energy-threshold overrides bottleneck-dim: choose the dim per component
+      BaseFloat min_singular_sum = energy_threshold_ * s2_sum_orig;
+      bottleneck_dim_ = GetReducedDimension(s2, 0, s2.Dim()-1, min_singular_sum);
+    }
+    SubVector<BaseFloat> this_part(s2, 0, bottleneck_dim_);
+    BaseFloat s2_sum_reduced = this_part.Sum();
+    BaseFloat shrinkage_ratio =  // weights after factoring / weights before
+      static_cast<BaseFloat>(bottleneck_dim_ * (input_dim+output_dim))
+      / static_cast<BaseFloat>(input_dim * output_dim);
+    if (shrinkage_ratio > shrinkage_threshold_) {
+      KALDI_LOG << "Shrinkage ratio " << shrinkage_ratio
+		<< " greater than threshold : " << shrinkage_threshold_
+		<< " Skipping SVD for this layer.";
+      return false;
+
+    s.Resize(bottleneck_dim_, kCopyData);
+    A.Resize(bottleneck_dim_, input_dim, kCopyData);
+    B.Resize(output_dim, bottleneck_dim_, kCopyData);
+    KALDI_LOG << "For component " << component_name
+              << " singular value squared sum changed by "
+              << (s2_sum_orig - s2_sum_reduced)
+              << " (from " << s2_sum_orig << " to " << s2_sum_reduced << ")";
     KALDI_LOG << "For component " << component_name
-              << " singular value sum changed by "
-              << (s_sum_orig - s_sum_reduced)
-              << " (from " << s_sum_orig << " to " << s_sum_reduced << ")";
+	      << " dimension reduced from "
+              << " (" << input_dim << "," << output_dim << ")"
+	      << " to [(" << input_dim << "," << bottleneck_dim_
+	      << "), (" << bottleneck_dim_ << "," << output_dim <<")]";
+    KALDI_LOG << "shrinkage ratio : " << shrinkage_ratio;
 
     // we'll divide the singular values equally between the two
     // parameter matrices.
@@ -745,23 +821,22 @@ class SvdApplier {
     component_b->SetUpdatableConfigs(affine);
     *component_a_out = component_a;
     *component_b_out = component_b;
+    return true;
   }
 
   // This function modifies the topology of the neural network, splitting
   // up the components we're modifying into two parts.
   // Suppose we have something like:
   //  component-node name=some_node component=some_component input=
+  // nodes_to_modify will be a list of component-node indexes that we
+  // need to split into two.  These will be nodes like
+  // component-node name=component_node_name component=component_name input=xxx
+  // where 'component_name' is one of the components that we're splitting.
+  // node_names_modified is nnet_->node_names_ except with, for the nodes that
+  // we are splitting in two, "some_node_name" replaced with
+  // "some_node_name_b" (the second of the two split nodes).
   void ModifyTopology() {
-    // nodes_to_split will be a list of component-node indexes that we
-    // need to split into two.  These will be nodes like
-    // component-node name=component_node_name component=component_name input=xxx
-    // where 'component_name' is one of the components that we're splitting.
     std::set<int32> nodes_to_modify;
-
-
-    // node_names_modified is nnet_->node_names_ except with, for the nodes that
-    // we are splitting in two, "some_node_name" replaced with
-    // "some_node_name_b" (the second of the two split nodes).
     std::vector<std::string> node_names_orig = nnet_->GetNodeNames(),
         node_names_modified = node_names_orig;
 
@@ -881,6 +956,8 @@ class SvdApplier {
 
   Nnet *nnet_;
   int32 bottleneck_dim_;
+  BaseFloat energy_threshold_;
+  BaseFloat shrinkage_threshold_;
   std::string component_name_pattern_;
 };
 
@@ -1313,13 +1390,21 @@ void ReadEditConfig(std::istream &edit_config_is, Nnet *nnet) {
     } else if (directive == "apply-svd") {
       std::string name_pattern;
       int32 bottleneck_dim = -1;
-      if (!config_line.GetValue("name", &name_pattern) ||
-          !config_line.GetValue("bottleneck-dim", &bottleneck_dim))
-        KALDI_ERR << "Edit directive apply-svd requires 'name' and "
-            "'bottleneck-dim' to be specified.";
-      if (bottleneck_dim <= 0)
-        KALDI_ERR << "Bottleneck-dim must be positive in apply-svd command.";
-      SvdApplier applier(name_pattern, bottleneck_dim, nnet);
+      BaseFloat energy_threshold = -1;
+      BaseFloat shrinkage_threshold = 1.0;
+      config_line.GetValue("bottleneck-dim", &bottleneck_dim);
+      config_line.GetValue("energy-threshold", &energy_threshold);
+      config_line.GetValue("shrinkage-threshold", &shrinkage_threshold);
+      if (!config_line.GetValue("name", &name_pattern))
+        KALDI_ERR << "Edit directive apply-svd requires 'name' to be specified.";
+      if (bottleneck_dim <= 0 && energy_threshold <=0)
+        KALDI_ERR << "Either Bottleneck-dim or energy-threshold "
+	  "must be set in apply-svd command. "
+	  "Range of possible values is (0 1]";
+      SvdApplier applier(name_pattern, bottleneck_dim,
+			 energy_threshold,
+			 shrinkage_threshold,
+			 nnet);
       applier.ApplySvd();
     } else if (directive == "reduce-rank") {
       std::string name_pattern;
diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h
index 60a18f15d84..08307fc766d 100644
--- a/src/nnet3/nnet-utils.h
+++ b/src/nnet3/nnet-utils.h
@@ -300,13 +300,18 @@ void CollapseModel(const CollapseModelConfig &config,
        DropoutMaskComponent or GeneralDropoutComponent whose
        names match the given <name-pattern> (e.g. lstm*).  <name-pattern> defaults to "*".
 
-    apply-svd name=<name-pattern> bottleneck-dim=<dim>
+    apply-svd name=<name-pattern> bottleneck-dim=<dim> energy-threshold=<threshold> shrinkage-threshold=<s>
        Locates all components with names matching <name-pattern>, which are
        type AffineComponent or child classes thereof.  If <dim> is
        less than the minimum of the (input or output) dimension of the component,
-       it does SVD on the components' parameters, retaining only the alrgest
+       it does SVD on the components' parameters, retaining only the largest
        <dim> singular values, replacing these components with sequences of two
        components, of types LinearComponent and NaturalGradientAffineComponent.
+       Alternatively, energy-threshold can be given as the criterion for selecting singular
+       values: the largest singular values are retained until their energy (sum of squares)
+       reaches energy-threshold times the total energy of the original singular values.
+       A component is left unmodified if the shrinkage ratio of its total no. of parameters,
+       after the SVD based refactoring, is greater than shrinkage-threshold.
        See also 'reduce-rank'.
 
     reduce-rank name=<name-pattern> rank=<dim>

From 33a16d8302f0562a5390acbf5715e037eefa35a5 Mon Sep 17 00:00:00 2001
From: Justin Luitjens <luitjens@users.noreply.github.com>
Date: Thu, 23 May 2019 12:51:27 -0600
Subject: [PATCH 125/163] [src] Add interfaces to nnet-batch-compute that
 expects device input. (#3311)

This avoids a ping pong of memory to host.

Implementation now assumes device memory.  interfaces will allocate
device memory and copy to it if data starts on host.

Add a cuda matrix copy function which clamps rows.  This is much
faster than copying one row at a time and the kernel can handle the
clamping for free.
---
 src/cudamatrix/cu-kernels-ansi.h | 14 ++++++
 src/cudamatrix/cu-kernels.cu     | 55 ++++++++++++++++++++++
 src/cudamatrix/cu-kernels.h      | 18 +++++++
 src/cudamatrix/cu-matrix.cc      | 27 +++++++++++
 src/cudamatrix/cu-matrix.h       |  8 ++++
 src/nnet3/nnet-batch-compute.cc  | 81 +++++++++++++++++++-------------
 src/nnet3/nnet-batch-compute.h   | 11 ++++-
 7 files changed, 179 insertions(+), 35 deletions(-)

diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h
index 75ebcf79d74..40eec5ac573 100644
--- a/src/cudamatrix/cu-kernels-ansi.h
+++ b/src/cudamatrix/cu-kernels-ansi.h
@@ -796,6 +796,20 @@ void cuda_uncompress_uint8(dim3 Gr, dim3 Bl, BaseFloat *dest,
                           MatrixDim dim, const uint8_t *src,
                           int src_stride, float scale);
 
+// Copies rows [row_start, row_end) of src into dst, one dst row per index in the
+// range.  A source row index outside [clamp_low, clamp_high] is clamped to that
+// range before reading, so boundary rows are replicated.  src and dst cannot overlap.
+void cudaF_mat_copy_range_clamped(
+   int32_t row_start, int32_t row_end, int32_t num_cols,
+   const float *src, int32_t lds, 
+   int32_t clamp_low, int32_t clamp_high,
+   float *dst, int32_t ldd);
+void cudaD_mat_copy_range_clamped(
+   int32_t row_start, int32_t row_end, int32_t num_cols,
+   const double *src, int32_t lds, 
+   int32_t clamp_low, int32_t clamp_high,
+   double *dst, int32_t ldd);
+
 // Launches a kernel that does nothing, explicitly using the legacy default stream;
 // this will synchronize all CUDA streams (except for non-blocking streams) on the
 // device.
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index b89fc54b6ce..d038ca93606 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -3648,6 +3648,33 @@ static void _cuda_uncompress(BaseFloat *dest, MatrixDim dim,
   }
 }
 
+template <typename Real>
+__global__
+void _cuda_mat_copy_range_clamped(
+   int32_t row_start, int32_t row_end, int32_t num_cols,
+   const Real * __restrict__ src, int32_t lds,
+   int32_t clamp_low, int32_t clamp_high,
+   Real * __restrict__ dst, int32_t ldd) {
+  // y indexes destination rows, x indexes columns
+  int32_t rid = blockIdx.y*blockDim.y+threadIdx.y;
+  int32_t cid = blockIdx.x*blockDim.x+threadIdx.x;
+
+  int32_t num_rows = row_end - row_start;
+  // for each row in parallel
+  for (int32_t r = rid; r < num_rows; r += blockDim.y * gridDim.y) {
+    // the source row depends only on r: compute and clamp it once per row
+    int32_t r_in = r + row_start;
+    if (r_in < clamp_low) r_in = clamp_low;
+    if (r_in > clamp_high) r_in = clamp_high;
+
+    // for each column in parallel
+    for (int32_t c = cid; c < num_cols; c += blockDim.x * gridDim.x) {
+      // copy data
+      dst[r * ldd + c] = src[r_in * lds + c];
+    }
+  }
+}
+
 __global__
 static void _noop_kernel() {
 }
@@ -5430,3 +5457,31 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest,
 void cuda_legacy_noop() {
   _noop_kernel<<<1, 1, 0, cudaStreamLegacy>>>();
 }
+
+void cudaF_mat_copy_range_clamped(
+   int32_t row_start, int32_t row_end, int32_t num_cols,
+   const float *src, int32_t lds, 
+   int32_t clamp_low, int32_t clamp_high,
+   float *dst, int32_t ldd) {
+  // Float launcher: 32x32 threads per block, grid rounded up to cover num_cols x num_rows.
+  int32_t num_rows =  row_end - row_start;
+  dim3 threads(32,32);
+  dim3 blocks((num_cols+31)/32,(num_rows+31)/32);
+  // No stream argument is passed to the launch, and no error check is done here.
+  _cuda_mat_copy_range_clamped<float><<<blocks,threads>>>(row_start, row_end, num_cols,
+      src, lds, clamp_low, clamp_high, dst, ldd);
+}
+
+void cudaD_mat_copy_range_clamped(
+   int32_t row_start, int32_t row_end, int32_t num_cols,
+   const double *src, int32_t lds, 
+   int32_t clamp_low, int32_t clamp_high,
+   double *dst, int32_t ldd) {
+  // Double launcher: 32x32 threads per block, grid rounded up to cover num_cols x num_rows.
+  int32_t num_rows =  row_end - row_start;
+  dim3 threads(32,32);
+  dim3 blocks((num_cols+31)/32,(num_rows+31)/32);
+  // No stream argument is passed to the launch, and no error check is done here.
+  _cuda_mat_copy_range_clamped<double><<<blocks,threads>>>(row_start, row_end, num_cols,
+      src, lds, clamp_low, clamp_high, dst, ldd);
+}
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index f93c1e2b2e0..fe4d284f870 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -1561,6 +1561,24 @@ inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest,
   cuda_uncompress_uint16(Gr, Bl, dest, dim, src, src_stride, scale);
 }
 
+inline void cuda_mat_copy_range_clamped(
+   int32_t row_start, int32_t row_end, int32_t num_cols,
+   const double *src, int32_t lds, 
+   int32_t clamp_low, int32_t clamp_high,
+   double *dst, int32_t ldd) {
+  cudaD_mat_copy_range_clamped(row_start, row_end, num_cols,
+      src, lds, clamp_low, clamp_high, dst, ldd);
+}
+
+inline void cuda_mat_copy_range_clamped(
+   int32_t row_start, int32_t row_end, int32_t num_cols,
+   const float *src, int32_t lds, 
+   int32_t clamp_low, int32_t clamp_high,
+   float *dst, int32_t ldd) {
+  cudaF_mat_copy_range_clamped(row_start, row_end, num_cols,
+      src, lds, clamp_low, clamp_high, dst, ldd);
+}
+    
 
 } // namespace kaldi
 
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index ae091370edd..ceccea62981 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -414,6 +414,33 @@ template
 CuMatrix<double>::CuMatrix(const MatrixBase<double> &other, MatrixTransposeType trans);
 
 
+template <typename Real>
+void CuMatrixBase<Real>:: CopyRangeFromMatClamped(const CuMatrixBase<Real> & src,
+      int32_t start_range, int32_t end_range,
+      int32_t clamp_low, int32_t clamp_high) {
+  // Row (t - start_range) of *this receives row clamp(t, clamp_low, clamp_high) of src.
+  KALDI_ASSERT(NumCols() == src.NumCols());
+  KALDI_ASSERT(NumRows() == end_range-start_range);
+  if (NumRows() == 0 || NumCols() == 0) return;  // nothing to copy; avoids an empty launch
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    cuda_mat_copy_range_clamped(start_range, end_range, NumCols(),
+      src.Data(), src.Stride(), clamp_low, clamp_high,
+      Data(), Stride());
+  } else
+#endif
+  {
+    for (int32 t = start_range; t < end_range; t++) {
+      int32 t_clamped = t;
+      if (t_clamped < clamp_low) t_clamped = clamp_low;
+      if (t_clamped > clamp_high) t_clamped = clamp_high;
+      CuSubVector<Real> dest_row=this->Row(t - start_range);
+      const CuSubVector<Real> src_row=src.Row(t_clamped);
+      dest_row.CopyFromVec(src_row);
+    }
+  }
+}
+
 template<typename Real>
 template<typename OtherReal>
 void CuMatrixBase<Real>::CopyToMat(MatrixBase<OtherReal> *dst,
diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h
index 85aa4c049e7..fade650bc2d 100644
--- a/src/cudamatrix/cu-matrix.h
+++ b/src/cudamatrix/cu-matrix.h
@@ -237,6 +237,7 @@ class CuMatrixBase {
   void CopyFromMat(const MatrixBase<OtherReal> &src,
                    MatrixTransposeType trans = kNoTrans);
 
+
   void CopyFromGeneralMat(const GeneralMatrix &src,
                           MatrixTransposeType trans = kNoTrans);
 
@@ -248,6 +249,13 @@ class CuMatrixBase {
   template<typename OtherReal>
   void CopyFromTp(const CuTpMatrix<OtherReal> &M,
                   MatrixTransposeType trans = kNoTrans);
+  
+  // This function copies the source rows with indices in [start_range, end_range)
+  // into this matrix.  Row indices outside [clamp_low, clamp_high] are clamped to
+  // that range, so the boundary rows are replicated across the out-of-range areas.
+  void CopyRangeFromMatClamped(const CuMatrixBase<Real> & src,
+      int32_t start_range, int32_t end_range,
+      int32_t clamp_low, int32_t clamp_high);
 
   template<typename OtherReal>
   void CopyFromMat(const CuMatrixBase<OtherReal> &M,
diff --git a/src/nnet3/nnet-batch-compute.cc b/src/nnet3/nnet-batch-compute.cc
index 8713d17c049..a2ecf942bdb 100644
--- a/src/nnet3/nnet-batch-compute.cc
+++ b/src/nnet3/nnet-batch-compute.cc
@@ -393,23 +393,17 @@ void NnetBatchComputer::FormatInputs(
       ivector_dim = tasks[0]->ivector.Dim(),
       num_tasks = tasks.size();
   KALDI_ASSERT(num_tasks > 0 && num_tasks <= minibatch_size);
-
-  // We first aggregate the input frames and i-vectors in matrices on the CPU,
-  // and then transfer them to the GPU.  Later on we'll change this code to
-  // used pinned memory.
-  Matrix<BaseFloat> input_cpu(num_tasks * num_input_frames, input_dim,
-                              kUndefined);
-
+  
+  input->Resize(minibatch_size * num_input_frames, input_dim,
+                kUndefined);
 
   for (int32 n = 0; n < num_tasks; n++) {
-    SubMatrix<BaseFloat> input_part(input_cpu,
+    CuSubMatrix<BaseFloat> input_part(*input,
                                     n * num_input_frames, num_input_frames,
                                     0, input_dim);
     input_part.CopyFromMat(tasks[n]->input);
   }
-  input->Resize(minibatch_size * num_input_frames, input_dim,
-                kUndefined);
-  input->RowRange(0, num_tasks * num_input_frames).CopyFromMat(input_cpu);
+  
   if (num_tasks < minibatch_size) {
     // The following will make things easier to debug if something fails, but
     // shouldn't be strictly necessary.
@@ -419,12 +413,11 @@ void NnetBatchComputer::FormatInputs(
   }
 
   if (ivector_dim != 0) {
-    Matrix<BaseFloat> ivectors_cpu(num_tasks, ivector_dim, kUndefined);
-    for (int32 n = 0; n < num_tasks; n++)
-      ivectors_cpu.Row(n).CopyFromVec(tasks[n]->ivector);
-
+    
     ivector->Resize(minibatch_size, ivector_dim, kUndefined);
-    ivector->RowRange(0, num_tasks).CopyFromMat(ivectors_cpu);
+    for (int32 n = 0; n < num_tasks; n++) {
+      ivector->Row(n).CopyFromVec(tasks[n]->ivector);
+    }
 
     if (num_tasks < minibatch_size) {
       // The following will make things easier to debug if something fails, but
@@ -550,7 +543,6 @@ bool NnetBatchComputer::Compute(bool allow_partial_minibatch) {
   minfo->tot_num_tasks += static_cast<int64>(tasks.size());
   minfo->seconds_taken += tim.Elapsed();
 
-
   SynchronizeGpu();
 
   for (size_t i = 0; i < tasks.size(); i++)
@@ -653,7 +645,7 @@ void GetOutputFrameInfoForTasks(
 
 void AddOnlineIvectorsToTasks(
     const NnetBatchComputerOptions &opts,
-    const Matrix<BaseFloat> &online_ivectors,
+    const CuMatrix<BaseFloat> &online_ivectors,
     int32 online_ivector_period,
     std::vector<NnetInferenceTask> *tasks) {
   int32 f = opts.frame_subsampling_factor,
@@ -704,7 +696,7 @@ void AddOnlineIvectorsToTasks(
 static void SplitInputToTasks(const NnetBatchComputerOptions &opts,
                               int32 nnet_left_context,
                               int32 nnet_right_context,
-                              const Matrix<BaseFloat> &input,
+                              const CuMatrix<BaseFloat> &input,
                               std::vector<NnetInferenceTask> *tasks) {
   int32 num_input_frames = input.NumRows(),
       f = opts.frame_subsampling_factor,
@@ -755,27 +747,50 @@ static void SplitInputToTasks(const NnetBatchComputerOptions &opts,
 
     task.input.Resize(end_input_t_padded - begin_input_t_padded,
                       input.NumCols(), kUndefined);
-    // the 't' value below is in the numbering of 'input'.
-    for (int32 t = begin_input_t_padded; t < end_input_t_padded; t++) {
-      int32 t_clipped = t;
-      if (t_clipped < 0) t_clipped = 0;
-      if (t_clipped >= num_input_frames) t_clipped = num_input_frames - 1;
-      SubVector<BaseFloat> dest(task.input,
-                                t - begin_input_t_padded),
-          src(input, t_clipped);
-      dest.CopyFromVec(src);
-    }
+
+    // Copy from input into task input with clamping
+    task.input.CopyRangeFromMatClamped(input, begin_input_t_padded, 
+        end_input_t_padded, 0, num_input_frames-1);
   }
 }
 
 } // namespace utterance_splitting
 
-
 void NnetBatchComputer::SplitUtteranceIntoTasks(
     bool output_to_cpu,
     const Matrix<BaseFloat> &input,
-    const Vector<BaseFloat> *ivector,
-    const Matrix<BaseFloat> *online_ivectors,
+    const Vector<BaseFloat> *h_ivector,
+    const Matrix<BaseFloat> *h_online_ivectors,
+    int32 online_ivector_period,
+    std::vector<NnetInferenceTask> *tasks) {
+
+  // Inputs are expected to be in device memory. 
+  // create temporary device arrays and copy
+  // inputs into them
+  CuMatrix<BaseFloat> cu_input(input);
+  CuVector<BaseFloat> cu_ivector, *ivector = NULL;
+  CuMatrix<BaseFloat> cu_online_ivectors, *online_ivectors = NULL;
+
+  if (h_ivector!=NULL) {
+    cu_ivector.Resize(h_ivector->Dim(), kUndefined);
+    cu_ivector.CopyFromVec(*h_ivector);
+    ivector = &cu_ivector;
+  }
+  if (h_online_ivectors!=NULL) {
+    cu_online_ivectors.Resize(h_online_ivectors->NumRows(), h_online_ivectors->NumCols(), kUndefined);
+    cu_online_ivectors.CopyFromMat(*h_online_ivectors);
+    online_ivectors = &cu_online_ivectors;
+  }
+
+  SplitUtteranceIntoTasks(output_to_cpu, cu_input, ivector,
+      online_ivectors, online_ivector_period, tasks);
+}
+
+void NnetBatchComputer::SplitUtteranceIntoTasks(
+    bool output_to_cpu,
+    const CuMatrix<BaseFloat> &input,
+    const CuVector<BaseFloat> *ivector,
+    const CuMatrix<BaseFloat> *online_ivectors,
     int32 online_ivector_period,
     std::vector<NnetInferenceTask> *tasks) {
   using namespace utterance_splitting;
@@ -880,7 +895,7 @@ void MergeTaskOutput(
   }
   KALDI_ASSERT(num_output_frames != 0 && output_dim != 0);
   int32 cur_output_frame = 0;
-  output->Resize(num_output_frames, output_dim);
+  output->Resize(num_output_frames, output_dim, kUndefined);
   for (int32 i = 0; i < num_tasks; i++) {
     const NnetInferenceTask &task = tasks[i];
     int32 skip = task.num_initial_unused_output_frames,
diff --git a/src/nnet3/nnet-batch-compute.h b/src/nnet3/nnet-batch-compute.h
index e30d27e5e9a..a421fd4cd93 100644
--- a/src/nnet3/nnet-batch-compute.h
+++ b/src/nnet3/nnet-batch-compute.h
@@ -60,7 +60,7 @@ struct NnetInferenceTask {
   // the lowest t value was originally nonzero in the 'natural' numbering, this
   // just means we conceptually shift the 't' values; the only real constraint
   // is that the 't' values are contiguous.
-  Matrix<BaseFloat> input;
+  CuMatrix<BaseFloat> input;
 
   // The index of the first output frame (in the shifted numbering where the
   // first output frame is numbered zero.  This will typically be less than one,
@@ -113,7 +113,7 @@ struct NnetInferenceTask {
   bool is_irregular;
 
   // The i-vector for this chunk, if this network accepts i-vector inputs.
-  Vector<BaseFloat> ivector;
+  CuVector<BaseFloat> ivector;
 
   // A priority (higher is more urgent); may be either sign.  May be updated
   // after this object is provided to class NnetBatchComputer.
@@ -269,6 +269,13 @@ class NnetBatchComputer {
       const Matrix<BaseFloat> *online_ivectors,
       int32 online_ivector_period,
       std::vector<NnetInferenceTask> *tasks);
+  void SplitUtteranceIntoTasks(
+      bool output_to_cpu,
+      const CuMatrix<BaseFloat> &input,
+      const CuVector<BaseFloat> *ivector,
+      const CuMatrix<BaseFloat> *online_ivectors,
+      int32 online_ivector_period,
+      std::vector<NnetInferenceTask> *tasks);
 
   const NnetBatchComputerOptions &GetOptions() { return opts_; }
 

From 1e8260bd7fe5bea0cace46eeb1b308b41ee36258 Mon Sep 17 00:00:00 2001
From: Ewald Enzinger <entn-at@users.noreply.github.com>
Date: Mon, 27 May 2019 17:45:28 +0200
Subject: [PATCH 126/163] [build] Update GCC support check for CUDA toolkit
 10.1 (#3345)

---
 src/configure | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/configure b/src/configure
index 0c8cc710a94..a687bc81997 100755
--- a/src/configure
+++ b/src/configure
@@ -442,10 +442,14 @@ function configure_cuda {
           MIN_UNSUPPORTED_GCC_VER="7.0"
           MIN_UNSUPPORTED_GCC_VER_NUM=70000;
         ;;
-        9_2 | 9_* | 10_*)
+        9_2 | 9_* | 10_0)
           MIN_UNSUPPORTED_GCC_VER="8.0"
           MIN_UNSUPPORTED_GCC_VER_NUM=80000;
         ;;
+        10_1 | 10_*)
+          MIN_UNSUPPORTED_GCC_VER="9.0"
+          MIN_UNSUPPORTED_GCC_VER_NUM=90000;
+        ;;
         *)
           echo "Unsupported CUDA_VERSION (CUDA_VERSION=$CUDA_VERSION), please report it to Kaldi mailing list, together with 'nvcc -h' or 'ptxas -h' which lists allowed -gencode values..."; exit 1;
         ;;

From 10bb5dee3e6d63657284005b0dd858eb5806285f Mon Sep 17 00:00:00 2001
From: Xingyu Na <asr.naxingyu@gmail.com>
Date: Mon, 27 May 2019 23:55:42 +0800
Subject: [PATCH 127/163] [egs] Fix to aishell1 v1 download script (#3344)

---
 egs/aishell/v1/local/download_and_untar.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/egs/aishell/v1/local/download_and_untar.sh b/egs/aishell/v1/local/download_and_untar.sh
index 0189bad1d4a..3578a1c0835 100755
--- a/egs/aishell/v1/local/download_and_untar.sh
+++ b/egs/aishell/v1/local/download_and_untar.sh
@@ -15,7 +15,7 @@ if [ $# -ne 3 ]; then
   echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
   echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
   echo "With --remove-archive it will remove the archive after successfully un-tarring it."
-  echo "<corpus-part> can be one of: data_aishell, resource."
+  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
 fi
 
 data=$1
@@ -28,7 +28,7 @@ if [ ! -d "$data" ]; then
 fi
 
 part_ok=false
-list="data_aishell resource"
+list="data_aishell resource_aishell"
 for x in $list; do
   if [ "$part" == $x ]; then part_ok=true; fi
 done

From d8d3b86448b8307d20ce0990a3bf918d64057436 Mon Sep 17 00:00:00 2001
From: Vimal Manohar <vimal.manohar91@gmail.com>
Date: Mon, 27 May 2019 22:12:50 -0400
Subject: [PATCH 128/163] [scripts] Support utf-8 files in some scripts (#3346)

---
 egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py | 5 +++++
 egs/wsj/s5/steps/cleanup/make_biased_lms.py             | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py b/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py
index 68055729fd9..e5f4a8d1996 100755
--- a/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py
+++ b/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py
@@ -10,6 +10,11 @@
 import math
 from collections import defaultdict
 
+import io
+sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding="utf8")
+sys.stderr = io.TextIOWrapper(sys.stderr.buffer,encoding="utf8")
+sys.stdin = io.TextIOWrapper(sys.stdin.buffer,encoding="utf8")
+
 parser = argparse.ArgumentParser(description="""
 This script creates a biased language model suitable for alignment and
 data-cleanup purposes.   It reads (possibly multiple) lines of integerized text
diff --git a/egs/wsj/s5/steps/cleanup/make_biased_lms.py b/egs/wsj/s5/steps/cleanup/make_biased_lms.py
index 4b1fd320221..7c6fce990d4 100755
--- a/egs/wsj/s5/steps/cleanup/make_biased_lms.py
+++ b/egs/wsj/s5/steps/cleanup/make_biased_lms.py
@@ -7,6 +7,11 @@
 import subprocess
 from collections import defaultdict
 
+import io
+sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding="utf8")
+sys.stderr = io.TextIOWrapper(sys.stderr.buffer,encoding="utf8")
+sys.stdin = io.TextIOWrapper(sys.stdin.buffer,encoding="utf8")
+
 parser = argparse.ArgumentParser(description="""
 This script is a wrapper for make_one_biased_lm.py that reads a Kaldi archive
 of (integerized) text data from the standard input and writes a Kaldi archive of

From 75a69d98bb01ab1f8c89794fd051addf3906fe46 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=BB=84=E5=9F=B9=E6=9D=BE?= <bringtree@qq.com>
Date: Wed, 29 May 2019 00:30:02 +0800
Subject: [PATCH 129/163] [scripts]: add warning to nnet3/chain/train.py about
 ineffective options (#3341)

---
 egs/wsj/s5/steps/nnet3/chain/train.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py
index 67cb9f90620..91b7df4e8df 100755
--- a/egs/wsj/s5/steps/nnet3/chain/train.py
+++ b/egs/wsj/s5/steps/nnet3/chain/train.py
@@ -358,6 +358,13 @@ def train(args, run_opts):
                                right_context_final >= 0 else -1)
 
     default_egs_dir = '{0}/egs'.format(args.dir)
+
+    if (args.egs_dir is not None) and (args.cmvn_opts != "--norm-means=false --norm-vars=false"):
+        logger.warning("the --feat.cmvn-opts option has no effect because we are not dumping egs")
+
+    if (args.egs_dir is not None) and (args.frames_per_iter != 800000):
+        logger.warning("the --trainer.frames-per-iter option has no effect because we are not dumping egs")
+
     if ((args.stage <= -3) and args.egs_dir is None):
         logger.info("Generating egs")
         if (not os.path.exists("{0}/den.fst".format(args.dir)) or

From 448c876295c5d88ba9820afe3ed6bc6c0349aa80 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Mon, 3 Jun 2019 16:39:37 -0400
Subject: [PATCH 130/163] [src] Misc tensor progress

---
 src/tensor/linear-ops.cc   | 15 +++++++++-
 src/tensor/op.h            | 58 ++++++++++++++++++++++++++++++++++++++
 src/tensor/tensor-common.h |  1 +
 src/tensor/tensor.h        |  5 +++-
 4 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/src/tensor/linear-ops.cc b/src/tensor/linear-ops.cc
index ae4defeae44..d5e23bc9fa9 100644
--- a/src/tensor/linear-ops.cc
+++ b/src/tensor/linear-ops.cc
@@ -23,11 +23,12 @@ namespace kaldi {
 namespace tensor {
 
 void AddOp::Expand(std::vector<std::unique_ptr<Op> > *ops) const {
-
   Pattern a_pattern = a_.Impl().pattern,
       b_pattern = b_.Impl().pattern;
   NormalizePatterns({a_pattern, b_pattern});
 
+  KALDI_ASSERT(Compatible(a_, b_));  // dtype and device, check they match.
+
   Tensor a(a_), b(b_);
 
   if (a_pattern != a_.Impl().pattern)
@@ -46,10 +47,22 @@ void AddOp::Expand(std::vector<std::unique_ptr<Op> > *ops) const {
   int64 combined_code = CombineCodes(a_pattern.GetCode(),
                                      b_pattern.GetCode());
 
+  Op *new_op;
+
+  /*
+    The case-statement values in the switch statement below may be interpreted
+    in groups of 3 hex characters, are 0xAAABBB, pertaining to Tensors a and b.
+    See GetPatternCode() in pattern-utils.h for documentation on the meanings of
+    the values and our notation with X,x,1.
+   */
+
   // We are doing a += b.
   switch(combined_code) {
     // A scalar plus a scalar
     case 0x000000000:
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(), ScalarPlusEqScalar, a, b);
+      break;
+
 
 
 
diff --git a/src/tensor/op.h b/src/tensor/op.h
index 6fbc767190b..d722d716ba4 100644
--- a/src/tensor/op.h
+++ b/src/tensor/op.h
@@ -163,6 +163,64 @@ class Op {
 
 };
 
+
+// Helper for the macros below: sets pointer_name to a new OpName<T, device> chosen by device_type.
+#define SET_TO_TEMPLATED_OP_DEVICE(pointer_name, device_type, OpName, T, ...) \
+  do {                                                                    \
+    switch (device_type) {                                                \
+      case kCpuDevice:                                                    \
+        pointer_name = new OpName<T, kCpuDevice>(__VA_ARGS__); break;     \
+      case kGpuDevice:                                                    \
+        pointer_name = new OpName<T, kGpuDevice>(__VA_ARGS__); break;     \
+      default:                                                            \
+        KALDI_ERR << "Invalid device type " << int32(device_type);        \
+    } } while (0)
+// The do { } while (0) wrapper makes 'MACRO(...);' a single statement (safe in if/else).
+
+
+// The following macro is to be used to dispatch device- and dtype-specific
+// implementations.  The idea is that you have defined a template like
+// template <class Dtype, class DeviceType> class OpName
+// and have specialized that template for the various combinations.
+// This executes commands like:
+//    pointer_name = new OpName<float, kCpuDevice>(a, b, c);
+// See also SET_TO_TEMPLATED_OP_REAL for ops where integers are not
+// supported.
+#define SET_TO_TEMPLATED_OP_ALL(pointer_name, dtype, device_type, OpName, ...) \
+  do { switch (dtype) {                             \
+    case kFloatDtype:                               \
+      SET_TO_TEMPLATED_OP_DEVICE(pointer_name, device_type, OpName, float, __VA_ARGS__); \
+      break;                                        \
+    case kDoubleDtype:                              \
+      SET_TO_TEMPLATED_OP_DEVICE(pointer_name, device_type, OpName, double, __VA_ARGS__); \
+      break;                                        \
+    case kInt32Dtype:                               \
+      SET_TO_TEMPLATED_OP_DEVICE(pointer_name, device_type, OpName, int32, __VA_ARGS__); \
+      break;                                        \
+    default:                                        \
+      KALDI_ERR << "Invalid dtype (this op allows float, double or int32): " \
+                << int32(dtype);                    \
+  } } while (0)
+// The do { } while (0) wrapper makes 'MACRO(...);' a single statement (safe in if/else).
+
+#define SET_TO_TEMPLATED_OP_REAL(pointer_name, dtype, device_type, OpName, ...) \
+  do { switch (dtype) {                             \
+    case kFloatDtype:                               \
+      SET_TO_TEMPLATED_OP_DEVICE(pointer_name, device_type, OpName, float, __VA_ARGS__); \
+      break;                                        \
+    case kDoubleDtype:                              \
+      SET_TO_TEMPLATED_OP_DEVICE(pointer_name, device_type, OpName, double, __VA_ARGS__); \
+      break;                                        \
+    default:                                        \
+      KALDI_ERR << "Invalid dtype (this op only allows float or double): " \
+                << int32(dtype);                    \
+  } } while (0)
+// Float/double-only dispatch; the do { } while (0) wrapper makes 'MACRO(...);' a single statement.
+
+
+
+
+
 // See linear-ops.h and nonlinear-ops.h for concrete examples of Ops.
 
 }  // namespace tensor
diff --git a/src/tensor/tensor-common.h b/src/tensor/tensor-common.h
index f4947ae92d6..0311fc9fbfe 100644
--- a/src/tensor/tensor-common.h
+++ b/src/tensor/tensor-common.h
@@ -67,6 +67,7 @@ enum DataType {
   // via SetDefaultDtype.
   kFloatDtype = 1,
   kDoubleDtype = 2,
+  kInt32Dtype = 3,
 };
 
 
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index d3641c07f61..a99728ac624 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -217,7 +217,10 @@ class Tensor {
   bool HasCStrides() const;
 
   // Return the data type.
-  DataType Dtype() const { return dtype_; }
+  DataType Dtype() const { return impl_->dtype; }
+
+  // Return the device type.
+  DeviceType DeviceType() const { return impl_->device.device_type; }
 
   /**
      Indexing operator taking one arg.  Returns a Tensor referencing

From 5937fae02528a87b298605bb1965f2b626e1c0df Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 5 Jun 2019 17:36:33 -0400
Subject: [PATCH 131/163] [src] small change

---
 src/tensor/linear-ops.cc |  8 +++++++-
 src/tensor/linear-ops.h  | 14 +++++++-------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/tensor/linear-ops.cc b/src/tensor/linear-ops.cc
index d5e23bc9fa9..47c2ed520cd 100644
--- a/src/tensor/linear-ops.cc
+++ b/src/tensor/linear-ops.cc
@@ -22,7 +22,7 @@
 namespace kaldi {
 namespace tensor {
 
-void AddOp::Expand(std::vector<std::unique_ptr<Op> > *ops) const {
+void PlusEqOp::Expand(std::vector<std::unique_ptr<Op> > *ops) const {
   Pattern a_pattern = a_.Impl().pattern,
       b_pattern = b_.Impl().pattern;
   NormalizePatterns({a_pattern, b_pattern});
@@ -63,6 +63,12 @@ void AddOp::Expand(std::vector<std::unique_ptr<Op> > *ops) const {
       SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(), ScalarPlusEqScalar, a, b);
       break;
 
+    default:
+      // Later we can add a more generic implementation that handles arbitrary
+      // patterns.
+      KALDI_ERR << "Unhandled code: " << std::hex << combined_code;
+  }
+
 
 
 
diff --git a/src/tensor/linear-ops.h b/src/tensor/linear-ops.h
index 3eade973772..ef8c5c19fe8 100644
--- a/src/tensor/linear-ops.h
+++ b/src/tensor/linear-ops.h
@@ -35,22 +35,22 @@ namespace tensor {
 
    May not be used if a and b overlap.
 */
-class AddOp {
+class PlusEqOp: public Op {
  public:
 
-  AddOp(const Tensor &a, Tensor &b):
+  PlusEqOp(const Tensor &a, Tensor &b):
       a_(a), b_(b) {
     KALDI_ASSERT(!Overlap(a, b) &&
                  BroadcastableAndCompatible(a, b));
   }
-  AddOp(const AddOp &other):
+  PlusEqOp(const PlusEqOp &other):
       a_(other.a_), b_(other.b_) { }
 
 
   int32 Properties() { return 0 ; }  // Not concrete.
 
   Op *Copy() const override {
-    return new AddOp(*this);
+    return new PlusEqOp(*this);
   }
 
   // Defined in linear-ops.cc; this function works out the more concrete
@@ -66,7 +66,7 @@ class AddOp {
       return;
     // else return the Op corresponding to:
     // b_deriv_ += a_deriv_.
-    ops->push_back(std::unique_ptr<Op>(new AddOp(AsTensor(b_deriv),
+    ops->push_back(std::unique_ptr<Op>(new PlusEqOp(AsTensor(b_deriv),
                                                  map->Deriv(a_))));
 
   }
@@ -78,7 +78,7 @@ class AddOp {
       return;
     // else return the Op corresponding to:
     // a_deriv_ += b_deriv_.
-    ops->push_back(std::unique_ptr<Op>(new AddOp(AsTensor(a_deriv),
+    ops->push_back(std::unique_ptr<Op>(new PlusEqOp(AsTensor(a_deriv),
                                                  map->Deriv(b_))));
   }
 
@@ -136,7 +136,7 @@ class AssignOp {
       return;
     // Return the Op corresponding to:
     // a_deriv_ += b_deriv_.
-    ops->push_back(std::unique_ptr<Op>(new AddOp(map->Deriv(b_),
+    ops->push_back(std::unique_ptr<Op>(new PlusEqOp(map->Deriv(b_),
                                                  AsTensor(a_deriv))));
   }
 

From 602ae12c697571dcc2fe081ffff43de314f8550d Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Mon, 10 Jun 2019 12:46:13 -0400
Subject: [PATCH 132/163] [src] tensor progress

---
 src/tensor/context.h            | 138 ++------
 src/tensor/linear-cpu-ops.h     | 260 ++++++++++++++
 src/tensor/linear-ops.cc        |  75 +++-
 src/tensor/linear-ops.h         |   7 +-
 src/tensor/linear-ref-ops.h     |  87 +++++
 src/tensor/linear-special-ops.h | 583 ++------------------------------
 src/tensor/op.h                 |  74 +++-
 src/tensor/pattern-utils.cc     |   2 +-
 src/tensor/storage.h            |  11 +-
 src/tensor/tensor-common.h      |   5 +-
 src/tensor/tensor-settings.h    | 128 ++-----
 src/tensor/tensor-utils.cc      |  71 ++++
 src/tensor/tensor-utils.h       | 116 +++++++
 src/tensor/tensor.h             |  27 +-
 14 files changed, 754 insertions(+), 830 deletions(-)
 create mode 100644 src/tensor/linear-cpu-ops.h
 create mode 100644 src/tensor/linear-ref-ops.h
 create mode 100644 src/tensor/tensor-utils.cc

diff --git a/src/tensor/context.h b/src/tensor/context.h
index 928254586b6..5b7306c6562 100644
--- a/src/tensor/context.h
+++ b/src/tensor/context.h
@@ -38,13 +38,12 @@ namespace tensor {
 
 
 // class Context contains various configurations that we will sometimes need
-// when we do operations on Tensors.  Things like the default data type, the
-// debug mode, and so on.  This will be passed around
-class Context {
+// when we do operations on Tensors.
+struct Context {
   // The default DataType for newly created Tensors
-  DataType default_dtype_;
+  DataType default_dtype;
   // The default Device for newly created Tensors
-  Device default_device_;
+  Device default_device;
 };
 
 
@@ -213,63 +212,11 @@ class ForwardPropExecutionContext: public ExecutionContext {
   // May be used to query the derivative of some Tensor w.r.t. the
   // input, e.g. forward_context.GetDerivMap()->DerivIfPresent(some_tensor).
   DerivMap *GetDerivMap() { return deriv_map_.get(); }
-
-
 };
 
 
 
 
-
-// Mechanism to set the default device within a scope by constructing a variable
-// that exists only within that scope.
-class WithDeviceAs {
- public:
-  // Example:
-  // {
-  //   WithDeviceAs _(kCudaDevice);
-  //   // code in this block uses this default.  the variable
-  //   // name is _ because we don't need to access it.
-  // }
-  inline WithDeviceAs(DeviceType device_type):
-      prev_default_(GetDefaultDevice()) {
-    SetDefaultDevice(Device(device_type));
-  }
-  inline WithDeviceAs(Device device):
-      prev_default_(GetDefaultDevice()) {
-    SetDefaultDevice(device);
-  }
-  ~WithDeviceAs() { SetDefaultDevice(prev_default_); }
-
- private:
-  Device prev_default_;
-};
-
-
-
-DataType GetDefaultDtype();
-void SetDefaultDtype(DataType dtype);
-
-class WithDtypeAs {
- public:
-  // Example:
-  // {
-  //   WithDtypeAs _(kDoubleDtype);
-  //   // code in this block uses this default.  the variable
-  //   // name is _ because we don't need to access it.
-  // }
-  inline WithDtypeAs(DataType dtype):
-      prev_default_(GetDefaultDtype()) {
-    SetDefaultDtype(dtype);
-  }
-  ~WithDtypeAs() { SetDefaultDtype(prev_default_); }
-
- private:
-  DataType prev_default_;
-};
-
-
-
 // struct TensorOptions is used as an arg for some constructors
 // when creating Tensors and Variables; it allows flexibility
 // in specifying the device and/or dtype.  See the examples
@@ -278,19 +225,28 @@ struct TensorOptions {
   DataType dtype;
   Device device;
 
-  TensorOptions(): dtype(GetDefaultDtype()),
-                   device(GetDefaultDevice()) { }
-  TensorOptions(DataType dtype):
-      dtype(dtype), device(GetDefaultDevice()) { }
-  TensorOptions(Device device):
-      dtype(GetDefaultDtype()), device(device) { }
-  TensorOptions(DeviceType device_type):
-      dtype(GetDefaultDtype()), device(device_type) { }
-  TensorOptions(DataType dtype, Device device):
+  explicit TensorOptions(const Context &context):
+      dtype(context.default_dtype),
+      device(context.default_device) { }
+  explicit TensorOptions(const Context &context,
+                         DataType dtype):
+      dtype(dtype), device(context.default_device) { }
+  explicit TensorOptions(const Context &context, Device device):
+      dtype(context.default_dtype), device(device) { }
+  explicit TensorOptions(const Context &context, DeviceType device_type):
+      dtype(context.default_dtype), device(device_type) { }
+  // Here the context is not used; we could create a new version
+  // that doesn't take the context object, but of course that would
+  // make it harder if we add more options later.
+  TensorOptions(const Context &context, DataType dtype,
+                Device device):
       dtype(dtype), device(device) { }
+  TensorOptions(const Context &context, DataType dtype,
+                DeviceType device_type):  // DeviceType overload; context is not used here.
+      dtype(dtype), device(device_type) { }
   TensorOptions(DataType dtype, Device device_type):
       dtype(dtype), device(device_type) { }
-  TensorOptions(const TensorOptions &other):
+  explicit TensorOptions(const TensorOptions &other):
       dtype(other.dtype), device(other.device) { }
 };
 
@@ -315,6 +271,7 @@ inline void SetDebugMode(bool b) {
     debug_start_tick = NextTick();
   debug_mode = b;
 }
+
 /**
    Returns the tick at which debug mode most recently changed from false to
    true.
@@ -324,54 +281,7 @@ inline int64 DebugTick() {
   return debug_start_tick;
 }
 
-class WithDebugModeAs {
- public:
-  // Example:
-  // {
-  //   WithDebugModeAs _(true);
-  //   // code in this block uses debug mode.
-  //   // variable name is _ because we won't use it.
-  // }
-  inline WithDebugModeAs(bool b):
-      prev_default_(DebugMode()) {
-    SetDebugMode(b);
-  }
-  ~WithDebugModeAs() { SetDebugMode(prev_default_); }
 
- private:
-  bool prev_default_;
-};
-
-
-
-// allow_grad means that gradient tracking is allowed; allow_grad = true
-// is the normal case, and means that if gradient tracking is required
-// (e.g. if the user created a Variable with requires_grad = true, and we do
-// operations that depend on it), then we'll track gradients.
-// It is our way to implement an equivalent of PyTorch's `with torch.no_grad()`.
-// Do not access this variable directly; use AllowGrad() and
-extern thread_local bool allow_grad;
-inline bool AllowGrad() { return allow_grad; }
-inline void SetAllowGrad(bool b) { allow_grad = b; }
-
-
-class WithNoGrad {
- public:
-  // Example:
-  // {
-  //   WithNoGrad _;
-  //   // code in this block has gradient tracking disabled.
-  //   // variable name is _ because we won't use it.
-  //
-  // }
-  inline WithNoGrad():
-      prev_default_(AllowGrad()) {
-    SetAllowGrad(false);
-  }
-  ~WithNoGrad() { SetAllowGrad(prev_default_); }
- private:
-  bool prev_default_;
-};
 
 
 }  // namespace tensor
diff --git a/src/tensor/linear-cpu-ops.h b/src/tensor/linear-cpu-ops.h
new file mode 100644
index 00000000000..372bee0c5a4
--- /dev/null
+++ b/src/tensor/linear-cpu-ops.h
@@ -0,0 +1,260 @@
+// tensor/linear-cpu-ops.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_LINEAR_CPU_OPS_H_  // guard name is specific to this header
+#define KALDI_TENSOR_LINEAR_CPU_OPS_H_ 1
+
+#include "tensor/tensor.h"
+#include "tensor/linear-special-ops.h"
+#include "matrix/kaldi-blas.h"
+
+
+// These Ops are more specialized forms of the Ops declared in linear-ops.h;
+// they correspond to more specific combinations of Tensor shapes.  These Ops
+// are only intended to be created from inside other, more generic Ops.
+namespace kaldi {
+namespace tensor {
+
+/**
+   Does a += b for a and b both scalar, on CPU.  This is a concrete Op.
+ */
+template <class T>
+class ScalarPlusEqScalarOp<T, kCpuDevice>: public Op {
+ public:
+  ScalarPlusEqScalarOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
+  int32 Properties() { return kConcreteOp; }
+  Op *Copy() const override {
+    return new ScalarPlusEqScalarOp<T, kCpuDevice>(a_, b_);
+  }
+
+  void Do() {
+    DebugNormalOp(a_, kReadWrite, b_, kRead);
+    *a_.GetData<T>() += *b_.GetData<T>();  // one element each: a[0] += b[0].
+  }
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+
+/**
+   Does a += b for a and b both possibly-strided vectors (Stvector), on CPU.
+
+   They must be normalized form, i.e. all axes trivial except raxis 0,
+   and they must have the same dimension.
+
+   This generic form of the template works for integer types (and would work,
+   if used, for float and double).  We separately specialize this
+   template for float and double, to use BLAS calls.
+*/
+template <class T>
+class StvectorPlusEqStvectorOp<T, kCpuDevice>: public Op {
+ public:
+  StvectorPlusEqStvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
+  int32 Properties() { return kConcreteOp; }
+
+  Op *Copy() const override {
+    return new StvectorPlusEqStvectorOp<T, kCpuDevice>(a_, b_);
+  }
+
+  void Do() {
+    DebugNormalOp(a_, kReadWrite, b_, kRead);
+    const Pattern &a_pattern = a_.Pattern(),
+        &b_pattern = b_.Pattern();
+    int32 dim = a_pattern.dims[0],
+        a_stride = a_pattern.strides[0],
+        b_stride = b_pattern.strides[0];
+    T *a_data = a_.GetData<T>();
+    const T *b_data = b_.GetData<T>();  // the source is b, which is only read.
+    // In future could look into unrolling this loop if it becomes a bottleneck.
+    for (int32 i = 0; i < dim; i++)
+      a_data[i * a_stride] += b_data[i * b_stride];
+  }
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+
+// Specialization for float: uses BLAS saxpy, y := 1.0 * x + y with x = b, y = a.
+template <>
+class StvectorPlusEqStvectorOp<float, kCpuDevice>: public Op {
+ public:
+  StvectorPlusEqStvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
+  int32 Properties() { return kConcreteOp; }
+  Op *Copy() const override {
+    return new StvectorPlusEqStvectorOp<float, kCpuDevice>(a_, b_);
+  }
+  void Do() {
+    DebugNormalOp(a_, kReadWrite, b_, kRead);
+    cblas_saxpy(a_.Pattern().dims[0], 1.0f,
+                b_.GetData<float>(), b_.Pattern().strides[0],   // x, incx
+                a_.GetData<float>(), a_.Pattern().strides[0]);  // y, incy
+  }
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+// Specialization for double: uses BLAS daxpy, y := 1.0 * x + y with x = b, y = a.
+template <>
+class StvectorPlusEqStvectorOp<double, kCpuDevice>: public Op {
+ public:
+  StvectorPlusEqStvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
+  int32 Properties() { return kConcreteOp; }
+  Op *Copy() const override {
+    return new StvectorPlusEqStvectorOp<double, kCpuDevice>(a_, b_);
+  }
+  void Do() {
+    DebugNormalOp(a_, kReadWrite, b_, kRead);
+    cblas_daxpy(a_.Pattern().dims[0], 1.0,
+                b_.GetData<double>(), b_.Pattern().strides[0],   // x, incx
+                a_.GetData<double>(), a_.Pattern().strides[0]);  // y, incy
+  }
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+
+/**
+   Does a += b for a scalar and b a vector or strided vector, on CPU.
+   (i.e. a += sum(b)).
+
+   They must be normalized form, i.e. all axes trivial except raxis 0
+   of b, and b must not have negative stride.  (This is to allow
+   the BLAS template overrides).
+
+   This generic form of the template works for integer types (and would work,
+   if used, for float and double).  We separately specialize this
+   template for float and double, to use BLAS calls.
+*/
+template <class T>
+class ScalarPlusEqStvectorOp<T, kCpuDevice>: public Op {
+ public:
+  ScalarPlusEqStvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
+  int32 Properties() { return kConcreteOp; }
+
+  Op *Copy() const override {
+    return new ScalarPlusEqStvectorOp<T, kCpuDevice>(a_, b_);
+  }
+
+  void Do() {
+    DebugNormalOp(a_, kReadWrite, b_, kRead);
+    int32 dim = b_.Pattern().dims[0],
+        b_stride = b_.Pattern().strides[0];
+    T *a_data = a_.GetData<T>();
+    const T *b_data = b_.GetData<T>();  // the elements summed come from b.
+    T sum(0);
+    // In future could look into unrolling this loop if it becomes a bottleneck.
+    for (int32 i = 0; i < dim; i++)
+      sum += b_data[i * b_stride];
+    *a_data += sum;
+  }
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+
+
+// Specialization for T = float; uses BLAS to compute sum(b).
+template <>
+class ScalarPlusEqStvectorOp<float, kCpuDevice>: public Op {
+ public:
+  ScalarPlusEqStvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
+  int32 Properties() { return kConcreteOp; }
+  Op *Copy() const override {
+    return new ScalarPlusEqStvectorOp<float, kCpuDevice>(a_, b_);
+  }
+
+  void Do() {
+    DebugNormalOp(a_, kReadWrite, b_, kRead);
+    // sum(b) is the dot product of b with a '1' read at stride 0.  (cblas_sasum
+    // cannot be used for this: it returns the sum of absolute values.)
+    const float one = 1.0f;
+    *a_.GetData<float>() += cblas_sdot(b_.Pattern().dims[0], b_.GetData<float>(),
+                                       b_.Pattern().strides[0], &one, 0);
+  }
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+// Specialization for T = double; uses BLAS to compute sum(b).
+template <>
+class ScalarPlusEqStvectorOp<double, kCpuDevice>: public Op {
+ public:
+  ScalarPlusEqStvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
+  int32 Properties() { return kConcreteOp; }
+  Op *Copy() const override {
+    return new ScalarPlusEqStvectorOp<double, kCpuDevice>(a_, b_);
+  }
+
+  void Do() {
+    DebugNormalOp(a_, kReadWrite, b_, kRead);
+    // sum(b) is the dot product of b with a '1' read at stride 0.  (cblas_dasum
+    // cannot be used for this: it returns the sum of absolute values.)
+    const double one = 1.0;
+    *a_.GetData<double>() += cblas_ddot(b_.Pattern().dims[0], b_.GetData<double>(),
+                                        b_.Pattern().strides[0], &one, 0);
+  }
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+/**
+   Operation doing a += b with a a vector and b a scalar.  (I.e. add
+   a constant elementwise to a vector).
+
+   May not be used if a and b overlap.
+*/
+template <class T>
+class StvectorPlusEqScalarOp<T, kCpuDevice>: public Op {
+ public:
+  StvectorPlusEqScalarOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
+  int32 Properties() { return kConcreteOp; }
+  Op *Copy() const override {
+    return new StvectorPlusEqScalarOp<T, kCpuDevice>(a_, b_);
+  }
+
+  void Do() {
+    DebugNormalOp(a_, kReadWrite, b_, kRead);
+    int32 dim = a_.Pattern().dims[0],
+        a_stride = a_.Pattern().strides[0];
+    T *a_data = a_.GetData<T>();
+    // Read the scalar from b (not from a) once, before the loop; a and b may
+    // not overlap, so writing to a cannot change it.
+    const T b = *b_.GetData<T>();
+    for (int32 i = 0; i < dim; i++)
+      a_data[i * a_stride] += b;
+  }
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // include guard of tensor/linear-cpu-ops.h
diff --git a/src/tensor/linear-ops.cc b/src/tensor/linear-ops.cc
index 47c2ed520cd..0727864998f 100644
--- a/src/tensor/linear-ops.cc
+++ b/src/tensor/linear-ops.cc
@@ -23,6 +23,15 @@ namespace kaldi {
 namespace tensor {
 
 void PlusEqOp::Expand(std::vector<std::unique_ptr<Op> > *ops) const {
+
+  if (ReferenceMode() && a_.DeviceType() == kCpuDevice) {
+    // In reference mode, always use the reference implementation.
+    Op *ans;
+    SET_TO_TEMPLATED_CPU_OP_ALL(ans, a_.Dtype(), a_, b_);
+    return ans;
+  }
+
+  // The generic implementation requires us to first normalize the patterns.
   Pattern a_pattern = a_.Impl().pattern,
       b_pattern = b_.Impl().pattern;
   NormalizePatterns({a_pattern, b_pattern});
@@ -37,10 +46,9 @@ void PlusEqOp::Expand(std::vector<std::unique_ptr<Op> > *ops) const {
     b = WithPattern(b, b_pattern);
 
   /*
-    The case-statement values in the switch statement below may be
-    interpreted in groups of 3 hex characters, are 0xAAABBBCCC,
-    pertaining to Tensors a, b and c respectively.  See
-    GetPatternCode() in pattern-utils.h for documentation on
+    The case-statement values in the switch statement below may be interpreted
+    in groups of 3 hex characters, are 0xAAABBB, pertaining to Tensors a and b
+    respectively.  See GetPatternCode() in pattern-utils.h for documentation on
     the meanings of the values and our notation with X,x,1.
 
   */
@@ -52,26 +60,69 @@ void PlusEqOp::Expand(std::vector<std::unique_ptr<Op> > *ops) const {
   /*
     The case-statement values in the switch statement below may be interpreted
     in groups of 3 hex characters, are 0xAAABBB, pertaining to Tensors a and b.
-    See GetPatternCode() in pattern-utils.h for documentation on the meanings of
+    See ComputePatternCode() in pattern-utils.h for documentation on the meanings of
     the values and our notation with X,x,1.
+       Quick legend:
+             X means dim >1, stride = 1
+             x means dim >1, stride != 1
+             1 means dim == 1, stride = 0.
+                 (Note: the numbers in case-statements below exclude negative
+                 strides because bit 11 of the 12-bit chunks would be set if
+                 there were a negative stride).
    */
 
   // We are doing a += b.
   switch(combined_code) {
-    // A scalar plus a scalar
-    case 0x000000000:
-      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(), ScalarPlusEqScalar, a, b);
+    // Scalar += scalar.
+    case 0x000000:   // () +=  ()
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(), ScalarPlusEqScalarOp, a, b);
       break;
+    // We may split apart some of the following cases in future.
+    // They all represent vector += vector.
+    case 0x101101:  //  (X) += (X)
+    case 0x001001:  //  (x) += (x)
+    case 0x101001:  //  (X) += (x)
+    case 0x001101:  //  (x) += (X)
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(), StvectorPlusEqStvectorOp, a, b);
+      break;
+    // Scalar += (sum of) vector or strided vector
+    case 0x000101:  //  () += (X)
+    case 0x000001:  //  () += (x)
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(), ScalarPlusEqStvectorOp, a, b);
+      break;
+    // vector or strided vector += scalar.
+    // We could later split apart the strided and non-strided cases.
+    case 0x101000:  //  (X) += ()
+    case 0x001000:  //  (x) += ()
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(), StvectorPlusEqScalarOp, a, b);
+      break;
+    // scalar += matrix
+    case 0x000103: { // () += (xX)
+      int32 num_rows = b.Pattern().dims[1];
+      // Create a temporary- a column vector, which is what we call
+      // a vector whose nontrivial axis is raxis 1 instead of raxis 0.
+      Tensor temp({num_rows, 1}, {a.Dtype(), a.Device()});
+      Op *temp_op;
+      // Below we do temp += b.  We could use PlusEqOp for this and also for the
+      // following reduction, but doing it this way avoids an unnecessary layer
+      // of expansion.
+      SET_TO_TEMPLATED_OP_REAL(temp_op, a.Dtype(), a.DeviceType(),
+                               ColVectorEqMatrixOp, temp, b);
+      ops->push_back(std::unique_ptr<Op>(temp_op));
+      // Normalize the temporary vector so its nontrivial axis is raxis 0, by
+      // removing the current raxis 0 and having current raxis 1 shift down.
+      Tensor temp_normalized = Squeeze(temp, 0);
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(),
+                               ScalarPlusEqStvectorOp, a, temp_normalized);
+      break;  // without this we would fall through to the error in 'default'.
+    }
 
     default:
       // Later we can add a more generic implementation that handles arbitrary
       // patterns.
       KALDI_ERR << "Unhandled code: " << std::hex << combined_code;
   }
-
-
-
-
+  ops->push_back(std::unique_ptr<Op>(new_op));
 }
 
 
diff --git a/src/tensor/linear-ops.h b/src/tensor/linear-ops.h
index ef8c5c19fe8..b37f78487cb 100644
--- a/src/tensor/linear-ops.h
+++ b/src/tensor/linear-ops.h
@@ -43,14 +43,11 @@ class PlusEqOp: public Op {
     KALDI_ASSERT(!Overlap(a, b) &&
                  BroadcastableAndCompatible(a, b));
   }
-  PlusEqOp(const PlusEqOp &other):
-      a_(other.a_), b_(other.b_) { }
-
 
-  int32 Properties() { return 0 ; }  // Not concrete.
+  int32 Properties() { return kConcreteOp; }
 
   Op *Copy() const override {
-    return new PlusEqOp(*this);
+    return new PlusEqOp(a_, b_);
   }
 
   // Defined in linear-ops.cc; this function works out the more concrete
diff --git a/src/tensor/linear-ref-ops.h b/src/tensor/linear-ref-ops.h
new file mode 100644
index 00000000000..f6e284b8222
--- /dev/null
+++ b/src/tensor/linear-ref-ops.h
@@ -0,0 +1,87 @@
+// tensor/linear-ref-ops.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_LINEAR_REF_OPS_H_
+#define KALDI_TENSOR_LINEAR_REF_OPS_H_ 1
+
+#include "tensor/tensor.h"
+#include "tensor/op.h"
+
+
+// This header contains the "reference version" of linear Ops;
+// this is the very simple, not-efficient version that runs on
+// CPU when we run in "reference mode" (or when we encounter
+// some combination that can't be run using our normal BLAS-based
+// specialized Ops).
+namespace kaldi {
+namespace tensor {
+
+// Corresponds to the command a += b.
+
+template <typename Real>
+class PlusEqRefOp: public Op {
+ public:
+  // a is the Tensor that is modified, b the Tensor that is added to it;
+  // Real is the type that the data of both is read as.
+  PlusEqRefOp(const Tensor &a, const Tensor &b):
+      a_(a), b_(b) {
+    KALDI_ASSERT(!Overlap(a, b) && Compatible(a, b));
+  }
+
+  // This Op has a working Do(), so it can be executed directly.
+  int32 Properties() { return kConcreteOp; }
+
+  Op *Copy() const override {
+    return new PlusEqRefOp(a_, b_);
+  }
+
+  void Do() const override {
+    RecordUse(a_, kReadWrite);
+    RecordUse(b_, kRead);
+    Do(a_.GetData<Real>(), b_.GetData<Real>(),
+       KALDI_TENSOR_MAX_DIM - 1);
+  }
+
+ private:
+  // Recursive helper: loops over axis 'raxis', recursing on raxis - 1 until
+  // raxis == 0, where it does the elementwise a[...] += b[...].
+  void Do(Real *a, const Real *b, int32 raxis) const {
+    const Pattern &a_pattern = a_.Pattern(), &b_pattern = b_.Pattern();
+    int32 dim = std::max<int32>(a_pattern.dims[raxis], b_pattern.dims[raxis]),
+        a_stride = a_pattern.strides[raxis],
+        b_stride = b_pattern.strides[raxis];
+    if (raxis == 0) {
+      for (int32 i = 0; i < dim; i++)
+        a[i * a_stride] += b[i * b_stride];
+    } else {
+      for (int32 i = 0; i < dim; i++)
+        Do(a + i * a_stride, b + i * b_stride, raxis - 1);
+    }
+  }
+
+  Tensor a_;
+  Tensor b_;
+};
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_LINEAR_REF_OPS_H_
diff --git a/src/tensor/linear-special-ops.h b/src/tensor/linear-special-ops.h
index 49e9961cb54..3f463026607 100644
--- a/src/tensor/linear-special-ops.h
+++ b/src/tensor/linear-special-ops.h
@@ -25,586 +25,53 @@
 
 // This Ops are more specialized forms of the Ops declared in linear-ops; these
 // correspond to more specific combinations of Tensor shapes.
+// Just the template declarations are here; the overrides for CPU and
+// GPU are in linear-cpu-ops.h and linear-gpu-ops.h.
 namespace kaldi {
 namespace tensor {
 
 
 /**
-   Add operation taking two Tensors (T), i.e. a += b, which may include
-   summation and/or broadcasting depending on the dimensions of a and b
-
-   May not be used if a and b overlap.
-*/
-class AddScalarScalarOp {
- public:
-
-  AddScalarScalarOp(const Tensor &a, Tensor &b):
-      a_(a), b_(b) {
-    KALDI_PARANOID_ASSERT(
-        a.NumAxes() == 0 && b.NumAxes() == 0 &&
-        !Overlap(a, b) && Compatible(a, b));
-  }
-
-  virtual void Do() const {
-
-    KALDI_ERR << "Execution not supported for this Op (not concrete); "
-        "please expand ";
-  }
-
-
-  AddOp(const AddOp &other):
-      a_(other.a_), b_(other.b_) { }
-
-
-  int32 Properties() { return 0 ; }  // Not concrete.
-
-  Op *Copy() const override {
-    return new AddOp(*this);
-  }
-
-  // Defined in linear-ops.cc; this function works out the more concrete
-  // structure (e.g. vectors, matrices, things like that) and chooses the
-  // appropriate implementation
-  void Expand(std::vector<std::unique_ptr<Op> > *ops) const override;
-
-  void GetBackwardDerivOps(
-      DerivMap *map,
-      std::vector<std::unique_ptr<Op> > *ops) const override {
-    std::shared_ptr<TensorImpl> b_deriv = map->DerivIfPresent(b_);
-    if (b_deriv == nullptr)  // b wasn't tracked, so a won't be.
-      return;
-    // else return the Op corresponding to:
-    // b_deriv_ += a_deriv_.
-    ops->push_back(std::unique_ptr<Op>(new AddOp(AsTensor(b_deriv),
-                                                 map->Deriv(a_))));
-
-  }
-  void GetForwardDerivOps(
-      DerivMap *map,
-      std::vector<std::unique_ptr<Op> > *ops) const override {
-    std::shared_ptr<TensorImpl> a_deriv = map->DerivIfPresent(b_);
-    if (b_deriv == nullptr)  // b wasn't tracked, so a won't be.
-      return;
-    // else return the Op corresponding to:
-    // a_deriv_ += b_deriv_.
-    ops->push_back(std::unique_ptr<Op>(new AddOp(AsTensor(a_deriv),
-                                                 map->Deriv(b_))));
-  }
-
-
- private:
-  Tensor a_;
-  Tensor b_;
-};
-
+   Operation doing a += b with a and b scalar.
 
+   a and b may not point to the same data.
 
-/**
-   Assign operation, doing
-      b := a,
-   which may actually do summation and/or broadcasting depending on the
-   dimensions of b and a.  Formally, and with reference to the notation
-   in pattern.h, we can describe its operation as follows:
-       - Set all elements of b to zero
-       - For each index-tuple i in the index-tuple-set of b, b[i] += a[i].
-   Must not be used if b and a overlap.
-
-   "Assign" means that this is the first time we are setting the memory
-   involved, except possibly for things that don't generate any derivative
-   for various reasons.
-
-   See also SetOp, which is for when the memory might previously have
-   been written to by something differentiable.]
+   Template parameter T is the datatype concerned (say, T = float)
+   D is the DeviceType enum, kCpuDevice or kGpuDevice.
 
-   Note: in the backprop for AssignOp, we can do Unset() after, which
-   means the memory concerned must no longer be read from.
+   Will be specialized for CPU and GPU in linear-cpu-ops.h and linear-gpu-ops.h
 */
-class AssignOp {
- public:
-
-  AssignOp(const Tensor &a, Tensor &b):
-      a_(a), b_(b) {
-    KALDI_ASSERT(!Overlap(a, b) &&
-                 BroadcastableAndCompatible(a, b));
-  }
-  AssignOp(const AssignOp &other):
-      a_(other.a_), b_(other.b_) { }
-
-  void Do() const override {
-    Set(a, &b);  // b := a
-  }
-  Op *Copy() const override {
-    return new AssignOp(*this);
-  }
-
-  void GetBackwardDerivOps(
-      DerivMap *map,
-      std::vector<std::unique_ptr<Op> > *ops) const override {
-    std::shared_ptr<TensorImpl> a_deriv = map->DerivIfPresent(a_);
-    if (a_deriv == nullptr)  // a wasn't tracked, so b won't be.
-      return;
-    // Return the Op corresponding to:
-    // a_deriv_ += b_deriv_.
-    ops->push_back(std::unique_ptr<Op>(new AddOp(map->Deriv(b_),
-                                                 AsTensor(a_deriv))));
-  }
-
-  void GetForwardDerivOps(
-      DerivMap *map,
-      std::vector<std::unique_ptr<Op> > *ops) const override {
-    std::shared_ptr<TensorImpl> a_deriv = map->DerivIfPresent(a_);
-    if (a_deriv == nullptr)  // a wasn't tracked, so b won't be.
-      return;
-    // else return the Op corresponding to:
-    // b_deriv_ := a_deriv_.
-    ops->push_back(std::unique_ptr<Op>(new AssignOp(AsTensor(a_deriv),
-                                                  map->Deriv(b_))));
-  }
- private:
-   Tensor a_;
-   Tensor b_;
-};
-
-
-
+template <class T, DeviceType D>
+class ScalarPlusEqScalarOp;
 
 
 /**
-   class Op is a base-class for objects that are created when we do operations
-   on Variables.  The important thing to know here is that the Variables in
-   question will always have been allocated with particular dimensions,
-   and possibly even contain defined values, before we get to the Op.
-   Examples of Ops include,
-      a := b * c
-      a += b
-      a *= b
-   where the interpretation of the commands above will depend on the
-   dimensions of the Tensors involved.
-
-   Notice that all the member functions of class Op are `const`, i.e. they
-   shouldn't change this class (although of course they may change the
-   underlying Tensor data).  This is to remind users that Ops are supposed
-   to be reusable, and calls to this object shouldn't affect the behavior
-   of subsequent calls, except to the extent that the underlying Tensor
-   data has been changed.
- */
-class Op {
- public:
-
-  /**
-     Do whatever it is that this Op does (e.g. execute the command `a += b`,
-     if that was what this Op did)
-   */
-  virtual void Do() const;
-
-  /**
-     Return a copy of this object.  (This won't be needed very often but might
-     possibly be needed in the context of computing higher-order derivatives).
-  */
-  virtual Op *Copy() const;
-
-  /**
-     This is for forward-mode automatic differentiation (a rarely-used thing).
-     It appends to 'ops' the commands corresponding to the forward-mode
-     automatic differentiation w.r.t. this Op.
-
-       @param [in,out] 'map' is the map that maps from tensors to the
-             corresponding derivative values.  May be modified by adding
-             new key/value pairs.
-       @param [out] ops  This funtion will *append* to `ops` the
-             commands for computing the derivatives associated with
-             this Op in forward-mode automatic differentiation.  If none
-             of the inputs to the Op were tracked w.r.t. `map`,
-             nothing will be done.
-
-     Example: if the command was "a += b", the derivative operation would
-     be: deriv(a) += deriv(b).  In most cases these Ops would be executed
-     immediately and then deleted.
-   */
-  virtual void GetForwardDerivOps(DerivMap *map,
-                                  std::vector<std::unique_ptr<Op> > *ops) const;
-
+   Operation doing a += b with a and b possibly-strided vectors.
 
+   a and b may not overlap.
 
-  /**
-     This is for reverse-mode automatic differentiation (the normal type of
-     autograd).
+   Template parameter T is the datatype concerned (say, T = float)
+   D is the DeviceType enum, kCpuDevice or kGpuDevice.
 
-       @param [in,out] map   This object maps from tensors to the
-                       corresponding derivative values.  It may be changed by
-                       adding new elements to the map, if its Deriv() function
-                       is called.
-       @param [out]    ops  This function may *append* to 'ops' the commands
-                       used in the reverse-mode automatic differentiation.
-                       (Note: nothing will be appended if none of the inputs
-                       to the Op were already tracked w.r.t. 'map'.)
-
-     Example: if the command was "a += b * c", the operations added to
-     'ops' would correspond to `deriv(b) += deriv(a) * c` and
-     `deriv(c) += deriv(a) * b`.
-  */
-  virtual void GetBackwardDerivOps(DerivMap *map,
-                                   std::vector<std::unique_ptr<Op> > *ops) const;
-
-
-  /** Destructor.  It's important for efficiency of memory use to destroy Ops as
-      soon as you won't need them any more, because it may trigger the freeing
-      of Tensors and hence Storage regions.
-  */
-  virtual ~Op();
-};
-
-
-
-class Op {
-
-  Op(): tick_(GetTick()) { }
-
-  /// InputIteratorBegin() and InputIteratorEnd() form the begin and
-  /// end points of a list of Variables that were inputs of this Op
-  /// but were not outputs.  This is used by the backprop code when finding
-  /// the topological order of ops.  (Note: output variables themselves
-  /// refer to Ops, so if we included them in the input list we'd
-  /// get a cycle in the graph).  These Variables are expected to
-  /// still have their graph information (i.e. sub-classes of class Op
-  /// class must not call RemoveGraph() on the members of this list).
-  virtual Op *DepIteratorBegin() = 0;
-  virtual Op *DepIteratorEnd() = 0;
-
-
-
-  // This number >= 0 is used to determine the order of Ops in a graph; each
-  // time we generate an Op we increment a global counter.  Doing it this way,
-  // rather than via topological sorting, is simpler.
-  int64 GetTimestamp() const final { return tick_; }
-
-  virtual void Backprop();
-
- protected:
-
-  /**
-     The time (`GetTick()`) at which this Op was created; should be set
-     in child classes by doing:
-      `tick_ = GetTick()`
-     as the last statement of the constructor.   (This ensures the
-     tick is later-numbered than any ticks stored in the ChangeTracker
-     code by operations called from the constructor.)
-  */
-  int64 tick_;
-
-
-  /*
-    This function intended to be called from the Backprop() routines
-    of child classes, for example:
-       ` if (DebugMode()) {  CheckTensorTime(*a_);  } `
-    This will die if the memory underlying the Tensor being checked has been
-    modified more recently than tick_.
-  */
-  inline void CheckTensorTime(const Tensor &tensor) {
-    if (DebugMode()) {
-    }
-  }
-
-
-
-
-};
-
-
-template <class OpImpl>
-class OpPointer {
-
-  std::shared_ptr<OpImpl>
-
-}
-
-
-
-/**
-   This is a special version of base-class Op that is created when
-   any SharedGrad is allocated for a non-leaf Variable.  Its purpose
-   is to ensure that, when we get to this Op in the backprop, we deallocate
-   the data underlying the gradient Tensor (so we don't keep gradient
-   Tensors around for longer than is needed).
+   Will be specialized for CPU and GPU in linear-cpu-ops.h and linear-gpu-ops.h
 */
-class DeallocateOp: public Op {
-
-  // This operator has no dependencies as it will be created when a SharedGrad
-  // is first initialized, when no Ops have been done on it.
-  Op *DepIteratorBegin() override { return NULL; }
-  Op *DepIteratorEnd() override { return NULL; }
-
-  void Backprop() override {
-    if (auto s = tensor_to_deallocate_.lock())
-      ZeroDeallocating(s.get());
-  }
-
- private:
-  // Since we just want to deallocate its underlying data, there is no point
-  // increasing its ref-count; we can just shrug our shoulders if it has
-  // already been deleted.d
-  std::weak_ptr<Tensor> tensor_to_deallocate_;
-};
+template <class T, DeviceType D>
+class StvectorPlusEqStvectorOp;
 
 
 /**
-   A slight simplification of class UnaryOp for cases where it's
-   done in-place.
- */
-class InPlaceUnaryOp: public Op {
-
-};
-
-
-class UnaryOp: public Op {
+   Operation doing a += b with a a vector and b a scalar.  (I.e. add
+   a constant elementwise to a vector).
 
-  //
-  UnaryOp(const Variable &input, const Variable &output) {
-    if
-
-
-
-    if (SameVariable(input, output)) {
-
-    } else {
-    }
-  }
-
- public:
-
-  std::shared_ptr<Op> op1_;
-  std::shared_ptr<Op> op2_;
-
-
-
-
-}
-
-class GenericOp: public Op {
-
-  // GenericOp is a child of class Op that is intended as a generic base-class
-  // for expressions.
-
-
-
- protected:
-  // Constructor, to be used from child classes.  This base-class takes care
-  // of storing the list of input Variables for purposes of tracing dependencies;
-  //
-  //  @param [in] input_vars  The list of input Variables (meaning: Variables
-  //                   that are inputs to, but not outputs of, i.e. not modified
-  //                   by, this Op).
-  //  @param [in] output_var  The output Variable of this Op, i.e. the Variable
-  //                   which is modified or set by it.  We may provide another
-  //                   constructor taking ArrayRef<Variable> in this position,
-  //                   as and when we need to support Ops that operate on
-  //                   multiple output Variables.
-  void Op(const ArrayRef<Variable> &input_vars,
-          const Variable &output_var);
-
-
-  // TODO: maybe have a constructor of Op that takes an ArrayRef of the inputs
-  // that are not also outputs?  Could use that for graph traversal.
-
- private:
-
-  // num_inputs_ is the number of base Variables that are the base Variables of
-  // inputs of this Op (but not of outputs).  These are stored in the
-  // array 'inputs_'.
-
-  // inputs_ is a pointer to an array of shared_ptr<Variable> of size num_inputs_, which
-  // will be be allocated by new [] in the constructor and deleted by delete []
-  // in the destructor.
-
-  // This is a list of the Op-input-nodes (see glossary in tensor.h for explanation).
-  // We don't store the Op-output-nodes here; instead, they refer to this Op in
-  // their op_lists.
-  // (We don't store the Node(s) that is(are) the outputs of the Op here; its own
-  // op_list refers to this Op).
-  std::shared_ptr<Node> *inputs_;
-
-  int32 num_inputs_;
-
-  // If num_inputs_ is 1, then inputs_ is
-  void *inputs_;
-
-  int64 n_;  // initialized from the counter when this object is created.
-  std::shared_ptr<Op> tail_;  // TODO: make it unique_ptr?
- protected:
-  // Return true if this is not the last Op in the list of Ops attached to this
-  // base Variable (can be useful to know whether we need bother to scale the
-  // derivative in a scaling operation, for instance).
-  bool HasTail() const { return tail_ != nullptr; }
-};
-
-
-class AddToOp: public Op {
- public:
-
-  // This Op corresponds to the computation:
-  //   \f$  b  :=  alpha a  +   beta b.  \f$
-  // with broadcasting or summation depending on the dimensions
-  // involved.  Alpha and beta are constants, and differentiation w.r.t. them is
-  // not supported (you wouldn't reach this code if a or b were actual
-  // variables.)
-  //
-  // The Op is only constructed if b.Tracked() (which it would normally if
-  // a.Tracked()).
-  AddToOp(float alpha, float beta,
-          const Variable &a, const Variable &b):
-      Op({a}),
-      alpha_(alpha),
-      beta_(beta),
-      a_data_(a.GetData()),
-      a_grad_(a.GetGradIfPresent()),
-      b_data_(b.GetData()),
-      b_grad_(b.GetGrad()) {
-
-    Add(alpha, beta, *a_data_, b_data_.get());
-  }
-
-
-  void Backward() {
-    // Do: a_grad += alpha * b_grad.
-    if (a_grad_ != nullptr)
-      AddTo(alpha_, 1.0, b_grad, &a_grad);
-
-    if (beta_ != 1.0)
-      Scale(beta_, b_grad.get());
-  }
-
- private:
-
-  float alpha_;
-  float beta_;
-
-  // We hold onto all inputs that are not also outputs
-  // (here just a_) for dependency tracking.
-  Variable a_;
-
-  std::shared_ptr<Node> a_node_;
-
-  std::shared_ptr<Tensor> a_data_;
-  // a_grad_ will be NULL if a was not tracked.
-  std::shared_ptr<Tensor> a_grad_;
-  std::shared_ptr<Tensor> b_data_;
-  std::shared_ptr<Tensor> b_grad_;
-
-  Variable b_;
-  bool must_scale_b_grad_;
-
-};
-
-
-class AssignOp: public Op {
- public:
-
-  // This Op corresponds to the computation:
-  //   \f$  b := a  \f$
-  // with broadcasting or summation depending on the dimensions.
-  //
-  // Constructing this Op will make b tracked if it was already.
-  AssignOp(const Variable &a, const Variable &b):
-      Op({a}),
-      a_data_(a.GetData()),
-      a_grad_(a.GetGradIfPresent()),
-      b_data_(b.GetData()),
-      b_grad_(b.GetGrad()) {
-    Copy(a_data_, b_data_);
-
-      `tick_ = GetTick()`
-  }
-
-
-  void Backward() {
-    // Do: a_grad += alpha * b_grad.
-    if (a_grad_ != nullptr)
-      AddTo(alpha_, 1.0, b_grad, &a_grad);
-
-    if (beta_ != 1.0)
-      Scale(beta_, b_grad.get());
-  }
-
- private:
-
-  float alpha_;
-  float beta_;
-
-  // We hold onto all inputs that are not also outputs
-  // (here just a_) for dependency tracking.
-  Variable a_;
-
-  std::shared_ptr<Node> a_node_;
-
-  std::shared_ptr<Tensor> a_data_;
-  // a_grad_ will be NULL if a was not tracked.
-  std::shared_ptr<Tensor> a_grad_;
-  std::shared_ptr<Tensor> b_data_;
-  std::shared_ptr<Tensor> b_grad_;
-
-  Variable b_;
-  bool must_scale_b_grad_;
-
-};
-
-
-class AssignOp: public Op {
- public:
-
-  // This Op corresponds to the computation:
-  //   \f$  b  :=  alpha a  +   beta b.  \f$
-  // with broadcasting or summation depending on the dimensions
-  // involved.  Obviously alpha and beta are constants,
-  // and differentiation w.r.t. them is not supported.
-  //
-  // The Op is only constructed if b_.Tracked() (which it
-  // would normally if a_.Tracked()).
-  AddToOp(float alpha, float beta,
-          const Variable &a, const Variable &b):
-      Op({a}),
-      alpha_(alpha),
-      beta_(beta),
-      a_data_(a.GetData()),
-      a_grad_(a.GetGradIfPresent()),
-      b_data_(b.GetData()),
-      b_grad_(b.GetGrad()) {
-
-    Add(alpha, beta, *a_data_, b_data_.get());
-  }
-
-
-  void Backward() {
-    // Do: a_grad += alpha * b_grad.
-    if (a_grad_ != nullptr)
-      AddTo(alpha_, 1.0, b_grad, &a_grad);
-
-    if (beta_ != 1.0)
-      Scale(beta_, b_grad.get());
-  }
-
- private:
-
-  float alpha_;
-  float beta_;
-
-  // We hold onto all inputs that are not also outputs
-  // (here just a_) for dependency tracking.
-  Variable a_;
-
-  std::shared_ptr<Node> a_node_;
-
-  std::shared_ptr<Tensor> a_data_;
-  // a_grad_ will be NULL if a was not tracked.
-  std::shared_ptr<Tensor> a_grad_;
-  std::shared_ptr<Tensor> b_data_;
-  std::shared_ptr<Tensor> b_grad_;
-
-  Variable b_;
-  bool must_scale_b_grad_;
-
-};
+   May not be used if a and b overlap.
 
+   Template parameter T is the datatype concerned (say, T = float)
+   D is the DeviceType enum, kCpuDevice or kGpuDevice.
 
+   Will be specialized for CPU and GPU in linear-cpu-ops.h and linear-gpu-ops.h
+*/
+template <class T, DeviceType D>
+class StvectorPlusEqScalarOp;
 
 }  // namespace tensor
 }  // namespace kaldi
diff --git a/src/tensor/op.h b/src/tensor/op.h
index d722d716ba4..50d9b861910 100644
--- a/src/tensor/op.h
+++ b/src/tensor/op.h
@@ -29,6 +29,7 @@ class Variable;
 
 
 enum OpProperties {
+  kNotConcreteOp = 0,
   kConcreteOp = 1,  // An Op that is concrete is one that can be executed
                     // directly, i.e. its Do() function works; these Ops will
                     // generally correspond to a single function call, e.g. a
@@ -110,6 +111,10 @@ class Op {
      Example: if the command was "a += b", the derivative operation would
      be: deriv(a) += deriv(b).  In most cases these Ops would be executed
      immediately and then deleted.
+
+     This only has to be defined for Ops that are called directly by
+     user-level code; ops that are only encountered as a byproduct of
+     expanding other Ops do not have to define this function.
   */
   virtual void GetForwardDerivOps(DerivMap *map,
                                   std::vector<std::unique_ptr<Op> > *ops) const {
@@ -133,6 +138,10 @@ class Op {
      Example: if the command was "a += b * c", the operations added to
      'ops' would correspond to `deriv(b) += deriv(a) * c` and
      `deriv(c) += deriv(a) * b`.
+
+     This only has to be defined for Ops that are called directly by
+     user-level code; ops that are only encountered as a byproduct of
+     expanding other Ops do not have to define this function.
   */
   virtual void GetBackwardDerivOps(DerivMap *map,
                                    std::vector<std::unique_ptr<Op> > *ops) const {
@@ -143,7 +152,7 @@ class Op {
 
   /** Destructor.  It's important for efficiency of memory use to destroy Ops as
       soon as you won't need them any more, because it may trigger the freeing
-      of Tensors and hence Storage regions.
+      of Tensors and hence Storage objects.
   */
   virtual ~Op();
  protected:
@@ -164,7 +173,10 @@ class Op {
 };
 
 
-// the following macro is primarily for use inside other macros defined below.
+
+#ifdef HAVE_CUDA
+// The following macro is primarily for use inside other macros defined below.
+// This version is for when we compile with CUDA support.
 #define SET_TO_TEMPLATED_OP_DEVICE(pointer_name, device_type, OpName, T, ...) \
    {                                                                      \
    switch (device_type) {                                                 \
@@ -176,11 +188,26 @@ class Op {
     KALDI_ERR << "Invalid device type " << int32(device_type);            \
   }  while (0)
 // the while(0) is to allow a semicolon after the invocation.
-
+#else
+// The following macro is primarily for use inside other macros defined below.
+// This version is for when we compile without CUDA support.
+#define SET_TO_TEMPLATED_OP_DEVICE(pointer_name, device_type, OpName, T, ...) \
+  do {                                                                    \
+   switch (device_type) {                                                 \
+    case kCpuDevice:                                                      \
+      pointer_name = new OpName<T, kCpuDevice>(__VA_ARGS__); break;       \
+    case kGpuDevice:                                                      \
+    KALDI_ERR << "You did not compile for CUDA, reconfigure with "        \
+                 "CUDA support.";                                         \
+    default:                                                              \
+    KALDI_ERR << "Invalid device type " << int32(device_type);            \
+  } } while (0)
+// the do/while(0) makes this one statement and allows a semicolon after it.
+#endif
 
 // the following macro is to be used to dispatch device and dtype-specific
 // implementations.  The idea is that you have defined a template like
-// template <class Dtype, class DeviceType> class OpName
+// template<class Dtype, class DeviceType> class OpName
 // and have specialized that template for the various combinations.
 // This executes commands like:
 //    pointer_name = new OpName<float, kCpu>(a, b, c);
@@ -189,13 +216,13 @@ class Op {
 #define SET_TO_TEMPLATED_OP_ALL(pointer_name, dtype, device_type, OpName, ...) \
     switch (dtype) {                                \
      case kFloatDtype:                              \
-     SET_TO_TEMPLATE_OP_DEVICE(pointer_name, device_type, OpName, float, __VA_ARGS__); \
+     SET_TO_TEMPLATED_OP_DEVICE(pointer_name, device_type, OpName, float, __VA_ARGS__); \
       break;                                        \
      case kDoubleDtype:                             \
-     SET_TO_TEMPLATE_OP_DEVICE(pointer_name, device_type, OpName, double, __VA_ARGS__); \
+     SET_TO_TEMPLATED_OP_DEVICE(pointer_name, device_type, OpName, double, __VA_ARGS__); \
       break;                                        \
      case kInt32Dtype:                             \
-     SET_TO_TEMPLATE_OP_DEVICE(pointer_name, device_type, OpName, int32, __VA_ARGS__); \
+     SET_TO_TEMPLATED_OP_DEVICE(pointer_name, device_type, OpName, int32, __VA_ARGS__); \
       break;                                        \
     default:                                        \
       KALDI_ERR << "Invalid dtype (this op only allows float or double): " \
@@ -206,10 +233,39 @@ class Op {
 #define SET_TO_TEMPLATED_OP_REAL(pointer_name, dtype, device_type, OpName, ...) \
     switch (dtype) {                                \
      case kFloatDtype:                              \
-       SET_TO_TEMPLATE_OP_DEVICE(pointer_name, device_type, OpName, float, __VA_ARGS__); \
+       SET_TO_TEMPLATED_OP_DEVICE(pointer_name, device_type, OpName, float, __VA_ARGS__); \
+      break;                                        \
+     case kDoubleDtype:                             \
+       SET_TO_TEMPLATED_OP_DEVICE(pointer_name, device_type, OpName, double, __VA_ARGS__); \
+      break;                                        \
+    default:                                        \
+      KALDI_ERR << "Invalid dtype (this op only allows float or double): " \
+                << int32(dtype);                              \
+  } while(0)
+// the while(0) is to allow a semicolon after the invocation.
+
+
+#define SET_TO_TEMPLATED_CPU_OP_REAL(pointer_name, dtype, OpName, ...) \
+  do { switch (dtype) {                             \
+     case kFloatDtype:                              \
+       pointer_name = new OpName<float, kCpuDevice>(__VA_ARGS__);      \
+       break;                                       \
+     case kDoubleDtype:                             \
+       pointer_name = new OpName<double, kCpuDevice>(__VA_ARGS__);     \
+       break;                                       \
+     default:                                       \
+       KALDI_ERR << "Invalid dtype (this op only allows float or double): " \
+                 << int32(dtype);                   \
+  } } while(0)
+// CPU-only dispatch on dtype; do/while(0) keeps it one statement needing a ';'.
+
+#define SET_TO_TEMPLATED_CPU_OP_ALL(pointer_name, dtype, OpName, ...) \
+    switch (dtype) {                                \
+     case kFloatDtype:                              \
+       pointer_name = new OpName<float, kCpuDevice>(__VA_ARGS__); break;       \
       break;                                        \
      case kDoubleDtype:                             \
-       SET_TO_TEMPLATE_OP_DEVICE(pointer_name, device_type, OpName, double, __VA_ARGS__); \
+       pointer_name = new OpName<double, kCpuDevice>(__VA_ARGS__); break;       \
       break;                                        \
     default:                                        \
       KALDI_ERR << "Invalid dtype (this op only allows float or double): " \
diff --git a/src/tensor/pattern-utils.cc b/src/tensor/pattern-utils.cc
index 2eeae14949c..56c0bb70b95 100644
--- a/src/tensor/pattern-utils.cc
+++ b/src/tensor/pattern-utils.cc
@@ -450,7 +450,7 @@ void SortAxes(Pattern *pattern) {
 }
 
 void SortTupleAxes(ArrayRef<Pattern*> patterns) {
-
+  // TODO.
 }
 
 void Transpose(int32 raxis1, int32 raxis2, Pattern *p) {
diff --git a/src/tensor/storage.h b/src/tensor/storage.h
index 8358e9abca2..7add77c5510 100644
--- a/src/tensor/storage.h
+++ b/src/tensor/storage.h
@@ -35,7 +35,6 @@ struct StorageAux;
 class Storage {
  public:
 
-
   // This initializes a ChangeTracker object in this->tracker if it
   // does not already exist, and returns its address.
   ChangeTracker *GetChangeTracker();
@@ -48,7 +47,7 @@ class Storage {
       return data;
     } else {
       Allocate();
-      if (zero_upon_allocation_)
+      if (zero_on_allocation_)
         Zero();
       return data;
     }
@@ -56,12 +55,12 @@ class Storage {
 
   /**
      This is called from TensorImpl when we call AllowUndefined() on it.
-     It gives the framework a free pass to not do zero-upon-allocation
+     It gives the framework a free pass to not do zero-on-allocation
      on the part of memory underlying this particular TensorImpl.  It
      will also cause data_ to be allocated if it was not already allocated.
   */
   inline void AllowUndefined(const TensorImpl &impl) {
-    if (data_ == nullptr && zero_upon_allocation_) {
+    if (data_ == nullptr && zero_on_allocation_) {
       Allocate();
       ZeroEverythingElse(impl);
     }
@@ -115,7 +114,7 @@ class Storage {
      matrices, since conceptually the main operation we do on deriv_ matrices is
      to add to them.
   */
-  inline void ZeroUponAllocation() { zero_upon_allocation_ = true; }
+  inline void ZeroOnAllocation() { zero_on_allocation_ = true; }
 
 
 
@@ -160,7 +159,7 @@ class Storage {
   // as a unique identifier.
   int64 id_;
 
-  bool zero_upon_allocation_;
+  bool zero_on_allocation_;
 
   // num_bytes is the number of bytes in the region we have allocated
   // (or are going to allocate).
diff --git a/src/tensor/tensor-common.h b/src/tensor/tensor-common.h
index 0311fc9fbfe..af18a23e34e 100644
--- a/src/tensor/tensor-common.h
+++ b/src/tensor/tensor-common.h
@@ -96,14 +96,15 @@ enum StridePolicy {
   kCopyStrides   // Means: use the exact strides provided.
 };
 
-/// Enumeration that says whether to zero a freshly initialized Tensor.
+/// Enumeration that says whether to zero a freshly initialized Tensor.  Note:
+/// the Tensor won't actually be zeroed when you construct it, it will be zeroed
+/// whenever it's actually needed (delayed allocation).
 enum InitializePolicy {
   kZeroData,
   kUninitialized
 };
 
 
-
 /// This enumeration value lists the unary functions that we might
 /// want to apply to Tensors; it exists so that much of the glue
 /// code can be templated.
diff --git a/src/tensor/tensor-settings.h b/src/tensor/tensor-settings.h
index 0de7dca2cf2..43d4bea7e77 100644
--- a/src/tensor/tensor-settings.h
+++ b/src/tensor/tensor-settings.h
@@ -37,82 +37,6 @@ namespace kaldi {
 namespace tensor {
 
 
-Device GetDefaultDevice();
-void SetDefaultDevice(Device device);
-
-// Mechanism to set the default device within a scope by constructing a variable
-// that exists only within that scope.
-class WithDeviceAs {
- public:
-  // Example:
-  // {
-  //   WithDeviceAs _(kCudaDevice);
-  //   // code in this block uses this default.  the variable
-  //   // name is _ because we don't need to access it.
-  // }
-  inline WithDeviceAs(DeviceType device_type):
-      prev_default_(GetDefaultDevice()) {
-    SetDefaultDevice(Device(device_type));
-  }
-  inline WithDeviceAs(Device device):
-      prev_default_(GetDefaultDevice()) {
-    SetDefaultDevice(device);
-  }
-  ~WithDeviceAs() { SetDefaultDevice(prev_default_); }
-
- private:
-  Device prev_default_;
-};
-
-
-
-DataType GetDefaultDtype();
-void SetDefaultDtype(DataType dtype);
-
-class WithDtypeAs {
- public:
-  // Example:
-  // {
-  //   WithDtypeAs _(kDoubleDtype);
-  //   // code in this block uses this default.  the variable
-  //   // name is _ because we don't need to access it.
-  // }
-  inline WithDtypeAs(DataType dtype):
-      prev_default_(GetDefaultDtype()) {
-    SetDefaultDtype(dtype);
-  }
-  ~WithDtypeAs() { SetDefaultDtype(prev_default_); }
-
- private:
-  DataType prev_default_;
-};
-
-
-
-// struct TensorOptions is used as an arg for some constructors
-// when creating Tensors and Variables; it allows flexibility
-// in specifying the device and/or dtype.  See the examples
-// shown where constructors of Tensor or Variable are declared.
-struct TensorOptions {
-  DataType dtype;
-  Device device;
-
-  TensorOptions(): dtype(GetDefaultDtype()),
-                   device(GetDefaultDevice()) { }
-  TensorOptions(DataType dtype):
-      dtype(dtype), device(GetDefaultDevice()) { }
-  TensorOptions(Device device):
-      dtype(GetDefaultDtype()), device(device) { }
-  TensorOptions(DeviceType device_type):
-      dtype(GetDefaultDtype()), device(device_type) { }
-  TensorOptions(DataType dtype, Device device):
-      dtype(dtype), device(device) { }
-  TensorOptions(DataType dtype, Device device_type):
-      dtype(dtype), device(device_type) { }
-  TensorOptions(const TensorOptions &other):
-      dtype(other.dtype), device(other.device) { }
-};
-
 
 // Global variable, initialized from zero, that is used in GetTick().
 // This is defined in tensor-settings.cc.
@@ -122,9 +46,9 @@ inline int64 NextTick() { return ++g_tick_counter; }
 
 // debug_mode activates code that checks for invalidated data in the backprop
 // pass; see "Invalidated:" in glossary in tensor.h.
-// Don't access this variable directly,
-extern bool debug_mode;     // Do not access directly!
-extern int64 debug_start_tick;   // Do not access directly!
+// Don't access this variable directly.
+extern bool g_debug_mode;     // Do not access directly!
+extern int64 g_debug_start_tick;   // Do not access directly!
 
 inline bool DebugMode() {
   return debug_mode;
@@ -162,35 +86,27 @@ class WithDebugModeAs {
 };
 
 
+inline bool DebugMode() {  // current value of the global debug-mode flag.
+  return g_debug_mode;
+}
+inline void SetDebugMode(bool b) {  // sets the global debug-mode flag to b.
+  if (!g_debug_mode)  // not currently in debug mode: take a fresh start tick.
+    g_debug_start_tick = NextTick();
+  g_debug_mode = b;
+}
 
-// allow_grad means that gradient tracking is allowed; allow_grad = true
-// is the normal case, and means that if gradient tracking is required
-// (e.g. if the user created a Variable with requires_grad = true, and we do
-// operations that depend on it), then we'll track gradients.
-// It is our way to implement an equivalent of PyTorch's `with torch.no_grad()`.
-// Do not access this variable directly; use AllowGrad() and
-extern thread_local bool allow_grad;
-inline bool AllowGrad() { return allow_grad; }
-inline void SetAllowGrad(bool b) { allow_grad = b; }
-
+extern bool g_reference_mode;     // Do not access directly!
 
-class WithNoGrad {
- public:
-  // Example:
-  // {
-  //   WithNoGrad _;
-  //   // code in this block has gradient tracking disabled.
-  //   // variable name is _ because we won't use it.
-  //
-  // }
-  inline WithNoGrad():
-      prev_default_(AllowGrad()) {
-    SetAllowGrad(false);
-  }
-  ~WithNoGrad() { SetAllowGrad(prev_default_); }
- private:
-  bool prev_default_;
-};
+// Gets 'reference mode' bool.  If true, the simple reference implementation
+// will be used instead of the more optimized (e.g. BLAS-based) implementation.
+// This will typically affect the Expand() call of Ops instead of their
+// Do() call.  Reads the global g_reference_mode.
+inline bool ReferenceMode() {
+  return g_reference_mode;
+}
+inline void SetReferenceMode(bool b) {  // sets the global 'reference mode' bool.
+  g_reference_mode = b;
+}
 
 
 }  // namespace tensor
diff --git a/src/tensor/tensor-utils.cc b/src/tensor/tensor-utils.cc
new file mode 100644
index 00000000000..fa3e7090e44
--- /dev/null
+++ b/src/tensor/tensor-utils.cc
@@ -0,0 +1,71 @@
+// tensor/tensor-utils.cc
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tensor/tensor-utils.h"
+
+namespace kaldi {
+namespace tensor {
+
+// Debug-mode checks for a 2-Tensor "normal" Op: a and b must be broadcastable,
+// non-overlapping, and share dtype and device; then both uses are recorded.
+void DebugNormalOpInternal(const Tensor &a, TensorUseEnum a_use,
+                           const Tensor &b, TensorUseEnum b_use) {
+  if (!Broadcastable(a, b))
+    KALDI_ERR << "Tensors in Operation do not have broadcastable shapes.";
+  if (Overlap(a, b))
+    KALDI_ERR << "Tensors in Operation overlap.";
+  if (a.Dtype() != b.Dtype())
+    KALDI_ERR << "Tensors in Operation have different data-types";
+  if (a.Device() != b.Device())
+    KALDI_ERR << "Tensors in Operation have different device";
+  RecordUse(a, a_use);
+  RecordUse(b, b_use);
+}
+
+
+// Debug-mode checks for a 3-Tensor "normal" Op (overlap matters only if written).
+void DebugNormalOpInternal(const Tensor &a, TensorUseEnum a_use,
+                           const Tensor &b, TensorUseEnum b_use,
+                           const Tensor &c, TensorUseEnum c_use) {
+  if (!Broadcastable(a, b, c))
+    KALDI_ERR << "Tensors in Operation do not have broadcastable shapes.";
+  bool a_written = (a_use == kWrite || a_use == kReadWrite);
+  bool b_written = (b_use == kWrite || b_use == kReadWrite);
+  bool c_written = (c_use == kWrite || c_use == kReadWrite);
+  if ((a_written || b_written) && Overlap(a, b))
+    KALDI_ERR << "Tensors a and b in Operation overlap.";
+  if ((b_written || c_written) && Overlap(b, c))
+    KALDI_ERR << "Tensors b and c in Operation overlap.";
+  if ((a_written || c_written) && Overlap(a, c))
+    KALDI_ERR << "Tensors a and c in Operation overlap.";
+
+  if (a.Dtype() != b.Dtype() || a.Dtype() != c.Dtype())
+    KALDI_ERR << "Tensors in Operation have different data-types";
+  if (a.Device() != b.Device() || a.Device() != c.Device())
+    KALDI_ERR << "Tensors in Operation have different device";
+  RecordUse(a, a_use);
+  RecordUse(b, b_use);
+  RecordUse(c, c_use);
+}
+
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/tensor-utils.h b/src/tensor/tensor-utils.h
index 557cba8dfd9..4bb798fbc06 100644
--- a/src/tensor/tensor-utils.h
+++ b/src/tensor/tensor-utils.h
@@ -37,6 +37,15 @@ inline bool Compatible(const Tensor &a, const Tensor &b) {
   return Compatible(*a.impl_, *b.impl_);
 }
 
+/**
+  This function returns true if the Patterns of a and b are
+  broadcastable (it forwards to Broadcastable() on the two impl_ objects).
+*/
+inline bool Broadcastable(const Tensor &a, const Tensor &b) {
+  return Broadcastable(*a.impl_, *b.impl_);
+}
+
+
 /**
   This function returns true if a and b have the same dtype
   and device and are broadcastable; equivalent to
@@ -187,6 +196,113 @@ void CompressTensors(ArrayRef<Tensor*> tensors);
 Tensor WithPattern(const Tensor &t, const Pattern &pattern);
 
 
+/**
+   This is to be called when any operation makes use of the memory underlying a
+   Tensor.  In debug mode it passes SizeOf(dtype) and the pattern of 'tensor'
+   to the MemoryChecker of its storage; otherwise it does nothing.
+   use_type is one of: kRead, kWrite, kReadWrite, kReadInvalidate, kInvalidate.
+   NOTE(review): use_type is not yet forwarded to the MemoryChecker; confirm
+   the signature of MemoryChecker::RecordUse() before passing it through.
+*/
+inline void RecordUse(const Tensor &tensor,
+                      TensorUseEnum use_type) {
+  if (DebugMode()) {
+    const TensorImpl &impl = *tensor.impl_;
+    impl.storage->GetMemoryChecker()->RecordUse(
+        SizeOf(impl.dtype), impl.pattern);
+  }
+}
+
+
+
+// Implementation for 2-Tensor DebugNormalOp (see declaration below); called in
+// debug mode only.
+void DebugNormalOpInternal(const Tensor &a, TensorUseEnum a_use,
+                           const Tensor &b, TensorUseEnum b_use);
+// Implementation for 3-Tensor DebugNormalOp (see declaration below); called in
+// debug mode only.
+void DebugNormalOpInternal(const Tensor &a, TensorUseEnum a_use,
+                           const Tensor &b, TensorUseEnum b_use,
+                           const Tensor &c, TensorUseEnum c_use);
+
+
+
+/**
+   This convenience function is to be used in the implementation of
+   Ops (inside the Do() function).  In debug mode, it makes various
+   checks.  This is for use in "normal" ops, i.e. ops that operate on
+   the same data-types and on the same device.
+   This version is for use in Ops that operate on two tensors.
+
+      @param [in] a     The first Tensor the Op works on.
+      @param [in] a_use The use-type of Tensor a,
+                        saying what kind of operation we are
+                        doing on it: one of
+                         - kRead
+                         - kWrite
+                         - kReadWrite
+                         - kReadInvalidate
+                         - kInvalidate
+                        (the ones with Invalidate may be relatively
+                        rare; they are for Ops where we are avoiding
+                        some operation in the expectation that the data
+                        won't be used afterward).
+      @param [in] b     The second Tensor the Op works on
+      @param [in] b_use The use-type of Tensor b
+
+   Does nothing unless DebugMode() is true.
+*/
+inline void DebugNormalOp(const Tensor &a, TensorUseEnum a_use,
+                          const Tensor &b, TensorUseEnum b_use) {
+  if (DebugMode())
+    DebugNormalOpInternal(a, a_use, b, b_use);
+}
+
+
+
+/**
+   This convenience function is to be used in the implementation of
+   Ops (inside the Do() function).  In debug mode, it makes various
+   checks.  This is for use in "normal" ops, i.e. ops that operate on
+   the same data-types and on the same device.
+   This version is for use in Ops that operate on three tensors.
+
+      @param [in] a     The first Tensor the Op works on.
+      @param [in] a_use The use-type of Tensor a, saying what kind of
+                        operation we are doing on it: one of
+                         - kRead
+                         - kWrite
+                         - kReadWrite
+                         - kReadInvalidate
+                         - kInvalidate
+                        (the ones with Invalidate may be relatively
+                        rare; they are for Ops where we are avoiding
+                        some operation in the expectation that the data
+                        won't be used afterward).
+      @param [in] b     The second Tensor the Op works on
+      @param [in] b_use The use-type of Tensor b
+      @param [in] c     The third Tensor the Op works on
+      @param [in] c_use The use-type of Tensor c
+*/
+inline void DebugNormalOp(const Tensor &a, TensorUseEnum a_use,
+                          const Tensor &b, TensorUseEnum b_use,
+                          const Tensor &c, TensorUseEnum c_use) {
+  if (DebugMode())
+    DebugNormalOpInternal(a, a_use, b, b_use, c, c_use);
+}
+
+/**
+   Calling this ensures that when (in future) a Tensor's storage region is
+   allocated, it will be zeroed; it forwards to ZeroOnAllocation() on the
+   storage object of 'a'.  This won't have any effect if the storage region
+   was already allocated.  Note: storage regions are not allocated until they
+   are actually used (e.g. by calling GetData()), so for a fresh Tensor it will.
+ */
+inline void ZeroOnAllocation(const Tensor &a) {
+  a.impl_->storage->ZeroOnAllocation();
+}
+
+
 
 }  // namespace tensor
 }  // namespace kaldi
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index a99728ac624..7089ce6db90 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -313,19 +313,21 @@ class Tensor {
 
        @param [in] dims    The dimensions of the tensor, up to
                      KALDI_TENSOR_MAX_DIM positive integers.
-       @param [in] opts    Options regarding data-type and device;
-                           see examples below.
-    Example (note: the braces are braced-initializer-lists)
+       @param [in] opts  Options regarding data-type and device.
+
+    Example (note: the inner braces are braced-initializer-lists for
+    the object of type TensorOptions):
 <code>
-   Tensor a({3,4});
-   Tensor b({}, kDoubleDtype);
-   Tensor c({5,6,7}, kCpuDevice);
+   Tensor a({3,4}, {context});
+   Tensor b({}, {context, kDoubleDtype});
+   Tensor c({5,6,7}, {context, kCpuDevice});
+   Tensor d({1,2}, {context, kDoubleDtype, kCpuDevice});
    Tensor d({1,2}, {kDoubleDtype, kCpuDevice});
 </code>
   */
   inline Tensor(ArrayRef<int32> dims,
-                TensorOptions opts = TensorOptions()):
-      impl_(new TensorImpl(meta, opts)) { }
+                const TensorOptions &opts):
+      impl_(new TensorImpl(dims, opts)) { }
 
 
 
@@ -336,15 +338,6 @@ class Tensor {
 
        @param [in] meta  Struct containing the metadata specifying
                      the Tensor's pattern, data-type and device
-
-                     ;pattern  The dimension and stride information that
-                  this tensor should match (although we will fill gaps
-                  to make it contiguous)
-       @param [in] dtype   The data type to use
-       @param [in] device  The device to put the data on
-       @param [in] set_zero   If true, set the data to zero.  If false,
-                        the contents will be undefined.
-
   */
   Tensor(TensorMeta &meta, InitializePolicy p);
 

From 32101ba7bec09f5f038d262176e5e11d3bf7c793 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Mon, 10 Jun 2019 15:38:15 -0400
Subject: [PATCH 133/163] [src] Change name from kGpuDevice to kCudaDevice

---
 src/tensor/linear-ops.cc         | 2 +-
 src/tensor/linear-special-ops.cc | 2 +-
 src/tensor/linear-special-ops.h  | 6 +++---
 src/tensor/op.h                  | 6 +++---
 src/tensor/tensor-impl-linear.cc | 2 +-
 src/tensor/tensor-impl.h         | 4 ++--
 src/tensor/variable-functions.h  | 4 ++--
 src/tensor/variable-inplace.h    | 2 +-
 8 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/tensor/linear-ops.cc b/src/tensor/linear-ops.cc
index 0727864998f..456a268bc96 100644
--- a/src/tensor/linear-ops.cc
+++ b/src/tensor/linear-ops.cc
@@ -135,7 +135,7 @@ inline static void AddProductScalar3(
       AddProductScalar3Cpu(alpha, beta, a, b, c);
       return;
 #ifdef HAVE_CUDA
-    case kGpuDevice:
+    case kCudaDevice:
       AddProductScalar3Gpu(alpha, beta, a, b, c);
       return;
 #endif
diff --git a/src/tensor/linear-special-ops.cc b/src/tensor/linear-special-ops.cc
index ae4defeae44..06f9247acb5 100644
--- a/src/tensor/linear-special-ops.cc
+++ b/src/tensor/linear-special-ops.cc
@@ -65,7 +65,7 @@ inline static void AddProductScalar3(
       AddProductScalar3Cpu(alpha, beta, a, b, c);
       return;
 #ifdef HAVE_CUDA
-    case kGpuDevice:
+    case kCudaDevice:
       AddProductScalar3Gpu(alpha, beta, a, b, c);
       return;
 #endif
diff --git a/src/tensor/linear-special-ops.h b/src/tensor/linear-special-ops.h
index 3f463026607..a7323235f5e 100644
--- a/src/tensor/linear-special-ops.h
+++ b/src/tensor/linear-special-ops.h
@@ -37,7 +37,7 @@ namespace tensor {
    a and b may not point to the same data.
 
    Template parameter T is the datatype concerned (say, T = float)
-   D is the DeviceType enum, kCpuDevice or kGpuDevice.
+   D is the DeviceType enum, kCpuDevice or kCudaDevice.
 
    Will be specialized for CPU and GPU in linear-cpu-ops.h and linear-gpu-ops.h
 */
@@ -51,7 +51,7 @@ class ScalarPlusEqScalarOp;
    a and b may not overlap.
 
    Template parameter T is the datatype concerned (say, T = float)
-   D is the DeviceType enum, kCpuDevice or kGpuDevice.
+   D is the DeviceType enum, kCpuDevice or kCudaDevice.
 
    Will be specialized for CPU and GPU in linear-cpu-ops.h and linear-gpu-ops.h
 */
@@ -66,7 +66,7 @@ class StvectorPlusEqStvectorOp;
    May not be used if a and b overlap.
 
    Template parameter T is the datatype concerned (say, T = float)
-   D is the DeviceType enum, kCpuDevice or kGpuDevice.
+   D is the DeviceType enum, kCpuDevice or kCudaDevice.
 
    Will be specialized for CPU and GPU in linear-cpu-ops.h and linear-gpu-ops.h
 */
diff --git a/src/tensor/op.h b/src/tensor/op.h
index 50d9b861910..d43d47aa756 100644
--- a/src/tensor/op.h
+++ b/src/tensor/op.h
@@ -182,8 +182,8 @@ class Op {
    switch (device_type) {                                                 \
     case kCpuDevice:                                                      \
       pointer_name = new OpName<T, kCpuDevice>(__VA_ARGS__); break;       \
-    case kGpuDevice:                                                      \
-      pointer_name = new OpName<T, kGpuDevice>(__VA_ARGS__); break;       \
+    case kCudaDevice:                                                      \
+      pointer_name = new OpName<T, kCudaDevice>(__VA_ARGS__); break;       \
     default:                                                              \
     KALDI_ERR << "Invalid device type " << int32(device_type);            \
   }  while (0)
@@ -196,7 +196,7 @@ class Op {
    switch (device_type) {                                                 \
     case kCpuDevice:                                                      \
       pointer_name = new OpName<T, kCpuDevice>(__VA_ARGS__); break;       \
-    case kGpuDevice:                                                      \
+    case kCudaDevice:                                                      \
     KALDI_ERR << "You did not compile for CUDA, reconfigure with "        \
                  "CUDA support.";                                         \
     default:                                                              \
diff --git a/src/tensor/tensor-impl-linear.cc b/src/tensor/tensor-impl-linear.cc
index b75d62b6c41..87139271306 100644
--- a/src/tensor/tensor-impl-linear.cc
+++ b/src/tensor/tensor-impl-linear.cc
@@ -32,7 +32,7 @@ inline static void AddProductScalar3(
       AddProductScalar3Cpu(alpha, beta, a, b, c);
       return;
 #ifdef HAVE_CUDA
-    case kGpuDevice:
+    case kCudaDevice:
       AddProductScalar3Gpu(alpha, beta, a, b, c);
       return;
 #endif
diff --git a/src/tensor/tensor-impl.h b/src/tensor/tensor-impl.h
index b5b70352fb3..5501a7c9c4a 100644
--- a/src/tensor/tensor-impl.h
+++ b/src/tensor/tensor-impl.h
@@ -148,8 +148,8 @@ struct TensorImpl {
                           see examples below
 <code>
    TensorImpl *t = new TensorImpl({10,20}),
-       *u = new TensorImpl({9}, {kGpuDevice});
-       *v = new TensorImpl({9}, {kDoubleDtype, kGpuDevice});
+       *u = new TensorImpl({9}, {kCudaDevice});
+       *v = new TensorImpl({9}, {kDoubleDtype, kCudaDevice});
 </code>
   */
   TensorImpl(ArrayRef<int32> dims,
diff --git a/src/tensor/variable-functions.h b/src/tensor/variable-functions.h
index f775aa32531..91495c1c131 100644
--- a/src/tensor/variable-functions.h
+++ b/src/tensor/variable-functions.h
@@ -42,7 +42,7 @@ namespace tensor {
 <code>
    Variable scalar = Undefined({});
    Variable a = Undefined({3,4}, {kDoubleDtype});
-   Variable b = Undefined({1,100}, {kDoubleDtype, kGpuDevice});
+   Variable b = Undefined({1,100}, {kDoubleDtype, kCudaDevice});
 </code>
   Note on C++: reading the code above may require getting used to C++
   braced-initializer-lists.  The {3,4} is interpreted as a
@@ -65,7 +65,7 @@ Variable Undefined(ArrayRef<int32> dims,
 <code>
    Variable scalar = Zeros({});
    Variable a = Zeros({3,4}, {kDoubleDtype});
-   Variable b = Zeros({1,100}, {kDoubleDtype, kGpuDevice});
+   Variable b = Zeros({1,100}, {kDoubleDtype, kCudaDevice});
 </code>
   Note on C++: reading the code above may require getting used to C++
   braced-initializer-lists.  The {3,4} is interpreted as a
diff --git a/src/tensor/variable-inplace.h b/src/tensor/variable-inplace.h
index 8c6dd219d29..d5b4da0c9ca 100644
--- a/src/tensor/variable-inplace.h
+++ b/src/tensor/variable-inplace.h
@@ -63,7 +63,7 @@ void SetZero(Variable *v);
 <code>
    Variable scalar = Zeros({});
    Variable a = Zeros({3,4}, {kDoubleDtype});
-   Variable b = Zeros({1,100}, {kDoubleDtype, kGpuDevice});
+   Variable b = Zeros({1,100}, {kDoubleDtype, kCudaDevice});
 </code>
   Note on C++: reading the code above may require getting used to C++
   braced-initializer-lists.  The {3,4} is interpreted as a

From 33c36bbbf7c50dfe89a4e06e19c5297e6d38e052 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Fri, 14 Jun 2019 15:22:57 -0400
Subject: [PATCH 134/163] [src] More tensor progress

---
 src/tensor/cuda-utils.cc          | 328 +++++++++++++++++++++++++++
 src/tensor/cuda-utils.h           | 355 ++++++++++++++++++++++++++++++
 src/tensor/linear-cpu-ops.h       |  81 +++++--
 src/tensor/linear-cpu-ref-ops.h   | 168 ++++++++++++++
 src/tensor/linear-gpu-ops.h       | 303 +++++++++++++++++++++++++
 src/tensor/linear-gpu-ref-ops.h   | 174 +++++++++++++++
 src/tensor/linear-ops.cc          | 146 ++++++++++--
 src/tensor/linear-ops.h           |  70 +++---
 src/tensor/linear-ref-ops.h       |  87 --------
 src/tensor/linear-special-ops.h   |  15 ++
 src/tensor/memory-checker.h       |  28 ++-
 src/tensor/op.h                   |  44 +++-
 src/tensor/pattern-tuple-utils.cc |   4 +-
 src/tensor/pattern-tuple-utils.h  |  77 +++++--
 src/tensor/pattern-utils.h        |  29 +--
 src/tensor/tensor-common.h        |  19 +-
 src/tensor/tensor-utils.h         |   5 +-
 src/tensor/tensor.h               |   8 +
 18 files changed, 1736 insertions(+), 205 deletions(-)
 create mode 100644 src/tensor/cuda-utils.cc
 create mode 100644 src/tensor/cuda-utils.h
 create mode 100644 src/tensor/linear-cpu-ref-ops.h
 create mode 100644 src/tensor/linear-gpu-ops.h
 create mode 100644 src/tensor/linear-gpu-ref-ops.h
 delete mode 100644 src/tensor/linear-ref-ops.h

diff --git a/src/tensor/cuda-utils.cc b/src/tensor/cuda-utils.cc
new file mode 100644
index 00000000000..1f76597504e
--- /dev/null
+++ b/src/tensor/cuda-utils.cc
@@ -0,0 +1,328 @@
+// tensor/cuda-utils.cc
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tensor/cuda-utils.h"
+#include "base/kaldi-math.h"
+
+namespace kaldi {
+namespace tensor {
+
+#define KALDI_STANDARD_THREAD_BLOCK_SIZE 256
+#define KALDI_TARGET_NUM_THREAD_BLOCKS 1024
+
+
+/**
+   This function splits the kernel that's the last element of 'kernels' so that
+   it satisfies grid_dim.x <= 65535-- if necessary, by splitting it into
+   multiple kernels, increasing the length of the vector 'kernels'.
+ */
+static void SplitStandardKernelX(std::vector<StandardKernel> *kernels) {
+  int cur_grid_dim = kernels->back().grid_dim.x;
+  if (cur_grid_dim <= 65535)
+    return;
+  int num_kernels = (cur_grid_dim + 65534) / 65535;  // round up.
+
+  size_t cur_size = kernels->size(),
+      new_size = cur_size + num_kernels - 1;
+
+  std::vector<int> new_grid_dims(num_kernels,
+                                 cur_grid_dim / num_kernels);
+  for (int i = 0; i < cur_grid_dim % num_kernels; i++)
+    new_grid_dims[i]++;
+  // the above ensures that the sum of new_grid_dims equals
+  // cur_grid_dim; this is checked at the bottom of this function.
+  StandardKernel prev_kernel = kernels->back();
+  kernels->resize(new_size, prev_kernel);
+
+  int prev_grid_dim_sum = 0;
+  for (int i = 0; i < num_kernels; i++) {
+    StandardKernel &k = (*kernels)[cur_size - 1 + i];
+    int this_grid_dim = new_grid_dims[i];
+    k.grid_dim.x = this_grid_dim;
+    // Every kernel but the last covers a whole number of thread blocks, so
+    // its x limit never cuts a block short; only the last kernel can have a
+    // partially-used final thread block.
+    if (i + 1 < num_kernels) {
+      // the following actually has no effect on operation, it's more
+      // for clarity.
+      k.sizes.max_offset_a.x = this_grid_dim * k.sizes.block_stride_a.x;
+    } else {
+      // for last one, this limit does make a difference, as the
+      // highest-numbered thread block may not have all threads run.
+      k.sizes.max_offset_a.x -= prev_grid_dim_sum * k.sizes.block_stride_a.x;
+    }
+    // Shift both data pointers past the blocks given to earlier kernels.
+    k.base_offset_a += prev_grid_dim_sum * k.sizes.block_stride_a.x;
+    k.base_offset_b += prev_grid_dim_sum * k.sizes.block_stride_b.x;
+
+    prev_grid_dim_sum += this_grid_dim;
+  }
+  KALDI_ASSERT(prev_grid_dim_sum == cur_grid_dim);
+}
+
+static void SplitStandardKernelY(std::vector<StandardKernel> *kernels) {
+  // TODO.  Copy of the X one above.
+}
+
+static void GetStandardKernel1(const Pattern &a, const Pattern &b,
+                               std::vector<StandardKernel> *kernels) {
+  //  KALDI_PARANOID_ASSERT(a.num_axes == 1);
+
+  // Note: the following call will invoke the constructor of dim3 which
+  // sets all the values to 1, so we don't have to set the unused
+  // gridDim elements.
+  kernels->resize(kernels->size() + 1);
+  StandardKernel &k = kernels->back();
+  // Note: b.dims[0] is either 'dim' or 1; it won't affect anything, we only
+  // need b's stride.
+  int dim = a.dims[0],
+      a_stride = a.strides[0],
+      b_stride = b.strides[0];
+  int bs = KALDI_STANDARD_THREAD_BLOCK_SIZE;
+  int num_blocks = (dim + bs - 1) / bs;  // round up.
+
+  k.sizes.thread_stride_a.x = a_stride;
+  k.sizes.block_stride_a.x = a_stride * bs;
+
+  k.sizes.thread_stride_b.x = b_stride;
+  k.sizes.block_stride_b.x = b_stride * bs;
+
+  k.sizes.max_offset_a.x = dim * a_stride;
+
+  k.block_dim.x = std::min<int32>(bs, dim);
+  k.grid_dim.x = num_blocks;
+
+  if (num_blocks > 65535)
+    SplitStandardKernelX(kernels);
+}
+
+
+// Fills out the 'x' dimension of the standard kernel using raxis 0
+// of the patterns (which are assumed to have been sorted on the
+// stride of a, so that raxis 0 is the one with the smallest stride,
+// hopefully equal to 1)
+//
+// Does
+static void GetStandardKernelX(const Pattern &a, const Pattern &b,
+                               std::vector<StandardKernel> *kernels) {
+  //  KALDI_PARANOID_ASSERT(a.num_axes == 1);
+
+  // Note: the following call will invoke the constructor of dim3 which
+  // sets all the values to 1, so we don't have to set the unused
+  // gridDim elements.
+  kernels->resize(kernels->size() + 1);
+  StandardKernel &k = kernels->back();
+  // Note: b.dims[0] is either 'dim' or 1; it won't affect anything, we only
+  // need b's stride.
+  int dim = a.dims[0],
+      a_stride = a.strides[0],
+      b_stride = b.strides[0];
+  int bs = KALDI_STANDARD_THREAD_BLOCK_SIZE;
+  int num_blocks = (dim + bs - 1) / bs;  // round up.
+
+  k.sizes.thread_stride_a.x = a_stride;
+  k.sizes.block_stride_a.x = a_stride * bs;
+
+  k.sizes.thread_stride_b.x = b_stride;
+  k.sizes.block_stride_b.x = b_stride * bs;
+
+  k.sizes.max_offset_a.x = dim * a_stride;
+
+  k.block_dim.x = std::min<int32>(bs, dim);
+  k.grid_dim.x = num_blocks;
+
+  if (num_blocks > 65535)
+    SplitStandardKernelX(kernels);
+}
+
+
+
+static void GetStandardKernel2(const Pattern &a, const Pattern &b,
+                               std::vector<StandardKernel> *kernels) {
+  // Note: the following call will invoke the constructor of dim3 which
+  // sets all the values to 1, so we don't have to set the unused
+  // gridDim elements.
+  kernels->resize(kernels->size() + 1);
+
+  StandardKernel &k = kernels->back();
+  int dim0 = a.dims[0],
+      a_stride0 = a.strides[0],
+      b_stride0 = b.strides[0],
+      dim1 = a.dims[1],
+      a_stride1 = a.strides[1],
+      b_stride1 = b.strides[1];
+  // We expect the patterns will have been normalized prior to this
+  // call, which is why we don't expect zero strides for a.
+  // some of the code does assume this, so we check for it.
+  KALDI_PARANOID_ASSERT(a_stride0 != 0 && a_stride0 < a_stride1);
+
+  if (dim0 < 64) {
+    // dim0 is on the small side for a thread-block size, so we want the thread
+    // block size to include part of dim1.
+    int bs0 = dim0,
+        dim0_rounded_up = RoundUpToNearestPowerOfTwo(dim0),
+        bs1 = KALDI_STANDARD_THREAD_BLOCK_SIZE / dim0_rounded_up,
+        nb1 = (dim1 + bs1 - 1) / bs1;
+    k.block_dim.x = dim0;
+    k.grid_dim.x = 1;  // it had this value anyway; this is for clrity.
+    k.block_dim.y = bs1;
+    k.grid_dim.y = nb1;
+
+    k.sizes.max_offset_a.x = dim0 * a_stride0;
+    k.sizes.max_offset_a.y = dim1 * a_stride1;
+
+    k.sizes.thread_stride_a.x = a_stride0;
+    k.sizes.block_stride_a.x = a_stride0 * bs0;
+    k.sizes.thread_stride_a.y = a_stride1;
+    k.sizes.block_stride_a.y = a_stride1 * bs1;
+
+    k.sizes.thread_stride_b.x = b_stride0;
+    k.sizes.block_stride_b.x = b_stride0 * bs0;
+    k.sizes.thread_stride_b.y = b_stride1;
+    k.sizes.block_stride_b.y = b_stride1 * bs1;
+
+  } else {
+    int bs0 = std::min<int32>(dim0, KALDI_STANDARD_THREAD_BLOCK_SIZE),
+        nb0 = (dim0 + bs0 - 1) / bs0,
+        bs1 = 1,
+        nb1 = dim1;
+
+    k.block_dim.x = dim0;
+    k.grid_dim.x = 1;  // it had this value anyway; this is for clrity.
+    k.block_dim.y = bs1;
+    k.grid_dim.y = nb1;
+
+    k.sizes.max_offset_a.x = dim0 * a_stride0;
+    k.sizes.max_offset_a.y = dim1 * a_stride1;
+
+    if (nb0 > 65535)
+      SplitStandardKernelX(kernels);
+    else if (nb1 > 65535)
+      SplitStandardKernelY(kernels);
+    // we don't handle the case where they are both > 65535, because that, times
+    // the block size, would be more than the memory of any GPU, and would
+    // require code changes.
+
+
+  }
+
+
+    // everything goes in the x, and we rely on the loop limits to
+    //
+
+  }
+
+  if (dim0 * dim1 < 1024) {
+    // Do it in a single thread block.  There's no point wasting
+    // time figuring out more details.
+  } else if (dim0 > bs && dim1 * dim2 <= 16384) {
+    // 16384 is 4 * 4096, and 4096 is a kind of upper limit on
+    // how many threads we might expect to run at once.
+
+  }
+
+
+
+  KALDI_PARANOID_ASSERT(dim0 > 1 && dim1 > 1);
+  int bs = KALDI_STANDARD_THREAD_BLOCK_SIZE;
+  if (dim0 >= bs)
+  if (dim0 < bs) {
+    if (dim0 >= bs / 2) {
+      bs = dim0;
+    } else {
+      // This is a relatively complex case; the blocks can't just
+      // be on dim0, they have to also include dim1.  We
+      // would prefer to use an exact divisor of dim1, to avoid
+      // having to use 2 kernels.
+      int block_x = dim0,
+          block_y = -1;
+      float block_size_cost = 1.0e+10;
+      if (dim0 * dim1 <= 1024) {
+        block_y = dim1;
+      } else {
+        for (int this_block_y = 1;
+             this_block_y * block_x < 1024;
+             this_block_y++) {
+          if (dim1 % this_block_y == 0) {
+            int this_block_size = this_block_y * block_x;
+            float this_block_size_cost =  GetBlockSizeCost(this_block_size);
+            if (this_block_size_cost < block_size_cost) {
+              block_size_cost = this_block_size_cost;
+              block_y = this_block_y;
+            }
+          }
+        }
+      }
+      if (this_block_y == -1) {
+        block_y = KALDI_STANDARD_KERNEL1_BLOCK_SIZE / block_x;
+        // and we'll deal with the remainder via a second kernel.
+      }
+
+      }
+
+    }
+
+
+
+  }
+
+  int bs = KALDI_STANDARD_KERNEL1_BLOCK_SIZE;
+  int32 num_blocks = (dim + bs - 1) / bs;  // round up.
+  if (num_blocks > 1536)   // Don't want to have stragglers, so
+    num_blocks = 1024;     // only limit num_blocks to 1024 if
+                           // most will loop at least twice.
+  k.sizes.thread_stride_a.x = a_stride;
+  k.sizes.block_stride_a.x = a_stride * bs;
+  k.sizes.thread_stride_b.x = b_stride;
+  k.sizes.block_stride_b.x = b_stride * bs;
+  k.block_dim.x = bs;
+  k.grid_dim.x = num_blocks;
+  // We don't treat the case where dim < bs separately (e.g. setting
+  // k.block_dim.x = dim), I don't think it would make any real difference.
+}
+
+
+
+void GetStandardKernel(const Pattern &a, const Pattern &b,
+                       std::vector<StandardKernel> *kernels) {
+  // TODO: ensure that the 1st dim of a is the one with smallest stride.
+  KALDI_PARANOID_ASSERT(DimsGeq(a, b) && a.num_axes >= b.num_axes &&
+                        Broadcastable(a, b));
+  int32 num_axes = a.num_axes;
+  switch (num_axes) {
+    case 1:
+      GetStandardKernel1(a, b, kernels);
+      return;
+    case 2:
+      GetStandardKernel2(a, b, kernels);
+      return;
+    case 3:
+      GetStandardKernel3(a, b, kernels);
+      return;
+    default:
+      // Not implemented yet; fail loudly rather than appending no kernels.
+      KALDI_ERR << "Unsupported number of axes: " << num_axes;
+  }
+}
+
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
diff --git a/src/tensor/cuda-utils.h b/src/tensor/cuda-utils.h
new file mode 100644
index 00000000000..175a1570c31
--- /dev/null
+++ b/src/tensor/cuda-utils.h
@@ -0,0 +1,355 @@
+// tensor/cuda-utils.h
+
+//  Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_TENSOR_CUDA_UTILS_H_
+#define KALDI_TENSOR_TENSOR_CUDA_UTILS_H_ 1
+
+// Caution: don't include this header if we're not compiling with cuda.
+
+#include "tensor/tensor-common.h"
+#include <cuda_runtime_api.h>
+#include <limits>
+
+
+namespace kaldi {
+namespace tensor {
+
+
+/**
+   These utilities are mostly for use with non-reducing (but possibly
+   broadcasting) kernels.  The setup is: we have two Tensors a and b.  We are
+   doing some operation like, say, a = sigmoid(b) that's non-reducing (no
+   summation) but possibly broadcasting.
+
+   For generality and also (reasonable) speed, we have a standard pattern/interface
+   of kernel for such operations.
+
+
+  void _standard_kernel(StandardTwoArgKernelSizes f, float *a, float *b) {
+    int a_offset_x = f.thread_stride_a.x * threadIdx.x + f.block_stride_a.x * blockIdx.x,
+      a_offset_yz = f.thread_stride_a.y * threadIdx.y + f.block_stride_a.y * blockIdx.y +
+                    f.thread_stride_a.z * threadIdx.z + f.block_stride_a.z * blockIdx.z;
+    int b_offset_x = f.thread_stride_b.x * threadIdx.x + f.block_stride_b.x * blockIdx.x,
+      b_offset_yz = f.thread_stride_b.y * threadIdx.y + f.block_stride_b.y * blockIdx.y +
+                    f.thread_stride_b.z * threadIdx.z + f.block_stride_b.z * blockIdx.z;
+
+    for (; a_offset_x < f.max_offset_a.x;
+         a_offset_x += f.block_stride_a.x * gridDim.x,
+         b_offset_x += f.block_stride_b.x * gridDim.x)
+      a[a_offset_x + a_offset_yz] = some_func(b[b_offset_x + b_offset_yz]);
+  }
+
+  It's possible to encode a great variety of elementwise operations of up to 6
+  dimensions using the pattern above; the rare cases that can't be handled that
+  way can be handled using multiple invocations of the same kernel.
+
+  We don't make any special allowances for things like matrix transpose, though;
+  in future we may make a special variety of kernel that can handle transposes
+  while using coalesced memory access.
+
+  *Algorithm*.
+
+  We first ensure that the first raxis (raxis=0) of pattern a has the smallest
+  abs(stride).  This is necessary later in certain cases for the loop to work
+  correctly.
+
+  We try various algorithms for generating the kernel info; each one
+  returns a score, and we then select the one that gave the best score.
+
+
+
+
+  switch(num_axes) {
+    case 0:
+
+    case 1:
+      pretty easy.
+    case 2:
+      copy our matrix code.
+    case 3:
+
+
+
+  }
+
+  Simplest algorithm, applicable for up to 4 axes and if 1st axis is >= 256:
+
+
+
+
+  : 1st axis gets allocated to
+  block-dim x and spills over if necessary into grid-dim x.  Remaining dims go
+  into grid-dim x if not used, then grid-dims y and z.
+
+     Measure on:
+        coalesced memory access (no, should always have this).
+
+        - Loop length
+
+
+        number of blocks; want no more than about 1024
+        kernel size (prefer around 256; too small is much worse than
+         too large).
+
+
+
+  Next algorithm (only applicable if 1st dim is between 32 and 1024 and there are
+  >=2 dims, and the product of the remaining dims is >1024):
+
+  Make the loop be over one of those remaining dims.
+
+
+  We assume raxis 0 of a has stride 1, which it will if any dim had
+  stride 1.
+
+
+
+
+  The loop with `a_offset_x < f.max_offset_a` allows us to cover several elements with
+  one kernel (reducing kernel startup cost) and also makes it possible to fit
+
+
+
+
+  Depends on the dim...
+    Only one dim:
+       Type 1 kernel using a while loop and the if-statement,
+       with only the x dimension; thread block size = 128,
+       number of blocks no greater than 1024.
+
+
+   Two dims: # Note: we are assuming dim 0 is the one with stride=1 (if any).
+     Make sure that, for a, the first axis dominates the second.  (c.f.
+     axis-dominance property).
+
+     If first dim >= 64  # first dim alone will be the thread dim.
+       if (first_dim < 200) { // purposely between powers of 2.
+          threadDim.x = first_dim;
+          blockDim.x = 1
+          threadDim.y = 1
+          blockDim.y = second_dim;  # Use multiple kernels if limit of
+                                    # 65536 is an issue.
+       } else {
+         # assume 2nd dim becomes blockDim.y; work out the max num-blocks
+         # we might want of 1st dim.
+
+         # break up first_dim into blocks of 128;
+         # use the num-blocks given above if it's limiting,
+         # and loop for the rest.
+       }
+    else (first_dim < 64),
+       swap the x and y axes; use 256/first_dim to limit thread-block
+       size.
+
+
+    More than two dims (up to 5).
+       Sort dims from smallest to greatest stride.
+
+       First dim maps to 1st dimension.  If it is
+       <128, we'll have to augment the thread block size
+       with another dim.
+         - First choice: find a dim whose product
+           with the first dim is <1024, and if one
+           exists, take the closest one to 256.
+         - Second choice: take the next-smallest-stride
+           dim, choose 256/first_dim as the thread
+           block size, and put the rest of it in
+           the grid size. [would go to x, while the
+           1st choice goes to the y.]
+
+       Now iterate through the
+
+
+
+
+       If first dim is small, increase block size with another
+       dim.  Choose smallest remaining dims as blockDim.y and blockDim.z,
+       as long as num-blocks < 1024.
+
+
+       (Choose one that gives num-blocks between 128
+       and 1024 if already present; otherwise split one of the
+       dims and put it as y).
+
+       Put any remaining dims as gridDim.y and gridDim.z.
+       If this isn't enough, use multiple kernel launches
+       by (initially) iterating over the smallest dim.
+
+
+
+
+
+   Sometimes we can handle something by launching two type 1 kernels, or
+   a type 2 kernel
+
+   Type 2 kernels use the x, y and z dimensions of
+   grids and blocks
+
+
+   First: we define type 1 kernel as a non-reducing (but possibly broadcasting)
+   operation between two Tensors, e.g. a = b or a = sigmoid(b).  This is
+   a rather general type of kernel that can be used as the generic case
+   (applicable to arbitrary tensors).
+
+   The KernelInfo is the part that needs to be passed into the
+   kernel itself.  There are also two other things needed to launch the
+   kernel:
+<code>
+      dim3 grid_dim, block_dim;
+</code>
+
+   The basic operation we'll do in the kernel is something like this;
+   let 'a' and 'b' be pointers to float or something like that.  Let
+<code>
+    KernelInfo f;  // passed in.
+    int x_offset_a = f.thread_stride_a.x * threadIdx.x + f.block_stride_a.x * blockIdx.x;
+       y_offset_a = f.thread_stride_a.y * threadIdx.y + f.block_stride_a.y * blockIdx.y;
+       z_offset_a = f.thread_stride_a.z * threadIdx.z + f.block_stride_a.z * blockIdx.z;
+    // and similar statements to set x_offset_b, y_offset_b, z_offset_b.
+
+    if (x_offset_a < f.max_offset_a.x &&
+        y_offset_a < f.max_offset_a.y &&
+        z_offset_a < f.max_offset_a.z)
+      a[x_offset_a + y_offset_a + z_offset_a] =
+          b[x_offset_b + y_offset_b + z_offset_b];
+
+
+       thread_stride_a.y * threadIdx.y +
+       thread_stride_a.z * threadIdx.z +
+       block_stride_a.x * blockIdx.x
+    // clock speed e.g. 3 gHz.  Say 100 instructions.
+</code>
+ */
+
+
+struct StandardTwoArgKernelSizes {
+  dim3 thread_stride_a;
+  dim3 thread_stride_b;
+  dim3 block_stride_a;
+  dim3 block_stride_b;
+  dim3 max_offset_a;
+};
+
+
+
+
+struct StandardTwoArgKernel {
+  dim3 block_dim;
+  dim3 grid_dim;
+  StandardTwoArgKernelSizes sizes;  // passed into kernel.
+};
+
+
+/**
+   This function returns the dimensions/sizes for one or more "standard kernels"
+   to execute a "standard operation" on patterns a and b.  We define a
+   standard operation as an elementwise operation possibly with broadcasting,
+   of the form:
+       a[i] = f(b[i])
+
+   for some scalar function f, where i is an index-tuple.  a and b must be
+   broadcastable, and the dims of a must be >= the corresponding dims of b
+   (i.e.: no reduction).  We also require a.num_axes >= b.num_axes.
+   The standard kernel is as follows:
+<code>
+  void _standard_kernel(StandardTwoArgKernelSizes f, float *a, float *b) {
+    int a_offset_x = f.thread_stride_a.x * threadIdx.x + f.block_stride_a.x * blockIdx.x,
+      a_offset_y = f.thread_stride_a.y * threadIdx.y + f.block_stride_a.y * blockIdx.y,
+      a_offset_z = f.thread_stride_a.z * threadIdx.z + f.block_stride_a.z * blockIdx.z;
+    int b_offset = f.thread_stride_b.x * threadIdx.x + f.block_stride_b.x * blockIdx.x +
+                   f.thread_stride_b.y * threadIdx.y + f.block_stride_b.y * blockIdx.y +
+                   f.thread_stride_b.z * threadIdx.z + f.block_stride_b.z * blockIdx.z;
+
+    if (a_offset_x < f.max_offset_a.x && a_offset_y < f.max_offset_a.y)
+      a[a_offset_x + a_offset_y + a_offset_z] = some_func(b[b_offset]);
+  }
+</code>
+
+
+      @param [in] a   First pattern for which we want the kernel (or kernels)
+      @param [in] b   Second pattern for which we want the kernel (or kernels)
+      @param [out] kernels  The kernels are *appended to* this vector (this
+                      allows for recursive operation in this function).  Normally,
+                      we'll have `kernels->size() == 1` at exit.  The user is expected
+                      to call all of them (the order doesn't matter, and they
+                      don't have to be called in sequence).
+ */
+void GetStandardKernel(const Pattern &a, const Pattern &b,
+                       std::vector<StandardKernel> *kernels);
+
+
+
+/**
+   First: we define type 1 kernel as a non-reducing (but possibly broadcasting)
+   operation between two Tensors, e.g. a = b or a = sigmoid(b).  This is
+   a rather general type of kernel that can be used as the generic case
+   (applicable to arbitrary tensors).
+
+   The KernelInfo is the part that needs to be passed into the
+   kernel itself.  There are also two other things needed to launch the
+   kernel:
+<code>
+      dim3 grid_dim, block_dim;
+</code>
+
+   The basic operation we'll do in the kernel is something like this;
+   let 'a' and 'b' be pointers to float or something like that.  Let
+<code>
+    Type1KernelInfo f;  // passed in.
+    int x_offset_a = f.thread_stride_a.x * threadIdx.x + f.block_stride_a.x * blockIdx.x;
+       y_offset_a = f.thread_stride_a.y * threadIdx.y + f.block_stride_a.y * blockIdx.y;
+       z_offset_a = f.thread_stride_a.z * threadIdx.z + f.block_stride_a.z * blockIdx.z;
+    // and similar statements to set x_offset_b, y_offset_b, z_offset_b.
+
+    if (x_offset_a < f.max_offset_a.x &&
+        y_offset_a < f.max_offset_a.y)
+      a[x_offset_a + y_offset_a + z_offset_a] =
+          b[x_offset_b + y_offset_b + z_offset_b];
+    // clock speed e.g. 3 gHz.  Say 100 instructions.
+</code>
+ */
+
+struct StandardKernelSizes {
+  dim3 thread_stride_a;  // multiplied by threadIdx to get the offset into a.
+  dim3 thread_stride_b;  // multiplied by threadIdx to get the offset into b.
+  dim3 block_stride_a;   // multiplied by blockIdx to get the offset into a.
+  dim3 block_stride_b;   // multiplied by blockIdx to get the offset into b.
+  dim3 max_offset_a;     // exclusive upper limit on the offsets into a.
+};
+
+struct StandardKernel {
+  dim3 block_dim;
+  dim3 grid_dim;
+  StandardKernelSizes sizes;
+  // offset_a and offset_b are offsets that we have to add to the data-pointers
+  // of a and b before we call the kernel; these will normally be zero, but may
+  // be nonzero if we have to generate multiple kernels due to, say, size
+  // constraints.
+  int64 base_offset_a{0};
+  int64 base_offset_b{0};
+};
+
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_TENSOR_CUDA_UTILS_H_
diff --git a/src/tensor/linear-cpu-ops.h b/src/tensor/linear-cpu-ops.h
index 372bee0c5a4..b80a37c6d17 100644
--- a/src/tensor/linear-cpu-ops.h
+++ b/src/tensor/linear-cpu-ops.h
@@ -24,7 +24,6 @@
 #include "tensor/linear-special-ops.h"
 #include "matrix/kaldi-blas.h"
 
-
 // This Ops are more specialized forms of the Ops declared in linear-ops.h;
 // these correspond to more specific combinations of Tensor shapes.  These Ops
 // are only intended to be created from inside other more generic Ops.
@@ -81,11 +80,23 @@ class StvectorPlusEqStvectorOp<T, kCpuDevice>: public Op {
     int32 dim = a_pattern.dims[0],
         a_stride = a_pattern.strides[0],
         b_stride = b_pattern.strides[0];
-    T *a_data = a_.GetData<T>(),
+
+    bool uninitialized;
+    T *a_data = a_.GetData<T>(&uninitialized),
         *b_data = a_.GetData<T>();
-    // In future could look into unrolling this loop if it becomes a bottleneck.
-    for (int32 i = 0; i < dim; i++)
-      a_data[i * a_stride] += b_data[i * b_stride];
+    if (uninitialized) {
+      // This branch is an optimization to avoid writing, and reading, zeros
+      // to/from memory.
+      DebugNormalOp(a, kWrite, b_, kRead);
+      // In future could look into unrolling this loop if it becomes a bottleneck.
+      for (int32 i = 0; i < dim; i++)
+        a_data[i * a_stride] = b_data[i * b_stride];
+    } else {
+      DebugNormalOp(a, kReadWrite, b_, kRead);
+      // In future could look into unrolling this loop if it becomes a bottleneck.
+      for (int32 i = 0; i < dim; i++)
+        a_data[i * a_stride] += b_data[i * b_stride];
+    }
   }
   Tensor a_;
   Tensor b_;
@@ -101,12 +112,24 @@ class StvectorPlusEqStvectorOp<float, kCpuDevice>: public Op {
     return new SvectorPlusEqSvectorOp<float, kCpuDevice>(a_, b_);
   }
   void Do() {
-    DebugNormalOp(a, kReadWrite, b_, kRead);
     const Pattern &a_pattern = a_.Pattern(),
         &b_pattern = b_.Pattern();
-    cblas_saxpy(a_pattern.dims[0], 1.0,
-                b_.GetData<T>(), a_pattern.strides[0],
-                a_.GetData<T>(), b_pattern.strides[0]);
+    bool uninitialized;
+    float *a_data = a_.GetData<float>(&uninitialized),
+        *b_data = a_.GetData<float>();
+    if (uninitialized) {
+      // This branch is an optimization to avoid writing, and reading, zeros
+      // to/from memory.
+      DebugNormalOp(a, kWrite, b_, kRead);
+      cblas_scopy(a_pattern.dims[0], 1.0,
+                  b_.GetData<float>(), b_pattern.strides[0],
+                  a_.GetData<float>(), a_pattern.strides[0]);
+    } else {
+      DebugNormalOp(a, kReadWrite, b_, kRead);
+      cblas_saxpy(a_pattern.dims[0], 1.0,
+                  b_.GetData<float>(), b_pattern.strides[0],
+                  a_.GetData<float>(), a_pattern.strides[0]);
+    }
   }
   Tensor a_;
   Tensor b_;
@@ -121,12 +144,24 @@ class StvectorPlusEqStvectorOp<double, kCpuDevice>: public Op {
     return new SvectorPlusEqSvectorOp<double, kCpuDevice>(a_, b_);
   }
   void Do() {
-    DebugNormalOp(a, kReadWrite, b_, kRead);
     const Pattern &a_pattern = a_.Pattern(),
         &b_pattern = b_.Pattern();
-    cblas_daxpy(a_pattern.dims[0], 1.0,
-                b_.GetData<T>(), a_pattern.strides[0],
-                a_.GetData<T>(), b_pattern.strides[0]);
+    bool uninitialized;
+    double *a_data = a_.GetData<double>(&uninitialized),
+        *b_data = a_.GetData<double>();
+    if (uninitialized) {
+      // This branch is an optimization to avoid writing, and reading, zeros
+      // to/from memory.
+      DebugNormalOp(a, kWrite, b_, kRead);
+      cblas_dcopy(a_pattern.dims[0], 1.0,
+                  b_.GetData<double>(), b_pattern.strides[0],
+                  a_.GetData<double>(), a_pattern.strides[0]);
+    } else {
+      DebugNormalOp(a, kReadWrite, b_, kRead);
+      cblas_daxpy(a_pattern.dims[0], 1.0,
+                  b_.GetData<double>(), b_pattern.strides[0],
+                  a_.GetData<double>(), a_pattern.strides[0]);
+    }
   }
   Tensor a_;
   Tensor b_;
@@ -200,7 +235,7 @@ class ScalarPlusEqStvectorOp<float, kCpuDevice>: public Op {
 // Override for T = double
 template <>
 class ScalarPlusEqStvectorOp<double, kCpuDevice>: public Op {
-  StvectorPlusEqStvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
+  ScalarPlusEqStvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
 
   int32 Properties() { return kConcreteOp; }
 
@@ -235,17 +270,25 @@ class StvectorPlusEqScalarOp<T, kCpuDevice>: public Op {
   Op *Copy() { return new StvectorPlusEqScalarOp<T, kCpuDevice>(a_, b_); }
 
   void Do() {
-    DebugNormalOp(a, kReadWrite, b_, kRead);
     const Pattern &a_pattern = a_.Pattern(),
         &b_pattern = b_.Pattern();
     int32 dim = a_pattern.dims[0],
         a_stride = a_pattern.strides[0];
-    T *a_data = a_.GetData<T>(),
-        *b_data = a_.GetData<T>();
+    bool uninitialized;
+    T *a_data = a_.GetData<T>(&uninitialized),
+        *b_data = b_.GetData<T>();
 
-    T b = *b_data;
-    for (int32 i = 0; i < dim; i++)
-      a_data[i * a_stride] += b;
+    if (uninitialized) {
+      DebugNormalOp(a_, kWrite, b_, kRead);
+      T b = *b_data;
+      for (int32 i = 0; i < dim; i++)
+        a_data[i * a_stride] = b;
+    } else {
+      DebugNormalOp(a_, kReadWrite, b_, kRead);
+      T b = *b_data;
+      for (int32 i = 0; i < dim; i++)
+        a_data[i * a_stride] += b;
+    }
   }
   Tensor a_;
   Tensor b_;
diff --git a/src/tensor/linear-cpu-ref-ops.h b/src/tensor/linear-cpu-ref-ops.h
new file mode 100644
index 00000000000..f7d60329ba9
--- /dev/null
+++ b/src/tensor/linear-cpu-ref-ops.h
@@ -0,0 +1,168 @@
+// tensor/linear-cpu-ref-ops.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_LINEAR_REF_OPS_H_
+#define KALDI_TENSOR_LINEAR_REF_OPS_H_ 1
+
+#include "tensor/tensor.h"
+#include "tensor/op.h"
+
+
+// This header contains the "reference version" of linear Ops;
+// this is the very simple, not-efficient version that runs on
+// CPU when we run in "reference mode" (or when we encounter
+// some combination that can't be run using our normal BLAS-based
+// specialized Ops).
+namespace kaldi {
+namespace tensor {
+
+// Reference (unoptimized) Op corresponding to the command a += b.
+template <typename T>
+class PlusEqRefOp: public Op {
+  public:
+  PlusEqRefOp(const Tensor &a, const Tensor &b):
+      a_(a), b_(b) {
+    KALDI_ASSERT(!Overlap(a, b) && BroadcastableAndCompatible(a, b));
+  }
+
+  int32 Properties() { return kConcreteOp ; }
+
+  Op *Copy() const override {
+    return new PlusEqRefOp<T>(a_, b_);
+  }
+
+  void Do() const override {
+    RecordUse(a_, kReadWrite);
+    RecordUse(b_, kRead);
+    Do(a_.GetData<T>(), b_.GetData<T>(),
+       KALDI_TENSOR_MAX_DIM - 1);
+  }
+
+  private:
+  // Recursive worker: loops over axis 'raxis', recursing down to raxis 0 where
+  // the additions happen.  It is const so that the const Do() can call it.
+  void Do(T *a, const T *b, int32 raxis) const {
+    int32 dim = std::max<int32>(a_.dims[raxis], b_.dims[raxis]),
+        a_stride = a_.strides[raxis], b_stride = b_.strides[raxis];
+    if (raxis == 0) {
+      for (int32 i = 0; i < dim; i++)
+        a[i * a_stride] += b[i * b_stride];
+    } else {
+      for (int32 i = 0; i < dim; i++)
+        Do(a + i * a_stride, b + i * b_stride, raxis - 1);
+    }
+  }
+
+  Tensor a_;
+  Tensor b_;
+};
+
+
+// Reference (unoptimized) Op that sets all elements of a to zero.
+template <typename T>
+class SetZeroRefOp: public Op {
+  public:
+  SetZeroRefOp(const Tensor &a):
+      a_(a) { }
+
+  int32 Properties() { return kConcreteOp ; }
+
+  Op *Copy() const override {
+    return new SetZeroRefOp<T>(a_);
+  }
+
+  void Do() const override {
+    RecordUse(a_, kWrite);
+    Do(a_.GetData<T>(), KALDI_TENSOR_MAX_DIM - 1);
+  }
+
+  private:
+  // Recursive worker over axis 'raxis'; const so the const Do() can call it.
+  void Do(T *a, int32 raxis) const {
+    int32 dim = a_.dims[raxis],
+        a_stride = a_.strides[raxis];
+    if (raxis == 0) {
+      for (int32 i = 0; i < dim; i++)
+        a[i * a_stride] = 0;
+    } else {
+      for (int32 i = 0; i < dim; i++)
+        Do(a + i * a_stride, raxis - 1);
+    }
+  }
+  Tensor a_;
+};
+
+
+// Reference (unoptimized) Op corresponding to the command a = b.
+// T is the data-type of a, U is the data-type of b;
+// this Op supports type conversion.
+template <typename T, typename U>
+class AssignRefOp: public Op {
+  public:
+  AssignRefOp(const Tensor &a, const Tensor &b):
+      a_(a), b_(b) {
+    // The DimsGeq() makes sure there is no summation, as this version of the op
+    // does not support summation.
+    KALDI_ASSERT(!Overlap(a, b) && Compatible(a, b) &&
+                 Broadcastable(a, b) &&
+                 DimsGeq(a.Pattern(), b.Pattern()));
+  }
+
+  int32 Properties() { return kConcreteOp ; }
+
+  Op *Copy() const override {
+    return new AssignRefOp<T, U>(a_, b_);
+  }
+
+  void Do() const override {
+    RecordUse(a_, kWrite);
+    RecordUse(b_, kRead);
+    Do(a_.GetData<T>(), b_.GetData<U>(),
+       KALDI_TENSOR_MAX_DIM - 1);
+  }
+
+  private:
+  // Recursive worker over axis 'raxis'; const so the const Do() can call it.
+  void Do(T *a, const U *b, int32 raxis) const {
+    int32 dim = std::max<int32>(a_.dims[raxis], b_.dims[raxis]),
+        a_stride = a_.strides[raxis], b_stride = b_.strides[raxis];
+    if (raxis == 0) {
+      for (int32 i = 0; i < dim; i++)
+        a[i * a_stride] = static_cast<T>(b[i * b_stride]);
+    } else {
+      for (int32 i = 0; i < dim; i++)
+        Do(a + i * a_stride, b + i * b_stride, raxis - 1);
+    }
+  }
+
+  Tensor a_;
+  Tensor b_;
+};
+
+
+
+
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_LINEAR_REF_OPS_H_
diff --git a/src/tensor/linear-gpu-ops.h b/src/tensor/linear-gpu-ops.h
new file mode 100644
index 00000000000..b80a37c6d17
--- /dev/null
+++ b/src/tensor/linear-gpu-ops.h
@@ -0,0 +1,303 @@
+// tensor/linear-gpu-ops.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_LINEAR_GPU_OPS_H_
+#define KALDI_TENSOR_LINEAR_GPU_OPS_H_ 1
+
+#include "tensor/tensor.h"
+#include "tensor/linear-special-ops.h"
+#include "matrix/kaldi-blas.h"
+
+// These Ops are more specialized forms of the Ops declared in linear-ops.h;
+// these correspond to more specific combinations of Tensor shapes.  These Ops
+// are only intended to be created from inside other more generic Ops.
+namespace kaldi {
+namespace tensor {
+
+/**
+   Does a += b for a and b both scalar, on CPU.
+ */
+template <class T>
+class ScalarPlusEqScalarOp<T, kCpuDevice>: public Op {
+ public:
+  ScalarPlusEqScalarOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
+  int32 Properties() { return kConcreteOp; }
+  Op *Copy() const override {
+    return new ScalarPlusEqScalarOp<T, kCpuDevice>(a_, b_);
+  }
+
+  void Do() const override {
+    DebugNormalOp(a_, kReadWrite, b_, kRead);
+    *a_.GetData<T>() += *b_.GetData<T>();
+  }
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+
+/**
+   Does a += b for a and b both possibly-strided vectors (Stvector), on CPU.
+
+   They must be in normalized form, i.e. all axes trivial except raxis 0,
+   and they must have the same dimension.
+
+   This generic form of the template works for integer types (and would work,
+   if used, for float and double).  We will separately instantiate this
+   template for float and double, to use BLAS calls.
+*/
+template <class T>
+class StvectorPlusEqStvectorOp<T, kCpuDevice>: public Op {
+ public:
+  StvectorPlusEqStvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
+
+  int32 Properties() { return kConcreteOp; }
+  Op *Copy() const override {
+    return new StvectorPlusEqStvectorOp<T, kCpuDevice>(a_, b_);
+  }
+
+  void Do() const override {
+    // The access mode is recorded per branch, once we know if a_ holds data.
+    const Pattern &a_pattern = a_.Pattern(),
+        &b_pattern = b_.Pattern();
+    int32 dim = a_pattern.dims[0],
+        a_stride = a_pattern.strides[0],
+        b_stride = b_pattern.strides[0];
+
+    bool uninitialized;
+    T *a_data = a_.GetData<T>(&uninitialized);
+    const T *b_data = b_.GetData<T>();
+    if (uninitialized) {
+      // This branch is an optimization to avoid writing, and reading, zeros
+      // to/from memory: a is simply overwritten with b.
+      DebugNormalOp(a_, kWrite, b_, kRead);
+      // In future could look into unrolling this loop if it becomes a bottleneck.
+      for (int32 i = 0; i < dim; i++)
+        a_data[i * a_stride] = b_data[i * b_stride];
+    } else {
+      DebugNormalOp(a_, kReadWrite, b_, kRead);
+      // In future could look into unrolling this loop if it becomes a bottleneck.
+      for (int32 i = 0; i < dim; i++)
+        a_data[i * a_stride] += b_data[i * b_stride];
+    }
+  }
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+
+// Override for float that uses BLAS (scopy when a is uninitialized, else saxpy).
+template <>
+class StvectorPlusEqStvectorOp<float, kCpuDevice>: public Op {
+ public:
+  StvectorPlusEqStvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
+  int32 Properties() { return kConcreteOp; }
+  Op *Copy() const override {
+    return new StvectorPlusEqStvectorOp<float, kCpuDevice>(a_, b_);
+  }
+  void Do() const override {
+    const Pattern &a_pattern = a_.Pattern(),
+        &b_pattern = b_.Pattern();
+    bool uninitialized;
+    float *a_data = a_.GetData<float>(&uninitialized);
+    const float *b_data = b_.GetData<float>();
+    if (uninitialized) {
+      // This branch is an optimization to avoid writing, and reading, zeros
+      // to/from memory.  Note: the BLAS copy routine takes no scaling factor.
+      DebugNormalOp(a_, kWrite, b_, kRead);
+      cblas_scopy(a_pattern.dims[0], b_data, b_pattern.strides[0],
+                  a_data, a_pattern.strides[0]);
+    } else {
+      DebugNormalOp(a_, kReadWrite, b_, kRead);
+      cblas_saxpy(a_pattern.dims[0], 1.0f, b_data, b_pattern.strides[0],
+                  a_data, a_pattern.strides[0]);
+    }
+  }
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+// Override for double that uses BLAS (dcopy when a is uninitialized, else daxpy).
+template <>
+class StvectorPlusEqStvectorOp<double, kCpuDevice>: public Op {
+ public:
+  StvectorPlusEqStvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
+  int32 Properties() { return kConcreteOp; }
+  Op *Copy() const override {
+    return new StvectorPlusEqStvectorOp<double, kCpuDevice>(a_, b_);
+  }
+  void Do() const override {
+    const Pattern &a_pattern = a_.Pattern(),
+        &b_pattern = b_.Pattern();
+    bool uninitialized;
+    double *a_data = a_.GetData<double>(&uninitialized);
+    const double *b_data = b_.GetData<double>();
+    if (uninitialized) {
+      // This branch is an optimization to avoid writing, and reading, zeros
+      // to/from memory.  Note: the BLAS copy routine takes no scaling factor.
+      DebugNormalOp(a_, kWrite, b_, kRead);
+      cblas_dcopy(a_pattern.dims[0], b_data, b_pattern.strides[0],
+                  a_data, a_pattern.strides[0]);
+    } else {
+      DebugNormalOp(a_, kReadWrite, b_, kRead);
+      cblas_daxpy(a_pattern.dims[0], 1.0, b_data, b_pattern.strides[0],
+                  a_data, a_pattern.strides[0]);
+    }
+  }
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+
+/**
+   Does a += b for a scalar and b a vector or strided vector, on CPU.
+   (i.e. a += sum(b)).
+
+   They must be in normalized form, i.e. all axes trivial except raxis 0
+   of b, and b must not have negative stride.  (This is to allow
+   the BLAS template overrides).
+
+   This generic form of the template works for integer types (and would work,
+   if used, for float and double).  We will separately instantiate this
+   template for float and double.
+*/
+template <class T>
+class ScalarPlusEqStvectorOp<T, kCpuDevice>: public Op {
+ public:
+  ScalarPlusEqStvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
+
+  int32 Properties() { return kConcreteOp; }
+
+  Op *Copy() const override { return new ScalarPlusEqStvectorOp<T, kCpuDevice>(a_, b_); }
+
+  void Do() const override {
+    DebugNormalOp(a_, kReadWrite, b_, kRead);
+    const Pattern &b_pattern = b_.Pattern();
+    int32 dim = b_pattern.dims[0],
+        b_stride = b_pattern.strides[0];
+    T *a_data = a_.GetData<T>();
+    const T *b_data = b_.GetData<T>();
+    T sum(0);
+    // In future could look into unrolling this loop if it becomes a bottleneck.
+    for (int32 i = 0; i < dim; i++)
+      sum += b_data[i * b_stride];
+    *a_data += sum;
+  }
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+
+
+// Override for T = float.  BLAS sasum() returns the sum of *absolute* values,
+// which is not a += sum(b) when b has negative elements, so we sum directly.
+template <>
+class ScalarPlusEqStvectorOp<float, kCpuDevice>: public Op {
+ public:
+  ScalarPlusEqStvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
+  int32 Properties() { return kConcreteOp; }
+  Op *Copy() const override { return new ScalarPlusEqStvectorOp<float, kCpuDevice>(a_, b_); }
+  void Do() const override {
+    DebugNormalOp(a_, kReadWrite, b_, kRead);
+    const Pattern &b_pattern = b_.Pattern();
+    int32 dim = b_pattern.dims[0], b_stride = b_pattern.strides[0];
+    const float *b_data = b_.GetData<float>();
+    float sum = 0.0f;
+    for (int32 i = 0; i < dim; i++)
+      sum += b_data[i * b_stride];
+    *a_.GetData<float>() += sum;
+  }
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+// Override for T = double.  BLAS dasum() returns the sum of *absolute* values,
+// which is not a += sum(b) when b has negative elements, so we sum directly.
+template <>
+class ScalarPlusEqStvectorOp<double, kCpuDevice>: public Op {
+ public:
+  ScalarPlusEqStvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
+  int32 Properties() { return kConcreteOp; }
+  Op *Copy() const override { return new ScalarPlusEqStvectorOp<double, kCpuDevice>(a_, b_); }
+  void Do() const override {
+    DebugNormalOp(a_, kReadWrite, b_, kRead);
+    const Pattern &b_pattern = b_.Pattern();
+    int32 dim = b_pattern.dims[0], b_stride = b_pattern.strides[0];
+    const double *b_data = b_.GetData<double>();
+    double sum = 0.0;
+    for (int32 i = 0; i < dim; i++)
+      sum += b_data[i * b_stride];
+    *a_.GetData<double>() += sum;
+  }
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+/**
+   Operation doing a += b with a a vector and b a scalar.  (I.e. add
+   a constant elementwise to a vector).
+
+   May not be used if a and b overlap.
+*/
+template <class T>
+class StvectorPlusEqScalarOp<T, kCpuDevice>: public Op {
+ public:
+  StvectorPlusEqScalarOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
+  int32 Properties() { return kConcreteOp; }
+
+  Op *Copy() const override { return new StvectorPlusEqScalarOp<T, kCpuDevice>(a_, b_); }
+
+  void Do() const override {
+    const Pattern &a_pattern = a_.Pattern();
+    int32 dim = a_pattern.dims[0],
+        a_stride = a_pattern.strides[0];
+    bool uninitialized;
+    T *a_data = a_.GetData<T>(&uninitialized);
+    const T *b_data = b_.GetData<T>();
+
+    if (uninitialized) {
+      DebugNormalOp(a_, kWrite, b_, kRead);
+      T b = *b_data;  // a holds no data yet, so just fill it with b.
+      for (int32 i = 0; i < dim; i++)
+        a_data[i * a_stride] = b;
+    } else {
+      DebugNormalOp(a_, kReadWrite, b_, kRead);
+      T b = *b_data;
+      for (int32 i = 0; i < dim; i++)
+        a_data[i * a_stride] += b;
+    }
+  }
+ private:
+  Tensor a_;
+  Tensor b_;
+};
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR__LINEAR_OPS_H_
diff --git a/src/tensor/linear-gpu-ref-ops.h b/src/tensor/linear-gpu-ref-ops.h
new file mode 100644
index 00000000000..bf0ed1a1ae9
--- /dev/null
+++ b/src/tensor/linear-gpu-ref-ops.h
@@ -0,0 +1,174 @@
+// tensor/linear-gpu-ref-ops.h
+
+// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TENSOR_LINEAR_GPU_REF_OPS_H_
+#define KALDI_TENSOR_LINEAR_GPU_REF_OPS_H_ 1
+
+#include "tensor/tensor.h"
+#include "tensor/op.h"
+#include "tensor/linear-ops.h"
+#include "tensor/linear-special-ops.h"
+
+
+// This header contains the "reference version" of linear Ops;
+// this is the very simple, not-efficient version that runs on
+// CPU when we run in "reference mode" (or when we encounter
+// some combination that can't be run using our normal BLAS-based
+// specialized Ops).
+namespace kaldi {
+namespace tensor {
+
+
+
+
+
+// Reference implementation corresponding to the command a += b.
+template <typename T>
+class PlusEqRefOp: public Op {
+ public:
+  PlusEqRefOp(const Tensor &a, const Tensor &b): a_(a), b_(b) {
+    KALDI_ASSERT(!Overlap(a, b) && BroadcastableAndCompatible(a, b));
+  }
+
+  int32 Properties() { return kConcreteOp ; }
+
+  Op *Copy() const override {
+    return new PlusEqRefOp<T>(a_, b_);
+  }
+
+  void Do() const override {
+    RecordUse(a_, kReadWrite);
+    RecordUse(b_, kRead);
+    Do(a_.GetData<T>(), b_.GetData<T>(),
+       KALDI_TENSOR_MAX_DIM - 1);
+  }
+
+  private:
+  // Handles axis `raxis`, recursing down to raxis 0 where elements are added.
+  void Do(T *a, const T *b, int32 raxis) const {
+    int32 dim = std::max<int32>(a_.dims[raxis], b_.dims[raxis]),
+        a_stride = a_.strides[raxis], b_stride = b_.strides[raxis];
+    if (raxis == 0) {
+      for (int32 i = 0; i < dim; i++) {
+        a[i * a_stride] += b[i * b_stride];
+      }
+    } else {
+      for (int32 i = 0; i < dim; i++) {
+        Do(a + i * a_stride, b + i * b_stride, raxis - 1);
+      }
+    }
+  }
+
+  Tensor a_;
+  Tensor b_;
+};
+
+
+// Reference implementation that sets every element of a to zero.
+template <typename T>
+class SetZeroRefOp: public Op {
+ public:
+  SetZeroRefOp(const Tensor &a): a_(a) { }
+  int32 Properties() { return kConcreteOp ; }
+
+  Op *Copy() const override {
+    return new SetZeroRefOp<T>(a_);
+  }
+
+  void Do() const override {
+    RecordUse(a_, kWrite);
+    Do(a_.GetData<T>(), KALDI_TENSOR_MAX_DIM - 1);
+  }
+
+  private:
+  // Handles axis `raxis`, recursing down to raxis 0 where elements are zeroed.
+  void Do(T *a, int32 raxis) const {
+    int32 dim = a_.dims[raxis],
+        a_stride = a_.strides[raxis];
+    if (raxis == 0) {
+      for (int32 i = 0; i < dim; i++) {
+        a[i * a_stride] = 0;
+      }
+    } else {
+      for (int32 i = 0; i < dim; i++) {
+        Do(a + i * a_stride, raxis - 1);
+      }
+    }
+  }
+  Tensor a_;
+};
+
+
+// Reference implementation of a := b with element-wise type conversion.
+// T is the data-type of a, U is the data-type of b.
+template <typename T, typename U>
+class AssignRefOp: public Op {
+ public:
+  AssignRefOp(const Tensor &a, const Tensor &b): a_(a), b_(b) {
+    // DimsGeq() ensures there is no summation (not supported by this Op).
+    // NOTE(review): Compatible() may forbid T != U; confirm this is intended.
+    KALDI_ASSERT(!Overlap(a, b) && Compatible(a, b) &&
+                 Broadcastable(a, b) &&
+                 DimsGeq(a.Pattern(), b.Pattern()));
+  }
+
+  int32 Properties() { return kConcreteOp ; }
+
+  Op *Copy() const override {
+    return new AssignRefOp<T, U>(a_, b_);
+  }
+
+  void Do() const override {
+    RecordUse(a_, kWrite);
+    RecordUse(b_, kRead);
+    Do(a_.GetData<T>(), b_.GetData<U>(),
+       KALDI_TENSOR_MAX_DIM - 1);
+  }
+
+  private:
+  // Handles axis `raxis`, recursing down to raxis 0 where elements are copied.
+  void Do(T *a, const U *b, int32 raxis) const {
+    int32 dim = std::max<int32>(a_.dims[raxis], b_.dims[raxis]),
+        a_stride = a_.strides[raxis], b_stride = b_.strides[raxis];
+    if (raxis == 0) {
+      for (int32 i = 0; i < dim; i++) {
+        a[i * a_stride] = static_cast<T>(b[i * b_stride]);
+      }
+    } else {
+      for (int32 i = 0; i < dim; i++) {
+        Do(a + i * a_stride, b + i * b_stride, raxis - 1);
+      }
+    }
+  }
+
+  Tensor a_;
+  Tensor b_;
+};
+
+
+
+
+
+
+
+}  // namespace tensor
+}  // namespace kaldi
+
+
+#endif  // KALDI_TENSOR_LINEAR_REF_OPS_H_
diff --git a/src/tensor/linear-ops.cc b/src/tensor/linear-ops.cc
index 456a268bc96..db0fd32e95f 100644
--- a/src/tensor/linear-ops.cc
+++ b/src/tensor/linear-ops.cc
@@ -23,12 +23,14 @@ namespace kaldi {
 namespace tensor {
 
 void PlusEqOp::Expand(std::vector<std::unique_ptr<Op> > *ops) const {
-
+  Op *new_op;
   if (ReferenceMode() && a_.DeviceType() == kCpuDevice) {
-    // In reference mode, always use the reference implementation.
-    Op *ans;
-    SET_TO_TEMPLATED_CPU_OP_ALL(ans, a_.Dtype(), a_, b_);
-    return ans;
+    // In reference mode on CPU always use the reference implementation.
+    // Reference mode is only supported on CPU so we use the normal Ops
+    // on GPU.
+    SET_TO_TEMPLATED_CPU_OP_ALL(new_op, a_.Dtype(), PlusEqRefOp, a_, b_);
+    ops->push_back(new_op);
+    return;
   }
 
   // The generic implementation requires us to first normalize the patterns.
@@ -55,8 +57,6 @@ void PlusEqOp::Expand(std::vector<std::unique_ptr<Op> > *ops) const {
   int64 combined_code = CombineCodes(a_pattern.GetCode(),
                                      b_pattern.GetCode());
 
-  Op *new_op;
-
   /*
     The case-statement values in the switch statement below may be interpreted
     in groups of 3 hex characters, are 0xAAABBB, pertaining to Tensors a and b.
@@ -102,13 +102,12 @@ void PlusEqOp::Expand(std::vector<std::unique_ptr<Op> > *ops) const {
       // Create a temporary- a column vector, which is what we call
       // a vector whose nontrivial axis is raxis 1 instead of raxis 0.
       Tensor temp({num_rows, 1}, {a.Dtype(), a.Device()});
-      Op *temp_op;
       // Below we do temp += b.  We could use PlusEqOp for this and also for the
       // following reduction, but doing it this way avoids an unnecessary layer
       // of expansion.
-      SET_TO_TEMPLATED_OP_REAL(temp_op, a.Dtype(), a.DeviceType(),
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(),
                                ColVectorEqMatrixOp, temp, b);
-      ops->push_back(temp_op);
+      ops->push_back(new_op);
       // Normalize the temporary vector so its nontrivial axis is raxis 0, by
       // removing the current raxis 0 and having current raxis 1 shift down.
       Tensor temp_normalized = Squeeze(temp, 0);
@@ -126,25 +125,126 @@ void PlusEqOp::Expand(std::vector<std::unique_ptr<Op> > *ops) const {
 }
 
 
+void AssignOp::Expand() const {
+  Op *new_op;
+
+  if (a.Dtype() != b.Dtype()) {
+    if (a.Device() != b.Device()) {
+      KALDI_ERR << "Cross-device copying combined with type convesion not "
+          "supported yet.";
+      // Actually it would be easy to support just by creating a temporary
+      // (search above for `temp` for an example).
+    }
+
+
+  }
+  if (a.Device() != b.Device()) {
+    KALDI_ERR << "Cross-device copying not supported yet.";
+  }
+
+  if (ReferenceMode() && a_.DeviceType() == kCpuDevice) {
+    // In reference mode on CPU always use the reference implementation.
+    // Reference mode is only supported on CPU so we use the normal Ops
+    // on GPU.
+    SET_TO_TEMPLATED_CPU_OP_ALLPAIRS(new_op, a_.Dtype(), b.Dtype(),
+                                     AssignRefOp, a_, b_);
+    ops->push_back(new_op);
+    return;
+  }
+
+  // The generic implementation requires us to first normalize the patterns.
+  Pattern a_pattern = a_.Impl().pattern,
+      b_pattern = b_.Impl().pattern;
+  NormalizePatterns({a_pattern, b_pattern});
+
+  KALDI_ASSERT(Compatible(a_, b_));  // dtype and device, check they match.
+
+  Tensor a(a_), b(b_);
+
+  if (a_pattern != a_.Impl().pattern)
+    a = WithPattern(a, a_pattern);
+  if (b_pattern != b_.Impl().pattern)
+    b = WithPattern(b, b_pattern);
+
+  /*
+    The case-statement values in the switch statement below may be interpreted
+    in groups of 3 hex characters, are 0xAAABBB, pertaining to Tensors a and b
+    respectively.  See GetPatternCode() in pattern-utils.h for documentation on
+    the meanings of the values and our notation with X,x,1.
+
+  */
+  int64 combined_code = CombineCodes(a_pattern.GetCode(),
+                                     b_pattern.GetCode());
+
+  /*
+    The case-statement values in the switch statement below may be interpreted
+    in groups of 3 hex characters, are 0xAAABBB, pertaining to Tensors a and b.
+    See ComputePatternCode() in pattern-utils.h for documentation on the meanings of
+    the values and our notation with X,x,1.
+       Quick legend:
+             X means dim >1, stride = 1
+             x means dim >1, stride != 1
+             1 means dim == 1, stride = 0.
+                 (Note: the numbers in case-statements below exclude negative
+                 strides because bit 11 of the 12-bit chunks would be set if
+                 there were a negative stride).
+   */
+
+  // We are doing a += b.
+  switch(combined_code) {
+    // A scalar += scalar,
+    case 0x000000:   // () +=  ()
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(), ScalarPlusEqScalarOp, a, b);
+      break;
+    // We may split apart some of the following cases in future.
+    // They all represent, vector += vector.
+    case 0x101101:  //  (X) += (X)
+    case 0x001001:  //  (x) += (x)
+    case 0x101001:  //  (X) += (x)
+    case 0x001101:  //  (X) += (x)
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(), StvectorPlusEqStvectorOp, a, b);
+      break;
+    // Scalar += (sum of) vector or strided vector
+    case 0x000101:  //  () += (X)
+    case 0x000001:  //  () += (X)
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(), ScalarPlusEqStvectorOp, a, b);
+      break;
+    // vector or strided vector += scalar.
+    // We could later split apart the strided and non-strided cases.
+    case 0x101000:  //  (x) += ()
+    case 0x001000:  //  (X) += ()
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(), StvectorPlusEqScalarOp, a, b);
+      break;
+    // scalar += matrix
+    case 0x000103: { // () += (xX)
+      int32 num_rows = b.Pattern().dims[1];
+      // Create a temporary- a column vector, which is what we call
+      // a vector whose nontrivial axis is raxis 1 instead of raxis 0.
+      Tensor temp({num_rows, 1}, {a.Dtype(), a.Device()});
+      // Below we do temp += b.  We could use PlusEqOp for this and also for the
+      // following reduction, but doing it this way avoids an unnecessary layer
+      // of expansion.
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(),
+                               ColVectorEqMatrixOp, temp, b);
+      ops->push_back(new_op);
+      // Normalize the temporary vector so its nontrivial axis is raxis 0, by
+      // removing the current raxis 0 and having current raxis 1 shift down.
+      Tensor temp_normalized = Squeeze(temp, 0);
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(),
+                               ScalarPlusEqStvectorOp, a, temp_normalized);
+    }
+
 
-inline static void AddProductScalar3(
-    float alpha, float beta,
-    const TensorImpl &a, const TensorImpl &b, const TensorImpl *c) {
-  switch (a.device.device_type) {
-    case kCpuDevice:
-      AddProductScalar3Cpu(alpha, beta, a, b, c);
-      return;
-#ifdef HAVE_CUDA
-    case kCudaDevice:
-      AddProductScalar3Gpu(alpha, beta, a, b, c);
-      return;
-#endif
     default:
-      KALDI_ERR << "Unsupported device type " << a.ToString();
+      // Later we can add a more generic implementation that handles arbitrary
+      // patterns.
+      KALDI_ERR << "Unhandled code: " << std::hex << combined_code;
   }
+  ops->push_back(new_op);
 }
 
 
+
 void AddProduct(float alpha, float beta,
                 const TensorImpl &a, const TensorImpl &b, const TensorImpl *c){
 
diff --git a/src/tensor/linear-ops.h b/src/tensor/linear-ops.h
index b37f78487cb..f518671a083 100644
--- a/src/tensor/linear-ops.h
+++ b/src/tensor/linear-ops.h
@@ -44,7 +44,7 @@ class PlusEqOp: public Op {
                  BroadcastableAndCompatible(a, b));
   }
 
-  int32 Properties() { return kConcreteOp; }
+  int32 Properties() { return 0; }  // not concrete
 
   Op *Copy() const override {
     return new PlusEqOp(a_, b_);
@@ -97,34 +97,43 @@ class PlusEqOp: public Op {
        - For each index-tuple i in the index-tuple-set of b, b[i] += a[i].
    Must not be used if b and a overlap.
 
-   "Assign" means that this is the first time we are setting the memory
-   involved, except possibly for things that don't generate any derivative
-   for various reasons.
-
-   See also SetOp, which is for when the memory might previously have
-   been written to by something differentiable.]
-
-   Note: in the backprop for AssignOp, we can do Unset() after, which
-   means the memory concerned must no longer be read from.
+   While most Ops require the arguments to be "compatible", i.e. on the same
+   dtype and device, the Assign op does not require this.  (For the time being,
+   though, there may be limitations on what kinds of things you can do across
+   dtype and device, e.g. it may not support all the broadcasting and summation
+   operations that would normally be allowed).
 */
-class AssignOp {
+class AssignOp: public Op {
  public:
-
-  AssignOp(const Tensor &a, Tensor &b):
-      a_(a), b_(b) {
-    KALDI_ASSERT(!Overlap(a, b) &&
-                 BroadcastableAndCompatible(a, b));
-  }
-  AssignOp(const AssignOp &other):
-      a_(other.a_), b_(other.b_) { }
-
-  void Do() const override {
-    Set(a, &b);  // b := a
+  /**
+     If `zero_in_backprop` is true, then the backprop command for this operation
+     will zero the deriv w.r.t. b after that command.  (It would be safer to
+     set it by default to true, but this requires extra work).
+
+     Setting this to true should rarely be necessary-- only when we are
+     overwriting something that already had a derivative.  If you forget to set
+     this to true when you needed to, when you run in debug mode the
+     memory-checker code will tell you about the issue and crash.
+  */
+  AssignOp(const Tensor &a, const Tensor &b,
+           bool zero_in_backprop = false):
+      a_(a), b_(b), zero_in_backprop_(zero_in_backprop) {
+    // We don't require a and b to be compatible (same dtype and device),
+    // although other Ops do require this.
+    KALDI_ASSERT(!Overlap(a, b) && Broadcastable(a, b));
+  }
   Op *Copy() const override {
-    return new AssignOp(*this);
+    return new AssignOp(a_, b_, zero_in_backprop_);
   }
 
+  int32 Properties() { return 0; }  // not concrete
+
+  /**
+     Expand into concrete Ops, depending on the dimensions and device.
+  */
+  void Expand() const override;
+
+
   void GetBackwardDerivOps(
       DerivMap *map,
       std::vector<std::unique_ptr<Op> > *ops) const override {
@@ -134,7 +143,10 @@ class AssignOp {
     // Return the Op corresponding to:
     // a_deriv_ += b_deriv_.
     ops->push_back(std::unique_ptr<Op>(new PlusEqOp(map->Deriv(b_),
-                                                 AsTensor(a_deriv))));
+                                                    AsTensor(a_deriv))));
+
+    if (zero_in_backprop_)
+      ops->push_back(std::unique_ptr<Op>(new ZeroOp(map->Deriv(b_))));
   }
 
   void GetForwardDerivOps(
@@ -146,11 +158,15 @@ class AssignOp {
     // else return the Op corresponding to:
     // b_deriv_ := a_deriv_.
     ops->push_back(std::unique_ptr<Op>(new AssignOp(AsTensor(a_deriv),
-                                                  map->Deriv(b_))));
+                                                    map->Deriv(b_))));
   }
  private:
-   Tensor a_;
-   Tensor b_;
+  Tensor a_;
+  Tensor b_;
+  // If true, we'll zero the derivative w.r.t. b after doing the backprop to a.
+  // This allows correct backprop in certain cases where you overwrite data, but
+  // it's rarely necessary so we make it optional to avoid unnecessary zeroing.
+  bool zero_in_backprop_;
 };
 
 
diff --git a/src/tensor/linear-ref-ops.h b/src/tensor/linear-ref-ops.h
deleted file mode 100644
index f6e284b8222..00000000000
--- a/src/tensor/linear-ref-ops.h
+++ /dev/null
@@ -1,87 +0,0 @@
-// tensor/linear-ref-ops.h
-
-// Copyright      2019  Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_TENSOR_LINEAR_REF_OPS_H_
-#define KALDI_TENSOR_LINEAR_REF_OPS_H_ 1
-
-#include "tensor/tensor.h"
-#include "tensor/op.h"
-
-
-// This header contains the "reference version" of linear Ops;
-// this is the very simple, not-efficient version that runs on
-// CPU when we run in "reference mode" (or when we encounter
-// some combination that can't be run using our normal BLAS-based
-// speciailized Ops).
-namespace kaldi {
-namespace tensor {
-
-// Corresponds to the command a += b.
-
-template <typename Real>
-class PlusEqRefOp: public Op {
-  PlusEqRefOp(const Tensor &a, const Tensor &b):
-      a_(a), b_(b) {
-    KALDI_ASSERT(!Overlap(a, b) && Compatible(a, b));
-
-  }
-
-  int32 Properties() { return 0 ; }  // Not concrete.
-
-  Op *Copy() const override {
-    return new PlusEqRefOp(a, b_);
-  }
-
-  void Do() const override {
-    RecordUse(a_, kReadWrite);
-    RecordUse(b_, kRead);
-    Do(a_.GetData<Real>(), b_.GetData<Real>,
-       KALDI_TENSOR_MAX_DIM - 1);
-  }
-
-  private:
-
-  void Do(Real *a, Real *b, int32 raxis) {
-    int32 dim = std::max<int32>(a_.dims[raxis], b_.dims[raxis]),
-        a_stride = a_.strides[raxis], b_stride = b_.strides[raxis];
-    if (raxis == 0) {
-      for (int32 i = 0; i < dim; i++) {
-        a[i * a_stride] += b[i * b_stride];
-      }
-    } else {
-      for (int32 i = 0; i < dim; i++) {
-        Do(a + i * a_stride, b + i * b_stride, raxis - 1);
-      }
-    }
-  }
-
-  Tensor a_;
-  Tensor b_;
-};
-
-
-}
-}
-
-
-}  // namespace tensor
-}  // namespace kaldi
-
-
-#endif  // KALDI_TENSOR_LINEAR_REF_OPS_H_
diff --git a/src/tensor/linear-special-ops.h b/src/tensor/linear-special-ops.h
index a7323235f5e..d9ea6a18447 100644
--- a/src/tensor/linear-special-ops.h
+++ b/src/tensor/linear-special-ops.h
@@ -73,6 +73,21 @@ class StvectorPlusEqStvectorOp;
 template <class T, DeviceType D>
 class StvectorPlusEqScalarOp;
 
+
+/**
+   Operation doing a += b with a and b possibly-strided vectors.
+
+   a and b may not overlap.
+
+   Template parameter T is the datatype concerned (say, T = float)
+   D is the DeviceType enum, kCpuDevice or kCudaDevice.
+
+   Will be specialized for CPU and GPU in linear-cpu-ref-ops.h and linear-gpu-ref-ops.h
+*/
+template <class T, DeviceType D>
+class PlusEqRefOp;
+
+
 }  // namespace tensor
 }  // namespace kaldi
 
diff --git a/src/tensor/memory-checker.h b/src/tensor/memory-checker.h
index fb159da51bd..b8649137e02 100644
--- a/src/tensor/memory-checker.h
+++ b/src/tensor/memory-checker.h
@@ -307,6 +307,18 @@ class UninitializedDataChecker: public DataCheckerBase {
     RecordEvent(element_size, pattern);
   }
 
+  /**
+     This function checks that this memory area is currently uninitialized;
+     if any part of it was previously initialized, it will crash.
+
+        @param [in] element_size  The size of the element stored in the
+                  Tensor, e.g. 4 for float, 8 for double.
+        @param [in] pattern  The pattern which we are checking
+   */
+  inline void CheckUninitialized(int32 element_size,
+                                 const Pattern &pattern);
+
+
   /**
      This function is called when this memory area is being read from.
      It will (usually) crash if an element of this memory area has not been
@@ -442,6 +454,8 @@ class MemoryChecker {
          kReadWrite
          kWrite
          kCheckUninitialized
+         kReadAndInvalidate
+         kInvalidate
      From a user's perspective the only thing this function might do is crash--
      which it is designed to do if it detects various "disallowed" things.
   */
@@ -451,12 +465,19 @@ class MemoryChecker {
     KALDI_PARANOID_ASSERT(DebugMode());
     if (debug_tick_ != DebugTick())
         Initialise(false);  // false means: not a new region.
-    if (use_type == kRead || use_type == kReadWrite) {
+
+    if (use_type == kInitialize || use_type == kCheckUninitialized) {
+      if (uninitialized_checker_)
+        uninitialized_checker_->CheckUninitialized(element_size, pattern);
+    }
+    if (use_type == kRead || use_type == kReadWrite ||
+        use_type == kReadAndInvalidate) {
       invalidated_checker_->RecordRead(element_size, pattern);
       if (uninitialized_checker_)
         uninitialized_checker_->RecordRead(element_size, pattern);
     }
-    if (use_type == kWrite || use_type == kReadWrite) {
+    if (use_type == kWrite || use_type == kReadWrite ||
+        use_type == kInitialize) {
       // Important that this happens after checking the reads above.
       // uninitialized_checker_ would never find an error in RecordRead() if it
       // was done after the RecordWrite().
@@ -464,6 +485,9 @@ class MemoryChecker {
         uninitialized_checker_->RecordWrite(element_size, pattern);
       change_tracker_->RecordWrite(element_size,  pattern);
     }
+    if (use_type == kInvalidate || use_type == kReadAndInvalidate) {
+      RecordInvalidation(element_size, pattern);
+    }
   }
 
   /**
diff --git a/src/tensor/op.h b/src/tensor/op.h
index d43d47aa756..5dcc7321757 100644
--- a/src/tensor/op.h
+++ b/src/tensor/op.h
@@ -259,13 +259,15 @@ class Op {
   } while(0)
 // the while(0) is to allow a semicolon after the invocation.
 
+// The following is used when you know that you are only using CPU, particularly
+// for "reference implementations"
 #define SET_TO_TEMPLATED_CPU_OP_ALL(pointer_name, dtype, OpName, ...) \
     switch (dtype) {                                \
      case kFloatDtype:                              \
-       pointer_name = new OpName<float, kCpuDevice>(__VA_ARGS__); break;       \
+       pointer_name = new OpName<float>(__VA_ARGS__); break;       \
       break;                                        \
      case kDoubleDtype:                             \
-       pointer_name = new OpName<double, kCpuDevice>(__VA_ARGS__); break;       \
+       pointer_name = new OpName<double>(__VA_ARGS__); break;       \
       break;                                        \
     default:                                        \
       KALDI_ERR << "Invalid dtype (this op only allows float or double): " \
@@ -273,6 +275,44 @@ class Op {
   } while(0)
 // the while(0) is to allow a semicolon after the invocation.
 
+// The following is used when you know that you are only using CPU, particularly
+// for "reference implementations"; this version accepts two dtype arguments,
+// for SimpleAssignOp which supports type conversion and possibly broadcasting,
+// transpose etc., but not summation.
+#define SET_TO_TEMPLATED_CPU_OP_ALLPAIRS(pointer_name, dtype1, dtype2, OpName, ...) \
+  switch (static_cast<DataType>(int32(dtype1) + (int32(dtype2) << 4))) { \
+     case kFloatFloatDtype:                               \
+       pointer_name = new OpName<float, float>(__VA_ARGS__); break; \
+      break;                                         \
+     case kFloatDoubleDtype:                               \
+       pointer_name = new OpName<float, double>(__VA_ARGS__); break; \
+      break;                                         \
+     case kFloatInt32Dtype:                               \
+       pointer_name = new OpName<float, int32>(__VA_ARGS__); break; \
+      break;                                         \
+     case kDoubleFloatDtype:                               \
+       pointer_name = new OpName<double, float>(__VA_ARGS__); break; \
+      break;                                         \
+     case kDoubleDoubleDtype:                               \
+       pointer_name = new OpName<double, double>(__VA_ARGS__); break; \
+      break;                                         \
+     case kDoubleInt32Dtype:                               \
+       pointer_name = new OpName<double, int32>(__VA_ARGS__); break; \
+      break;                                         \
+     case kInt32FloatDtype:                               \
+       pointer_name = new OpName<int32, float>(__VA_ARGS__); break; \
+      break;                                         \
+     case kInt32DoubleDtype:                               \
+       pointer_name = new OpName<int32, double>(__VA_ARGS__); break; \
+      break;                                         \
+     case kInt32Int32Dtype:                               \
+       pointer_name = new OpName<int32, int32>(__VA_ARGS__); break; \
+      break;                                         \
+    default:                                        \
+      KALDI_ERR << "Invalid pair of dtypes in Assign Op: "       \
+             << int32(dtype1) << ", " << int32(dtype2);   \
+  } while(0)
+
 
 
 
diff --git a/src/tensor/pattern-tuple-utils.cc b/src/tensor/pattern-tuple-utils.cc
index 60d60d9939d..8e5942e127a 100644
--- a/src/tensor/pattern-tuple-utils.cc
+++ b/src/tensor/pattern-tuple-utils.cc
@@ -1146,8 +1146,8 @@ int64 PatternRebaser::ConvertMemoryIndex(int64 m) {
   m -= src_offset_;
   if (num_axes == 0)
     return m;
-  // We visit the compressed axes in order from greatest to least src_stride.
-  // What this loop does is to reverse engineer the indexes into (the compressed
+  // We visit the reduced axes in order from greatest to least src_stride.
+  // What this loop does is to reverse engineer the indexes into (the reduced
   // version of) src_pattern that we'd need to get memory-offset m.  The 'i'
   // values in the loop are those indexes.
   for (int32 raxis = num_axes - 1; raxis >= 0; raxis--) {
diff --git a/src/tensor/pattern-tuple-utils.h b/src/tensor/pattern-tuple-utils.h
index 1a9d9bc25df..aa733fe3567 100644
--- a/src/tensor/pattern-tuple-utils.h
+++ b/src/tensor/pattern-tuple-utils.h
@@ -189,7 +189,7 @@ bool PatternsEquivalent(const Pattern &pattern1,
                duplicates removed and listed in increasing order, each
                stride divides the next one in the list exactly; but this is
                not a necessary condition.   (The necessary condition
-               is that both patterns, when compressed and converted
+               is that both patterns, when reduced and converted
                to common strides, are "Regular" (c.f. "Regularity
                property" in glossary).
 */
@@ -224,7 +224,7 @@ bool ComputeIntersection(const Pattern &pattern1,
                duplicates removed and listed in increasing order, each
                stride divides the next one in the list exactly; but this is
                not a necessary condition.   (The necessary condition
-               is that both patterns, when compressed and converted
+               is that both patterns, when reduced and converted
                to common strides, are "Regular" (c.f. "Regularity
                property" in glossary).
 */
@@ -566,30 +566,30 @@ class PatternRebaser {
   // at all (this is an optimization).
   bool needs_conversion_;
 
-  // The 'offset' value of src_pattern_compressed (i.e. the src_pattern passed
-  // to the constructor, which has been jointly compressed and normalized with
+  // The 'offset' value of src_pattern_reduced (i.e. the src_pattern passed
+  // to the constructor, which has been jointly reduced and normalized with
   // dest_pattern (to make all src_strides positive).
   int64 src_offset_;
-  // The 'offset' value of dest_pattern_compressed
+  // The 'offset' value of dest_pattern_reduced
   int64 dest_offset_;
 
   // num_axes_ is the number of axes, not in the original src_pattern /
-  // dest_pattern but after the two patterns have been jointly compressed and
+  // dest_pattern but after the two patterns have been jointly reduced and
   // then sorted from smallest to greatest stride in src_pattern.
-  // src_strides_ are the resulting strides from src_pattern_compressed, and
-  // dest_strides_ are the resulting strides from dest_pattern_compressed.
+  // src_strides_ are the resulting strides from src_pattern_reduced, and
+  // dest_strides_ are the resulting strides from dest_pattern_reduced.
 
   // dest_pattern_ are the strides of the thus-modified src_pattern and
   // dest_pattern.  As an optimization, if src_strides and dest_strides end up
   // being the same, we set num_axes to zero and skip modifying the strides when
-  // CompressPattern() is called.
+  // ReducePattern() is called.
 
   // Note: all of src_strides_[0] .. src_strides_[num_axes_ - 1] will be greater
   // than zero.  We can guarantee this because src_pattern and dest_pattern as
   // passed to the constructor had the same dims, so any axes with dim=1 would
   // have had dim=1 for both src and dest, hence they would have been removed by
-  // CompressPatterns(), hence no strides would be zero after
-  // CompressPatterns(); and CompressPatterns() normalizes the signs of the
+  // ReducePatterns(), hence no strides would be zero after
+  // ReducePatterns(); and ReducePatterns() normalizes the signs of the
   // strides so the first one (i.e. src_pattern) has positive strides.
   int32 num_axes_;
   int32 src_strides_[KALDI_TENSOR_MAX_DIM];
@@ -666,17 +666,50 @@ class OutOfPlaceAxisSorter {
      @param [in,out]  The patterns whose axes are to be sorted.  The same
                      permutation will be applied to all the patterns.
  */
-void SortTupleAxes(ArrayRef<Pattern*> patterns);
+void SortPatternTupleAxes1(ArrayRef<Pattern*> patterns);
 
 /**
-   Compresses a Pattern-tuple by removing or combining as many axes as possible.
-   See the documentation for CompressOnePattern() in pattern-utils.h basic
-   concept of compressing a single Pattern to a pattern with possibly fewer axes
+   This function sorts the axes in 'patterns' (which must be a valid
+   pattern-tuple, see pattern.h for explanation) from least to
+   greatest abs(stride) in the first Pattern, using the abs(stride)
+   of the remaining patterns, lexicographically, to disambiguate
+   in case of ties in the 1st pattern.
+
+     @param [in,out]  The patterns whose axes are to be sorted.  The same
+                     permutation will be applied to all the patterns.
+ */
+void SortPatternTupleAxesSimple(ArrayRef<Pattern*> patterns);
+
+
+/**
+   TODO: remove this.
+
+   Sorts the axes of the pattern-tuple `patterns` in a way that we use for
+   elementwise operations writing to the first of the patterns.  Let pattern0 be
+   the first pattern in `patterns`.  This function requires that no axis in
+   pattern0 be a trivial axis (dim=1, stride=0); this is because we expect the
+   operation to be non-reducing and the pattern-tuple to be in reduced form
+   (c.f. ReducePatternTuple()).
+
+   The sorting of axes (expressed in the private numbering) is as follows:
+     - First the axis that has the smallest stride in pattern0.
+     - Then the remaining axes, in order from greatest to smallest
+       dim in pattern0, using the stride in pattern0 to disambiguate
+    Thus, it is the strides
+
+ */
+void SortPatternTupleAxesForCuda(ArrayRef<Pattern*> patterns);
+
+
+/**
+   Reduces a Pattern-tuple by removing or combining as many axes as possible.
+   See the documentation for ReduceOnePattern() in pattern-utils.h for the basic
+   concept of reducing a single Pattern to a pattern with possibly fewer axes
    (and maybe with negative strides converted to positive), which covers the
    same set of memory locations as the original Tensor.
 
-   The difference with just calling CompressOnePattern() several times is
-   that CompressPatterns() preserves the relationships between the tensors.
+   The difference with just calling ReduceOnePattern() several times is
+   that ReducePatterns() preserves the relationships between the tensors.
    In the language developed in pattern.h, this means the memory-index-tuple-set
    is preserved.
 
@@ -684,7 +717,7 @@ void SortTupleAxes(ArrayRef<Pattern*> patterns);
    the others may.
 
      @param [in,out] patterns   An nonempty array of the patterns
-                         to be jointly compressed.
+                         to be jointly reduced.
 
       @return  Returns true if it made any change to the patterns,
                false if they were unchanged.
@@ -702,9 +735,9 @@ void SortTupleAxes(ArrayRef<Pattern*> patterns);
  {{3,4},{4,1}}        {{1,1},{0,0}}      {{12},{1}}           {{1},{0}}    # combine
 \endverbatim
 
-   See also SortTupleAxes() and NormalizePatternTuple().
+   See also SortPatternTupleAxes() and NormalizePatternTuple().
  */
-bool CompressPatternTuple(ArrayRef<Pattern*> patterns);
+bool ReducePatternTuple(ArrayRef<Pattern*> patterns);
 
 
 /**
@@ -713,13 +746,13 @@ bool CompressPatternTuple(ArrayRef<Pattern*> patterns);
    i.e. the form produced by this function, which share the same
    memory-index-tuple-set but are not equal).
 
-   This just calls CompressPatternTuple() and then SortPatternTupleAxes().
+   This just calls ReducePatternTuple() and then SortPatternTupleAxes().
 
      @param [in,out] patterns.
 
 */
 inline bool NormalizePatternTuple(ArrayRef<Pattern*> patterns) {
-  CompressPatternTuple(patterns);
+  ReducePatternTuple(patterns);
   NormalizePatternTupleAxes(patterns);
 }
 
diff --git a/src/tensor/pattern-utils.h b/src/tensor/pattern-utils.h
index a1fc20f50fb..488f17a2891 100644
--- a/src/tensor/pattern-utils.h
+++ b/src/tensor/pattern-utils.h
@@ -392,12 +392,8 @@ inline void Squeeze(int32 axis, Pattern *p) {
        @param [in] b  The pattern of the second Tensor
        @param [in] b_not_smaller   If true, then we do not allow a dim of
                       b to be 1 while corresponding dim of a is >1.
-       @return  Returns true if a and b are broadcastable (with
-                an additional constraint that `a.dims[i] <= b.dims[i]` if
-                `b_not_smaller == true`.
  */
-bool Broadcastable(const Pattern &a, const Pattern &b,
-                   bool b_not_smaller = false);
+bool Broadcastable(const Pattern &a, const Pattern &b);
 
 
 /**  This function returns true if the dimensions of tensor patterns
@@ -409,19 +405,17 @@ bool Broadcastable(const Pattern &a, const Pattern &b,
        @param [in] a  The pattern of the first Tensor
        @param [in] b  The pattern of the second Tensor
        @param [in] c  The pattern of the third Tensor
-       @param [in] c_not_smaller   If true, then we do not allow a dim of
-                      c to be 1 while corresponding dims of a or b
-                      are > 1.
-       @return  Returns true if a, b and c are broadcastable (with
-                an additional constraint that
-                `max(a.dims[i], b.dims[i]) <= c.dims[i]` if
-                `c_not_smaller == true`).
-
+       @return  Returns true if a, b and c are broadcastable
  */
 bool Broadcastable(const Pattern &a, const Pattern &b,
-                   const Pattern &c, bool c_not_smaller = false);
+                   const Pattern &c);
 
 
+/**
+   Returns true if for each raxis, a.dims[raxis] >= b.dims[raxis].
+ */
+bool DimsGeq(const Pattern &a, const Pattern &b);
+
 
 /**
    Returns true if the shapes of a and b (see "Shape of a Pattern" in pattern.h)
@@ -530,6 +524,13 @@ void CompressOnePattern(Pattern *pattern);
 void SortAxes(Pattern *pattern);
 
 
+/**
+   Returns the raxis with the smallest value of abs(stride[raxis]),
+   taking the lowest-numbered raxis in case of ties (which could only
+   happen in the case of stride == 0).  Requires Valid(pattern).
+ */
+int32 RaxisWithSmallestAbsStride(const Pattern &pattern);
+
 // TODO: document this.
 inline void CanonicalizePattern(Pattern *pattern) {
   CompressOnePattern(pattern);
diff --git a/src/tensor/tensor-common.h b/src/tensor/tensor-common.h
index af18a23e34e..24252d61585 100644
--- a/src/tensor/tensor-common.h
+++ b/src/tensor/tensor-common.h
@@ -62,12 +62,20 @@ struct Device {
 enum DataType {
   // We will of course later extend this with many more types, including
   // integer types and half-precision floats.
-  kDefaultDtype = 0,
-  // kDefaultDtype means the type used when not specified; it's user definable
-  // via SetDefaultDtype.
   kFloatDtype = 1,
   kDoubleDtype = 2,
   kInt32Dtype = 3,
+  // Members for case statements over a pair of dtypes (dtype1, dtype2), using
+  // pair_dtype = static_cast<DataType>(int32(dtype1) + (int32(dtype2) << 4));
+  kFloatFloatDtype = 0x11,
+  kFloatDoubleDtype = 0x12,
+  kFloatInt32Dtype = 0x13,
+  kDoubleFloatDtype = 0x21,
+  kDoubleDoubleDtype = 0x22,
+  kDoubleInt32Dtype = 0x23,
+  kInt32FloatDtype = 0x31,
+  kInt32DoubleDtype = 0x32,
+  kInt32Int32Dtype = 0x33
 };
 
 
@@ -135,7 +143,10 @@ enum TensorUseEnum {
   kRead,
   kReadWrite,
   kWrite,
-  kCheckUninitialized
+  kCheckUninitialized,
+  kInitialize,
+  kReadAndInvalidate,
+  kInvalidate
 };
 
 
diff --git a/src/tensor/tensor-utils.h b/src/tensor/tensor-utils.h
index 4bb798fbc06..6ba5956201b 100644
--- a/src/tensor/tensor-utils.h
+++ b/src/tensor/tensor-utils.h
@@ -51,10 +51,9 @@ inline bool Broadcastable(const Tensor &a, const Tensor &b) {
   and device and are broadcastable; equivalent to
   `Broadcastable(a, b) && Compatible(a, b)`.
 */
-inline bool BroadcastableAndCompatible(const Tensor &a, const Tensor &b,
-                                       b_non_reducing = false) {
+inline bool BroadcastableAndCompatible(const Tensor &a, const Tensor &b) {
   return Compatible(*a.impl_, *b.impl_) &&
-      Broadcastable(*a.impl_, *b.impl_, b_non_reducing);
+      Broadcastable(*a.impl_, *b.impl_);
 }
 
 
diff --git a/src/tensor/tensor.h b/src/tensor/tensor.h
index 7089ce6db90..49e4bb1645f 100644
--- a/src/tensor/tensor.h
+++ b/src/tensor/tensor.h
@@ -376,6 +376,14 @@ class Tensor {
   TensorImpl *CopyImpl();
 
 
+  /**
+     Returns the data pointer cast to type T, with the offset from the pattern
+     included.  Calling this will force allocation of the storage region if it
+     was not already allocated.
+     TODO: document what *was_uninitialized is set to.
+  */
+  template <class T> T* GetData(bool *was_uninitialized) const;
+
   /**
      Returns the data pointer cast to type T, with the offset from
      the pattern included.  Calling this will force allocation of

From b247f308c510d14827df04e20b78ca11b8078104 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sat, 15 Jun 2019 14:42:12 -0400
Subject: [PATCH 135/163] [src] Progress on standard cuda kernels for tensor
 directory

---
 src/tensor/cuda-utils.cc    | 572 ++++++++++++++++++++++--------------
 src/tensor/cuda-utils.h     | 169 +++++++----
 src/tensor/pattern-utils.cc |  36 ++-
 src/tensor/pattern-utils.h  |  18 +-
 src/tensor/pattern.h        |  24 ++
 5 files changed, 536 insertions(+), 283 deletions(-)

diff --git a/src/tensor/cuda-utils.cc b/src/tensor/cuda-utils.cc
index 1f76597504e..766dc9154ee 100644
--- a/src/tensor/cuda-utils.cc
+++ b/src/tensor/cuda-utils.cc
@@ -31,10 +31,16 @@ namespace tensor {
    This function splits the kernel that's the last element of 'kernels' so that
    it satisifes grid_dim.x <= 65535-- if necessary, by splitting it into
    multiple kernels, increasing the length of the vector 'kernels'.
+       @param [in] kernel  The input kernel that needs to be split;
+                           must satisfy kernel.grid_dim.x > 65535.
+       @param [out] kernels  The split copies of the input kernel will be
+                          *appended* to the vector `kernels`.
+
  */
-static void SplitStandardKernelX(std::vector<StandardKernel> *kernels) {
+static void SplitStandardKernelX(const StandardThreeArgKernel &kernel,
+                                 std::vector<StandardThreeArgKernel> *kernels) {
   int cur_grid_dim = kernels->back().grid_dim.x;
-  if (cur_grid_dim <= 65535)
+  KALDI_ASSERT(cur_grid_dim > 65535);
     return;
   int num_kernels = (kernels->back().grid_dim.x + 65534) / 65535;
 
@@ -43,284 +49,420 @@ static void SplitStandardKernelX(std::vector<StandardKernel> *kernels) {
 
   std::vector<int> new_grid_dims(num_kernels,
                                  cur_grid_dim / num_kernels);
+  // the next loop ensures that the sum of new_grid_dims equals cur_grid_dim,
+  // correcting for the rounding down.  this will be checked at the bottom of
+  // this function.
   for (int i = 0; i < cur_grid_dim % num_kernels; i++)
     new_grid_dims[i]++;
-  // the above ensures that the sum of new_grid_dims equals
-  // cur_grid_dim; this is checked at the bottom of this function.
-  StandardKernel prev_kernel = kernels->back();
-  kernels->resize(new_size, prev_kernel);
+
 
   int prev_grid_dim_sum = 0;
   for (int i = 0; i < num_kernels; i++) {
-    StandardKernel &k = (*kernels)[cur_size - 1 + i];
+    kernels->push_back(kernel);
+    StandardKernel &new_kernel = kernels->back();
     int this_grid_dim = new_grid_dims[i];
 
-    k.dim_grid.x = new_grid_dims[i];
-    // If this is not the last i value (the last kernel), we can
-    // leave k.sizes.max_offset_a.x as it is because we have
-    // a 'whole number' of
-
+    new_kernel.dim_grid.x = this_grid_dim;
     if (i + 1 < num_kernels) {
-      // the following actually has no effect on operation, it's more
-      // for clarity.
+      // the following actually has no effect on operation since all
+      // threads will run; it's more for clarity.
       k.sizes.max_offset_a.x = this_grid_dim * k.sizes.block_stride_a.x;
     } else {
-      // for last one, this limit does make a difference, as the
-      // highest-numbered thread block may not have all threads run.
+      // for the last kernel, this limit might actually make a difference, as
+      // the highest-numbered thread block in the last kernel may not have all
+      // threads run.
       k.sizes.max_offset_a.x -= prev_grid_dim_sum * k.sizes.block_stride_a.x;
     }
     k.base_offset_a += prev_grid_dim_sum * k.sizes.block_stride_a.x;
+    k.base_offset_b += prev_grid_dim_sum * k.sizes.block_stride_b.x;
+    k.base_offset_c += prev_grid_dim_sum * k.sizes.block_stride_c.x;
 
     prev_grid_dim_sum += this_grid_dim;
   }
   KALDI_ASSERT(prev_grid_dim_sum == cur_grid_dim);
 }
 
-static void SplitStandardKernelY(std::vector<StandardKernel> *kernels) {
-  // TODO.  Copy of the X one above.
-}
-
-static void GetStandardKernel1(const Pattern &a, const Pattern &b,
-                               std::vector<StandardKernel> *kernels) {
-  //  KALDI_PARANOID_ASSERT(a.num_axes == 1);
+// Y-dimension analogue of SplitStandardKernelX above: splits 'kernel' so that
+// each appended copy has grid_dim.y <= 65535.  See SplitStandardKernelX.
+static void SplitStandardKernelY(const StandardThreeArgKernel &kernel,
+                                 std::vector<StandardThreeArgKernel> *kernels) {
+  // The kernel being split is the input 'kernel'; 'kernels' is output-only.
+  int cur_grid_dim = kernel.grid_dim.y;
+  KALDI_ASSERT(cur_grid_dim > 65535);
+  int num_kernels = (cur_grid_dim + 65534) / 65535;
 
-  // Note: the following call will invoke the constructor of dim3 which
-  // sets all the values to 1, so we don't have to set the unused
-  // gridDim elements.
-  kernels->resize(kernels->size() + 1);
-  StandardKernel &k = kernels->back();
-  // Note: b.dims[0] is either 'dim' or 1; it won't affect anything, we only
-  // need b's stride.
-  int dim = a.dims[0],
-      a_stride = a.strides[0],
-      b_stride = b.strides[0];
-  int bs = KALDI_STANDARD_THREAD_BLOCK_SIZE;
-  int num_blocks = (dim + bs - 1) / bs;  // round up.
+  // Reserve space for the num_kernels copies that the loop below appends.
+  kernels->reserve(kernels->size() + num_kernels);
 
-  k.sizes.thread_stride_a.x = a_stride;
-  k.sizes.block_stride_a.x = a_stride * bs;
+  std::vector<int> new_grid_dims(num_kernels,
+                                 cur_grid_dim / num_kernels);
+  // the next loop ensures that the sum of new_grid_dims equals cur_grid_dim,
+  // correcting for the rounding down.  this will be checked at the bottom of
+  // this function.
+  for (int i = 0; i < cur_grid_dim % num_kernels; i++)
+    new_grid_dims[i]++;
 
-  k.sizes.thread_stride_b.x = b_stride;
-  k.sizes.block_stride_b.x = b_stride * bs;
 
-  k.sizes.max_offset_a.x = dim * a_stride;
+  int prev_grid_dim_sum = 0;
+  for (int i = 0; i < num_kernels; i++) {
+    kernels->push_back(kernel);
+    StandardThreeArgKernel &k = kernels->back();
+    int this_grid_dim = new_grid_dims[i];
 
-  k.block_dim.x = std::min<int32>(bs, dim);
-  k.grid_dim.x = num_blocks;
+    k.grid_dim.y = this_grid_dim;
+    if (i + 1 < num_kernels) {
+      // the following actually has no effect on operation since all
+      // threads will run; it's more for clarity.
+      k.sizes.max_offset_a.y = this_grid_dim * k.sizes.block_stride_a.y;
+    } else {
+      // for the last kernel, this limit might actually make a difference, as
+      // the highest-numbered thread block in the last kernel may not have all
+      // threads run.
+      k.sizes.max_offset_a.y -= prev_grid_dim_sum * k.sizes.block_stride_a.y;
+    }
+    k.base_offset_a += prev_grid_dim_sum * k.sizes.block_stride_a.y;
+    k.base_offset_b += prev_grid_dim_sum * k.sizes.block_stride_b.y;
+    k.base_offset_c += prev_grid_dim_sum * k.sizes.block_stride_c.y;
 
-  if (num_blocks > 65535)
-    SplitStandardKernelX(kernels);
+    prev_grid_dim_sum += this_grid_dim;
+  }
+  KALDI_ASSERT(prev_grid_dim_sum == cur_grid_dim);
 }
 
 
-// Fills out the 'x' dimension of the standard kernel using raxis 0
-// of the patterns (which are assumed to have been sorted on the
-// stride of a, so that raxis 0 is the one with the smallest stride,
-// hopefully equal to 1)
-//
-// Does
-static void GetStandardKernelX(const Pattern &a, const Pattern &b,
-                               std::vector<StandardKernel> *kernels) {
-  //  KALDI_PARANOID_ASSERT(a.num_axes == 1);
-
-  // Note: the following call will invoke the constructor of dim3 which
-  // sets all the values to 1, so we don't have to set the unused
-  // gridDim elements.
-  kernels->resize(kernels->size() + 1);
-  StandardKernel &k = kernels->back();
+/**
+   This function is used to handle cases where we still have more than 3 axes
+   (should be very rare since we only use the standard kernel on reduced
+   pattern-tuples).  It creates copies of a kernel that differ only
+   in base_offset_a, base_offset_b, base_offset_c, to take account of
+   an raxis that has not been included in the kernel.
+
+     @param [in] a      The first Pattern that's an arg to the kernel
+     @param [in] b      The second Pattern that's an arg to the kernel
+     @param [in] c      The third Pattern that's an arg to the kernel
+     @param [in] raxis  The raxis that we're splitting on; in place of the
+                        single input 'kernel' we will have a separate
+                        output for each i in [0, a.dims[raxis] - 1]
+     @param [in] kernel    The original kernel that we are going to
+                        expand.  Assumed to correspond to an index
+                        value of 0 on raxis 'raxis'.
+     @param [out] kernels  The output kernels are *appended* to this
+                        vector.  The number of output kernels will
+                        be a.dims[raxis].
+ */
+static void SplitStandardKernelByAxis(
+    const Pattern &a,
+    const Pattern &b,
+    const Pattern &c,
+    int32 raxis,
+    const StandardThreeArgKernel &kernel,
+    std::vector<StandardThreeArgKernel> *kernels) {
+  // Asserting raxis > 0 is just from knowledge of how the calling code works,
+  // it is not something that would affect the operation of this function.
+  KALDI_ASSERT(raxis > 0 && raxis < a.num_axes);
+  int32 dim = a.dims[raxis];
+  for (int32 i = 0; i < dim; i++) {
+    kernels->push_back(kernel);
+    StandardThreeArgKernel &k = kernels->back();
+    k.base_offset_a += i * a.strides[raxis];
+    k.base_offset_b += i * b.strides[raxis];
+    k.base_offset_c += i * c.strides[raxis];
+  }
+}
+
+// Fills out the 'x' dimension of the standard kernel, which is assumed to have
+// immediately before been initialized with its default constructor.
+
+// The 'x' dimension is filled out using raxis=0, which is required to be the
+// lowest abs(stride) in 'a' and have stride != 0; most of the time, this stride
+// will be 1.  We preferentially make the thread block vary along this axis,
+// which will increase the chance of coalesced memory accesses.  (We could,
+// of course, take much more care to ensure memory accesses are coalesced,
+// taking into account the patterns of b and c and taking into account whether
+// the start of the tensor is on a 128-byte boundary; we can consider these
+// kinds of optimizations in future).
+static void ProcessStandardKernelX(const Pattern &a,
+                                   const Pattern &b,
+                                   const Pattern &c,
+                                   StandardThreeArgKernel *k) {
+  KALDI_PARANOID_ASSERT(a.num_axes >= 1 && a.dims[0] > 1);
   // Note: b.dims[0] is either 'dim' or 1; it won't affect anything, we only
   // need b's stride.
   int dim = a.dims[0],
       a_stride = a.strides[0],
-      b_stride = b.strides[0];
-  int bs = KALDI_STANDARD_THREAD_BLOCK_SIZE;
-  int num_blocks = (dim + bs - 1) / bs;  // round up.
-
-  k.sizes.thread_stride_a.x = a_stride;
-  k.sizes.block_stride_a.x = a_stride * bs;
+      b_stride = b.strides[0],
+      c_stride = c.strides[0];
 
-  k.sizes.thread_stride_b.x = b_stride;
-  k.sizes.block_stride_b.x = b_stride * bs;
+  // bs is the thread-block size (at least, as far as the x dimension is
+  // concerned).
+  int bs = std::min<int32>(RoundUpToNearestPowerOfTwo(dim),
+                           KALDI_STANDARD_THREAD_BLOCK_SIZE);
+  int num_blocks = (dim + bs - 1) / bs;  // round up.
 
-  k.sizes.max_offset_a.x = dim * a_stride;
+  k->sizes.thread_stride_a.x = a_stride;
+  k->sizes.block_stride_a.x = a_stride * bs;
+  k->sizes.thread_stride_b.x = b_stride;
+  k->sizes.block_stride_b.x = b_stride * bs;
+  k->sizes.thread_stride_c.x = c_stride;
+  k->sizes.block_stride_c.x = c_stride * bs;
 
-  k.block_dim.x = std::min<int32>(bs, dim);
-  k.grid_dim.x = num_blocks;
+  k->sizes.max_offset_a.x = dim * a_stride;
 
-  if (num_blocks > 65535)
-    SplitStandardKernelX(kernels);
+  k->block_dim.x = bs;
+  k->grid_dim.x = num_blocks;
 }
 
 
+// Fills out the 'y' dimension of the standard three-arg kernel (whose x
+// dimension is assumed to already have been set up) using an raxis-index
+// specified by the user; this will normally be the one with the largest dim,
+// and it won't be 0 because axis 0 goes to x and will already have been
+// processed.
+static void ProcessStandardKernelY(const Pattern &a,
+                                   const Pattern &b,
+                                   const Pattern &c,
+                                   int32 raxis,
+                                   StandardThreeArgKernel *kernel) {
+  KALDI_PARANOID_ASSERT(a.num_axes > raxis && raxis > 0);
+
+  int dim = a.dims[raxis],
+      a_stride = a.strides[raxis],
+      b_stride = b.strides[raxis],
+      c_stride = c.strides[raxis];
+
+  // bs means block size.
+  int bs_x = kernel->block_dim.x;
+  // If the threads-per-block is too small, we may have to have threads-per-block
+  // != 1 on this axis.
+  int bs_y = std::min<int32>(RoundUpToNearestPowerOfTwo(dim),
+                             KALDI_STANDARD_THREAD_BLOCK_SIZE / bs_x);
+  if (bs_y < 1)
+    bs_y = 1;  // just for robustness to any later code changes.
+  int num_blocks = (dim + bs_y - 1) / bs_y;  // round up.
+
+  kernel->sizes.thread_stride_a.y = a_stride;
+  kernel->sizes.block_stride_a.y = a_stride * bs_y;
+  kernel->sizes.thread_stride_b.y = b_stride;
+  kernel->sizes.block_stride_b.y = b_stride * bs_y;
+  kernel->sizes.thread_stride_c.y = c_stride;
+  kernel->sizes.block_stride_c.y = c_stride * bs_y;
+
+  kernel->sizes.max_offset_a.y = dim * a_stride;
+  kernel->block_dim.y = bs_y;
+  kernel->grid_dim.y = num_blocks;
+}
 
-static void GetStandardKernel2(const Pattern &a, const Pattern &b,
-                               std::vector<StandardKernel> *kernels) {
-  // Note: the following call will invoke the constructor of dim3 which
-  // sets all the values to 1, so we don't have to set the unused
-  // gridDim elements.
-  kernels->resize(kernels->size() + 1);
-
-  StandardKernel &k = kernels->back();
-  int dim0 = a.dims[0],
-      a_stride0 = a.strides[0],
-      b_stride0 = b.strides[0],
-      dim1 = a.dims[1],
-      a_stride1 = a.strides[1],
-      b_stride1 = b.strides[1];
-  // We expect the patterns will have been normalized prior to this
-  // call, which is why we don't expect zero strides for a.
-  // some of the code does assume this, so we check for it.
-  KALDI_PARANOID_ASSERT(a_stride0 != 0 && a_stride0 < a_stride1);
-
-  if (dim0 < 64) {
-    // dim0 is on the small side for a thread-block size, so we want the thread
-    // block size to include part of dim1.
-    int bs0 = dim0,
-        dim0_rounded_up = RoundUpToNearestPowerOfTwo(dim0),
-        bs1 = KALDI_STANDARD_THREAD_BLOCK_SIZE / dim0_rounded_up,
-        nb1 = (dim1 + bs1 - 1) / bs1;
-    k.block_dim.x = dim0;
-    k.grid_dim.x = 1;  // it had this value anyway; this is for clrity.
-    k.block_dim.y = bs1;
-    k.grid_dim.y = nb1;
-
-    k.sizes.max_offset_a.x = dim0 * a_stride0;
-    k.sizes.max_offset_a.y = dim1 * a_stride1;
-
-    k.sizes.thread_stride_a.x = a_stride0;
-    k.sizes.block_stride_a.x = a_stride0 * bs0;
-    k.sizes.thread_stride_a.y = a_stride1;
-    k.sizes.block_stride_a.y = a_stride1 * bs1;
-
-    k.sizes.thread_stride_b.x = b_stride0;
-    k.sizes.block_stride_b.x = b_stride0 * bs0;
-    k.sizes.thread_stride_b.y = b_stride1;
-    k.sizes.block_stride_b.y = b_stride1 * bs1;
-
-  } else {
-    int bs0 = std::min<int32>(dim0, KALDI_STANDARD_THREAD_BLOCK_SIZE),
-        nb0 = (dim0 + bs0 - 1) / bs0,
-        bs1 = 1,
-        nb1 = dim1;
-
-    k.block_dim.x = dim0;
-    k.grid_dim.x = 1;  // it had this value anyway; this is for clrity.
-    k.block_dim.y = bs1;
-    k.grid_dim.y = nb1;
-
-    k.sizes.max_offset_a.x = dim0 * a_stride0;
-    k.sizes.max_offset_a.y = dim1 * a_stride1;
-
-    if (nb0 > 65535)
-      SplitStandardKernelX(kernels);
-    else if (nb1 > 65535)
-      SplitStandardKernelY(kernels);
-    // we don't handle the case where they are both > 65535, because that, times
-    // the block size, would be more than the memory of any GPU, and would
-    // require code changes.
 
+// Fills out the 'z' dimension of the standard kernel (whose x and y dimensions
+// are assumed to already have been set up) using an raxis-index specified by the
+// user; the caller normally passes the raxis with the second-largest dim, and it
+// won't be 0 because axis 0 goes to x and will already have been processed.
+static void ProcessStandardKernelZ(const Pattern &a, const Pattern &b,
+                                   int32 raxis,
+                                   StandardThreeArgKernel *kernel) {
+  KALDI_PARANOID_ASSERT(a.num_axes > raxis && raxis > 0);
+
+  int dim = a.dims[raxis],
+      a_stride = a.strides[raxis],
+      b_stride = b.strides[raxis],
+      c_stride = c.strides[raxis];
+  // NOTE(review): 'c' is not declared in this function; it must be passed in.
+  // bs means block size.
+  int bs_x = kernel->block_dim.x,
+      bs_y = kernel->block_dim.y;
+  // If the threads-per-block is too small, we may have to have block_dim.z
+  // != 1.  But this is only possible if we can choose a value of block_dim.z
+  // that exactly divides 'dim', because the kernel doesn't have an
+  // if-statement for the z dimension.
+
+  int bs_z = 1;  // 1 always divides 'dim'; also keeps bs_z initialized.
+  for (int i = 1; i * bs_x * bs_y <= KALDI_STANDARD_THREAD_BLOCK_SIZE; i++)
+    if (dim % i == 0)
+      bs_z = i;
+  // Note: in the normal case, bs_z will be one now.  In all cases,
+  // bs_z will divide 'dim' exactly.
+
+  int num_blocks = dim / bs_z;  // exact, since bs_z divides dim.
+
+  kernel->sizes.thread_stride_a.z = a_stride;
+  kernel->sizes.block_stride_a.z = a_stride * bs_z;
+  kernel->sizes.thread_stride_b.z = b_stride;
+  kernel->sizes.block_stride_b.z = b_stride * bs_z;
+  kernel->sizes.thread_stride_c.z = c_stride;
+  kernel->sizes.block_stride_c.z = c_stride * bs_z;
+
+  // The kernel code will not actually inspect max_offset_a.z; we just leave it
+  // as a guide in case of future code changes.
+  kernel->sizes.max_offset_a.z = dim * a_stride;
+
+  kernel->block_dim.z = bs_z;
+  kernel->grid_dim.z = num_blocks;
+}
 
-  }
 
 
-    // everything goes in the x, and we rely on the loop limits to
-    //
 
+void FinalizeKernel(const Pattern &a,
+                    const Pattern &b,
+                    const Pattern &c,
+                    ArrayRef<int32> remaining_axes,
+                    std::vector<StandardThreeArgKernel> *kernels) {
+  // Post-processes the kernel at kernels->back(): splits it if its x or y
+  // grid dimension exceeds 65535, then expands the kernels once for each
+  // raxis in 'remaining_axes' (see SplitStandardKernelByAxis()).
+  if (kernels->back().grid_dim.x > 65535) {
+    SplitStandardKernelX(kernels);
+    if (kernels->back().grid_dim.y > 65535)
+      KALDI_ERR << "You are trying to process a tensor that's way too big";
+    // We don't handle the case where the x and y grid dims are both >65535,
+    // because that much data wouldn't fit on the GPU anyway once you take into
+    // account the thread block size.  (It would require code changes to do
+    // correctly).
+  } else if (kernels->back().grid_dim.y > 65535) {
+    SplitStandardKernelY(kernels);
   }
-
-  if (dim0 * dim1 < 1024) {
-    // Do it in a single thread block.  There's no point wasting
-    // time figuring out more details.
-  } else if (dim0 > bs && dim1 * dim2 <= 16384) {
-    // 16384 is 4 * 4096, and 4096 is a kind of upper limit on
-    // how many threads we might expect to run at once.
-
+  if (kernels->back().grid_dim.z > 65535)
+    KALDI_ERR << "You are trying to process a tensor that's way too big";
+
+  for (size_t i = 0; i < remaining_axes.size(); i++) {
+    int32 raxis = remaining_axes[i];
+    std::vector<StandardThreeArgKernel> next_kernels;
+    for (const auto &kernel : *kernels)
+      SplitStandardKernelByAxis(a, b, c, raxis, kernel, next_kernels);
+    kernels->swap(next_kernels);
   }
-
-
-
-  KALDI_PARANOID_ASSERT(dim0 > 1 && dim1 > 1);
-  int bs = KALDI_STANDARD_THREAD_BLOCK_SIZE;
-  if (dim0 >= bs)
-  if (dim0 < bs) {
-    if (dim0 >= bs / 2) {
-      bs = dim0;
-    } else {
-      // This is a relatively complex case; the blocks can't just
-      // be on dim0, they have to also include dim1.  We
-      // would prefer to use an exact divisor of dim1, to avoid
-      // having to use 2 kernels.
-      int block_x = dim0,
-          block_y = -1;
-      float block_size_cost = 1.0e+10;
-      if (dim0 * dim1 <= 1024) {
-        block_y = dim1;
-      } else {
-        for (int this_block_y = 1;
-             this_block_y * block_x < 1024;
-             this_block_y++) {
-          if (dim1 % this_block_y == 0) {
-            int this_block_size = this_block_y * block_x;
-            float this_block_size_cost =  GetBlockSizeCost(this_block_size);
-            if (this_block_size_cost < block_size_cost) {
-              block_size_cost = this_block_size_cost;
-              block_y = this_block_y;
-            }
-          }
-        }
-      }
-      if (this_block_y == -1) {
-        block_y = KALDI_STANDARD_KERNEL1_BLOCK_SIZE / block_x;
-        // and we'll deal with the remainder via a second kernel.
-      }
-
-      }
-
-    }
-
-
-
-  }
-
-  int bs = KALDI_STANDARD_KERNEL1_BLOCK_SIZE;
-  int32 num_blocks = (dim + bs - 1) / bs;  // round up.
-  if (num_blocks > 1536)   // Don't want to have stragglers, so
-    num_blocks = 1024;     // only limit num_blocks to 1024 if
-                           // most will loop at least twice.
-  k.sizes.thread_stride_a.x = a_stride;
-  k.sizes.block_stride_a.x = a_stride * bs;
-  k.sizes.thread_stride_b.x = b_stride;
-  k.sizes.block_stride_b.x = b_stride * bs;
-  k.block_dim.x = bs;
-  k.grid_dim.x = num_blocks;
-  // We don't treat the case where dim < bs separately (e.g. setting
-  // k.block_dim.x = dim), I don't think it would make any real difference.
 }
 
 
+// Returns the raxis of 'p' with the smallest abs(stride); ties go to the
+// lowest-numbered raxis.  It is an error if any axis has stride = 0 (i.e. is
+// a trivial axis).  Intended to be called from GetStandardThreeArgKernel().
+int32 RaxisWithSmallestAbsStride(const Pattern &p) {
+  int32 num_axes = p.num_axes,
+      ans = 0;
+  for (int32 raxis = 1; raxis < num_axes; raxis++)
+    if (abs(p.strides[raxis]) < abs(p.strides[ans]))
+      ans = raxis;
+  KALDI_ASSERT(p.strides[ans] != 0 &&
+               "Args to GetStandardKernel() do not have the expected "
+               "properties");
+  // if the assert fails, either the pattern-tuple was not in reduced form, or
+  // there is reduction in the operation, which is not allowed in a "standard"
+  // kernel.
+  return ans;
+}
 
-void GetStandardKernel(const Pattern &a, const Pattern &b,
-                       std::vector<StandardKernel> *kernels) {
-
-  // TODO: ensure that the 1st dim of a is the one with smallest stride.
 
+// Sets up the standard three-arg kernel(s) for the pattern-tuple (a, b, c);
+// any previous contents of 'kernels' are discarded.
+void GetStandardThreeArgKernel(const Pattern &a,
+                               const Pattern &b,
+                               const Pattern &c,
+                               std::vector<StandardThreeArgKernel> *kernels) {
   KALDI_PARANOID_ASSERT(DimsGeq(a, b) && a.num_axes >= b.num_axes &&
                         Broadcastable(a, b));
+  // With no axes there is no stride to inspect, so skip the stride check.
+  int32 smallest_stride_raxis =
+      (a.num_axes == 0 ? 0 : RaxisWithSmallestAbsStride(a));
+  if (smallest_stride_raxis != 0) {
+    Pattern a_new(a), b_new(b), c_new(c);  // unexpected; fix by swapping axes.
+    TransposeR(0, smallest_stride_raxis, &a_new);
+    TransposeR(0, smallest_stride_raxis, &b_new, true);
+    TransposeR(0, smallest_stride_raxis, &c_new, true);
+    GetStandardThreeArgKernel(a_new, b_new, c_new, kernels);
+    return;
+  }
+  kernels->clear();
+  kernels->resize(1);
+  StandardThreeArgKernel *kernel = &(kernels->back());
+
   int32 num_axes = a.num_axes;
   switch (num_axes) {
+    case 0:
+      // The default constructor gives values suitable for a kernel that
+      // only processes a single element, so there is nothing more to do.
+      return;
     case 1:
-      GetStandardKernel1(a, b, kernels);
+      ProcessStandardKernelX(a, b, kernel);
+      FinalizeKernel(a, b, c, {}, kernels);
       return;
     case 2:
-      GetStandardKernel2(a, b, kernels);
+      ProcessStandardKernelX(a, b, kernel);
+      ProcessStandardKernelY(a, b, 1, kernel);
+      FinalizeKernel(a, b, c, {}, kernels);
       return;
-    case 3:
-      GetStandardKernel3(a, b, kernels);
+    default: {  // >= 3 axes
+      ProcessStandardKernelX(a, b, kernel);
+      // Sort the raxes 1, 2,... from greatest to least dimension.  (Note: there
+      // are cases where this won't be optimal and we may want to take the
+      // stride into account in order to ensure more consolidated memory access;
+      // we could think about that later).
+      std::vector<int32> raxes;
+      for (int i = 1; i < num_axes; i++)
+        raxes.push_back(i);
+      std::sort(raxes.begin(), raxes.end(),
+                // below is a C++11 lambda used as a comparator function, like
+                // the operator x < y.  The "a" in brackets is the Pattern a,
+                // declared above, which is captured by reference for this
+                // lambda.
+                [&a] (int x, int y) {
+                  // reverse the direction of comparison because we want raxes
+                  // sorted from greatest to least dim.
+                  return a.dims[x] > a.dims[y];
+                });
+      ProcessStandardKernelY(a, b, raxes[0], kernel);
+      ProcessStandardKernelZ(a, b, raxes[1], kernel);
+      const int32 *raxes_data = &(raxes[0]);
+      // {raxes_data + 2, raxes_data + num_axes - 1} constructs an ArrayRef
+      // covering raxes[2] and any later elements (possibly none): the raxes
+      // not yet processed, which have the smallest dims.  FinalizeKernel()
+      // handles these by duplicating the kernel, shifting base_offset_{a,b,c}.
+      FinalizeKernel(a, b, c, {raxes_data + 2, raxes_data + num_axes - 1},
+                     kernels);
       return;
+    }
   }
+}
 
+// Copy a 3-arg kernel into a 2-arg kernel; the c-related fields are dropped.
+static void ConvertKernel(const StandardThreeArgKernel &src,
+                          StandardTwoArgKernel *dest) {
+  dest->dim_grid = src.dim_grid;
+  dest->dim_block = src.dim_block;
+  dest->base_offset_a = src.base_offset_a;
+  dest->base_offset_b = src.base_offset_b;
+  dest->sizes.max_offset_a = src.sizes.max_offset_a;
+  dest->sizes.block_stride_a = src.sizes.block_stride_a;
+  dest->sizes.block_stride_b = src.sizes.block_stride_b;
+  dest->sizes.thread_stride_a = src.sizes.thread_stride_a;
+  dest->sizes.thread_stride_b = src.sizes.thread_stride_b;
 }
 
+// Convert from 3-arg to 1-arg kernel, discarding the b- and c-related fields.
+static void ConvertKernel(const StandardThreeArgKernel &src,
+                          StandardOneArgKernel *dest) {
+  dest->dim_block = src.dim_block;
+  dest->dim_grid = src.dim_grid;
+  dest->sizes.thread_stride_a = src.sizes.thread_stride_a;
+  dest->sizes.block_stride_a = src.sizes.block_stride_a;
+  dest->sizes.max_offset_a = src.sizes.max_offset_a;
+  dest->base_offset_a = src.base_offset_a;
+}
+
+// Gets the standard two-arg kernel(s) for the pattern-tuple (a, b); see the
+// declaration in cuda-utils.h.  Doing a 2-arg kernel by first doing the 3-arg
+// one (with b repeated in place of c) and then discarding the c-related
+// fields is of course wasteful.
+void GetStandardTwoArgKernel(const Pattern &a,
+                             const Pattern &b,
+                             std::vector<StandardTwoArgKernel> *kernels) {
+  std::vector<StandardThreeArgKernel> temp_kernels;
+  GetStandardThreeArgKernel(a, b, b, &temp_kernels);
+  size_t size = temp_kernels.size();
+  kernels->resize(size);
+  for (size_t i = 0; i < size; i++)
+    ConvertKernel(temp_kernels[i], &((*kernels)[i]));
+}
 
 
 
diff --git a/src/tensor/cuda-utils.h b/src/tensor/cuda-utils.h
index 175a1570c31..2c3a737b106 100644
--- a/src/tensor/cuda-utils.h
+++ b/src/tensor/cuda-utils.h
@@ -247,105 +247,160 @@ struct StandardTwoArgKernelSizes {
 };
 
 
-
-
-struct StandardTwoArgKernel {
-  dim3 block_dim;
-  dim3 grid_dim;
-  StandardTwoArgKernelSizes sizes;  // passed into kernel.
+struct StandardTwoArgKernel {
+  dim3 dim_block;  // NOTE(review): cuda-utils.cc mostly says block_dim
+  dim3 dim_grid;   // and grid_dim; the names need to be reconciled.
+  StandardTwoArgKernelSizes sizes;
+  // base_offset_a and base_offset_b are offsets that we have to add to the
+  // data-pointers of a and b before we call the kernel; these will normally be
+  // zero, but may be nonzero if we have to generate multiple kernels due to,
+  // say, size constraints.
+  int64 base_offset_a{0};
+  int64 base_offset_b{0};
 };
 
 
 /**
-   This function returns the dimensions/sizes for one or more "standard kernels"
-   to execute a "standard operation" on patterns a and b.  We define a
-   standard operation as an elementwise operation possibly with broadcasting,
-   of the form:
-       a[i] = f(b[i])
+   This function returns the dimensions/sizes for one or more "standard two arg
+   kernels" to execute a "standard two arg operation" on Tensors a and b, of
+   which only the patterns are provided.  We define a standard two-arg operation
+   as an elementwise operation possibly with broadcasting, of the form:
 
-   for some scalar function f, where i is an index-tuple.  a and b must be
-   broadcastable, and the dims of a must be >= the corresponding dims of b
-   (i.e.: no reduction).  We also require a.num_axes >= b.num_axes.
-   The standard kernel is as follows:
+       a[i] = f(b[i])
+   where i is an index-tuple in the index-tuple-set of the pattern-tuple (a,b);
+   search in pattern.h for the meaning of this notation.
+
+   a and b must be broadcastable, and the dims of a must be >= the corresponding
+   dims of b (i.e.: no reduction).  We also require a.num_axes >= b.num_axes,
+   which results from the tuple (a,b) having been reduced (see ReducePatternTuple()
+   in pattern-tuple-utils.h).
+   The standard two-arg kernel is as follows, exemplifying it with the
+   function "some_func".
 <code>
-  void _standard_kernel(StandardTwoArgKernelSizes f, float *a, float *b) {
+template <typename T>
+  void _standard_two_arg_kernel(StandardTwoArgKernelSizes f, T *a, const T *b) {
     int a_offset_x = f.thread_stride_a.x * threadIdx.x + block_stride_a.x * blockIdx.x,
       a_offset_y = f.thread_stride_a.y * threadIdx.y + block_stride_a.y * blockIdx.y,
       a_offset_z = f.thread_stride_a.z * threadIdx.z + block_stride_a.z * blockIdx.z;
     int b_offset = f.thread_stride_b.x * threadIdx.x + block_stride_b.x * blockIdx.x +
                    f.thread_stride_b.y * threadIdx.y + block_stride_b.y * blockIdx.y +
-                   f.thread_stride_b.z * threadIdx.z + block_stride_b.z * blockIdx.z
+                   f.thread_stride_b.z * threadIdx.z + block_stride_b.z * blockIdx.z;
 
      if (a_offset_x < f.max_offset_a.x && a_offset_y < f.max_offset_a.y)
        a[a_offset_x + a_offset_y + a_offset_z] = some_func(b[b_offset]);
   }
+
+  // which would be invoked as follows:
+  template <typename T>
+  void standard_two_arg_kernel(const Tensor &a, const Tensor &b,
+                               const StandardTwoArgKernel &k) {
+    _standard_two_arg_kernel<<<k.dim_grid, k.dim_block>>>(
+         k.sizes, a.GetData<T>() + k.base_offset_a,
+         b.GetData<T>() + k.base_offset_b);
+  }
+
 </code>
   }
 
       @param [in] a   First pattern for which we want the kernel (or kernels)
       @param [in] b   Second pattern for which we want the kernel (or kernels)
-      @param [out] kernels  The kernels are *appended to* this vector (this
-                      allows for recursive operation in this function).  Normally,
+      @param [out] kernels  The kernels are output to this vector.  Normally,
                       we'll have `kernels->size() == 1` at exit.  The user is expected
-                      to call all of them (the order doesn't matter, and they
-                      don't have to be called in sequence).
+                      to call all of them (the order doesn't matter).
  */
-void GetStandardKernel(const Pattern &a, const Pattern &b,
-                       std::vector<StandardKernel> *kernels);
+void GetStandardTwoArgKernel(const Pattern &a, const Pattern &b,
+                             std::vector<StandardTwoArgKernel> *kernels);
 
 
 
-/**
-   First: we define type 1 kernel as a non-reducing (but possibly broadcasting)
-   operation between two Tensors, e.g. a = b or a = sigmoid(b).  This is
-   a rather general type of kernel that can be used as the generic case
-   (applicable to arbitrary tensors).
 
-   The KernelInfo is the part that needs to be passed into the
-   kernel itself.  There are also two other things needed to launch the
-   kernel:
-<code>
-      dim3 grid_dim, block_dim;
-</code>
-
-   The basic operation we'll do in the kernel is something like this;
-   let 'a' and 'b' be pointers to float or something like that.  Let
-<code>
-    Type1KernelInfo f;  // passed in.
-    int x_offset_a = f.thread_stride_a.x * threadIdx.x + f.block_stride_a.x * blockIdx.x;
-       y_offset_a = f.thread_stride_a.y * threadIdx.y + f.block_stride_a.y * blockIdx.y;
-       z_offset_a = f.thread_stride_a.z * threadIdx.z + f.block_stride_a.z * blockIdx.z;
-    // and similar statements to set x_offset_b, y_offset_b, z_offset_b.
-
-    if (x_offset_a < f.max_offset_a.x &&
-        y_offset_a < f.max_offset_a.y)
-      a[x_offset_a + y_offset_a + z_offset_a] =
-          b[x_offset_b + y_offset_b + z_offset_b];
-    // clock speed e.g. 3 gHz.  Say 100 instructions.
-</code>
- */
-
-class StandardKernelSizes {
+struct StandardThreeArgKernelSizes {
   dim3 thread_stride_a;
   dim3 thread_stride_b;
+  dim3 thread_stride_c;
+
   dim3 block_stride_a;
   dim3 block_stride_b;
+  dim3 block_stride_c;
+
   dim3 max_offset_a;
 };
 
-class StandardKernel {
+struct StandardThreeArgKernel {
   dim3 dim_block;
   dim3 dim_grid;
-  StandardKernelSizes sizes;
-  // offset_a and offset_b are offsets that we have to add to the data-pointers
-  // of a and b before we call the kernel; these will normally be zero, but may
+  StandardThreeArgKernelSizes sizes;
+  // base_offset_{a,b,c} are offsets that we have to add to the data-pointers of
+  // a, b and c before we call the kernel; these will normally be zero, but may
   // be nonzero if we have to generate multiple kernels due to, say, size
   // constraints.
   int64 base_offset_a{0};
   int64 base_offset_b{0};
+  int64 base_offset_c{0};
 };
 
 
+/**
+   This function returns the dimensions/sizes for one or more "standard three arg
+   kernels" to execute a "standard three arg operation" on Tensors a, b and c, of
+   which only the patterns are provided.  We define a standard three-arg operation
+   as an elementwise operation possibly with broadcasting, of the form:
+
+       a[i] = f(b[i], c[i])
+   where i is an index-tuple in the index-tuple-set of the pattern-tuple (a,b,c);
+   search in pattern.h for the meaning of this notation.
+
+   a, b and c must be broadcastable, and the dims of a must be >= the
+   corresponding dims of b and of c (i.e.: no reduction).  We also require
+   a.num_axes >= b.num_axes and a.num_axes >= c.num_axes, which results from the
+   tuple (a,b,c) having been reduced (see ReducePatternTuple() in
+   pattern-tuple-utils.h).
+
+   The standard three-arg kernel is as follows, exemplifying it with the
+   function "some_func".
+<code>
+template <typename T>
+  void _standard_three_arg_kernel(StandardThreeArgKernelSizes f,
+                                  T *a, const T *b, const T *c) {
+    int a_offset_x = f.thread_stride_a.x * threadIdx.x + f.block_stride_a.x * blockIdx.x,
+      a_offset_y = f.thread_stride_a.y * threadIdx.y + f.block_stride_a.y * blockIdx.y,
+      a_offset_z = f.thread_stride_a.z * threadIdx.z + f.block_stride_a.z * blockIdx.z;
+    int b_offset = f.thread_stride_b.x * threadIdx.x + f.block_stride_b.x * blockIdx.x +
+                   f.thread_stride_b.y * threadIdx.y + f.block_stride_b.y * blockIdx.y +
+                   f.thread_stride_b.z * threadIdx.z + f.block_stride_b.z * blockIdx.z,
+        c_offset = f.thread_stride_c.x * threadIdx.x + f.block_stride_c.x * blockIdx.x +
+                   f.thread_stride_c.y * threadIdx.y + f.block_stride_c.y * blockIdx.y +
+                   f.thread_stride_c.z * threadIdx.z + f.block_stride_c.z * blockIdx.z;
+
+     if (a_offset_x < f.max_offset_a.x && a_offset_y < f.max_offset_a.y)
+       a[a_offset_x + a_offset_y + a_offset_z] = some_func(b[b_offset], c[c_offset]);
+  }
+
+  // which would be invoked as follows:
+  template <typename T>
+  void standard_three_arg_kernel(const Tensor &a, const Tensor &b,
+                                 const Tensor &c,
+                                 const StandardThreeArgKernel &k) {
+    _standard_three_arg_kernel<<<k.dim_grid, k.dim_block>>>(
+         k.sizes, a.GetData<T>() + k.base_offset_a,
+         b.GetData<T>() + k.base_offset_b,
+         c.GetData<T>() + k.base_offset_c);
+  }
+</code>
+
+      @param [in] a   First pattern for which we want the kernel (or kernels)
+      @param [in] b   Second pattern for which we want the kernel (or kernels)
+      @param [in] c   Third pattern for which we want the kernel (or kernels)
+      @param [out] kernels  The kernels are output to this vector.  Normally,
+                      we'll have `kernels->size() == 1` at exit.  The user is expected
+                      to call all of them (the order doesn't matter).
+ */
+void GetStandardThreeArgKernel(const Pattern &a,
+                               const Pattern &b,
+                               const Pattern &c,
+                               std::vector<StandardThreeArgKernel> *kernels);
+
+
+
 
 
 }  // namespace tensor
diff --git a/src/tensor/pattern-utils.cc b/src/tensor/pattern-utils.cc
index 56c0bb70b95..e65550af561 100644
--- a/src/tensor/pattern-utils.cc
+++ b/src/tensor/pattern-utils.cc
@@ -453,16 +453,40 @@ void SortTupleAxes(ArrayRef<Pattern*> patterns) {
   // TODO.
 }
 
-void Transpose(int32 raxis1, int32 raxis2, Pattern *p) {
-  if (static_cast<uint32>(raxis1) >= static_cast<uint32>(p->num_axes) ||
-      static_cast<uint32>(raxis2) >= static_cast<uint32>(p->num_axes)) {
-    KALDI_ERR << "Invalid axes to transpose: raxis1="
-              << raxis1 << ", raxis2=" << raxis2
-              << ", num-axes = " << p->num_axes;
+void TransposeR(int32 raxis1, int32 raxis2, Pattern *p,
+                bool increase_num_axes) {
+  if (!increase_num_axes) {
+    if (static_cast<uint32>(raxis1) >= static_cast<uint32>(p->num_axes) ||
+        static_cast<uint32>(raxis2) >= static_cast<uint32>(p->num_axes)) {
+      KALDI_ERR << "Invalid axes to transpose: raxis1="
+                << raxis1 << ", raxis2=" << raxis2
+                << ", num-axes = " << p->num_axes;
+    }
+  } else {
+    if (static_cast<uint32>(raxis1) >= KALDI_TENSOR_MAX_DIM ||
+        static_cast<uint32>(raxis2) >= KALDI_TENSOR_MAX_DIM) {
+      KALDI_ERR << "Invalid axes to transpose: raxis1="
+                << raxis1 << ", raxis2=" << raxis2
+                << ", num-axes = " << p->num_axes;
+    }
   }
   std::swap(p->strides[raxis1], p->strides[raxis2]);
   std::swap(p->dims[raxis1], p->dims[raxis2]);
   p->code = -1;
+  if (increase_num_axes) {
+    if (raxis1 >= p->num_axes) {
+      // Checking both of the conditions below is redundant if the pattern is
+      // valid, but we don't assume that.
+      if (p->dims[raxis1] != 1 || p->strides[raxis1] != 0)
+        p->num_axes = raxis1 + 1;
+    }
+    if (raxis2 >= p->num_axes) {
+      // Checking both of the conditions below is redundant if the pattern is
+      // valid, but we don't assume that.
+      if (p->dims[raxis2] != 1 || p->strides[raxis2] != 0)
+        p->num_axes = raxis2 + 1;
+    }
+  }
 }
 
 void Transpose(int32 axis1, int32 axis2, Pattern *p) {
diff --git a/src/tensor/pattern-utils.h b/src/tensor/pattern-utils.h
index 488f17a2891..9ce86cdac1d 100644
--- a/src/tensor/pattern-utils.h
+++ b/src/tensor/pattern-utils.h
@@ -317,14 +317,22 @@ inline void Squeeze(int32 axis, Pattern *p) {
 /** Transpose the two specified axes (specified in the private/reversed
     numbering) of a Pattern.
 
-    @param [in] raxis1  First axis to be transposed; must be in range
-                        `[0, p->num_axes - 1]`
-    @param [in] raxis2  Second axis to be transposed; must be in range
-                        `[0, p->num_axes - 1]`
+    @param [in] raxis1  First axis to be transposed; must be >=0,
+                        and if increase_num_axes is false, must be
+                        less than p->num_axes.
+    @param [in] raxis2  Second axis to be transposed; must be >=0,
+                        and if increase_num_axes is false, must be
+                        less than p->num_axes.
                         If identical to axis1, nothing will be done.
     @param [in,out] p  Pattern whose axes are to be transposed.
+    @param [in] increase_num_axes   If this is true, we allow
+                        raxis1 and/or raxis2 to be >= p->num_axes;
+                        we will increase p->num_axes as necessary
+                        if this operation results in any
+                        raxis >= p->num_axes becoming non-trivial.
  */
-void TransposeR(int32 raxis1, int32 raxis2, Pattern *p);
+void TransposeR(int32 raxis1, int32 raxis2, Pattern *p,
+                bool increase_num_axes = false);
 
 
 /** Transpose the two specified axes (specified in the private/reversed
diff --git a/src/tensor/pattern.h b/src/tensor/pattern.h
index ed879018854..b091014aede 100644
--- a/src/tensor/pattern.h
+++ b/src/tensor/pattern.h
@@ -318,6 +318,8 @@ namespace tensor {
 
     Set-equivalent:   Two Patterns are set-equivalent if their memory-index-sets
                       are identical.
+                      Two Pattern-tuples are set-equivalent if their
+                      memory-index-tuple-sets are identical.
 
 
     Shape of a Pattern: The vector of the dimensions of a Pattern: e.g. [] for
@@ -391,6 +393,28 @@ namespace tensor {
                       It is easy to show that the linear property is transitive;
                       that is if P is linear in Q and Q is linear in R, then
                       P is linear in R.
+    Reduced pattern:
+                      A pattern is in reduced form if there is no
+                      set-equivalent pattern which has fewer axes.
+
+                      What this means more concretely is that the pattern has no
+                      trivial axes and has no pairs of axes which could be
+                      combined.  For example, a matrix where successive rows
+                      "touch" (i.e. they are not separated by a stride) can
+                      always be reduced.
+
+    Reduced pattern-tuple:
+                      A pattern-tuple is in reduced form if there is no
+                      set-equivalent pattern-tuple which has fewer axes
+                      (defining the num_axes of a pattern-tuple as the
+                      greatest of the num_axes of the patterns in the tuple).
+
+                      What this means more concretely/intuitively is that there
+                      are no axes which are trivial for all patterns and can be
+                      removed; and there are no pairs of axes which can be
+                      combined for all patterns in the tuple.
+
+
 
     Regularity property:   This is a property of Patterns that is relevant when
                       reducing Patterns to a common set of strides.  It can

From 553f4a81771aae9545d755d3e69f4b65e4ff6158 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 19 Jun 2019 10:32:32 -0400
Subject: [PATCH 136/163] [src] TEnsor progress.

---
 src/tensor/cuda-utils.cc                      | 242 +++++++-------
 src/tensor/cuda-utils.h                       | 311 ++++++------------
 src/tensor/linear-cpu-ops.h                   |  86 ++++-
 src/tensor/linear-cpu-ref-ops.h               |  23 +-
 .../{linear-gpu-ops.h => linear-cuda-ops.h}   |  24 +-
 src/tensor/linear-ops.cc                      | 173 +++++++---
 src/tensor/linear-ops.h                       |   9 +-
 src/tensor/pattern-utils.h                    |  89 ++++-
 src/tensor/pattern.h                          |  17 +-
 9 files changed, 545 insertions(+), 429 deletions(-)
 rename src/tensor/{linear-gpu-ops.h => linear-cuda-ops.h} (96%)

diff --git a/src/tensor/cuda-utils.cc b/src/tensor/cuda-utils.cc
index 766dc9154ee..a66bec0d964 100644
--- a/src/tensor/cuda-utils.cc
+++ b/src/tensor/cuda-utils.cc
@@ -24,7 +24,6 @@ namespace kaldi {
 namespace tensor {
 
 #define KALDI_STANDARD_THREAD_BLOCK_SIZE 256
-#define KALDI_TARGET_NUM_THREAD_BLOCKS 1024
 
 
 /**
@@ -37,11 +36,10 @@ namespace tensor {
                           *appended* to the vector `kernels`.
 
  */
-static void SplitStandardKernelX(const StandardThreeArgKernel &kernel,
-                                 std::vector<StandardThreeArgKernel> *kernels) {
+static void SplitStandardKernelX(const StandardOneArgKernel &kernel,
+                                 std::vector<StandardOneArgKernel> *kernels) {
   int cur_grid_dim = kernels->back().grid_dim.x;
   KALDI_ASSERT(cur_grid_dim > 65535);
-    return;
   int num_kernels = (kernels->back().grid_dim.x + 65534) / 65535;
 
   size_t cur_size = kernels.size(),
@@ -66,16 +64,14 @@ static void SplitStandardKernelX(const StandardThreeArgKernel &kernel,
     if (i + 1 < num_kernels) {
       // the following actually has no effect on operation since all
       // threads will run; it's more for clarity.
-      k.sizes.max_offset_a.x = this_grid_dim * k.sizes.block_stride_a.x;
+      k.sizes.mindex_a_range.x = this_grid_dim * k.sizes.block_stride_a.x;
     } else {
       // for the last kernel, this limit might actually make a difference, as
       // the highest-numbered thread block in the last kernel may not have all
       // threads run.
-      k.sizes.max_offset_a.x -= prev_grid_dim_sum * k.sizes.block_stride_a.x;
+      k.sizes.mindex_a_range.x -= prev_grid_dim_sum * k.sizes.block_stride_a.x;
     }
-    k.base_offset_a += prev_grid_dim_sum * k.sizes.block_stride_a.x;
-    k.base_offset_b += prev_grid_dim_sum * k.sizes.block_stride_b.x;
-    k.base_offset_c += prev_grid_dim_sum * k.sizes.block_stride_c.x;
+    k.offset_a += prev_grid_dim_sum * k.sizes.block_stride_a.x;
 
     prev_grid_dim_sum += this_grid_dim;
   }
@@ -84,8 +80,8 @@ static void SplitStandardKernelX(const StandardThreeArgKernel &kernel,
 
 // This is a copy of SplitStandardKernelX above, but with x's changed to y's.
 // See the documentation for SplitStandardKernelX.
-static void SplitStandardKernelY(const StandardThreeArgKernel &kernel,
-                                 std::vector<StandardThreeArgKernel> *kernels) {
+static void SplitStandardKernelY(const StandardOneArgKernel &kernel,
+                                 std::vector<StandardOneArgKernel> *kernels) {
   int cur_grid_dim = kernels->back().grid_dim.y;
   KALDI_ASSERT(cur_grid_dim > 65535);
     return;
@@ -113,16 +109,14 @@ static void SplitStandardKernelY(const StandardThreeArgKernel &kernel,
     if (i + 1 < num_kernels) {
       // the following actually has no effect on operation since all
       // threads will run; it's more for clarity.
-      k.sizes.max_offset_a.y = this_grid_dim * k.sizes.block_stride_a.y;
+      k.sizes.mindex_a_range.y = this_grid_dim * k.sizes.block_stride_a.y;
     } else {
       // for the last kernel, this limit might actually make a difference, as
       // the highest-numbered thread block in the last kernel may not have all
       // threads run.
-      k.sizes.max_offset_a.y -= prev_grid_dim_sum * k.sizes.block_stride_a.y;
+      k.sizes.mindex_a_range.y -= prev_grid_dim_sum * k.sizes.block_stride_a.y;
     }
-    k.base_offset_a += prev_grid_dim_sum * k.sizes.block_stride_a.y;
-    k.base_offset_b += prev_grid_dim_sum * k.sizes.block_stride_b.y;
-    k.base_offset_c += prev_grid_dim_sum * k.sizes.block_stride_c.y;
+    k.offset_a += prev_grid_dim_sum * k.sizes.block_stride_a.y;
 
     prev_grid_dim_sum += this_grid_dim;
   }
@@ -133,13 +127,11 @@ static void SplitStandardKernelY(const StandardThreeArgKernel &kernel,
 /**
    This function is used to handle cases where we still have more than 3 axes
    (should be very rare since we only use the standard kernel on reduced
-   pattern-tuples).  It creates copies of a kernel that differ only
-   in max_offset_a, max_offset_b, max_offset_c, to take account of
-   an raxis that has not been included in the kernel.
+   pattern-tuples).  It creates copies of a kernel that differ only in
+   mindex_a_range to take account of an raxis that has not been included in the
+   kernel.
 
      @param [in] a      The first Pattern that's an arg to the kernel
-     @param [in] b      The second Pattern that's an arg to the kernel
-     @param [in] c      The third Pattern that's an arg to the kernel
      @param [in] raxis  The raxis that we're splitting on; in place of the
                         single input 'kernel' we will have a separate
                         output for each i in [0, a.dim[raxis] - 1]
@@ -152,21 +144,17 @@ static void SplitStandardKernelY(const StandardThreeArgKernel &kernel,
  */
 static void SplitStandardKernelByAxis(
     const Pattern &a,
-    const Pattern &b,
-    const Pattern &c,
     int32 raxis,
-    const StandardThreeArgKernel &kernel
-    std::vector<StandardThreeArgKernel> *kernels) {
+    const StandardOneArgKernel &kernel
+    std::vector<StandardOneArgKernel> *kernels) {
   // Asserting raxis > 0 is just from knowledge of how the calling code works,
   // it is not something that would affect the operation of this function.
   KALDI_ASSERT(raxis > 0 && raxis < a.num_axes);
   int32 dim = a.dims[raxis];
   for (int32 i = 0; i < dim; i++) {
     kernels->push_back(kernel);
-    StandardThreeArgKernel &k = kernels->back();
-    k.max_offset_a += i * a.strides[raxis];
-    k.max_offset_b += i * b.strides[raxis];
-    k.max_offset_c += i * c.strides[raxis];
+    StandardOneArgKernel &k = kernels->back();
+    k.mindex_a_range += i * a.strides[raxis];
   }
 }
 
@@ -182,9 +170,7 @@ static void SplitStandardKernelByAxis(
 // the start of the tensor is on a 128-byte boundary; we can consider these
 // kinds of optimizations in future).
 static void ProcessStandardKernelX(const Pattern &a,
-                                   const Pattern &b,
-                                   const Pattern &c,
-                                   StandardThreeArgKernel *k) {
+                                   StandardOneArgKernel *k) {
   KALDI_PARANOID_ASSERT(a.num_axes >= 1 && a.dims[0] > 1);
   // Note: b.dims[0] is either 'dim' or 1; it won't affect anything, we only
   // need b's stride.
@@ -201,12 +187,7 @@ static void ProcessStandardKernelX(const Pattern &a,
 
   k->sizes.thread_stride_a.x = a_stride;
   k->sizes.block_stride_a.x = a_stride * bs;
-  k->sizes.thread_stride_b.x = b_stride;
-  k->sizes.block_stride_b.x = b_stride * bs;
-  k->sizes.thread_stride_c.x = c_stride;
-  k->sizes.block_stride_c.x = c_stride * bs;
-
-  k->sizes.max_offset_a.x = dim * a_stride;
+  k->sizes.mindex_a_range.x = dim * a_stride;
 
   k->block_dim.x = bs;
   k->grid_dim.x = num_blocks;
@@ -219,16 +200,12 @@ static void ProcessStandardKernelX(const Pattern &a,
 // and it won't be 0 because axis 0 goes to x and will already have been
 // processed.
 static void ProcessStandardKernelY(const Pattern &a,
-                                   const Pattern &b,
-                                   const Pattern &c,
                                    int32 raxis,
-                                   StandardThreeArgKernel *kernel) {
+                                   StandardOneArgKernel *kernel) {
   KALDI_PARANOID_ASSERT(a.num_axes > raxis && raxis > 0);
 
   int dim = a.dims[raxis],
-      a_stride = a.strides[raxis],
-      b_stride = b.strides[raxis],
-      c_stride = c.strides[raxis];
+      stride = a.strides[raxis];
 
   // bs means block size.
   int bs_x = kernel->block_dim.x;
@@ -240,14 +217,10 @@ static void ProcessStandardKernelY(const Pattern &a,
     bs_y = 1;  // just for robustness to any later code changes.
   int num_blocks = (dim + bs_y - 1) / bs_y;  // round up.
 
-  k->sizes.thread_stride_a.y = a_stride;
-  k->sizes.block_stride_a.y = a_stride * bs_y;
-  k->sizes.thread_stride_b.y = b_stride;
-  k->sizes.block_stride_b.y = b_stride * bs_y;
-  k->sizes.thread_stride_c.y = c_stride;
-  k->sizes.block_stride_c.y = c_stride * bs_y;
+  k->sizes.thread_stride_a.y = stride;
+  k->sizes.block_stride_a.y = stride * bs_y;
 
-  k->sizes.max_offset_a.y = dim * a_stride;
+  k->sizes.mindex_a_range.y = dim * stride;
   k->block_dim.y = bs_y;
   k->grid_dim.y = num_blocks;
 }
@@ -257,14 +230,12 @@ static void ProcessStandardKernelY(const Pattern &a,
 // are assumed to already have been set up) using an raxis-index specified by the
 // user; this will normally be the one with the largest dim, and it won't be 0
 // because axis 0 goes to x and will already have been processed.
-static void ProcessStandardKernelZ(const Pattern &a, const Pattern &b,
-                                   int32 raxis,
-                                   StandardThreeArgKernel *kernel) {
+static void ProcessStandardKernelZ(const Pattern &a,
+                                   StandardOneArgKernel *kernel) {
   KALDI_PARANOID_ASSERT(a.num_axes > raxis && raxis > 0);
 
   int dim = a.dims[raxis],
-      a_stride = a.strides[raxis],
-      b_stride = b.strides[raxis],
+      stride = a.strides[raxis];
       c_stride = c.strides[raxis];
 
   // bs means block size.
@@ -284,16 +255,12 @@ static void ProcessStandardKernelZ(const Pattern &a, const Pattern &b,
 
   int num_blocks = dim / bs_z;  // round up.
 
-  k->sizes.thread_stride_a.z = a_stride;
-  k->sizes.block_stride_a.z = a_stride * bs_z;
-  k->sizes.thread_stride_b.z = b_stride;
-  k->sizes.block_stride_b.z = b_stride * bs_z;
-  k->sizes.thread_stride_c.z = c_stride;
-  k->sizes.block_stride_c.z = c_stride * bs_z;
+  k->sizes.thread_stride_a.z = stride;
+  k->sizes.block_stride_a.z = stride * bs_z;
 
-  // The kernel code will not actually inspect max_offset_a.z; we just leave it
+  // The kernel code will not actually inspect mindex_a_range.z; we just leave it
   // as a guide in case of future code changes.
-  k->sizes.max_offset_a.z = dim * a_stride;
+  k->sizes.mindex_a_range.z = dim * stride;
 
   k->block_dim.z = bs_z;
   k->grid_dim.z = num_blocks;
@@ -303,10 +270,8 @@ static void ProcessStandardKernelZ(const Pattern &a, const Pattern &b,
 
 
 void FinalizeKernel(const Pattern &a,
-                    const Pattern &b,
-                    const Pattern &c,
                     ArrayRef<int32> remaining_axes,
-                    std::vector<StandardThreeArgKernel> *kernels) {
+                    std::vector<StandardOneArgKernel> *kernels) {
   // prev_size is the size of 'kernels'  before the most recent one
   // was added (since GetStandardKernel appends).  Would normally be zero.
   size_t prev_size = kernels->size() - 1;
@@ -334,14 +299,15 @@ void FinalizeKernel(const Pattern &a,
 }
 
 
-// Returns the raxis with the smallest abs(stride).  It is an error if any axis
-// has stride = 0 (i.e. is a trivial axis).  Intended to be called
-// from GetStandardKernel()
-int32 RaxisWithSmallestAbsStride(const Pattern &p) {
+// Returns the raxis with the smallest (i.e. most negative) stride.  It is an
+// error if any axis has stride <= 0 (we can require this because of the
+// normalization of the pattern-tuples given to
+// GetStandard{One,Two,Three}ArgKernel).  Called from GetStandardOneArgKernel().
+int32 RaxisWithMostNegativeStride(const Pattern &p) {
   int32 num_axes = a.num_axes,
       ans = 0;
   for (int32 raxis = 1; raxis < num_axes; raxis++)
-    if (abs(p.strides[raxis]) < abs(p.strides[ans]))
+    if (p.strides[raxis] < p.strides[ans])
       ans = raxis;
   KALDI_ASSERT(p.strides[ans] != 0 &&
                "Args to GetStandardKernel() do not have the expected "
@@ -353,25 +319,23 @@ int32 RaxisWithSmallestAbsStride(const Pattern &p) {
 }
 
 
-void GetStandardThreeArgKernel(const Pattern &a,
-                               const Pattern &b,
-                               const Pattern &c,
-                               std::vector<StandardThreeArgKernel> *kernels) {
-  KALDI_PARANOID_ASSERT(DimsGeq(a, b) && a.num_axes >= b.num_axes &&
-                        Broadcastable(a, b));
-  int32 smallest_stride_raxis = RaxisWithSmallestStride(a);
+void GetStandardOneArgKernel(const Pattern &a,
+                             std::vector<StandardOneArgKernel> *kernels) {
+  int32 smallest_stride_raxis = RaxisWithMostNegativeStride(a);
+  if (a.strides[smallest_stride_raxis] <= 0)
+    KALDI_ERR << "Input pattern does not have expected properties";
+
   if (smallest_stride_raxis != 0) {
     // This is unexpected but we can deal with it by swapping axes.
-    Pattern a_new(a), b_new(b), c_new(c);
+    Pattern a_new(a);
     TransposeR(0, smallest_stride_raxis, &a_new);
-    TransposeR(0, smallest_stride_raxis, &b_new, true);
-    TransposeR(0, smallest_stride_raxis, &c_new, true);
-    GetStandardKernel(a_new, b_new, c_new, kernels);
+    GetStandardKernel(a_new, kernels);
     return;
   }
   kernels->clear();
   kernels->resize(1);
   Kernel *kernel = &(kernels->back());
+  kernel->offset_a = a.offset;
 
   int32 num_axes = a.num_axes;
   switch (num_axes) {
@@ -380,16 +344,16 @@ void GetStandardThreeArgKernel(const Pattern &a,
       // only processes a single element, so there is nothing more to do.
     return;
     case 1:
-      ProcessStandardKernelX(a, b, kernel);
-      FinalizeKernel(a, b, {}, kernels);
+      ProcessStandardKernelX(a, kernel);
+      FinalizeKernel(a, {}, kernels);
       return;
     case 2:
-      ProcessStandardKernelX(a, b, kernel);
-      ProcessStandardKernelY(a, b, 1, kernel);
-      FinalizeKernel(a, b, {}, kernels);
+      ProcessStandardKernelX(a, kernel);
+      ProcessStandardKernelY(a, 1, kernel);
+      FinalizeKernel(a, {}, kernels);
       return;
     default: {  // >= 3 axes
-      ProcessStandardKernelX(a, b, kernel);
+      ProcessStandardKernelX(a, kernel);
       // Sort the raxes 1, 2,... from greatest to least dimension.  (Note: there
       // are cases where this won't be optimal and we may want to take the
       // stride into account in order to ensure more consolidated memory access;
@@ -407,8 +371,8 @@ void GetStandardThreeArgKernel(const Pattern &a,
                   // sorted from greatest to least dim.
                   return a.dims[x] > a.dims[y];
                 });
-      ProcessStandardKernelY(a, b, raxes[0], kernel);
-      ProcessStandardKernelZ(a, b, raxes[1], kernel);
+      ProcessStandardKernelY(a, raxes[0], kernel);
+      ProcessStandardKernelZ(a, raxes[1], kernel);
       raxes_data = &(raxes[0]);
       // The expression {raxes_data + 2, raxes_data + num_axes - 1} is a
       // constructor to ArrayRef which gives an array of ints including raxes[2]
@@ -416,52 +380,98 @@ void GetStandardThreeArgKernel(const Pattern &a,
       // that we haven't already processed, and they should all have fairly
       // small dimension as we've sorted `raxes` from greatest to least
       // dimension.  We'll process these left-over raxes by duplicating the
-      // kernel, shifting the base_offset_{a,b,c} value as needed.
-      FinalizeKernel(a, b, {raxes_data + raxes_data + num_axes - 1},
+      // kernel, shifting the offset_a value as needed.
+      FinalizeKernel(a, {raxes_data + raxes_data + num_axes - 1},
                      raxes.begin  kernel);
       return;
     }
   }
 }
 
-// Convert from 3-arg to 2-arg kernel, discarding information.
-static void ConvertKernel(const StandardThreeArgKernel &src,
-                          StandardTwoArgKernel *dest) {
+// Convert from 1-arg to 3-arg kernel.
+static void ConvertToThreeArgKernel(
+    const Pattern &a,
+    const Pattern &b,
+    const Pattern &c,
+    const StandardOneArgKernel &src,
+    StandardThreeArgKernel *dest) {
   dest->dim_block = src.dim_block;
   dest->dim_grid = src.dim_grid;
-  dest->sizes.thread_stride_a = src.sizes.thread_stride_a;
-  dest->sizes.thread_stride_b = src.sizes.thread_stride_b;
-  dest->sizes.block_stride_a = src.sizes.block_stride_a;
-  dest->sizes.block_stride_b = src.sizes.block_stride_b;
-  dest->sizes.max_offset_a = src.sizes.max_offset_a;
-  dest->base_offset_a = src.base_offset_a;
-  dest->base_offset_b = src.base_offset_b;
+  dest->offset_a = src.offset_a;
+  dest->offset_b = ConvertMindex(a, b, src.offset_a);
+  dest->offset_c = ConvertMindex(a, c, src.offset_a);
+
+  StandardThreeArgKernelSizes &s = dest->sizes;
+  s.thread_stride_a = src.sizes.thread_stride_a;
+  s.block_stride_a = src.sizes.block_stride_a;
+  s.mindex_a_range = src.sizes.mindex_a_range;
+
+  s.thread_stride_b.x = ConvertMindexDifference(a, b, s.thread_stride_a.x);
+  s.thread_stride_b.y = ConvertMindexDifference(a, b, s.thread_stride_a.y);
+  s.thread_stride_b.z = ConvertMindexDifference(a, b, s.thread_stride_a.z);
+  s.block_stride_b.x = ConvertMindexDifference(a, b, s.block_stride_a.x);
+  s.block_stride_b.y = ConvertMindexDifference(a, b, s.block_stride_a.y);
+  s.block_stride_b.z = ConvertMindexDifference(a, b, s.block_stride_a.z);
+
+  s.thread_stride_c.x = ConvertMindexDifference(a, c, s.thread_stride_a.x);
+  s.thread_stride_c.y = ConvertMindexDifference(a, c, s.thread_stride_a.y);
+  s.thread_stride_c.z = ConvertMindexDifference(a, c, s.thread_stride_a.z);
+  s.block_stride_c.x = ConvertMindexDifference(a, c, s.block_stride_a.x);
+  s.block_stride_c.y = ConvertMindexDifference(a, c, s.block_stride_a.y);
+  s.block_stride_c.z = ConvertMindexDifference(a, c, s.block_stride_a.z);
 }
 
-// Convert from 3-arg to 1-arg kernel, discarding information.
-static void ConvertKernel(const StandardThreeArgKernel &src,
-                          StandardTwoArgKernel *dest) {
+// Convert from 1-arg to 2-arg kernel.
+static void ConvertToTwoArgKernel(
+    const Pattern &a,
+    const Pattern &b,
+    const StandardOneArgKernel &src,
+    StandardThreeArgKernel *dest) {
   dest->dim_block = src.dim_block;
   dest->dim_grid = src.dim_grid;
-  dest->sizes.thread_stride_a = src.sizes.thread_stride_a;
-  dest->sizes.block_stride_a = src.sizes.block_stride_a;
-  dest->sizes.max_offset_a = src.sizes.max_offset_a;
-  dest->base_offset_a = src.base_offset_a;
+  dest->offset_a = src.offset_a;
+  dest->offset_b = ConvertMindex(a, b, src.offset_a);
+
+  StandardThreeArgKernelSizes &s = dest->sizes;
+  s.thread_stride_a = src.sizes.thread_stride_a;
+  s.block_stride_a = src.sizes.block_stride_a;
+  s.mindex_a_range = src.sizes.mindex_a_range;
+
+  s.thread_stride_b.x = ConvertMindexDifference(a, b, s.thread_stride_a.x);
+  s.thread_stride_b.y = ConvertMindexDifference(a, b, s.thread_stride_a.y);
+  s.thread_stride_b.z = ConvertMindexDifference(a, b, s.thread_stride_a.z);
+  s.block_stride_b.x = ConvertMindexDifference(a, b, s.block_stride_a.x);
+  s.block_stride_b.y = ConvertMindexDifference(a, b, s.block_stride_a.y);
+  s.block_stride_b.z = ConvertMindexDifference(a, b, s.block_stride_a.z);
+}
+
+
+// Builds the one-arg kernel(s) for `a`, then converts each to a three-arg one.
+void GetStandardThreeArgKernel(const Pattern &a, const Pattern &b,
+                               const Pattern &c,
+                               std::vector<StandardThreeArgKernel> *kernels) {
+  KALDI_PARANOID_ASSERT(a.num_axes >= b.num_axes && a.num_axes >= c.num_axes &&
+                        Broadcastable(a, b) && DimsGeq(a, b) &&
+                        Broadcastable(a, c) && DimsGeq(a, c));
+  std::vector<StandardOneArgKernel> temp_kernels;  // one-arg kernels for `a`
+  GetStandardOneArgKernel(a, &temp_kernels);
+  size_t size = temp_kernels.size();
+  kernels->resize(size);  // one output kernel per one-arg kernel
+  for (size_t i = 0; i < size; i++)
+    ConvertToThreeArgKernel(a, b, c, temp_kernels[i], &((*kernels)[i]));
+}
 
-// Doing a 2-arg kernel by first doing the 3-arg one is of course
-// wasteful
 void GetStandardTwoArgKernel(const Pattern &a,
                              const Pattern &b,
-                             const Pattern &c,
                              std::vector<StandardThreeArgKernel> *kernels) {
+  KALDI_PARANOID_ASSERT(DimsGeq(a, b) && a.num_axes >= b.num_axes &&
+                        Broadcastable(a, b));
-  std::vector<StandardThreeArgKernel> temp_kernels;
-  GetStandardThreeArgKernel(a, b, b, &temp_kernels);
+  std::vector<StandardOneArgKernel> temp_kernels;  // one-arg kernels for `a`
+  GetStandardOneArgKernel(a, &temp_kernels);
   size_t size = temp_kernels.size();
   kernels->resize(size);
   for (size_t i = 0; i < size; i++)
-    ConvertKernel(temp_kernels[i],
-
+    ConvertToTwoArgKernel(a, b, temp_kernels[i], &((*kernels)[i]));
 }
 
 
diff --git a/src/tensor/cuda-utils.h b/src/tensor/cuda-utils.h
index 2c3a737b106..03cbd22afc9 100644
--- a/src/tensor/cuda-utils.h
+++ b/src/tensor/cuda-utils.h
@@ -31,211 +31,77 @@ namespace kaldi {
 namespace tensor {
 
 
-/**
-   These utilities are mostly for use with non-reducing (but possibly
-   broadcasting) kernels.  The setup is: we have two Tensors a and b.  We are
-   doing some operation like, say, a = sigmoid(b) that's non-reducing (no
-   summation) but possibly broadcasting.
-
-   For generality and also (reasonable) speed, we have a standard pattern/interface
-   of kernel for such operations.
-
-
-  void _standard_kernel(StandardTwoArgKernelSizes f, float *a, float *b) {
-    int a_offset_x = f.thread_stride_a.x * threadIdx.x + block_stride_a.x * blockIdx.x,
-      a_offset_yz = f.thread_stride_a.y * threadIdx.y + block_stride_a.y * blockIdx.y +
-                    f.thread_stride_a.z * threadIdx.z + block_stride_a.z * blockIdx.z
-    int b_offset_x = f.thread_stride_b.x * threadIdx.x + block_stride_b.x * blockIdx.x,
-      b_offset_yz = f.thread_stride_b.y * threadIdx.y + block_stride_b.y * blockIdx.y +
-                    f.thread_stride_b.z * threadIdx.z + block_stride_b.z * blockIdx.z
-
-     for (; a_offset_x < f.max_offset_a;
-        a_offset_x += block_stride_a * blockDim.x,
-        b_offset_x += block_stride_b * blockDim.x) {
-     a[a_offset_x + a_offset_yz] = some_func(b[b_offset_x + b_offset_yz]);
-  }
-
-  It's possible to encode a great variety of elementwise operations of up to 6
-  dimensions using the pattern above; the rare cases that can't be handled that
-  way can be handled using multiple invocations of the same kernel.
-
-  We don't make any special allowances for things like matrix transpose, though;
-  in future we may make a special variety of kernel that can handle transposes
-  while using coalesced memory access.
-
-  *Algorithm*.
-
-  We first ensure that the first raxis (raxis=0) of pattern a has the smallest
-  abs(stride).  This is necessary later in certain cases for the loop to work
-  correctly.
 
-  We try various algorithms for generating the kernel info; each one
-  returns a score, and we then select the one that gave the best score.
+struct StandardOneArgKernelSizes {
+  dim3 thread_stride_a;
+  dim3 block_stride_a;
+  dim3 mindex_a_range;
+};
 
 
+class StandardOneArgKernel {
+  dim3 dim_block;
+  dim3 dim_grid;
+  StandardOneArgKernelSizes sizes;
+  // offset_a is an offset that we have to add to the data-pointer of a before
+  // we call the kernel; it will normally equal the 'offset' member of the
+  // pattern, but may be different if we have to generate multiple kernels due
+  // to, say, size constraints.
+  int64 offset_a;
+};
 
 
-  switch(num_axes) {
-    case 0:
+/**
+   This function returns the dimensions/sizes for one or more "standard one arg
+   kernels" to execute a "standard one arg operation" on a Tensor a, of which
+   only the pattern is provided.  We define a standard one-arg operation as an
+   in-place elementwise operation of the form:
 
-    case 1:
-      pretty easy.
-    case 2:
-      copy our matrix code.
-    case 3:
+       a[i] = f(a[i])
 
+   where i is an index-tuple in the index-tuple-set of the pattern a; search in
+   pattern.h for the meaning of this notation.  Note: one-arg kernels may not
+   actually be needed in practice as two-arg kernels with a and b identical
+   can do the same thing.
 
+   The standard one-arg kernel is as follows:
+<code>
+template <typename T>
+__global__ void _some_one_arg_kernel(StandardOneArgKernelSizes f, T *a) {
+    int a_offset_x = f.thread_stride_a.x * threadIdx.x + f.block_stride_a.x * blockIdx.x,
+      a_offset_y = f.thread_stride_a.y * threadIdx.y + f.block_stride_a.y * blockIdx.y,
+      a_offset_z = f.thread_stride_a.z * threadIdx.z + f.block_stride_a.z * blockIdx.z,
+      a_offset = a_offset_x + a_offset_y + a_offset_z;
 
+     if (a_offset_x < f.mindex_a_range.x && a_offset_y < f.mindex_a_range.y)
+       a[a_offset] = some_func(a[a_offset]);
   }
 
-  Simplest algorithm, applicable for up to 4 axes and if 1st axis is >= 256:
-
-
-
-
-  : 1st axis gets allocated to
-  block-dim x and spills over if necessary into grid-dim x.  Remaining dims go
-  into grid-dim x if not used, then grid-dims y and z.
-
-     Measure on:
-        coalesced memory access (no, should always have this).
-
-        - Loop length
-
-
-        number of blocks; want no more than about 1024
-        kernel size (prefer around 256; too small much worse than
-         much worse).
-
-
-
-  Next algorithm (only applicable if 1st dim is between 32 and 1024 and there are
-  >=2 dims, and the product of the remaining dims is >1024:
-
-  Make the loop be over one of those remaining dims.
-
-
-  We assume raxis 0 of a has stride 1, which it will if any dim had
-  stride 1.
-
-
-
-
-  The loop with `a_offset_x < f.max_offset_a` allows us to cover several elements with
-  one kernel (reducing kernel startup cost) and also makes it possible to fit
-
-
-
-
-  Depends on the dim...
-    Only one dim:
-       Type 1 kernel using a while loop and the if-statement,
-       with only the x dimension; thread block size = 128,
-       number of blocks no greater than 1024.
-
-
-   Two dims: # Note: we are assuming dim 0 is the one with stride=1 (if any).
-     Make sure that, for a, the first axis dominates the second.  (c.f.
-     axis-dominance property).
-
-     If first dim >= 64  # first dim alone will be the thread dim.
-       if (first_dim < 200) { // purposely between powers of 2.
-          threadDim.x = first_dim;
-          blockDim.x = 1
-          threadDim.y = 1
-          blockDim.y = second_dim;  # Use multiple kernels if limit of
-                                    # 65536 is an issue.
-       } else {
-         # assume 2nd dim becomes blockDim.y; work out the max num-blocks
-         # we might want of 1st dim.
-
-         # break up first_dim into blocks of 128;
-         # use the num-blocks given above if it's limiting,
-         # and loop for the rest.
-       }
-    else (first_dim < 64),
-       swap the x and y axes; use 256/first_dim to limit thread-block
-       size.
-
-
-    More than two dims (up to 5).
-       Sort dims from smallest to greatest stride.
-
-       First dim maps to 1st dimension.  If it is
-       <128, we'll have to augment the thread block size
-       with another dim.
-         - First choice: find a dim whose product
-           with the first dim is <1024, and if one
-           exists, take the closest one to 256.
-         - Second choice: take the next-smallest-stride
-           dim, choose 256/first_dim as the thread
-           block size, and put the rest of it in
-           the grid size. [would go to x, while the
-           1st choice goes to the y.]
-
-       Now iterate through the
-
-
-
-
-       If first dim is small, increase block size with another
-       dim.  Choose smallest remaining dims as blockDim.y and blockDim.z,
-       as long as num-blocks < 1024.
-
-
-       (Choose one that gives num-blocks between 128
-       and 1024 if already present; otherwise split one of the
-       dims and put it as y).
-
-       Put any remaining dims as gridDim.y and gridDim.z.
-       If this isn't enough, use multiple kernel launches
-       by (initially) iterating over the smallest dim.
-
-
-
-
-
-   Sometimes we can handle something by launching two type 1 kernels, or
-   a type 2 kernel
-
-   Type 2 kernels use the x, y and z dimensions of
-   grids and blocks
-
-
-   First: we define type 1 kernel as a non-reducing (but possibly broadcasting)
-   operation between two Tensors, e.g. a = b or a = sigmoid(b).  This is
-   a rather general type of kernel that can be used as the generic case
-   (applicable to arbitrary tensors).
+  // which would be invoked as follows:
+  template <typename T>
+  void some_one_arg_kernel(const Tensor &a,
+                               const StandardOneArgKernel &k) {
+    _some_one_arg_kernel<<<k.grid_dim, k.block_dim>>>(
+         k.sizes,
+         a.GetData<T>() + k.offset_a);
+  }
+  //
 
-   The KernelInfo is the part that needs to be passed into the
-   kernel itself.  There are also two other things needed to launch the
-   kernel:
-<code>
-      dim3 grid_dim, block_dim;
 </code>
+  }
 
-   The basic operation we'll do in the kernel is something like this;
-   let 'a' and 'b' be pointers to float or something like that.  Let
-<code>
-    KernelInfo f;  // passed in.
-    int x_offset_a = f.thread_stride_a.x * threadIdx.x + f.block_stride_a.x * blockIdx.x;
-       y_offset_a = f.thread_stride_a.y * threadIdx.y + f.block_stride_a.y * blockIdx.y;
-       z_offset_a = f.thread_stride_a.z * threadIdx.z + f.block_stride_a.z * blockIdx.z;
-    // and similar statements to set x_offset_b, y_offset_b, z_offset_b.
-
-    if (x_offset_a < f.max_offset_a.x &&
-        y_offset_a < f.max_offset_a.y &&
-        z_offset_a < f.max_offset_a.z)
-      a[x_offset_a + y_offset_a + z_offset_a] =
-          b[x_offset_b + y_offset_b + z_offset_b];
-
-
-       thread_stride_a.y * threadIdx.y +
-       thread_stride_a.z * threadIdx.z +
-       block_stride_a.x * blockIdx.x
-    // clock speed e.g. 3 gHz.  Say 100 instructions.
-</code>
+      @param [in] a   Pattern for which we want the kernel (or kernels).  This is
+                      an elementwise operation so it must be in-place.
+                      All its strides are required to be positive (hence it may
+                      not have trivial axes).
+      @param [out] kernels  The kernels are output to this vector.  Normally,
+                      we'll have `kernels->size() == 1` at exit.  The user is expected
+                      to call all of them (the order doesn't matter).
  */
+void GetStandardOneArgKernel(const Pattern &a,
+                             std::vector<StandardOneArgKernel> *kernels);
+
+
 
 
 struct StandardTwoArgKernelSizes {
@@ -243,7 +109,7 @@ struct StandardTwoArgKernelSizes {
   dim3 thread_stride_b;
   dim3 block_stride_a;
   dim3 block_stride_b;
-  dim3 max_offset_a;
+  dim3 mindex_a_range;
 };
 
 
@@ -252,11 +118,12 @@ class StandardTwoArgKernel {
   dim3 dim_grid;
   StandardTwoArgKernelSizes sizes;
   // offset_a and offset_b are offsets that we have to add to the data-pointers
-  // of a and b before we call the kernel; these will normally be zero, but may
-  // be nonzero if we have to generate multiple kernels due to, say, size
+  // of a and b before we call the kernel; these will normally equal the
+  // 'offset' members of the respective patterns, but they may be different from
+  // those if we have to generate multiple kernels due to, say, size
   // constraints.
-  int64 base_offset_a{0};
-  int64 base_offset_b{0};
+  int64 offset_a;
+  int64 offset_b;
 };
 
 
@@ -274,11 +141,10 @@ class StandardTwoArgKernel {
    dims of b (i.e.: no reduction).  We also require a.num_axes >= b.num_axes,
    which results from the tuple (a,b) having been reduced (see ReducePatternTuple()
    in pattern-tuple-utils.h).
-   The standard two-arg kernel is as follows, exemplifying it with the
-   function "some_func".
+   The standard two-arg kernel is as follows:
 <code>
 template <typename T>
-  void _standard_two_arg_kernel(StandardTwoArgKernelSizes f, T *a, const T *b) {
+__global__ void _some_two_arg_kernel(StandardTwoArgKernelSizes f, T *a, const T *b) {
     int a_offset_x = f.thread_stride_a.x * threadIdx.x + block_stride_a.x * blockIdx.x,
       a_offset_y = f.thread_stride_a.y * threadIdx.y + block_stride_a.y * blockIdx.y,
       a_offset_z = f.thread_stride_a.z * threadIdx.z + block_stride_a.z * blockIdx.z;
@@ -286,24 +152,29 @@ template <typename T>
                    f.thread_stride_b.y * threadIdx.y + block_stride_b.y * blockIdx.y +
                    f.thread_stride_b.z * threadIdx.z + block_stride_b.z * blockIdx.z;
 
-     if (a_offset_x < f.max_offset_a.x && a_offset_y < f.max_offset_a.y)
+     if (a_offset_x < f.mindex_a_range.x && a_offset_y < f.mindex_a_range.y)
        a[a_offset_x + a_offset_y + a_offset_z] = some_func(b[b_offset]);
   }
 
   // which would be invoked as follows:
   template <typename T>
-  void standard_two_arg_kernel(const Tensor &a, const Tensor &b,
+  void some_two_arg_kernel(const Tensor &a, const Tensor &b,
                                const StandardTwoArgKernel &k) {
-    _standard_two_arg_kernel<<<k.grid_dim, k.block_dim>>>(
+    _some_two_arg_kernel<<<k.grid_dim, k.block_dim>>>(
+         k.sizes,
          a.GetData<T>() + k.base_offset_a,
          b.GetData<T>() + k.base_offset_b);
   }
 
 </code>
-  }
+  There is also a way to invoke two-arg kernels "in-place" so that the function
+  takes two args, like a = f(a, b).
 
-      @param [in] a   First pattern for which we want the kernel (or kernels)
+      @param [in] a   First pattern for which we want the kernel (or kernels).
+                      All its strides are required to be positive (hence it may
+                      not have trivial axes).
       @param [in] b   Second pattern for which we want the kernel (or kernels)
+                      Must satisfy Broadcastable(a, b).
       @param [out] kernels  The kernels are output this vector.  Normally,
                       we'll have `kernels->size() == 1` at exit.  The user is expected
                       to call all of them (the order doesn't matter).
@@ -323,20 +194,22 @@ class StandardThreeArgKernelSizes {
   dim3 block_stride_b;
   dim3 block_stride_c;
 
-  dim3 max_offset_a;
+  dim3 mindex_a_range;
 };
 
 class StandardThreeArgKernel {
   dim3 dim_block;
   dim3 dim_grid;
   StandardTwoArgKernelSizes sizes;
+
   // base_offset_{a,b,c} are offsets that we have to add to the data-pointers of
-  // a, b and c before we call the kernel; these will normally be zero, but may
-  // be nonzero if we have to generate multiple kernels due to, say, size
+  // the storage regions of a, b and c before we call the kernel; these will
+  // normally equal the 'offset' members of the input patterns, but they may
+  // differ from those if we have to generate multiple kernels due to, say, size
   // constraints.
-  int64 base_offset_a{0};
-  int64 base_offset_b{0};
-  int64 base_offset_c{0};
+  int64 base_offset_a;
+  int64 base_offset_b;
+  int64 base_offset_c;
 };
 
 
@@ -356,12 +229,11 @@ class StandardThreeArgKernel {
    tuple (a,b,c) having been reduced (see ReducePatternTuple() in
    pattern-tuple-utils.h).
 
-   The standard three-arg kernel is as follows, exemplifying it with the
-   function "some_func".
+   The standard three-arg kernel is as follows:
 <code>
 template <typename T>
-  void _standard_three_arg_kernel(StandardThreeArgKernelSizes f,
-                                  T *a, const T *b, const T *c) {
+  void _some_three_arg_kernel(StandardThreeArgKernelSizes f,
+                              T *a, const T *b, const T *c) {
     int a_offset_x = f.thread_stride_a.x * threadIdx.x + block_stride_a.x * blockIdx.x,
       a_offset_y = f.thread_stride_a.y * threadIdx.y + block_stride_a.y * blockIdx.y,
       a_offset_z = f.thread_stride_a.z * threadIdx.z + block_stride_a.z * blockIdx.z;
@@ -372,16 +244,17 @@ template <typename T>
                    f.thread_stride_c.y * threadIdx.y + block_stride_c.y * blockIdx.y +
                    f.thread_stride_c.z * threadIdx.z + block_stride_c.z * blockIdx.z;
 
-     if (a_offset_x < f.max_offset_a.x && a_offset_y < f.max_offset_a.y)
+     if (a_offset_x < f.mindex_a_range.x && a_offset_y < f.mindex_a_range.y)
        a[a_offset_x + a_offset_y + a_offset_z] = some_func(b[b_offset], c[c_offset]);
   }
 
   // which would be invoked as follows:
   template <typename T>
-  void standard_three_arg_kernel(const Tensor &a, const Tensor &b,
+  void some_three_arg_kernel(const Tensor &a, const Tensor &b,
                                  const Tensor &c,
                                  const StandardThreeArgKernel &k) {
-    _standard_three_arg_kernel<<<k.grid_dim, k.block_dim>>>(
+    _some_three_arg_kernel<<<k.grid_dim, k.block_dim>>>(
+         k.sizes,
          a.GetData<T>() + k.base_offset_a,
          b.GetData<T>() + k.base_offset_b,
          c.GetData<T>() + k.base_offset_c);
@@ -389,9 +262,11 @@ template <typename T>
 </code>
   }
 
-      @param [in] a   First pattern for which we want the kernel (or kernels)
+      @param [in] a   First pattern for which we want the kernel (or kernels).
+                      All its strides are required to be positive (hence it may
+                      not have trivial axes).
       @param [in] b   Second pattern for which we want the kernel (or kernels)
-      @param [in] c   Second pattern for which we want the kernel (or kernels)
+      @param [in] c   Third pattern for which we want the kernel (or kernels)
       @param [out] kernels  The kernels are output this vector.  Normally,
                       we'll have `kernels->size() == 1` at exit.  The user is expected
                       to call all of them (the order doesn't matter).
diff --git a/src/tensor/linear-cpu-ops.h b/src/tensor/linear-cpu-ops.h
index b80a37c6d17..a30a517395d 100644
--- a/src/tensor/linear-cpu-ops.h
+++ b/src/tensor/linear-cpu-ops.h
@@ -34,12 +34,12 @@ namespace tensor {
    Does a += b for a and b both scalar, on CPU.
  */
 template <class T>
-class ScalarPlusEqScalarOp<T, kCpuDevice>: public Op {
+class ScalarPlusEqScalarCpuOp<T>: public Op {
 
   ScalarPlusEqScalarOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
 
   Op *Copy() {
-    return new ScalarPlusEqScalar<T, kCpuDevice>(a_, b_);
+    return new ScalarPlusEqScalar<T>(a_, b_);
   }
 
   void Do() {
@@ -63,14 +63,14 @@ class ScalarPlusEqScalarOp<T, kCpuDevice>: public Op {
    template for float and double, to use BLAS calls
 */
 template <class T>
-class StvectorPlusEqStvectorOp<T, kCpuDevice>: public Op {
+class StvectorPlusEqStvectorCpuOp<T>: public Op {
 
   StvectorPlusEqStvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
 
   int32 Properties() { return kConcreteOp; }
 
   Op *Copy() {
-    return new StvectorPlusEqStvectorOp<T, kCpuDevice>(a_, b_);
+    return new StvectorPlusEqStvectorCpuOp<T>(a_, b_);
   }
 
   void Do() {
@@ -105,11 +105,11 @@ class StvectorPlusEqStvectorOp<T, kCpuDevice>: public Op {
 
 // override for float that uses BLAS
 template <>
-class StvectorPlusEqStvectorOp<float, kCpuDevice>: public Op {
+class StvectorPlusEqStvectorCpuOp<float>: public Op {
   SvectorPlusEqSvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
   int32 Properties() { return kConcreteOp; }
   Op *Copy() {
-    return new SvectorPlusEqSvectorOp<float, kCpuDevice>(a_, b_);
+    return new SvectorPlusEqSvectorCpuOp<float>(a_, b_);
   }
   void Do() {
     const Pattern &a_pattern = a_.Pattern(),
@@ -137,11 +137,11 @@ class StvectorPlusEqStvectorOp<float, kCpuDevice>: public Op {
 
 // override for double that uses BLAS
 template <>
-class StvectorPlusEqStvectorOp<double, kCpuDevice>: public Op {
+class StvectorPlusEqStvectorCpuOp<double>: public Op {
   SvectorPlusEqSvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
   int32 Properties() { return kConcreteOp; }
   Op *Copy() {
-    return new SvectorPlusEqSvectorOp<double, kCpuDevice>(a_, b_);
+    return new SvectorPlusEqSvectorCpuOp<double>(a_, b_);
   }
   void Do() {
     const Pattern &a_pattern = a_.Pattern(),
@@ -181,13 +181,13 @@ class StvectorPlusEqStvectorOp<double, kCpuDevice>: public Op {
    template for float and double, to use BLAS calls.
 */
 template <class T>
-class ScalarPlusEqStvectorOp<T, kCpuDevice>: public Op {
+class ScalarPlusEqStvectorCpuOp<T>: public Op {
 
   StvectorPlusEqStvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
 
   int32 Properties() { return kConcreteOp; }
 
-  Op *Copy() { return new ScalarPlusEqStvectorOp<T, kCpuDevice>(a_, b_); }
+  Op *Copy() { return new ScalarPlusEqStvectorCpuOp<T>(a_, b_); }
 
   void Do() {
     DebugNormalOp(a, kReadWrite, b_, kRead);
@@ -211,12 +211,12 @@ class ScalarPlusEqStvectorOp<T, kCpuDevice>: public Op {
 
 // Override for T = float.
 template <>
-class ScalarPlusEqStvectorOp<float, kCpuDevice>: public Op {
+class ScalarPlusEqStvectorCpuOp<float>: public Op {
   StvectorPlusEqStvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
 
   int32 Properties() { return kConcreteOp; }
 
-  Op *Copy() { return new ScalarPlusEqStvectorOp<float, kCpuDevice>(a_, b_); }
+  Op *Copy() { return new ScalarPlusEqStvectorCpuOp<float>(a_, b_); }
 
   void Do() {
     DebugNormalOp(a, kReadWrite, b_, kRead);
@@ -234,12 +234,12 @@ class ScalarPlusEqStvectorOp<float, kCpuDevice>: public Op {
 
 // Override for T = double
 template <>
-class ScalarPlusEqStvectorOp<double, kCpuDevice>: public Op {
+class ScalarPlusEqStvectorCpuOp<double>: public Op {
   ScalarPlusEqStvectorOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
 
   int32 Properties() { return kConcreteOp; }
 
-  Op *Copy() { return new ScalarPlusEqStvectorOp<double, kCpuDevice>(a_, b_); }
+  Op *Copy() { return new ScalarPlusEqStvectorCpuOp<double>(a_, b_); }
 
   void Do() {
     DebugNormalOp(a, kReadWrite, b_, kRead);
@@ -262,12 +262,12 @@ class ScalarPlusEqStvectorOp<double, kCpuDevice>: public Op {
    May not be used if a and b overlap.
 */
 template <class T>
-class StvectorPlusEqScalarOp<T, kCpuDevice>: public Op {
+class StvectorPlusEqScalarCpuOp<T>: public Op {
   StvectorPlusEqScalarOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
 
   int32 Properties() { return kConcreteOp; }
 
-  Op *Copy() { return new StvectorPlusEqScalarOp<T, kCpuDevice>(a_, b_); }
+  Op *Copy() { return new StvectorPlusEqScalarCpuOp<T>(a_, b_); }
 
   void Do() {
     const Pattern &a_pattern = a_.Pattern(),
@@ -281,11 +281,13 @@ class StvectorPlusEqScalarOp<T, kCpuDevice>: public Op {
     if (uninitialized) {
       DebugNormalOp(a, kWrite, b_, kRead);
       T b = *b_data;
+#pragma unroll (4)
       for (int32 i = 0; i < dim; i++)
         a_data[i * a_stride] = b;
     } else {
       DebugNormalOp(a, kReadWrite, b_, kRead);
       T b = *b_data;
+#pragma unroll (4)
       for (int32 i = 0; i < dim; i++)
         a_data[i * a_stride] += b;
     }
@@ -295,9 +297,59 @@ class StvectorPlusEqScalarOp<T, kCpuDevice>: public Op {
 };
 
 
+/**
+   Operation doing a += b where a is a vector or strided vector (implicitly
+   interpreted as a row vector) and b is a matrix, so it sums up the rows of
+   the matrix.
+
+   May not be used if a and b overlap.
+*/
+template <class T>
+class StvectorPlusEqMatrixCpuOp<T>: public Op {
+  StvectorPlusEqMatrixCpuOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
+
+  int32 Properties() { return kConcreteOp; }
+
+  Op *Copy() { return new StvectorPlusEqMatrixCpuOp<T>(a_, b_); }
+
+  void Do() {
+    const Pattern &a_pattern = a_.Pattern(),
+        &b_pattern = b_.Pattern();
+    int32 a_dim = a_pattern.dims[0],
+        b_num_cols = b_pattern.dims[0],
+        b_num_rows = b_pattern.dims[1],
+        a_stride = a_pattern.strides[0],
+        b_stride =  b_pattern.strides[1];
+    KALDI_PARANOID_ASSERT(b_pattern.strides[0] == 1 &&
+                          a_dim == b_num_cols);
+
+    bool uninitialized;
+    T *a_data = a_.GetData<T>(&uninitialized),
+        *b_data = a_.GetData<T>();
+
+    if (uninitialized) {
+      DebugNormalOp(a, kWrite, b_, kRead);
+      T b = *b_data;
+#pragma unroll (4)
+      for (int32 i = 0; i < dim; i++)
+        a_data[i * a_stride] = b;
+    } else {
+      DebugNormalOp(a, kReadWrite, b_, kRead);
+      T b = *b_data;
+#pragma unroll (4)
+      for (int32 i = 0; i < dim; i++)
+        a_data[i * a_stride] += b;
+    }
+  }
+  Tensor a_;
+  Tensor b_;
+};
+
+
+
 
 }  // namespace tensor
 }  // namespace kaldi
 
 
-#endif  // KALDI_TENSOR__LINEAR_OPS_H_
+#endif  // KALDI_TENSOR_LINEAR_OPS_H_
diff --git a/src/tensor/linear-cpu-ref-ops.h b/src/tensor/linear-cpu-ref-ops.h
index f7d60329ba9..7e4ddab9297 100644
--- a/src/tensor/linear-cpu-ref-ops.h
+++ b/src/tensor/linear-cpu-ref-ops.h
@@ -34,8 +34,8 @@ namespace tensor {
 
 // Corresponds to the command a += b.
 template <typename T>
-class PlusEqRefOp: public Op {
-  PlusEqRefOp(const Tensor &a, const Tensor &b):
+class PlusEqCpuRefOp: public Op {
+  PlusEqCpuRefOp(const Tensor &a, const Tensor &b):
       a_(a), b_(b) {
     KALDI_ASSERT(!Overlap(a, b) && BroadcastableAndCompatible(a, b));
   }
@@ -43,7 +43,7 @@ class PlusEqRefOp: public Op {
   int32 Properties() { return kConcreteOp ; }
 
   Op *Copy() const override {
-    return new PlusEqRefOp<T>(a_, b_);
+    return new PlusEqCpuRefOp<T>(a_, b_);
   }
 
   void Do() const override {
@@ -96,8 +96,14 @@ class SetZeroRefOp: public Op {
     int32 dim = a_.dims[raxis],
         stride = a_.strides[raxis];
     if (raxis == 0) {
-      for (int32 i = 0; i < dim; i++) {
-        a[i * a_stride] = 0;
+      // If stride is 1 the elements are contiguous, so use memset.
+      if (stride == 1) {
+        std::memset(a, 0, dim * sizeof(T));
+      } else {
+#pragma unroll (4)
+        for (int32 i = 0; i < dim; i++) {
+          a[i * a_stride] = 0;
+        }
       }
     } else {
       for (int32 i = 0; i < dim; i++) {
@@ -112,8 +118,8 @@ class SetZeroRefOp: public Op {
 // T is the data-type of a, U is the data-type of b;
 // this Op supports type conversion.
 template <typename T, typename U>
-class AssignRefOp: public Op {
-  PlusEqRefOp(const Tensor &a, const Tensor &b):
+class AssignCpuRefOp: public Op {
+  AssignCpuRefOp(const Tensor &a, const Tensor &b):
       a_(a), b_(b) {
     // The DimsGeq() makes sure there is no summation, as this version of the op
     // does not support summation.
@@ -125,7 +131,7 @@ class AssignRefOp: public Op {
   int32 Properties() { return kConcreteOp ; }
 
   Op *Copy() const override {
-    return new PlusEqRefOp<T>(a_, b_);
+    return new AssignCpuRefOp<T>(a_, b_);
   }
 
   void Do() const override {
@@ -141,6 +147,7 @@ class AssignRefOp: public Op {
     int32 dim = std::max<int32>(a_.dims[raxis], b_.dims[raxis]),
         a_stride = a_.strides[raxis], b_stride = b_.strides[raxis];
     if (raxis == 0) {
+#pragma unroll (4)
       for (int32 i = 0; i < dim; i++) {
         a[i * a_stride] = static_cast<T>(b[i * b_stride]);
       }
diff --git a/src/tensor/linear-gpu-ops.h b/src/tensor/linear-cuda-ops.h
similarity index 96%
rename from src/tensor/linear-gpu-ops.h
rename to src/tensor/linear-cuda-ops.h
index b80a37c6d17..21884f98459 100644
--- a/src/tensor/linear-gpu-ops.h
+++ b/src/tensor/linear-cuda-ops.h
@@ -1,4 +1,4 @@
-// tensor/linear-cpu-ops.h
+// tensor/linear-cuda-ops.h
 
 // Copyright      2019  Johns Hopkins University (author: Daniel Povey)
 
@@ -17,8 +17,9 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef KALDI_TENSOR_LINEAR_SPECIAL_OPS_H_
-#define KALDI_TENSOR_LINEAR_SPECIAL_OPS_H_ 1
+#ifndef KALDI_TENSOR_LINEAR_CUDA_OPS_H_
+#define KALDI_TENSOR_LINEAR_CUDA_OPS_H_ 1
+#if HAVE_CUDA == 1
 
 #include "tensor/tensor.h"
 #include "tensor/linear-special-ops.h"
@@ -30,20 +31,25 @@
 namespace kaldi {
 namespace tensor {
 
-/**
-   Does a += b for a and b both scalar, on CPU.
- */
+
+//    Does a += b for a and b both scalar, on GPU
 template <class T>
-class ScalarPlusEqScalarOp<T, kCpuDevice>: public Op {
+class ScalarPlusEqScalarOp<T, kGpuDevice>;
+
+
+template<>
+class ScalarPlusEqScalarOp<T, kGpuDevice>;
+
 
   ScalarPlusEqScalarOp(const Tensor &a, const Tensor &b): a_(a), b_(b) { }
 
   Op *Copy() {
-    return new ScalarPlusEqScalar<T, kCpuDevice>(a_, b_);
+    return new ScalarPlusEqScalar<T, kGpuDevice>(a_, b_);
   }
 
   void Do() {
     DebugNormalOp(a, kReadWrite, b_, kRead);
+
     *a_.GetData<T>() += *b_.GetData<T>();
   }
 
@@ -299,5 +305,5 @@ class StvectorPlusEqScalarOp<T, kCpuDevice>: public Op {
 }  // namespace tensor
 }  // namespace kaldi
 
-
+#endif  // HAVE_CUDA == 1
 #endif  // KALDI_TENSOR__LINEAR_OPS_H_
diff --git a/src/tensor/linear-ops.cc b/src/tensor/linear-ops.cc
index db0fd32e95f..18c4b3f5bfe 100644
--- a/src/tensor/linear-ops.cc
+++ b/src/tensor/linear-ops.cc
@@ -22,9 +22,15 @@
 namespace kaldi {
 namespace tensor {
 
+
 void PlusEqOp::Expand(std::vector<std::unique_ptr<Op> > *ops) const {
+  if (a_.DeviceType() == kCpuDevice) ExpandCpu(ops);
+  else ExpandCuda(ops);
+}
+
+void PlusEqOp::ExpandCpu(std::vector<std::unique_ptr<Op> > *ops) const {
   Op *new_op;
-  if (ReferenceMode() && a_.DeviceType() == kCpuDevice) {
+  if (ReferenceMode()) {
     // In reference mode on CPU always use the reference implementation.
     // Reference mode is only supported on CPU so we use the normal Ops
     // on GPU.
@@ -33,35 +39,31 @@ void PlusEqOp::Expand(std::vector<std::unique_ptr<Op> > *ops) const {
     return;
   }
 
-  // The generic implementation requires us to first normalize the patterns.
+  // The implementation requires us to first reduce the patterns,
+  // so we don't have too many combinations of codes to handle.
   Pattern a_pattern = a_.Impl().pattern,
       b_pattern = b_.Impl().pattern;
-  NormalizePatterns({a_pattern, b_pattern});
-
-  KALDI_ASSERT(Compatible(a_, b_));  // dtype and device, check they match.
+  ReducePatterns({a_pattern, b_pattern});
 
+  // The few lines below construct Tensors a and b which have the same data as
+  // a_ and b_, but with reduced patterns; we use a_ and b_ directly if the
+  // reduction made no difference.
   Tensor a(a_), b(b_);
-
   if (a_pattern != a_.Impl().pattern)
     a = WithPattern(a, a_pattern);
   if (b_pattern != b_.Impl().pattern)
     b = WithPattern(b, b_pattern);
 
-  /*
-    The case-statement values in the switch statement below may be interpreted
-    in groups of 3 hex characters, are 0xAAABBB, pertaining to Tensors a and b
-    respectively.  See GetPatternCode() in pattern-utils.h for documentation on
-    the meanings of the values and our notation with X,x,1.
 
-  */
   int64 combined_code = CombineCodes(a_pattern.GetCode(),
                                      b_pattern.GetCode());
 
   /*
-    The case-statement values in the switch statement below may be interpreted
-    in groups of 3 hex characters, are 0xAAABBB, pertaining to Tensors a and b.
-    See ComputePatternCode() in pattern-utils.h for documentation on the meanings of
-    the values and our notation with X,x,1.
+    'combined_code' may be viewed as a hex number 0xAAABBB where AAA is
+    the code of a_pattern and BBB is the code of b_pattern.  See
+    documentation for ComputePatternCode() in pattern-utils.h for
+    more documentation on the meanings of the values and our notation
+    with X,x,1.
        Quick legend:
              X means dim >1, stride = 1
              x means dim >1, stride != 1
@@ -69,34 +71,40 @@ void PlusEqOp::Expand(std::vector<std::unique_ptr<Op> > *ops) const {
                  (Note: the numbers in case-statements below exclude negative
                  strides because bit 11 of the 12-bit chunks would be set if
                  there were a negative stride).
-   */
+       The rightmost position in these (xX)-type notations below is the
+       highest-numbered axis / lowest-numbered raxis.
+  */
+
+
+  // We implemented the blas-like operations for general T as well as the versions
+  // that use BLAS, so we don't need to check if the type is float or double.
 
   // We are doing a += b.
   switch(combined_code) {
     // A scalar += scalar,
     case 0x000000:   // () +=  ()
-      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(), ScalarPlusEqScalarOp, a, b);
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), ScalarPlusEqScalarCpuOp, a, b);
       break;
-    // We may split apart some of the following cases in future.
-    // They all represent, vector += vector.
+      // We may split apart some of the following cases in future.
+      // They all represent, vector += vector.
     case 0x101101:  //  (X) += (X)
     case 0x001001:  //  (x) += (x)
     case 0x101001:  //  (X) += (x)
     case 0x001101:  //  (X) += (x)
-      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(), StvectorPlusEqStvectorOp, a, b);
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), StvectorPlusEqStvectorCpuOp, a, b);
       break;
-    // Scalar += (sum of) vector or strided vector
+      // Scalar += (sum of) vector or strided vector
     case 0x000101:  //  () += (X)
     case 0x000001:  //  () += (X)
-      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(), ScalarPlusEqStvectorOp, a, b);
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), ScalarPlusEqStvectorCpuOp, a, b);
       break;
-    // vector or strided vector += scalar.
-    // We could later split apart the strided and non-strided cases.
+      // vector or strided vector += scalar.
+      // We could later split apart the strided and non-strided cases.
     case 0x101000:  //  (x) += ()
     case 0x001000:  //  (X) += ()
-      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(), StvectorPlusEqScalarOp, a, b);
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), StvectorPlusEqScalarCpuOp, a, b);
       break;
-    // scalar += matrix
+      // scalar += matrix
     case 0x000103: { // () += (xX)
       int32 num_rows = b.Pattern().dims[1];
       // Create a temporary- a column vector, which is what we call
@@ -105,26 +113,113 @@ void PlusEqOp::Expand(std::vector<std::unique_ptr<Op> > *ops) const {
       // Below we do temp += b.  We could use PlusEqOp for this and also for the
       // following reduction, but doing it this way avoids an unnecessary layer
       // of expansion.
-      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(),
-                               ColVectorEqMatrixOp, temp, b);
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(),
+                               ColVectorEqMatrixCpuOp, temp, b);
       ops->push_back(new_op);
       // Normalize the temporary vector so its nontrivial axis is raxis 0, by
       // removing the current raxis 0 and having current raxis 1 shift down.
-      Tensor temp_normalized = Squeeze(temp, 0);
-      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(),
-                               ScalarPlusEqStvectorOp, a, temp_normalized);
+      Tensor temp_normalized = SqueezeR(temp, 0);
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(),
+                               ScalarPlusEqStvectorCpuOp, a,
+                               temp_normalized);
+      break;
     }
-
+    case 0x101103: // (X) += (xX)
+    case 0x001103: // (x) += (xX)
+      // vector += matrix.  Implicitly this is a row vector, since its
+      // nontrivial axis is in the same position as the column axis of the
+      // matrix.  So we are summing the rows of the matrix.
+      SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), StvectorPlusEqMatrix);
 
     default:
-      // Later we can add a more generic implementation that handles arbitrary
-      // patterns.
-      KALDI_ERR << "Unhandled code: " << std::hex << combined_code;
+          // The reference op, which might be slow especially if there is
+          // reduction.  We'll continue trying to add special handling for common
+          // cases.
+          SET_TO_TEMPLATED_OP_ALL(new_op, a_.Dtype(), PlusEqRefCpuOp, a_, b_);
+  }
+    } else {  // CPU, but not float or double.
+      switch (dtype) {
+        case kInt32Dtype:
+          new_op = new PlusEqRefOp<int32>(a_, b_);
+        default:
+          KALDI_ERR << "Unexpected dtype: " << dtype;
+      }
+    }
+    ops->push_back(new_op);
+    return;
+  } else {
+    KALDI_ASSERT(a.DeviceType() == kCuda);
+#if HAVE_CUDA == 1
+    if (a.Dtype() == kFloat || a.Dtype() == kDouble) {
+      // For certain special cases we have a BLAS implementation.
+      switch(combined_code) {
+        // We may split apart some of the following cases in future.
+        // They all represent, vector += vector.
+        case 0x101101:  //  (X) += (X)
+        case 0x001001:  //  (x) += (x)
+        case 0x101001:  //  (X) += (x)
+        case 0x001101:  //  (x) += (X)
+          SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(), StvectorPlusEqStvectorCudaOp, a, b);
+          break;
+          // Scalar += (sum of) vector or strided vector
+        case 0x000101:  //  () += (X)
+        case 0x000001:  //  () += (x)
+          SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(), ScalarPlusEqStvectorCudaOp, a, b);
+          break;
+          // vector or strided vector += scalar.
+          // We could later split apart the strided and non-strided cases.
+        case 0x101000:  //  (X) += ()
+        case 0x001000:  //  (x) += ()
+          SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(), StvectorPlusEqScalarCudaOp, a, b);
+          break;
+          // scalar += matrix
+        case 0x000103: { // () += (xX)
+          int32 num_rows = b.Pattern().dims[1];
+          // Create a temporary- a column vector, which is what we call
+          // a vector whose nontrivial axis is raxis 1 instead of raxis 0.
+          Tensor temp({num_rows, 1}, {a.Dtype(), a.Device()});
+          // Below we do temp += b.  We could use PlusEqOp for this and also for the
+          // following reduction, but doing it this way avoids an unnecessary layer
+          // of expansion.
+          SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(),
+                                       ColVectorEqMatrixOp, temp, b);
+          ops->push_back(new_op);
+          // Normalize the temporary vector so its nontrivial axis is raxis 0, by
+          // removing the current raxis 0 and having current raxis 1 shift down.
+          Tensor temp_normalized = Squeeze(temp, 0);
+          SET_TO_TEMPLATED_OP_REAL(new_op, a.Dtype(), a.DeviceType(),
+                                   ScalarPlusEqStvectorOp, a, temp_normalized);
+        }
+      }
+#else
+      KALDI_ERR << "You have not compiled for CUDA but are trying to use GPU."
+          "Please configure for GPU use and recompile."
+#endif  //  HAVE_CUDA == 1
   }
-  ops->push_back(new_op);
 }
 
 
+void PlusEqOp::ExpandCpu(std::vector<std::unique_ptr<Op> > *ops) const {
+  Op *new_op;
+  // The implementation requires us to first normalize the patterns,
+  // so we don't have too many combinations of codes to handle.
+  Pattern a_pattern = a_.Impl().pattern,
+      b_pattern = b_.Impl().pattern;
+  NormalizePatterns({a_pattern, b_pattern});
+
+  // The few lines below construct Tensors a and b which have the same data as
+  // a_ and b_, but with reduced patterns; we use a_ and b_ directly if the
+  // reduction made no difference.
+  Tensor a(a_), b(b_);
+  if (a_pattern != a_.Impl().pattern)
+    a = WithPattern(a, a_pattern);
+  if (b_pattern != b_.Impl().pattern)
+    b = WithPattern(b, b_pattern);
+
+
+
+}
+
 void AssignOp::Expand() const {
   Op *new_op;
 
@@ -146,8 +241,8 @@ void AssignOp::Expand() const {
     // In reference mode on CPU always use the reference implementation.
     // Reference mode is only supported on CPU so we use the normal Ops
     // on GPU.
-    SET_TO_TEMPLATED_CPU_OP_ALLPAIRS(new_op, a_.Dtype(), b.Dtype(),
-                                     AssignRefOp, a_, b_);
+    SET_TO_TEMPLATED_OP_ALLPAIRS(new_op, a_.Dtype(), b.Dtype(),
+                                 AssignRefOp, a_, b_);
     ops->push_back(new_op);
     return;
   }
diff --git a/src/tensor/linear-ops.h b/src/tensor/linear-ops.h
index f518671a083..8d5deffc02b 100644
--- a/src/tensor/linear-ops.h
+++ b/src/tensor/linear-ops.h
@@ -17,8 +17,8 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef KALDI_TENSOR__LINEAR_OPS_H_
-#define KALDI_TENSOR__LINEAR_OPS_H_ 1
+#ifndef KALDI_TENSOR_LINEAR_OPS_H_
+#define KALDI_TENSOR_LINEAR_OPS_H_ 1
 
 #include "tensor/tensor.h"
 
@@ -81,6 +81,11 @@ class PlusEqOp: public Op {
 
 
  private:
+  // The implementation of Expand() is complicated so we split it
+  // into two separate functions.
+  void ExpandCpu(std::vector<std::unique_ptr<Op> > *ops) const;
+  void ExpandCuda(std::vector<std::unique_ptr<Op> > *ops) const;
+
   Tensor a_;
   Tensor b_;
 };
diff --git a/src/tensor/pattern-utils.h b/src/tensor/pattern-utils.h
index 9ce86cdac1d..63e6a6c0165 100644
--- a/src/tensor/pattern-utils.h
+++ b/src/tensor/pattern-utils.h
@@ -483,7 +483,7 @@ bool SameStrides(const Pattern &a,
 
 
 /**
-   Compresses a Pattern by removing or combining as many axes as possible.
+   Reduces a Pattern by removing or combining as many axes as possible.
    This version is suitable for operations that do not rely on any kind of
    structure, such as zeroing or nonlinearities; the only equivalence maintained
    is equivalence of the set of memory locations covered (the memory-index-set).
@@ -491,7 +491,7 @@ bool SameStrides(const Pattern &a,
    output.  The output (dim,stride) pairs will be ordered from
    greatest to least stride (note: all output strides will be positive).
 
-      @param [in,out]  pattern   The pattern to be compressed
+      @param [in,out]  pattern   The pattern to be reduced
 
    Examples are below, where we write a Pattern as
 
@@ -512,7 +512,7 @@ bool SameStrides(const Pattern &a,
    {{2,3,4},{100,4,1}}        {{2,12},{100,1}}
 \endverbatim
  */
-void CompressOnePattern(Pattern *pattern);
+void ReduceOnePattern(Pattern *pattern);
 
 
 
@@ -541,7 +541,7 @@ int32 RaxisWithSmallestAbsStride(const Pattern &pattern);
 
 // TODO: document this.
 inline void CanonicalizePattern(Pattern *pattern) {
-  CompressOnePattern(pattern);
+  ReduceOnePattern(pattern);
   SortAxes(pattern);
 }
 
@@ -591,22 +591,22 @@ class PatternHasher {
 
 
 /*
-  CompressTwoPatterns() is a special case of CompressPatterns() where there
-  are exactly two patterns to be jointly compressed.  See documentation of
-  CompressPatterns() for explanation.
+  ReduceTwoPatterns() is a special case of ReducePatterns() where there
+  are exactly two patterns to be jointly reduced.  See documentation of
+  ReducePatterns() for explanation.
 */
-void CompressTwoPatterns(Pattern *a,
+void ReduceTwoPatterns(Pattern *a,
                          Pattern *b);
 
 
 /**
-   Compresses a Pattern by removing or combining as many axes as possible,
+   Reduces a Pattern by removing or combining as many axes as possible,
    while preserving the memory-index-set of the pattern (see glossary for
    explanation), and also while respecting certain invariances that are relevant
    when constructing 'views' ('view' is PyTorch terminology; the NumPy
    equivalent is 'reshape').  The "C" in the function name refers to C-style
    arrays.  Basically what this function does is a highly restricted subset
-   of what CompressOnePattern() does.
+   of what ReduceOnePattern() does.
 
    This function removes axes with dim=1.
 
@@ -641,7 +641,7 @@ void CompressTwoPatterns(Pattern *a,
    {2,3,4},{100,-4,-1}        {{2,12},{100,-1}}
 \endverbatim
  */
-void CompressPatternC(Pattern *p);
+void ReducePatternC(Pattern *p);
 
 
 
@@ -668,10 +668,10 @@ void CompressPatternC(Pattern *p);
 
 
    Notes on implementation (glossing over ones in 'dims' which are easy to
-   handle as a special case): we would first call CompressPattern on
+   handle as a special case): we would first call ReducePattern on
    'pattern_in'.  Then we would attempt to find a correspondence with
-   the dimensions of this compressed pattern and a partition of the
-   sequence 'dims'.  For example, suppose the compressed pattern
+   the dimensions of this reduced pattern and a partition of the
+   sequence 'dims'.  For example, suppose the reduced pattern
    is (100, 9) and dims is (50, 2, 3, 3), then the partition would
    be (50, 2), (3, 3).  If this is not possible (e.g. if dims
    had been (30,10,3) instead), we return false.
@@ -734,6 +734,67 @@ void Select(int32 eaxis, int32 index,
             const Pattern &src, Pattern *dest);
 
 
+/**
+   Infer an index-tuple from a memory-index m and a pattern p.
+   That is: find the index-tuple i such that p[i] = m
+   and i[r] = 0 in all axes such that p.dims[r] == 1.
+   There is at most one such index-tuple, by the uniqueness property.
+   The numbering used here is by raxis (i.e. the private numbering).
+
+           @param [in] p  Input pattern.  Required to be valid.
+           @param [in] m  The memory-index we are querying.
+           @param [out] index_tuple  On success, the index-tuple will be
+                    written to here (it will have size equal to p.num_axes).
+                    Note: the indexing is by raxis (that is why there
+                    is "R" in the function name).
+                    On failure, the value at exit is undefined.
+           @return  Returns true on success, false if no
+                    such index-tuple existed.
+*/
+bool GetIndexTupleR(const Pattern &p,
+                    int64 m,
+                    std::vector<int32> *index_tuple);
+
+/**
+   Convert a memory-index from one pattern to another.  Specifically, it finds
+   an index-tuple i such that a[i] = mindex_a, and returns mindex_b = b[i], if
+   exactly one such mindex_b exists; otherwise it crashes.  (The caveat about
+   "if exactly one such index exists" has to do with the possibility that there
+   is an raxis r that is trivial for a but not for b).
+
+   These memory-indexes include the 'offset' members of a and b.  For a version
+   that does not include the offset (i.e. is invariant to the offset members),
+   see ConvertMindexDifference.
+
+             @param [in] a     Source Pattern, from which mindex_a is derived
+             @param [in] b     Destination Pattern, which we index to get
+                               the returned mindex
+             @return           Returns the memory-index mindex_b such that
+                               there exists an index-tuple i in the
+                               index-tuple-set of the tuple (a, b) satisfying
+                               a[i] == mindex_a and b[i] == mindex_b.  If it is
+                               not the case that exactly one such memory-index
+                               exists, it is an error and this function may
+                               crash.
+ */
+int64 ConvertMindex(const Pattern &a,
+                    const Pattern &b,
+                    int64 mindex_a);
+
+/**
+   Convert a difference between memory-indexes from one pattern to another.
+   This is equivalent to setting the 'offset' values of a and b to zero and
+   calling ConvertMindex() with the modified args, in cases where that inner
+   call does not crash.  (But this function also generalizes to 'out-of-range'
+   or negative memory-indexes, like a linear continuation of the function).
+
+   See documentation for ConvertMindex() for more explanation.
+*/
+int64 ConvertMindexDifference(const Pattern &a,
+                              const Pattern &b,
+                              int64 mindex_a);
+
+
 /**
    This function returns true if 'pattern' has the same strides
    as 'C' array with the same dimensions would have.  (Note:
diff --git a/src/tensor/pattern.h b/src/tensor/pattern.h
index b091014aede..93e46f2714e 100644
--- a/src/tensor/pattern.h
+++ b/src/tensor/pattern.h
@@ -461,12 +461,17 @@ namespace tensor {
                       necessary (since most BLAS implementations do not support
                       negative stride).
 
-   Uniqueness property:  A property of a Pattern that no two different index-tuples,
-                      when used to index the Pattern, generate the same memory-index.
-                      The axis-dominance property is sufficient, but not necessary,
-                      to ensure the uniqueness property.  (The uniqueness property
-                      is probably not so easy to test for efficiently in the general
-                      case).
+   Uniqueness property:  A property of a Pattern that there do not exist
+                      two index-tuples i1 and i2 which differ in non-trivial
+                      axes of the pattern (i.e. i1[r] != i2[r] for some r that is
+                      a non-trivial raxis of the pattern) and which, when
+                      used to index the Pattern, generate the same memory-index.
+
+                      The axis-dominance property is sufficient, but not
+                      necessary, to ensure the uniqueness property.  (The
+                      uniqueness property is probably not easy to test for
+                      efficiently in the general case where axis dominance does
+                      not hold).
 
     Valid Pattern:
                      A valid Pattern must be as follows.  Think of this as the mathematical definition;

From 935b1514ec3010726bf9eaa3102bc9f025d70c7e Mon Sep 17 00:00:00 2001
From: Dan Povey <dpovey@gmail.com>
Date: Wed, 19 Jun 2019 15:08:39 -0400
Subject: [PATCH 137/163] [src] Minor changes / fixes

---
 src/cblasext/cblas-extensions.cc | 39 ++++++++++++++++++++++++++++++++
 src/cblasext/cblas-extensions.h  | 33 ++++++++++++++++++++++++++-
 src/matrix/kaldi-matrix.cc       | 17 ++++----------
 src/tensor/linear-ops.h          |  4 ++--
 4 files changed, 77 insertions(+), 16 deletions(-)

diff --git a/src/cblasext/cblas-extensions.cc b/src/cblasext/cblas-extensions.cc
index 8b5ea941081..8d23ae6ab2d 100644
--- a/src/cblasext/cblas-extensions.cc
+++ b/src/cblasext/cblas-extensions.cc
@@ -120,4 +120,43 @@ template void cblasext_mul_elements_mat(
     double *Bdata, KaldiBlasInt b_stride);
 
 
+template <typename Real>
+Real cblasext_trace_mat_mat(
+    const Real *a_data,
+    KaldiBlasInt a_num_rows, KaldiBlasInt a_num_cols,
+    KaldiBlasInt a_stride, KaldiBlasInt a_col_stride,
+    const Real *b_data, CBLAS_TRANSPOSE b_trans,
+    KaldiBlasInt b_stride, KaldiBlasInt b_col_stride) {
+  Real ans = 0.0;
+  if (b_trans == CblasNoTrans) {
+    for (KaldiBlasInt i = 0; i < a_num_rows;
+         i++, a_data += a_stride, b_data += b_col_stride) {
+      ans += cblas_Xdot(a_num_cols, a_data, a_col_stride, b_data, b_stride);
+    }
+    return ans;
+  } else {
+    for (KaldiBlasInt i = 0; i < a_num_rows;
+         i++, a_data += a_stride, b_data += b_stride) {
+      ans += cblas_Xdot(a_num_cols, a_data, a_col_stride,
+                        b_data, b_col_stride);
+    }
+    return ans;
+  }
+}
+
+template float cblasext_trace_mat_mat(
+    const float *a_data,
+    KaldiBlasInt a_num_rows, KaldiBlasInt a_num_cols,
+    KaldiBlasInt a_stride, KaldiBlasInt a_col_stride,
+    const float *b_data, CBLAS_TRANSPOSE b_trans,
+    KaldiBlasInt b_stride, KaldiBlasInt b_col_stride);
+template double cblasext_trace_mat_mat(
+    const double *a_data,
+    KaldiBlasInt a_num_rows, KaldiBlasInt a_num_cols,
+    KaldiBlasInt a_stride, KaldiBlasInt a_col_stride,
+    const double *b_data, CBLAS_TRANSPOSE b_trans,
+    KaldiBlasInt b_stride, KaldiBlasInt b_col_stride);
+
+
+
 } // namespace kaldi
diff --git a/src/cblasext/cblas-extensions.h b/src/cblasext/cblas-extensions.h
index aaf12004c25..f3c3dbe3be9 100644
--- a/src/cblasext/cblas-extensions.h
+++ b/src/cblasext/cblas-extensions.h
@@ -1,4 +1,4 @@
-// matrix/cblas-extensions.h
+// cblasext/cblas-extensions.h
 
 // Copyright 2012-2019  Johns Hopkins University (author: Daniel Povey);
 //                      Haihua Xu; Wei Shi
@@ -69,6 +69,37 @@ void cblasext_mul_elements_mat(
     Real *Bdata,
     KaldiBlasInt b_stride);
 
+/**
+   For matrices A and B (possibly with column strides as well as
+   row strides): if b_trans == CblasNoTrans, compute
+      tr(A B) = \sum_{i,j} A(i, j) B(j, i)
+   or otherwise (b_trans == CblasTrans), compute
+      tr(A B^T) = \sum_{i,j} A(i, j) B(i, j).
+     @param [in] a_data       Data pointer of matrix A
+     @param [in] a_num_rows   Number of rows of matrix A
+     @param [in] a_num_cols   Number of columns of matrix A
+     @param [in] a_stride     Row stride of matrix A; may have any value.
+     @param [in] a_col_stride Column stride of A, would be 1 for
+                              a normal matrix; must be positive.
+     @param [in] b_data       Data pointer of matrix B; may be
+                              the same as a_data.
+     @param [in] b_trans      CblasTrans if B is transposed, else
+                              CblasNoTrans.  Note: the expression
+                              would have the same value if the
+                              transpose were applied to A instead.
+     @param [in] b_stride     Row stride of matrix B; may have any
+                              value.
+     @param [in] b_col_stride Column stride of matrix B; must be
+                              positive, will normally be 1.
+ */
+template <typename Real>
+Real cblasext_trace_mat_mat(
+    const Real *a_data,
+    KaldiBlasInt a_num_rows, KaldiBlasInt a_num_cols,
+    KaldiBlasInt a_stride, KaldiBlasInt a_col_stride,
+    const Real *b_data, CBLAS_TRANSPOSE b_trans,
+    KaldiBlasInt b_stride, KaldiBlasInt b_col_stride);
+
 
 
 
diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc
index 917b8848c78..ffa1ab550b7 100644
--- a/src/matrix/kaldi-matrix.cc
+++ b/src/matrix/kaldi-matrix.cc
@@ -2650,24 +2650,15 @@ template <typename Real>
 Real TraceMatMat(const MatrixBase<Real> &A,
                  const MatrixBase<Real> &B,
                  MatrixTransposeType trans) {  // tr(A B), equivalent to sum of each element of A times same element in B'
-  MatrixIndexT aStride = A.stride_, bStride = B.stride_;
   if (trans == kNoTrans) {
     KALDI_ASSERT(A.NumRows() == B.NumCols() && A.NumCols() == B.NumRows());
-    Real ans = 0.0;
-    Real *adata = A.data_, *bdata = B.data_;
-    MatrixIndexT arows = A.NumRows(), acols = A.NumCols();
-    for (MatrixIndexT row = 0;row < arows;row++, adata+=aStride, bdata++)
-      ans += cblas_Xdot(acols, adata, 1, bdata, bStride);
-    return ans;
   } else {
     KALDI_ASSERT(A.NumRows() == B.NumRows() && A.NumCols() == B.NumCols());
-    Real ans = 0.0;
-    Real *adata = A.data_, *bdata = B.data_;
-    MatrixIndexT arows = A.NumRows(), acols = A.NumCols();
-    for (MatrixIndexT row = 0;row < arows;row++, adata+=aStride, bdata+=bStride)
-      ans += cblas_Xdot(acols, adata, 1, bdata, 1);
-    return ans;
   }
+  return cblasext_trace_mat_mat(A.Data(), A.NumRows(), A.NumCols(),
+                                A.Stride(), 1, B.Data(),
+                                static_cast<CBLAS_TRANSPOSE>(trans),
+                                B.Stride(), 1);
 }
 
 
diff --git a/src/tensor/linear-ops.h b/src/tensor/linear-ops.h
index 3eade973772..69632aa2f7f 100644
--- a/src/tensor/linear-ops.h
+++ b/src/tensor/linear-ops.h
@@ -17,8 +17,8 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef KALDI_TENSOR__LINEAR_OPS_H_
-#define KALDI_TENSOR__LINEAR_OPS_H_ 1
+#ifndef KALDI_TENSOR_LINEAR_OPS_H_
+#define KALDI_TENSOR_LINEAR_OPS_H_ 1
 
 #include "tensor/tensor.h"
 

From 1b4dec70ae86ff52d26fb5a739cfa43192cde1ff Mon Sep 17 00:00:00 2001
From: Dan Povey <dpovey@gmail.com>
Date: Wed, 19 Jun 2019 15:39:23 -0400
Subject: [PATCH 138/163] [build] Add missing Makefile

---
 src/cblasext/Makefile | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 src/cblasext/Makefile

diff --git a/src/cblasext/Makefile b/src/cblasext/Makefile
new file mode 100644
index 00000000000..a4f6cb320f7
--- /dev/null
+++ b/src/cblasext/Makefile
@@ -0,0 +1,22 @@
+
+
+all:
+
+OPENFST_CXXFLAGS =
+OPENFST_LDLIBS =
+
+include ../kaldi.mk
+
+
+# you can uncomment matrix-lib-speed-test if you want to do the speed tests.
+
+TESTFILES = cblasext-test
+
+OBJFILES = cblas-extensions.o
+
+LIBNAME = kaldi-cblasext
+
+ADDLIBS = ../base/kaldi-base.a
+
+include ../makefiles/default_rules.mk
+

From c349ef5559bd1d356d529402e127d3fc9d8a14a9 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 19 Jun 2019 21:53:01 -0400
Subject: [PATCH 139/163] [src] Changes to make more things compile

---
 src/Makefile                 | 12 ++++-----
 src/bin/Makefile             |  2 +-
 src/chain/Makefile           |  2 +-
 src/chainbin/Makefile        |  2 +-
 src/cudadecoder/Makefile     |  4 +--
 src/cudadecoderbin/Makefile  |  2 +-
 src/cudafeat/Makefile        |  2 +-
 src/cudafeatbin/Makefile     |  2 +-
 src/cudamatrix/Makefile      |  2 +-
 src/decoder/Makefile         |  2 +-
 src/feat/Makefile            |  2 +-
 src/featbin/Makefile         |  2 +-
 src/fgmmbin/Makefile         |  2 +-
 src/fstbin/Makefile          |  2 +-
 src/fstext/Makefile          |  2 +-
 src/gmm/Makefile             |  2 +-
 src/gmmbin/Makefile          |  2 +-
 src/hmm/Makefile             |  2 +-
 src/hmm/hmm-test-utils.cc    | 21 ++--------------
 src/hmm/hmm-test-utils.h     |  2 --
 src/hmm/hmm-utils-test.cc    | 20 ++++++---------
 src/hmm/hmm-utils.cc         | 14 +++++------
 src/hmm/topology-test.cc     |  6 ++---
 src/hmm/topology.cc          |  2 +-
 src/hmm/transitions.cc       |  9 +++++--
 src/hmm/transitions.h        | 29 ++++++----------------
 src/ivector/Makefile         |  2 +-
 src/ivectorbin/Makefile      |  2 +-
 src/kws/Makefile             |  2 +-
 src/kwsbin/Makefile          |  2 +-
 src/lat/Makefile             |  2 +-
 src/lat/lattice-functions.cc | 47 ++++++++++++++++++++++++++++++++++++
 src/lat/lattice-functions.h  | 42 +++++++++++++++++++++-----------
 src/latbin/Makefile          |  4 +--
 src/latbin/lattice-copy.cc   | 26 +++++++++++++++++---
 src/lm/Makefile              |  2 +-
 src/lmbin/Makefile           |  2 +-
 src/nnet3/Makefile           |  2 +-
 src/nnet3bin/Makefile        |  2 +-
 src/online/Makefile          |  2 +-
 src/online2/Makefile         |  2 +-
 src/online2bin/Makefile      |  2 +-
 src/onlinebin/Makefile       |  2 +-
 src/rnnlm/Makefile           |  2 +-
 src/rnnlmbin/Makefile        |  2 +-
 src/tfrnnlm/Makefile         |  2 +-
 src/tfrnnlmbin/Makefile      |  2 +-
 src/transform/Makefile       |  2 +-
 src/tree/Makefile            |  2 +-
 src/util/Makefile            |  2 +-
 50 files changed, 178 insertions(+), 132 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index adbd82d2e0f..1e128b6e5cc 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -135,8 +135,8 @@ bin fstbin gmmbin fgmmbin featbin cudafeatbin nnet3bin chainbin latbin ivectorbi
 #2)The libraries have inter-dependencies
 base: base/.depend.mk
 matrix: base
-util: base matrix
-feat: base matrix util gmm transform tree
+util: base cudamatrix matrix
+feat: base cudamatrix matrix util gmm transform tree
 tree: base util matrix
 gmm: base util matrix tree
 transform: base util matrix gmm tree
@@ -151,10 +151,10 @@ rnnlm: base util matrix cudamatrix nnet3 lm hmm
 chain: lat hmm tree fstext matrix cudamatrix util base
 ivector: base util matrix transform tree gmm
 #3)Dependencies for optional parts of Kaldi
-onlinebin: base matrix util feat tree gmm transform fstext hmm lm decoder lat cudamatrix online
-# python-kaldi-decoding: base matrix util feat tree gmm transform fstext hmm decoder lat online
-cudafeat: base matrix util gmm transform tree feat cudamatrix online2
-cudafeatbin: base matrix util gmm transform tree feat cudamatrix cudafeat online2
+onlinebin: base cudamatrix matrix util feat tree gmm transform fstext hmm lm decoder lat cudamatrix online
+# python-kaldi-decoding: base cudamatrix matrix util feat tree gmm transform fstext hmm decoder lat online
+cudafeat: base cudamatrix matrix util gmm transform tree feat cudamatrix online2
+cudafeatbin: base cudamatrix matrix util gmm transform tree feat cudamatrix cudafeat online2
 online: decoder gmm transform feat matrix util base lat hmm tree
 online2: decoder gmm transform feat matrix util base lat hmm tree ivector cudamatrix nnet3 chain
 kws: base util hmm tree matrix lat
diff --git a/src/bin/Makefile b/src/bin/Makefile
index f8f0564743c..6ab96434246 100644
--- a/src/bin/Makefile
+++ b/src/bin/Makefile
@@ -31,7 +31,7 @@ ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../lm/kaldi-lm.a \
           ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
           ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
-          ../base/kaldi-base.a
+          ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a
 
 
 TESTFILES =
diff --git a/src/chain/Makefile b/src/chain/Makefile
index fbad28f7de6..dd4859f5449 100644
--- a/src/chain/Makefile
+++ b/src/chain/Makefile
@@ -18,7 +18,7 @@ LIBNAME = kaldi-chain
 
 ADDLIBS = ../cudamatrix/kaldi-cudamatrix.a ../lat/kaldi-lat.a \
           ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \
-          ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a 
+          ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a 
 
 # Make sure we have CUDA_ARCH from kaldi.mk,
 ifeq ($(CUDA), true)
diff --git a/src/chainbin/Makefile b/src/chainbin/Makefile
index 41ac7342d17..519c2bbf77d 100644
--- a/src/chainbin/Makefile
+++ b/src/chainbin/Makefile
@@ -25,7 +25,7 @@ ADDLIBS = ../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a \
           ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \
           ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a 
 
 include ../makefiles/default_rules.mk
diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile
index ede7cfddbe7..d000666ad29 100644
--- a/src/cudadecoder/Makefile
+++ b/src/cudadecoder/Makefile
@@ -20,8 +20,8 @@ LDLIBS += $(CUDA_LDLIBS)
 
 LIBNAME = kaldi-cudadecoder
 
-ADDLIBS = ../cudamatrix/kaldi-cudamatrix.a ../base/kaldi-base.a ../matrix/kaldi-matrix.a \
-          ../lat/kaldi-lat.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../gmm/kaldi-gmm.a \
+ADDLIBS = ../cudamatrix/kaldi-cudamatrix.a ../base/kaldi-base.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
+          ../lat/kaldi-lat.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../gmm/kaldi-gmm.a \
           ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../gmm/kaldi-gmm.a ../transform/kaldi-transform.a \
           ../tree/kaldi-tree.a ../online2/kaldi-online2.a ../nnet3/kaldi-nnet3.a
 
diff --git a/src/cudadecoderbin/Makefile b/src/cudadecoderbin/Makefile
index 0692126dacc..276bb0ffc9c 100644
--- a/src/cudadecoderbin/Makefile
+++ b/src/cudadecoderbin/Makefile
@@ -20,7 +20,7 @@ ADDLIBS = ../cudadecoder/kaldi-cudadecoder.a  \
 ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
 ../feat/kaldi-feat.a ../transform/kaldi-transform.a \
 ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \
-../matrix/kaldi-matrix.a ../base/kaldi-base.a 
+../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a 
 
 endif
 
diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile
index 913c1ea9dbb..a2351f972ff 100644
--- a/src/cudafeat/Makefile
+++ b/src/cudafeat/Makefile
@@ -13,7 +13,7 @@ endif
 
 LIBNAME = kaldi-cudafeat
 
-ADDLIBS = ../feat/kaldi-feat.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+ADDLIBS = ../feat/kaldi-feat.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a ../cudamatrix/kaldi-cudamatrix.a \
           ../gmm/kaldi-gmm.a ../online2/kaldi-online2.a
 
diff --git a/src/cudafeatbin/Makefile b/src/cudafeatbin/Makefile
index e1af458b62e..068fcfdf806 100644
--- a/src/cudafeatbin/Makefile
+++ b/src/cudafeatbin/Makefile
@@ -19,7 +19,7 @@ TESTFILES =
 ADDLIBS = ../cudafeat/kaldi-cudafeat.a ../cudamatrix/kaldi-cudamatrix.a \
 					../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a ../online2/kaldi-online2.a
 
 include ../makefiles/default_rules.mk
diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile
index 45c2ba44fd7..5c0b4e7680c 100644
--- a/src/cudamatrix/Makefile
+++ b/src/cudamatrix/Makefile
@@ -18,7 +18,7 @@ endif
 
 LIBNAME = kaldi-cudamatrix
 
-ADDLIBS = ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a 
+ADDLIBS = ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a 
 
 # Make sure we have CUDA_ARCH from kaldi.mk,
 ifeq ($(CUDA), true)
diff --git a/src/decoder/Makefile b/src/decoder/Makefile
index 020fe358fe9..c60e70a6f73 100644
--- a/src/decoder/Makefile
+++ b/src/decoder/Makefile
@@ -13,7 +13,7 @@ LIBNAME = kaldi-decoder
 
 ADDLIBS = ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a 
 
 include ../makefiles/default_rules.mk
diff --git a/src/feat/Makefile b/src/feat/Makefile
index 9850e578d9a..ce8a75ba289 100644
--- a/src/feat/Makefile
+++ b/src/feat/Makefile
@@ -16,7 +16,7 @@ OBJFILES = feature-functions.o feature-mfcc.o feature-fbank.o \
 LIBNAME = kaldi-feat
 
 ADDLIBS = ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
diff --git a/src/featbin/Makefile b/src/featbin/Makefile
index 09de6c2fb19..bb11b797e69 100644
--- a/src/featbin/Makefile
+++ b/src/featbin/Makefile
@@ -24,7 +24,7 @@ TESTFILES =
 
 ADDLIBS = ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
diff --git a/src/fgmmbin/Makefile b/src/fgmmbin/Makefile
index 5db252477b5..060c5e06957 100644
--- a/src/fgmmbin/Makefile
+++ b/src/fgmmbin/Makefile
@@ -18,6 +18,6 @@ TESTFILES =
 ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \
           ../feat/kaldi-feat.a ../transform/kaldi-transform.a \
           ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \
-          ../matrix/kaldi-matrix.a ../base/kaldi-base.a 
+          ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a 
 
 include ../makefiles/default_rules.mk
diff --git a/src/fstbin/Makefile b/src/fstbin/Makefile
index a22c014a7d5..c91e405e2c2 100644
--- a/src/fstbin/Makefile
+++ b/src/fstbin/Makefile
@@ -26,6 +26,6 @@ TESTFILES =
 LIBFILE =
 
 ADDLIBS = ../decoder/kaldi-decoder.a ../fstext/kaldi-fstext.a \
-          ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a 
+          ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a 
 
 include ../makefiles/default_rules.mk
diff --git a/src/fstext/Makefile b/src/fstext/Makefile
index b76bd413c42..655437dc52d 100644
--- a/src/fstext/Makefile
+++ b/src/fstext/Makefile
@@ -24,7 +24,7 @@ LIBNAME = kaldi-fstext
 
 # tree and matrix archives needed for test-context-fst
 # matrix archive needed for push-special.
-ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a 
 
 include ../makefiles/default_rules.mk
diff --git a/src/gmm/Makefile b/src/gmm/Makefile
index 9b770bb4845..64fe320507a 100644
--- a/src/gmm/Makefile
+++ b/src/gmm/Makefile
@@ -14,7 +14,7 @@ OBJFILES = diag-gmm.o diag-gmm-normal.o mle-diag-gmm.o am-diag-gmm.o \
 
 LIBNAME = kaldi-gmm
 
-ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a
 
 
diff --git a/src/gmmbin/Makefile b/src/gmmbin/Makefile
index 1e926e88432..f43dfa96ccb 100644
--- a/src/gmmbin/Makefile
+++ b/src/gmmbin/Makefile
@@ -36,7 +36,7 @@ TESTFILES =
 ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a \
           ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a
 
 
diff --git a/src/hmm/Makefile b/src/hmm/Makefile
index 0315a51b214..fb8c57397c8 100644
--- a/src/hmm/Makefile
+++ b/src/hmm/Makefile
@@ -9,7 +9,7 @@ OBJFILES = topology.o transitions.o hmm-utils.o tree-accu.o \
         posterior.o hmm-test-utils.o
 
 LIBNAME = kaldi-hmm
-ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
diff --git a/src/hmm/hmm-test-utils.cc b/src/hmm/hmm-test-utils.cc
index 4323482cde4..a43f7e956c3 100644
--- a/src/hmm/hmm-test-utils.cc
+++ b/src/hmm/hmm-test-utils.cc
@@ -177,7 +177,6 @@ Topology GenRandTopology() {
 }
 
 void GeneratePathThroughHmm(const Topology &topology,
-                            bool reorder,
                             int32 phone,
                             std::vector<std::pair<int32, int32> > *path) {
   path->clear();
@@ -195,22 +194,7 @@ void GeneratePathThroughHmm(const Topology &topology,
     auto const &arc(aiter.Value());
     if (arc.ilabel != -1) {
       std::pair<int32, int32> pr(cur_state, arc_index);
-      if (!reorder) {
-        path->push_back(pr);
-      } else {
-        bool is_self_loop = (cur_state == arc.nextstate);
-        if (is_self_loop) { // save these up, we'll put them after the forward
-                            // transition.
-          pending_self_loops.push_back(pr);
-        } else {
-          // non-self-loop: output it and then flush out any self-loops we
-          // stored up.
-          path->push_back(pr);
-          path->insert(path->end(), pending_self_loops.begin(),
-                       pending_self_loops.end());
-          pending_self_loops.clear();
-        }
-      }
+      path->push_back(pr);
     }
     cur_state = arc.nextstate;
   }
@@ -220,7 +204,6 @@ void GeneratePathThroughHmm(const Topology &topology,
 
 void GenerateRandomAlignment(const ContextDependencyInterface &ctx_dep,
                              const Transitions &trans_model,
-                             bool reorder,
                              const std::vector<int32> &phone_sequence,
                              std::vector<int32> *alignment) {
   int32 context_width = ctx_dep.ContextWidth(),
@@ -240,7 +223,7 @@ void GenerateRandomAlignment(const ContextDependencyInterface &ctx_dep,
     // (emitting-HMM-state, transition-index) pairs
     std::vector<std::pair<int32, int32> > path;
     int32 phone = phone_sequence[i];
-    GeneratePathThroughHmm(trans_model.GetTopo(), reorder, phone, &path);
+    GeneratePathThroughHmm(trans_model.GetTopo(), phone, &path);
     for (size_t k = 0; k < path.size(); k++) {
       auto const &entry = trans_model.GetTopo().TopologyForPhone(phone);
       int32 hmm_state = path[k].first,
diff --git a/src/hmm/hmm-test-utils.h b/src/hmm/hmm-test-utils.h
index f9f516e7d4c..b7748e57338 100644
--- a/src/hmm/hmm-test-utils.h
+++ b/src/hmm/hmm-test-utils.h
@@ -61,7 +61,6 @@ Topology GenRandTopology();
 /// the 'reorder' option is as described in the documentation; if true, the
 /// self-loops from a state are reordered to come after the forward-transition.
 void GeneratePathThroughHmm(const Topology &topology,
-                            bool reorder,
                             int32 phone,
                             std::vector<std::pair<int32, int32> > *path);
 
@@ -70,7 +69,6 @@ void GeneratePathThroughHmm(const Topology &topology,
 /// transition-ids) corresponding to a given phone sequence.
 void GenerateRandomAlignment(const ContextDependencyInterface &ctx_dep,
                              const Transitions &trans_model,
-                             bool reorder,
                              const std::vector<int32> &phone_sequence,
                              std::vector<int32> *alignment);
 
diff --git a/src/hmm/hmm-utils-test.cc b/src/hmm/hmm-utils-test.cc
index fddbd82a9a0..66e1cb4f172 100644
--- a/src/hmm/hmm-utils-test.cc
+++ b/src/hmm/hmm-utils-test.cc
@@ -210,9 +210,8 @@ void TestSplitToPhones() {
     int32 rand_phone = phone_list[RandInt(0, phone_list.size() - 1)];
     phone_seq.push_back(rand_phone);
   }
-  bool reorder = (RandInt(0, 1) == 0);
   std::vector<int32> alignment;
-  GenerateRandomAlignment(*ctx_dep, *trans_model, reorder,
+  GenerateRandomAlignment(*ctx_dep, *trans_model,
                           phone_seq, &alignment);
   std::vector<std::vector<int32> > split_alignment;
   SplitToPhones(*trans_model, alignment, &split_alignment);
@@ -230,18 +229,14 @@ void TestSplitToPhones() {
 }
 
 void TestConvertAlignment() {
-  bool old_reorder = (RandInt(0, 1) == 1),
-      new_reorder = (RandInt(0, 1) == 1),
-      new_tree = (RandInt(0, 1) == 1),
+  bool new_tree = (RandInt(0, 1) == 1),
       new_topology = (RandInt(0, 1) == 1);
   if (!new_tree)
     new_topology = true;
 
   int32 subsample_factor = RandInt(1, 3);
 
-  KALDI_LOG << " old-reorder = " << old_reorder
-            << ", new-reorder = " << new_reorder
-            << ", new-tree = " << new_tree
+  KALDI_LOG << ", new-tree = " << new_tree
             << ", subsample-factor = " << subsample_factor;
 
   std::vector<int32> phones;
@@ -286,15 +281,15 @@ void TestConvertAlignment() {
     phone_sequence.push_back(phones[RandInt(0, phones.size() - 1)]);
   std::vector<int32> old_alignment;
   GenerateRandomAlignment(*ctx_dep_old, trans_model_old,
-                          old_reorder, phone_sequence,
+                          phone_sequence,
                           &old_alignment);
 
   std::vector<int32> new_alignment;
 
   bool ans = ConvertAlignment(trans_model_old, trans_model_new, *ctx_dep_new,
                               old_alignment, subsample_factor, false,
-                              new_reorder, NULL, &new_alignment);
-  if(!ans) {
+                              NULL, &new_alignment);
+  if (!ans) {
     KALDI_WARN << "Alignment conversion failed";
     // make sure it failed for a good reason.
     KALDI_ASSERT(new_topology || subsample_factor > 1);
@@ -312,7 +307,7 @@ void TestConvertAlignment() {
       std::vector<int32> old_alignment_copy;
       bool ans = ConvertAlignment(trans_model_new, trans_model_old, *ctx_dep_old,
                                   new_alignment, subsample_factor, false,
-                                  old_reorder, NULL, &old_alignment_copy);
+                                  NULL, &old_alignment_copy);
       KALDI_ASSERT(ans);
       KALDI_ASSERT(old_alignment_copy == old_alignment);
     }
@@ -336,4 +331,3 @@ int main() {
     kaldi::TestConvertAlignment();
   std::cout << "Test OK.\n";
 }
-
diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc
index b7729c3b62e..ffedc43f26d 100644
--- a/src/hmm/hmm-utils.cc
+++ b/src/hmm/hmm-utils.cc
@@ -51,10 +51,10 @@ std::shared_ptr<fst::ExpandedFst<fst::StdArc>> GetHmmAsFsa(
   // vector of the pdfs, indexed by pdf-class (pdf-classes must start from zero
   // and be contiguous).
   std::vector<int32> pdfs(topo.NumPdfClasses(phone));
-  for (int32 pdf_class = 0;
-       pdf_class < static_cast<int32>(pdfs.size());
+  for (int32 pdf_class = 1;
+       pdf_class <= static_cast<int32>(pdfs.size());
        pdf_class++) {
-    if (! ctx_dep.Compute(phone_window, pdf_class, &(pdfs[pdf_class])) ) {
+    if (! ctx_dep.Compute(phone_window, pdf_class, &(pdfs[pdf_class - 1])) ) {
       std::ostringstream ctx_ss;
       for (size_t i = 0; i < phone_window.size(); i++)
         ctx_ss << phone_window[i] << ' ';
@@ -278,8 +278,8 @@ void GetIlabelMapping (const std::vector<std::vector<int32> > &ilabel_info_old,
       int32 central_phone = vec[P];
       int32 num_pdf_classes = trans_model.GetTopo().NumPdfClasses(central_phone);
       std::vector<int32> state_seq(num_pdf_classes);  // Indexed by pdf-class
-      for (int32 pdf_class = 0; pdf_class < num_pdf_classes; pdf_class++) {
-        if (!ctx_dep.Compute(vec, pdf_class, &(state_seq[pdf_class]))) {
+      for (int32 pdf_class = 1; pdf_class <= num_pdf_classes; pdf_class++) {
+        if (!ctx_dep.Compute(vec, pdf_class, &(state_seq[pdf_class - 1]))) {
           std::ostringstream ss;
           WriteIntegerVector(ss, false, vec);
           KALDI_ERR << "tree did not succeed in converting phone window "<<ss.str();
@@ -668,9 +668,9 @@ static inline void ConvertAlignmentForPhone(
 
   int32 new_num_pdf_classes = new_topo.NumPdfClasses(new_central_phone);
   std::vector<int32> pdf_ids(new_num_pdf_classes);  // Indexed by pdf-class
-  for (int32 pdf_class = 0; pdf_class < new_num_pdf_classes; pdf_class++) {
+  for (int32 pdf_class = 1; pdf_class <= new_num_pdf_classes; pdf_class++) {
     if (!new_ctx_dep.Compute(new_phone_window, pdf_class,
-                             &(pdf_ids[pdf_class]))) {
+                             &(pdf_ids[pdf_class - 1]))) {
       std::ostringstream ss;
       WriteIntegerVector(ss, false, new_phone_window);
       KALDI_ERR << "tree did not succeed in converting phone window "
diff --git a/src/hmm/topology-test.cc b/src/hmm/topology-test.cc
index 2ed8ce38b4a..7073ce94866 100644
--- a/src/hmm/topology-test.cc
+++ b/src/hmm/topology-test.cc
@@ -31,7 +31,7 @@ void TestTopology() {
   std::string input_str = "<Topology>\n"
       "<TopologyEntry>\n"
       "<ForPhones> 1 2 3 4 5 6 7 8 9 </ForPhones>\n"
-      " 0  1  1  0.0\n"
+      " 0  1  1  0\n"
       " 1  1  1  0.693\n"
       " 1  2  2  0.693\n"
       " 2  2  2  0.693\n"
@@ -42,8 +42,8 @@ void TestTopology() {
 
       "<TopologyEntry>\n"
       "<ForPhones> 10 11 13 </ForPhones>\n"
-      " 0  0  1  0.693\n"
-      " 0  1  1  0.693\n"
+      //      " 0  0  1  0.693\n"   // disallowed!
+      " 0  1  1  0\n"
       " 1  1  2  0.693\n"
       " 1  2  2  0.693\n"
       " 2 \n\n"
diff --git a/src/hmm/topology.cc b/src/hmm/topology.cc
index 9e4f1b86771..973a2cde542 100644
--- a/src/hmm/topology.cc
+++ b/src/hmm/topology.cc
@@ -184,7 +184,7 @@ void Topology::Check() {
       }
       if (!ApproxEqual(outward_prob_sum, 1.0))
         KALDI_WARN << "Outward transition probabilities should sum to 1.0 "
-            "for each state";
+            "for each state, value was: " << outward_prob_sum;
     }
     if (!has_final_state) {
       KALDI_ERR << "Must have a final state.";
diff --git a/src/hmm/transitions.cc b/src/hmm/transitions.cc
index 602f96485f9..c666e508e08 100644
--- a/src/hmm/transitions.cc
+++ b/src/hmm/transitions.cc
@@ -176,8 +176,17 @@ int32 Transitions::TupleToTransitionId(int32 phone, int32 topo_state,
   // this code to sort first on pdf, and then index on pdf, so those
   // that have the same pdf are in a contiguous range.
   auto lowerbound = std::lower_bound(info_.begin(), info_.end(), tuple);
-  if (lowerbound == info_.end() || !(*lowerbound == tuple))
-    KALDI_ERR << "Tuple not found. (incompatible tree and model?)";
+  if (lowerbound == info_.end()) {
+    // The search ran off the end of info_, so there is no entry to describe;
+    // dereferencing end() here would be undefined behavior.
+    KALDI_ERR << "Tuple not found. (incompatible tree and model?) "
+              << "is_end = true";
+  } else if (!(*lowerbound == tuple)) {
+    // Report the pdf_id of the entry the search landed on, to help diagnose
+    // the mismatch.
+    KALDI_ERR << "Tuple not found. (incompatible tree and model?) "
+              << "is_end = false, this_tuple pdf_id " << lowerbound->pdf_id;
+  }
 
   return static_cast<int32>((lowerbound - info_.begin()));
 }
diff --git a/src/hmm/transitions.h b/src/hmm/transitions.h
index 89af42904fb..4b3b49ad81e 100644
--- a/src/hmm/transitions.h
+++ b/src/hmm/transitions.h
@@ -153,34 +153,19 @@ class Transitions {
       else if (phone > other.phone) return false;
       else if (topo_state < other.topo_state) return true;
       else if (topo_state > other.topo_state) return false;
+      else if (arc_index < other.arc_index) return true;
+      else if (arc_index > other.arc_index) return false;
       else if (pdf_id < other.pdf_id) return true;
       else if (pdf_id > other.pdf_id) return false;
       else return (self_loop_pdf_id < other.self_loop_pdf_id);
     }
-    // TODO.  operator == can compare all members. Also compare derived members?
+
+    // Compare all non-derived members.
     bool operator == (const TransitionIdInfo &other) const {
-      // I don't think this is being used right now. For now, just abort
-      // whenever it is used, so I can see where it is used.
-      KALDI_ASSERT(false);
-      if (phone == other.phone && topo_state == other.topo_state &&
-          pdf_id == other.pdf_id) {
-        // This assertion is no longer true. Two states can have
-        // different arc_index fields. This equality operator is just
-        // bizarre. Should a TransitionIdInfo really be the same as
-        // another one if they don't have the same arc_index? I don't
-        // think so...  Should probably make a TransitionState class
-        // exposing a different operator== based on this class.
-        KALDI_ASSERT(self_loop_pdf_id == other.self_loop_pdf_id);
-        return true;
-      } else {
-        return false;
-      }
+      return (phone == other.phone && topo_state == other.topo_state &&
+              arc_index == other.arc_index && pdf_id == other.pdf_id &&
+              self_loop_pdf_id == other.self_loop_pdf_id);
     }
-
-    // TransitionIdInfo& operator=(const TransitionIdInfo& other) {
-    //   is_final = other.is_final;
-    //   return *this;
-    // }
   };
 
 
diff --git a/src/ivector/Makefile b/src/ivector/Makefile
index 1154da6880b..ad53c9007b2 100644
--- a/src/ivector/Makefile
+++ b/src/ivector/Makefile
@@ -13,7 +13,7 @@ OBJFILES = ivector-extractor.o voice-activity-detection.o plda.o \
 LIBNAME = kaldi-ivector
 
 ADDLIBS = ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a 
 
 
diff --git a/src/ivectorbin/Makefile b/src/ivectorbin/Makefile
index 5a738352d9c..0a490573ba0 100644
--- a/src/ivectorbin/Makefile
+++ b/src/ivectorbin/Makefile
@@ -26,7 +26,7 @@ TESTFILES =
 
 
 ADDLIBS = ../ivector/kaldi-ivector.a ../hmm/kaldi-hmm.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a 
 
 include ../makefiles/default_rules.mk
diff --git a/src/kws/Makefile b/src/kws/Makefile
index c4367eb2958..9dc7bddab70 100644
--- a/src/kws/Makefile
+++ b/src/kws/Makefile
@@ -10,7 +10,7 @@ OBJFILES = kws-functions.o kws-functions2.o kws-scoring.o
 LIBNAME = kaldi-kws
 
 ADDLIBS = ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \
-          ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a 
+          ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a 
 
 
 include ../makefiles/default_rules.mk
diff --git a/src/kwsbin/Makefile b/src/kwsbin/Makefile
index bcc2685b7f3..f03b0a07f92 100644
--- a/src/kwsbin/Makefile
+++ b/src/kwsbin/Makefile
@@ -17,6 +17,6 @@ TESTFILES =
 
 ADDLIBS = ../kws/kaldi-kws.a ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a \
           ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \
-          ../matrix/kaldi-matrix.a ../base/kaldi-base.a 
+          ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a 
 
 include ../makefiles/default_rules.mk
diff --git a/src/lat/Makefile b/src/lat/Makefile
index 56521486826..3d4c6afcc79 100644
--- a/src/lat/Makefile
+++ b/src/lat/Makefile
@@ -16,7 +16,7 @@ OBJFILES = kaldi-lattice.o lattice-functions.o word-align-lattice.o \
 LIBNAME = kaldi-lat
 
 ADDLIBS = ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \
-          ../matrix/kaldi-matrix.a ../base/kaldi-base.a 
+          ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a 
 
 
 include ../makefiles/default_rules.mk
diff --git a/src/lat/lattice-functions.cc b/src/lat/lattice-functions.cc
index 878a99d79e3..a6338328451 100644
--- a/src/lat/lattice-functions.cc
+++ b/src/lat/lattice-functions.cc
@@ -1757,4 +1757,59 @@ void ReplaceAcousticScoresFromMap(
   }
 }
 
+// Adds, to the Value1() component of every arc weight in 'lat', the
+// transition_cost that 'tmodel' stores for the arc's input label (treated as
+// a transition-id).  Arcs whose input label is 0 are skipped: 0 is the
+// epsilon label, not a transition-id.
+void AddTransitions(
+    const Transitions &tmodel,
+    Lattice *lat) {
+  typedef Lattice::Arc Arc;
+  typedef Arc::StateId StateId;
+
+  for (StateId s = 0; s < lat->NumStates(); s++) {
+    for (fst::MutableArcIterator<Lattice> aiter(lat, s);
+          !aiter.Done(); aiter.Next()) {
+      Arc arc(aiter.Value());
+      int32 tid = arc.ilabel;
+      if (tid == 0)
+        continue;  // epsilon arc: there is no transition cost to add.
+      BaseFloat transition_cost =
+          tmodel.InfoForTransitionId(tid).transition_cost;
+      arc.weight.SetValue1(arc.weight.Value1() + transition_cost);
+      aiter.SetValue(arc);
+    }
+  }
+}
+
+
+// CompactLattice version: for every arc, sums the transition_cost of each
+// transition-id in the arc's string and adds that total to the Value1()
+// component of the arc's weight.
+// NOTE(review): only arc weights are visited here.  If final weights of
+// 'lat' can carry transition-id strings, their costs are not added --
+// confirm whether that is intended.
+void AddTransitions(
+    const Transitions &tmodel,
+    CompactLattice *lat) {
+  typedef CompactLattice::Arc Arc;
+  typedef Arc::StateId StateId;
+
+  for (StateId s = 0; s < lat->NumStates(); s++) {
+    for (fst::MutableArcIterator<CompactLattice> aiter(lat, s);
+          !aiter.Done(); aiter.Next()) {
+      Arc arc(aiter.Value());
+      BaseFloat tot_transition_cost = 0.0;
+      for (int32 tid: arc.weight.String())
+        tot_transition_cost +=
+            tmodel.InfoForTransitionId(tid).transition_cost;
+      LatticeWeight new_weight = arc.weight.Weight();
+      new_weight.SetValue1(new_weight.Value1() + tot_transition_cost);
+      arc.weight.SetWeight(new_weight);
+      aiter.SetValue(arc);
+    }
+  }
+}
+
+
 }  // namespace kaldi
diff --git a/src/lat/lattice-functions.h b/src/lat/lattice-functions.h
index b54b12551c8..0bf73540485 100644
--- a/src/lat/lattice-functions.h
+++ b/src/lat/lattice-functions.h
@@ -356,6 +356,20 @@ bool RescoreCompactLatticeSpeedup(
     CompactLattice *clat);
 
 
+/// Adds transition costs from transition-model 'tmodel' to the costs in 'lat'.
+/// Note: these transition costs are not trainable, they are fixed once the
+/// topology is known.
+void AddTransitions(
+    const Transitions &tmodel,
+    CompactLattice *lat);
+/// Adds transition costs from transition-model 'tmodel' to the costs in 'lat'.
+/// Note: these transition costs are not trainable, they are fixed once the
+/// topology is known.
+void AddTransitions(
+    const Transitions &tmodel,
+    Lattice *lat);
+
+
 /// This function *adds* the negated scores obtained from the Decodable object,
 /// to the acoustic scores on the arcs.  If you want to replace them, you should
 /// use ScaleCompactLattice to first set the acoustic scores to zero.  Returns
@@ -377,26 +391,26 @@ void ComposeCompactLatticeDeterministic(
     fst::DeterministicOnDemandFst<fst::StdArc>* det_fst,
     CompactLattice* composed_clat);
 
-/// This function computes the mapping from the pair 
-/// (frame-index, transition-id) to the pair 
-/// (sum-of-acoustic-scores, num-of-occurences) over all occurences of the 
+/// This function computes the mapping from the pair
+/// (frame-index, transition-id) to the pair
+/// (sum-of-acoustic-scores, num-of-occurences) over all occurences of the
 /// transition-id in that frame.
-/// frame-index in the lattice. 
-/// This function is useful for retaining the acoustic scores in a 
-/// non-compact lattice after a process like determinization where the 
+/// frame-index in the lattice.
+/// This function is useful for retaining the acoustic scores in a
+/// non-compact lattice after a process like determinization where the
 /// frame-level acoustic scores are typically lost.
-/// The function ReplaceAcousticScoresFromMap is used to restore the 
+/// The function ReplaceAcousticScoresFromMap is used to restore the
 /// acoustic scores computed by this function.
 ///
-///   @param [in] lat   Input lattice. Expected to be top-sorted. Otherwise the 
-///                     function will crash. 
-///   @param [out] acoustic_scores  
+///   @param [in] lat   Input lattice. Expected to be top-sorted. Otherwise the
+///                     function will crash.
+///   @param [out] acoustic_scores
 ///                     Pointer to a map from the pair (frame-index,
 ///                     transition-id) to a pair (sum-of-acoustic-scores,
 ///                     num-of-occurences).
 ///                     Usually the acoustic scores for a pdf-id (and hence
 ///                     transition-id) on a frame will be the same for all the
-///                     occurences of the pdf-id in that frame. 
+///                     occurences of the pdf-id in that frame.
 ///                     But if not, we will take the average of the acoustic
 ///                     scores. Hence, we store both the sum-of-acoustic-scores
 ///                     and the num-of-occurences of the transition-id in that
@@ -409,11 +423,11 @@ void ComputeAcousticScoresMap(
 /// This function restores acoustic scores computed using the function
 /// ComputeAcousticScoresMap into the lattice.
 ///
-///   @param [in] acoustic_scores  
+///   @param [in] acoustic_scores
 ///                      A map from the pair (frame-index, transition-id) to a
-///                      pair (sum-of-acoustic-scores, num-of-occurences) of 
+///                      pair (sum-of-acoustic-scores, num-of-occurences) of
 ///                      the occurences of the transition-id in that frame.
-///                      See the comments for ComputeAcousticScoresMap for 
+///                      See the comments for ComputeAcousticScoresMap for
 ///                      details.
 ///   @param [out] lat   Pointer to the output lattice.
 void ReplaceAcousticScoresFromMap(
diff --git a/src/latbin/Makefile b/src/latbin/Makefile
index 9809cdcbb85..1ba4f5b5a88 100644
--- a/src/latbin/Makefile
+++ b/src/latbin/Makefile
@@ -12,7 +12,7 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \
            lattice-determinize lattice-oracle lattice-rmali \
            lattice-compose lattice-boost-ali lattice-copy lattice-to-fst \
            lattice-to-phone-lattice lattice-interp lattice-project \
-           lattice-add-trans-probs lattice-difference \
+           lattice-difference \
            nbest-to-linear nbest-to-lattice lattice-1best linear-to-nbest \
            lattice-mbr-decode lattice-align-words lattice-to-mpe-post \
            lattice-copy-backoff nbest-to-ctm lattice-determinize-pruned \
@@ -35,6 +35,6 @@ TESTFILES =
 ADDLIBS = ../rnnlm/kaldi-rnnlm.a ../nnet3/kaldi-nnet3.a \
           ../cudamatrix/kaldi-cudamatrix.a ../lat/kaldi-lat.a ../lm/kaldi-lm.a \
           ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \
-          ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a 
+          ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
diff --git a/src/latbin/lattice-copy.cc b/src/latbin/lattice-copy.cc
index 22bddef4575..5bf83dfbd8d 100644
--- a/src/latbin/lattice-copy.cc
+++ b/src/latbin/lattice-copy.cc
@@ -23,6 +23,8 @@
 #include "util/common-utils.h"
 #include "fstext/fstext-lib.h"
 #include "lat/kaldi-lattice.h"
+#include "hmm/transitions.h"
+#include "lat/lattice-functions.h"
 
 namespace kaldi {
   int32 CopySubsetLattices(std::string filename,
@@ -154,6 +156,7 @@ int main(int argc, char *argv[]) {
     bool write_compact = true, ignore_missing = false;
     std::string include_rxfilename;
     std::string exclude_rxfilename;
+    std::string transition_model_rxfilename;
 
     po.Register("write-compact", &write_compact, "If true, write in normal (compact) form.");
     po.Register("include", &include_rxfilename,
@@ -166,6 +169,10 @@ int main(int argc, char *argv[]) {
                 "whose lattices will be excluded");
     po.Register("ignore-missing", &ignore_missing,
                 "Exit with status 0 even if no lattices are copied");
+    po.Register("add-transitions", &transition_model_rxfilename,
+                "If this option is provided, transition costs/probabilities will "
+                "be added, obtained from the provided model.  (Note: these "
+                "are derived from the topology, they are not trained.)");
 
     po.Read(argc, argv);
 
@@ -183,6 +190,11 @@ int main(int argc, char *argv[]) {
 
     int32 n_done = 0;
 
+    Transitions transitions;  // For adding transition costs.
+
+    if (!transition_model_rxfilename.empty())
+      ReadKaldiObject(transition_model_rxfilename, &transitions);
+
     if (write_compact) {
       SequentialCompactLatticeReader lattice_reader(lats_rspecifier);
       CompactLatticeWriter lattice_writer(lats_wspecifier);
@@ -200,8 +212,12 @@ int main(int argc, char *argv[]) {
             false, ignore_missing);
       }
 
-      for (; !lattice_reader.Done(); lattice_reader.Next(), n_done++)
-        lattice_writer.Write(lattice_reader.Key(), lattice_reader.Value());
+      for (; !lattice_reader.Done(); lattice_reader.Next(), n_done++) {
+        CompactLattice &lat = lattice_reader.Value();
+        if (!transition_model_rxfilename.empty())
+          AddTransitions(transitions, &lat);
+        lattice_writer.Write(lattice_reader.Key(), lat);
+      }
     } else {
       SequentialLatticeReader lattice_reader(lats_rspecifier);
       LatticeWriter lattice_writer(lats_wspecifier);
@@ -219,8 +235,13 @@ int main(int argc, char *argv[]) {
             true, ignore_missing);
       }
 
-      for (; !lattice_reader.Done(); lattice_reader.Next(), n_done++)
-        lattice_writer.Write(lattice_reader.Key(), lattice_reader.Value());
+      for (; !lattice_reader.Done(); lattice_reader.Next(), n_done++) {
+        // 'lat' refers to the reader's current value; it is modified in
+        // place (if requested) and then written, as in the compact branch.
+        Lattice &lat = lattice_reader.Value();
+        if (!transition_model_rxfilename.empty())
+          AddTransitions(transitions, &lat);
+        lattice_writer.Write(lattice_reader.Key(), lat);
+      }
     }
     KALDI_LOG << "Done copying " << n_done << " lattices.";
 
diff --git a/src/lm/Makefile b/src/lm/Makefile
index c0654fa83b2..8ffccaae99e 100644
--- a/src/lm/Makefile
+++ b/src/lm/Makefile
@@ -12,6 +12,6 @@ OBJFILES = arpa-file-parser.o arpa-lm-compiler.o const-arpa-lm.o \
 LIBNAME = kaldi-lm
 
 ADDLIBS = ../fstext/kaldi-fstext.a ../util/kaldi-util.a \
-          ../matrix/kaldi-matrix.a ../base/kaldi-base.a 
+          ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a 
 
 include ../makefiles/default_rules.mk
diff --git a/src/lmbin/Makefile b/src/lmbin/Makefile
index 108ddab50c5..229fd210bb8 100644
--- a/src/lmbin/Makefile
+++ b/src/lmbin/Makefile
@@ -10,7 +10,7 @@ OBJFILES =
 
 TESTFILES =
 
-ADDLIBS = ../lm/kaldi-lm.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+ADDLIBS = ../lm/kaldi-lm.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a 
 
 include ../makefiles/default_rules.mk
diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile
index 5e67211c3a7..e474e85378a 100644
--- a/src/nnet3/Makefile
+++ b/src/nnet3/Makefile
@@ -40,7 +40,7 @@ ADDLIBS = ../chain/kaldi-chain.a ../cudamatrix/kaldi-cudamatrix.a \
           ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a \
           ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a 
 
 include ../makefiles/default_rules.mk
diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile
index ac4ee9b1818..4c7c63b554e 100644
--- a/src/nnet3bin/Makefile
+++ b/src/nnet3bin/Makefile
@@ -32,7 +32,7 @@ ADDLIBS = ../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a \
           ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \
           ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
diff --git a/src/online/Makefile b/src/online/Makefile
index 32c99500750..4316473e6c0 100644
--- a/src/online/Makefile
+++ b/src/online/Makefile
@@ -37,7 +37,7 @@ LIBNAME = kaldi-online
 ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \
           ../feat/kaldi-feat.a ../transform/kaldi-transform.a \
           ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \
-          ../matrix/kaldi-matrix.a ../base/kaldi-base.a 
+          ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a 
 
 include ../makefiles/default_rules.mk
 
diff --git a/src/online2/Makefile b/src/online2/Makefile
index 4507f6252dc..8da05efedf9 100644
--- a/src/online2/Makefile
+++ b/src/online2/Makefile
@@ -17,7 +17,7 @@ ADDLIBS = ../ivector/kaldi-ivector.a ../nnet3/kaldi-nnet3.a \
           ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \
           ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a
 
 
diff --git a/src/online2bin/Makefile b/src/online2bin/Makefile
index 024ab652320..21b309e3200 100644
--- a/src/online2bin/Makefile
+++ b/src/online2bin/Makefile
@@ -23,5 +23,5 @@ ADDLIBS = ../online2/kaldi-online2.a ../ivector/kaldi-ivector.a \
           ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
           ../feat/kaldi-feat.a ../transform/kaldi-transform.a \
           ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \
-          ../matrix/kaldi-matrix.a ../base/kaldi-base.a 
+          ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a 
 include ../makefiles/default_rules.mk
diff --git a/src/onlinebin/Makefile b/src/onlinebin/Makefile
index 7c0550d0848..3442c5ef38f 100644
--- a/src/onlinebin/Makefile
+++ b/src/onlinebin/Makefile
@@ -39,7 +39,7 @@ TESTFILES =
 ADDLIBS = ../online/kaldi-online.a ../decoder/kaldi-decoder.a \
           ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a 
 
 include ../makefiles/default_rules.mk
diff --git a/src/rnnlm/Makefile b/src/rnnlm/Makefile
index d4b3f3ce0a8..0a383c3c710 100644
--- a/src/rnnlm/Makefile
+++ b/src/rnnlm/Makefile
@@ -16,6 +16,6 @@ LIBNAME = kaldi-rnnlm
 
 ADDLIBS = ../nnet3/kaldi-nnet3.a ../cudamatrix/kaldi-cudamatrix.a \
           ../lm/kaldi-lm.a ../hmm/kaldi-hmm.a ../util/kaldi-util.a \
-          ../matrix/kaldi-matrix.a ../base/kaldi-base.a 
+          ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a 
 
 include ../makefiles/default_rules.mk
diff --git a/src/rnnlmbin/Makefile b/src/rnnlmbin/Makefile
index 23a8eba6145..7e8f5127c33 100644
--- a/src/rnnlmbin/Makefile
+++ b/src/rnnlmbin/Makefile
@@ -20,7 +20,7 @@ ADDLIBS = ../rnnlm/kaldi-rnnlm.a ../nnet3/kaldi-nnet3.a \
           ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \
           ../lat/kaldi-lat.a ../lm/kaldi-lm.a ../fstext/kaldi-fstext.a \
           ../hmm/kaldi-hmm.a ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a 
 
 include ../makefiles/default_rules.mk
diff --git a/src/tfrnnlm/Makefile b/src/tfrnnlm/Makefile
index 3dc8d584210..3e2e6f2e93f 100644
--- a/src/tfrnnlm/Makefile
+++ b/src/tfrnnlm/Makefile
@@ -30,7 +30,7 @@ TESTFILES =
 
 LIBNAME = kaldi-tensorflow-rnnlm
 
-ADDLIBS = ../lm/kaldi-lm.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+ADDLIBS = ../lm/kaldi-lm.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a
 LDLIBS +=  -lz -ldl -fPIC -lrt
 LDLIBS += -L$(TENSORFLOW)/bazel-bin/tensorflow -ltensorflow_cc -ltensorflow_framework
diff --git a/src/tfrnnlmbin/Makefile b/src/tfrnnlmbin/Makefile
index 77fe58c088c..e4713dd353a 100644
--- a/src/tfrnnlmbin/Makefile
+++ b/src/tfrnnlmbin/Makefile
@@ -31,7 +31,7 @@ TESTFILES =
 
 ADDLIBS = ../lat/kaldi-lat.a ../lm/kaldi-lm.a ../fstext/kaldi-fstext.a \
           ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \
-          ../matrix/kaldi-matrix.a ../base/kaldi-base.a \
+          ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a \
           ../tfrnnlm/kaldi-tensorflow-rnnlm.a
 
 LDLIBS +=  -lz -ldl -fPIC -lrt
diff --git a/src/transform/Makefile b/src/transform/Makefile
index 3899ac52334..b515a289954 100644
--- a/src/transform/Makefile
+++ b/src/transform/Makefile
@@ -13,6 +13,6 @@ OBJFILES = lda-estimate.o \
 LIBNAME = kaldi-transform
 
 ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \
-          ../matrix/kaldi-matrix.a ../base/kaldi-base.a
+          ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
diff --git a/src/tree/Makefile b/src/tree/Makefile
index 8e10eb6c7ea..f13e7a3c2d8 100644
--- a/src/tree/Makefile
+++ b/src/tree/Makefile
@@ -13,7 +13,7 @@ OBJFILES = event-map.o context-dep.o clusterable-classes.o cluster-utils.o \
 					 build-tree-utils.o build-tree.o build-tree-questions.o tree-renderer.o
 
 LIBNAME = kaldi-tree
-ADDLIBS = ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a 
+ADDLIBS = ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a 
 
 
 include ../makefiles/default_rules.mk
diff --git a/src/util/Makefile b/src/util/Makefile
index acfab8b8de1..de030b5dce3 100644
--- a/src/util/Makefile
+++ b/src/util/Makefile
@@ -15,6 +15,6 @@ OBJFILES = text-utils.o kaldi-io.o kaldi-holder.o kaldi-table.o \
 
 LIBNAME = kaldi-util
 
-ADDLIBS = ../matrix/kaldi-matrix.a ../base/kaldi-base.a 
+ADDLIBS = ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a 
 
 include ../makefiles/default_rules.mk

From ebc6f83d91d5bc11051623454f70c8a3bf70b638 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Thu, 20 Jun 2019 16:02:41 -0400
Subject: [PATCH 140/163] [src] Partial changes to cudafeat, giving up for now

---
 src/cudafeat/Makefile               |  4 +--
 src/cudafeat/feature-mfcc-cuda.cu   | 38 +++++++++++++++--------------
 src/cudafeat/feature-mfcc-cuda.h    |  7 +++---
 src/cudafeat/feature-window-cuda.cu | 37 ----------------------------
 src/cudafeat/feature-window-cuda.h  | 38 -----------------------------
 5 files changed, 26 insertions(+), 98 deletions(-)
 delete mode 100644 src/cudafeat/feature-window-cuda.cu
 delete mode 100644 src/cudafeat/feature-window-cuda.h

diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile
index a2351f972ff..8bd5ef6d39b 100644
--- a/src/cudafeat/Makefile
+++ b/src/cudafeat/Makefile
@@ -5,10 +5,10 @@ all:
 include ../kaldi.mk
 ifeq ($(CUDA), true)
 
-TESTFILES = 
+TESTFILES =
 
 ifeq ($(CUDA), true)
-  OBJFILES +=  feature-window-cuda.o feature-mfcc-cuda.o feature-online-cmvn-cuda.o
+  OBJFILES +=  feature-mfcc-cuda.o feature-online-cmvn-cuda.o
 endif
 
 LIBNAME = kaldi-cudafeat
diff --git a/src/cudafeat/feature-mfcc-cuda.cu b/src/cudafeat/feature-mfcc-cuda.cu
index 730e7bd47e7..925456e862e 100644
--- a/src/cudafeat/feature-mfcc-cuda.cu
+++ b/src/cudafeat/feature-mfcc-cuda.cu
@@ -17,8 +17,10 @@
 #include <nvToolsExt.h>
 #include <cub/cub.cuh>
 
-#include "cudafeat/feature-mfcc-cuda.h"
+#include "feat/feature-window.h"
+#include "feat/feature-mfcc.h"
 #include "cudamatrix/cu-rand.h"
+#include "cudafeat/feature-mfcc-cuda.h"
 
 // Each thread block processes a unique frame
 // threads in the same threadblock collaborate to
@@ -246,26 +248,22 @@ __global__ void process_window_kernel(
 }
 
 __device__ inline int32 FirstSampleOfFrame(int32 frame, int32 frame_shift,
-                                           int32 window_size, bool snip_edges) {
-  if (snip_edges) {
-    return frame * frame_shift;
-  } else {
-    int32 midpoint_of_frame = frame_shift * frame + frame_shift / 2,
-          beginning_of_frame = midpoint_of_frame - window_size / 2;
-    return beginning_of_frame;
-  }
+                                           int32 window_size) {
+  int32 midpoint_of_frame = frame_shift * frame + frame_shift / 2,
+      beginning_of_frame = midpoint_of_frame - window_size / 2;
+  return beginning_of_frame;
 }
 
 __global__ void extract_window_kernel(
     int32 frame_shift, int32 frame_length, int32 frame_length_padded,
-    int32 window_size, bool snip_edges, int32_t sample_offset,
+    int32 window_size, int32_t sample_offset,
     const BaseFloat __restrict__ *wave, int32 wave_dim,
     BaseFloat *__restrict__ windows, int32_t wlda) {
   int frame = blockIdx.x;
   int tidx = threadIdx.x;
 
   int32 start_sample =
-      FirstSampleOfFrame(frame, frame_shift, window_size, snip_edges);
+      FirstSampleOfFrame(frame, frame_shift, window_size);
 
   // wave_start and wave_end are start and end indexes into 'wave', for the
   // piece of wave that we're trying to extract.
@@ -339,11 +337,15 @@ __global__ void dot_log_kernel(int32_t num_frames, int32_t frame_length,
 
 namespace kaldi {
 
-CudaMfcc::CudaMfcc(const MfccOptions &opts)
-    : MfccComputer(opts),
+CudaMfcc::CudaMfcc(const MfccOptions &opts):
+    MfccComputer(opts),
       cu_lifter_coeffs_(lifter_coeffs_),
-      cu_dct_matrix_(dct_matrix_),
-      window_function_(opts.frame_opts) {
+      cu_dct_matrix_(dct_matrix_) {
+  {
+    Vector<BaseFloat> temp;
+    InitFeatureWindowFunction(opts.frame_opts, &temp);
+    window_function_.Swap(&temp);
+  }
   const MelBanks *mel_banks = GetMelBanks(1.0);
   const std::vector<std::pair<int32, Vector<BaseFloat>>> &bins =
       mel_banks->GetBins();
@@ -408,7 +410,7 @@ void CudaMfcc::ExtractWindows(int32_t num_frames, int64 sample_offset,
 
   extract_window_kernel<<<num_frames, CU1DBLOCK>>>(
       opts.WindowShift(), frame_length, frame_length_padded, opts.WindowSize(),
-      opts.snip_edges, sample_offset, wave.Data(), wave.Dim(),
+      sample_offset, wave.Data(), wave.Dim(),
       cu_windows_.Data(), cu_windows_.Stride());
   CU_SAFE_CALL(cudaGetLastError());
 }
@@ -422,9 +424,9 @@ void CudaMfcc::ProcessWindows(int num_frames,
   KALDI_ASSERT(fft_num_frames % fft_size_ == 0);
 
   process_window_kernel<<<num_frames, CU1DBLOCK>>>(
-      frame_length_, opts.dither, std::numeric_limits<float>::epsilon(),
+      frame_length_, std::numeric_limits<float>::epsilon(),
       opts.remove_dc_offset, opts.preemph_coeff, NeedRawLogEnergy(),
-      log_energy_pre_window->Data(), window_function_.cu_window.Data(),
+      log_energy_pre_window->Data(), window_function_.Data(),
       tmp_window_.Data(), tmp_window_.Stride(), cu_windows_.Data(),
       cu_windows_.Stride());
 
diff --git a/src/cudafeat/feature-mfcc-cuda.h b/src/cudafeat/feature-mfcc-cuda.h
index 5fabc4c8fe4..fe6e737421c 100644
--- a/src/cudafeat/feature-mfcc-cuda.h
+++ b/src/cudafeat/feature-mfcc-cuda.h
@@ -22,9 +22,9 @@
 #include <cufft.h>
 #endif
 
-#include "cudafeat/feature-window-cuda.h"
 #include "cudamatrix/cu-matrix.h"
 #include "cudamatrix/cu-vector.h"
+#include "feat/feature-window.h"
 #include "feat/feature-mfcc.h"
 
 namespace kaldi {
@@ -59,7 +59,7 @@ class CudaMfcc : public MfccComputer {
 
   int frame_length_, padded_length_, fft_length_, fft_size_;
   cufftHandle plan_;
-  CudaFeatureWindowFunction window_function_;
+  CuVector<BaseFloat> window_function_;
 
   int bin_size_;
   int32 *offsets_, *sizes_;
@@ -69,6 +69,7 @@ class CudaMfcc : public MfccComputer {
   // for sanity checking cufft
   int32_t stride_, tmp_stride_;
 };
-}
+
+}  // namespace kaldi
 
 #endif
diff --git a/src/cudafeat/feature-window-cuda.cu b/src/cudafeat/feature-window-cuda.cu
deleted file mode 100644
index 0c98bee30ba..00000000000
--- a/src/cudafeat/feature-window-cuda.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-// cudafeat/feature-window-cuda.cu
-//
-// Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
-// Justin Luitjens
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <nvToolsExt.h>
-#include "cudafeat/feature-window-cuda.h"
-#include "matrix/matrix-functions.h"
-
-namespace kaldi {
-
-CudaFeatureWindowFunction::CudaFeatureWindowFunction(
-    const FrameExtractionOptions &opts) {
-  nvtxRangePushA("CudaFeatureWindowFunction::CudaFeatureWindowFunction");
-  int32 frame_length = opts.WindowSize();
-
-  // Create CPU feature window
-  FeatureWindowFunction feature_window(opts);
-
-  // Copy into GPU memory
-  cu_window.Resize(frame_length, kUndefined);
-  cu_window.CopyFromVec(feature_window.window);
-  nvtxRangePop();
-}
-}  // namespace kaldi
diff --git a/src/cudafeat/feature-window-cuda.h b/src/cudafeat/feature-window-cuda.h
deleted file mode 100644
index ff749a855b9..00000000000
--- a/src/cudafeat/feature-window-cuda.h
+++ /dev/null
@@ -1,38 +0,0 @@
-// cudafeat/feature-window-cuda.h
-//
-// Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
-// Justin Luitjens
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_CUDAFEAT_FEATURE_WINDOW_CUDA_H_
-#define KALDI_CUDAFEAT_FEATURE_WINDOW_CUDA_H_
-
-#include "cudamatrix/cu-matrix.h"
-#include "cudamatrix/cu-vector.h"
-#include "feat/feature-window.h"
-
-namespace kaldi {
-
-// This struct stores a feature window on the device.
-// Behind the scense it just computes a feature window on
-// the host and then copies it into device memory.
-struct CudaFeatureWindowFunction {
-  CudaFeatureWindowFunction() {}
-  explicit CudaFeatureWindowFunction(const FrameExtractionOptions &opts);
-  CuVector<float> cu_window;
-};
-
-}  // namespace kaldi
-
-#endif  // KALDI_CUDAFEAT_FEATURE_WINDOW_CUDA_H_

From 5b0c098dd8dd7afdd1b4b128f87624612555424e Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Fri, 21 Jun 2019 20:35:50 -0400
Subject: [PATCH 141/163] [src] Various changes to get it to compile

---
 src/bin/Makefile                              |  10 +-
 src/chain/chain-supervision-test.cc           |   2 +-
 .../batched-threaded-nnet3-cuda-pipeline.cc   |   2 +-
 .../batched-threaded-nnet3-cuda-pipeline.h    |   4 +-
 src/cudadecoder/cuda-fst.cc                   |  12 +-
 src/cudadecoder/cuda-fst.h                    |  14 +-
 src/cudadecoder/decodable-cumatrix.cc         |   2 +-
 src/cudadecoder/decodable-cumatrix.h          |   4 +-
 src/cudadecoderbin/batched-wav-nnet3-cuda.cc  |   2 +-
 src/cudafeat/feature-mfcc-cuda.cu             | 139 ++++--------------
 src/fstext/fstext-utils-inl.h                 |  82 ++---------
 src/fstext/fstext-utils-test.cc               |  20 +--
 src/fstext/fstext-utils.h                     |  27 +---
 src/hmm/hmm-test-utils.cc                     |  32 ++--
 src/hmm/hmm-test-utils.h                      |   2 +-
 src/hmm/hmm-utils-test.cc                     |   7 +-
 src/hmm/hmm-utils.cc                          | 104 ++++++-------
 src/hmm/transitions-test.cc                   |   2 +-
 src/hmm/transitions.cc                        |  16 +-
 src/lat/word-align-lattice-lexicon-test.cc    |   2 +-
 src/tree/build-tree-test.cc                   |  33 ++---
 src/tree/build-tree.cc                        |  12 +-
 src/tree/build-tree.h                         |  15 +-
 src/tree/context-dep.h                        |  14 +-
 24 files changed, 205 insertions(+), 354 deletions(-)

diff --git a/src/bin/Makefile b/src/bin/Makefile
index 6ab96434246..c088e4da76b 100644
--- a/src/bin/Makefile
+++ b/src/bin/Makefile
@@ -30,10 +30,14 @@ OBJFILES =
 ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../lm/kaldi-lm.a \
           ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../tree/kaldi-tree.a ../cudamatrix/kaldi-cudamatrix.a \
+          ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
           ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a
 
 
-TESTFILES =
+#LDLIBS += $(CUDA_LDLIBS)
 
-include ../makefiles/default_rules.mk
+
+# # TESTFILES =
+
+# # include ../makefiles/default_rules.mk
diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc
index 10385e2c4f2..8af77af5d12 100644
--- a/src/chain/chain-supervision-test.cc
+++ b/src/chain/chain-supervision-test.cc
@@ -456,7 +456,7 @@ void ChainDenominatorTest(const DenominatorGraph &den_graph) {
 
 void ChainSupervisionTest() {
   ContextDependency *ctx_dep;
-  Transitions *trans_model = GenRandTransitionModel(&ctx_dep);
+  Transitions *trans_model = GenRandTransitions(&ctx_dep);
   const std::vector<int32> &phones = trans_model->GetPhones();
 
   int32 subsample_factor = RandInt(1, 3);
diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc
index 34c7ea06a9d..cd58551ea1e 100644
--- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc
+++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc
@@ -28,7 +28,7 @@ namespace cuda_decoder {
 
 void BatchedThreadedNnet3CudaPipeline::Initialize(
     const fst::Fst<fst::StdArc> &decode_fst, const nnet3::AmNnetSimple &am_nnet,
-    const TransitionModel &trans_model) {
+    const Transitions &trans_model) {
   KALDI_LOG << "BatchedThreadedNnet3CudaPipeline Initialize with "
             << config_.num_control_threads << " control threads, "
             << config_.num_worker_threads << " worker threads"
diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h
index 6754f9a2442..72687879b18 100644
--- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h
+++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h
@@ -105,7 +105,7 @@ class BatchedThreadedNnet3CudaPipeline {
   // allocates reusable objects that are common across all decodings
   void Initialize(const fst::Fst<fst::StdArc> &decode_fst,
                   const nnet3::AmNnetSimple &nnet,
-                  const TransitionModel &trans_model);
+                  const Transitions &trans_model);
 
   // deallocates reusable objects
   void Finalize();
@@ -243,7 +243,7 @@ class BatchedThreadedNnet3CudaPipeline {
   const BatchedThreadedNnet3CudaPipelineConfig &config_;
 
   CudaFst cuda_fst_;
-  const TransitionModel *trans_model_;
+  const Transitions *trans_model_;
   const nnet3::AmNnetSimple *am_nnet_;
   nnet3::DecodableNnetSimpleLoopedInfo *decodable_info_;
   OnlineNnet2FeaturePipelineInfo *feature_info_;
diff --git a/src/cudadecoder/cuda-fst.cc b/src/cudadecoder/cuda-fst.cc
index 6f899d87321..70f745f286a 100644
--- a/src/cudadecoder/cuda-fst.cc
+++ b/src/cudadecoder/cuda-fst.cc
@@ -113,21 +113,21 @@ void CudaFst::PopulateArcs(const fst::Fst<StdArc> &fst) {
       h_arc_id_ilabels_[idx] = arc.ilabel;
       // For now we consider id indexing == pdf indexing
       // If the two are differents, we'll call ApplyTransModelOnIlabels with a
-      // TransitionModel
+      // Transitions
       h_arc_pdf_ilabels_[idx] = arc.ilabel;
       h_arc_olabels_[idx] = arc.olabel;
     }
   }
 }
 
-void CudaFst::ApplyTransitionModelOnIlabels(
-    const TransitionModel &trans_model) {
+void CudaFst::ApplyTransitionsOnIlabels(
+    const Transitions &trans_model) {
   // Converting ilabel here, to avoid reindexing when reading nnet3 output
   // We only need to convert the emitting arcs
   // The emitting arcs are the first e_count_ arcs
   for (int iarc = 0; iarc < e_count_; ++iarc)
     h_arc_pdf_ilabels_[iarc] =
-        trans_model.TransitionIdToPdf(h_arc_id_ilabels_[iarc]);
+        trans_model.InfoForTransitionId(h_arc_id_ilabels_[iarc]).pdf_id;
 }
 
 void CudaFst::CopyDataToDevice() {
@@ -153,7 +153,7 @@ void CudaFst::CopyDataToDevice() {
 }
 
 void CudaFst::Initialize(const fst::Fst<StdArc> &fst,
-                         const TransitionModel *trans_model) {
+                         const Transitions *trans_model) {
   nvtxRangePushA("CudaFst constructor");
   start_ = fst.Start();
 
@@ -164,7 +164,7 @@ void CudaFst::Initialize(const fst::Fst<StdArc> &fst,
   // at the end of Initialize
   h_arc_pdf_ilabels_.resize(arc_count_);
   PopulateArcs(fst);
-  if (trans_model) ApplyTransitionModelOnIlabels(*trans_model);
+  if (trans_model) ApplyTransitionsOnIlabels(*trans_model);
 
   KALDI_ASSERT(d_e_offsets_);
   KALDI_ASSERT(d_ne_offsets_);
diff --git a/src/cudadecoder/cuda-fst.h b/src/cudadecoder/cuda-fst.h
index 1dac627755b..8c07bb4936d 100644
--- a/src/cudadecoder/cuda-fst.h
+++ b/src/cudadecoder/cuda-fst.h
@@ -20,7 +20,7 @@
 #include "cudadecoder/cuda-decoder-common.h"
 #include "cudamatrix/cu-device.h"
 #include "lat/kaldi-lattice.h"
-#include "nnet3/decodable-online-looped.h"  // TransitionModel
+#include "nnet3/decodable-online-looped.h"  // Transitions
 
 namespace kaldi {
 namespace cuda_decoder {
@@ -52,13 +52,13 @@ class CudaFst {
         d_final_(nullptr){};
   // Creates a CSR representation of the FST,
   // then copies it to the GPU
-  // If a TransitionModel is passed, we'll use it to convert the ilabels id
+  // If a Transitions object is passed, we'll use it to convert the ilabels id
   // indexes into pdf indexes
-  // If no TransitionModel is passed, we'll assume TransitionModel == identity
-  // Important: The CudaDecodable won't apply the TransitionModel. If you use a
-  // TransitionModel, you need to apply it now
+  // If no Transitions object is passed, we'll assume Transitions == identity
+  // Important: The CudaDecodable won't apply the Transitions. If you use a
+  // Transitions object, you need to apply it now
   void Initialize(const fst::Fst<StdArc> &fst,
-                  const TransitionModel *trans_model = NULL);
+                  const Transitions *trans_model = NULL);
   void Finalize();
 
   inline uint32_t NumStates() const { return num_states_; }
@@ -75,7 +75,7 @@ class CudaFst {
   // Converting the id ilabels into pdf ilabels using the transition model
   // It allows the CudaDecoder to read the acoustic model loglikelihoods at the
   // right indexes
-  void ApplyTransitionModelOnIlabels(const TransitionModel &trans_model);
+  void ApplyTransitionsOnIlabels(const Transitions &trans_model);
   // Copies fst to device into the pre-allocated datastructures
   void CopyDataToDevice();
   // Total number of states
diff --git a/src/cudadecoder/decodable-cumatrix.cc b/src/cudadecoder/decodable-cumatrix.cc
index d7c1d0359a5..4704238852c 100644
--- a/src/cudadecoder/decodable-cumatrix.cc
+++ b/src/cudadecoder/decodable-cumatrix.cc
@@ -24,7 +24,7 @@ namespace kaldi {
 namespace cuda_decoder {
 
 DecodableCuMatrixMapped::DecodableCuMatrixMapped(
-    const TransitionModel &tm, const CuMatrixBase<BaseFloat> &likes,
+    const Transitions &tm, const CuMatrixBase<BaseFloat> &likes,
     int32 frame_offset)
     : trans_model_(tm), likes_(&likes), frame_offset_(frame_offset) {
   if (likes.NumCols() != tm.NumPdfs())
diff --git a/src/cudadecoder/decodable-cumatrix.h b/src/cudadecoder/decodable-cumatrix.h
index d34079cc9c7..aaef4c9fd3f 100644
--- a/src/cudadecoder/decodable-cumatrix.h
+++ b/src/cudadecoder/decodable-cumatrix.h
@@ -35,7 +35,7 @@ class DecodableCuMatrixMapped : public CudaDecodableInterface {
   // This constructor creates an object that will not delete "likes" when done.
   // the frame_offset is the frame the row 0 of 'likes' corresponds to, would be
   // greater than one if this is not the first chunk of likelihoods.
-  DecodableCuMatrixMapped(const TransitionModel &tm,
+  DecodableCuMatrixMapped(const Transitions &tm,
                           const CuMatrixBase<BaseFloat> &likes,
                           int32 frame_offset = 0);
 
@@ -57,7 +57,7 @@ class DecodableCuMatrixMapped : public CudaDecodableInterface {
   virtual BaseFloat *GetLogLikelihoodsCudaPointer(int32 subsampled_frame);
 
 private:
-  const TransitionModel &trans_model_; // for tid to pdf mapping
+  const Transitions &trans_model_; // for tid to pdf mapping
   const CuMatrixBase<BaseFloat> *likes_;
 
   int32 frame_offset_;
diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc
index a59c1e2a1b1..ba082ade062 100644
--- a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc
+++ b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc
@@ -181,7 +181,7 @@ int main(int argc, char *argv[]) {
     std::string nnet3_rxfilename = po.GetArg(1), fst_rxfilename = po.GetArg(2),
                 wav_rspecifier = po.GetArg(3), clat_wspecifier = po.GetArg(4);
 
-    TransitionModel trans_model;
+    Transitions trans_model;
     nnet3::AmNnetSimple am_nnet;
 
     // read transition model and nnet
diff --git a/src/cudafeat/feature-mfcc-cuda.cu b/src/cudafeat/feature-mfcc-cuda.cu
index 925456e862e..84b9241a6b0 100644
--- a/src/cudafeat/feature-mfcc-cuda.cu
+++ b/src/cudafeat/feature-mfcc-cuda.cu
@@ -25,34 +25,22 @@
 // Each thread block processes a unique frame
 // threads in the same threadblock collaborate to
 // compute the frame together.
-__global__ void apply_lifter_and_floor_energy(
-    int num_frames, int num_cols, float cepstral_lifter, bool use_energy,
-    float energy_floor, float *log_energy, float *lifter_coeffs,
+__global__ void include_log_energy(
+    int num_frames,
+    float energy_floor,
+    const float *log_energy,
     float *features, int32_t ldf) {
-  int thread_id = threadIdx.x;
   int frame = blockIdx.x;
 
   float *feats = features + frame * ldf;
 
-  // apply lifter coefficients
-  if (cepstral_lifter != 0.0f) {
-    for (int c = thread_id; c < num_cols; c += CU1DBLOCK) {
-      float lift = lifter_coeffs[c];
-      float f = feats[c];
-      feats[c] = f * lift;
-    }
-  }
-
-  // Thread 0 for each frame will apply energy
-  if (use_energy && thread_id == 0) {
-    float energy = log_energy[frame];
-    float log_energy_floor = log(energy_floor);
+  float energy = log_energy[frame];
+  float log_energy_floor = log(energy_floor);
 
-    if (energy_floor > 0.0f && energy < log_energy_floor) {
-      energy = log_energy_floor;
-    }
-    feats[0] = energy;
+  if (energy_floor > 0.0f && energy < log_energy_floor) {
+    energy = log_energy_floor;
   }
+  feats[0] = energy;
 }
 
 // Each threadblock computes a different row of the matrix.
@@ -132,9 +120,8 @@ __global__ void mel_banks_compute_kernel(int32_t num_frames, float energy_floor,
 }
 
 __global__ void process_window_kernel(
-    int frame_length, float dither, float energy_floor, bool remove_dc_offset,
-    float preemph_coeff, bool need_raw_log_energy, float *log_energy_pre_window,
-    const float *windowing, float *tmp_windows, int32_t ldt, float *windows,
+    int frame_length, bool remove_dc_offset,
+    const float *windowing, float *windows,
     int32_t ldw) {
   // Specialize WarpReduce for type float
   typedef cub::BlockReduce<float, CU1DBLOCK> BlockReduce;
@@ -142,7 +129,6 @@ __global__ void process_window_kernel(
 
   int thread_id = threadIdx.x;
   int row = blockIdx.x;
-  float *tmp_window = tmp_windows + row * ldt;
   float *window = windows + row * ldw;
 
   __shared__ float ssum;
@@ -151,26 +137,24 @@ __global__ void process_window_kernel(
   float wdot = 0;
 
   for (int idx = thread_id; idx < frame_length; idx += CU1DBLOCK) {
-    // tmp_window contains optional dither.  Apply that on read.
     float wval = window[idx];
-    if (dither != 0.0f) {
-      wval += tmp_window[idx] * dither;
-    }
     // compute local sum for removing dc offset
     sum += wval;
     // compute dot product for log energy
     wdot += wval * wval;
 
     float windowing_mul = 1;
-    if (remove_dc_offset == false && preemph_coeff == 0.0f) {
+    if (remove_dc_offset == false) {
       // we are done here so set windowing multiplication on write.
       windowing_mul = windowing[idx];
     }
-
     // write dithered output
     window[idx] = wval * windowing_mul;
   }
   __syncthreads();
+  // CAUTION (dp): when various configs were removed I tried to simplify this code
+  // by removing things that weren't supported.  Its structure may not make sense
+  // any more even if I did that correctly.
   if (remove_dc_offset) {
     // we will recompute this below
     wdot = 0.0f;
@@ -184,65 +168,15 @@ __global__ void process_window_kernel(
     sum = -ssum / frame_length;
     for (int idx = thread_id; idx < frame_length; idx += CU1DBLOCK) {
       float windowing_mul = 1;
-      float *out = window;
-      if (preemph_coeff == 0.0f) {
-        // we are done here so apply windowing
-        windowing_mul = windowing[idx];
-      } else {
-        // write to temp window as we will copy back into window
-        // when doing pre-emphasis
-        out = tmp_window;
-      }
+      windowing_mul = windowing[idx];
       // updated window value
       float wval = window[idx] + sum;
 
       // compute new dot product with dc offset removed
       wdot += wval * wval;
 
-      assert(windowing_mul == 1);
       // write output
-      out[idx] = wval * windowing_mul;
-    }
-  }
-  __syncthreads();
-
-  // if pointer is not NULL we will set energy to either
-  // the computed energy or 0 depending on need_raw_log_energy
-  if (log_energy_pre_window != NULL) {
-    float energy = 0.0f;
-
-    if (need_raw_log_energy) {
-      // must sync to use retemp_storage
-      if (remove_dc_offset) __syncthreads();
-      // use cub to reduce
-      wdot = BlockReduce(temp_storage).Sum(wdot);
-
-      energy = max(wdot, energy_floor);
-    }
-
-    if (thread_id == 0) {
-      log_energy_pre_window[row] = log(energy);
-    }
-  }
-
-  // TODO this could be more efficient using shared memory instead of
-  // tmp_window.
-  if (preemph_coeff != 0.0f) {
-    // wait for tmp_window to be computed
-    __threadfence();
-    __syncthreads();
-    // starting thread idx at 0 to keep writes aligned.
-    // unaligned reads are less painful then unaligned writes
-    for (int idx = thread_id; idx < frame_length; idx += CU1DBLOCK) {
-      float wval = tmp_window[idx];
-      float prev_window = wval;
-      if (idx > 0) {
-        prev_window = tmp_window[idx - 1];
-      }
-      // use __fmul_rn to match CPU
-      // window[idx] = (wval - preemph_coeff*prev_window) * windowing[idx];
-      window[idx] =
-          (wval - __fmul_rn(preemph_coeff, prev_window)) * windowing[idx];
+      window[idx] = wval * windowing_mul;
     }
   }
 }
@@ -339,7 +273,6 @@ namespace kaldi {
 
 CudaMfcc::CudaMfcc(const MfccOptions &opts):
     MfccComputer(opts),
-      cu_lifter_coeffs_(lifter_coeffs_),
       cu_dct_matrix_(dct_matrix_) {
   {
     Vector<BaseFloat> temp;
@@ -424,11 +357,9 @@ void CudaMfcc::ProcessWindows(int num_frames,
   KALDI_ASSERT(fft_num_frames % fft_size_ == 0);
 
   process_window_kernel<<<num_frames, CU1DBLOCK>>>(
-      frame_length_, std::numeric_limits<float>::epsilon(),
-      opts.remove_dc_offset, opts.preemph_coeff, NeedRawLogEnergy(),
-      log_energy_pre_window->Data(), window_function_.Data(),
-      tmp_window_.Data(), tmp_window_.Stride(), cu_windows_.Data(),
-      cu_windows_.Stride());
+      frame_length_, opts.remove_dc_offset,
+      window_function_.Data(),
+      cu_windows_.Data(), cu_windows_.Stride());
 
   CU_SAFE_CALL(cudaGetLastError());
 }
@@ -437,11 +368,10 @@ void CudaMfcc::ComputeFinalFeatures(int num_frames, BaseFloat vtln_wrap,
                                     CuVector<BaseFloat> *cu_signal_log_energy,
                                     CuMatrix<BaseFloat> *cu_features) {
   Vector<float> tmp;
-  assert(opts_.htk_compat == false);
 
   if (num_frames == 0) return;
 
-  if (opts_.use_energy && !opts_.raw_energy) {
+  if (opts_.use_energy) {
     dot_log_kernel<<<num_frames, CU1DBLOCK>>>(
         num_frames, cu_windows_.NumCols(), cu_windows_.Data(),
         cu_windows_.Stride(), cu_signal_log_energy->Data());
@@ -483,11 +413,16 @@ void CudaMfcc::ComputeFinalFeatures(int num_frames, BaseFloat vtln_wrap,
   cu_features->AddMatMat(1.0, cu_mel_energies_, kNoTrans, cu_dct_matrix_,
                          kTrans, 0.0);
 
-  apply_lifter_and_floor_energy<<<num_frames, CU1DBLOCK>>>(
-      cu_features->NumRows(), cu_features->NumCols(), opts_.cepstral_lifter,
-      opts_.use_energy, opts_.energy_floor, cu_signal_log_energy->Data(),
-      cu_lifter_coeffs_.Data(), cu_features->Data(), cu_features->Stride());
-  CU_SAFE_CALL(cudaGetLastError());
+  if (opts_.use_energy) {
+    // yes, using 1 thread per block does not make sense.
+    // this code was adapted lazily from previous code.
+    include_log_energy<<<num_frames, 1>>>(
+        cu_features->NumRows(),
+        opts_.energy_floor,
+        cu_signal_log_energy->Data(),
+        cu_features->Data(), cu_features->Stride());
+    CU_SAFE_CALL(cudaGetLastError());
+  }
 }
 
 void CudaMfcc::ComputeFeatures(const CuVectorBase<BaseFloat> &cu_wave,
@@ -499,7 +434,6 @@ void CudaMfcc::ComputeFeatures(const CuVectorBase<BaseFloat> &cu_wave,
   // compute fft frames by rounding up to a multiple of fft_size_
   int fft_num_frames = num_frames + (fft_size_ - num_frames % fft_size_);
   int feature_dim = Dim();
-  bool use_raw_log_energy = NeedRawLogEnergy();
 
   CuVector<BaseFloat> raw_log_energies;
   raw_log_energies.Resize(num_frames, kUndefined);
@@ -511,17 +445,6 @@ void CudaMfcc::ComputeFeatures(const CuVectorBase<BaseFloat> &cu_wave,
   tmp_window_.Resize(fft_num_frames, padded_length_ + 2, kUndefined,
                      kStrideEqualNumCols);
 
-  if (frame_opts.dither != 0.0f) {
-    // Calling cu-rand directly
-    // CuRand class works on CuMatrixBase which must
-    // assume that the matrix is part of a larger matrix
-    // Doing this directly avoids unecessary memory copies
-    CURAND_SAFE_CALL(
-        curandGenerateNormal(GetCurandHandle(), tmp_window_.Data(),
-                             tmp_window_.NumRows() * tmp_window_.Stride(),
-                             0.0 /*mean*/, 1.0 /*stddev*/));
-  }
-
   // Extract Windows
   ExtractWindows(num_frames, 0, cu_wave, frame_opts);
 
diff --git a/src/fstext/fstext-utils-inl.h b/src/fstext/fstext-utils-inl.h
index 681096d0cbc..bbcf68ec2ec 100644
--- a/src/fstext/fstext-utils-inl.h
+++ b/src/fstext/fstext-utils-inl.h
@@ -524,21 +524,27 @@ bool FollowingInputSymbolsAreSameClass(bool end_is_epsilon, const Fst<Arc> &fst,
   return true;
 }
 
-// TODO(galv): Confirm that start_is_epsilon is no longer necessary
-// now that we longer allow epsilon transitions.
 template<class Arc>
-void MakePrecedingInputSymbolsSame(MutableFst<Arc> *fst) {
+void MakePrecedingInputSymbolsSame(bool start_is_epsilon, MutableFst<Arc> *fst) {
   IdentityFunction<typename Arc::Label> f;
-  MakePrecedingInputSymbolsSameClass(fst, f);
+  MakePrecedingInputSymbolsSameClass(start_is_epsilon, fst, f);
 }
 
 template<class Arc, class F>
-void MakePrecedingInputSymbolsSameClass(MutableFst<Arc> *fst, const F &f) {
+void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst<Arc> *fst, const F &f) {
   typedef typename F::Result ClassType;
   typedef typename Arc::StateId StateId;
   typedef typename Arc::Weight Weight;
   vector<ClassType> classes;
   ClassType noClass = f(kNoLabel);
+  ClassType epsClass = f(0);
+  if (start_is_epsilon) {  // treat having-start-state as epsilon in-transition.
+    StateId start_state = fst->Start();
+    if (start_state < 0 || start_state == kNoStateId) // empty FST.
+      return;
+    classes.resize(start_state+1, noClass);
+    classes[start_state] = epsClass;
+  }
 
   // Find bad states (states with multiple input-symbols into them).
   std::set<StateId> bad_states;  // states that we need to change.
@@ -600,73 +606,14 @@ void MakePrecedingInputSymbolsSameClass(MutableFst<Arc> *fst, const F &f) {
   }
 }
 
-template<class Arc>
-void MakeFollowingInputSymbolsSame(bool end_is_epsilon, MutableFst<Arc> *fst) {
-  IdentityFunction<typename Arc::Label> f;
-  MakeFollowingInputSymbolsSameClass(end_is_epsilon, fst, f);
-}
-
-template<class Arc, class F>
-void MakeFollowingInputSymbolsSameClass(bool end_is_epsilon, MutableFst<Arc> *fst, const F &f) {
-  typedef typename Arc::StateId StateId;
-  typedef typename Arc::Weight Weight;
-  typedef typename F::Result ClassType;
-  vector<StateId> bad_states;
-  ClassType noClass = f(kNoLabel);
-  ClassType epsClass = f(0);
-  for (StateIterator<Fst<Arc> > siter(*fst); !siter.Done(); siter.Next()) {
-    StateId s = siter.Value();
-    ClassType c = noClass;
-    bool bad = false;
-    for (ArcIterator<Fst<Arc> > aiter(*fst, s); !aiter.Done(); aiter.Next()) {
-      const Arc &arc = aiter.Value();
-      if (c == noClass)
-        c = f(arc.ilabel);
-      else
-        if (c != f(arc.ilabel)) {
-          bad = true;
-          break;
-        }
-    }
-    if (end_is_epsilon && c != noClass &&
-       c != epsClass && fst->Final(s) != Weight::Zero())
-      bad = true;
-    if (bad)
-      bad_states.push_back(s);
-  }
-  vector<Arc> my_arcs;
-  for (size_t i = 0; i < bad_states.size(); i++) {
-    StateId s = bad_states[i];
-    my_arcs.clear();
-    for (ArcIterator<MutableFst<Arc> > aiter(*fst, s); !aiter.Done(); aiter.Next())
-      my_arcs.push_back(aiter.Value());
-
-    for (size_t j = 0; j < my_arcs.size(); j++) {
-      Arc &arc = my_arcs[j];
-      if (arc.ilabel != 0) {
-        StateId newstate = fst->AddState();
-        // Create a new state for each non-eps arc in original FST, out of each bad state.
-        // Not as optimal as it could be, but does avoid some complicated weight-pushing
-        // issues in which, to maintain stochasticity, we would have to know which semiring
-        // we want to maintain stochasticity in.
-        fst->AddArc(newstate, Arc(arc.ilabel, 0, Weight::One(), arc.nextstate));
-        MutableArcIterator<MutableFst<Arc> > maiter(fst, s);
-        maiter.Seek(j);
-        maiter.SetValue(Arc(0, arc.olabel, arc.weight, newstate));
-      }
-    }
-  }
-}
-
 
 template<class Arc>
-std::unique_ptr<VectorFst<Arc>>
-MakeLoopFst(const vector<std::unique_ptr<const ExpandedFst<Arc>>> &fsts) {
+VectorFst<Arc>* MakeLoopFst(const vector<const ExpandedFst<Arc> *> &fsts) {
   typedef typename Arc::Weight Weight;
   typedef typename Arc::StateId StateId;
   typedef typename Arc::Label Label;
 
-  std::unique_ptr<VectorFst<Arc>> ans(new VectorFst<Arc>);
+  VectorFst<Arc> *ans = new VectorFst<Arc>;
   StateId loop_state = ans->AddState();  // = 0.
   ans->SetStart(loop_state);
   ans->SetFinal(loop_state, Weight::One());
@@ -676,8 +623,7 @@ MakeLoopFst(const vector<std::unique_ptr<const ExpandedFst<Arc>>> &fsts) {
   unordered_map<const ExpandedFst<Arc> *, Arc> cache;
 
   for (Label i = 0; i < static_cast<Label>(fsts.size()); i++) {
-    // TODO(galv): I feel like this won't work with my unique_ptr usage. Call .get()?
-    const ExpandedFst<Arc> *fst = fsts[i].get();
+    const ExpandedFst<Arc> *fst = fsts[i];
     if (fst == NULL) continue;
     { // optimization with cache: helpful if some members of "fsts" may
       // contain the same pointer value (e.g. in GetHTransducer).
diff --git a/src/fstext/fstext-utils-test.cc b/src/fstext/fstext-utils-test.cc
index 96ccf67366e..0f4a2ae1e65 100644
--- a/src/fstext/fstext-utils-test.cc
+++ b/src/fstext/fstext-utils-test.cc
@@ -214,19 +214,13 @@ template<class Arc>  void TestAcceptorMinimize() {
 template<class Arc>  void TestMakeSymbolsSame() {
 
   VectorFst<Arc> *fst = RandFst<Arc>();
-  bool foll = (kaldi::Rand() % 2 == 0);
   bool is_symbol = (kaldi::Rand() % 2 == 0);
 
 
   VectorFst<Arc> fst2(*fst);
 
-  if (foll) {
-    MakeFollowingInputSymbolsSame(is_symbol, &fst2);
-    assert(FollowingInputSymbolsAreSame(is_symbol, fst2));
-  } else {
-    MakePrecedingInputSymbolsSame(is_symbol, &fst2);
-    assert(PrecedingInputSymbolsAreSame(is_symbol, fst2));
-  }
+  MakePrecedingInputSymbolsSame(is_symbol, &fst2);
+  assert(PrecedingInputSymbolsAreSame(is_symbol, fst2));
 
 
   assert(RandEquivalent(*fst, fst2, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/));
@@ -251,20 +245,14 @@ struct TestFunctor {
 template<class Arc>  void TestMakeSymbolsSameClass() {
 
   VectorFst<Arc> *fst = RandFst<Arc>();
-  bool foll = (kaldi::Rand() % 2 == 0);
   bool is_symbol = (kaldi::Rand() % 2 == 0);
 
 
   VectorFst<Arc> fst2(*fst);
 
   TestFunctor<Arc> f;
-  if (foll) {
-    MakeFollowingInputSymbolsSameClass(is_symbol, &fst2, f);
-    assert(FollowingInputSymbolsAreSameClass(is_symbol, fst2, f));
-  } else {
-    MakePrecedingInputSymbolsSameClass(is_symbol, &fst2, f);
-    assert(PrecedingInputSymbolsAreSameClass(is_symbol, fst2, f));
-  }
+  MakePrecedingInputSymbolsSameClass(is_symbol, &fst2, f);
+  assert(PrecedingInputSymbolsAreSameClass(is_symbol, fst2, f));
 
   assert(RandEquivalent(*fst, fst2, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/));
 
diff --git a/src/fstext/fstext-utils.h b/src/fstext/fstext-utils.h
index 2b94ca8c456..0be6725a44f 100644
--- a/src/fstext/fstext-utils.h
+++ b/src/fstext/fstext-utils.h
@@ -24,7 +24,6 @@
 #define KALDI_FSTEXT_FSTEXT_UTILS_H_
 #include <algorithm>
 #include <map>
-#include <memory>
 #include <set>
 #include <vector>
 #include <fst/fstlib.h>
@@ -253,31 +252,15 @@ bool FollowingInputSymbolsAreSameClass(bool end_is_epsilon, const Fst<Arc> &fst,
 /// that have differing input symbols going in, and inserting, for each of
 /// the preceding arcs with non-epsilon input symbol, a new dummy state that
 /// has an epsilon link to the fst state.
+/// If "start_is_epsilon", ensure that start-state can have only epsilon-links
+/// into it.
 template<class Arc>
-void MakePrecedingInputSymbolsSame(MutableFst<Arc> *fst);
+void MakePrecedingInputSymbolsSame(bool start_is_epsilon, MutableFst<Arc> *fst);
 
 
 /// As MakePrecedingInputSymbolsSame, but takes a functor object that maps labels to classes.
 template<class Arc, class F>
-void MakePrecedingInputSymbolsSameClass(MutableFst<Arc> *fst, const F &f);
-
-
-/// MakeFollowingInputSymbolsSame ensures that all arcs exiting any given fst
-/// state have the same input symbol.  It does this by detecting states that have
-/// differing input symbols on arcs that exit it, and inserting, for each of the
-/// following arcs with non-epsilon input symbol, a new dummy state that has an
-/// input-epsilon link from the fst state.  The output symbol and weight stay on the
-/// link to the dummy state (in order to keep the FST output-deterministic and
-/// stochastic, if it already was).
-/// If end_is_epsilon, treat "being a final-state" like having an epsilon output
-/// link.
-template<class Arc>
-void MakeFollowingInputSymbolsSame(bool end_is_epsilon, MutableFst<Arc> *fst);
-
-/// As MakeFollowingInputSymbolsSame, but takes a functor object that maps labels to classes.
-template<class Arc, class F>
-void MakeFollowingInputSymbolsSameClass(bool end_is_epsilon, MutableFst<Arc> *fst, const F &f);
-
+void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst<Arc> *fst, const F &f);
 
 
 
@@ -303,7 +286,7 @@ void MakeFollowingInputSymbolsSameClass(bool end_is_epsilon, MutableFst<Arc> *fs
 /// less well optimized and would have a lot of final-states.
 
 template<class Arc>
-std::unique_ptr<VectorFst<Arc>> MakeLoopFst(const vector<std::unique_ptr<const ExpandedFst<Arc>>> &fsts);
+VectorFst<Arc>* MakeLoopFst(const vector<const ExpandedFst<Arc> *> &fsts);
 
 
 /// ApplyProbabilityScale is applicable to FSTs in the log or tropical semiring.
diff --git a/src/hmm/hmm-test-utils.cc b/src/hmm/hmm-test-utils.cc
index a43f7e956c3..808296b3db7 100644
--- a/src/hmm/hmm-test-utils.cc
+++ b/src/hmm/hmm-test-utils.cc
@@ -23,7 +23,7 @@
 
 namespace kaldi {
 
-Transitions *GenRandTransitionModel(ContextDependency **ctx_dep_out) {
+Transitions *GenRandTransitions(ContextDependency **ctx_dep_out) {
   std::vector<int32> phones;
   phones.push_back(1);
   for (int32 i = 2; i < 20; i++)
@@ -182,23 +182,25 @@ void GeneratePathThroughHmm(const Topology &topology,
   path->clear();
   auto const &this_entry = topology.TopologyForPhone(phone); // an FST
   int32 cur_state = 0;  // start-state is always state zero.
+
+  // Note: final_state == num_states - 1 is actually not something
+  // that would be generally true, but it is true for the topologies we
+  // use in the test code.
   int32 num_states = this_entry.NumStates(), final_state = num_states - 1;
   KALDI_ASSERT(num_states > 1);  // there has to be a final nonemitting state
   // that's different from the start state.
-  std::vector<std::pair<int32, int32> > pending_self_loops;
+
   while (cur_state != final_state) {
     int32 num_transitions = this_entry.NumArcs(cur_state),
         arc_index = RandInt(0, num_transitions - 1);
     fst::ArcIterator<fst::StdVectorFst> aiter(this_entry, cur_state);
     aiter.Seek(arc_index);
     auto const &arc(aiter.Value());
-    if (arc.ilabel != -1) {
-      std::pair<int32, int32> pr(cur_state, arc_index);
-      path->push_back(pr);
-    }
+    KALDI_ASSERT(arc.ilabel > 0);
+    std::pair<int32, int32> pr(cur_state, arc_index);
+    path->push_back(pr);
     cur_state = arc.nextstate;
   }
-  KALDI_ASSERT(pending_self_loops.empty());
 }
 
 
@@ -209,8 +211,14 @@ void GenerateRandomAlignment(const ContextDependencyInterface &ctx_dep,
   int32 context_width = ctx_dep.ContextWidth(),
       central_position = ctx_dep.CentralPosition(),
       num_phones = phone_sequence.size();
+
+  auto all_phones = trans_model.GetPhones();
+  int32 model_max_phone = *std::max_element(all_phones.begin(),
+                                            all_phones.end());
   alignment->clear();
   for (int32 i = 0; i < num_phones; i++) {
+    KALDI_ASSERT(phone_sequence[i] > 0
+                 && phone_sequence[i] <= model_max_phone);
     std::vector<int32> context_window;
     context_window.reserve(context_width);
     for (int32 j = i - central_position;
@@ -220,7 +228,7 @@ void GenerateRandomAlignment(const ContextDependencyInterface &ctx_dep,
       else context_window.push_back(0);  // zero for out-of-window phones
     }
     // 'path' is the path through this phone's HMM, represented as
-    // (emitting-HMM-state, transition-index) pairs
+    // (source-HMM-state, transition-index) pairs
     std::vector<std::pair<int32, int32> > path;
     int32 phone = phone_sequence[i];
     GeneratePathThroughHmm(trans_model.GetTopo(), phone, &path);
@@ -241,8 +249,12 @@ void GenerateRandomAlignment(const ContextDependencyInterface &ctx_dep,
 
       bool ans = ctx_dep.Compute(context_window, forward_pdf_class, &forward_pdf_id);
       KALDI_ASSERT(ans && "context-dependency computation failed.");
-      ans = ctx_dep.Compute(context_window, self_loop_pdf_class, &self_loop_pdf_id);
-      KALDI_ASSERT(ans && "context-dependency computation failed.");
+      if (self_loop_pdf_class != -1) {
+        ans = ctx_dep.Compute(context_window, self_loop_pdf_class, &self_loop_pdf_id);
+        KALDI_ASSERT(ans && "context-dependency computation failed.");
+      } else {
+        self_loop_pdf_id = -1;
+      }
       int32 transition_id = trans_model.TupleToTransitionId(phone, hmm_state, arc_index,
                                                             forward_pdf_id, self_loop_pdf_id);
       alignment->push_back(transition_id);
diff --git a/src/hmm/hmm-test-utils.h b/src/hmm/hmm-test-utils.h
index b7748e57338..32c901c1791 100644
--- a/src/hmm/hmm-test-utils.h
+++ b/src/hmm/hmm-test-utils.h
@@ -34,7 +34,7 @@ namespace kaldi {
 // This function returns a randomly generated Transitions object.
 // If 'ctx_dep' is not NULL, it outputs to *ctx_dep a pointer to the
 // tree that was used to generate the transition model.
-Transitions *GenRandTransitionModel(ContextDependency **ctx_dep);
+Transitions *GenRandTransitions(ContextDependency **ctx_dep);
 
 /// This function returns a Topology object giving a normal 3-state topology,
 /// covering all phones in the list "phones".  This is mainly of use in testing
diff --git a/src/hmm/hmm-utils-test.cc b/src/hmm/hmm-utils-test.cc
index 66e1cb4f172..a94a99941d6 100644
--- a/src/hmm/hmm-utils-test.cc
+++ b/src/hmm/hmm-utils-test.cc
@@ -202,7 +202,7 @@ void TestAccumulateTreeStatsOptions() {
 
 void TestSplitToPhones() {
   ContextDependency *ctx_dep = NULL;
-  Transitions *trans_model = GenRandTransitionModel(&ctx_dep);
+  Transitions *trans_model = GenRandTransitions(&ctx_dep);
   std::vector<int32> phone_seq;
   int32 num_phones = RandInt(0, 10);
   const std::vector<int32> &phone_list = trans_model->GetPhones();
@@ -214,7 +214,8 @@ void TestSplitToPhones() {
   GenerateRandomAlignment(*ctx_dep, *trans_model,
                           phone_seq, &alignment);
   std::vector<std::vector<int32> > split_alignment;
-  SplitToPhones(*trans_model, alignment, &split_alignment);
+  bool ans = SplitToPhones(*trans_model, alignment, &split_alignment);
+  KALDI_ASSERT(ans);
   KALDI_ASSERT(split_alignment.size() == phone_seq.size());
   for (size_t i = 0; i < split_alignment.size(); i++) {
     KALDI_ASSERT(!split_alignment[i].empty());
@@ -278,7 +279,7 @@ void TestConvertAlignment() {
   std::vector<int32> phone_sequence;
   int32 phone_sequence_length = RandInt(0, 20);
   for (int32 i = 0; i < phone_sequence_length; i++)
-    phone_sequence.push_back(phones[RandInt(0, phones.size() - 1)]);
+    phone_sequence.push_back(phones[RandInt(1, phones.size())]);
   std::vector<int32> old_alignment;
   GenerateRandomAlignment(*ctx_dep_old, trans_model_old,
                           phone_sequence,
diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc
index ffedc43f26d..db5998fe17a 100644
--- a/src/hmm/hmm-utils.cc
+++ b/src/hmm/hmm-utils.cc
@@ -239,7 +239,12 @@ GetHTransducer(const std::vector<std::vector<int32> > &ilabel_info,
     }
   }
 
-  std::unique_ptr<VectorFst<Arc>> ans = MakeLoopFst(fsts);
+  // fsts_bare is as fsts, but with bare pointers.
+  std::vector<const ExpandedFst<Arc> *> fsts_bare(fsts.size());
+  for (size_t i = 0; i < fsts.size(); i++)
+    fsts_bare[i] = fsts[i].get();
+
+  std::unique_ptr<VectorFst<Arc>> ans(MakeLoopFst(fsts_bare));
   return ans;
 }
 
@@ -453,12 +458,12 @@ static bool StateIsStochastic(FST fst, typename FST::StateId s) {
   using Arc = typename FST::Arc;
   using Weight = typename Arc::Weight;
   Weight total_prob = Weight::Zero();
-  for (MutableArcIterator<MutableFst<Arc> > aiter(fst, s);
+  for (MutableArcIterator<MutableFst<Arc> > aiter(&fst, s);
        !aiter.Done();
        aiter.Next()) {
-    total_prob = Plus(total_prob, aiter.Value());
+    total_prob = Plus(total_prob, aiter.Value().weight);
   }
-  return ApproxEqual(total_prob.Value(), Weight::One());
+  return fst::ApproxEqual(total_prob, Weight::One());
 }
 
 void AddSelfLoops(const Transitions &trans_model,
@@ -478,7 +483,7 @@ void AddSelfLoops(const Transitions &trans_model,
   // self-loop to be added to it.  Approximately this means that if a
   // state has multiple different symbols on arcs entering it, it will be
   // duplicated, with one copy per incoming symbol.
-  MakePrecedingInputSymbolsSameClass(fst, f);
+  MakePrecedingInputSymbolsSameClass(true, fst, f);
 
   // use the following to keep track of the transition-state incoming
   // into each state. This works because each state now has only one
@@ -544,7 +549,7 @@ void AddSelfLoops(const Transitions &trans_model,
       fst->AddArc(s, Arc(self_loop_tid, 0, Weight(-self_loop_log_prob), s));
 
     }
-    KALDI_PARANOID_ASSERT(StateIsStochastic(fst, s));
+    KALDI_PARANOID_ASSERT(StateIsStochastic(*fst, s));
   }
 }
 
@@ -565,47 +570,32 @@ static bool SplitToPhonesInternal(const Transitions &trans_model,
   // each phone]..
 
   bool was_ok = true;
-  for (size_t i = 0; i < alignment.size(); i++) {
+  int32 prev_phone = trans_model.InfoForTransitionId(alignment[0]).phone;
+  // i = 0 can't be an end point, it's the start of the sequence,
+  // so we start with 1.
+  for (size_t i = 1; i < alignment.size(); i++) {
     int32 trans_id = alignment[i];
-    if (trans_model.InfoForTransitionId(trans_id).is_final) {
-      while (i+1 < alignment.size() &&
-             trans_model.InfoForTransitionId(alignment[i+1]).is_self_loop) {
-        KALDI_ASSERT(trans_model.InfoForTransitionId(alignment[i]) ==
-                     trans_model.InfoForTransitionId(alignment[i+1]));
-        i++;
-      }
-      end_points.push_back(i+1);
-    } else if (i+1 == alignment.size()) {
-      // need to have an end-point at the actual end.
-      // but this is an error- should have been detected already.
+    const auto &info = trans_model.InfoForTransitionId(trans_id);
+    if (info.is_initial) {
+      end_points.push_back(i);
+    } else if (info.phone != prev_phone) {
+      KALDI_WARN << "Not OK.";
       was_ok = false;
-      end_points.push_back(i+1);
-    } else {
-      int32 this_phone = trans_model.InfoForTransitionId(trans_id).phone;
-      int32 next_trans_id = alignment[i+1];
-      int32 next_phone = trans_model.InfoForTransitionId(next_trans_id).phone;
-
-      if (this_phone != next_phone){
-        // The phone changed, but this is an error-- we should have detected this via the
-        // is_final check.
-        was_ok = false;
-        end_points.push_back(i+1);
-      }
     }
+    prev_phone = info.phone;
+  }
+  end_points.push_back(alignment.size());
+  if (!trans_model.InfoForTransitionId(alignment.back()).is_final) {
+    KALDI_WARN << "Not OK.";
+    was_ok = false;
   }
 
-  size_t cur_point = 0;
+  size_t cur_start = 0;
   for (int32 end_point: end_points) {
     split_output->push_back(std::vector<int32>());
-    // The next if-statement checks if the initial trans-id at the
-    // current end point is the initial-state of the current phone (a
-    // cursory check that the alignment is plausible).
-    int32 topo_state = trans_model.InfoForTransitionId(end_point).topo_state;
-    if (topo_state != 0)
-      was_ok = false;
-    for (size_t j = cur_point; j < end_point; j++)
+    for (size_t j = cur_start; j < end_point; j++)
       split_output->back().push_back(alignment[j]);
-    cur_point = end_point;
+    cur_start = end_point;
   }
   return was_ok;
 }
@@ -667,10 +657,10 @@ static inline void ConvertAlignmentForPhone(
   }
 
   int32 new_num_pdf_classes = new_topo.NumPdfClasses(new_central_phone);
-  std::vector<int32> pdf_ids(new_num_pdf_classes);  // Indexed by pdf-class
+  std::vector<int32> pdf_ids(new_num_pdf_classes + 1);  // Indexed by pdf-class
   for (int32 pdf_class = 1; pdf_class <= new_num_pdf_classes; pdf_class++) {
     if (!new_ctx_dep.Compute(new_phone_window, pdf_class,
-                             &(pdf_ids[pdf_class - 1]))) {
+                             &(pdf_ids[pdf_class]))) {
       std::ostringstream ss;
       WriteIntegerVector(ss, false, new_phone_window);
       KALDI_ERR << "tree did not succeed in converting phone window "
@@ -679,17 +669,20 @@ static inline void ConvertAlignmentForPhone(
   }
 
   // the topologies and lengths match -> we can directly transfer
-  // the alignment.
+  // the alignment (assume the pdf-classes are identical).
   for (int32 j = 0; j < alignment_size; j++) {
     int32 old_tid = old_phone_alignment[j];
     auto&& info = old_trans_model.InfoForTransitionId(old_tid);
-    int32 old_forward_pdf_class = old_trans_model.PdfClassForTid(old_tid);
-    int32 old_self_loop_pdf_class = old_trans_model.PdfClassForTid(info.self_loop_pdf_id);
-    int32 new_forward_pdf_id = pdf_ids[old_forward_pdf_class];
-    int32 new_self_loop_pdf_id = pdf_ids[old_self_loop_pdf_class];
+    int32 old_pdf_class = old_trans_model.PdfClassForTid(old_tid);
+    int32 old_self_loop_pdf_class = (
+        info.self_loop_pdf_id != -1 ?
+        old_trans_model.PdfClassForTid(info.self_loop_pdf_id) : -1);
+    int32 new_pdf_id = pdf_ids[old_pdf_class];
+    int32 new_self_loop_pdf_id = (old_self_loop_pdf_class != -1 ?
+                                  pdf_ids[old_self_loop_pdf_class] : -1);
     int32 new_tid =
       new_trans_model.TupleToTransitionId(new_central_phone, info.topo_state,
-                                          info.arc_index, new_forward_pdf_id,
+                                          info.arc_index, new_pdf_id,
                                           new_self_loop_pdf_id);
     (*new_phone_alignment)[j] = new_tid;
   }
@@ -801,14 +794,15 @@ static bool ComputeNewPhoneLengths(const Topology &topology,
   'conversion_shift' is for.
 */
 
-static bool ConvertAlignmentInternal(const Transitions &old_trans_model,
-                      const Transitions &new_trans_model,
-                      const ContextDependencyInterface &new_ctx_dep,
-                      const std::vector<int32> &old_alignment,
-                      int32 conversion_shift,
-                      int32 subsample_factor,
-                      const std::vector<int32> *phone_map,
-                      std::vector<int32> *new_alignment) {
+static bool ConvertAlignmentInternal(
+    const Transitions &old_trans_model,
+    const Transitions &new_trans_model,
+    const ContextDependencyInterface &new_ctx_dep,
+    const std::vector<int32> &old_alignment,
+    int32 conversion_shift,
+    int32 subsample_factor,
+    const std::vector<int32> *phone_map,
+    std::vector<int32> *new_alignment) {
   KALDI_ASSERT(0 <= conversion_shift && conversion_shift < subsample_factor);
   KALDI_ASSERT(new_alignment != NULL);
   new_alignment->clear();
diff --git a/src/hmm/transitions-test.cc b/src/hmm/transitions-test.cc
index a66c563c76a..8e2fe403f34 100644
--- a/src/hmm/transitions-test.cc
+++ b/src/hmm/transitions-test.cc
@@ -24,7 +24,7 @@ namespace kaldi {
 
 
 void TestTransitions() {
-  Transitions *trans_model = GenRandTransitionModel(NULL);
+  Transitions *trans_model = GenRandTransitions(NULL);
   bool binary = (rand() % 2 == 0);
 
   std::ostringstream os;
diff --git a/src/hmm/transitions.cc b/src/hmm/transitions.cc
index c666e508e08..440912b37dc 100644
--- a/src/hmm/transitions.cc
+++ b/src/hmm/transitions.cc
@@ -63,20 +63,24 @@ void Transitions::ComputeInfo(const ContextDependencyInterface &ctx_dep) {
 
   for (size_t i = 0; i < phones.size(); i++) {
     int32 phone = phones[i];
-    auto const &entry = topo_.TopologyForPhone(phone);  // an FST
+    const fst::StdVectorFst &entry = topo_.TopologyForPhone(phone);
     int num_states = entry.NumStates();
 
     std::vector<StateId> state_to_self_loop_pdf_class(num_states, kNoPdf);
-    for (StateId state = 0; state < num_states; ++state)
+    for (StateId state = 0; state < num_states; ++state) {
       for (fst::ArcIterator<fst::StdVectorFst> aiter(entry, state); !aiter.Done(); aiter.Next()) {
         const fst::StdArc &arc(aiter.Value());
         if (arc.nextstate == state) {
-          KALDI_ASSERT(state_to_self_loop_pdf_class[state] == kNoPdf);  // Only 1 self-loop allowed.
+          if (state_to_self_loop_pdf_class[state] != kNoPdf)
+            KALDI_ERR << "State " << state << " in topology of phone "
+                      << phone << " has more than one self-loop.";
           state_to_self_loop_pdf_class[state] = arc.ilabel;
         }
       }
+    }
 
-    std::map<std::pair<int32, int32>, std::vector<std::pair<int32, int32> > > phone_to_arc_list;
+    std::map<std::pair<int32, int32>, std::vector<std::pair<int32, int32> > > &phone_to_arc_list(
+        to_arc_list[phone]);
     for (StateId state = 0; state < num_states; ++state) {
       for (fst::ArcIterator<fst::StdVectorFst> aiter(entry, state);
            !aiter.Done(); aiter.Next()) {
@@ -90,7 +94,6 @@ void Transitions::ComputeInfo(const ContextDependencyInterface &ctx_dep) {
     }
     for (auto const &pdf_class_to_arc: phone_to_arc_list)
       pdf_class_pairs[phone].push_back(pdf_class_to_arc.first);
-    to_arc_list[phone] = phone_to_arc_list;
   }
   ctx_dep.GetPdfInfo(phones, pdf_class_pairs, &pdf_info);
 
@@ -126,7 +129,7 @@ void Transitions::ComputeInfo(const ContextDependencyInterface &ctx_dep) {
 void Transitions::ComputeDerived() {
   pdf_ids_.resize(info_.size());
   for (int32 tid = 1; tid <= NumTransitionIds(); ++tid) {
-    auto transition = info_[tid];
+    TransitionIdInfo &transition = info_[tid];
     auto const &entry = topo_.TopologyForPhone(transition.phone);  // an FST
     fst::ArcIterator<fst::StdVectorFst> aiter(entry, transition.topo_state);
     aiter.Seek(transition.arc_index);
@@ -153,7 +156,6 @@ void Transitions::ComputeDerived() {
                               arc_index, transition.self_loop_pdf_id,
                               transition.self_loop_pdf_id);
     }
-
     pdf_ids_[tid] = transition.pdf_id;
   }
 }
diff --git a/src/lat/word-align-lattice-lexicon-test.cc b/src/lat/word-align-lattice-lexicon-test.cc
index db70e21d43e..4987a6cd427 100644
--- a/src/lat/word-align-lattice-lexicon-test.cc
+++ b/src/lat/word-align-lattice-lexicon-test.cc
@@ -172,7 +172,7 @@ void GenerateCompactLatticeRandomly(const std::vector<int32> &alignment,
 
 void TestWordAlignLatticeLexicon() {
   ContextDependency *ctx_dep;
-  Transitions *trans_model = GenRandTransitionModel(&ctx_dep);
+  Transitions *trans_model = GenRandTransitions(&ctx_dep);
   bool allow_zero_words = true;
   bool allow_empty_word = true;
   bool allow_multiple_prons = true;
diff --git a/src/tree/build-tree-test.cc b/src/tree/build-tree-test.cc
index f8a1b58fbc7..5c2bac0e73d 100644
--- a/src/tree/build-tree-test.cc
+++ b/src/tree/build-tree-test.cc
@@ -34,21 +34,21 @@ void TestGenRandStats() {
     for (size_t i = 0;i < (size_t)num_phones;i++)
       phone_ids[i] = (i == 0 ? (Rand() % 2) : phone_ids[i-1] + 1 + (Rand()%2));
     int32 max_phone = *std::max_element(phone_ids.begin(), phone_ids.end());
-    std::vector<int32> hmm_lengths(max_phone+1);
+    std::vector<int32> num_pdf_classes(max_phone+1);
     std::vector<bool> is_ctx_dep(max_phone+1);
 
     for (int32 i = 0; i <= max_phone; i++) {
-      hmm_lengths[i] = 1 + Rand() % 3;
+      num_pdf_classes[i] = 1 + Rand() % 3;
       is_ctx_dep[i] = (RandUniform() < ctx_dep_prob);  // true w.p. ctx_dep_prob.
     }
     for (size_t i = 0;i < (size_t) num_phones;i++) {
-      KALDI_VLOG(2) <<  "For idx = "<< i << ", (phone_id, hmm_length, is_ctx_dep) == " << (phone_ids[i]) << " " << (hmm_lengths[phone_ids[i]]) << " " << (is_ctx_dep[phone_ids[i]]);
+      KALDI_VLOG(2) <<  "For idx = "<< i << ", (phone_id, num_pdf_classes, is_ctx_dep) == " << (phone_ids[i]) << " " << (num_pdf_classes[phone_ids[i]]) << " " << (is_ctx_dep[phone_ids[i]]);
     }
     BuildTreeStatsType stats;
     // put false for all_covered argument.
     // if it doesn't really ensure that all are covered with true, this will induce
     // failure in the test of context-fst.
-    GenRandStats(dim, num_stats, N, P, phone_ids, hmm_lengths, is_ctx_dep, false, &stats);
+    GenRandStats(dim, num_stats, N, P, phone_ids, num_pdf_classes, is_ctx_dep, false, &stats);
     std::cout << "Writing random stats.";
     std::cout <<"dim = " << dim << '\n';
     std::cout <<"num_phones = " << num_phones << '\n';
@@ -58,7 +58,7 @@ void TestGenRandStats() {
     std::cout << "is-ctx-dep = ";
     for (size_t i = 0;i < is_ctx_dep.size();i++)
       WriteBasicType(std::cout, false, static_cast<bool>(is_ctx_dep[i]));
-    std::cout << "hmm_lengths = "; WriteIntegerVector(std::cout, false, hmm_lengths);
+    std::cout << "num_pdf_classes = "; WriteIntegerVector(std::cout, false, num_pdf_classes);
     std::cout << "phone_ids = "; WriteIntegerVector(std::cout, false, phone_ids);
     std::cout << "Stats are: \n";
     WriteBuildTreeStats(std::cout, false, stats);
@@ -69,10 +69,10 @@ void TestGenRandStats() {
       EventValueType central_phone;
       bool b = EventMap::Lookup(stats[i].first, P, &central_phone);
       KALDI_ASSERT(b);
-      EventValueType position;
-      b = EventMap::Lookup(stats[i].first, kPdfClass, &position);
+      EventValueType pdf_class;
+      b = EventMap::Lookup(stats[i].first, kPdfClass, &pdf_class);
       KALDI_ASSERT(b);
-      KALDI_ASSERT(position>=0 && position < hmm_lengths[central_phone]);
+      KALDI_ASSERT(pdf_class >= 1 && pdf_class <= num_pdf_classes[central_phone]);
 
       for (EventKeyType j = 0; j < N; j++) {
         if (j != P) {  // non-"central" phone.
@@ -102,20 +102,20 @@ void TestBuildTree() {
     for (size_t i = 0;i < (size_t)num_phones;i++)
       phone_ids[i] = (i == 0 ? (Rand() % 2) : phone_ids[i-1] + 1 + (Rand()%2));
     int32 max_phone = *std::max_element(phone_ids.begin(), phone_ids.end());
-    std::vector<int32> hmm_lengths(max_phone+1);
+    std::vector<int32> num_pdf_classes(max_phone+1);
     std::vector<bool> is_ctx_dep(max_phone+1);
 
     for (int32 i = 0; i <= max_phone; i++) {
-      hmm_lengths[i] = 1 + Rand() % 3;
+      num_pdf_classes[i] = 1 + Rand() % 3;
       is_ctx_dep[i] = (RandUniform() < ctx_dep_prob);  // true w.p. ctx_dep_prob.
     }
     for (size_t i = 0;i < (size_t) num_phones;i++) {
-      KALDI_VLOG(2) <<  "For idx = "<< i << ", (phone_id, hmm_length, is_ctx_dep) == " << (phone_ids[i]) << " " << (hmm_lengths[phone_ids[i]]) << " " << (is_ctx_dep[phone_ids[i]]);
+      KALDI_VLOG(2) <<  "For idx = "<< i << ", (phone_id, num_pdf_classes, is_ctx_dep) == " << (phone_ids[i]) << " " << (num_pdf_classes[phone_ids[i]]) << " " << (is_ctx_dep[phone_ids[i]]);
     }
     // Generate rand stats.  These were tested in TestGenRandStats() above.
     BuildTreeStatsType stats;
     bool ensure_all_covered = false;
-    GenRandStats(dim, num_stats, N, P, phone_ids, hmm_lengths, is_ctx_dep, ensure_all_covered, &stats);
+    GenRandStats(dim, num_stats, N, P, phone_ids, num_pdf_classes, is_ctx_dep, ensure_all_covered, &stats);
 
     {  // print out the stats.
       std::cout << "Writing random stats.";
@@ -127,7 +127,7 @@ void TestBuildTree() {
       std::cout << "is-ctx-dep = ";
       for (size_t i = 0;i < is_ctx_dep.size();i++)
         WriteBasicType(std::cout, false, static_cast<bool>(is_ctx_dep[i]));
-      std::cout << "hmm_lengths = "; WriteIntegerVector(std::cout, false, hmm_lengths);
+      std::cout << "num_pdf_classes = "; WriteIntegerVector(std::cout, false, num_pdf_classes);
       std::cout << "phone_ids = "; WriteIntegerVector(std::cout, false, phone_ids);
       std::cout << "Stats are: \n";
       WriteBuildTreeStats(std::cout, false, stats);
@@ -172,11 +172,11 @@ void TestBuildTree() {
         bool round_num_leaves = true;
 
         EventMap *tree_not_rounded =
-               BuildTree(qopts, phone_sets, hmm_lengths, share_roots,
+               BuildTree(qopts, phone_sets, num_pdf_classes, share_roots,
                          do_split, stats, thresh, max_leaves, 0.0, P,
                          false);
 
-        tree = BuildTree(qopts, phone_sets, hmm_lengths, share_roots,
+        tree = BuildTree(qopts, phone_sets, num_pdf_classes, share_roots,
                          do_split, stats, thresh, max_leaves, 0.0, P,
                          round_num_leaves);
 
@@ -214,7 +214,7 @@ void TestBuildTree() {
 
         KALDI_ASSERT(num_removed < 8);
       } else {
-        tree = BuildTree(qopts, phone_sets, hmm_lengths, share_roots,
+        tree = BuildTree(qopts, phone_sets, num_pdf_classes, share_roots,
                          do_split, stats, thresh, max_leaves, 0.0, P,
                          false);
       }
@@ -235,4 +235,3 @@ int main() {
   kaldi::TestGenRandStats();
   kaldi::TestBuildTree();
 }
-
diff --git a/src/tree/build-tree.cc b/src/tree/build-tree.cc
index 0234a607cba..67fd03e46ca 100644
--- a/src/tree/build-tree.cc
+++ b/src/tree/build-tree.cc
@@ -29,7 +29,7 @@ namespace kaldi {
 
 void GenRandStats(int32 dim, int32 num_stats, int32 N, int32 P,
                   const std::vector<int32> &phone_ids,
-                  const std::vector<int32> &phone2hmm_length,
+                  const std::vector<int32> &phone2num_pdf_classes,
                   const std::vector<bool> &is_ctx_dep,
                   bool ensure_all_phones_covered,
                   BuildTreeStatsType *stats_out) {
@@ -41,7 +41,7 @@ void GenRandStats(int32 dim, int32 num_stats, int32 N, int32 P,
   KALDI_ASSERT(phone_ids.size() != 0);
   KALDI_ASSERT(stats_out != NULL && stats_out->empty());
   int32 max_phone = *std::max_element(phone_ids.begin(), phone_ids.end());
-  KALDI_ASSERT(phone2hmm_length.size() >= static_cast<size_t>(1 + max_phone));
+  KALDI_ASSERT(phone2num_pdf_classes.size() >= static_cast<size_t>(1 + max_phone));
   KALDI_ASSERT(is_ctx_dep.size() >= static_cast<size_t>(1 + max_phone));
 
   // Make sure phone id's distinct.
@@ -68,12 +68,12 @@ void GenRandStats(int32 dim, int32 num_stats, int32 N, int32 P,
     std::vector<int32> phone_vec(N);
     for (size_t i = 0;i < (size_t)N;i++) phone_vec[i] = phone_ids[(Rand() % num_phones)];
 
-    int32 hmm_length = phone2hmm_length[phone_vec[P]];
-    KALDI_ASSERT(hmm_length > 0);
+    int32 num_pdf_classes = phone2num_pdf_classes[phone_vec[P]];
+    KALDI_ASSERT(num_pdf_classes > 0);
     covered[phone_vec[P]] = true;
 
     // For each position [in the central phone]...
-    for (int32 j = 0; j < hmm_length; j++) {
+    for (int32 j = 0; j < num_pdf_classes; j++) {
       // create event vector.
       EventType event_vec;
       // Use j+1 in next line becuase pdf-classes are 1-based.
@@ -93,7 +93,7 @@ void GenRandStats(int32 dim, int32 num_stats, int32 N, int32 P,
         Vector<BaseFloat> weights(N);  // weight of each component.
         for (int32 k = 0; k < N; k++) {
           BaseFloat k_pos = (N - 0.5 - k) / N;  // between 0 and 1, less for lower k...
-          BaseFloat j_pos = (hmm_length - 0.5 - j) / hmm_length;
+          BaseFloat j_pos = (num_pdf_classes - 0.5 - j) / num_pdf_classes;
           // j_pos is between 0 and 1, less for lower j.
 
           BaseFloat weight = j_pos*k_pos + (1.0-j_pos)*(1.0-k_pos);
diff --git a/src/tree/build-tree.h b/src/tree/build-tree.h
index 9196c6bb204..e457d50e622 100644
--- a/src/tree/build-tree.h
+++ b/src/tree/build-tree.h
@@ -72,11 +72,11 @@ namespace kaldi {
  *                  or a negative value (e.g. -1) sets it to the smallest likelihood
  *                  change seen during the splitting algorithm; this typically causes
  *                  about a 20% reduction in the number of leaves.
- 
+
  * @param P [in] The central position of the phone context window, e.g. 1 for a
  *                triphone system.
- * @param round_num_leaves [in]  If true, then the number of leaves in the 
- *                  final tree is made a multiple of 8. This is done by 
+ * @param round_num_leaves [in]  If true, then the number of leaves in the
+ *                  final tree is made a multiple of 8. This is done by
  *                  further clustering the leaves after they are first
  *                  clustered based on log-likelihood change.
  *                  (See cluster_thresh above) (default: true)
@@ -93,7 +93,7 @@ EventMap *BuildTree(Questions &qopts,
                     BaseFloat thresh,
                     int32 max_leaves,
                     BaseFloat cluster_thresh,  // typically == thresh.  If negative, use smallest split.
-                    int32 P, 
+                    int32 P,
                     bool round_num_leaves = true);
 
 
@@ -131,7 +131,7 @@ EventMap *BuildTree(Questions &qopts,
  *                 (generally true for non-silence phones).
  * @param stats [in] The statistics used in tree-building.
  * @param max_leaves_first [in] Maximum number of leaves it will create in first
- *                  level of decision tree. 
+ *                  level of decision tree.
  * @param max_leaves_second [in] Maximum number of leaves it will create in second
  *                  level of decision tree.  Must be > max_leaves_first.
  * @param cluster_leaves [in] Boolean value; if true, we post-cluster the leaves produced
@@ -180,7 +180,8 @@ EventMap *BuildTreeTwoLevel(Questions &qopts,
 /// @param N [in] context-size (typically 3)
 /// @param P [in] central-phone position in zero-based numbering (typically 1)
 /// @param phone_ids [in] integer ids of phones
-/// @param hmm_lengths [in] lengths of hmm for phone, indexed by phone.
+/// @param num_pdf_classes [in] number of pdf-classes for each phone, indexed by phone.
+///                    Note: pdf-classes are 1-based.
 /// @param is_ctx_dep [in] boolean array indexed by phone, saying whether each phone
 ///     is context dependent.
 /// @param ensure_all_phones_covered [in] Boolean argument: if true, GenRandStats
@@ -189,7 +190,7 @@ EventMap *BuildTreeTwoLevel(Questions &qopts,
 
 void GenRandStats(int32 dim, int32 num_stats, int32 N, int32 P,
                   const std::vector<int32> &phone_ids,
-                  const std::vector<int32> &hmm_lengths,
+                  const std::vector<int32> &num_pdf_classes,
                   const std::vector<bool> &is_ctx_dep,
                   bool ensure_all_phones_covered,
                   BuildTreeStatsType *stats_out);
diff --git a/src/tree/context-dep.h b/src/tree/context-dep.h
index 743dc964198..1bb1ba08734 100644
--- a/src/tree/context-dep.h
+++ b/src/tree/context-dep.h
@@ -100,9 +100,7 @@ class ContextDependency: public ContextDependencyInterface {
   /// GetPdfInfo returns a vector indexed by pdf-id, saying for each pdf which
   /// pairs of (phone, pdf-class) it can correspond to.  (Usually just one).
   /// c.f. hmm/topology.h for meaning of pdf-class.
-  /// This is the old, simpler interface of GetPdfInfo(), and that this one can
-  /// only be called if the Topology object's IsHmm() function call returns
-  /// true.
+  /// This is the old, simpler interface of GetPdfInfo().
   virtual void GetPdfInfo(
       const std::vector<int32> &phones,  // list of phones
       const std::vector<int32> &num_pdf_classes,  // indexed by phone,
@@ -138,11 +136,11 @@ class ContextDependency: public ContextDependencyInterface {
   EventMap *to_pdf_;  // owned here.
 
   // 'context' is the context-window of phones, of
-  // length N, with -1 for those positions where phones 
-  // that are currently unknown, treated as wildcards; at least 
-  // the central phone [position P] must be a real phone, i.e. 
-  // not -1. 
-  // This function inserts any allowed pairs (forward_pdf, self_loop_pdf) 
+  // length N, with -1 for those positions where phones
+  // that are currently unknown, treated as wildcards; at least
+  // the central phone [position P] must be a real phone, i.e.
+  // not -1.
+  // This function inserts any allowed pairs (forward_pdf, self_loop_pdf)
   // to the set "pairs".
   void EnumeratePairs(
       const std::vector<int32> &phones,

From 038ea0618bb89ec256d27e44fab175ea03d6ac94 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sat, 22 Jun 2019 13:06:34 -0400
Subject: [PATCH 142/163] [src] Bug-fixes/rewrites to fix test failures in
 hmm-utils-test

---
 src/hmm/hmm-test-utils.cc |  71 ++++++++-----------
 src/hmm/hmm-utils-test.cc |   2 +-
 src/hmm/hmm-utils.cc      | 141 +++++++++++++++++++-------------------
 src/hmm/hmm-utils.h       |  42 +++++-------
 src/hmm/topology.cc       |  64 ++++++++++++++++-
 src/hmm/topology.h        |  55 ++++++++++++++-
 6 files changed, 229 insertions(+), 146 deletions(-)

diff --git a/src/hmm/hmm-test-utils.cc b/src/hmm/hmm-test-utils.cc
index 808296b3db7..6eae1a119b2 100644
--- a/src/hmm/hmm-test-utils.cc
+++ b/src/hmm/hmm-test-utils.cc
@@ -80,14 +80,14 @@ Topology GetDefaultTopology(const std::vector<int32> &phones_in) {
 
 
 Topology GenRandTopology(const std::vector<int32> &phones_in,
-                            const std::vector<int32> &num_pdf_classes) {
+                         const std::vector<int32> &num_pdf_classes) {
   std::vector<int32> phones(phones_in);
   std::sort(phones.begin(), phones.end());
   KALDI_ASSERT(IsSortedAndUniq(phones) && !phones.empty());
 
   std::ostringstream topo_string;
 
-   std::map<int32, std::vector<int32> > num_pdf_classes_to_phones;
+  std::map<int32, std::vector<int32> > num_pdf_classes_to_phones;
   for (size_t i = 0; i < phones.size(); i++) {
     int32 p = phones[i];
     KALDI_ASSERT(static_cast<size_t>(p) < num_pdf_classes.size());
@@ -108,47 +108,30 @@ Topology GenRandTopology(const std::vector<int32> &phones_in,
     for (size_t i = 0; i < phones.size(); i++)
       topo_string << phones[i] << " ";
     topo_string << "</ForPhones>\n";
-    bool ergodic = (RandInt(0, 1) == 0);
-    if (ergodic) {
-      // Note, this type of topology is not something we ever use in practice- it
-      // has an initial nonemitting state (no PdfClass specified).  But it's
-      // supported so we're testing it.
-      std::vector<int32> state_to_pdf_class;
-      state_to_pdf_class.push_back(-1);  // state zero, nonemitting.
-      for (int32 i = 1; i <= this_num_pdf_classes; i++) {
-        int32 num_states = RandInt(1, 2);
-        for (int32 j = 0; j < num_states; j++)
-          state_to_pdf_class.push_back(i);
-      }
-      state_to_pdf_class.push_back(-1);  // final non-emitting state.
-      { // state zero is nonemitting.  This is not something used in any current
-        // example script.
-        BaseFloat prob = 1.0 / (state_to_pdf_class.size() - 2);
-        for (size_t i = 1; i + 1 < state_to_pdf_class.size(); i++)
-          topo_string << "0 " << i << ' ' << state_to_pdf_class[i]
-                      << ' ' << -Log(prob) << '\n';
-      }
-      // ergodic part.
-      for (size_t i = 1; i + 1 < state_to_pdf_class.size(); i++) {
-        BaseFloat prob = 1.0 / (state_to_pdf_class.size() - 1);
-        for (size_t j = 1; j < state_to_pdf_class.size(); j++)
-          topo_string << i << ' ' << j << ' '
-                      << state_to_pdf_class[i] << ' ' << -Log(prob) << '\n';
-      }
-      // final, nonemitting state.  No pdf-class, no transitions.
-      topo_string << (state_to_pdf_class.size() - 1) << "\n\n";
-    } else {
-      // feedforward topology.
-      int32 cur_state = 0;
-      for (int32 pdf_class = 1; pdf_class <= this_num_pdf_classes; pdf_class++) {
-        int32 this_num_states = RandInt(1, 2);
-        for (int32 s = 0; s < this_num_states; s++) {
-          topo_string << cur_state << " " << (cur_state + 1) << " " << pdf_class << "\n";
-          cur_state++;
-        }
-      }
-      // final, non-emitting state.
-      topo_string << cur_state << "\n\n";
+
+    switch (this_num_pdf_classes)  {
+      case 1:
+        topo_string << "0   1   1   0.0\n"
+                       "1   1   1   0.693\n"
+                      "1  0.693\n\n";
+        break;
+      case 2:
+        topo_string << "0   1   1   0.0\n"
+                       "1   1   1   0.693\n"
+                       "1   2   2  0.693\n"
+                       "2   2   2  0.693\n"
+                       "2  0.693\n\n";
+        break;
+      case 3:
+        topo_string << "0   1   1   0.0\n"
+                       "1   1   1   0.693\n"
+                       "1   2   2  0.693\n"
+                       "2   3   3  0.0\n"  // mix it up a bit.
+                       "3   3   3  0.693\n"
+                       "3  0.693\n\n";
+        break;
+      default:
+        KALDI_ERR << "Un-handled num-pdf-classes\n";
     }
     topo_string << "</TopologyEntry>\n";
   }
@@ -171,7 +154,7 @@ Topology GenRandTopology() {
   } else {
     std::vector<int32> num_pdf_classes(phones.back() + 1, -1);
     for (int32 i = 0; i < phones.size(); i++)
-      num_pdf_classes[phones[i]] = RandInt(1, 5);
+      num_pdf_classes[phones[i]] = RandInt(1, 3);
     return GenRandTopology(phones, num_pdf_classes);
   }
 }
diff --git a/src/hmm/hmm-utils-test.cc b/src/hmm/hmm-utils-test.cc
index a94a99941d6..5d7f4fcc2c3 100644
--- a/src/hmm/hmm-utils-test.cc
+++ b/src/hmm/hmm-utils-test.cc
@@ -279,7 +279,7 @@ void TestConvertAlignment() {
   std::vector<int32> phone_sequence;
   int32 phone_sequence_length = RandInt(0, 20);
   for (int32 i = 0; i < phone_sequence_length; i++)
-    phone_sequence.push_back(phones[RandInt(1, phones.size())]);
+    phone_sequence.push_back(phones[RandInt(0, phones.size() - 1)]);
   std::vector<int32> old_alignment;
   GenerateRandomAlignment(*ctx_dep_old, trans_model_old,
                           phone_sequence,
diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc
index db5998fe17a..fe56e57981d 100644
--- a/src/hmm/hmm-utils.cc
+++ b/src/hmm/hmm-utils.cc
@@ -29,11 +29,11 @@
 
 namespace kaldi {
 
-std::shared_ptr<fst::ExpandedFst<fst::StdArc>> GetHmmAsFsa(
-    std::vector<int32> phone_window,
+std::shared_ptr<fst::StdVectorFst> GetHmmAsFsa(
+    const std::vector<int32> &phone_window,
     const ContextDependencyInterface &ctx_dep,
     const Transitions &trans_model,
-    const HTransducerConfig &config,
+    bool include_self_loops,
     HmmCacheType *cache) {
   if (static_cast<int32>(phone_window.size()) != ctx_dep.ContextWidth())
     KALDI_ERR << "Context size mismatch, ilabel-info [from context FST is "
@@ -48,13 +48,12 @@ std::shared_ptr<fst::ExpandedFst<fst::StdArc>> GetHmmAsFsa(
 
   const Topology &topo = trans_model.GetTopo();
 
-  // vector of the pdfs, indexed by pdf-class (pdf-classes must start from zero
-  // and be contiguous).
-  std::vector<int32> pdfs(topo.NumPdfClasses(phone));
+  // vector of the pdf-ids, indexed by pdf-class minus one.
+  std::vector<int32> pdf_ids(topo.NumPdfClasses(phone));
   for (int32 pdf_class = 1;
-       pdf_class <= static_cast<int32>(pdfs.size());
+       pdf_class <= static_cast<int32>(pdf_ids.size());
        pdf_class++) {
-    if (! ctx_dep.Compute(phone_window, pdf_class, &(pdfs[pdf_class - 1])) ) {
+    if (! ctx_dep.Compute(phone_window, pdf_class, &(pdf_ids[pdf_class - 1])) ) {
       std::ostringstream ctx_ss;
       for (size_t i = 0; i < phone_window.size(); i++)
         ctx_ss << phone_window[i] << ' ';
@@ -68,7 +67,7 @@ std::shared_ptr<fst::ExpandedFst<fst::StdArc>> GetHmmAsFsa(
     }
   }
 
-  std::pair<int32, std::vector<int32> > cache_index(phone, pdfs);
+  std::pair<int32, std::vector<int32> > cache_index(phone, pdf_ids);
   if (cache != NULL) {
     HmmCacheType::iterator iter = cache->find(cache_index);
     if (iter != cache->end())
@@ -76,64 +75,59 @@ std::shared_ptr<fst::ExpandedFst<fst::StdArc>> GetHmmAsFsa(
   }
 
   using Arc = fst::StdArc;
-  using MyEditFst = fst::EditFst<Arc>;
   using StateId = Arc::StateId;
+  using Weight = Arc::Weight;
 
   const fst::StdVectorFst &entry = topo.TopologyForPhone(phone);
-  std::shared_ptr<MyEditFst> loopless_entry = std::make_shared<MyEditFst>(entry);
-
-  for (fst::StateIterator<MyEditFst> siter(*loopless_entry);
-       !siter.Done(); siter.Next()) {
-    StateId state = siter.Value();
-    std::vector<Arc> non_self_loops;
-    BaseFloat non_self_loop_prob = 1.0;
-    for (fst::ArcIterator<MyEditFst> aiter(*loopless_entry, state);
+  // the elements of correction_factors are factors only in the semiring;
+  // physically they are costs to be added.
+  std::vector<float> correction_factors;
+  if (include_self_loops)
+    correction_factors.resize(entry.NumStates(), 0);
+  else
+    correction_factors = topo.CorrectionFactorsForPhone(phone);
+  const std::vector<int32> &self_loop_pdf_classes =
+      topo.SelfLoopPdfClassesForPhone(phone);
+  std::shared_ptr<fst::StdVectorFst> ans(
+      new fst::StdVectorFst());
+  StateId num_states = entry.NumStates();
+  for (StateId s = 0; s < num_states; s++)
+    ans->AddState();
+  KALDI_PARANOID_ASSERT(entry.Start() == 0);  // required by topology class.
+  ans->SetStart(0);
+
+  for (StateId s = 0; s < num_states; s++) {
+    Weight correction_weight(correction_factors[s]);
+    ans->SetFinal(s, Times(correction_weight, entry.Final(s)));
+
+    for (fst::ArcIterator<fst::StdVectorFst> aiter(entry, s);
          !aiter.Done(); aiter.Next()) {
-      const Arc& arc = aiter.Value();
-      if (arc.nextstate != state) {
-        non_self_loops.push_back(arc);
-      } else {
-        non_self_loop_prob -= exp(-arc.weight.Value());
-      }
-    }
-    KALDI_ASSERT(non_self_loop_prob >= BaseFloat(0));
-    if (non_self_loops.size() != loopless_entry->NumArcs(state)) {
-      loopless_entry->DeleteArcs(state);
-      for (Arc& arc: non_self_loops) {
-        // Renormalize the remaining arcs to have an outgoing weight
-        // of 1.0, so we maintain stochasticity
-        arc.weight = Arc::Weight(-log(exp(-arc.weight.Value()) / non_self_loop_prob));
-        loopless_entry->AddArc(state, arc);
-      }
+      if (!include_self_loops && aiter.Value().nextstate == s)
+        continue;
+      Arc arc = aiter.Value();
+
+      // self_loop_pdf_class is the pdf-class of the self-loop of the destination
+      // state of this arc, if any, else -1.
+      int32 self_loop_pdf_class = self_loop_pdf_classes[arc.nextstate];
+      // self_loop_pdf_id is the pdf-id of the self-loop in the destination
+      // state of this arc, if any, else -1.
+      int32 self_loop_pdf_id = (self_loop_pdf_class != -1 ?
+                                pdf_ids[self_loop_pdf_class - 1] : -1);
+      int32 pdf_class = arc.ilabel,
+          pdf_id = pdf_ids[pdf_class - 1],
+          trans_id = trans_model.TupleToTransitionId(
+              phone, s, aiter.Position(), pdf_id, self_loop_pdf_id);
+
+      arc.ilabel = trans_id;
+      arc.olabel = trans_id;
+      arc.weight = Times(correction_weight, arc.weight);
+      ans->AddArc(s, arc);
     }
   }
 
-  ApplyProbabilityScale(config.transition_scale, loopless_entry.get());
   if (cache != NULL)
-    (*cache)[cache_index] = loopless_entry;
-  return loopless_entry;
-}
-
-
-
-const fst::VectorFst<fst::StdArc>&
-GetHmmAsFsaSimple(std::vector<int32> phone_window,
-                  const ContextDependencyInterface &ctx_dep,
-                  const Transitions &trans_model,
-                  BaseFloat prob_scale) {
-  using namespace fst;
-
-  if (static_cast<int32>(phone_window.size()) != ctx_dep.ContextWidth())
-    KALDI_ERR <<"Context size mismatch, ilabel-info [from context FST is "
-              <<(phone_window.size())<<", context-dependency object "
-        "expects "<<(ctx_dep.ContextWidth());
-
-  int P = ctx_dep.CentralPosition();
-  int32 phone = phone_window[P];
-  KALDI_ASSERT(phone != 0);
-
-  const Topology &topo = trans_model.GetTopo();
-  return topo.TopologyForPhone(phone);
+    (*cache)[cache_index] = ans;
+  return ans;
 }
 
 
@@ -232,7 +226,7 @@ GetHTransducer(const std::vector<std::vector<int32> > &ilabel_info,
       std::shared_ptr<ExpandedFst<Arc>> fst = GetHmmAsFsa(phone_window,
                                                           ctx_dep,
                                                           trans_model,
-                                                          config,
+                                                          config.include_self_loops,
                                                           &cache);
       std::unique_ptr<ExpandedFst<Arc>> u_fst(fst->Copy());
       fsts[j] = std::move(u_fst);
@@ -240,7 +234,7 @@ GetHTransducer(const std::vector<std::vector<int32> > &ilabel_info,
   }
 
   // fsts_bare is as fsts, but with bare pointers.
-  std::vector<const ExpandedFst<Arc> *> fsts_bare(fsts.size());
+  std::vector<const fst::ExpandedFst<Arc> *> fsts_bare(fsts.size());
   for (size_t i = 0; i < fsts.size(); i++)
     fsts_bare[i] = fsts[i].get();
 
@@ -332,7 +326,7 @@ GetPdfToTransitionIdTransducer(const Transitions &trans_model) {
   ans->SetFinal(0, Weight::One());
   for (int32 tid = 1; tid <= trans_model.NumTransitionIds(); tid++) {
     int32 pdf = trans_model.TransitionIdToPdfFast(tid);
-    ans->AddArc(0, Arc(pdf+1, tid, Weight::One(), 0));  // note the offset of 1 on the pdfs.
+    ans->AddArc(0, Arc(pdf+1, tid, Weight::One(), 0));  // note the offset of 1 on the pdf_ids.
     // it's because 0 is a valid pdf.
   }
   return ans;
@@ -676,7 +670,7 @@ static inline void ConvertAlignmentForPhone(
     int32 old_pdf_class = old_trans_model.PdfClassForTid(old_tid);
     int32 old_self_loop_pdf_class = (
         info.self_loop_pdf_id != -1 ?
-        old_trans_model.PdfClassForTid(info.self_loop_pdf_id) : -1);
+        old_trans_model.PdfClassForTid(info.self_loop_transition_id) : -1);
     int32 new_pdf_id = pdf_ids[old_pdf_class];
     int32 new_self_loop_pdf_id = (old_self_loop_pdf_class != -1 ?
                                   pdf_ids[old_self_loop_pdf_class] : -1);
@@ -1053,17 +1047,20 @@ void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep,
                                 std::vector<int32> *alignment) {
   typedef fst::StdArc Arc;
   int32 length = alignment->size();
-  BaseFloat prob_scale = 0.0;
-  fst::VectorFst<Arc> fst = GetHmmAsFsaSimple(phone_window, ctx_dep,
-                                              trans_model, prob_scale);
-  fst::RmEpsilon(&fst);
+  bool include_self_loops = true;
+  std::shared_ptr<fst::StdVectorFst> fst =
+      GetHmmAsFsa(phone_window, ctx_dep,
+                  trans_model,
+                  include_self_loops);
+
+  fst::RmEpsilon(fst.get());
 
   fst::VectorFst<Arc> length_constraint_fst;
   {  // set up length_constraint_fst.
     std::vector<int32> symbols;
     bool include_epsilon = false;
     // note: 'fst' is an acceptor so ilabels == olabels.
-    GetInputSymbols(fst, include_epsilon, &symbols);
+    GetInputSymbols(*fst, include_epsilon, &symbols);
     int32 cur_state = length_constraint_fst.AddState();
     length_constraint_fst.SetStart(cur_state);
     for (int32 i = 0; i < length; i++) {
@@ -1079,7 +1076,7 @@ void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep,
     length_constraint_fst.SetFinal(cur_state, fst::TropicalWeight::One());
   }
   fst::VectorFst<Arc> composed_fst;
-  fst::Compose(fst, length_constraint_fst, &composed_fst);
+  fst::Compose(*fst, length_constraint_fst, &composed_fst);
   fst::VectorFst<Arc> single_path_fst;
   {  // randomly generate a single path.
     fst::UniformArcSelector<Arc> selector;
@@ -1096,6 +1093,12 @@ void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep,
   bool ans = fst::GetLinearSymbolSequence<Arc, int32>(
       single_path_fst, &symbol_sequence, NULL, NULL);
   KALDI_ASSERT(ans && symbol_sequence.size() == length);
+  KALDI_PARANOID_ASSERT(
+      trans_model.InfoForTransitionId(symbol_sequence.front()).is_initial &&
+      trans_model.InfoForTransitionId(symbol_sequence.back()).is_final);
+  if (symbol_sequence.size() > 1) {
+    KALDI_ASSERT(!trans_model.InfoForTransitionId(symbol_sequence.back()).is_initial);
+  } // TODO: remove the above.
   symbol_sequence.swap(*alignment);
 }
 
diff --git a/src/hmm/hmm-utils.h b/src/hmm/hmm-utils.h
index a0a23cae5b6..256d17ab609 100644
--- a/src/hmm/hmm-utils.h
+++ b/src/hmm/hmm-utils.h
@@ -37,19 +37,16 @@ namespace kaldi {
 /// Configuration class for the GetHTransducer() function; see
 /// \ref hmm_graph_config for context.
 struct HTransducerConfig {
-  /// Transition log-prob scale, see \ref hmm_scale.
-  /// Note this doesn't apply to self-loops; GetHTransducer() does
-  /// not include self-loops.
-  BaseFloat transition_scale;
   int32 nonterm_phones_offset;
+  // We don't currently make `include_self_loops` configurable from the command
+  // line; it's included in order to make it obvious how to add the self loops.
+  bool include_self_loops;
 
   HTransducerConfig():
-      transition_scale(1.0),
-      nonterm_phones_offset(-1) { }
+      nonterm_phones_offset(-1),
+      include_self_loops(false) { }
 
   void Register (OptionsItf *opts) {
-    opts->Register("transition-scale", &transition_scale,
-                   "Scale of transition probs (relative to LM)");
     opts->Register("nonterm-phones-offset", &nonterm_phones_offset,
                    "The integer id of #nonterm_bos in phones.txt, if present. "
                    "Only needs to be set if you are doing grammar decoding, "
@@ -69,7 +66,7 @@ struct HmmCacheHash {
 /// HmmCacheType is a map from (central-phone, sequence of pdf-ids) to FST, used
 /// as cache in GetHmmAsFsa, as an optimization.
 typedef unordered_map<std::pair<int32, std::vector<int32> >,
-                      std::shared_ptr<fst::ExpandedFst<fst::StdArc>>,
+                      std::shared_ptr<fst::StdVectorFst>,
                       HmmCacheHash> HmmCacheType;
 
 
@@ -83,35 +80,28 @@ typedef unordered_map<std::pair<int32, std::vector<int32> >,
 ///
 /// as the symbols.
 /// For documentation in context, see \ref hmm_graph_get_hmm_as_fst
-///   @param context_window  A vector representing the phonetic context; see
+///   @param [in] context_window  A vector representing the phonetic context; see
 ///            \ref tree_window "here" for explanation.
-///   @param ctx_dep The object that contains the phonetic decision-tree
-///   @param trans_model The transition-model object, which provides
+///   @param [in] ctx_dep The object that contains the phonetic decision-tree
+///   @param [in] trans_model The transition-model object, which provides
 ///         the mappings to transition-ids and also the transition
 ///         probabilities.
-///   @param config Configuration object, see \ref HTransducerConfig.
+///   @param [in] include_self_loops  If true, self-loop arcs will be
+///          included in the result; if false, they will be omitted and
+///          the probabilities appropriately renormalized; you can
+///          add them later using AddSelfLoops().
 ///   @param cache Object used as a lookaside buffer to save computation;
 ///       if it finds that the object it needs is already there, it will
 ///       just return a pointer value from "cache"-- note that this means
 ///       you have to be careful not to delete things twice.
-std::shared_ptr<fst::ExpandedFst<fst::StdArc>> GetHmmAsFsa(
-    std::vector<int32> context_window,
+std::shared_ptr<fst::StdVectorFst> GetHmmAsFsa(
+    const std::vector<int32> &context_window,
     const ContextDependencyInterface &ctx_dep,
     const Transitions &trans_model,
-    const HTransducerConfig &config,
+    bool include_self_loops = false,
     HmmCacheType *cache = NULL);
 
 
-/// Included mainly as a form of documentation, not used in any other code
-/// currently.  Creates the acceptor FST with self-loops, and with fewer
-/// options.
-const fst::StdVectorFst&
-GetHmmAsFsaSimple(std::vector<int32> context_window,
-                  const ContextDependencyInterface &ctx_dep,
-                  const Transitions &trans_model,
-                  BaseFloat prob_scale);
-
-
 /**
   * Returns the H tranducer; result owned by caller.  Caution: our version of
   * the H transducer does not include self-loops; you have to add those later.
diff --git a/src/hmm/topology.cc b/src/hmm/topology.cc
index 973a2cde542..fc2fa87cefc 100644
--- a/src/hmm/topology.cc
+++ b/src/hmm/topology.cc
@@ -97,6 +97,7 @@ void Topology::Read(std::istream &is, bool binary) {
     }
     ExpectToken(is, binary, "</Topology>");
   }
+  ComputeDerived();
   Check();
 }
 
@@ -212,17 +213,33 @@ void Topology::Check() {
     if (fst.NumStates() != num_states || NumArcs(fst) != num_arcs)
       KALDI_ERR << "Topology changed after calling Connect().";
   }
+  KALDI_ASSERT(self_loop_correction_factors_.size() == entries_.size() &&
+               self_loop_pdf_classes_.size() == entries_.size());
 }
 
-// Will throw if phone not covered.
 const fst::StdVectorFst& Topology::TopologyForPhone(int32 phone) const {
+  if (static_cast<size_t>(phone) >= phone2idx_.size()
+      || phone2idx_[phone] == -1)
+    KALDI_ERR << "TopologyForPhone(), phone " << phone << " not covered.";
+  return entries_[phone2idx_[phone]];
+}
+
+const std::vector<float>& Topology::CorrectionFactorsForPhone(int32 phone) const {
+  if (static_cast<size_t>(phone) >= phone2idx_.size()
+      || phone2idx_[phone] == -1)
+    KALDI_ERR << "TopologyForPhone(), phone " << phone << " not covered.";
+  return self_loop_correction_factors_[phone2idx_[phone]];
+}
+
+const std::vector<int32>& Topology::SelfLoopPdfClassesForPhone(int32 phone) const {
   if (static_cast<size_t>(phone) >= phone2idx_.size()
       || phone2idx_[phone] == -1) {
-    KALDI_ERR << "TopologyForPhone(), phone "<< phone <<" not covered.";
+    KALDI_ERR << "TopologyForPhone(), phone " << phone << " not covered.";
   }
-  return entries_[phone2idx_[phone]];
+  return self_loop_pdf_classes_[phone2idx_[phone]];
 }
 
+
 int32 Topology::NumPdfClasses(int32 phone) const {
   // will throw if phone not covered.
   const fst::StdVectorFst &entry = TopologyForPhone(phone);
@@ -305,4 +322,45 @@ bool Topology::operator==(const Topology &other) const {
   }
 }
 
+
+void Topology::ComputeDerived() {
+  using Arc = fst::StdArc;
+  using StateId = Arc::StateId;
+  using Weight = Arc::Weight;
+
+  self_loop_correction_factors_.resize(entries_.size());
+  self_loop_pdf_classes_.resize(entries_.size());
+  for (size_t i = 0; i < entries_.size(); i++) {
+    const fst::StdVectorFst &entry = entries_[i];
+    std::vector<float> &correction_factors(
+        self_loop_correction_factors_[i]);
+    std::vector<int32> &pdf_classes(
+        self_loop_pdf_classes_[i]);
+    StateId num_states = entry.NumStates();
+    correction_factors.resize(num_states);
+    pdf_classes.resize(num_states, -1);
+    for (StateId s = 0; s < num_states; s++) {
+      float tot_prob = exp(-entry.Final(s).Value()),
+          self_loop_prob = 0.0;
+      for (fst::ArcIterator<fst::StdVectorFst> aiter(entry, s);
+           !aiter.Done(); aiter.Next()) {
+        const Arc& arc = aiter.Value();
+        float this_prob = exp(-arc.weight.Value());
+        tot_prob += this_prob;
+        if (arc.nextstate == s) {
+          self_loop_prob += this_prob;
+          KALDI_ASSERT(pdf_classes[s] == -1 &&
+                       "State in topology has more than one self-loop");
+          pdf_classes[s] = arc.ilabel;
+        }
+      }
+      KALDI_ASSERT(tot_prob > 0 && "Invalid topology");
+      // correction_factor is initialized with a number <= 0 that will be added
+      // to costs.  It will result in properly normalized probs after removing
+      // the self-loop, assuming the topo was properly normalized before.
+      correction_factors[s] = log((tot_prob - self_loop_prob) / tot_prob);
+    }
+  }
+}
+
 } // End namespace kaldi
diff --git a/src/hmm/topology.h b/src/hmm/topology.h
index 680329e3c24..55ec4dcf35c 100644
--- a/src/hmm/topology.h
+++ b/src/hmm/topology.h
@@ -93,6 +93,27 @@ class Topology {
   /// will throw exception if phone not covered by the topology.
   const fst::StdVectorFst &TopologyForPhone(int32 phone) const;
 
+  /// Returns a reference to a vector of floats of size
+  /// `TopologyForPhone(phone).NumStates()`; this contains numbers <= 0 which are to be
+  /// added to the final-costs and non-self-loop arc costs when creating graphs
+  /// without self-loops (we call it a correction factor because in the
+  /// semiring it's multiplied, although physically it is added); this
+  /// correction factor will ensure that the probability sum of the
+  /// non-self-loop arcs and final-prob of each state has the same value that it
+  /// did before removing the self-loop.  It's used to make sure that
+  /// intermediate FSTs made during graph compilation are as stochastic as
+  /// possible.
+  /// The user could compute this themselves, but we provide it
+  /// directly for speed.
+  const std::vector<float> &CorrectionFactorsForPhone(int32 phone) const;
+
+  /// For each phone, this will return a vector of size
+  /// `TopologyForPhone(phone).NumStates()` containing, for each state
+  /// in this phone's topology entry, the pdf-class of the self-loop on
+  /// that state (if any), and otherwise, -1.  This could be computed
+  /// by the user from the FST, but is provided for convenience.
+  const std::vector<int32> &SelfLoopPdfClassesForPhone(int32 phone) const;
+
   /// Returns the number of \ref pdf_class "pdf-classes" for this phone;
   /// throws exception if phone not covered by this topology.
   int32 NumPdfClasses(int32 phone) const;
@@ -118,13 +139,41 @@ class Topology {
 
   // Allow default assignment operator and copy constructor.
  private:
+
+  void ComputeDerived();
+
   using Arc     = typename fst::StdVectorFst::Arc;
   using StateId = typename fst::StdVectorFst::StateId;
   using Weight  = typename fst::StdVectorFst::Weight;
 
-  std::vector<int32> phones_;  // list of all phones we have topology for.  Sorted, uniq.  no epsilon (zero) phone.
-  std::vector<int32> phone2idx_;  // map from phones to indexes into the entries vector (or -1 for not present).
-  std::vector<fst::StdVectorFst> entries_;
+  std::vector<int32> phones_;  // list of all phones we have topology for.
+                               // Sorted, uniq.  no epsilon (zero) phone.
+  std::vector<int32> phone2idx_;  // map from phones to indexes into the entries
+                                  // vector (or -1 for not present).
+  std::vector<fst::StdVectorFst> entries_;  // list of topology entries, indexed
+                                            // by the elements of phone2idx_.
+
+  // Below this point are 'derived quantities' (things not written to disk,
+  // that can be worked out from the information above).
+
+  // This is a vector indexed by 'idx' (the same as the index into entries_) and
+  // then by state-id in the corresponding topology entry; it contains the
+  // correction factor that we add to the costs of arcs leaving that state (and
+  // its final-cost) if we remove the self-loop; it's a number <= 0.  This will
+  // make the probability sum of this state have the same value it did before
+  // removing the self-loop, hopefully 1.0.  (viewing the costs as negated
+  // log-probs, of course).  Doing this will make the no-self-loop FST
+  // stochastic if it was stochastic with the self-loops.
+  std::vector<std::vector<float> > self_loop_correction_factors_;
+
+  // This is a vector indexed by 'idx' (the same as the index into entries_) and
+  // then by state-id in the corresponding topology entry; it contains the
+  // pdf-class of the self-loop of each state that had a self-loop, or -1
+  // for the states that didn't have self-loops.  Note: the pdf-class is
+  // a number >0 which is the label on the arc in the topology entries (ilabel
+  // or olabel; they are the same because the topology entries are
+  // acceptors).
+  std::vector<std::vector<int32> > self_loop_pdf_classes_;
 };
 
 

From 42942f56fb87169537c465858aee7f47bb2f2a14 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sun, 23 Jun 2019 11:27:23 -0400
Subject: [PATCH 143/163] [src] Various changes to make test pass

---
 src/bin/add-self-loops.cc                  |   4 +-
 src/bin/compile-graph.cc                   |   4 +-
 src/chain/chain-den-graph.cc               |  17 +-
 src/chain/chain-supervision.cc             |  38 ++--
 src/decoder/training-graph-compiler.cc     |  25 +--
 src/decoder/training-graph-compiler.h      |  16 +-
 src/fstext/fstext-utils-inl.h              |  20 +--
 src/fstext/fstext-utils.h                  |   5 +-
 src/hmm/hmm-utils.cc                       | 199 ++++++++++-----------
 src/hmm/hmm-utils.h                        |  30 ++--
 src/hmm/transitions.h                      |   1 +
 src/lat/word-align-lattice-lexicon-test.cc |   5 +-
 src/nnet3/nnet-compute.cc                  |   8 +-
 13 files changed, 171 insertions(+), 201 deletions(-)

diff --git a/src/bin/add-self-loops.cc b/src/bin/add-self-loops.cc
index 601d8d587f3..ebaf219aff1 100644
--- a/src/bin/add-self-loops.cc
+++ b/src/bin/add-self-loops.cc
@@ -92,13 +92,13 @@ int main(int argc, char *argv[]) {
       KALDI_ERR << "add-self-loops: error reading input FST.";
 
     BaseFloat self_loop_scale = 1.0;
-    bool check_no_self_loops = true;
+    bool currently_self_loop_free = true;
 
     // The work gets done here.
     AddSelfLoops(trans_model,
                  disambig_syms_in,
                  self_loop_scale,
-                 check_no_self_loops, fst);
+                 currently_self_loop_free, fst);
 
     if (! fst->Write(fst_out_filename) )
       KALDI_ERR << "add-self-loops: error writing FST to "
diff --git a/src/bin/compile-graph.cc b/src/bin/compile-graph.cc
index c9600462427..9125c67ffb3 100644
--- a/src/bin/compile-graph.cc
+++ b/src/bin/compile-graph.cc
@@ -169,11 +169,11 @@ int main(int argc, char *argv[]) {
     MinimizeEncoded(&hclg_fst);
 
     std::vector<int32> disambig;
-    bool check_no_self_loops = true;
+    bool currently_self_loop_free = true;
     AddSelfLoops(trans_model,
                  disambig,
                  self_loop_scale,
-                 check_no_self_loops,
+                 currently_self_loop_free,
                  &hclg_fst);
 
     if (nonterm_phones_offset >= 0)
diff --git a/src/chain/chain-den-graph.cc b/src/chain/chain-den-graph.cc
index e8db0bbe5a5..36e82c1baf6 100644
--- a/src/chain/chain-den-graph.cc
+++ b/src/chain/chain-den-graph.cc
@@ -336,12 +336,9 @@ void CreateDenominatorFst(const ContextDependency &ctx_dep,
             << context_dep_lm.NumStates() << " and " << NumArcs(context_dep_lm);
 
   std::vector<int32> disambig_syms_h; // disambiguation symbols on input side
-  // of H -- will be empty.
-  HTransducerConfig h_config;
-  // the default is 1, but just document that we want this to stay as one.
-  // we'll use the same value in test time.  Consistency is the key here.
-  h_config.transition_scale = 1.0;
+                                      // of H -- will be empty.
 
+  HTransducerConfig h_config;
   std::unique_ptr<StdVectorFst> h_fst = GetHTransducer(inv_cfst.IlabelInfo(),
                                                        ctx_dep,
                                                        trans_model,
@@ -351,14 +348,14 @@ void CreateDenominatorFst(const ContextDependency &ctx_dep,
   StdVectorFst transition_id_fst;
   TableCompose(*h_fst, context_dep_lm, &transition_id_fst);
 
-  BaseFloat self_loop_scale = 1.0;  // We have to be careful to use the same
-                                    // value in test time.
   // 'reorder' must always be set to true for chain models.
-  bool check_no_self_loops = true;
+  bool currently_self_loop_free = true,
+      use_weights = true;
 
   // add self-loops to the FST with transition-ids as its labels.
-  AddSelfLoops(trans_model, disambig_syms_h, self_loop_scale,
-               check_no_self_loops, &transition_id_fst);
+  AddSelfLoops(trans_model, disambig_syms_h,
+               currently_self_loop_free, use_weights,
+               &transition_id_fst);
   // at this point transition_id_fst will have transition-ids as its ilabels and
   // context-dependent phones (indexes into IlabelInfo()) as its olabels.
   // Discard the context-dependent phones by projecting on the input, keeping
diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc
index 717fb1f27a8..a99592aa403 100644
--- a/src/chain/chain-supervision.cc
+++ b/src/chain/chain-supervision.cc
@@ -21,6 +21,7 @@
 #include "lat/lattice-functions.h"
 #include "util/text-utils.h"
 #include "hmm/hmm-utils.h"
+#include "fstext/fstext-utils.h"
 #include <numeric>
 
 namespace kaldi {
@@ -332,11 +333,7 @@ bool ProtoSupervisionToSupervision(
                                       // disambiguation symbols on the output.
 
   HTransducerConfig h_cfg;
-
-  // We don't want to add any transition probabilities as they will be added
-  // when we compose with the denominator graph.
-  h_cfg.transition_scale = 0.0;
-
+  h_cfg.include_self_loops = true;
   std::unique_ptr<VectorFst<StdArc>> h_fst = GetHTransducer(inv_cfst.IlabelInfo(),
                                                             ctx_dep,
                                                             trans_model,
@@ -344,23 +341,19 @@ bool ProtoSupervisionToSupervision(
                                                             &disambig_syms_h);
   KALDI_ASSERT(disambig_syms_h.empty());
 
+  // We don't want to include any transition probabilities as they will be added
+  // when we compose with the normalization FST.
+  fst::RemoveWeights(h_fst.get());
+
   VectorFst<StdArc> transition_id_fst;
   TableCompose(*h_fst, context_dep_fst, &transition_id_fst);
 
-  // We don't want to add any transition probabilities as they will be added
-  // when we compose with the denominator graph.
-  BaseFloat self_loop_scale = 0.0;
-
-  bool check_no_self_loops = true;
-  // add self-loops to the FST with transition-ids as its labels.
-  AddSelfLoops(trans_model, disambig_syms_h, self_loop_scale,
-               check_no_self_loops, &transition_id_fst);
-
   // at this point transition_id_fst will have transition-ids as its ilabels and
   // context-dependent phones (indexes into ILabelInfo()) as its olabels.
   // Discard the context-dependent phones by projecting on the input, keeping
   // only the transition-ids.
   fst::Project(&transition_id_fst, fst::PROJECT_INPUT);
+
   if (transition_id_fst.Properties(fst::kIEpsilons, true) != 0) {
     // remove epsilons, if there are any.
     fst::RmEpsilon(&transition_id_fst);
@@ -1058,17 +1051,18 @@ bool ConvertSupervisionToUnconstrained(
 
     // There are be no disambiguation symbols here.
     std::vector<int32> disambig_syms;
-    // We're not adding transition probabilities; we rely on compsition with the
+    // We're not adding transition probabilities; we rely on composition with the
     // normalization FST for that.  (note: all transition probabilities are just
     // 0.5 anyway, for the typical chain topology).
-    BaseFloat self_loop_scale = 0.0;
-    // The FST we're about to call AddSelfLoops() on will have self-loops, on
-    // the first frame, so disable the check that the FST was originally
-    // self-loop-free.
-    bool check_no_self_loops = false;
+    //
+    // The FST we're about to call AddSelfLoops() on will already have one
+    // self-loop, on the first frame, so tell that to AddSelfLoops().
+    bool currently_self_loop_free = false,
+        use_weights = false;
     supervision->e2e_fsts.resize(1);
-    AddSelfLoops(trans_mdl, disambig_syms, self_loop_scale,
-                 check_no_self_loops, &(supervision->e2e_fsts[0]));
+    AddSelfLoops(trans_mdl, disambig_syms,
+                 currently_self_loop_free, use_weights,
+                 &(supervision->e2e_fsts[0]));
   }
 
   { // Convert transition-ids to pdf-ids+1 on the FST labels,
diff --git a/src/decoder/training-graph-compiler.cc b/src/decoder/training-graph-compiler.cc
index 865552047a4..a59e83dee43 100644
--- a/src/decoder/training-graph-compiler.cc
+++ b/src/decoder/training-graph-compiler.cc
@@ -98,16 +98,18 @@ bool TrainingGraphCompiler::CompileGraph(const fst::VectorFst<fst::StdArc> &word
   KALDI_ASSERT(ctx2word_fst.Start() != kNoStateId);
 
   HTransducerConfig h_cfg;
-  h_cfg.transition_scale = opts_.transition_scale;
 
   std::vector<int32> disambig_syms_h; // disambiguation symbols on
-  // input side of H.
+                                      // input side of H.
+
   std::unique_ptr<VectorFst<StdArc>> H = GetHTransducer(inv_cfst.IlabelInfo(),
                                                         ctx_dep_,
                                                         trans_model_,
                                                         h_cfg,
                                                         &disambig_syms_h);
 
+  RemoveWeights(H.get());
+
   VectorFst<StdArc> &trans2word_fst = *out_fst;  // transition-id to word.
   TableCompose(*H, ctx2word_fst, &trans2word_fst);
 
@@ -129,11 +131,13 @@ bool TrainingGraphCompiler::CompileGraph(const fst::VectorFst<fst::StdArc> &word
   MinimizeEncoded(&trans2word_fst);
 
   std::vector<int32> disambig;
-  bool check_no_self_loops = true;
+  bool currently_self_loop_free = true,
+      use_weights = false;
+
   AddSelfLoops(trans_model_,
                disambig,
-               opts_.self_loop_scale,
-               check_no_self_loops,
+               currently_self_loop_free,
+               use_weights,
                &trans2word_fst);
 
   return true;
@@ -193,7 +197,6 @@ bool TrainingGraphCompiler::CompileGraphs(
   }
 
   HTransducerConfig h_cfg;
-  h_cfg.transition_scale = opts_.transition_scale;
 
   std::vector<int32> disambig_syms_h;
   std::unique_ptr<VectorFst<StdArc>> H = GetHTransducer(inv_cfst.IlabelInfo(),
@@ -214,23 +217,21 @@ bool TrainingGraphCompiler::CompileGraphs(
       if (opts_.rm_eps)
         RemoveEpsLocal(&trans2word_fst);
     }
-
-    // Encoded minimization.
     MinimizeEncoded(&trans2word_fst);
 
     std::vector<int32> disambig;
-    bool check_no_self_loops = true;
+    bool currently_self_loop_free = true,
+        use_weights = true;
     AddSelfLoops(trans_model_,
                  disambig,
-                 opts_.self_loop_scale,
-                 check_no_self_loops,
+                 currently_self_loop_free,
+                 use_weights,
                  &trans2word_fst);
 
     KALDI_ASSERT(trans2word_fst.Start() != kNoStateId);
 
     *((*out_fsts)[i]) = trans2word_fst;
   }
-
   return true;
 }
 
diff --git a/src/decoder/training-graph-compiler.h b/src/decoder/training-graph-compiler.h
index 89ef72020ca..989accb2a05 100644
--- a/src/decoder/training-graph-compiler.h
+++ b/src/decoder/training-graph-compiler.h
@@ -31,23 +31,13 @@ namespace kaldi {
 
 struct TrainingGraphCompilerOptions {
 
-  BaseFloat transition_scale;
-  BaseFloat self_loop_scale;
   bool rm_eps;
 
-  explicit TrainingGraphCompilerOptions(BaseFloat transition_scale = 1.0,
-                                        BaseFloat self_loop_scale = 1.0) :
-      transition_scale(transition_scale),
-      self_loop_scale(self_loop_scale),
-      rm_eps(false) { }
+  explicit TrainingGraphCompilerOptions(): rm_eps(false) { }
 
   void Register(OptionsItf *opts) {
-    opts->Register("transition-scale", &transition_scale, "Scale of transition "
-                   "probabilities (excluding self-loops)");
-    opts->Register("self-loop-scale", &self_loop_scale, "Scale of self-loop vs. "
-                   "non-self-loop probability mass ");
-    opts->Register("rm-eps", &rm_eps,  "Remove [most] epsilons before minimization (only applicable "
-                   "if disambig symbols present)");
+    opts->Register("rm-eps", &rm_eps,  "Remove [most] epsilons before minimization (only "
+                   "matters if disambig symbols present)");
   }
 };
 
diff --git a/src/fstext/fstext-utils-inl.h b/src/fstext/fstext-utils-inl.h
index bbcf68ec2ec..d072118ee1e 100644
--- a/src/fstext/fstext-utils-inl.h
+++ b/src/fstext/fstext-utils-inl.h
@@ -532,18 +532,17 @@ void MakePrecedingInputSymbolsSame(bool start_is_epsilon, MutableFst<Arc> *fst)
 
 template<class Arc, class F>
 void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst<Arc> *fst, const F &f) {
-  typedef typename F::Result ClassType;
   typedef typename Arc::StateId StateId;
   typedef typename Arc::Weight Weight;
-  vector<ClassType> classes;
-  ClassType noClass = f(kNoLabel);
-  ClassType epsClass = f(0);
+  vector<int32> classes;
+  int32 no_class = f(kNoLabel),
+      eps_class = f(0);
   if (start_is_epsilon) {  // treat having-start-state as epsilon in-transition.
     StateId start_state = fst->Start();
     if (start_state < 0 || start_state == kNoStateId) // empty FST.
       return;
-    classes.resize(start_state+1, noClass);
-    classes[start_state] = epsClass;
+    classes.resize(start_state+1, no_class);
+    classes[start_state] = eps_class;
   }
 
   // Find bad states (states with multiple input-symbols into them).
@@ -553,8 +552,8 @@ void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst<Arc> *
     for (ArcIterator<Fst<Arc> > aiter(*fst, s); !aiter.Done(); aiter.Next()) {
       const Arc &arc = aiter.Value();
       if (classes.size() <= static_cast<size_t>(arc.nextstate))
-        classes.resize(arc.nextstate+1, noClass);
-      if (classes[arc.nextstate] == noClass)
+        classes.resize(arc.nextstate+1, no_class);
+      if (classes[arc.nextstate] == no_class)
         classes[arc.nextstate] = f(arc.ilabel);
       else
         if (classes[arc.nextstate] != f(arc.ilabel))
@@ -562,6 +561,7 @@ void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst<Arc> *
     }
   }
   if (bad_states.empty()) return;  // Nothing to do.
+
   kaldi::ConstIntegerSet<StateId> bad_states_ciset(bad_states);  // faster lookup.
 
   // Work out list of arcs we have to change as (state, arc-offset).
@@ -579,7 +579,7 @@ void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst<Arc> *
   }
   KALDI_ASSERT(!arcs_to_change.empty());  // since !bad_states.empty().
 
-  std::map<pair<StateId, ClassType>, StateId> state_map;
+  std::map<std::pair<StateId, int32>, StateId> state_map;
   // state_map is a map from (bad-state, input-symbol-class) to dummy-state.
 
   for (size_t i = 0; i < arcs_to_change.size(); i++) {
@@ -590,7 +590,7 @@ void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst<Arc> *
 
     // Transition is non-eps transition to "bad" state.  Introduce new state (or find
     // existing one).
-    pair<StateId, ClassType> p(arc.nextstate, f(arc.ilabel));
+    pair<StateId, int32> p(arc.nextstate, f(arc.ilabel));
     if (state_map.count(p) == 0) {
       StateId newstate = state_map[p] = fst->AddState();
       fst->AddArc(newstate, Arc(0, 0, Weight::One(), arc.nextstate));
diff --git a/src/fstext/fstext-utils.h b/src/fstext/fstext-utils.h
index 0be6725a44f..25c4a53c633 100644
--- a/src/fstext/fstext-utils.h
+++ b/src/fstext/fstext-utils.h
@@ -230,7 +230,7 @@ bool PrecedingInputSymbolsAreSame(bool start_is_epsilon, const Fst<Arc> &fst);
 /// F::Result F::operator() (F::Arg a) const;
 /// where F::Result is an integer type and F::Arc can be constructed from Arc::Label.
 /// this must apply to valid labels and also to kNoLabel (so we can have a marker for
-/// the invalid labels.
+/// the invalid labels).
 template<class Arc, class F>
 bool PrecedingInputSymbolsAreSameClass(bool start_is_epsilon, const Fst<Arc> &fst, const F &f);
 
@@ -258,7 +258,8 @@ template<class Arc>
 void MakePrecedingInputSymbolsSame(bool start_is_epsilon, MutableFst<Arc> *fst);
 
 
-/// As MakePrecedingInputSymbolsSame, but takes a functor object that maps labels to classes.
+/// As MakePrecedingInputSymbolsSame, but takes a functor object that maps
+/// labels to (int32) classes.
 template<class Arc, class F>
 void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst<Arc> *fst, const F &f);
 
diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc
index fe56e57981d..7e858cbed06 100644
--- a/src/hmm/hmm-utils.cc
+++ b/src/hmm/hmm-utils.cc
@@ -369,80 +369,51 @@ struct TransitionState {
   const Transitions::TransitionIdInfo& info;
 };
 
-class TidToTstateMapper {
+class TidToSelfLoopMapper {
 public:
-  // Function object used in MakePrecedingInputSymbolsSameClass and
-  // MakeFollowingInputSymbolsSameClass (as called by AddSelfLoopsReorder and
-  // AddSelfLoopsNoReorder).  It maps transition-ids to transition-states (and
-  // -1 to -1, 0 to 0 and disambiguation symbols to 0).  If check_no_self_loops
-  // == true, it also checks that there are no self-loops in the graph (i.e. in
-  // the labels it is called with).  This is just a convenient place to put this
-  // check.
-
-  // This maps valid transition-ids to transition states, maps kNoLabel to -1, and
-  // maps all other symbols (i.e. epsilon symbols, disambig symbols, and symbols
-  // with values over 100000/kNontermBigNumber) to zero.
-  // Its point is to provide an equivalence class on labels that's relevant to what
-  // the self-loop will be on the following (or preceding) state.
-
-  // TransitionState no longer exists. It's basically a
-  // TransitionIdInfo without the arc_index field.
-
-  TidToTstateMapper(const Transitions &trans_model,
+  // Function object used in MakePrecedingInputSymbolsSameClass.
+  // It maps a transition-id t to the transition-id on the self-loop
+  // of the destination-state of t (or -1 if there is no self-loop).
+  //
+  // If currently_self_loop_free == true, it also checks that there are no
+  // self-loops in the graph (i.e. in the labels it is called with).  This is
+  // just a convenient place to put this check.
+
+  // This maps valid transition-ids to self-loop transition-ids, maps kNoLabel
+  // to -1, and maps all other symbols (i.e. epsilon symbols, disambig symbols,
+  // and symbols with values over 100000/kNontermBigNumber) to zero.  Its point
+  // is to provide an equivalence class on labels that's relevant to what the
+  // self-loop will be on the following state.
+  TidToSelfLoopMapper(const Transitions &trans_model,
                     const std::vector<int32> &disambig_syms,
-                    bool check_no_self_loops):
+                    bool currently_self_loop_free):
       trans_model_(trans_model),
       disambig_syms_(disambig_syms),
-      check_no_self_loops_(check_no_self_loops) {
-    KALDI_ASSERT((*this)(fst::kNoLabel) == NoLabelClass());
-    KALDI_ASSERT((*this)(0) == ZeroClass());
-}
-
-  typedef TransitionState Result;
-  static const Result& NoLabelClass() {
-    // Take advantage of the fact that phone must be greater than or
-    // equal to 1 to create a TransitionIdInfo which in practice will
-    // never be created normally.
-
-    // Use -1 for all other fields so we can easily see when debugging
-    // whether we are using one of these invalid TransitionIdInfo
-    // classes.
-    static auto *no_label =
-      new Transitions::TransitionIdInfo{.phone = -1, .topo_state = -1,
-                                        .arc_index = -1, .pdf_id = -1,
-                                        .self_loop_pdf_id = -1};
-    static auto *no_label_state = new TransitionState(*no_label);
-    return *no_label_state;
-  }
+      currently_self_loop_free_(currently_self_loop_free) { }
 
-  static const Result& ZeroClass() {
-    static auto *zero_label =
-      new Transitions::TransitionIdInfo{.phone = 0, .topo_state = -1, .arc_index = -1,
-                                        .pdf_id = -1, .self_loop_pdf_id = -1};
-    static auto *zero_label_state = new TransitionState(*zero_label);
-    return *zero_label_state;
-  }
-
-  Result operator() (int32 tid) const {
-    if (tid == static_cast<int32>(fst::kNoLabel)) return NoLabelClass();  // -1 -> -1
-    else if (tid >= 1 && tid <= trans_model_.NumTransitionIds()) {
-      if (check_no_self_loops_ && trans_model_.InfoForTransitionId(tid).is_self_loop)
+  int32 operator() (int32 tid) const {
+    if (tid > 0 && tid <= trans_model_.NumTransitionIds()) {
+      if (currently_self_loop_free_ && trans_model_.InfoForTransitionId(tid).is_self_loop)
         KALDI_ERR << "AddSelfLoops: graph already has self-loops.";
-      return TransitionState(trans_model_.InfoForTransitionId(tid));
+      return trans_model_.InfoForTransitionId(tid).self_loop_transition_id;
+    } else if (tid == fst::kNoLabel) {
+      return -1;  // actually kNoLabel is -1.
     } else {  // 0 or (presumably) disambiguation symbol.  Map to zero
       int32 big_number = fst::kNontermBigNumber;  // 1000000
-      if (tid != 0 && tid < big_number)
+      if (tid != 0 && tid < big_number) {
         KALDI_ASSERT(std::binary_search(disambig_syms_.begin(),
                                         disambig_syms_.end(),
-                                        tid));  // or invalid tid
-      return ZeroClass();
+                                        tid) &&
+                    "It looks like you have an invalid symbol in your graph: ");
+      }
+      return 0;
     }
   }
 
 private:
   const Transitions &trans_model_;
   const std::vector<int32> &disambig_syms_;  // sorted.
-  bool check_no_self_loops_;
+  bool currently_self_loop_free_;
 };
 
 // Returns true if the outgoing arcs of the state s sum to 1.0
@@ -462,8 +433,8 @@ static bool StateIsStochastic(FST fst, typename FST::StateId s) {
 
 void AddSelfLoops(const Transitions &trans_model,
                   const std::vector<int32> &disambig_syms,
-                  BaseFloat self_loop_scale,
-                  bool check_no_self_loops,
+                  bool currently_self_loop_free,
+                  bool use_weights,
                   fst::VectorFst<fst::StdArc> *fst) {
   KALDI_ASSERT(fst->Start() != fst::kNoStateId);
   using namespace fst;
@@ -472,78 +443,96 @@ void AddSelfLoops(const Transitions &trans_model,
   typedef Arc::StateId StateId;
   typedef Arc::Weight Weight;
 
-  TidToTstateMapper f(trans_model, disambig_syms, check_no_self_loops);
+  TidToSelfLoopMapper f(trans_model, disambig_syms, currently_self_loop_free);
+
   // Duplicate states as necessary so that each state will require at most one
   // self-loop to be added to it.  Approximately this means that if a
   // state has multiple different symbols on arcs entering it, it will be
   // duplicated, with one copy per incoming symbol.
   MakePrecedingInputSymbolsSameClass(true, fst, f);
 
-  // use the following to keep track of the transition-state incoming
-  // into each state. This works because each state now has only one
-  // transition state coming into (because of
-  // MakePrecedingInputSymbolsSameClass).
-  std::vector<TransitionState> state_in(fst->NumStates(), f.NoLabelClass());
-
   // This first loop just works out the label into each state,
   // and converts the transitions in the graph from transition-states
   // to transition-ids.
   // state_in maps each state in the fst to its TransitionState
 
-  for (StateIterator<VectorFst<Arc> > siter(*fst);
-       !siter.Done();
-       siter.Next()) {
-    StateId s = siter.Value();
+
+  StateId num_states = fst->NumStates();
+  // self_loop_transition_id gives the transition-id of the self-loop
+  // of this state, or zero or -1 if it doesn't require a self-loop.
+  std::vector<int32> self_loop_transition_id(num_states, -2);
+
+  for (StateId s = 0; s < num_states; s++) {
     for (MutableArcIterator<VectorFst<Arc> > aiter(fst, s);
          !aiter.Done();
          aiter.Next()) {
-      const Arc& arc = aiter.Value();
-      TransitionState trans_state = f(arc.ilabel);
-      if (state_in[arc.nextstate] == f.NoLabelClass()) {
-        state_in[arc.nextstate] = trans_state;
+      const Arc &arc = aiter.Value();
+      int32 next_state_self_loop_transition_id = f(arc.ilabel);
+      if (self_loop_transition_id[arc.nextstate] == -2) {
+        // Note: next_state_self_loop_transition_id could be 0 or -1 here.
+        self_loop_transition_id[arc.nextstate] =
+            next_state_self_loop_transition_id;
       } else {
-        KALDI_ASSERT(state_in[arc.nextstate] == trans_state);
+        KALDI_ASSERT(self_loop_transition_id[arc.nextstate] ==
+                     next_state_self_loop_transition_id);
         // or probably an error in MakePrecedingInputSymbolsSame.
       }
     }
   }
 
-  // The start state should have no incoming arcs (invariant of Topology)
-  KALDI_ASSERT(state_in[fst->Start()] == f.ZeroClass());
+  if (!currently_self_loop_free) {
+    for (StateId s = 0; s < num_states; s++) {
+      for (MutableArcIterator<VectorFst<Arc> > aiter(fst, s);
+           !aiter.Done();
+           aiter.Next()) {
+        const Arc &arc = aiter.Value();
+        int32 tid = arc.ilabel;
+        if (tid > 0 && tid <= trans_model.NumTransitionIds() &&
+            trans_model.InfoForTransitionId(tid).is_self_loop)
+          self_loop_transition_id[s] = 0;
+      }
+    }
+  } else {
+    // We shouldn't have added a self-loop to the start state.
+    KALDI_ASSERT(self_loop_transition_id[fst->Start()] == 0);
+  }
 
   // The next loop looks at each graph state, adds the self-loop [if needed] and
-  // multiples all the out-transitions' probs (and final-prob) by the
-  // forward-prob for that state (which is one minus self-loop-prob).  We do it
-  // like this to maintain stochasticity (i.e. rather than multiplying the arcs
-  // with the corresponding labels on them by this probability).
-
-  for (StateId s = 0; s < static_cast<StateId>(state_in.size()); s++) {
-    const TransitionState& trans_state = state_in[s];
-    if (trans_state != f.NoLabelClass() && trans_state != f.ZeroClass() &&
-        trans_state.info.self_loop_pdf_id != -1) {
-      // defined, and not eps or a disambiguation symbol or a
-      // nonterminal-related symbol for grammar decoding, and has a
-      // self-loop which needs to be added, while maintaining
-      int32 self_loop_tid = trans_state.info.self_loop_transition_id;
-      KALDI_ASSERT(self_loop_tid != 0 &&
-                   "Can't have a self_loop_pdf_id without a self_loop_transition_id");
-      // 1) Multiply all probabilities by "forward" probability.
-      BaseFloat self_loop_log_prob =
-        -trans_model.InfoForTransitionId(self_loop_tid).transition_cost;
-      BaseFloat log_forward_prob = log(1.0 - exp(self_loop_log_prob));
-      fst->SetFinal(s, Times(fst->Final(s), Weight(-log_forward_prob)));
+  // multiplies all the out-transitions' probs (and final-prob) by the inverse of
+  // the correction factor that we used when creating the no-self-loops graph.
+  // We do it like this to maintain stochasticity throughout the graph compilation
+  // process.
+
+  if (use_weights) {
+    for (StateId s = 0; s < num_states; s++) {
+      int32 tid = self_loop_transition_id[s];
+      if (tid <= 0)
+        continue;
+      const auto &info(trans_model.InfoForTransitionId(tid));
+
+      BaseFloat self_loop_cost = info.transition_cost,
+          correction_factor = trans_model.GetTopo().CorrectionFactorsForPhone(
+              info.phone)[info.topo_state];
+      Weight correction(-correction_factor),
+          self_loop_weight(self_loop_cost);
+
+      fst->SetFinal(s, Times(fst->Final(s), correction));
       for (MutableArcIterator<MutableFst<Arc> > aiter(fst, s);
-          !aiter.Done();
-          aiter.Next()) {
+           !aiter.Done();
+           aiter.Next()) {
         Arc arc = aiter.Value();
-        arc.weight = Times(arc.weight, Weight(-log_forward_prob));
+        arc.weight = Times(arc.weight, correction);
         aiter.SetValue(arc);
       }
-      // 2) Add self-loop
-      fst->AddArc(s, Arc(self_loop_tid, 0, Weight(-self_loop_log_prob), s));
-
+      // Add self-loop.  ilabel is `tid`, olabel is epsilon (0).
+      fst->AddArc(s, Arc(tid, 0, self_loop_weight, s));
+    }
+  } else {
+    for (StateId s = 0; s < num_states; s++) {
+      int32 tid = self_loop_transition_id[s];
+      if (tid <= 0) continue;  // -2/-1/0: this state needs no new self-loop.
+      fst->AddArc(s, Arc(tid, 0, Weight::One(), s));  // ilabel tid, olabel eps.
     }
-    KALDI_PARANOID_ASSERT(StateIsStochastic(*fst, s));
   }
 }
 
diff --git a/src/hmm/hmm-utils.h b/src/hmm/hmm-utils.h
index 256d17ab609..2ca54a9dd95 100644
--- a/src/hmm/hmm-utils.h
+++ b/src/hmm/hmm-utils.h
@@ -158,28 +158,26 @@ void GetIlabelMapping(const std::vector<std::vector<int32> > &ilabel_info_old,
   * same as disambiguation symbols, assuming they are special symbols for
   * grammar decoding.
   *
-  * @param trans_model [in] Transition model
-  * @param disambig_syms [in] Sorted, uniq list of disambiguation symbols, required
-  *       if the graph contains disambiguation symbols but only needed for sanity checks.
-  * @param reorder [in] If true, reorders the transitions (see \ref hmm_reorder).
-  *                     You'll normally want this to be true.
-  * @param check_no_self_loops [in]  If true, it will check that there are no
-  *                      self-loops in the original graph; you'll normally want
-  *                      this to be true.  If false, it will allow them, and
-  *                      will add self-loops after the original self-loop
-  *                      transitions, assuming reorder==true... this happens to
-  *                      be what we want when converting normal to unconstrained
-  *                      chain examples.  WARNING: this was added in 2018;
-  *                      if you get a compilation error, add this as 'true',
-  *                      which emulates the behavior of older code.
+  * @param [in] trans_model  Transition model
+  * @param [in] disambig_syms Sorted, unique list of disambiguation symbols, required
+  *         if the graph contains disambiguation symbols but only needed for sanity checks.
+  * @param [in] currently_self_loop_free   If true, we require (and check) that
+  *                      the graph was free of self-loops at entry.  If
+  *                      false, it assumes that some states may already have
+  *                      self-loops, and will refrain from adding duplicate
+  *                      self-loops to them.
+  * @param [in] use_weights  If true, weights will be used (which
+  *                      includes a correction term to make things continue to
+  *                      sum to one); otherwise, we add the new self-loop arcs
+  *                      with probability One().
   * @param  fst [in, out] The FST to be modified. This should normally be HCLG
   *                       or any other FST with transition ids as its input
   *                       labels.
   */
 void AddSelfLoops(const Transitions &trans_model,
                   const std::vector<int32> &disambig_syms,  // used as a check only.
-                  BaseFloat self_loop_scale,
-                  bool check_no_self_loops,
+                  bool currently_self_loop_free,
+                  bool use_weights,
                   fst::VectorFst<fst::StdArc> *fst);
 
 
diff --git a/src/hmm/transitions.h b/src/hmm/transitions.h
index 4b3b49ad81e..6bab0e627dc 100644
--- a/src/hmm/transitions.h
+++ b/src/hmm/transitions.h
@@ -106,6 +106,7 @@ class Transitions {
   // This struct is the information associated with one transition-id.
   // You can work out the transition-id from the first 5 fields.
   struct TransitionIdInfo {
+
     int32 phone;      // The phone
     int32 topo_state; // The state in the topology FST for this phone
     int32 arc_index;  // The arc-index leaving this state
diff --git a/src/lat/word-align-lattice-lexicon-test.cc b/src/lat/word-align-lattice-lexicon-test.cc
index 4987a6cd427..d1ed65c2618 100644
--- a/src/lat/word-align-lattice-lexicon-test.cc
+++ b/src/lat/word-align-lattice-lexicon-test.cc
@@ -191,8 +191,7 @@ void TestWordAlignLatticeLexicon() {
   PrintWordsAndPhones(word_seq, phone_seq);
 
   std::vector<int32> alignment;
-  bool reorder = (RandInt(0, 1) == 0);
-  GenerateRandomAlignment(*ctx_dep, *trans_model, reorder,
+  GenerateRandomAlignment(*ctx_dep, *trans_model,
                           phone_seq, &alignment);
 
   CompactLattice clat;
@@ -206,7 +205,6 @@ void TestWordAlignLatticeLexicon() {
   opts.test = true;  // we rely on the self-test code that's activated when we
                      // do this.
   opts.allow_duplicate_paths = true;
-  opts.reorder = reorder;
   CompactLattice aligned_clat;
   bool ans = WordAlignLatticeLexicon(clat, *trans_model, lexicon_info, opts,
                                      &aligned_clat);
@@ -234,4 +232,3 @@ int main() {
     kaldi::TestWordAlignLatticeLexicon();
   std::cout << "Tests succeeded\n";
 }
-
diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc
index 7ee7d7df717..76a162ef7b4 100644
--- a/src/nnet3/nnet-compute.cc
+++ b/src/nnet3/nnet-compute.cc
@@ -446,7 +446,7 @@ void NnetComputer::ExecuteCommand() {
 
 CuSubMatrix<BaseFloat> NnetComputer::GetSubMatrix(int32 submatrix_index) {
   KALDI_PARANOID_ASSERT(static_cast<size_t>(submatrix_index) <
-                        computation_.submatrices.size());
+                       computation_.submatrices.size());
   const NnetComputation::SubMatrixInfo &info =
       computation_.submatrices[submatrix_index];
   const CuMatrix<BaseFloat> &mat = matrices_[info.matrix_index];
@@ -491,8 +491,10 @@ void NnetComputer::GetPointers(int32 indexes_multi_index,
   for (int32 i = 0; i < size; i += 30 + RandInt(0, 9)) {
     // Do a pseudo-random spot check that the row-indexes are not out of range.
     int32 submatrix_index = pairs[i].first, row = pairs[i].second;
-    CuSubMatrix<BaseFloat> m = GetSubMatrix(submatrix_index);
-    KALDI_ASSERT(row >= 0 && row < m.NumRows() && num_cols == m.NumCols());
+    if (submatrix_index != -1) {
+      CuSubMatrix<BaseFloat> m = GetSubMatrix(submatrix_index);
+      KALDI_ASSERT(row >= 0 && row < m.NumRows() && num_cols == m.NumCols());
+    }
   }
 #endif
   pointers->CopyFromVec(vec);

From 9dd4f6375b96e4c173220954d315bbaeb58b6438 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sun, 23 Jun 2019 12:16:54 -0400
Subject: [PATCH 144/163] [src] One last fix to make tests pass

---
 src/hmm/hmm-utils.cc | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc
index 7e858cbed06..76470e09bb6 100644
--- a/src/hmm/hmm-utils.cc
+++ b/src/hmm/hmm-utils.cc
@@ -458,8 +458,8 @@ void AddSelfLoops(const Transitions &trans_model,
 
 
   StateId num_states = fst->NumStates();
-  // self_loop_transition_id gives the transition-id of the self-loop
-  // of this state, or zero or -1 if it doesn't require a self-loop.
+  // self_loop_transition_id gives the transition-id of the self-loop of this
+  // state, or zero or -1 or -2 if it doesn't require a self-loop.
   std::vector<int32> self_loop_transition_id(num_states, -2);
 
   for (StateId s = 0; s < num_states; s++) {
@@ -481,6 +481,8 @@ void AddSelfLoops(const Transitions &trans_model,
   }
 
   if (!currently_self_loop_free) {
+    // there might be some self-loops present already, so make sure we don't
+    // duplicate them.
     for (StateId s = 0; s < num_states; s++) {
       for (MutableArcIterator<VectorFst<Arc> > aiter(fst, s);
            !aiter.Done();
@@ -494,7 +496,7 @@ void AddSelfLoops(const Transitions &trans_model,
     }
   } else {
     // We shouldn't have added a self-loop to the start state.
-    KALDI_ASSERT(self_loop_transition_id[fst->Start()] == 0);
+    KALDI_ASSERT(self_loop_transition_id[fst->Start()] <= 0);
   }
 
   // The next loop looks at each graph state, adds the self-loop [if needed] and

From a9c96f6ededcf9ee7c4de1ab2006b0b69fd62fbe Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Tue, 2 Jul 2019 20:59:49 -0400
Subject: [PATCH 145/163] [src] Changing numbering of pattern preconditions

---
 src/tensor/pattern.h | 42 +++++++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/src/tensor/pattern.h b/src/tensor/pattern.h
index 93e46f2714e..a38ef16b2da 100644
--- a/src/tensor/pattern.h
+++ b/src/tensor/pattern.h
@@ -478,36 +478,40 @@ namespace tensor {
                      see the declaration of struct Pattern for additional details about how
                      it is stored.
 
-                          (i) The num_axes must satisfy 0 <= num_axes < KALDI_TENSOR_MAX_DIM
-                          (ii) The offset must be >= 0.
-                          (iii) the dims must all be >0.
-                          (iv) the strides must be nonzero (but not necessarily positive) for axes with
-                                dim != 1.
-                          (v) the axis-dominance property.   This property is sufficient, but not
-                              necessary, to ensure the uniqueness property.  It requires that
-                              when the axes are sorted from least to greatest value of abs(stride),
-                              for each axis-index 0 <= r < num_axes - 1 (using the private numbering
-                              of axis-indexes),
+                          (i)  The num_axes must satisfy 0 <= num_axes <= KALDI_TENSOR_MAX_DIM
+                          (ii) The dims must all be >0.
+                          (iii) We require dim[i] == 1 and strides[i] == 0 for
+                               num_axes <= i < KALDI_TENSOR_MAX_DIM
+                          (iv) We require that no memory index reachable through the pattern
+                               be negative, which can be expressed as:
+                               offset + \sum_{i=0}^{num_axes - 1} min(0, strides[i]*(dims[i]-1)) >= 0
+                          (v) The strides must be nonzero for axes i with dim[i] != 1.
+                          (vi) the axis-dominance property.   This property is sufficient, but not
+                               necessary, to ensure the uniqueness property.  It requires that
+                               when the axes are sorted from least to greatest value of abs(stride),
+                               for each axis-index 0 <= r < num_axes - 1 (using the private numbering
+                               of axis-indexes),
                                     dim(r) * abs(stride(r)) <= abs(stride(r+1)).
-                              (Note: this property doesn't require that the axes be sorted that
-                              way; if you need that, search for "Canonical form").
-                          (vi) the strides must be zero for axes with dim=1.
+                               (Note: this property doesn't require that the axes be sorted that
+                               way; if you need that, search for "Canonical form").
+                         (vii) the strides must be zero for axes with dim=1.
 
 
      Valid-1 Pattern:
                       A Pattern is valid-1 (read as: valid minus one) if it
-                      satisfies properties (i) through (v) of a valid Pattern
+                      satisfies properties (i) through (vi) of a valid Pattern
                       (i.e. it may have nonzero strides for axes with dim=1, but
                       must otherwise be valid).  A valid pattern is also valid-1.
 
      Valid-2 Pattern:
                       A Pattern is valid-2 (read as valid minus two) if it
-                      satisfies properties (i) through (iv) of a valid Pattern
-                      and satisfies the uniqueness property.  That is, it must
+                      satisfies properties (i) through (v) of a valid Pattern
+                      and also satisfies the uniqueness property.  That is, it must
                       be a valid Pattern, except:
-                      it may have nonzero strides for axes with dim=1, since
-                      we don't require property (v); and it does not have to
-                      satisfy the axis-dominance property (property (vi)).
+                       - it may have nonzero strides for axes with dim=1, since
+                         we don't require property (vii)
+                       - it does not have to satisfy the axis-dominance property
+                         (property (vi)).
                       However, it must still satisfy the uniqueness property
                       (see its glossary entry); we don't normally explicitly
                       require the uniqueness property because it is implied by

From 57a8d0ea90e994821b295769bb9965176ff615a2 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 10 Jul 2019 17:35:28 -0400
Subject: [PATCH 146/163] [scripts,egs] Removing no-longer-existing options
 like --transition-scale

---
 .../s5/local/chain/tuning/run_tdnn_1a.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_2a.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1a.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_2a.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1a.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1b.sh      |   2 +-
 egs/ami/s5/local/chain/run_blstm_ami_5.sh     |   2 +-
 egs/ami/s5/local/chain/run_tdnn_ami_5.sh      |   2 +-
 .../multi_condition/tuning/run_tdnn_1a.sh     |   2 +-
 .../tuning/run_tdnn_lstm_1a.sh                |   2 +-
 .../tuning/run_tdnn_lstm_1b.sh                |   2 +-
 .../chain/tuning/run_cnn_tdnn_lstm_1a.sh      |   2 +-
 .../chain/tuning/run_cnn_tdnn_lstm_1b.sh      |   2 +-
 .../chain/tuning/run_cnn_tdnn_lstm_1c.sh      |   2 +-
 egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh |   2 +-
 egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh |   2 +-
 egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh |   2 +-
 egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh |   2 +-
 egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh |   2 +-
 egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh |   2 +-
 egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh |   2 +-
 egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh |   2 +-
 egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1a.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1b.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1c.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1d.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1e.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1f.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1g.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1h.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1i.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1j.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1k.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1l.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1m.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1n.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1o.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_bs_1a.sh |   2 +-
 .../local/chain/tuning/run_tdnn_opgru_1a.sh   |   2 +-
 .../local/chain/tuning/run_tdnn_opgru_1b.sh   |   2 +-
 .../local/chain/tuning/run_tdnn_opgru_1c.sh   |   2 +-
 .../s5/local/chain/tuning/run_blstm_7b.sh     |   2 +-
 .../s5/local/chain/tuning/run_blstm_asp_1.sh  |   2 +-
 .../s5/local/chain/tuning/run_tdnn_7b.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_asp_1.sh   |   2 +-
 .../s5/local/chain/tuning/run_tdnn_lstm_1a.sh |   2 +-
 egs/babel/s5c/local/ali_to_rttm.sh            |   2 +-
 egs/babel/s5d/local/ali_to_rttm.sh            |   2 +-
 egs/babel/s5d/local/chain/tuning/run_tdnn.sh  |   2 +-
 .../s5d/local/chain/tuning/run_tdnn_lstm.sh   |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_bab1.sh  |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_bab2.sh  |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_bab3.sh  |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_bab4.sh  |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_bab5.sh  |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_bab6.sh  |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_bab7.sh  |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_bab8.sh  |   2 +-
 .../local/chain/tuning/run_cnn_e2eali_1a.sh   |   4 +-
 .../v1/local/chain/tuning/run_e2e_cnn_1a.sh   |   2 +-
 egs/bentham/v1/run_end2end.sh                 |   2 +-
 .../s5_1ch/local/chain/tuning/run_tdnn_1a.sh  |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1a.sh    |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1a.sh      |   2 +-
 egs/chime5/s5/local/run_recog.sh              |   2 +-
 .../chain/tuning/run_cnn_tdnn_lstm_1a.sh      |   2 +-
 .../s5b/local/chain/tuning/run_tdnn_1a.sh     |   2 +-
 .../s5b/local/chain/tuning/run_tdnn_1b.sh     |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1a.sh    |   2 +-
 egs/chime5/s5b/local/run_recog.sh             |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1a.sh      |   2 +-
 egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh  |   2 +-
 .../s5/local/chain/run_tdnn_1g.sh             |   2 +-
 egs/fisher_english/s5/local/chain/run_tdnn.sh |   2 +-
 .../tuning/run_tdnn_100k_semisupervised_1a.sh |   4 +-
 .../local/semisup/chain/tuning/run_tdnn_1a.sh |   2 +-
 .../tuning/run_tdnn_50k_semisupervised_1a.sh  |   4 +-
 .../s5/local/chain/run_blstm_6h.sh            |   2 +-
 .../s5/local/chain/run_blstm_6j.sh            |   2 +-
 egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh |   2 +-
 egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh |   2 +-
 egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh |   2 +-
 .../s5/local/chain/run_tdnn_lstm_1a.sh        |   2 +-
 .../s5/local/chain/run_tdnn_lstm_1a_svd.sh    |   2 +-
 .../s5/local/chain/run_tdnn_lstm_1b.sh        |   2 +-
 .../s5/local/chain/run_tdnn_opgru_1a.sh       |   2 +-
 .../s5/local/chain/run_tdnn_opgru_1b.sh       |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1a.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1b.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1c.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1d.sh      |   2 +-
 .../s5b/local/chain/tuning/run_tdnn_1a.sh     |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1a.sh    |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_1a.sh     |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1a.sh    |   2 +-
 egs/gp/s1/steps/align_deltas.sh               |   2 +-
 egs/gp/s1/steps/train_deltas.sh               |   2 +-
 egs/gp/s1/steps/train_mono.sh                 |   2 +-
 egs/gp/s1/utils/lmrescore.sh                  |   2 +-
 egs/gp/s1/utils/mkgraph.sh                    |  16 +--
 .../s5/local/chain/tuning/run_cnn_tdnn_1a.sh  |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1a.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1b.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_2a.sh      |   2 +-
 .../s5/local/chain/tuning/run_cnn_tdnn_1a.sh  |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1a.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1b.sh      |   2 +-
 egs/iam/v1/local/chain/tuning/run_cnn_1a.sh   |   2 +-
 .../local/chain/tuning/run_cnn_chainali_1a.sh |   4 +-
 .../local/chain/tuning/run_cnn_chainali_1b.sh |   4 +-
 .../local/chain/tuning/run_cnn_chainali_1c.sh |   4 +-
 .../local/chain/tuning/run_cnn_chainali_1d.sh |   4 +-
 .../local/chain/tuning/run_cnn_e2eali_1a.sh   |   4 +-
 .../local/chain/tuning/run_cnn_e2eali_1b.sh   |   4 +-
 .../local/chain/tuning/run_cnn_e2eali_1c.sh   |   4 +-
 .../v1/local/chain/tuning/run_e2e_cnn_1a.sh   |   2 +-
 egs/iam/v1/run_end2end.sh                     |   2 +-
 .../local/chain/tuning/run_cnn_e2eali_1a.sh   |   4 +-
 .../local/chain/tuning/run_cnn_e2eali_1b.sh   |   4 +-
 .../local/chain/tuning/run_cnn_e2eali_1c.sh   |   4 +-
 .../local/chain/tuning/run_cnn_e2eali_1d.sh   |   4 +-
 .../v2/local/chain/tuning/run_e2e_cnn_1a.sh   |   2 +-
 .../v2/local/chain/tuning/run_e2e_cnn_1b.sh   |   2 +-
 egs/iam/v2/run_end2end.sh                     |   2 +-
 egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh |   2 +-
 egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh |   2 +-
 egs/ifnenit/v1/local/chain/run_cnn_1a.sh      |   2 +-
 .../v1/local/chain/run_cnn_chainali_1a.sh     |   4 +-
 .../s5/local/chain/run_tdnn_discriminative.sh |   4 +-
 .../s5/local/chain/tuning/run_cnn_tdnn_1a.sh  |   4 +-
 .../s5/local/chain/tuning/run_tdnn_1a.sh      |   4 +-
 .../s5/local/chain/tuning/run_tdnn_1b.sh      |   4 +-
 .../s5/local/chain/tuning/run_tdnn_1c.sh      |   4 +-
 .../s5/local/chain/tuning/run_tdnn_1d.sh      |   4 +-
 .../s5/local/chain/tuning/run_tdnn_lstm_1a.sh |   4 +-
 .../s5/local/chain/tuning/run_tdnn_lstm_1b.sh |   4 +-
 .../v1/local/chain/tuning/run_cnn_1a.sh       |   2 +-
 .../local/chain/tuning/run_cnn_chainali_1a.sh |   4 +-
 .../local/chain/tuning/run_cnn_e2eali_1a.sh   |   4 +-
 .../local/chain/tuning/run_cnn_e2eali_1b.sh   |   4 +-
 .../v1/local/tl/chain/run_cnn_e2eali.sh       |   4 +-
 .../v1/local/tl/chain/run_e2e_cnn.sh          |   2 +-
 .../v1/local/tl/run_text_localization.sh      |   2 +-
 egs/madcat_ar/v1/run_end2end.sh               |   2 +-
 .../v1/local/chain/tuning/run_cnn_1a.sh       |   2 +-
 .../local/chain/tuning/run_cnn_chainali_1a.sh |   4 +-
 .../local/chain/tuning/run_cnn_chainali_1b.sh |   4 +-
 egs/madcat_zh/v1/run_end2end.sh               |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1a.sh      |   4 +-
 .../s5/local/chain/tuning/run_tdnn_1b.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_lstm_1a.sh |   4 +-
 .../local/semisup/chain/tuning/run_tdnn_1a.sh |   2 +-
 .../tuning/run_tdnn_semisupervised_1a.sh      |   4 +-
 egs/mgb5/s5/local/chain/tuning/run_tdnn_1a.sh |   2 +-
 .../s5/local/chain/tuning/run_cnn_tdnn_1a.sh  |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1a.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1b.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1c.sh      |   2 +-
 .../tuning/run_tdnn_1c_discriminative.sh      |   4 +-
 .../s5/local/chain/tuning/run_tdnn_1d.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1e.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1f.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1g.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1g20.sh    |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1h.sh      |   2 +-
 .../s5/local/grammar/extend_vocab_demo.sh     |   4 +-
 .../grammar/extend_vocab_demo_silprobs.sh     |   4 +-
 .../s5/local/grammar/simple_demo.sh           |   6 +-
 .../s5/local/grammar/simple_demo_silprobs.sh  |   6 +-
 .../s5/local/kws/create_hitlist.sh            |   2 +-
 egs/multi_en/s5/local/chain/run_blstm_6h.sh   |   2 +-
 .../s5/local/chain/tuning/run_tdnn_5b.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_lstm_1a.sh |   2 +-
 .../local/chain/tuning/run_tdnn_opgru_1a.sh   |   2 +-
 .../local/chain/tuning/run_tdnn_opgru_1b.sh   |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1a.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_lstm_1a.sh |   2 +-
 .../local/chain/tuning/run_cnn_e2eali_1a.sh   |   4 +-
 .../v1/local/chain/tuning/run_e2e_cnn_1a.sh   |   2 +-
 egs/rimes/v1/run_end2end.sh                   |   2 +-
 egs/rm/s5/local/chain/tuning/run_tdnn_5g.sh   |   4 +-
 egs/rm/s5/local/chain/tuning/run_tdnn_5n.sh   |   4 +-
 egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh   |   4 +-
 .../local/chain/tuning/run_tdnn_wsj_rm_1a.sh  |   2 +-
 .../local/chain/tuning/run_tdnn_wsj_rm_1b.sh  |   2 +-
 .../local/chain/tuning/run_tdnn_wsj_rm_1c.sh  |   4 +-
 .../s5/local/chain/tuning/run_lstm_1a.sh      |   2 +-
 .../s5/local/chain/tuning/run_lstm_1b.sh      |   2 +-
 .../s5/local/chain/tuning/run_lstm_1c.sh      |   2 +-
 .../s5/local/chain/tuning/run_lstm_1d.sh      |   2 +-
 .../s5/local/chain/tuning/run_lstm_1e.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1a.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1b.sh      |   2 +-
 .../s5/local/chain/tuning/run_tdnn_lstm_1a.sh |   2 +-
 .../chain/multi_condition/run_tdnn_7f.sh      |   2 +-
 .../chain/multi_condition/run_tdnn_7k.sh      |   2 +-
 .../chain/multi_condition/run_tdnn_aug_1a.sh  |   2 +-
 .../s5c/local/chain/tuning/run_blstm_6h.sh    |   2 +-
 .../tuning/run_blstm_6h_discriminative.sh     |   4 +-
 .../s5c/local/chain/tuning/run_blstm_6i.sh    |   2 +-
 .../s5c/local/chain/tuning/run_blstm_6j.sh    |   2 +-
 .../s5c/local/chain/tuning/run_blstm_6k.sh    |   2 +-
 .../s5c/local/chain/tuning/run_blstm_6l.sh    |   2 +-
 .../s5c/local/chain/tuning/run_blstm_6m.sh    |   2 +-
 .../s5c/local/chain/tuning/run_blstm_6n.sh    |   2 +-
 .../s5c/local/chain/tuning/run_blstm_6o.sh    |   2 +-
 .../s5c/local/chain/tuning/run_cnn_tdnn_1a.sh |   2 +-
 .../s5c/local/chain/tuning/run_lstm_6h.sh     |   2 +-
 .../s5c/local/chain/tuning/run_lstm_6i.sh     |   2 +-
 .../s5c/local/chain/tuning/run_lstm_6j.sh     |   2 +-
 .../s5c/local/chain/tuning/run_lstm_6k.sh     |   2 +-
 .../s5c/local/chain/tuning/run_lstm_6l.sh     |   2 +-
 egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2a.sh     |   4 +-
 .../s5c/local/chain/tuning/run_tdnn_2b.sh     |   4 +-
 .../s5c/local/chain/tuning/run_tdnn_2c.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2d.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2e.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2f.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2g.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2h.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2i.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2j.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2k.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2l.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2m.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2n.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2o.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2p.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2q.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2r.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2s.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2t.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2u.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2v.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2w.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2x.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_2y.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3c.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3d.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3e.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3f.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3g.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3h.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3i.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3j.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3k.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3k2.sh    |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3l.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3m.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3n.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3o.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3p.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3q.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3r.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3s.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3t.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3u.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3v.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3w.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3x.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3y.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_3z.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_4a.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_4b.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_4c.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_4d.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_4e.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_4f.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_4g.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_4n.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_4p.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_4q.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_4r.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_4s.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_4t.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_4u.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_4v.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_4w.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_4x.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5a.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5b.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5c.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5d.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5e.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5f.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5g.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5h.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5i.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5j.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5k.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5l.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5m.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5n.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5o.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5p.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5q.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5r.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5s.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5t.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5u.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5v.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5w.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5x.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5y.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_5z.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6a.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6b.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6c.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6d.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6e.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6f.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6g.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6h.sh     |   2 +-
 .../tuning/run_tdnn_6h_discriminative.sh      |   4 +-
 .../s5c/local/chain/tuning/run_tdnn_6h_py.sh  |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6i.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6j.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6k.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6l.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6m.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6n.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6o.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6p.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6q.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6r.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6s.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6t.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6u.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6v.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6w.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6x.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6y.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_6z.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_7a.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_7b.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_7c.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_7d.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_7e.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_7f.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_7g.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_7h.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_7i.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_7j.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_7k.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_7l.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_7m.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_7m25l.sh  |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_7n.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_7o.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_7p.sh     |   2 +-
 .../s5c/local/chain/tuning/run_tdnn_7q.sh     |   2 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_a.sh |   4 +-
 .../s5c/local/chain/tuning/run_tdnn_a2.sh     |   4 +-
 .../chain/tuning/run_tdnn_attention_1a.sh     |   2 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_b.sh |   4 +-
 .../local/chain/tuning/run_tdnn_blstm_1a.sh   |   2 +-
 .../local/chain/tuning/run_tdnn_blstm_1b.sh   |   2 +-
 .../local/chain/tuning/run_tdnn_blstm_1c.sh   |   2 +-
 .../local/chain/tuning/run_tdnn_blstm_1d.sh   |   2 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_c.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_d.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_e.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_f.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_g.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_h.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_i.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_j.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_k.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_l.sh |   4 +-
 .../local/chain/tuning/run_tdnn_lstm_1a.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1b.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1c.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1d.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1e.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1f.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1g.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1h.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1i.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1j.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1k.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1l.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1m.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1n.sh    |   2 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_m.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_n.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_o.sh |   4 +-
 .../local/chain/tuning/run_tdnn_opgru_1a.sh   |   2 +-
 .../local/chain/tuning/run_tdnn_opgru_1b.sh   |   2 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_p.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_q.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_r.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_s.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_t.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_u.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_v.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_w.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_x.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_y.sh |   4 +-
 egs/swbd/s5c/local/chain/tuning/run_tdnn_z.sh |   4 +-
 .../s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh  |   2 +-
 .../s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh  |   2 +-
 .../s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh  |   2 +-
 .../local/nnet3/tuning/run_tdnn_lfr1c_disc.sh |   4 +-
 egs/tedlium/s5/local/chain/run_tdnn.sh        |   2 +-
 .../s5_r2/local/chain/tuning/run_blstm_1a.sh  |   2 +-
 .../s5_r2/local/chain/tuning/run_lstm_1a.sh   |   2 +-
 .../s5_r2/local/chain/tuning/run_lstm_1b.sh   |   2 +-
 .../s5_r2/local/chain/tuning/run_lstm_1c.sh   |   2 +-
 .../s5_r2/local/chain/tuning/run_lstm_1d.sh   |   2 +-
 .../s5_r2/local/chain/tuning/run_lstm_1e.sh   |   2 +-
 .../s5_r2/local/chain/tuning/run_tdnn_1a.sh   |   2 +-
 .../s5_r2/local/chain/tuning/run_tdnn_1b.sh   |   2 +-
 .../s5_r2/local/chain/tuning/run_tdnn_1c.sh   |   2 +-
 .../s5_r2/local/chain/tuning/run_tdnn_1d.sh   |   2 +-
 .../s5_r2/local/chain/tuning/run_tdnn_1e.sh   |   2 +-
 .../s5_r2/local/chain/tuning/run_tdnn_1f.sh   |   2 +-
 .../s5_r2/local/chain/tuning/run_tdnn_1g.sh   |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1a.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1b.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1c.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1d.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1e.sh    |   2 +-
 .../chain/tuning/run_tdnn_lstm_1e_disc.sh     |   4 +-
 .../local/chain/tuning/run_tdnn_lstm_1f.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1g.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1h.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1i.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1j.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1k.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1l.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1m.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1n.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1o.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1r.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1s.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1t.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1u.sh    |   2 +-
 .../local/chain/tuning/run_tdnn_lstm_1v.sh    |   2 +-
 .../tuning/run_tdnn_lstm_attention_1a.sh      |   2 +-
 .../tuning/run_tdnn_lstm_attention_bs_1a.sh   |   2 +-
 .../tuning/run_tdnn_lstm_attention_bs_1b.sh   |   2 +-
 .../local/nnet3/tuning/run_tdnn_lfr_1a.sh     |   2 +-
 .../nnet3/tuning/run_tdnn_lstm_lfr_1a.sh      |   2 +-
 .../s5_r3/local/chain/tuning/run_tdnn_1a.sh   |   2 +-
 .../s5_r3/local/chain/tuning/run_tdnn_1b.sh   |   2 +-
 .../s5_r3/local/chain/tuning/run_tdnn_1c.sh   |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1a.sh      |   2 +-
 egs/uw3/v1/local/chain/run_cnn_1a.sh          |   2 +-
 .../s5b/local/chain/tuning/run_tdnn_1a.sh     |   2 +-
 .../s5/local/chain/e2e/run_tdnn_flatstart.sh  |   6 +-
 .../chain/e2e/run_tdnn_lstm_flatstart.sh      |   6 +-
 .../e2e/tuning/run_tdnnf_flatstart_char1a.sh  |   6 +-
 .../e2e/tuning/run_tdnnf_flatstart_char1b.sh  |   6 +-
 .../s5/local/chain/tuning/run_cnn_tdnn_1a.sh  |   8 +-
 .../s5/local/chain/tuning/run_cnn_tdnn_1b.sh  |   8 +-
 .../s5/local/chain/tuning/run_cnn_tdnn_1c.sh  |   8 +-
 egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh  |   8 +-
 egs/wsj/s5/local/chain/tuning/run_tdnn_1b.sh  |   8 +-
 egs/wsj/s5/local/chain/tuning/run_tdnn_1c.sh  |   8 +-
 egs/wsj/s5/local/chain/tuning/run_tdnn_1d.sh  |   8 +-
 egs/wsj/s5/local/chain/tuning/run_tdnn_1e.sh  |   8 +-
 egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh  |   8 +-
 egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh  |   8 +-
 .../s5/local/chain/tuning/run_tdnn_lstm_1a.sh |  10 +-
 .../s5/local/chain/tuning/run_tdnn_lstm_1b.sh |  10 +-
 .../nnet3/tuning/run_tdnn_lstm_lfr_1a.sh      |  10 +-
 egs/wsj/s5/steps/align_basis_fmllr.sh         |   2 +-
 egs/wsj/s5/steps/align_basis_fmllr_lats.sh    |   7 +-
 egs/wsj/s5/steps/align_fmllr.sh               |   2 +-
 egs/wsj/s5/steps/align_fmllr_lats.sh          |   7 +-
 egs/wsj/s5/steps/align_lvtln.sh               |   2 +-
 egs/wsj/s5/steps/align_raw_fmllr.sh           |   2 +-
 egs/wsj/s5/steps/align_sgmm2.sh               |   2 +-
 egs/wsj/s5/steps/align_si.sh                  |   2 +-
 .../cleanup/clean_and_segment_data_nnet3.sh   |   4 -
 egs/wsj/s5/steps/cleanup/find_bad_utts.sh     |   3 +-
 .../s5/steps/cleanup/find_bad_utts_nnet.sh    |   2 +-
 .../s5/steps/cleanup/make_biased_lm_graphs.sh |   4 +-
 .../steps/cleanup/make_segmentation_graph.sh  |   7 +-
 .../s5/steps/cleanup/make_utterance_graph.sh  |  15 +--
 .../cleanup/segment_long_utterances_nnet3.sh  |   2 +-
 egs/wsj/s5/steps/decode_basis_fmllr.sh        |   2 +-
 egs/wsj/s5/steps/decode_fromlats.sh           |   2 +-
 egs/wsj/s5/steps/decode_sgmm2_fromlats.sh     |   2 +-
 egs/wsj/s5/steps/get_fmllr_basis.sh           |   2 +-
 egs/wsj/s5/steps/lmrescore.sh                 |   8 +-
 egs/wsj/s5/steps/make_phone_graph.sh          |  12 +--
 egs/wsj/s5/steps/nnet/align.sh                |   4 +-
 egs/wsj/s5/steps/nnet2/align.sh               |   2 +-
 egs/wsj/s5/steps/nnet3/align.sh               |   6 +-
 egs/wsj/s5/steps/nnet3/align_lats.sh          |   9 +-
 .../s5/steps/nnet3/chain/e2e/prepare_e2e.sh   |   2 +-
 egs/wsj/s5/steps/nnet3/get_degs.sh            |  16 +--
 egs/wsj/s5/steps/nnet3/make_denlats.sh        |   3 +-
 egs/wsj/s5/steps/online/nnet2/align.sh        |   2 +-
 .../internal/prepare_sad_graph.py             |   7 --
 egs/wsj/s5/steps/tandem/align_fmllr.sh        |   2 +-
 egs/wsj/s5/steps/tandem/align_sgmm2.sh        |   2 +-
 egs/wsj/s5/steps/tandem/align_si.sh           |   2 +-
 egs/wsj/s5/steps/tandem/train_deltas.sh       |   2 +-
 egs/wsj/s5/steps/tandem/train_lda_mllt.sh     |   2 +-
 egs/wsj/s5/steps/tandem/train_mllt.sh         |   2 +-
 egs/wsj/s5/steps/tandem/train_mono.sh         |   2 +-
 egs/wsj/s5/steps/tandem/train_sat.sh          |   2 +-
 egs/wsj/s5/steps/tandem/train_sgmm2.sh        |   2 +-
 egs/wsj/s5/steps/train_deltas.sh              |   2 +-
 egs/wsj/s5/steps/train_lda_mllt.sh            |   2 +-
 egs/wsj/s5/steps/train_lvtln.sh               |   2 +-
 egs/wsj/s5/steps/train_mono.sh                |   2 +-
 egs/wsj/s5/steps/train_quick.sh               |   2 +-
 egs/wsj/s5/steps/train_raw_sat.sh             |   2 +-
 egs/wsj/s5/steps/train_sat.sh                 |   2 +-
 egs/wsj/s5/steps/train_sat_basis.sh           |   2 +-
 egs/wsj/s5/steps/train_segmenter.sh           |   2 +-
 egs/wsj/s5/steps/train_sgmm2.sh               |   2 +-
 egs/wsj/s5/steps/train_sgmm2_group.sh         |   2 +-
 egs/wsj/s5/utils/gen_topo.pl                  | 100 +++++++++++-------
 egs/wsj/s5/utils/mkgraph.sh                   |  18 +---
 egs/wsj/s5/utils/validate_lang.pl             |  35 +++---
 .../v1/local/chain/run_cnn_e2eali_1b.sh       |   4 +-
 .../v1/local/chain/run_flatstart_cnn1a.sh     |   2 +-
 egs/yomdle_fa/v1/run.sh                       |   2 +-
 .../local/chain/tuning/run_cnn_e2eali_1a.sh   |   4 +-
 .../local/chain/tuning/run_cnn_e2eali_1b.sh   |   2 +-
 .../run_cnn_chainali_semisupervised_1a.sh     |   4 +-
 .../run_cnn_chainali_semisupervised_1b.sh     |   4 +-
 egs/yomdle_korean/v1/run_end2end.sh           |   6 +-
 .../local/chain/tuning/run_cnn_e2eali_1a.sh   |   2 +-
 egs/yomdle_russian/v1/run_end2end.sh          |   6 +-
 .../v1/local/chain/run_e2e_cnn.sh             |   2 +-
 .../local/chain/tuning/run_cnn_e2eali_1a.sh   |   4 +-
 .../local/chain/tuning/run_cnn_e2eali_1b.sh   |   4 +-
 .../run_cnn_chainali_semisupervised_1a.sh     |   4 +-
 .../run_cnn_chainali_semisupervised_1b.sh     |   4 +-
 egs/yomdle_tamil/v1/run_end2end.sh            |   2 +-
 .../v1/local/chain/run_cnn_e2eali_1b.sh       |   4 +-
 .../v1/local/chain/run_flatstart_cnn1a.sh     |   2 +-
 egs/yomdle_zh/v1/run.sh                       |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1a.sh      |   2 +-
 .../local/chain/tuning/run_tdnn_opgru_1a.sh   |   2 +-
 542 files changed, 779 insertions(+), 828 deletions(-)

diff --git a/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_1a.sh
index 0be0e2c79c6..2aeb836083c 100644
--- a/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -177,7 +177,7 @@ if [ $stage -le 12 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh data/lang_test $dir $dir/graph
 fi
 
 graph_dir=$dir/graph
diff --git a/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_2a.sh
index 78dd4000e58..e3e97e9ae2a 100644
--- a/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_2a.sh
+++ b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_2a.sh
@@ -197,7 +197,7 @@ if [ $stage -le 12 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh data/lang_test $dir $dir/graph
 fi
 
 graph_dir=$dir/graph
diff --git a/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh
index b38fa4d9c7a..e0acea5f168 100755
--- a/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -168,7 +168,7 @@ if [ $stage -le 12 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh data/lang_test $dir $dir/graph
 fi
 
 graph_dir=$dir/graph
diff --git a/egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh
index 6b7223785d9..965932316b8 100755
--- a/egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh
+++ b/egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh
@@ -170,7 +170,7 @@ if [ $stage -le 12 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh data/lang_test $dir $dir/graph
 fi
 
 graph_dir=$dir/graph
diff --git a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh
index 86c9becac5b..9148f54d29b 100755
--- a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -188,7 +188,7 @@ if [ $stage -le 12 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh data/lang_test $dir $dir/graph
 fi
 
 graph_dir=$dir/graph
diff --git a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh
index d8560e63909..f0d87890c00 100755
--- a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh
+++ b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh
@@ -238,7 +238,7 @@ if [ $stage -le 12 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh data/lang_test $dir $dir/graph
 fi
 
 graph_dir=$dir/graph
diff --git a/egs/ami/s5/local/chain/run_blstm_ami_5.sh b/egs/ami/s5/local/chain/run_blstm_ami_5.sh
index 53221a2bd53..90e096a9264 100755
--- a/egs/ami/s5/local/chain/run_blstm_ami_5.sh
+++ b/egs/ami/s5/local/chain/run_blstm_ami_5.sh
@@ -149,7 +149,7 @@ if [ $stage -le 18 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 19 ]; then
diff --git a/egs/ami/s5/local/chain/run_tdnn_ami_5.sh b/egs/ami/s5/local/chain/run_tdnn_ami_5.sh
index df635316127..5b9ab9de043 100755
--- a/egs/ami/s5/local/chain/run_tdnn_ami_5.sh
+++ b/egs/ami/s5/local/chain/run_tdnn_ami_5.sh
@@ -175,7 +175,7 @@ if [ $stage -le 18 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 19 ]; then
diff --git a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_1a.sh b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_1a.sh
index 4d260e3c517..57628d86798 100755
--- a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_1a.sh
+++ b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_1a.sh
@@ -307,7 +307,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh
index 3546b6a7ced..8aae7760a71 100755
--- a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh
@@ -301,7 +301,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh
index 1a839b045bd..64d8e1822ca 100755
--- a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh
+++ b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh
@@ -330,7 +330,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh
index d926c1dc6d7..23e5bda2038 100644
--- a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh
@@ -284,7 +284,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh
index d9cd1c356e8..d9dd08166c2 100644
--- a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh
@@ -278,7 +278,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh
index a0805b4f9f1..ac5c403c4bd 100755
--- a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh
@@ -287,7 +287,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh
index 03ebc5845e4..3b107519114 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh
@@ -217,7 +217,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh
index 997357b80a9..2ea2266b1b5 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh
@@ -245,7 +245,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh
index 4d062e65429..de2030c71cc 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh
@@ -232,7 +232,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh
index 387570388d0..4375253d3a2 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh
@@ -244,7 +244,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh
index 0436b08cdc0..b372db56e32 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh
@@ -242,7 +242,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh
index 4ca526d63b8..ee887fd91c2 100644
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh
@@ -247,7 +247,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh
index baed760bb68..8c421c58351 100644
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh
@@ -248,7 +248,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh
index e721a858c0a..2c226c01105 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh
@@ -251,7 +251,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh
index de40cb2d1a4..7486b3b6d6e 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh
@@ -253,7 +253,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh
index 4f580b88f6b..84470f6530b 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -259,7 +259,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh
index 904a079d7de..93ef04d79f5 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh
@@ -263,7 +263,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh
index 511e520465a..60a6356077e 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh
@@ -262,7 +262,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh
index bd81b7df4eb..a3ee0bcb631 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh
@@ -264,7 +264,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh
index 50903e78b6d..aff42a3647f 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh
@@ -264,7 +264,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh
index f6c53001498..a748e034cf8 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh
@@ -263,7 +263,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh
index 79fd9ef3fb5..0cdf44279f2 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh
@@ -264,7 +264,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh
index e58a7f89e03..428e4926693 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh
@@ -265,7 +265,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh
index 13f894f5a48..3bd87ca26f0 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh
@@ -264,7 +264,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh
index 48b31832e8c..b835da9cf38 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh
@@ -275,7 +275,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh
index e675bc494bb..0caf4494b79 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh
@@ -271,7 +271,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh
index 2d019398274..f8a6a0f1aa7 100644
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh
@@ -315,7 +315,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh
index 9e5b971bbe2..cb49eb94888 100644
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh
@@ -321,7 +321,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh
index 9575c3cf686..0df4d741fe4 100644
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh
@@ -269,7 +269,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh
index a7f2625c181..6bbc6fd52ad 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh
@@ -276,7 +276,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh
index ca920869b30..dacf4639a1f 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh
@@ -278,7 +278,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh
index 53dbd5238db..1fd80acab90 100644
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh
@@ -272,7 +272,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh
index dafef668e60..d39a7cf6c9f 100644
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh
@@ -273,7 +273,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh
index 677946d0b9a..d0b3f4181bc 100644
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh
@@ -272,7 +272,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh
index bd13010c791..2928bde6ab4 100755
--- a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh
+++ b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh
@@ -230,7 +230,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $dir/graph_pp
+  utils/mkgraph.sh data/lang_pp_test $dir $dir/graph_pp
 fi
 
 if [ $stage -le 14 ]; then
diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh
index b5979a3ce6b..fed98e57b99 100755
--- a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh
+++ b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh
@@ -199,7 +199,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $dir/graph_pp
+  utils/mkgraph.sh data/lang_pp_test $dir $dir/graph_pp
 fi
 
 
diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh
index d6292fbadb3..bb15ea684f0 100755
--- a/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh
+++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh
@@ -216,7 +216,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $dir/graph_pp
+  utils/mkgraph.sh data/lang_pp_test $dir $dir/graph_pp
 fi
 
 if [ $stage -le 14 ]; then
diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_asp_1.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_asp_1.sh
index 5b35c902354..8eeb2fef21a 100755
--- a/egs/aspire/s5/local/chain/tuning/run_tdnn_asp_1.sh
+++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_asp_1.sh
@@ -190,7 +190,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $dir/graph_pp
+  utils/mkgraph.sh data/lang_pp_test $dir $dir/graph_pp
 fi
 
 if [ $stage -le 14 ]; then
diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
index e6aa37a7543..96fa90ff4ae 100755
--- a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -249,7 +249,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir
+  utils/mkgraph.sh data/lang_pp_test $dir $graph_dir
 fi
 
 if [ $stage -le 15 ]; then
diff --git a/egs/babel/s5c/local/ali_to_rttm.sh b/egs/babel/s5c/local/ali_to_rttm.sh
index ef11f516ea3..4b1ef5948cd 100755
--- a/egs/babel/s5c/local/ali_to_rttm.sh
+++ b/egs/babel/s5c/local/ali_to_rttm.sh
@@ -23,7 +23,7 @@
 #local/ali_to_rttm.sh data/dev2h data/lang exp/sgmm5/align_dev2h/
 
 cmd=run.pl
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 boost_silence=1.0
diff --git a/egs/babel/s5d/local/ali_to_rttm.sh b/egs/babel/s5d/local/ali_to_rttm.sh
index cb4f0740130..6a720c91287 100755
--- a/egs/babel/s5d/local/ali_to_rttm.sh
+++ b/egs/babel/s5d/local/ali_to_rttm.sh
@@ -23,7 +23,7 @@
 #local/ali_to_rttm.sh data/dev2h data/lang exp/sgmm5/align_dev2h/
 
 cmd=run.pl
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 boost_silence=1.0
diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn.sh
index 7b4535f8c5e..102225f9bc4 100755
--- a/egs/babel/s5d/local/chain/tuning/run_tdnn.sh
+++ b/egs/babel/s5d/local/chain/tuning/run_tdnn.sh
@@ -210,7 +210,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph
+  utils/mkgraph.sh data/langp_test $dir $dir/graph
 fi
 
 exit 0
diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh
index 5fc14dda826..93958c93717 100755
--- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh
+++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh
@@ -217,7 +217,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph
+  utils/mkgraph.sh data/langp_test $dir $dir/graph
 fi
 
 exit 0
diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh
index 8c7de5d18d4..d4f2ed70cfb 100755
--- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh
+++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh
@@ -215,7 +215,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph
+  utils/mkgraph.sh data/langp_test $dir $dir/graph
 fi
 
 exit 0
diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh
index 0b3e70b5a04..b9ff6c1a15d 100755
--- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh
+++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh
@@ -215,7 +215,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph
+  utils/mkgraph.sh data/langp_test $dir $dir/graph
 fi
 
 exit 0
diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh
index 45f2907645e..1c3f26e7def 100755
--- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh
+++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh
@@ -216,7 +216,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph
+  utils/mkgraph.sh data/langp_test $dir $dir/graph
 fi
 
 exit 0
diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh
index 0d92aff5c28..2342437c83c 100755
--- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh
+++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh
@@ -216,7 +216,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph
+  utils/mkgraph.sh data/langp_test $dir $dir/graph
 fi
 
 exit 0
diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh
index 4129c00dcb4..38bdcfda2f5 100755
--- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh
+++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh
@@ -216,7 +216,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph
+  utils/mkgraph.sh data/langp_test $dir $dir/graph
 fi
 
 exit 0
diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh
index 1cfa50c1aa1..d0c7ca09b1c 100755
--- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh
+++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh
@@ -216,7 +216,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph
+  utils/mkgraph.sh data/langp_test $dir $dir/graph
 fi
 
 exit 0
diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh
index ba8ac1e0373..ceb94f5e16b 100755
--- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh
+++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh
@@ -218,7 +218,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph
+  utils/mkgraph.sh data/langp_test $dir $dir/graph
 fi
 
 exit 0
diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh
index 5de285e080e..98c6b13aeef 100755
--- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh
+++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh
@@ -220,7 +220,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph
+  utils/mkgraph.sh data/langp_test $dir $dir/graph
 fi
 
 exit 0
diff --git a/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
index ec530ef1ce4..fac93f0a790 100755
--- a/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
+++ b/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
@@ -110,7 +110,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 fi
@@ -235,7 +235,7 @@ if [ $stage -le 6 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/bentham/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/bentham/v1/local/chain/tuning/run_e2e_cnn_1a.sh
index 716bdce3729..c5d4106e44a 100755
--- a/egs/bentham/v1/local/chain/tuning/run_e2e_cnn_1a.sh
+++ b/egs/bentham/v1/local/chain/tuning/run_e2e_cnn_1a.sh
@@ -147,7 +147,7 @@ if [ $stage -le 4 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/bentham/v1/run_end2end.sh b/egs/bentham/v1/run_end2end.sh
index 63c034e41f6..5d821aeb9ee 100755
--- a/egs/bentham/v1/run_end2end.sh
+++ b/egs/bentham/v1/run_end2end.sh
@@ -111,7 +111,7 @@ if [ $stage -le 6 ]; then
   echo "$0: Aligning the training data using the e2e chain model..."
   steps/nnet3/align.sh --nj 50 --cmd "$cmd" \
                        --use-gpu false \
-                       --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \
+                       --scale-opts ' --acoustic-scale=1.0' \
                        data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train
 fi
 
diff --git a/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh b/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh
index 3f8b7c60090..fb254339cb5 100755
--- a/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh
@@ -321,7 +321,7 @@ if [ $stage -le 17 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test_tgpr_5k/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgpr_5k \
+    data/lang_test_tgpr_5k \
     $tree_dir $tree_dir/graph_tgpr_5k || exit 1;
 fi
 
diff --git a/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_lstm_1a.sh
index 8b4e93cd05b..19ea72a944a 100755
--- a/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -275,7 +275,7 @@ if [ $stage -le 17 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test_tgpr_5k/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgpr_5k \
+    data/lang_test_tgpr_5k \
     $tree_dir $tree_dir/graph_tgpr_5k || exit 1;
 fi
 
diff --git a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh
index d60e6a4aa04..834234ea87d 100755
--- a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -212,7 +212,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang${lm_suffix}/ \
+    data/lang${lm_suffix}/ \
     $tree_dir $tree_dir/graph${lm_suffix} || exit 1;
 fi
 
diff --git a/egs/chime5/s5/local/run_recog.sh b/egs/chime5/s5/local/run_recog.sh
index 5c74c9ff242..9da73a02821 100755
--- a/egs/chime5/s5/local/run_recog.sh
+++ b/egs/chime5/s5/local/run_recog.sh
@@ -130,7 +130,7 @@ if [ $stage -le 18 ]; then
   chunk_right_context=0
   
   utils/mkgraph.sh \
-      --self-loop-scale 1.0 data/lang${lm_suffix}/ \
+      data/lang${lm_suffix}/ \
       $tree_dir $tree_dir/graph${lm_suffix} || exit 1;
 
   frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
diff --git a/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh b/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh
index 95e9d934bd3..7b14b7dff67 100755
--- a/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh
+++ b/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh
@@ -246,7 +246,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang${lm_suffix}/ \
+    data/lang${lm_suffix}/ \
     $tree_dir $tree_dir/graph${lm_suffix} || exit 1;
 fi
 
diff --git a/egs/chime5/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/chime5/s5b/local/chain/tuning/run_tdnn_1a.sh
index daad37e2cd7..3b6c73e41d8 100755
--- a/egs/chime5/s5b/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/chime5/s5b/local/chain/tuning/run_tdnn_1a.sh
@@ -216,7 +216,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang${lm_suffix}/ \
+    data/lang${lm_suffix}/ \
     $tree_dir $tree_dir/graph${lm_suffix} || exit 1;
 fi
 
diff --git a/egs/chime5/s5b/local/chain/tuning/run_tdnn_1b.sh b/egs/chime5/s5b/local/chain/tuning/run_tdnn_1b.sh
index e033715d884..ccde8a0fcd3 100755
--- a/egs/chime5/s5b/local/chain/tuning/run_tdnn_1b.sh
+++ b/egs/chime5/s5b/local/chain/tuning/run_tdnn_1b.sh
@@ -224,7 +224,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang${lm_suffix}/ \
+    data/lang${lm_suffix}/ \
     $tree_dir $tree_dir/graph${lm_suffix} || exit 1;
 fi
 
diff --git a/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh
index e3d8e6ac4dc..e80797de57a 100755
--- a/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -239,7 +239,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang${lm_suffix}/ \
+    data/lang${lm_suffix}/ \
     $tree_dir $tree_dir/graph${lm_suffix} || exit 1;
 fi
 
diff --git a/egs/chime5/s5b/local/run_recog.sh b/egs/chime5/s5b/local/run_recog.sh
index 5c74c9ff242..9da73a02821 100755
--- a/egs/chime5/s5b/local/run_recog.sh
+++ b/egs/chime5/s5b/local/run_recog.sh
@@ -130,7 +130,7 @@ if [ $stage -le 18 ]; then
   chunk_right_context=0
   
   utils/mkgraph.sh \
-      --self-loop-scale 1.0 data/lang${lm_suffix}/ \
+      data/lang${lm_suffix}/ \
       $tree_dir $tree_dir/graph${lm_suffix} || exit 1;
 
   frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
diff --git a/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh
index d4acd0fed4b..74d37961396 100755
--- a/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -229,7 +229,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test \
+    data/lang_test \
     $tree_dir $tree_dir/graph || exit 1;
 fi
 
diff --git a/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh
index 75ceb80e3e0..1cb21d96375 100755
--- a/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -213,7 +213,7 @@ fi
 
 if [ $stage -le 14 ]; then
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_csj_tg $dir $dir/graph_csj_tg
+    data/lang_csj_tg $dir $dir/graph_csj_tg
 
   for decode_set in $test_sets; do
     steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --nj 10 \
diff --git a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh
index 7f407552c2e..5f4690d05b4 100755
--- a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh
+++ b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh
@@ -249,7 +249,7 @@ if [ $stage -le 21 ]; then
   #LM was trained only on Fisher Spanish train subset.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test \
+    data/lang_test \
     $tree_dir $tree_dir/graph_fsp_train || exit 1;
 
 fi
diff --git a/egs/fisher_english/s5/local/chain/run_tdnn.sh b/egs/fisher_english/s5/local/chain/run_tdnn.sh
index 1fd0f1fdf3a..424a4610bab 100755
--- a/egs/fisher_english/s5/local/chain/run_tdnn.sh
+++ b/egs/fisher_english/s5/local/chain/run_tdnn.sh
@@ -193,7 +193,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir
+  utils/mkgraph.sh data/lang_test $dir $graph_dir
 fi
 
 decode_suff=
diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh
index b76efc4f1de..0ff2d018a42 100644
--- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh
+++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh
@@ -126,7 +126,7 @@ for f in data/${supervised_set_perturbed}/feats.scp \
 done
 
 if [ ! -f $graphdir/HCLG.fst ]; then
-  utils/mkgraph.sh --self-loop-scale 1.0 $unsup_decode_lang $sup_chain_dir $graphdir
+  utils/mkgraph.sh $unsup_decode_lang $sup_chain_dir $graphdir
 fi
 
 # Prepare the speed-perturbed unsupervised data directory
@@ -402,7 +402,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir
+  utils/mkgraph.sh ${test_lang} $dir $test_graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh
index b1c133942ef..aa2818c23ce 100755
--- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh
+++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh
@@ -224,7 +224,7 @@ if [ $stage -le 15 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_poco_unk $dir $graph_dir
+  utils/mkgraph.sh data/lang_test_poco_unk $dir $graph_dir
 fi
 
 decode_suff=
diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh
index 53aac8c08ea..7e6639c0a45 100755
--- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh
+++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh
@@ -138,7 +138,7 @@ for f in data/${supervised_set_perturbed}/feats.scp \
 done
 
 if [ ! -f $graphdir/HCLG.fst ]; then
-  utils/mkgraph.sh --self-loop-scale 1.0 $unsup_decode_lang $sup_chain_dir $graphdir
+  utils/mkgraph.sh $unsup_decode_lang $sup_chain_dir $graphdir
 fi
 
 if [ $stage -le 2 ]; then
@@ -421,7 +421,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir
+  utils/mkgraph.sh ${test_lang} $dir $test_graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh
index 66f87c8da8f..ce32a3ca9b7 100755
--- a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh
+++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh
@@ -143,7 +143,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
+  utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
 fi
 
 decode_suff=fsh_sw1_tg
diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh
index c12f604f26b..0bedf85c8cb 100755
--- a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh
+++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh
@@ -216,7 +216,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
+  utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
 fi
 
 decode_suff=fsh_sw1_tg
diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh
index 543f753bd4e..0179ebd26e3 100755
--- a/egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh
+++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh
@@ -135,7 +135,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
+  utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
 fi
 
 decode_suff=fsh_sw1_tg
diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh
index efcd1eced4a..910bbe358bf 100644
--- a/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh
+++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh
@@ -212,7 +212,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
+  utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
 fi
 
 decode_suff=fsh_sw1_tg
diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh
index e4a555abfdd..ac990889e2a 100644
--- a/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh
+++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh
@@ -221,7 +221,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
+  utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
 fi
 
 decode_suff=fsh_sw1_tg
diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh
index 5650cedca28..89ef17fa9bc 100755
--- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh
+++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh
@@ -232,7 +232,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
+  utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
 fi
 
 decode_suff=fsh_sw1_tg
diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh
index 5beb2e74a9a..1a711089912 100644
--- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh
+++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh
@@ -312,7 +312,7 @@ if [ $stage -le 15 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
+  utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
 fi
 
 decode_suff=fsh_sw1_tg
diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh
index f3cc869e6de..aed698b343d 100755
--- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh
+++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh
@@ -242,7 +242,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
+  utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
 fi
 
 decode_suff=fsh_sw1_tg
diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh
index 059a81e15fc..cd5910cf9b4 100755
--- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh
+++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh
@@ -239,7 +239,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
+  utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
 fi
 
 decode_suff=fsh_sw1_tg
diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh
index d86b699d6f6..51546ddd622 100755
--- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh
+++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh
@@ -240,7 +240,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
+  utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
 fi
 
 decode_suff=fsh_sw1_tg
diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh
index 66c5ad3335f..e4aa735a9d8 100755
--- a/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -164,7 +164,7 @@ if [ $stage -le 12 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh data/lang_test $dir $dir/graph
 fi
 
 graph_dir=$dir/graph
diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh
index 1981bb0530d..ec2f9dc1b6c 100755
--- a/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh
+++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh
@@ -172,7 +172,7 @@ if [ $stage -le 12 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh data/lang_test $dir $dir/graph
 fi
 
 graph_dir=$dir/graph
diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh
index 6fa10344cfc..c3ee11a0638 100755
--- a/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh
+++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh
@@ -174,7 +174,7 @@ if [ $stage -le 12 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh data/lang_test $dir $dir/graph
 fi
 
 graph_dir=$dir/graph
diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh
index 1f4b7e12850..03d739579bd 100755
--- a/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh
+++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh
@@ -173,7 +173,7 @@ if [ $stage -le 12 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh data/lang_test $dir $dir/graph
 fi
 
 graph_dir=$dir/graph
diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
index bf2e45c9914..d62d214d957 100755
--- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
@@ -200,7 +200,7 @@ if [ $stage -le 17 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test \
+    data/lang_test \
     $tree_dir $tree_dir/graph || exit 1;
 fi
 
diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh
index deebafc95e4..5278b97591a 100755
--- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -206,7 +206,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh --left-biphone data/lang_test $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh
index bf2e45c9914..d62d214d957 100755
--- a/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh
@@ -200,7 +200,7 @@ if [ $stage -le 17 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test \
+    data/lang_test \
     $tree_dir $tree_dir/graph || exit 1;
 fi
 
diff --git a/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh
index deebafc95e4..5278b97591a 100755
--- a/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -206,7 +206,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh --left-biphone data/lang_test $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/gp/s1/steps/align_deltas.sh b/egs/gp/s1/steps/align_deltas.sh
index 22da04432c7..37406b4d2a6 100755
--- a/egs/gp/s1/steps/align_deltas.sh
+++ b/egs/gp/s1/steps/align_deltas.sh
@@ -93,7 +93,7 @@ mkdir -p $dir
 # Create copy of the tree and model and occs...
 cp $srcdir/{tree,final.mdl,final.occs} $dir || exit 1;
 
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 
 if [ ! -d $data/split$nj -o $data/split$nj -ot $data/feats.scp ]; then
   split_data.sh $data $nj
diff --git a/egs/gp/s1/steps/train_deltas.sh b/egs/gp/s1/steps/train_deltas.sh
index 0efe7b60379..45a4a54f861 100755
--- a/egs/gp/s1/steps/train_deltas.sh
+++ b/egs/gp/s1/steps/train_deltas.sh
@@ -125,7 +125,7 @@ if [ ! -f $alidir/final.mdl ]; then
   exit 1;
 fi
 
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 realign_iters="10 20 30";
 oov_sym=`cat $lang/oov.txt`
 silphonelist=`cat $lang/silphones.csl`
diff --git a/egs/gp/s1/steps/train_mono.sh b/egs/gp/s1/steps/train_mono.sh
index e82c14fcaf2..c4e2ad42228 100755
--- a/egs/gp/s1/steps/train_mono.sh
+++ b/egs/gp/s1/steps/train_mono.sh
@@ -77,7 +77,7 @@ dir=$3
 [ -f path.sh ] && . ./path.sh
 
 # Configuration:
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 numiters=40    # Number of iterations of training
 maxiterinc=30 # Last iter to increase #Gauss on.
 numgauss=300 # Initial num-Gauss (must be more than #states=3*phones).
diff --git a/egs/gp/s1/utils/lmrescore.sh b/egs/gp/s1/utils/lmrescore.sh
index c911d0ce8b0..bf70021f13e 100755
--- a/egs/gp/s1/utils/lmrescore.sh
+++ b/egs/gp/s1/utils/lmrescore.sh
@@ -157,7 +157,7 @@ case "$mode" in
       lattice-compose ark:- $outdir/Ldet.fst ark:- \| \
       lattice-determinize ark:- ark:- \| \
       lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \
-      lattice-add-trans-probs --transition-scale=1.0 --self-loop-scale=0.1 \
+      lattice-add-trans-probs \
         $mdl ark:- ark:- \| \
       gzip -c \>$newlat  ||  error_exit "Error doing LM rescoring."
   ;;
diff --git a/egs/gp/s1/utils/mkgraph.sh b/egs/gp/s1/utils/mkgraph.sh
index 3aba742832d..14a4048ffba 100755
--- a/egs/gp/s1/utils/mkgraph.sh
+++ b/egs/gp/s1/utils/mkgraph.sh
@@ -19,7 +19,7 @@
 # all the language-model, pronunciation dictionary (lexicon), context-dependency,
 # and HMM structure in our model.  The output is a Finite State Transducer
 # that has word-ids on the output, and pdf-ids on the input (these are indexes
-# that resolve to Gaussian Mixture Models).  
+# that resolve to Gaussian Mixture Models).
 # See
 #  http://kaldi-asr.org/doc/graph_recipe_test.html
 # (this is compiled from this repository using Doxygen,
@@ -30,7 +30,7 @@ N=3
 P=1
 clean=false
 
-for x in 1 2 3; do 
+for x in 1 2 3; do
   if [ $1 == "--mono" ]; then
     N=1;
     P=0;
@@ -60,9 +60,6 @@ if $clean; then rm -r $lang/tmp; fi
 
 mkdir -p $dir
 
-tscale=1.0
-loopscale=0.1
-
 # If $lang/tmp/LG.fst does not exist or is older than its sources, make it...
 # (note: the [[ ]] brackets make the || type operators work (inside [ ], we
 # would have to use -o instead),  -f means file exists, and -ot means older than).
@@ -101,7 +98,7 @@ fi
 if [[ ! -f $dir/Ha.fst || $dir/Ha.fst -ot $model  \
     || $dir/Ha.fst -ot $lang/tmp/ilabels_${N}_${P} ]]; then
   make-h-transducer --disambig-syms-out=$dir/disambig_tid.list \
-    --transition-scale=$tscale $lang/tmp/ilabels_${N}_${P} $tree $model \
+    $lang/tmp/ilabels_${N}_${P} $tree $model \
      > $dir/Ha.fst  || exit 1;
 fi
 
@@ -114,13 +111,10 @@ if [[ ! -f $dir/HCLGa.fst || $dir/HCLGa.fst -ot $dir/Ha.fst || \
 fi
 
 if [[ ! -f $dir/HCLG.fst || $dir/HCLG.fst -ot $dir/HCLGa.fst ]]; then
-  add-self-loops --self-loop-scale=$loopscale --reorder=true \
+  add-self-loops \
     $model < $dir/HCLGa.fst > $dir/HCLG.fst || exit 1;
 
-  if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then
-    # No point doing this test if transition-scale not 1, as it is bound to fail. 
-    fstisstochastic $dir/HCLG.fst || echo "Final HCLG is not stochastic."
-  fi
+  fstisstochastic $dir/HCLG.fst || echo "Final HCLG is not stochastic."
 fi
 
 # keep a copy of the lexicon and a list of silence phones with HCLG...
diff --git a/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh
index 361879b4142..0a40bd33c66 100755
--- a/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh
+++ b/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh
@@ -249,7 +249,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 \
+    \
     data/lang_test \
     $tree_dir \
     $tree_dir/graph || exit 1;
diff --git a/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh
index 290bd4c7970..147195d18b7 100755
--- a/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -236,7 +236,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 \
+    \
     data/lang_test \
     $tree_dir \
     $tree_dir/graph || exit 1;
diff --git a/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh
index cfb4dc1f697..3591f11d228 100755
--- a/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh
+++ b/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh
@@ -232,7 +232,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 \
+    \
     data/lang_test \
     $tree_dir \
     $tree_dir/graph || exit 1;
diff --git a/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh
index c62b776de2b..c79606dcfd1 100755
--- a/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh
+++ b/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh
@@ -207,7 +207,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh data/lang_test $dir $dir/graph
 fi
 
 graph_dir=$dir/graph
diff --git a/egs/hub4_spanish/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/hub4_spanish/s5/local/chain/tuning/run_cnn_tdnn_1a.sh
index d1b657a2d74..e2a51260ff5 100755
--- a/egs/hub4_spanish/s5/local/chain/tuning/run_cnn_tdnn_1a.sh
+++ b/egs/hub4_spanish/s5/local/chain/tuning/run_cnn_tdnn_1a.sh
@@ -254,7 +254,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/langp_test \
+    data/langp_test \
     $tree_dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1a.sh
index 40bbbe1ae79..25b2224a855 100755
--- a/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -223,7 +223,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/langp_test \
+    data/langp_test \
     $tree_dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1b.sh
index a498d8157f3..246adb1e45d 100755
--- a/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1b.sh
+++ b/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1b.sh
@@ -240,7 +240,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/langp_test \
+    data/langp_test \
     $tree_dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh
index ef1273f3961..a0655c6f247 100755
--- a/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh
+++ b/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh
@@ -209,7 +209,7 @@ if [ $stage -le 6 ]; then
   # lang directory, one that contained a wordlist and LM of your choice,
   # as long as phones.txt was compatible.
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/$lang_decode \
+    data/$lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1a.sh
index bbcc55aa2b0..2b80fbcb4de 100755
--- a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1a.sh
+++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1a.sh
@@ -100,7 +100,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             $train_data_dir data/lang $chain_model_dir $lat_dir
   cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts
 fi
@@ -206,7 +206,7 @@ if [ $stage -le 6 ]; then
   # lang directory, one that contained a wordlist and LM of your choice,
   # as long as phones.txt was compatible.
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/$lang_decode \
+    data/$lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh
index 401ffa14e19..ad7367b614e 100755
--- a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh
+++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh
@@ -98,7 +98,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             $train_data_dir data/lang $chain_model_dir $lat_dir
   cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts
 fi
@@ -207,7 +207,7 @@ if [ $stage -le 6 ]; then
   # lang directory, one that contained a wordlist and LM of your choice,
   # as long as phones.txt was compatible.
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/$lang_decode \
+    data/$lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh
index 17209b9204f..3770eb0aa40 100755
--- a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh
+++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh
@@ -97,7 +97,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             $train_data_dir data/lang $chain_model_dir $lat_dir
   cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts
 fi
@@ -213,7 +213,7 @@ if [ $stage -le 6 ]; then
   # lang directory, one that contained a wordlist and LM of your choice,
   # as long as phones.txt was compatible.
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/$lang_decode \
+    data/$lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1d.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1d.sh
index 89a40ed2a13..e5d12aabbb7 100755
--- a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1d.sh
+++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1d.sh
@@ -101,7 +101,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             $train_data_dir data/lang $chain_model_dir $lat_dir
   cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts
 fi
@@ -215,7 +215,7 @@ if [ $stage -le 6 ]; then
   # lang directory, one that contained a wordlist and LM of your choice,
   # as long as phones.txt was compatible.
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/$lang_decode \
+    data/$lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
index 703d404159a..81399230b2e 100755
--- a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
+++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
@@ -94,7 +94,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 fi
@@ -211,7 +211,7 @@ if [ $stage -le 6 ]; then
   # lang directory, one that contained a wordlist and LM of your choice,
   # as long as phones.txt was compatible.
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/$lang_decode \
+    data/$lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh
index 905c4661477..ecc93e9341a 100755
--- a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh
+++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh
@@ -91,7 +91,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             $train_data_dir data/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 fi
@@ -203,7 +203,7 @@ if [ $stage -le 6 ]; then
   # lang directory, one that contained a wordlist and LM of your choice,
   # as long as phones.txt was compatible.
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/$lang_decode \
+    data/$lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh
index 26b1aca0929..f7cf2d3ff59 100755
--- a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh
+++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh
@@ -93,7 +93,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             $train_data_dir data/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 fi
@@ -206,7 +206,7 @@ if [ $stage -le 6 ]; then
   # lang directory, one that contained a wordlist and LM of your choice,
   # as long as phones.txt was compatible.
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh
index 462ad0522de..72ad70e7dcd 100755
--- a/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh
+++ b/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh
@@ -138,7 +138,7 @@ if [ $stage -le 4 ]; then
   # lang directory, one that contained a wordlist and LM of your choice,
   # as long as phones.txt was compatible.
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh
index 0a8b014715f..e81c2eb54ba 100755
--- a/egs/iam/v1/run_end2end.sh
+++ b/egs/iam/v1/run_end2end.sh
@@ -114,7 +114,7 @@ if [ $stage -le 6 ]; then
   echo "$0: Aligning the training data using the e2e chain model..."
   steps/nnet3/align.sh --nj 50 --cmd "$cmd" \
                        --use-gpu false \
-                       --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \
+                       --scale-opts ' --acoustic-scale=1.0' \
                        data/$train_set data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train
 fi
 
diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh
index 9a01688ba35..10a69265b3f 100755
--- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh
+++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh
@@ -106,7 +106,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 fi
@@ -231,7 +231,7 @@ if [ $stage -le 6 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh
index 28aa246f334..ce3fda36052 100755
--- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh
+++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh
@@ -108,7 +108,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 fi
@@ -233,7 +233,7 @@ if [ $stage -le 6 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh
index f158317950a..c253a796813 100755
--- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh
+++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh
@@ -110,7 +110,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 fi
@@ -234,7 +234,7 @@ if [ $stage -le 6 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh
index 1c44057454a..dde868d6918 100755
--- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh
+++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh
@@ -108,7 +108,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 fi
@@ -233,7 +233,7 @@ if [ $stage -le 6 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh
index cb2bfa0a82d..1758efd8f4d 100755
--- a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh
+++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh
@@ -158,7 +158,7 @@ if [ $stage -le 4 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh
index d5f79602695..f02246503d1 100755
--- a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh
+++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh
@@ -144,7 +144,7 @@ if [ $stage -le 4 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh
index c515c85fc72..51dc6737c86 100755
--- a/egs/iam/v2/run_end2end.sh
+++ b/egs/iam/v2/run_end2end.sh
@@ -136,7 +136,7 @@ if [ $stage -le 7 ]; then
   echo "$0: Aligning the training data using the e2e chain model..."
   steps/nnet3/align.sh --nj 50 --cmd "$cmd" \
                        --use-gpu false \
-                       --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \
+                       --scale-opts ' --acoustic-scale=1.0' \
                        data/train_aug data/lang exp/chain/e2e_cnn_1b exp/chain/e2e_ali_train
 fi
 
diff --git a/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh
index 10650a18269..af7f6599f97 100755
--- a/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -223,7 +223,7 @@ if [ $stage -le 14 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test \
+    data/lang_test \
     $tree_dir $tree_dir/graph || exit 1;
 fi
 
diff --git a/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh
index db62e6f8a55..e657c9bc3f3 100755
--- a/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh
+++ b/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh
@@ -237,7 +237,7 @@ if [ $stage -le 14 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test \
+    data/lang_test \
     $tree_dir $tree_dir/graph || exit 1;
 fi
 
diff --git a/egs/ifnenit/v1/local/chain/run_cnn_1a.sh b/egs/ifnenit/v1/local/chain/run_cnn_1a.sh
index b0ecd547741..d0c386e75ee 100755
--- a/egs/ifnenit/v1/local/chain/run_cnn_1a.sh
+++ b/egs/ifnenit/v1/local/chain/run_cnn_1a.sh
@@ -211,7 +211,7 @@ if [ $stage -le 6 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/$lang_test \
+    data/$lang_test \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh b/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh
index 7f3132d657e..7eeb6f4a15c 100755
--- a/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh
+++ b/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh
@@ -101,7 +101,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $chain_model_dir $lat_dir
   cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts
 fi
@@ -216,7 +216,7 @@ if [ $stage -le 6 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/$lang_test \
+    data/$lang_test \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh b/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh
index 6bf3a139ad1..0e5a3410e31 100755
--- a/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh
+++ b/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh
@@ -125,7 +125,7 @@ if [ $stage -le 1 ]; then
          # have some stragglers.
   steps/nnet3/align.sh  --cmd "$decode_cmd" --use-gpu false \
     --online-ivector-dir $train_ivector_dir \
-    --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \
+    --scale-opts "--acoustic-scale=1.0 " \
     --nj $nj $train_data_dir $lang $srcdir ${srcdir}_ali${affix} ;
 fi
 
@@ -139,7 +139,7 @@ if [ -z "$lats_dir" ]; then
     subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving
     # total slots = 80 * 6 = 480.
     steps/nnet3/make_denlats.sh --cmd "$decode_cmd" \
-      --self-loop-scale 1.0 --acwt 1.0 --determinize true \
+      --acwt 1.0 --determinize true \
       --online-ivector-dir $train_ivector_dir \
       --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
       $train_data_dir $lang $srcdir ${lats_dir} ;
diff --git a/egs/librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh
index 8ebca6fd650..d79b7774de1 100755
--- a/egs/librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh
+++ b/egs/librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh
@@ -211,7 +211,7 @@ if [ $stage -le 16 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir
+  utils/mkgraph.sh --remove-oov data/lang_test_tgsmall $dir $graph_dir
   # remove <UNK> from the graph, and convert back to const-FST.
   fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst - | \
     fstconvert --fst_type=const > $graph_dir/temp.fst
@@ -230,7 +230,7 @@ if [ $stage -le 17 ]; then
           --nj $decode_nj --cmd "$decode_cmd" $iter_opts \
           --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \
           $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1
-      steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \
+      steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
           data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1
       steps/lmrescore_const_arpa.sh \
           --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1a.sh
index d4c789f7794..30999fdad5d 100755
--- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -178,7 +178,7 @@ if [ $stage -le 16 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir
+  utils/mkgraph.sh --remove-oov data/lang_test_tgsmall $dir $graph_dir
   # romove <UNK> from the graph
   fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst $graph_dir/HCLG.fst
 fi
@@ -196,7 +196,7 @@ if [ $stage -le 17 ]; then
           --nj $decode_nj --cmd "$decode_cmd" $iter_opts \
           --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \
           $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1
-      steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \
+      steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
           data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1
       steps/lmrescore_const_arpa.sh \
           --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh
index 57f50df761d..d0ac8efc214 100755
--- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh
+++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh
@@ -205,7 +205,7 @@ if [ $stage -le 16 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir
+  utils/mkgraph.sh --remove-oov data/lang_test_tgsmall $dir $graph_dir
   # remove <UNK> from the graph, and convert back to const-FST.
   fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst - | \
     fstconvert --fst_type=const > $graph_dir/temp.fst
@@ -225,7 +225,7 @@ if [ $stage -le 17 ]; then
           --nj $decode_nj --cmd "$decode_cmd" $iter_opts \
           --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \
           $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1
-      steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \
+      steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
           data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1
       steps/lmrescore_const_arpa.sh \
           --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh
index 3970fa8c4d9..53773f5ba03 100755
--- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh
+++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh
@@ -196,7 +196,7 @@ if [ $stage -le 16 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir
+  utils/mkgraph.sh --remove-oov data/lang_test_tgsmall $dir $graph_dir
   # remove <UNK> from the graph, and convert back to const-FST.
   fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst - | \
     fstconvert --fst_type=const > $graph_dir/temp.fst
@@ -215,7 +215,7 @@ if [ $stage -le 17 ]; then
           --nj $decode_nj --cmd "$decode_cmd" $iter_opts \
           --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \
           $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1
-      steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \
+      steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
           data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1
       steps/lmrescore_const_arpa.sh \
           --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh
index 5c488362e59..d977db51449 100755
--- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh
+++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh
@@ -296,7 +296,7 @@ if [ $stage -le 16 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir
+  utils/mkgraph.sh --remove-oov data/lang_test_tgsmall $dir $graph_dir
   # remove <UNK> from the graph, and convert back to const-FST.
   fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst - | \
     fstconvert --fst_type=const > $graph_dir/temp.fst
@@ -315,7 +315,7 @@ if [ $stage -le 17 ]; then
           --nj $decode_nj --cmd "$decode_cmd" $iter_opts \
           --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \
           $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1
-      steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \
+      steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
           data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1
       steps/lmrescore_const_arpa.sh \
           --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
index 4277f769119..eea2e26f600 100755
--- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -179,7 +179,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir
+  utils/mkgraph.sh --remove-oov data/lang_test_tgsmall $dir $graph_dir
   # remove <UNK> from the graph, and convert back to const-FST.
   fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst - | \
     fstconvert --fst_type=const > $graph_dir/temp.fst
@@ -204,7 +204,7 @@ if [ $stage -le 15 ]; then
           --frames-per-chunk "$frames_per_chunk_primary" \
           --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \
           $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1
-      steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \
+      steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
           data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1
       steps/lmrescore_const_arpa.sh \
           --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh
index 383cc533270..57a0aac526c 100755
--- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh
+++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh
@@ -214,7 +214,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir
+  utils/mkgraph.sh --remove-oov data/lang_test_tgsmall $dir $graph_dir
   # remove <UNK> from the graph, and convert back to const-FST.
   fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst - | \
     fstconvert --fst_type=const > $graph_dir/temp.fst
@@ -239,7 +239,7 @@ if [ $stage -le 15 ]; then
           --frames-per-chunk "$frames_per_chunk_primary" \
           --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \
           $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1
-      steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \
+      steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
           data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1
       steps/lmrescore_const_arpa.sh \
           --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh
index 892ee441516..33de4ae8b93 100755
--- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh
+++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh
@@ -194,7 +194,7 @@ if [ $stage -le 6 ]; then
   # lang directory, one that contained a wordlist and LM of your choice,
   # as long as phones.txt was compatible.
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh
index 7ca7c652fd2..62ec4686ed6 100755
--- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh
+++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh
@@ -86,7 +86,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $chain_model_dir $lat_dir
   cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts
 fi
@@ -194,7 +194,7 @@ if [ $stage -le 6 ]; then
   # lang directory, one that contained a wordlist and LM of your choice,
   # as long as phones.txt was compatible.
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
index a8bc1836ffe..59a167f1e64 100755
--- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
+++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
@@ -87,7 +87,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 fi
@@ -206,7 +206,7 @@ if [ $stage -le 6 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh
index 0828e051dcc..fdf6f994268 100755
--- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh
+++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh
@@ -100,7 +100,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 fi
@@ -219,7 +219,7 @@ if [ $stage -le 6 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh
index ccbb7119674..5403dd2af05 100755
--- a/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh
+++ b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh
@@ -94,7 +94,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 
@@ -213,7 +213,7 @@ if [ $stage -le 6 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh b/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh
index 3fca8cf5fdc..90ca63a971e 100755
--- a/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh
+++ b/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh
@@ -150,7 +150,7 @@ if [ $stage -le 4 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/madcat_ar/v1/local/tl/run_text_localization.sh b/egs/madcat_ar/v1/local/tl/run_text_localization.sh
index 8d12f7d802f..5066adc73dd 100755
--- a/egs/madcat_ar/v1/local/tl/run_text_localization.sh
+++ b/egs/madcat_ar/v1/local/tl/run_text_localization.sh
@@ -133,7 +133,7 @@ if [ $stage -le 6 ]; then
   echo "$0: Aligning the training data using the e2e chain model...$(date)."
   steps/nnet3/align.sh --nj $nj --cmd "$cmd" \
                        --use-gpu false \
-                       --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \
+                       --scale-opts '--acoustic-scale=1.0' \
                        data/train_aug data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train
 fi
 
diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh
index 62f4eeb7c71..bb22e1b1a8e 100755
--- a/egs/madcat_ar/v1/run_end2end.sh
+++ b/egs/madcat_ar/v1/run_end2end.sh
@@ -119,7 +119,7 @@ fi
 
 if [ $stage -le 5 ] && $decode_e2e; then
   echo "$0: $(date) stage 5: decoding end2end setup..."
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode \
+  utils/mkgraph.sh $lang_decode \
     exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1;
 
   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --nj $nj --cmd "$cmd" \
diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh
index 164d62a7ad9..6affb1587aa 100755
--- a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh
+++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh
@@ -206,7 +206,7 @@ if [ $stage -le 6 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test \
+    data/lang_test \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh
index be51bdcc3d1..46df193483c 100755
--- a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh
+++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh
@@ -92,7 +92,7 @@ if [ $stage -le 2 ]; then
   # Get the alignments as lattices (gives the chain training more freedom).
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $chain_model_dir $lat_dir
   cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts
 fi
@@ -202,7 +202,7 @@ if [ $stage -le 6 ]; then
   # lang directory, one that contained a wordlist and LM of your choice,
   # as long as phones.txt was compatible.
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test \
+    data/lang_test \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh
index aa61620a92f..a478a63160f 100755
--- a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh
+++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh
@@ -96,7 +96,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $chain_model_dir $lat_dir
   cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts
 fi
@@ -210,7 +210,7 @@ if [ $stage -le 6 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test \
+    data/lang_test \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/madcat_zh/v1/run_end2end.sh b/egs/madcat_zh/v1/run_end2end.sh
index 7e0fc1e25d1..a89222fe1b4 100755
--- a/egs/madcat_zh/v1/run_end2end.sh
+++ b/egs/madcat_zh/v1/run_end2end.sh
@@ -96,7 +96,7 @@ fi
 
 if [ $stage -le 5 ] && $decode_e2e; then
   echo "$0: $(date) stage 5: decoding end2end setup..."
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode \
+  utils/mkgraph.sh $lang_decode \
     exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1;
 
   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --nj $nj --cmd "$cmd" \
diff --git a/egs/material/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/material/s5/local/chain/tuning/run_tdnn_1a.sh
index 4f38ee886a7..f7c99ef08d5 100755
--- a/egs/material/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/material/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -243,11 +243,11 @@ if [ $stage -le 12 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test \
+    data/lang_test \
     $tree_dir $tree_dir/graph || exit 1;
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_combined_test \
+    data/lang_combined_test \
     $tree_dir ${tree_dir}/graph_combined || exit 1;
 fi
 
diff --git a/egs/material/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/material/s5/local/chain/tuning/run_tdnn_1b.sh
index 023cb34b43d..4c853eefa9f 100755
--- a/egs/material/s5/local/chain/tuning/run_tdnn_1b.sh
+++ b/egs/material/s5/local/chain/tuning/run_tdnn_1b.sh
@@ -268,7 +268,7 @@ if [ $stage -le 12 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_combined_test \
+    data/lang_combined_test \
     $tree_dir ${tree_dir}/graph_combined || exit 1;
 fi
 
diff --git a/egs/material/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/material/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
index af5a62dad0d..576d1146d63 100755
--- a/egs/material/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/material/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -234,11 +234,11 @@ if [ $stage -le 12 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test \
+    data/lang_test \
     $tree_dir $tree_dir/graph || exit 1;
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_combined_test \
+    data/lang_combined_test \
     $tree_dir ${tree_dir}/graph_combined || exit 1;
 fi
 
diff --git a/egs/material/s5/local/semisup/chain/tuning/run_tdnn_1a.sh b/egs/material/s5/local/semisup/chain/tuning/run_tdnn_1a.sh
index 3d3056182ee..6b641a9235c 100755
--- a/egs/material/s5/local/semisup/chain/tuning/run_tdnn_1a.sh
+++ b/egs/material/s5/local/semisup/chain/tuning/run_tdnn_1a.sh
@@ -246,7 +246,7 @@ if [ $stage -le 12 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_combined_test \
+    data/lang_combined_test \
     $tree_dir ${tree_dir}/graph_combined || exit 1;
 fi
 
diff --git a/egs/material/s5/local/semisup/chain/tuning/run_tdnn_semisupervised_1a.sh b/egs/material/s5/local/semisup/chain/tuning/run_tdnn_semisupervised_1a.sh
index 37c957a3227..54bfa09b261 100755
--- a/egs/material/s5/local/semisup/chain/tuning/run_tdnn_semisupervised_1a.sh
+++ b/egs/material/s5/local/semisup/chain/tuning/run_tdnn_semisupervised_1a.sh
@@ -124,7 +124,7 @@ done
 
 if [ $stage -le 1 ]; then
   if [ ! -f $graphdir/HCLG.fst ]; then
-    utils/mkgraph.sh --self-loop-scale 1.0 $unsup_decode_lang $sup_chain_dir $graphdir
+    utils/mkgraph.sh $unsup_decode_lang $sup_chain_dir $graphdir
   fi
 fi
 
@@ -439,7 +439,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir
+  utils/mkgraph.sh ${test_lang} $dir $test_graph_dir
 fi
 
 if [ $stage -le 14 ]; then
diff --git a/egs/mgb5/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/mgb5/s5/local/chain/tuning/run_tdnn_1a.sh
index 6300511e817..9b6caa24e09 100644
--- a/egs/mgb5/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/mgb5/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -224,7 +224,7 @@ if [ $stage -le 14 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test \
+    data/lang_test \
     $tree_dir $tree_dir/graph || exit 1;
 fi
 
diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh
index c8f2503b578..6a1a5d9222f 100755
--- a/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh
+++ b/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh
@@ -246,7 +246,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgsmall \
+    data/lang_test_tgsmall \
     $tree_dir $tree_dir/graph_tgsmall || exit 1;
 fi
 
diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1a.sh
index da16297c9dd..066ef065f4a 100755
--- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -233,7 +233,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgsmall \
+    data/lang_test_tgsmall \
     $tree_dir $tree_dir/graph_tgsmall || exit 1;
 fi
 
diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1b.sh
index 3d0c2d63902..42d51ef2b37 100755
--- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1b.sh
+++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1b.sh
@@ -241,7 +241,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgsmall \
+    data/lang_test_tgsmall \
     $tree_dir $tree_dir/graph_tgsmall || exit 1;
 fi
 
diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1c.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1c.sh
index 081af8fe2f8..f4192726330 100755
--- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1c.sh
+++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1c.sh
@@ -237,7 +237,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgsmall \
+    data/lang_test_tgsmall \
     $tree_dir $tree_dir/graph_tgsmall || exit 1;
 fi
 
diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1c_discriminative.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1c_discriminative.sh
index 1aa519ccb9d..d68018a1032 100755
--- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1c_discriminative.sh
+++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1c_discriminative.sh
@@ -116,7 +116,7 @@ if [ $stage -le 1 ]; then
          # have some stragglers.
   steps/nnet3/align.sh  --cmd "$decode_cmd" --use-gpu false \
     --online-ivector-dir $online_ivector_dir \
-    --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \
+    --scale-opts "--acoustic-scale=1.0" \
     --nj $nj $train_data_dir $lang $srcdir ${srcdir}_ali${affix} ;
 fi
 
@@ -130,7 +130,7 @@ if [ -z "$lats_dir" ]; then
     subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving
     # total slots = 80 * 6 = 480.
     steps/nnet3/make_denlats.sh --cmd "$decode_cmd" \
-      --self-loop-scale 1.0 --acwt 1.0 --determinize true \
+      --acwt 1.0 --determinize true \
       --online-ivector-dir $online_ivector_dir \
       --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
       $train_data_dir $lang $srcdir ${lats_dir} ;
diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1d.sh
index 04df38d4da3..9c2ee69f2d5 100755
--- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1d.sh
+++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1d.sh
@@ -239,7 +239,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgsmall \
+    data/lang_test_tgsmall \
     $tree_dir $tree_dir/graph_tgsmall || exit 1;
 fi
 
diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1e.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1e.sh
index cdf9bb584f4..6df89c4acef 100755
--- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1e.sh
+++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1e.sh
@@ -238,7 +238,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgsmall \
+    data/lang_test_tgsmall \
     $tree_dir $tree_dir/graph_tgsmall || exit 1;
 fi
 
diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh
index d1385ff2be5..c619e6a287e 100755
--- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh
+++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh
@@ -246,7 +246,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgsmall \
+    data/lang_test_tgsmall \
     $tree_dir $tree_dir/graph_tgsmall || exit 1;
 fi
 
diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g.sh
index ad51780e191..f7f356fcb15 100755
--- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g.sh
+++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g.sh
@@ -246,7 +246,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgsmall \
+    data/lang_test_tgsmall \
     $tree_dir $tree_dir/graph_tgsmall || exit 1;
 fi
 
diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g20.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g20.sh
index dbfe5c5a07a..92e34bf6a78 100755
--- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g20.sh
+++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g20.sh
@@ -259,7 +259,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgsmall \
+    data/lang_test_tgsmall \
     $tree_dir $tree_dir/graph_tgsmall || exit 1;
 fi
 
diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1h.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1h.sh
index cc4123e2755..e9b04da7ec0 100755
--- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1h.sh
+++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1h.sh
@@ -239,7 +239,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgsmall \
+    data/lang_test_tgsmall \
     $tree_dir $tree_dir/graph_tgsmall || exit 1;
 fi
 
diff --git a/egs/mini_librispeech/s5/local/grammar/extend_vocab_demo.sh b/egs/mini_librispeech/s5/local/grammar/extend_vocab_demo.sh
index 1ec4a0d575b..8e82f0af9ef 100755
--- a/egs/mini_librispeech/s5/local/grammar/extend_vocab_demo.sh
+++ b/egs/mini_librispeech/s5/local/grammar/extend_vocab_demo.sh
@@ -70,7 +70,7 @@ fi
 
 if [ $stage -le 2 ]; then
   # make the top-level part of the graph.
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang_base $tree_dir $tree_dir/extvocab_nosp_top
+  utils/mkgraph.sh $lang_base $tree_dir $tree_dir/extvocab_nosp_top
 fi
 
 if [ $stage -le 3 ] && $run_g2p; then
@@ -266,7 +266,7 @@ if [ $stage -le 6 ]; then
   # make the part of the graph that will be included.
   # Refer to the 'compile-graph' commands in ./simple_demo.sh for how you'd do
   # this in code.
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang_ext $tree_dir $tree_dir/extvocab_nosp_part
+  utils/mkgraph.sh $lang_ext $tree_dir $tree_dir/extvocab_nosp_part
 fi
 
 if [ $stage -le 7 ]; then
diff --git a/egs/mini_librispeech/s5/local/grammar/extend_vocab_demo_silprobs.sh b/egs/mini_librispeech/s5/local/grammar/extend_vocab_demo_silprobs.sh
index 28c58dfa453..1975ac97152 100755
--- a/egs/mini_librispeech/s5/local/grammar/extend_vocab_demo_silprobs.sh
+++ b/egs/mini_librispeech/s5/local/grammar/extend_vocab_demo_silprobs.sh
@@ -71,7 +71,7 @@ fi
 
 if [ $stage -le 2 ]; then
   # make the top-level part of the graph.
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang_base $tree_dir $tree_dir/extvocab_top
+  utils/mkgraph.sh $lang_base $tree_dir $tree_dir/extvocab_top
 fi
 
 if [ $stage -le 3 ] && $run_g2p; then
@@ -267,7 +267,7 @@ if [ $stage -le 6 ]; then
   # make the part of the graph that will be included.
   # Refer to the 'compile-graph' commands in ./simple_demo.sh for how you'd do
   # this in code.
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang_ext $tree_dir $tree_dir/extvocab_part
+  utils/mkgraph.sh $lang_ext $tree_dir $tree_dir/extvocab_part
 fi
 
 if [ $stage -le 7 ]; then
diff --git a/egs/mini_librispeech/s5/local/grammar/simple_demo.sh b/egs/mini_librispeech/s5/local/grammar/simple_demo.sh
index a4edeb8091c..bd187c19bcf 100755
--- a/egs/mini_librispeech/s5/local/grammar/simple_demo.sh
+++ b/egs/mini_librispeech/s5/local/grammar/simple_demo.sh
@@ -66,7 +66,7 @@ if [ $stage -le 2 ]; then
 2  0.69314718055994
 3
 EOF
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang $tree_dir $tree_dir/grammar1
+  utils/mkgraph.sh $lang $tree_dir $tree_dir/grammar1
 
   # test that the binary 'compile-graph' does the same thing as mkgraph.sh.
   compile-graph --read-disambig-syms=$lang/phones/disambig.int $tree_dir/tree $tree_dir/1.mdl $lang/L_disambig.fst $lang/G.fst $tree_dir/grammar1/HCLG2.fst
@@ -96,7 +96,7 @@ if [ $stage -le 3 ]; then
 2  0.69314718055994
 3
 EOF
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang $tree_dir $tree_dir/grammar2a
+  utils/mkgraph.sh $lang $tree_dir $tree_dir/grammar2a
 
   # test that the binary 'compile-graph' does the same thing as mkgraph.sh.
   offset=$(grep nonterm_bos $lang/phones.txt | awk '{print $2}') # 364
@@ -123,7 +123,7 @@ if [ $stage -le 4 ]; then
 2    3    #nonterm_end <eps>
 3
 EOF
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang $tree_dir $tree_dir/grammar2b
+  utils/mkgraph.sh $lang $tree_dir $tree_dir/grammar2b
 
 
   # test that the binary 'compile-graph' does the same thing as mkgraph.sh.
diff --git a/egs/mini_librispeech/s5/local/grammar/simple_demo_silprobs.sh b/egs/mini_librispeech/s5/local/grammar/simple_demo_silprobs.sh
index 414227f2ad6..088b20eba1a 100755
--- a/egs/mini_librispeech/s5/local/grammar/simple_demo_silprobs.sh
+++ b/egs/mini_librispeech/s5/local/grammar/simple_demo_silprobs.sh
@@ -65,7 +65,7 @@ if [ $stage -le 2 ]; then
 2  0.69314718055994
 3
 EOF
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang $tree_dir $tree_dir/grammar1
+  utils/mkgraph.sh $lang $tree_dir $tree_dir/grammar1
 
   # test that the binary 'compile-graph' does the same thing as mkgraph.sh.
   compile-graph --read-disambig-syms=$lang/phones/disambig.int $tree_dir/tree $tree_dir/1.mdl $lang/L_disambig.fst $lang/G.fst $tree_dir/grammar1/HCLG2.fst
@@ -94,7 +94,7 @@ if [ $stage -le 3 ]; then
 2  0.69314718055994
 3
 EOF
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang $tree_dir $tree_dir/grammar2a
+  utils/mkgraph.sh $lang $tree_dir $tree_dir/grammar2a
 
   # test that the binary 'compile-graph' does the same thing as mkgraph.sh.
   offset=$(grep nonterm_bos $lang/phones.txt | awk '{print $2}') # 364
@@ -121,7 +121,7 @@ if [ $stage -le 4 ]; then
 2    3    #nonterm_end <eps>
 3
 EOF
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang $tree_dir $tree_dir/grammar2b
+  utils/mkgraph.sh $lang $tree_dir $tree_dir/grammar2b
 
 
   # test that the binary 'compile-graph' does the same thing as mkgraph.sh.
diff --git a/egs/mini_librispeech/s5/local/kws/create_hitlist.sh b/egs/mini_librispeech/s5/local/kws/create_hitlist.sh
index be06a3b9312..6ad516607d5 100755
--- a/egs/mini_librispeech/s5/local/kws/create_hitlist.sh
+++ b/egs/mini_librispeech/s5/local/kws/create_hitlist.sh
@@ -17,7 +17,7 @@
 
 
 cmd=run.pl
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 boost_silence=1.0
diff --git a/egs/multi_en/s5/local/chain/run_blstm_6h.sh b/egs/multi_en/s5/local/chain/run_blstm_6h.sh
index 126d29350a1..81ab737ddd9 100644
--- a/egs/multi_en/s5/local/chain/run_blstm_6h.sh
+++ b/egs/multi_en/s5/local/chain/run_blstm_6h.sh
@@ -151,7 +151,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
+  utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
 fi
 
 decode_suff=fsh_sw1_tg
diff --git a/egs/multi_en/s5/local/chain/tuning/run_tdnn_5b.sh b/egs/multi_en/s5/local/chain/tuning/run_tdnn_5b.sh
index 96f5fdac8f3..31b467f9398 100755
--- a/egs/multi_en/s5/local/chain/tuning/run_tdnn_5b.sh
+++ b/egs/multi_en/s5/local/chain/tuning/run_tdnn_5b.sh
@@ -235,7 +235,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${multi}_${gmm}_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
+  utils/mkgraph.sh data/lang_${multi}_${gmm}_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
 fi
 
 decode_suff=fsh_sw1_tg
diff --git a/egs/multi_en/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/multi_en/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
index 62266334962..13156cfc118 100755
--- a/egs/multi_en/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/multi_en/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -252,7 +252,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang_dir \
+  utils/mkgraph.sh $lang_dir \
     $dir $dir/graph${lang_suffix}
 fi
 
diff --git a/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1a.sh
index 79cd3eb3014..be12f7cca45 100755
--- a/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1a.sh
+++ b/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1a.sh
@@ -241,7 +241,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${multi}_${gmm}_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
+  utils/mkgraph.sh data/lang_${multi}_${gmm}_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
 fi
 
 decode_suff=fsh_sw1_tg
diff --git a/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1b.sh b/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1b.sh
index a7170af9431..6e45ade836d 100755
--- a/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1b.sh
+++ b/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1b.sh
@@ -237,7 +237,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${multi}_${gmm}_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
+  utils/mkgraph.sh data/lang_${multi}_${gmm}_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
 fi
 
 decode_suff=fsh_sw1_tg
diff --git a/egs/reverb/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/reverb/s5/local/chain/tuning/run_tdnn_1a.sh
index c8b4997161e..e68051c1770 100755
--- a/egs/reverb/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/reverb/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -223,7 +223,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang${lm_suffix}/ \
+    data/lang${lm_suffix}/ \
     $tree_dir $tree_dir/graph${lm_suffix} || exit 1;
 fi
 
diff --git a/egs/reverb/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/reverb/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
index 4723400c76b..0d27842f18a 100755
--- a/egs/reverb/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/reverb/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -239,7 +239,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang${lm_suffix}/ \
+    data/lang${lm_suffix}/ \
     $tree_dir $tree_dir/graph${lm_suffix} || exit 1;
 fi
 
diff --git a/egs/rimes/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/rimes/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
index 33eb9dcb98c..f0303909c8e 100755
--- a/egs/rimes/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
+++ b/egs/rimes/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
@@ -110,7 +110,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 --generate-ali-from-lats true \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 fi
@@ -239,7 +239,7 @@ if [ $stage -le 7 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/rimes/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/rimes/v1/local/chain/tuning/run_e2e_cnn_1a.sh
index 9d28a41316d..b6226ab5c2e 100755
--- a/egs/rimes/v1/local/chain/tuning/run_e2e_cnn_1a.sh
+++ b/egs/rimes/v1/local/chain/tuning/run_e2e_cnn_1a.sh
@@ -140,7 +140,7 @@ if [ $stage -le 4 ]; then
   # lang directory, one that contained a wordlist and LM of your choice,
   # as long as phones.txt was compatible.
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/rimes/v1/run_end2end.sh b/egs/rimes/v1/run_end2end.sh
index d3e3da2be13..89ed4e656a6 100755
--- a/egs/rimes/v1/run_end2end.sh
+++ b/egs/rimes/v1/run_end2end.sh
@@ -103,7 +103,7 @@ fi
 if [ $stage -le 7 ]; then
   echo "$0: Aligning the training data using the e2e chain model..."
   steps/nnet3/align.sh --nj 50 --cmd "$cmd" \
-                       --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \
+                       --scale-opts ' --acoustic-scale=1.0' \
                        data/$train_set data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train
 fi
 
diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_5g.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_5g.sh
index c393a9aa28b..0464c073b2a 100755
--- a/egs/rm/s5/local/chain/tuning/run_tdnn_5g.sh
+++ b/egs/rm/s5/local/chain/tuning/run_tdnn_5g.sh
@@ -134,7 +134,7 @@ if [ $stage -le 10 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
     --scoring-opts "--min-lmwt 1" \
     --nj 20 --cmd "$decode_cmd" \
@@ -143,7 +143,7 @@ if [ $stage -le 10 ]; then
 fi
 
 if [ $stage -le 11 ]; then
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug
+  utils/mkgraph.sh data/lang_ug $dir $dir/graph_ug
   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
     --nj 20 --cmd "$decode_cmd" \
     --online-ivector-dir exp/nnet2_online/ivectors_test \
diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_5n.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_5n.sh
index 131bcf98de9..c2869d20731 100755
--- a/egs/rm/s5/local/chain/tuning/run_tdnn_5n.sh
+++ b/egs/rm/s5/local/chain/tuning/run_tdnn_5n.sh
@@ -136,7 +136,7 @@ if [ $stage -le 10 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
     --scoring-opts "--min-lmwt 1" \
     --nj 20 --cmd "$decode_cmd" \
@@ -145,7 +145,7 @@ if [ $stage -le 10 ]; then
 fi
 
 if [ $stage -le 11 ]; then
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug
+  utils/mkgraph.sh data/lang_ug $dir $dir/graph_ug
   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
     --nj 20 --cmd "$decode_cmd" \
     --online-ivector-dir exp/nnet2_online/ivectors_test \
diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh
index db5944fdbea..60cf4733185 100755
--- a/egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh
+++ b/egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh
@@ -166,7 +166,7 @@ if [ $stage -le 10 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
     --scoring-opts "--min-lmwt 1" \
     --nj 20 --cmd "$decode_cmd" \
@@ -175,7 +175,7 @@ if [ $stage -le 10 ]; then
 fi
 
 if [ $stage -le 11 ]; then
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug
+  utils/mkgraph.sh data/lang_ug $dir $dir/graph_ug
   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
     --nj 20 --cmd "$decode_cmd" \
     --online-ivector-dir exp/nnet2_online/ivectors_test \
diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh
index 2fd2556c19b..df9c020bd30 100755
--- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh
+++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh
@@ -198,7 +198,7 @@ if [ $stage -le 9 ]; then
   if $use_ivector;then
     ivec_opt="--online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test"
   fi
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
     --scoring-opts "--min-lmwt 1" \
     --nj 20 --cmd "$decode_cmd" $ivec_opt \
diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh
index 3e8d5717d4b..f21aec5c29a 100755
--- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh
+++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh
@@ -211,7 +211,7 @@ if [ $stage -le 8 ]; then
   # the lang directory.
   ivec_opt=""
   if $use_ivector;then ivec_opt="--online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test" ; fi
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang_src_tgt $dir $dir/graph
+  utils/mkgraph.sh $lang_src_tgt $dir $dir/graph
   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
     --scoring-opts "--min-lmwt 1" \
     --nj 20 --cmd "$decode_cmd" $ivec_opt \
diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh
index 611aede371d..f6f4d9b0193 100755
--- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh
+++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh
@@ -147,7 +147,7 @@ if [ $stage -le 4 ]; then
     --generate-ali-from-lats true \
     --acoustic-scale 1.0 --extra-left-context-initial 0 --extra-right-context-final 0 \
     --frames-per-chunk 150 \
-    --scale-opts "--transition-scale=1.0 --self-loop-scale=1.0" \
+    --scale-opts "" \
     data/train_hires $lang_src_tgt $src_mdl_dir $lat_dir || exit 1;
   rm $lat_dir/fsts.*.gz # save space
 fi
@@ -219,7 +219,7 @@ if [ $stage -le 8 ]; then
   tes_ivec_opt=""
   if $use_ivector;then test_ivec_opt="--online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test" ; fi
 
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang_src_tgt $dir $dir/graph
+  utils/mkgraph.sh $lang_src_tgt $dir $dir/graph
   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
     --scoring-opts "--min-lmwt 1" \
     --nj 20 --cmd "$decode_cmd" $test_ivec_opt \
diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1a.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1a.sh
index 47557f93696..8ebf33af18f 100755
--- a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1a.sh
+++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1a.sh
@@ -232,7 +232,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1b.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1b.sh
index 7afa1b7f902..6414898fca7 100755
--- a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1b.sh
+++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1b.sh
@@ -233,7 +233,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1c.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1c.sh
index e69e499e152..8ce19734a22 100755
--- a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1c.sh
+++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1c.sh
@@ -231,7 +231,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1d.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1d.sh
index 86e0352828c..cc51f149446 100755
--- a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1d.sh
+++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1d.sh
@@ -244,7 +244,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1e.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1e.sh
index 313f899a471..b6b8ffb9885 100755
--- a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1e.sh
+++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1e.sh
@@ -232,7 +232,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_tg $dir $dir/graph
+  utils/mkgraph.sh data/lang_test_tg $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1a.sh
index 4991326a86d..cf28beb7691 100755
--- a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -177,7 +177,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1b.sh
index 600f27ddf86..f9ffab37b73 100755
--- a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1b.sh
+++ b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1b.sh
@@ -215,7 +215,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_tg $dir $dir/graph
+  utils/mkgraph.sh data/lang_test_tg $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
index cedc448464a..6fc8735783f 100755
--- a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -231,7 +231,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_tg $dir $dir/graph
+  utils/mkgraph.sh data/lang_test_tg $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7f.sh b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7f.sh
index d317b1dc55a..25a66075419 100755
--- a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7f.sh
+++ b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7f.sh
@@ -221,7 +221,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7k.sh b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7k.sh
index 20dcab8eb50..2c9437e5343 100755
--- a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7k.sh
+++ b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7k.sh
@@ -234,7 +234,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 graph_dir=$dir/graph_sw1_tg
diff --git a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_aug_1a.sh b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_aug_1a.sh
index 8762430ee7f..353a77d8668 100755
--- a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_aug_1a.sh
+++ b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_aug_1a.sh
@@ -214,7 +214,7 @@ if [ $stage -le 16 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh
index a1be44cdbbf..0c2c8ee1f54 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh
@@ -172,7 +172,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h_discriminative.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h_discriminative.sh
index d7382d78dc6..ed051c1729d 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h_discriminative.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h_discriminative.sh
@@ -132,7 +132,7 @@ if [ $stage -le 1 ]; then
          # have some stragglers.
   steps/nnet3/align.sh  --cmd "$decode_cmd" --use-gpu false \
     --online-ivector-dir $online_ivector_dir $context_opts \
-    --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \
+    --scale-opts "--acoustic-scale=1.0 " \
     --nj $nj $train_data_dir $lang $srcdir ${srcdir}_ali${affix} ;
 fi
 
@@ -146,7 +146,7 @@ if [ -z "$lats_dir" ]; then
     subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving
     # total slots = 80 * 6 = 480.
     steps/nnet3/make_denlats.sh --cmd "$decode_cmd" \
-      --self-loop-scale 1.0 --acwt 1.0 --determinize true \
+      --acwt 1.0 --determinize true \
       --online-ivector-dir $online_ivector_dir $context_opts \
       --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
       $train_data_dir $lang $srcdir ${lats_dir} ;
diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh
index 1eac1c60c27..b633d64375b 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh
@@ -178,7 +178,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh
index acdae844b65..426e8647cd8 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh
@@ -203,7 +203,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6k.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6k.sh
index bbd8cb63697..7e36d47f1f9 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6k.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6k.sh
@@ -203,7 +203,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh
index 16f2ea211d0..5377ca916a1 100644
--- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh
@@ -213,7 +213,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6m.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6m.sh
index 09f7d72434c..692b2240aa0 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6m.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6m.sh
@@ -211,7 +211,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6n.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6n.sh
index 8e44d0bc114..86e2a7786ce 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6n.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6n.sh
@@ -217,7 +217,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6o.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6o.sh
index 6a836e81b09..f0a746ca362 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6o.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6o.sh
@@ -219,7 +219,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_cnn_tdnn_1a.sh
index d1a61360f85..f4dee51dd03 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_cnn_tdnn_1a.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_cnn_tdnn_1a.sh
@@ -207,7 +207,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh
index ac22e858aea..459e9b4b00d 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh
@@ -177,7 +177,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh
index aa48db04841..12646f07897 100644
--- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh
@@ -177,7 +177,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh
index 48db81f586f..3b8a4b6f104 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh
@@ -197,7 +197,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh
index 021eab09506..65ef23f8ce3 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh
@@ -204,7 +204,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6l.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6l.sh
index f219167f9ec..b8e08c61ddf 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6l.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6l.sh
@@ -215,7 +215,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh
index 551be099390..532eb3ede7e 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh
@@ -182,7 +182,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2a.sh
index c584bbe29a6..318d026b8d6 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2a.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2a.sh
@@ -220,8 +220,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2b.sh
index 227a74067d4..e7b574f0ea3 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2b.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2b.sh
@@ -211,8 +211,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2c.sh
index 9fc08f27d45..556325b5bdc 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2c.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2c.sh
@@ -202,7 +202,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2d.sh
index 2ef8c374514..4c9b497f2fa 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2d.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2d.sh
@@ -207,7 +207,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2e.sh
index 2db9a59c2e2..4c31608c8f9 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2e.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2e.sh
@@ -230,7 +230,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2f.sh
index f510fccd882..d940e81efab 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2f.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2f.sh
@@ -212,7 +212,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2g.sh
index 65b48b43685..8f43562e4e3 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2g.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2g.sh
@@ -215,7 +215,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2h.sh
index d86233ff83b..5154834004b 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2h.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2h.sh
@@ -217,7 +217,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2i.sh
index cab9dd957a3..53e8e2a9ed6 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2i.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2i.sh
@@ -215,7 +215,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2j.sh
index 0eca2ff10ff..d5b5a80bef4 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2j.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2j.sh
@@ -216,7 +216,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2k.sh
index 7e127c10917..1eb99309edd 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2k.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2k.sh
@@ -225,7 +225,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2l.sh
index fbe45761996..53ad7c17e14 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2l.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2l.sh
@@ -235,7 +235,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2m.sh
index 93db16408cc..345446f3cc8 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2m.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2m.sh
@@ -236,7 +236,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2n.sh
index 57eb66dac35..1ee2a92f494 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2n.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2n.sh
@@ -276,7 +276,7 @@ if [ $stage -le 16 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2o.sh
index ae085c9804f..e5111d13bf2 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2o.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2o.sh
@@ -235,7 +235,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2p.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2p.sh
index 4c6ad3b9761..b7ae60e2449 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2p.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2p.sh
@@ -250,7 +250,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2q.sh
index 54b03fb2296..872a2d5bd04 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2q.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2q.sh
@@ -244,7 +244,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2r.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2r.sh
index 9282b733946..657c8925e60 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2r.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2r.sh
@@ -248,7 +248,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2s.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2s.sh
index 3e829e246f3..f326b2e286a 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2s.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2s.sh
@@ -236,7 +236,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2t.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2t.sh
index 4a322e1a8fa..8bd8111537d 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2t.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2t.sh
@@ -240,7 +240,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2u.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2u.sh
index 9ec5bf81d3d..1d506714faa 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2u.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2u.sh
@@ -252,7 +252,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2v.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2v.sh
index cd009cfcc12..7c3b7cccead 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2v.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2v.sh
@@ -257,7 +257,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2w.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2w.sh
index 687093c98c5..674be6e15e1 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2w.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2w.sh
@@ -252,7 +252,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2x.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2x.sh
index e2d6204af0c..df5513245f6 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2x.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2x.sh
@@ -258,7 +258,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2y.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2y.sh
index c1211feae64..1169193dfce 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2y.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2y.sh
@@ -243,7 +243,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3c.sh
index 01ff8079f2a..b5c73f9d700 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3c.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3c.sh
@@ -250,7 +250,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3d.sh
index 0cb513c84f1..0759e406e56 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3d.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3d.sh
@@ -262,7 +262,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3e.sh
index 687f684a68c..422505200ea 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3e.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3e.sh
@@ -251,7 +251,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3f.sh
index 0a4b935485a..32ae7f8ed07 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3f.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3f.sh
@@ -258,7 +258,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3g.sh
index 077a84d31e9..b1ebcfb706a 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3g.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3g.sh
@@ -278,7 +278,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3h.sh
index dcda3a00383..2c7cad476d2 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3h.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3h.sh
@@ -264,7 +264,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3i.sh
index 996795c9aee..3ca650c496f 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3i.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3i.sh
@@ -285,7 +285,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3j.sh
index 66e44fb6f04..752f6635e6c 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3j.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3j.sh
@@ -271,7 +271,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3k.sh
index 5369b5251d1..fe474c82053 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3k.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3k.sh
@@ -285,7 +285,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3k2.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3k2.sh
index 1902213402f..00fe73eacb8 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3k2.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3k2.sh
@@ -331,7 +331,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3l.sh
index ae36ab2b65f..d15e25a50ec 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3l.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3l.sh
@@ -281,7 +281,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3m.sh
index 49656fb8aa7..5035a31837e 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3m.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3m.sh
@@ -285,7 +285,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3n.sh
index e2b0b0ebb10..06c6f360ab8 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3n.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3n.sh
@@ -280,7 +280,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3o.sh
index 298eb913ff3..23ae76103e5 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3o.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3o.sh
@@ -284,7 +284,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3p.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3p.sh
index 6ec9c6fe4b8..b907ad73b1a 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3p.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3p.sh
@@ -308,7 +308,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3q.sh
index 4c911ba867e..8c81726d34c 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3q.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3q.sh
@@ -290,7 +290,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3r.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3r.sh
index fba4ef6d15f..6a3a5a5d871 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3r.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3r.sh
@@ -296,7 +296,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3s.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3s.sh
index daab4cad318..790eb939b79 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3s.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3s.sh
@@ -315,7 +315,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3t.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3t.sh
index 034f2bafd70..ba3e714a158 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3t.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3t.sh
@@ -311,7 +311,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3u.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3u.sh
index 97c44ad55fc..a767afefd5e 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3u.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3u.sh
@@ -305,7 +305,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3v.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3v.sh
index 381a9e8686f..57957ab4585 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3v.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3v.sh
@@ -303,7 +303,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3w.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3w.sh
index 9f13b10753d..d35dfd8a18a 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3w.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3w.sh
@@ -307,7 +307,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3x.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3x.sh
index 25db1450265..71399ca4fb7 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3x.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3x.sh
@@ -316,7 +316,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3y.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3y.sh
index 3376652f3c2..b328a38f564 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3y.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3y.sh
@@ -321,7 +321,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3z.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3z.sh
index 25a68263dc7..50148dc8378 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3z.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3z.sh
@@ -325,7 +325,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4a.sh
index 0be490863dc..908ccb0c6e4 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4a.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4a.sh
@@ -324,7 +324,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4b.sh
index 40ede7c5982..6955905caa7 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4b.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4b.sh
@@ -321,7 +321,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4c.sh
index be9043c0527..badda8a057b 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4c.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4c.sh
@@ -332,7 +332,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4d.sh
index 7f58fbebbfc..9ac4f3aeac3 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4d.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4d.sh
@@ -321,7 +321,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4e.sh
index 8625cfa52c8..84d890912bd 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4e.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4e.sh
@@ -337,7 +337,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4f.sh
index 7ba4e8c6cb7..43242092b69 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4f.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4f.sh
@@ -341,7 +341,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4g.sh
index f1059f0091f..dbce2b1cb1c 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4g.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4g.sh
@@ -340,7 +340,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4n.sh
index 62154dd5d71..4b57c4d072d 100644
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4n.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4n.sh
@@ -361,7 +361,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4p.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4p.sh
index 0120c2c507d..224708375fc 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4p.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4p.sh
@@ -356,7 +356,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4q.sh
index 7d920092c30..57d13be22ce 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4q.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4q.sh
@@ -148,7 +148,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4r.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4r.sh
index 591b79352ab..9722c7c6704 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4r.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4r.sh
@@ -355,7 +355,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4s.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4s.sh
index fea6a776dbf..3c020375985 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4s.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4s.sh
@@ -355,7 +355,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4t.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4t.sh
index 0173b586700..d8ebc5b150f 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4t.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4t.sh
@@ -357,7 +357,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4u.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4u.sh
index ac15f232500..86811c0c886 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4u.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4u.sh
@@ -359,7 +359,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4v.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4v.sh
index 0682615acf3..9fb9d849e75 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4v.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4v.sh
@@ -369,7 +369,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4w.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4w.sh
index 77d5013d91f..5baa90022ed 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4w.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4w.sh
@@ -372,7 +372,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4x.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4x.sh
index 9c59137bbfc..c6520ee0f1b 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4x.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4x.sh
@@ -371,7 +371,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5a.sh
index 1d44637a8c8..e5a605af399 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5a.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5a.sh
@@ -376,7 +376,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5b.sh
index cdb769fb959..79ee6ce0aa4 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5b.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5b.sh
@@ -379,7 +379,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5c.sh
index 17d8c41a82e..7f916c32993 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5c.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5c.sh
@@ -384,7 +384,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5d.sh
index f3b92944f1a..c898f769f5f 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5d.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5d.sh
@@ -382,7 +382,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5e.sh
index 5a64c967907..4db6f59a868 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5e.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5e.sh
@@ -392,7 +392,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5f.sh
index c40f2ada0d3..4e8b1632a56 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5f.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5f.sh
@@ -398,7 +398,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5g.sh
index 5f59e146f65..18a9669daa7 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5g.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5g.sh
@@ -452,7 +452,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5h.sh
index f8dc8886eb5..b4976c1c83a 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5h.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5h.sh
@@ -409,7 +409,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5i.sh
index 7b7f67125c3..af92e3d1a94 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5i.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5i.sh
@@ -407,7 +407,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5j.sh
index bf1787c4373..f88e5885bb3 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5j.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5j.sh
@@ -403,7 +403,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5k.sh
index 93f9bffdd12..fec6d8d2a36 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5k.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5k.sh
@@ -430,7 +430,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5l.sh
index f0c66c3a7cd..5b7580e8022 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5l.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5l.sh
@@ -440,7 +440,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5m.sh
index dc0f19e9261..aac8f038360 100644
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5m.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5m.sh
@@ -405,7 +405,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5n.sh
index 51a3f6e7723..efe8193cc1b 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5n.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5n.sh
@@ -435,7 +435,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5o.sh
index 4e2e6033d29..a8497ce24c9 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5o.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5o.sh
@@ -443,7 +443,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5p.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5p.sh
index 36056efce7a..47acc829fe8 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5p.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5p.sh
@@ -396,7 +396,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5q.sh
index 01a9e867b57..691f575ab34 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5q.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5q.sh
@@ -400,7 +400,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5r.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5r.sh
index a20ca2da3de..f48f8500710 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5r.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5r.sh
@@ -402,7 +402,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5s.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5s.sh
index df981a478c0..00f00103be7 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5s.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5s.sh
@@ -416,7 +416,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5t.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5t.sh
index ddd08de7707..8a69ec95d9b 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5t.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5t.sh
@@ -420,7 +420,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5u.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5u.sh
index 28333fd912e..998300ca1a2 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5u.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5u.sh
@@ -481,7 +481,7 @@ if [ $stage -le 16 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5v.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5v.sh
index 2cdb0bb988c..507d2257adc 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5v.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5v.sh
@@ -434,7 +434,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5w.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5w.sh
index 5a33622645a..3d263c066d6 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5w.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5w.sh
@@ -445,7 +445,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5x.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5x.sh
index 0b76fe60a7b..608f14634c1 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5x.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5x.sh
@@ -452,7 +452,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5y.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5y.sh
index 3fd623e163f..eb5581f6144 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5y.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5y.sh
@@ -451,7 +451,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5z.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5z.sh
index ff3528d9660..0b0b8ada738 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5z.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5z.sh
@@ -443,7 +443,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6a.sh
index 194245be1e3..c8b02f54172 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6a.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6a.sh
@@ -465,7 +465,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6b.sh
index d4194a5afe4..8449fb3b76a 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6b.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6b.sh
@@ -455,7 +455,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6c.sh
index 89021098c49..927a33616ea 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6c.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6c.sh
@@ -443,7 +443,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6d.sh
index 354640e0258..8909f643de0 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6d.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6d.sh
@@ -445,7 +445,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6e.sh
index 80fea19e7a2..a8d82682d24 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6e.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6e.sh
@@ -439,7 +439,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6f.sh
index f92048cfeb4..a661b5b59fb 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6f.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6f.sh
@@ -445,7 +445,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6g.sh
index fbc5e0c54b5..c14f2fca4d7 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6g.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6g.sh
@@ -466,7 +466,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h.sh
index 5449671d131..5a1a7d40474 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h.sh
@@ -469,7 +469,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_discriminative.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_discriminative.sh
index 6db0a4f5ac4..4e47c28b9f7 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_discriminative.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_discriminative.sh
@@ -159,7 +159,7 @@ if [ $stage -le 1 ]; then
          # have some stragglers.
   steps/nnet3/align.sh  --cmd "$decode_cmd" --use-gpu false \
     --online-ivector-dir $online_ivector_dir \
-    --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \
+    --scale-opts "--acoustic-scale=1.0 " \
     --nj $nj $train_data_dir $lang $srcdir ${srcdir}_ali${affix} ;
 fi
 
@@ -173,7 +173,7 @@ if [ -z "$lats_dir" ]; then
     subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving
     # total slots = 80 * 6 = 480.
     steps/nnet3/make_denlats.sh --cmd "$decode_cmd" \
-      --self-loop-scale 1.0 --acwt 1.0 --determinize true \
+      --acwt 1.0 --determinize true \
       --online-ivector-dir $online_ivector_dir \
       --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
       $train_data_dir $lang $srcdir ${lats_dir} ;
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_py.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_py.sh
index 32631f4d348..f763c5f95b3 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_py.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_py.sh
@@ -152,7 +152,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6i.sh
index 093bceb2717..a00b762695f 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6i.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6i.sh
@@ -473,7 +473,7 @@ if [ $stage -le 15 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6j.sh
index cf98106ea04..416ae831c21 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6j.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6j.sh
@@ -458,7 +458,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6k.sh
index 5d518aeab2a..99b00b50e1a 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6k.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6k.sh
@@ -485,7 +485,7 @@ if [ $stage -le 15 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6l.sh
index c76f5a9efd3..6e30c9a62e1 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6l.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6l.sh
@@ -497,7 +497,7 @@ if [ $stage -le 15 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6m.sh
index 39d6d3cb449..d352a63c944 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6m.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6m.sh
@@ -473,7 +473,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6n.sh
index 0911711e73c..04adf6805d5 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6n.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6n.sh
@@ -475,7 +475,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6o.sh
index c07cb35ed33..959067206fe 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6o.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6o.sh
@@ -484,7 +484,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6p.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6p.sh
index 5710dbe2ef9..38065eba75d 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6p.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6p.sh
@@ -479,7 +479,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6q.sh
index 3e93d79b799..b2ab4a581d5 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6q.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6q.sh
@@ -469,7 +469,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6r.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6r.sh
index 0415f4e0fb9..dda85a77550 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6r.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6r.sh
@@ -468,7 +468,7 @@ if [ $stage -le 15 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6s.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6s.sh
index 0564c0a858f..4de17d1c875 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6s.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6s.sh
@@ -478,7 +478,7 @@ if [ $stage -le 15 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6t.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6t.sh
index 98ecd477a1d..787ff971b9d 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6t.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6t.sh
@@ -488,7 +488,7 @@ if [ $stage -le 15 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6u.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6u.sh
index 9e8afc3c5b8..67da19429a2 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6u.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6u.sh
@@ -499,7 +499,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6v.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6v.sh
index 732b60d7c95..9c0de1c8597 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6v.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6v.sh
@@ -191,7 +191,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6w.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6w.sh
index a625859f7d4..e0db69bf5a5 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6w.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6w.sh
@@ -199,7 +199,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6x.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6x.sh
index 2e79e24ddb6..90a6793254c 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6x.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6x.sh
@@ -194,7 +194,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6y.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6y.sh
index 5cf1cead63f..fd5a9342c40 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6y.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6y.sh
@@ -192,7 +192,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6z.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6z.sh
index baa42a087b7..a03cf67b4f7 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6z.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6z.sh
@@ -196,7 +196,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7a.sh
index 5dd430ded8d..1ce8dcef65f 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7a.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7a.sh
@@ -227,7 +227,7 @@ if [ $stage -le 16 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7b.sh
index 47dbe843d8e..c3bc49b783d 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7b.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7b.sh
@@ -211,7 +211,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7c.sh
index 3335ef788a4..8006dc6051e 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7c.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7c.sh
@@ -211,7 +211,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7d.sh
index dba1b99582a..a83027b2a05 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7d.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7d.sh
@@ -181,7 +181,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7e.sh
index 704411b6a76..a3dcb68c92d 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7e.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7e.sh
@@ -182,7 +182,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh
index a7a5a11dc7a..c07b5f3b936 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh
@@ -183,7 +183,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh
index 0623d26a9e4..9a0cfe0b301 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh
@@ -200,7 +200,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh
index dbbe3c1e6fd..13323fa559f 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh
@@ -200,7 +200,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh
index 2a8a658bf6b..483b08c6938 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh
@@ -193,7 +193,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh
index a9eba36ddaa..a9e91f78dd1 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh
@@ -193,7 +193,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7k.sh
index 8e0b290cf87..9843df8945b 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7k.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7k.sh
@@ -194,7 +194,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh
index bb9ddf209d6..09dd37fe61d 100644
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh
@@ -188,7 +188,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh
index 97f92c14f1f..ef99c6d42f7 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh
@@ -204,7 +204,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m25l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m25l.sh
index d9fe106e5d7..fdbbdae43b6 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m25l.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m25l.sh
@@ -541,7 +541,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh
index 99e43443f99..ee3b0a70974 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh
@@ -207,7 +207,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh
index 44ca3b3d279..26448a96ffc 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh
@@ -231,7 +231,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7p.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7p.sh
index d19a4ef4c0b..d68475c6fad 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7p.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7p.sh
@@ -219,7 +219,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh
index cea0891d5d7..6fbdd992d02 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh
@@ -207,7 +207,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_a.sh
index 96046ac23c1..a18133a5609 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_a.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_a.sh
@@ -122,8 +122,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_a2.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_a2.sh
index 3a8e41a8315..94921a094bd 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_a2.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_a2.sh
@@ -121,8 +121,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_attention_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_attention_1a.sh
index d4febd61e94..df1b86fee25 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_attention_1a.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_attention_1a.sh
@@ -202,7 +202,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_b.sh
index 8c623a7c01b..e02d283a171 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_b.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_b.sh
@@ -124,8 +124,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1a.sh
index 4414147bf0e..460a9081d28 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1a.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1a.sh
@@ -210,7 +210,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh
index cd9d4dc6f2b..e60697f1300 100644
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh
@@ -213,7 +213,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1c.sh
index 18b660b4080..53485466f13 100644
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1c.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1c.sh
@@ -211,7 +211,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1d.sh
index be615e0e361..f9474506d86 100644
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1d.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1d.sh
@@ -205,7 +205,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_c.sh
index ec4634acf69..cbc6254575c 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_c.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_c.sh
@@ -132,8 +132,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_d.sh
index 3a66a8cd556..94241dfc833 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_d.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_d.sh
@@ -136,8 +136,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_e.sh
index d30a513181e..f19df0c7190 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_e.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_e.sh
@@ -142,8 +142,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_f.sh
index 12450c2ae62..047cf298021 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_f.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_f.sh
@@ -147,8 +147,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_g.sh
index 70845684262..340d8fd7b61 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_g.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_g.sh
@@ -149,8 +149,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_h.sh
index 01f8743f585..91999d8459e 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_h.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_h.sh
@@ -163,8 +163,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_i.sh
index 82d91bbd33e..0b17e772a95 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_i.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_i.sh
@@ -157,8 +157,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_j.sh
index 334eec7e872..8fa842cb5da 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_j.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_j.sh
@@ -164,8 +164,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_k.sh
index b64318ec4bb..0ee01ac3de4 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_k.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_k.sh
@@ -159,8 +159,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_l.sh
index 6de6c79affc..9fbfcf68b01 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_l.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_l.sh
@@ -163,8 +163,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh
index 43855e6f7ce..15e194d9081 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -205,7 +205,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh
index 5c82ed0eb11..afcb48ff04a 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh
@@ -201,7 +201,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh
index c3df0bf2b2c..c6e8625ff0b 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh
@@ -208,7 +208,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh
index 3d353387239..ab301178c87 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh
@@ -231,7 +231,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh
index 2a2d508ecdd..3d221ad03f2 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh
@@ -226,7 +226,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh
index 5af5463b372..9affa2aeaf2 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh
@@ -243,7 +243,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh
index 28105a587ec..1921c9735e5 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh
@@ -225,7 +225,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh
index d6e81f2d8eb..579ccb45140 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh
@@ -222,7 +222,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh
index 060d98c9d05..4e8dddab481 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh
@@ -243,7 +243,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1j.sh
index 9bd39a262c5..5f7818689d4 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1j.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1j.sh
@@ -210,7 +210,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh
index ccd6138da6e..4383dacb5ff 100644
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh
@@ -220,7 +220,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh
index f702033377a..3b070d0a35b 100644
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh
@@ -208,7 +208,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh
index b43577bd76c..375b00a70bd 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh
@@ -220,7 +220,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh
index 5bb6e7da152..eb7cf854982 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh
@@ -217,7 +217,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_m.sh
index 8d357db0217..5290de020eb 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_m.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_m.sh
@@ -164,8 +164,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_n.sh
index a190a1d56dd..cdc7b8749fa 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_n.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_n.sh
@@ -174,8 +174,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_o.sh
index 5b80665268d..1e4ffbaf665 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_o.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_o.sh
@@ -178,8 +178,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh
index 4db38d74508..7dee86b1741 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh
@@ -225,7 +225,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh
index 7e9dec67068..0a83ab53e6c 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh
@@ -223,7 +223,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_p.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_p.sh
index d401790449d..cc2dfe960bb 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_p.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_p.sh
@@ -171,8 +171,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_q.sh
index c6758a62fa5..1204cdb0eed 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_q.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_q.sh
@@ -181,8 +181,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_r.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_r.sh
index 73cadcc622c..95bef5b5317 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_r.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_r.sh
@@ -181,8 +181,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_s.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_s.sh
index ae10b53824f..f1ffc479eb2 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_s.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_s.sh
@@ -183,8 +183,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_t.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_t.sh
index dabb2a6db87..7b7698a730f 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_t.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_t.sh
@@ -186,8 +186,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_u.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_u.sh
index c83274499fa..d2f69394557 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_u.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_u.sh
@@ -191,8 +191,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_v.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_v.sh
index 38f31269d33..fe195ab33ee 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_v.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_v.sh
@@ -197,8 +197,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_w.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_w.sh
index 35d1ddd8052..cea2fd8d4c4 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_w.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_w.sh
@@ -190,8 +190,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_x.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_x.sh
index 0f294033489..6cb8a63b478 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_x.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_x.sh
@@ -191,8 +191,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_y.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_y.sh
index 09217d1b196..4ce697ad42f 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_y.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_y.sh
@@ -201,8 +201,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_z.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_z.sh
index 0c8524a2c90..04ed8cfb022 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_z.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_z.sh
@@ -191,8 +191,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --transition-scale 0.0 \
-      --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh \
+      data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh
index 02e637286b5..ed1927c648e 100755
--- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh
+++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh
@@ -138,7 +138,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 0.333 data/lang_sw1_tg $dir $graph_dir
+  utils/mkgraph.sh data/lang_sw1_tg $dir $graph_dir
 fi
 
 if [ $stage -le 14 ]; then
diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh
index 67fd3c03d27..59b63a6fcee 100755
--- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh
+++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh
@@ -140,7 +140,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 0.333 data/lang_sw1_tg $dir $graph_dir
+  utils/mkgraph.sh data/lang_sw1_tg $dir $graph_dir
 fi
 
 if [ $stage -le 14 ]; then
diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh
index 260116666a0..e6cc4e8bed1 100755
--- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh
+++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh
@@ -137,7 +137,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 0.333 data/lang_sw1_tg $dir $graph_dir
+  utils/mkgraph.sh data/lang_sw1_tg $dir $graph_dir
 fi
 
 if [ $stage -le 14 ]; then
diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c_disc.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c_disc.sh
index e1d0f06affe..2c8c6a57669 100755
--- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c_disc.sh
+++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c_disc.sh
@@ -130,7 +130,7 @@ if [ $stage -le 1 ]; then
   # hardcode no-GPU for alignment, although you could use GPU [you wouldn't
   # get excellent GPU utilization though.]
   steps/nnet3/align.sh  --cmd "$decode_cmd" --use-gpu false \
-    --scale-opts '--transition-scale=1.0 --acoustic-scale=0.333 --self-loop-scale=0.333' \
+    --scale-opts '--acoustic-scale=0.333' \
     --frames-per-chunk $frames_per_chunk_decoding \
     --extra-left-context $extra_left_context --extra-right-context $extra_right_context \
     --extra-left-context-initial 0 --extra-right-context-final 0 \
@@ -150,7 +150,7 @@ if [ -z "$degs_dir" ]; then
 
     steps/nnet3/get_degs.sh \
       --cmd "$decode_cmd --mem 10G" --num-threads 3 \
-      --self-loop-scale 0.333 --acwt 0.333 \
+      --acwt 0.333 \
       --max-copy-jobs $max_copy_jobs \
       --extra-left-context $extra_left_context \
       --extra-right-context $extra_right_context \
diff --git a/egs/tedlium/s5/local/chain/run_tdnn.sh b/egs/tedlium/s5/local/chain/run_tdnn.sh
index 545294dd035..96fee897a56 100755
--- a/egs/tedlium/s5/local/chain/run_tdnn.sh
+++ b/egs/tedlium/s5/local/chain/run_tdnn.sh
@@ -173,7 +173,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh data/lang_test $dir $dir/graph
 fi
 
 graph_dir=$dir/graph
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_blstm_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_blstm_1a.sh
index 2ac8c09dad1..a53e2016f8b 100644
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_blstm_1a.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_blstm_1a.sh
@@ -222,7 +222,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh
index 47557f93696..8ebf33af18f 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh
@@ -232,7 +232,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh
index 7afa1b7f902..6414898fca7 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh
@@ -233,7 +233,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh
index e69e499e152..8ce19734a22 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh
@@ -231,7 +231,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh
index 86e0352828c..cc51f149446 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh
@@ -244,7 +244,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh
index 0fdb2b3b63e..b39f94865f3 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh
@@ -234,7 +234,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh
index 70e72ee1914..36921a1ea9f 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh
@@ -177,7 +177,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh
index 492d3efb804..8839ecf14a7 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh
@@ -223,7 +223,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh
index 01768c3875f..6eaf886ef5b 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh
@@ -240,7 +240,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh
index bb5007f4c9f..1ee826d0e5c 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh
@@ -231,7 +231,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1e.sh
index 1476ed1fd40..f3b0e654813 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1e.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1e.sh
@@ -223,7 +223,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1f.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1f.sh
index 47f939fea1c..368b10e4ca7 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1f.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1f.sh
@@ -222,7 +222,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1g.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1g.sh
index f02025674e8..7c41c7e2a5d 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1g.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1g.sh
@@ -226,7 +226,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh
index b03da27e760..a13e767e767 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -242,7 +242,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh
index e896a7867b3..936129704bb 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh
@@ -257,7 +257,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh
index 00f72fab796..15745ba14a0 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh
@@ -249,7 +249,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh
index 80a9ed1c4d0..d827d2d789b 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh
@@ -253,7 +253,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh
index 031978f878a..563e8071df1 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh
@@ -301,7 +301,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e_disc.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e_disc.sh
index 0d64c75aea8..dd26c9f0bf3 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e_disc.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e_disc.sh
@@ -135,7 +135,7 @@ if [ $stage -le 1 ]; then
   # hardcode no-GPU for alignment, although you could use GPU [you wouldn't
   # get excellent GPU utilization though.]
   steps/nnet3/align.sh  --cmd "$decode_cmd" --use-gpu false \
-    --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \
+    --scale-opts "--acoustic-scale=1.0 " \
     --frames-per-chunk $frames_per_chunk_egs \
     --extra-left-context $extra_left_context --extra-right-context $extra_right_context \
     --extra-left-context-initial 0 --extra-right-context-final 0 \
@@ -156,7 +156,7 @@ if [ -z "$degs_dir" ]; then
     steps/nnet3/get_degs.sh \
       --cmd "$decode_cmd --mem 10G" --num-threads 3 \
       --max-copy-jobs $max_copy_jobs \
-      --self-loop-scale 1.0 --acwt 1.0 \
+      --acwt 1.0 \
       --extra-left-context $extra_left_context \
       --extra-right-context $extra_right_context \
       --extra-left-context-initial 0 --extra-right-context-final 0 \
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh
index c60b8f7fefc..d3d6ca36816 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh
@@ -255,7 +255,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh
index 2d2048a6869..638cbec690a 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh
@@ -258,7 +258,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh
index a074e128270..dcf463156a6 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh
@@ -256,7 +256,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh
index 3bfe175806f..0a4c678c606 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh
@@ -277,7 +277,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh
index acbef783823..c1d9c4dad71 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh
@@ -274,7 +274,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh
index 173be863608..212967ca356 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh
@@ -279,7 +279,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh
index 94955d0472c..dbb2a8acbe1 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh
@@ -270,7 +270,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh
index efd3bc98725..189f8a2cb79 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh
@@ -270,7 +270,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh
index c0559e8d389..27dd0331dd2 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh
@@ -280,7 +280,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh
index 5a6dbaef8af..89de6799c27 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh
@@ -284,7 +284,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh
index dd38d56759f..5209710dbc4 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh
@@ -279,7 +279,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh
index 1378d2d176d..bb083315157 100644
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh
@@ -240,7 +240,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh
index 3c4882ec2c6..82eca8f19af 100644
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh
@@ -240,7 +240,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh
index 23ea14ae151..df96469ad28 100644
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh
@@ -234,7 +234,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1v.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1v.sh
index 7c44d963504..a10913133ad 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1v.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1v.sh
@@ -239,7 +239,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_1a.sh
index 042ef346578..6c95f819dd2 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_1a.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_1a.sh
@@ -248,7 +248,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1a.sh
index 905e1845183..fc3e45dc7b0 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1a.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1a.sh
@@ -254,7 +254,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1b.sh
index 7bd96e7d82c..273068b6b28 100755
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1b.sh
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1b.sh
@@ -241,7 +241,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh
index f0220b17376..06b4b2451a4 100755
--- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh
+++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh
@@ -172,7 +172,7 @@ if [ $stage -le 16 ]; then
   # as long as phones.txt was compatible.
 
   utils/lang/check_phones_compatible.sh data/lang/phones.txt $lang/phones.txt
-  utils/mkgraph.sh --self-loop-scale 0.333 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 
diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh
index 3e8509bf4ac..d867c99a185 100755
--- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh
+++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh
@@ -222,7 +222,7 @@ if [ $stage -le 16 ]; then
   # as long as phones.txt was compatible.
 
   utils/lang/check_phones_compatible.sh data/lang/phones.txt $lang/phones.txt
-  utils/mkgraph.sh --self-loop-scale 0.333 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 17 ]; then
diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh
index 1204ff6ce4c..0289ff9dd16 100755
--- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh
@@ -224,7 +224,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh
index 744c964db2f..a5250f4ff9d 100755
--- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh
+++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh
@@ -231,7 +231,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1c.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1c.sh
index faac365af54..1c6c3709bdf 100755
--- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1c.sh
+++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1c.sh
@@ -224,7 +224,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+  utils/mkgraph.sh data/lang $dir $dir/graph
 fi
 
 if [ $stage -le 20 ]; then
diff --git a/egs/tunisian_msa/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/tunisian_msa/s5/local/chain/tuning/run_tdnn_1a.sh
index ab68ba6fb68..166b6d842a0 100755
--- a/egs/tunisian_msa/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/tunisian_msa/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -223,7 +223,6 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 \
     data/lang_test \
     $tree_dir \
     $tree_dir/graph || exit 1;
diff --git a/egs/uw3/v1/local/chain/run_cnn_1a.sh b/egs/uw3/v1/local/chain/run_cnn_1a.sh
index e3548609da7..401d79e7217 100755
--- a/egs/uw3/v1/local/chain/run_cnn_1a.sh
+++ b/egs/uw3/v1/local/chain/run_cnn_1a.sh
@@ -216,7 +216,7 @@ if [ $stage -le 6 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_test \
+    $lang_test \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/vystadial_cz/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/vystadial_cz/s5b/local/chain/tuning/run_tdnn_1a.sh
index 844ccf80677..d68ac82ce6c 100755
--- a/egs/vystadial_cz/s5b/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/vystadial_cz/s5b/local/chain/tuning/run_tdnn_1a.sh
@@ -234,7 +234,7 @@ if [ $stage -le 15 ]; then
   # Note: it's not important to give mkgraph.sh the lang directory with the
   # matched topology (since it gets the topology file from the model).
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_sp_test \
+    data/lang_sp_test \
     $tree_dir $tree_dir/graph || exit 1;
 fi
 
diff --git a/egs/wsj/s5/local/chain/e2e/run_tdnn_flatstart.sh b/egs/wsj/s5/local/chain/e2e/run_tdnn_flatstart.sh
index 1ddb3c305ac..0d07afa4519 100755
--- a/egs/wsj/s5/local/chain/e2e/run_tdnn_flatstart.sh
+++ b/egs/wsj/s5/local/chain/e2e/run_tdnn_flatstart.sh
@@ -175,13 +175,13 @@ if [ $stage -le 4 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_nosp_test_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_nosp_test_tgpr \
+    data/lang_nosp_test_tgpr \
     $dir $treedir/graph_tgpr || exit 1;
 
   utils/lang/check_phones_compatible.sh \
     data/lang_nosp_test_bd_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_nosp_test_bd_tgpr \
+    data/lang_nosp_test_bd_tgpr \
     $dir $treedir/graph_bd_tgpr || exit 1;
 fi
 
@@ -203,7 +203,6 @@ if [ $stage -le 5 ]; then
           $treedir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
         --cmd "$decode_cmd" data/lang_nosp_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
diff --git a/egs/wsj/s5/local/chain/e2e/run_tdnn_lstm_flatstart.sh b/egs/wsj/s5/local/chain/e2e/run_tdnn_lstm_flatstart.sh
index be82e80d5fe..714ace8a633 100755
--- a/egs/wsj/s5/local/chain/e2e/run_tdnn_lstm_flatstart.sh
+++ b/egs/wsj/s5/local/chain/e2e/run_tdnn_lstm_flatstart.sh
@@ -189,13 +189,13 @@ if [ $stage -le 4 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_char_test_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_char_test_tgpr \
+    data/lang_char_test_tgpr \
     $dir $treedir/graph_tgpr || exit 1;
 
   utils/lang/check_phones_compatible.sh \
     data/lang_char_test_bd_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_char_test_bd_tgpr \
+    data/lang_char_test_bd_tgpr \
     $dir $treedir/graph_bd_tgpr || exit 1;
 fi
 
@@ -219,7 +219,7 @@ if [ $stage -le 5 ]; then
           $treedir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_char_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
diff --git a/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1a.sh b/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1a.sh
index 4ab0cf58d53..70f22f39903 100755
--- a/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1a.sh
+++ b/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1a.sh
@@ -181,13 +181,13 @@ if [ $stage -le 4 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_char_test_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_char_test_tgpr \
+    data/lang_char_test_tgpr \
     $dir $treedir/graph_tgpr || exit 1;
 
   utils/lang/check_phones_compatible.sh \
     data/lang_char_test_bd_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_char_test_bd_tgpr \
+    data/lang_char_test_bd_tgpr \
     $dir $treedir/graph_bd_tgpr || exit 1;
 fi
 
@@ -209,7 +209,7 @@ if [ $stage -le 5 ]; then
           $treedir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_char_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
diff --git a/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1b.sh b/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1b.sh
index 4e66fae8baa..615c2535f7d 100755
--- a/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1b.sh
+++ b/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1b.sh
@@ -183,13 +183,13 @@ if [ $stage -le 4 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_char_test_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_char_test_tgpr \
+    data/lang_char_test_tgpr \
     $dir $treedir/graph_tgpr || exit 1;
 
   utils/lang/check_phones_compatible.sh \
     data/lang_char_test_bd_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_char_test_bd_tgpr \
+    data/lang_char_test_bd_tgpr \
     $dir $treedir/graph_bd_tgpr || exit 1;
 fi
 
@@ -211,7 +211,7 @@ if [ $stage -le 5 ]; then
           $treedir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_char_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
diff --git a/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1a.sh
index e656b67e529..5a8a20496cd 100755
--- a/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1a.sh
+++ b/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1a.sh
@@ -269,13 +269,13 @@ if [ $stage -le 17 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgpr \
+    data/lang_test_tgpr \
     $tree_dir $tree_dir/graph_tgpr || exit 1;
 
   utils/lang/check_phones_compatible.sh \
     data/lang_test_bd_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_bd_tgpr \
+    data/lang_test_bd_tgpr \
     $tree_dir $tree_dir/graph_bd_tgpr || exit 1;
 fi
 
@@ -300,7 +300,7 @@ if [ $stage -le 18 ]; then
           $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
@@ -338,7 +338,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then
           $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
diff --git a/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1b.sh b/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1b.sh
index 9db76e94430..e5265d3b31d 100755
--- a/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1b.sh
+++ b/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1b.sh
@@ -272,13 +272,13 @@ if [ $stage -le 17 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgpr \
+    data/lang_test_tgpr \
     $tree_dir $tree_dir/graph_tgpr || exit 1;
 
   utils/lang/check_phones_compatible.sh \
     data/lang_test_bd_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_bd_tgpr \
+    data/lang_test_bd_tgpr \
     $tree_dir $tree_dir/graph_bd_tgpr || exit 1;
 fi
 
@@ -303,7 +303,7 @@ if [ $stage -le 18 ]; then
           $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
@@ -341,7 +341,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then
           $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
diff --git a/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1c.sh b/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1c.sh
index 36ec5bb61af..e1394fb65da 100755
--- a/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1c.sh
+++ b/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1c.sh
@@ -260,13 +260,13 @@ if [ $stage -le 17 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgpr \
+    data/lang_test_tgpr \
     $tree_dir $tree_dir/graph_tgpr || exit 1;
 
   utils/lang/check_phones_compatible.sh \
     data/lang_test_bd_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_bd_tgpr \
+    data/lang_test_bd_tgpr \
     $tree_dir $tree_dir/graph_bd_tgpr || exit 1;
 fi
 
@@ -287,7 +287,7 @@ if [ $stage -le 18 ]; then
           $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
@@ -325,7 +325,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then
           $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh
index 8d44db6f917..a3950d71d1f 100755
--- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -276,13 +276,13 @@ if [ $stage -le 17 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgpr \
+    data/lang_test_tgpr \
     $tree_dir $tree_dir/graph_tgpr || exit 1;
 
   utils/lang/check_phones_compatible.sh \
     data/lang_test_bd_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_bd_tgpr \
+    data/lang_test_bd_tgpr \
     $tree_dir $tree_dir/graph_bd_tgpr || exit 1;
 fi
 
@@ -307,7 +307,7 @@ if [ $stage -le 18 ]; then
           $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
@@ -345,7 +345,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then
           $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1b.sh
index 544b9b04a0a..26a88900b0f 100755
--- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1b.sh
+++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1b.sh
@@ -252,13 +252,13 @@ if [ $stage -le 17 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgpr \
+    data/lang_test_tgpr \
     $tree_dir $tree_dir/graph_tgpr || exit 1;
 
   utils/lang/check_phones_compatible.sh \
     data/lang_test_bd_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_bd_tgpr \
+    data/lang_test_bd_tgpr \
     $tree_dir $tree_dir/graph_bd_tgpr || exit 1;
 fi
 
@@ -283,7 +283,7 @@ if [ $stage -le 18 ]; then
           $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
@@ -321,7 +321,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then
           $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1c.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1c.sh
index b268ed7feda..654fc25a49a 100755
--- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1c.sh
+++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1c.sh
@@ -253,13 +253,13 @@ if [ $stage -le 17 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgpr \
+    data/lang_test_tgpr \
     $tree_dir $tree_dir/graph_tgpr || exit 1;
 
   utils/lang/check_phones_compatible.sh \
     data/lang_test_bd_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_bd_tgpr \
+    data/lang_test_bd_tgpr \
     $tree_dir $tree_dir/graph_bd_tgpr || exit 1;
 fi
 
@@ -283,7 +283,7 @@ if [ $stage -le 18 ]; then
           $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
@@ -321,7 +321,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then
           $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1d.sh
index d1a7f9d0663..a8549470006 100755
--- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1d.sh
+++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1d.sh
@@ -255,13 +255,13 @@ if [ $stage -le 17 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgpr \
+    data/lang_test_tgpr \
     $tree_dir $tree_dir/graph_tgpr || exit 1;
 
   utils/lang/check_phones_compatible.sh \
     data/lang_test_bd_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_bd_tgpr \
+    data/lang_test_bd_tgpr \
     $tree_dir $tree_dir/graph_bd_tgpr || exit 1;
 fi
 
@@ -285,7 +285,7 @@ if [ $stage -le 18 ]; then
           $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
@@ -323,7 +323,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then
           $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1e.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1e.sh
index e20069fbfa1..585399e1367 100755
--- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1e.sh
+++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1e.sh
@@ -264,13 +264,13 @@ if [ $stage -le 17 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgpr \
+    data/lang_test_tgpr \
     $tree_dir $tree_dir/graph_tgpr || exit 1;
 
   utils/lang/check_phones_compatible.sh \
     data/lang_test_bd_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_bd_tgpr \
+    data/lang_test_bd_tgpr \
     $tree_dir $tree_dir/graph_bd_tgpr || exit 1;
 fi
 
@@ -294,7 +294,7 @@ if [ $stage -le 18 ]; then
           $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
@@ -332,7 +332,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then
           $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh
index 86df0779841..ad97730bf01 100755
--- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh
+++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh
@@ -258,13 +258,13 @@ if [ $stage -le 17 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgpr \
+    data/lang_test_tgpr \
     $tree_dir $tree_dir/graph_tgpr || exit 1;
 
   utils/lang/check_phones_compatible.sh \
     data/lang_test_bd_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_bd_tgpr \
+    data/lang_test_bd_tgpr \
     $tree_dir $tree_dir/graph_bd_tgpr || exit 1;
 fi
 
@@ -288,7 +288,7 @@ if [ $stage -le 18 ]; then
           $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
@@ -326,7 +326,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then
           $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh
index ba90afbb213..8a2371c053f 100755
--- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh
+++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh
@@ -266,13 +266,13 @@ if [ $stage -le 17 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgpr \
+    data/lang_test_tgpr \
     $tree_dir $tree_dir/graph_tgpr || exit 1;
 
   utils/lang/check_phones_compatible.sh \
     data/lang_test_bd_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_bd_tgpr \
+    data/lang_test_bd_tgpr \
     $tree_dir $tree_dir/graph_bd_tgpr || exit 1;
 fi
 
@@ -296,7 +296,7 @@ if [ $stage -le 18 ]; then
           $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
@@ -334,7 +334,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then
           $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
index 6e4f220c1f2..58c31f67ff7 100755
--- a/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -276,13 +276,13 @@ if [ $stage -le 17 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgpr \
+    data/lang_test_tgpr \
     $tree_dir $tree_dir/graph_tgpr || exit 1;
 
   utils/lang/check_phones_compatible.sh \
     data/lang_test_bd_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_bd_tgpr \
+    data/lang_test_bd_tgpr \
     $tree_dir $tree_dir/graph_bd_tgpr || exit 1;
 fi
 
@@ -307,7 +307,7 @@ if [ $stage -le 18 ]; then
           $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
@@ -343,7 +343,7 @@ if [ $stage -le 19 ]; then
           $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_looped_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_looped_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
@@ -377,7 +377,7 @@ if $test_online_decoding && [ $stage -le 20 ]; then
           $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b.sh
index 2d113e58a93..41389e4d07a 100755
--- a/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b.sh
+++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b.sh
@@ -567,13 +567,13 @@ if [ $stage -le 17 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgpr \
+    data/lang_test_tgpr \
     $tree_dir $tree_dir/graph_tgpr || exit 1;
 
   utils/lang/check_phones_compatible.sh \
     data/lang_test_bd_tgpr/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_bd_tgpr \
+    data/lang_test_bd_tgpr \
     $tree_dir $tree_dir/graph_bd_tgpr || exit 1;
 fi
 
@@ -598,7 +598,7 @@ if [ $stage -le 18 ]; then
           $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
@@ -634,7 +634,7 @@ if [ $stage -le 19 ]; then
           $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_looped_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_looped_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
@@ -668,7 +668,7 @@ if $test_online_decoding && [ $stage -le 20 ]; then
           $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 1.0 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
diff --git a/egs/wsj/s5/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh b/egs/wsj/s5/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh
index f2a4ed37ae5..03554c61b57 100755
--- a/egs/wsj/s5/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh
+++ b/egs/wsj/s5/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh
@@ -218,12 +218,12 @@ if [ $stage -le 16 ]; then
 
   utils/lang/check_phones_compatible.sh \
     data/lang_test_tgpr/phones.txt $lang/phones.txt
-  utils/mkgraph.sh --self-loop-scale 0.333 data/lang_test_tgpr \
+  utils/mkgraph.sh data/lang_test_tgpr \
                    $dir $dir/graph_tgpr || exit 1;
 
   utils/lang/check_phones_compatible.sh \
     data/lang_test_bd_tgpr/phones.txt $lang/phones.txt
-  utils/mkgraph.sh --self-loop-scale 0.333 data/lang_test_bd_tgpr \
+  utils/mkgraph.sh data/lang_test_bd_tgpr \
       $dir $dir/graph_bd_tgpr || exit 1;
 fi
 
@@ -248,7 +248,7 @@ if [ $stage -le 17 ]; then
           $dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 0.333 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
@@ -284,7 +284,7 @@ if [ $stage -le 18 ]; then
           $dir/graph_${lmtype} data/${data}_hires ${dir}/decode_looped_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 0.333 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}/decode_looped_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
@@ -318,7 +318,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then
           $dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1
       done
       steps/lmrescore.sh \
-        --self-loop-scale 0.333 \
+        \
         --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
         data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1
       steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
diff --git a/egs/wsj/s5/steps/align_basis_fmllr.sh b/egs/wsj/s5/steps/align_basis_fmllr.sh
index e5510c5ab7e..a9ceab29b4f 100755
--- a/egs/wsj/s5/steps/align_basis_fmllr.sh
+++ b/egs/wsj/s5/steps/align_basis_fmllr.sh
@@ -19,7 +19,7 @@ nj=4
 cmd=run.pl
 use_graphs=false
 # Begin configuration.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 basis_fmllr_opts="--fmllr-min-count=22  --num-iters=10 --size-scale=0.2 --step-size-iters=3"
 beam=10
 retry_beam=40
diff --git a/egs/wsj/s5/steps/align_basis_fmllr_lats.sh b/egs/wsj/s5/steps/align_basis_fmllr_lats.sh
index 426168496cc..dd3db90bd76 100755
--- a/egs/wsj/s5/steps/align_basis_fmllr_lats.sh
+++ b/egs/wsj/s5/steps/align_basis_fmllr_lats.sh
@@ -16,7 +16,6 @@ stage=0
 nj=4
 cmd=run.pl
 # Begin configuration.
-scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
 acoustic_scale=0.1
 beam=10
 retry_beam=40
@@ -112,18 +111,18 @@ if [ $stage -le 0 ]; then
   echo "$0: compiling training graphs"
   tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
   $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log  \
-    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $scale_opts $dir/tree $dir/final.mdl  $lang/L.fst "$tra" \
+    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl  $lang/L.fst "$tra" \
     "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
 fi
 
 
 if [ $stage -le 1 ]; then
-  # Note: we need to set --transition-scale=0.0 --self-loop-scale=0.0 because,
+  # Note: we no longer set transition/self-loop scales here because,
   # as explained above, we compiled the transition probs into the training
   # graphs.
   echo "$0: aligning data in $data using $alimdl and speaker-independent features."
   $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
-    gmm-align-compiled --transition-scale=0.0 --self-loop-scale=0.0 --acoustic-scale=$acoustic_scale \
+    gmm-align-compiled   --acoustic-scale=$acoustic_scale \
         --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \
     "ark:gunzip -c $dir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
 fi
diff --git a/egs/wsj/s5/steps/align_fmllr.sh b/egs/wsj/s5/steps/align_fmllr.sh
index 327978e680f..c1ec67ec7dc 100755
--- a/egs/wsj/s5/steps/align_fmllr.sh
+++ b/egs/wsj/s5/steps/align_fmllr.sh
@@ -18,7 +18,7 @@ nj=4
 cmd=run.pl
 use_graphs=false
 # Begin configuration.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 careful=false
diff --git a/egs/wsj/s5/steps/align_fmllr_lats.sh b/egs/wsj/s5/steps/align_fmllr_lats.sh
index b331b40d73c..e561a6f0d29 100755
--- a/egs/wsj/s5/steps/align_fmllr_lats.sh
+++ b/egs/wsj/s5/steps/align_fmllr_lats.sh
@@ -12,7 +12,6 @@ stage=0
 nj=4
 cmd=run.pl
 # Begin configuration.
-scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
 acoustic_scale=0.1
 beam=10
 retry_beam=40
@@ -100,18 +99,18 @@ if [ $stage -le 0 ]; then
   echo "$0: compiling training graphs"
   tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
   $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log  \
-    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $scale_opts $dir/tree $dir/final.mdl  $lang/L.fst "$tra" \
+    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl  $lang/L.fst "$tra" \
     "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
 fi
 
 
 if [ $stage -le 1 ]; then
-  # Note: we need to set --transition-scale=0.0 --self-loop-scale=0.0 because,
+  # Note: we no longer set transition/self-loop scales here because,
   # as explained above, we compiled the transition probs into the training
   # graphs.
   echo "$0: aligning data in $data using $alimdl and speaker-independent features."
   $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
-    gmm-align-compiled --transition-scale=0.0 --self-loop-scale=0.0 --acoustic-scale=$acoustic_scale \
+    gmm-align-compiled   --acoustic-scale=$acoustic_scale \
         --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \
     "ark:gunzip -c $dir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
 fi
diff --git a/egs/wsj/s5/steps/align_lvtln.sh b/egs/wsj/s5/steps/align_lvtln.sh
index 9efba2b9096..671c3e45c71 100755
--- a/egs/wsj/s5/steps/align_lvtln.sh
+++ b/egs/wsj/s5/steps/align_lvtln.sh
@@ -13,7 +13,7 @@ nj=4
 cmd=run.pl
 use_graphs=false
 # Begin configuration.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10.0
 retry_beam=40
 boost_silence=1.0 # factor by which to boost silence during alignment.
diff --git a/egs/wsj/s5/steps/align_raw_fmllr.sh b/egs/wsj/s5/steps/align_raw_fmllr.sh
index 639dde559a4..5cec25c096a 100755
--- a/egs/wsj/s5/steps/align_raw_fmllr.sh
+++ b/egs/wsj/s5/steps/align_raw_fmllr.sh
@@ -18,7 +18,7 @@ nj=4
 cmd=run.pl
 use_graphs=false
 # Begin configuration.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 boost_silence=1.0 # factor by which to boost silence during alignment.
diff --git a/egs/wsj/s5/steps/align_sgmm2.sh b/egs/wsj/s5/steps/align_sgmm2.sh
index d2f829f7e3e..951e241284e 100755
--- a/egs/wsj/s5/steps/align_sgmm2.sh
+++ b/egs/wsj/s5/steps/align_sgmm2.sh
@@ -18,7 +18,7 @@ use_gselect=false # use gselect info from srcdir [regardless, we use
    # Gaussian-selection info, we might have to compute it though.]
 gselect=15  # Number of Gaussian-selection indices for SGMMs.
 # Begin configuration.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 transform_dir=  # directory to find fMLLR transforms in.
diff --git a/egs/wsj/s5/steps/align_si.sh b/egs/wsj/s5/steps/align_si.sh
index 0bfebe6b0fc..749124dfadf 100755
--- a/egs/wsj/s5/steps/align_si.sh
+++ b/egs/wsj/s5/steps/align_si.sh
@@ -15,7 +15,7 @@ nj=4
 cmd=run.pl
 use_graphs=false
 # Begin configuration.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 careful=false
diff --git a/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh b/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh
index cc8da298d2f..67d92e0b73a 100755
--- a/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh
+++ b/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh
@@ -111,10 +111,6 @@ cp $srcdir/frame_subsampling_factor $dir 2>/dev/null || true
 
 if [ -f $srcdir/frame_subsampling_factor ]; then
   echo "$0: guessing that this is a chain system, checking parameters."
-  if [ -z $scale_opts ]; then
-    echo "$0: setting scale_opts"
-    scale_opts="--self-loop-scale=1.0 --transition-scale=1.0"
-  fi
   if [ $acwt == 0.1 ]; then
     echo "$0: setting acwt=1.0"
     acwt=1.0
diff --git a/egs/wsj/s5/steps/cleanup/find_bad_utts.sh b/egs/wsj/s5/steps/cleanup/find_bad_utts.sh
index 9bb67abeff9..27136be6fb5 100755
--- a/egs/wsj/s5/steps/cleanup/find_bad_utts.sh
+++ b/egs/wsj/s5/steps/cleanup/find_bad_utts.sh
@@ -12,7 +12,7 @@ nj=4
 cmd=run.pl
 use_graphs=false
 # Begin configuration.
-scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
+scale_opts=""
 acoustic_scale=0.1
 beam=15.0
 lattice_beam=8.0
@@ -197,4 +197,3 @@ if [ $stage -le 3 ]; then
     sort -i -b -k1,1 -k4,4nr -k2,2 -k3,3 > $dir/analysis/ops_details.txt
 
 fi
-
diff --git a/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh b/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh
index b18efe35a3c..68b8497f4e2 100755
--- a/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh
+++ b/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh
@@ -12,7 +12,7 @@ nj=8
 cmd=run.pl
 use_graphs=false
 # Begin configuration.
-scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
+scale_opts=""
 acoustic_scale=0.1
 beam=15.0
 lattice_beam=8.0
diff --git a/egs/wsj/s5/steps/cleanup/make_biased_lm_graphs.sh b/egs/wsj/s5/steps/cleanup/make_biased_lm_graphs.sh
index d957ce4d5c7..9233d142946 100755
--- a/egs/wsj/s5/steps/cleanup/make_biased_lm_graphs.sh
+++ b/egs/wsj/s5/steps/cleanup/make_biased_lm_graphs.sh
@@ -17,7 +17,7 @@ set -e
 # Begin configuration section.
 nj=10
 cmd=run.pl
-scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
+scale_opts=""
 top_n_words=100 # Number of common words that we compile into each graph (most frequent
                 # in $data/text.orig.
 top_n_words_weight=1.0  # this weight is before renormalization; it can be more
@@ -49,7 +49,7 @@ if [ $# != 4 ]; then
    echo "Main options (for others, see top of script file)"
    echo "  --scale-opts <scale-opts>                 # Options relating to language"
    echo "                                            # model scale; default is "
-   echo "                                            # '--transition-scale=1.0 --self-loop-scale=0.1'"
+   echo "                                            # ''"
    echo "  --top-n-words <N>                         # Number of most-common-words to add with"
    echo "                                            # unigram probabilities into graph (default: 100)"
    echo "  --top-n-words-weight <float>              # Weight given to top-n-words portion of graph"
diff --git a/egs/wsj/s5/steps/cleanup/make_segmentation_graph.sh b/egs/wsj/s5/steps/cleanup/make_segmentation_graph.sh
index 6705ab6db54..ab18d801c2e 100755
--- a/egs/wsj/s5/steps/cleanup/make_segmentation_graph.sh
+++ b/egs/wsj/s5/steps/cleanup/make_segmentation_graph.sh
@@ -6,8 +6,6 @@
 # Begin configuration section.
 nj=4
 cmd=run.pl
-tscale=1.0      # transition scale.
-loopscale=0.1   # scale for self-loops.
 cleanup=true
 ngram_order=1
 srilm_options="-wbdiscount"   # By default, use Witten-Bell discounting in SRILM
@@ -35,8 +33,6 @@ if [ $# -ne 4 ]; then
   echo "Options:"
   echo "    --ngram-order           # order of n-gram language model"
   echo "    --srilm-options         # options for ngram-count in SRILM tool"
-  echo "    --tscale                # transition scale"
-  echo "    --loopscale             # scale for self-loops"
   echo "    --cleanup               # if true, removes the intermediate files"
   exit 1;
 fi
@@ -87,7 +83,7 @@ fi
 
 mkdir -p $graph_dir/split$nj
 mkdir -p $graph_dir/log
- 
+
 split_texts=""
 for n in $(seq $nj); do
   mkdir -p $graph_dir/split$nj/$n
@@ -97,7 +93,6 @@ utils/split_scp.pl $data/text.orig $split_texts
 
 $cmd JOB=1:$nj $graph_dir/log/make_utterance_graph.JOB.log \
   steps/cleanup/make_utterance_graph.sh --cleanup $cleanup \
-  --tscale $tscale --loopscale $loopscale \
   --ngram-order $ngram_order --srilm-options "$srilm_options" \
   $graph_dir/split$nj/JOB/text $lang \
   $model_dir $graph_dir/split$nj/JOB || exit 1;
diff --git a/egs/wsj/s5/steps/cleanup/make_utterance_graph.sh b/egs/wsj/s5/steps/cleanup/make_utterance_graph.sh
index 277c5a2da1c..a784c8777a8 100755
--- a/egs/wsj/s5/steps/cleanup/make_utterance_graph.sh
+++ b/egs/wsj/s5/steps/cleanup/make_utterance_graph.sh
@@ -4,8 +4,6 @@
 # Apache 2.0
 
 # Begin configuration section.
-tscale=1.0      # transition scale.
-loopscale=0.1   # scale for self-loops.
 cleanup=true
 ngram_order=1
 srilm_options="-wbdiscount"   # By default, use Witten-Bell discounting in SRILM
@@ -34,8 +32,6 @@ if [ $# -ne 4 ]; then
   echo "Options:"
   echo "    --ngram-order           # order of n-gram language model"
   echo "    --srilm-options         # options for ngram-count in SRILM tool"
-  echo "    --tscale                # transition scale"
-  echo "    --loopscale             # scale for self-loops"
   echo "    --cleanup               # if true, removes the intermediate files"
   exit 1;
 fi
@@ -134,7 +130,7 @@ cat $text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
   fstisstochastic $wdir/CLG.fst  || echo "$0: $uttid/CLG.fst not stochastic."
 
   make-h-transducer --disambig-syms-out=$wdir/disambig_tid.int \
-    --transition-scale=$tscale $wdir/ilabels_${N}_${P} \
+    $wdir/ilabels_${N}_${P} \
     $model_dir/tree $model_dir/final.mdl > $wdir/Ha.fst
 
   # Builds HCLGa.fst
@@ -145,13 +141,10 @@ cat $text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
   fstisstochastic $wdir/HCLGa.fst ||\
     echo "$0: $uttid/HCLGa.fst is not stochastic"
 
-  add-self-loops --self-loop-scale=$loopscale --reorder=true \
-    $model_dir/final.mdl < $wdir/HCLGa.fst > $wdir/HCLG.fst
+  add-self-loops $model_dir/final.mdl < $wdir/HCLGa.fst > $wdir/HCLG.fst
 
-  if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then
-    fstisstochastic $wdir/HCLG.fst ||\
-      echo "$0: $uttid/HCLG.fst is not stochastic."
-  fi
+  fstisstochastic $wdir/HCLG.fst ||\
+    echo "$0: $uttid/HCLG.fst is not stochastic."
 
   echo "$uttid $wdir/HCLG.fst" >> $graph_dir/sub_graphs/HCLG.fsts.scp
   echo
diff --git a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh
index f0df1e7730c..d42cda1b9d3 100755
--- a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh
+++ b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh
@@ -171,7 +171,7 @@ if [ -f $srcdir/frame_subsampling_factor ]; then
   echo "$0: guessing that this is a chain system, checking parameters."
   if [ -z $scale_opts ]; then
     echo "$0: setting scale_opts"
-    scale_opts="--self-loop-scale=1.0 --transition-scale=1.0"
+    scale_opts=" "
   fi
   if [ $acwt == 0.1 ]; then
     echo "$0: setting acwt=1.0"
diff --git a/egs/wsj/s5/steps/decode_basis_fmllr.sh b/egs/wsj/s5/steps/decode_basis_fmllr.sh
index afb914e7f0d..7e39048f463 100755
--- a/egs/wsj/s5/steps/decode_basis_fmllr.sh
+++ b/egs/wsj/s5/steps/decode_basis_fmllr.sh
@@ -37,7 +37,7 @@ acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in
               # lattice generation.
 
 # Parameters in alignment of training data
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 align_beam=10
 retry_beam=40
 
diff --git a/egs/wsj/s5/steps/decode_fromlats.sh b/egs/wsj/s5/steps/decode_fromlats.sh
index ee719c0e132..73c8954fb48 100755
--- a/egs/wsj/s5/steps/decode_fromlats.sh
+++ b/egs/wsj/s5/steps/decode_fromlats.sh
@@ -22,7 +22,7 @@ beam=20.0
 lattice_beam=7.0
 acwt=0.083333
 batch_size=75 # Limits memory blowup in compile-train-graphs-fsts
-scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
+scale_opts=""
 skip_scoring=false
 # End configuration.
 
diff --git a/egs/wsj/s5/steps/decode_sgmm2_fromlats.sh b/egs/wsj/s5/steps/decode_sgmm2_fromlats.sh
index 1cdd9885314..a953aeb90e7 100755
--- a/egs/wsj/s5/steps/decode_sgmm2_fromlats.sh
+++ b/egs/wsj/s5/steps/decode_sgmm2_fromlats.sh
@@ -33,7 +33,7 @@ vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for
 use_fmllr=false
 fmllr_iters=10
 fmllr_min_count=1000
-scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
+scale_opts=""
 skip_scoring=false
 # End configuration section.
 
diff --git a/egs/wsj/s5/steps/get_fmllr_basis.sh b/egs/wsj/s5/steps/get_fmllr_basis.sh
index 9b60af1fa51..3f145714ef1 100755
--- a/egs/wsj/s5/steps/get_fmllr_basis.sh
+++ b/egs/wsj/s5/steps/get_fmllr_basis.sh
@@ -8,7 +8,7 @@
 
 stage=0
 # Parameters in alignment of training data
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 per_utt=true # If true, then treat each utterance as a separate speaker for purposes of
   # basis training... this is recommended if the number of actual speakers in your
   # training set is less than (feature-dim) * (feature-dim+1).
diff --git a/egs/wsj/s5/steps/lmrescore.sh b/egs/wsj/s5/steps/lmrescore.sh
index 88db8ae15dc..abe5e740166 100755
--- a/egs/wsj/s5/steps/lmrescore.sh
+++ b/egs/wsj/s5/steps/lmrescore.sh
@@ -7,7 +7,6 @@ mode=4  # mode can be 1 through 5.  They should all give roughly similar results
         # See the comments in the case statement for more details.
 cmd=run.pl
 skip_scoring=false
-self_loop_scale=0.1  # only matters for mode 4.
 acoustic_scale=0.1   # only matters for mode 5.
 # End configuration section.
 
@@ -22,8 +21,6 @@ if [ $# != 5 ]; then
    echo " --cmd   <cmd-string>       # How to run commands (e.g. run.pl, queue.pl)"
    echo " --mode  (1|2|3|4|5)        # Mode of LM rescoring to use (default: 4)."
    echo "                            # These should give very similar results."
-   echo " --self-loop-scale  <scale> # Self-loop-scale, only relevant in mode 4."
-   echo "                            # Default: 0.1."
    echo " --acoustic-scale  <scale>  # Acoustic scale, only relevant in mode 5."
    echo "                            # Default: 0.1."
    exit 1;
@@ -109,8 +106,6 @@ case "$mode" in
      # grammar and transition weights.
     mdl=`dirname $indir`/final.mdl
     [ ! -f $mdl ] && echo No such model $mdl && exit 1;
-    [[ -f `dirname $indir`/frame_subsampling_factor && "$self_loop_scale" == 0.1 ]] && \
-      echo "$0: WARNING: chain models need '--self-loop-scale 1.0'";
     $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
       gunzip -c $indir/lat.JOB.gz \| \
       lattice-scale --lm-scale=0.0 ark:- ark:- \| \
@@ -118,8 +113,7 @@ case "$mode" in
       lattice-compose ark:- $outdir/Ldet.fst ark:- \| \
       lattice-determinize ark:- ark:- \| \
       lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \
-      lattice-add-trans-probs --transition-scale=1.0 --self-loop-scale=$self_loop_scale \
-      $mdl ark:- ark:- \| \
+      lattice-add-trans-probs $mdl ark:- ark:- \| \
       gzip -c \>$outdir/lat.JOB.gz  || exit 1;
     ;;
   5) # Mode 5 uses the binary lattice-lmrescore-pruned to do the LM rescoring
diff --git a/egs/wsj/s5/steps/make_phone_graph.sh b/egs/wsj/s5/steps/make_phone_graph.sh
index aaf88cc66d2..c7b57374f1c 100755
--- a/egs/wsj/s5/steps/make_phone_graph.sh
+++ b/egs/wsj/s5/steps/make_phone_graph.sh
@@ -15,8 +15,6 @@ stage=0
 cmd=run.pl
 N=3  # change N and P for non-trigram systems.
 P=1
-tscale=1.0 # transition scale.
-loopscale=0.1 # scale for self-loops.
 # End configuration section.
 
 echo "$0 $@"  # Print the command line for logging
@@ -117,7 +115,7 @@ fi
 if [ $stage -le 5 ]; then
   echo "$0: creating Ha.fst"
   make-h-transducer --disambig-syms-out=$dir/phone_graph/disambig_tid.int \
-    --transition-scale=$tscale $dir/phone_graph/ilabels_${N}_${P} $dir/tree $dir/final.mdl \
+    $dir/phone_graph/ilabels_${N}_${P} $dir/tree $dir/final.mdl \
        > $dir/phone_graph/Ha.fst
 fi
 
@@ -131,13 +129,9 @@ if [ $stage -le 6 ]; then
 fi
 
 if [ $stage -le 7 ]; then
-  add-self-loops --self-loop-scale=$loopscale --reorder=true \
-    $dir/final.mdl < $dir/phone_graph/HCLGa.fst > $dir/phone_graph/HCLG.fst || exit 1;
+  add-self-loops $dir/final.mdl < $dir/phone_graph/HCLGa.fst > $dir/phone_graph/HCLG.fst || exit 1;
 
-  if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then
-    # No point doing this test if transition-scale not 1, as it is bound to fail.
-    fstisstochastic $dir/phone_graph/HCLG.fst || echo "[info]: final HCLG is not stochastic."
-  fi
+  fstisstochastic $dir/phone_graph/HCLG.fst || echo "[info]: final HCLG is not stochastic."
 
   # $lang/phones.txt is the symbol table that corresponds to the output
   # symbols on the graph; decoding scripts expect it as words.txt.
diff --git a/egs/wsj/s5/steps/nnet/align.sh b/egs/wsj/s5/steps/nnet/align.sh
index 5f700cf28ed..98b3e949c86 100755
--- a/egs/wsj/s5/steps/nnet/align.sh
+++ b/egs/wsj/s5/steps/nnet/align.sh
@@ -10,7 +10,7 @@ nj=4
 cmd=run.pl
 stage=0
 # Begin configuration.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 nnet_forward_opts="--no-softmax=true --prior-scale=1.0"
@@ -19,7 +19,7 @@ text= # (optional) transcipts we align to,
 
 align_to_lats=false # optionally produce alignment in lattice format
  lats_decode_opts="--acoustic-scale=0.1 --beam=20 --lattice_beam=10"
- lats_graph_scales="--transition-scale=1.0 --self-loop-scale=0.1"
+ lats_graph_scales=""
 
 use_gpu="no" # yes|no|optionaly
 # End configuration options.
diff --git a/egs/wsj/s5/steps/nnet2/align.sh b/egs/wsj/s5/steps/nnet2/align.sh
index fa040d692ad..5b89655ef37 100755
--- a/egs/wsj/s5/steps/nnet2/align.sh
+++ b/egs/wsj/s5/steps/nnet2/align.sh
@@ -9,7 +9,7 @@
 nj=4
 cmd=run.pl
 # Begin configuration.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 transform_dir=
diff --git a/egs/wsj/s5/steps/nnet3/align.sh b/egs/wsj/s5/steps/nnet3/align.sh
index aa2de2ee1a5..75102f2c1d7 100755
--- a/egs/wsj/s5/steps/nnet3/align.sh
+++ b/egs/wsj/s5/steps/nnet3/align.sh
@@ -13,7 +13,7 @@
 nj=4
 cmd=run.pl
 # Begin configuration.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 iter=final
@@ -104,10 +104,10 @@ if [ -f $srcdir/frame_subsampling_factor ]; then
   frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor"
   cp $srcdir/frame_subsampling_factor $dir
   if [ "$frame_subsampling_factor" -gt 1 ] && \
-     [ "$scale_opts" == "--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" ]; then
+     [ "$scale_opts" == "--acoustic-scale=0.1" ]; then
     echo "$0: frame-subsampling-factor is not 1 (so likely a chain system),"
     echo "...  but the scale opts are the defaults.  You probably want"
-    echo "--scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0'"
+    echo "--scale-opts '--acoustic-scale=1.0 '"
     sleep 1
   fi
 fi
diff --git a/egs/wsj/s5/steps/nnet3/align_lats.sh b/egs/wsj/s5/steps/nnet3/align_lats.sh
index 201cc3552ba..c3b55e8547c 100755
--- a/egs/wsj/s5/steps/nnet3/align_lats.sh
+++ b/egs/wsj/s5/steps/nnet3/align_lats.sh
@@ -13,7 +13,6 @@ nj=4
 cmd=run.pl
 stage=-1
 # Begin configuration.
-scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
 acoustic_scale=0.1
 beam=20
 iter=final
@@ -94,13 +93,10 @@ if [ -f $srcdir/frame_subsampling_factor ]; then
   cp $srcdir/frame_subsampling_factor $dir
   if [[ $frame_subsampling_factor -gt 1 ]]; then
     # Assume a chain system, check agrument sanity.
-    if [[ ! ($scale_opts == *--self-loop-scale=1.0* &&
-             $scale_opts == *--transition-scale=1.0* &&
-             $acoustic_scale = '1.0') ]]; then
+    if [[ $acoustic_scale != '1.0' ]]; then  # chain systems expect acoustic_scale=1.0
       echo "$0: ERROR: frame-subsampling-factor is not 1, assuming a chain system."
       echo "... You should pass the following options to this script:"
-      echo "  --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0'" \
-           "--acoustic_scale 1.0"
+      echo "--acoustic_scale 1.0"
     fi
   fi
 fi
@@ -122,7 +118,6 @@ if [ $stage -le 0 ]; then
   ## because the other scripts write them without transition probs.
   $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
     $prog --read-disambig-syms=$lang/phones/disambig.int \
-    $scale_opts \
     $dir/tree $srcdir/${iter}.mdl  $lang/L.fst "$tra" \
     "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1
 fi
diff --git a/egs/wsj/s5/steps/nnet3/chain/e2e/prepare_e2e.sh b/egs/wsj/s5/steps/nnet3/chain/e2e/prepare_e2e.sh
index 07d5ee8cfb8..3503cc57f50 100755
--- a/egs/wsj/s5/steps/nnet3/chain/e2e/prepare_e2e.sh
+++ b/egs/wsj/s5/steps/nnet3/chain/e2e/prepare_e2e.sh
@@ -20,7 +20,7 @@ type=mono             # Can be either mono or biphone -- either way
                       # the resulting tree is full (i.e. it doesn't do any tying)
 ci_silence=false      # If true, silence phones will be treated as context independent
 
-scale_opts="--transition-scale=0.0 --self-loop-scale=0.0"
+scale_opts=" "
 tie=false             # If true, gmm-init-biphone will do some tying when
                       # creating the full biphone tree (it won't be full anymore).
                       # Specifically, it will revert to monophone if the data
diff --git a/egs/wsj/s5/steps/nnet3/get_degs.sh b/egs/wsj/s5/steps/nnet3/get_degs.sh
index 7853daa4563..4007419bc47 100755
--- a/egs/wsj/s5/steps/nnet3/get_degs.sh
+++ b/egs/wsj/s5/steps/nnet3/get_degs.sh
@@ -59,9 +59,6 @@ nj=200
 iter=final
 
 
-# decoding-graph option
-self_loop_scale=0.1  # for decoding graph.. should be 1.0 for chain models.
-
 # options relating to decoding.
 frames_per_chunk_decoding=150
 beam=13.0
@@ -156,21 +153,10 @@ if [ -f $srcdir/frame_subsampling_factor ]; then
   # e.g. for 'chain' systems
   frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor"
   cp $srcdir/frame_subsampling_factor $dir
-  if [ $frame_subsampling_factor -ne 1 ] && [ "$self_loop_scale" == "0.1" ]; then
-    echo "$0: warning: frame_subsampling_factor is not 1 (so likely a chain system),"
-    echo "...  but self-loop-scale is 0.1.  Make sure this is not a mistake."
-    sleep 1
-  fi
 else
   frame_subsampling_factor=1
 fi
 
-if [ "$self_loop_scale" == "1.0" ] && [ "$acwt" == 0.1 ]; then
-  echo "$0: warning: you set --self-loop-scale=1.0 (so likely a chain system)",
-  echo " ... but the acwt is still 0.1 (you probably want --acwt 1.0)"
-  sleep 1
-fi
-
 ## Make the decoding graph.
 if [ $stage -le 0 ]; then
   new_lang="$dir/"$(basename "$lang")
@@ -183,7 +169,7 @@ if [ $stage -le 0 ]; then
     utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \
     || exit 1;
 
-  utils/mkgraph.sh --self-loop-scale $self_loop_scale $new_lang $srcdir $dir/dengraph || exit 1;
+  utils/mkgraph.sh $new_lang $srcdir $dir/dengraph || exit 1;
 fi
 
 # copy alignments into ark,scp format which allows us to use different num-jobs
diff --git a/egs/wsj/s5/steps/nnet3/make_denlats.sh b/egs/wsj/s5/steps/nnet3/make_denlats.sh
index 36da179bbaf..29df25cbf91 100755
--- a/egs/wsj/s5/steps/nnet3/make_denlats.sh
+++ b/egs/wsj/s5/steps/nnet3/make_denlats.sh
@@ -17,7 +17,6 @@ sub_split=1
 beam=13.0
 frames_per_chunk=50
 lattice_beam=7.0
-self_loop_scale=0.1
 acwt=0.1
 max_active=5000
 min_active=200
@@ -102,7 +101,7 @@ else
    awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
     utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \
     || exit 1;
-  utils/mkgraph.sh --self-loop-scale $self_loop_scale $new_lang $srcdir $dir/dengraph || exit 1;
+  utils/mkgraph.sh $new_lang $srcdir $dir/dengraph || exit 1;
 fi
 cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
 cp $srcdir/cmvn_opts $dir 2>/dev/null
diff --git a/egs/wsj/s5/steps/online/nnet2/align.sh b/egs/wsj/s5/steps/online/nnet2/align.sh
index c24bbf0291e..249947ecfd9 100755
--- a/egs/wsj/s5/steps/online/nnet2/align.sh
+++ b/egs/wsj/s5/steps/online/nnet2/align.sh
@@ -14,7 +14,7 @@
 nj=4
 cmd=run.pl
 # Begin configuration.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 iter=final
diff --git a/egs/wsj/s5/steps/segmentation/internal/prepare_sad_graph.py b/egs/wsj/s5/steps/segmentation/internal/prepare_sad_graph.py
index 12c9bb1e902..e1d6702212e 100755
--- a/egs/wsj/s5/steps/segmentation/internal/prepare_sad_graph.py
+++ b/egs/wsj/s5/steps/segmentation/internal/prepare_sad_graph.py
@@ -41,13 +41,6 @@ def get_args():
         duration constraint.""",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 
-    parser.add_argument("--transition-scale", type=float, default=1.0,
-                        help="""Scale on transition probabilities relative to
-                        LM weights""")
-    parser.add_argument("--loopscale", type=float, default=0.1,
-                        help="""Scale on self-loop log-probabilities relative
-                        to LM weights""")
-
     parser.add_argument("--min-silence-duration", type=float, default=0.03,
                         help="""Minimum duration for silence""")
     parser.add_argument("--min-speech-duration", type=float, default=0.3,
diff --git a/egs/wsj/s5/steps/tandem/align_fmllr.sh b/egs/wsj/s5/steps/tandem/align_fmllr.sh
index 0b012e24146..12526f6f792 100755
--- a/egs/wsj/s5/steps/tandem/align_fmllr.sh
+++ b/egs/wsj/s5/steps/tandem/align_fmllr.sh
@@ -19,7 +19,7 @@ nj=4
 cmd=run.pl
 use_graphs=false
 # Begin configuration.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 boost_silence=1.0 # factor by which to boost silence during alignment.
diff --git a/egs/wsj/s5/steps/tandem/align_sgmm2.sh b/egs/wsj/s5/steps/tandem/align_sgmm2.sh
index 48eb1fbef43..ab41834dfcb 100755
--- a/egs/wsj/s5/steps/tandem/align_sgmm2.sh
+++ b/egs/wsj/s5/steps/tandem/align_sgmm2.sh
@@ -19,7 +19,7 @@ use_gselect=false # use gselect info from srcdir [regardless, we use
    # Gaussian-selection info, we might have to compute it though.]
 gselect=15  # Number of Gaussian-selection indices for SGMMs.
 # Begin configuration.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 transform_dir=  # directory to find fMLLR transforms in.
diff --git a/egs/wsj/s5/steps/tandem/align_si.sh b/egs/wsj/s5/steps/tandem/align_si.sh
index 4e52c51e308..1cd9e534165 100755
--- a/egs/wsj/s5/steps/tandem/align_si.sh
+++ b/egs/wsj/s5/steps/tandem/align_si.sh
@@ -16,7 +16,7 @@ nj=4
 cmd=run.pl
 use_graphs=false
 # Begin configuration.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 boost_silence=1.0 # Factor by which to boost silence during alignment.
diff --git a/egs/wsj/s5/steps/tandem/train_deltas.sh b/egs/wsj/s5/steps/tandem/train_deltas.sh
index d6a1baa6623..70fb30dcb9c 100755
--- a/egs/wsj/s5/steps/tandem/train_deltas.sh
+++ b/egs/wsj/s5/steps/tandem/train_deltas.sh
@@ -8,7 +8,7 @@
 stage=-4 #  This allows restarting after partway, when something when wrong.
 config=
 cmd=run.pl
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 realign_iters="10 20 30";
 num_iters=35    # Number of iterations of training
 max_iter_inc=25 # Last iter to increase #Gauss on.
diff --git a/egs/wsj/s5/steps/tandem/train_lda_mllt.sh b/egs/wsj/s5/steps/tandem/train_lda_mllt.sh
index a5fa4ea8786..67ca80b11ff 100755
--- a/egs/wsj/s5/steps/tandem/train_lda_mllt.sh
+++ b/egs/wsj/s5/steps/tandem/train_lda_mllt.sh
@@ -8,7 +8,7 @@
 cmd=run.pl
 config=
 stage=-5
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 realign_iters="10 20 30";
 mllt_iters="2 4 6 12";
 num_iters=35    # Number of iterations of training
diff --git a/egs/wsj/s5/steps/tandem/train_mllt.sh b/egs/wsj/s5/steps/tandem/train_mllt.sh
index 7d46074baec..e8796c8f5db 100755
--- a/egs/wsj/s5/steps/tandem/train_mllt.sh
+++ b/egs/wsj/s5/steps/tandem/train_mllt.sh
@@ -12,7 +12,7 @@
 cmd=run.pl
 config=
 stage=-5
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 realign_iters="10 20 30";
 mllt_iters="2 4 6 12";
 num_iters=35    # Number of iterations of training
diff --git a/egs/wsj/s5/steps/tandem/train_mono.sh b/egs/wsj/s5/steps/tandem/train_mono.sh
index b5c55f6f369..486478709d6 100755
--- a/egs/wsj/s5/steps/tandem/train_mono.sh
+++ b/egs/wsj/s5/steps/tandem/train_mono.sh
@@ -11,7 +11,7 @@
 # Begin configuration section.
 nj=4
 cmd=run.pl
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 num_iters=40    # Number of iterations of training
 max_iter_inc=30 # Last iter to increase #Gauss on.
 totgauss=1000 # Target #Gaussians.  
diff --git a/egs/wsj/s5/steps/tandem/train_sat.sh b/egs/wsj/s5/steps/tandem/train_sat.sh
index 09e3f625674..2bfd2130b55 100755
--- a/egs/wsj/s5/steps/tandem/train_sat.sh
+++ b/egs/wsj/s5/steps/tandem/train_sat.sh
@@ -14,7 +14,7 @@
 stage=-5
 fmllr_update_type=full
 cmd=run.pl
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
diff --git a/egs/wsj/s5/steps/tandem/train_sgmm2.sh b/egs/wsj/s5/steps/tandem/train_sgmm2.sh
index daa0437b47b..2df69708c8a 100755
--- a/egs/wsj/s5/steps/tandem/train_sgmm2.sh
+++ b/egs/wsj/s5/steps/tandem/train_sgmm2.sh
@@ -16,7 +16,7 @@ cmd=run.pl
 stage=-6 # use this to resume partially finished training
 context_opts= # e.g. set it to "--context-width=5 --central-position=2"  for a
 # quinphone system.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 num_iters=25   # Total number of iterations of training
 num_iters_alimdl=3 # Number of iterations for estimating alignment model.
 max_iter_inc=15 # Last iter to increase #substates on.
diff --git a/egs/wsj/s5/steps/train_deltas.sh b/egs/wsj/s5/steps/train_deltas.sh
index 7deace6b13e..452cb2852cd 100755
--- a/egs/wsj/s5/steps/train_deltas.sh
+++ b/egs/wsj/s5/steps/train_deltas.sh
@@ -7,7 +7,7 @@
 stage=-4 #  This allows restarting after partway, when something when wrong.
 config=
 cmd=run.pl
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 realign_iters="10 20 30";
 num_iters=35    # Number of iterations of training
 max_iter_inc=25 # Last iter to increase #Gauss on.
diff --git a/egs/wsj/s5/steps/train_lda_mllt.sh b/egs/wsj/s5/steps/train_lda_mllt.sh
index a1828aa6fcb..60ce32bbd35 100755
--- a/egs/wsj/s5/steps/train_lda_mllt.sh
+++ b/egs/wsj/s5/steps/train_lda_mllt.sh
@@ -14,7 +14,7 @@
 cmd=run.pl
 config=
 stage=-5
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 realign_iters="10 20 30";
 mllt_iters="2 4 6 12";
 num_iters=35    # Number of iterations of training
diff --git a/egs/wsj/s5/steps/train_lvtln.sh b/egs/wsj/s5/steps/train_lvtln.sh
index 111e0598edf..264171da00a 100755
--- a/egs/wsj/s5/steps/train_lvtln.sh
+++ b/egs/wsj/s5/steps/train_lvtln.sh
@@ -17,7 +17,7 @@
 stage=-6 #  This allows restarting after partway, when something when wrong.
 config=
 cmd=run.pl
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 realign_iters="10 20 30";
 num_iters=35    # Number of iterations of training
 max_iter_inc=25 # Last iter to increase #Gauss on.
diff --git a/egs/wsj/s5/steps/train_mono.sh b/egs/wsj/s5/steps/train_mono.sh
index 5a0b79a4a1c..3b4744db2ed 100755
--- a/egs/wsj/s5/steps/train_mono.sh
+++ b/egs/wsj/s5/steps/train_mono.sh
@@ -11,7 +11,7 @@
 # Begin configuration section.
 nj=4
 cmd=run.pl
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 num_iters=40    # Number of iterations of training
 max_iter_inc=30 # Last iter to increase #Gauss on.
 initial_beam=6 # beam used in the first iteration (set smaller to speed up initialization)
diff --git a/egs/wsj/s5/steps/train_quick.sh b/egs/wsj/s5/steps/train_quick.sh
index 3325c4964e9..4e3c807484a 100755
--- a/egs/wsj/s5/steps/train_quick.sh
+++ b/egs/wsj/s5/steps/train_quick.sh
@@ -10,7 +10,7 @@
 
 # Begin configuration..
 cmd=run.pl
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 realign_iters="10 15"; # Only realign twice.
 num_iters=20    # Number of iterations of training
 maxiterinc=15 # Last iter to increase #Gauss on.
diff --git a/egs/wsj/s5/steps/train_raw_sat.sh b/egs/wsj/s5/steps/train_raw_sat.sh
index aa5e8813d71..615988096e7 100755
--- a/egs/wsj/s5/steps/train_raw_sat.sh
+++ b/egs/wsj/s5/steps/train_raw_sat.sh
@@ -14,7 +14,7 @@
 # Begin configuration section.
 stage=-6
 cmd=run.pl
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
diff --git a/egs/wsj/s5/steps/train_sat.sh b/egs/wsj/s5/steps/train_sat.sh
index 92b744dc75c..4219b52f804 100755
--- a/egs/wsj/s5/steps/train_sat.sh
+++ b/egs/wsj/s5/steps/train_sat.sh
@@ -17,7 +17,7 @@ exit_stage=-100 # you can use this to require it to exit at the
                 # supported.
 fmllr_update_type=full
 cmd=run.pl
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 careful=false
diff --git a/egs/wsj/s5/steps/train_sat_basis.sh b/egs/wsj/s5/steps/train_sat_basis.sh
index 5245ea0c619..c2b5591a773 100755
--- a/egs/wsj/s5/steps/train_sat_basis.sh
+++ b/egs/wsj/s5/steps/train_sat_basis.sh
@@ -13,7 +13,7 @@
 # Begin configuration section.
 stage=-5
 cmd=run.pl
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
diff --git a/egs/wsj/s5/steps/train_segmenter.sh b/egs/wsj/s5/steps/train_segmenter.sh
index 515005c0257..64d006c2e75 100755
--- a/egs/wsj/s5/steps/train_segmenter.sh
+++ b/egs/wsj/s5/steps/train_segmenter.sh
@@ -8,7 +8,7 @@ stage=-4 # For restarting a process that went part way.
 config=
 cmd=run.pl
 
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 realign_iters="10 20 30";
 num_iters=35    # Number of iterations of training
 max_iter_inc=25 # Last iter to increase #Gauss on.
diff --git a/egs/wsj/s5/steps/train_sgmm2.sh b/egs/wsj/s5/steps/train_sgmm2.sh
index 7f7df2e046a..812387599af 100755
--- a/egs/wsj/s5/steps/train_sgmm2.sh
+++ b/egs/wsj/s5/steps/train_sgmm2.sh
@@ -14,7 +14,7 @@ cmd=run.pl
 stage=-6 # use this to resume partially finished training
 context_opts= # e.g. set it to "--context-width=5 --central-position=2"  for a
 # quinphone system.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 num_iters=25   # Total number of iterations of training
 num_iters_alimdl=3 # Number of iterations for estimating alignment model.
 max_iter_inc=15 # Last iter to increase #substates on.
diff --git a/egs/wsj/s5/steps/train_sgmm2_group.sh b/egs/wsj/s5/steps/train_sgmm2_group.sh
index 7263e2d5e8e..59cfb51e9ab 100755
--- a/egs/wsj/s5/steps/train_sgmm2_group.sh
+++ b/egs/wsj/s5/steps/train_sgmm2_group.sh
@@ -17,7 +17,7 @@ cmd=run.pl
 stage=-6 # use this to resume partially finished training
 context_opts= # e.g. set it to "--context-width=5 --central-position=2"  for a
 # quinphone system.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 num_iters=25   # Total number of iterations of training
 num_iters_alimdl=3 # Number of iterations for estimating alignment model.
 max_iter_inc=15 # Last iter to increase #substates on.
diff --git a/egs/wsj/s5/utils/gen_topo.pl b/egs/wsj/s5/utils/gen_topo.pl
index 1c02ed0eaeb..896f41d870d 100755
--- a/egs/wsj/s5/utils/gen_topo.pl
+++ b/egs/wsj/s5/utils/gen_topo.pl
@@ -4,6 +4,9 @@
 
 # Generate a topology file.  This allows control of the number of states in the
 # non-silence HMMs, and in the silence HMMs.
+# This is the topology we use for GMM training, which is, when configured
+# with 3 states, the Bakis model.  For chain (lattice-free MMI) training, see
+# steps/chain/gen_topo.pl.
 
 if (@ARGV != 4) {
   print STDERR "Usage: utils/gen_topo.pl <num-nonsilence-states> <num-silence-states> <colon-separated-nonsilence-phones> <colon-separated-silence-phones>\n";
@@ -28,52 +31,75 @@
 print "<ForPhones>\n";
 print "$nonsil_phones\n";
 print "</ForPhones>\n";
-for ($state = 0; $state < $num_nonsil_states; $state++) {
-  $statep1 = $state+1;
-  print "<State> $state <PdfClass> $state <Transition> $state 0.75 <Transition> $statep1 0.25 </State>\n";
+# The following is the single transition leaving the start-state.  It has
+# pdf-class 1, corresponding to state 1 which it enters.  The cost is 0.0 =
+# -log(1); there is only one choice here.  Note: there are actually
+# $num_nonsil_states + 1 states, but in HMM terms it's equivalent to
+# $num_nonsil_states states; and that's the length of the shortest successful path.
+print "0  1  1  0.0\n";
+for ($state = 1; $state <= $num_nonsil_states; $state++) {
+  $pdf_class = $state;
+  $next_state = $state + 1;
+  $next_pdf_class = $next_state;
+  # self-loop.
+  print "$state $state $pdf_class 0.6931471806\n";
+  if ($next_state <= $num_nonsil_states) {
+    print "$state $next_state $next_pdf_class 0.6931471806\n";
+  } else {
+    print "$state 0.6931471806\n";  # final-prob.
+  }
 }
-print "<State> $num_nonsil_states </State>\n"; # non-emitting final state.
+print "\n";  # terminate the FSA.. empty line marks its end.
 print "</TopologyEntry>\n";
 # Now silence phones.  They have a different topology-- apart from the first and
 # last states, it's fully connected, as long as you have >= 3 states.
 
+print "<TopologyEntry>\n";
+print "<ForPhones>\n";
+print "$sil_phones\n";
+print "</ForPhones>\n";
+
+
+print "0  1  1  0.0\n";
 if ($num_sil_states > 1) {
-  $transp = 1.0 / ($num_sil_states-1);
-  print "<TopologyEntry>\n";
-  print "<ForPhones>\n";
-  print "$sil_phones\n";
-  print "</ForPhones>\n";
-  print "<State> 0 <PdfClass> 0 ";
-  for ($nextstate = 0; $nextstate < $num_sil_states-1; $nextstate++) { # Transitions to all but last
-    # emitting state.
-    print "<Transition> $nextstate $transp ";
+  # Note: actually it must be >= 3, we checked this above;
+  # 2 is disallowed (I know, it's odd).
+  # Also note: $num_sil_states is not actually the number of states
+  # in the FSA; it's the number of states in its HMM equivalent.
+  # the FSA has one extra state, state 0.
+  # we'll treat the final state, numbered $num_sil_states,
+  # separately; it doesn't have the transition back to
+  # lower-numbered states.
+
+  $self_loop_cost = 0.6931471806;  # -log(0.5)
+  $non_self_loop_cost = -log(0.5 / ($num_sil_states - 2));
+
+  $state = 1;
+  $pdf_id = $state;
+  print "$state  $state  $pdf_id  $self_loop_cost\n";
+  for ($next_state = 2; $next_state < $num_sil_states; $next_state++) {
+    $next_pdf_id = $next_state;
+    print "$state  $next_state  $next_pdf_id  $non_self_loop_cost\n";
   }
-  print "</State>\n";
-  for ($state = 1; $state < $num_sil_states-1; $state++) { # the central states all have transitions to
-    # themselves and to the last emitting state.
-    print "<State> $state <PdfClass> $state ";
-    for ($nextstate = 1; $nextstate < $num_sil_states; $nextstate++) {
-      print "<Transition> $nextstate $transp ";
+
+  for ($state = 2; $state < $num_sil_states; $state++) {
+    $pdf_id = $state;
+    for ($next_state = 2; $next_state <= $num_sil_states; $next_state++) {
+      my $cost = ($next_state == $state ? $self_loop_cost : $non_self_loop_cost);
+      $next_pdf_id = $next_state;
+      print "$state  $next_state  $next_pdf_id  $cost\n";
     }
-    print "</State>\n";
   }
-  # Final emitting state (non-skippable).
-  $state = $num_sil_states-1;
-  print "<State> $state <PdfClass> $state <Transition> $state 0.75 <Transition> $num_sil_states 0.25 </State>\n";
-  # Final nonemitting state:
-  print "<State> $num_sil_states </State>\n";
-  print "</TopologyEntry>\n";
+  $final_state = $num_sil_states;
+  $pdf_id = $final_state;
+  print "$final_state  $final_state  $pdf_id  $self_loop_cost\n";
+  print "$final_state 0.6931471806\n";
+  print "\n";
 } else {
-  print "<TopologyEntry>\n";
-  print "<ForPhones>\n";
-  print "$sil_phones\n";
-  print "</ForPhones>\n";
-  print "<State> 0 <PdfClass> 0 ";
-  print "<Transition> 0 0.75 ";
-  print "<Transition> 1 0.25 ";
-  print "</State>\n";
-  print "<State> $num_sil_states </State>\n"; # non-emitting final state.
-  print "</TopologyEntry>\n";
+  # Single-state silence: the start arc "0 1 1 0.0" was already printed before this if/else, so state 0 gets no self-loop.
+  print "1  1  1  0.6931471806\n";
+  print "1  0.6931471806\n";
+  print "\n";
 }
-
+print "</TopologyEntry>\n";
 print "</Topology>\n";
diff --git a/egs/wsj/s5/utils/mkgraph.sh b/egs/wsj/s5/utils/mkgraph.sh
index 31e86cd38f6..8346c69ffb7 100755
--- a/egs/wsj/s5/utils/mkgraph.sh
+++ b/egs/wsj/s5/utils/mkgraph.sh
@@ -15,17 +15,12 @@
 
 set -o pipefail
 
-tscale=1.0
-loopscale=0.1
-
 remove_oov=false
 
 for x in `seq 4`; do
   [ "$1" == "--mono" -o "$1" == "--left-biphone" -o "$1" == "--quinphone" ] && shift && \
     echo "WARNING: the --mono, --left-biphone and --quinphone options are now deprecated and ignored."
   [ "$1" == "--remove-oov" ] && remove_oov=true && shift;
-  [ "$1" == "--transition-scale" ] && tscale=$2 && shift 2;
-  [ "$1" == "--self-loop-scale" ] && loopscale=$2 && shift 2;
 done
 
 if [ $# != 3 ]; then
@@ -34,8 +29,6 @@ if [ $# != 3 ]; then
    echo " Options:"
    echo " --remove-oov       #  If true, any paths containing the OOV symbol (obtained from oov.int"
    echo "                    #  in the lang directory) are removed from the G.fst during compilation."
-   echo " --transition-scale #  Scaling factor on transition probabilities."
-   echo " --self-loop-scale  #  Please see: http://kaldi-asr.org/doc/hmm.html#hmm_scale."
    echo "Note: the --mono, --left-biphone and --quinphone options are now deprecated"
    echo "and will be ignored."
    exit 1;
@@ -75,8 +68,6 @@ fi
 N=$(tree-info $tree | grep "context-width" | cut -d' ' -f2) || { echo "Error when getting context-width"; exit 1; }
 P=$(tree-info $tree | grep "central-position" | cut -d' ' -f2) || { echo "Error when getting central-position"; exit 1; }
 
-[[ -f $2/frame_subsampling_factor && "$loopscale" == "0.1" ]] && \
-  echo "$0: WARNING: chain models need '--self-loop-scale 1.0'";
 
 if [ -f $lang/phones/nonterm_phones_offset.int ]; then
   if [[ $N != 2  || $P != 1 ]]; then
@@ -124,7 +115,7 @@ trap "rm -f $dir/Ha.fst.$$" EXIT HUP INT PIPE TERM
 if [[ ! -s $dir/Ha.fst || $dir/Ha.fst -ot $model  \
     || $dir/Ha.fst -ot $lang/tmp/ilabels_${N}_${P} ]]; then
   make-h-transducer $nonterm_opt --disambig-syms-out=$dir/disambig_tid.int \
-    --transition-scale=$tscale $lang/tmp/ilabels_${N}_${P} $tree $model \
+     $lang/tmp/ilabels_${N}_${P} $tree $model \
      > $dir/Ha.fst.$$  || exit 1;
   mv $dir/Ha.fst.$$ $dir/Ha.fst
 fi
@@ -146,14 +137,11 @@ fi
 
 trap "rm -f $dir/HCLG.fst.$$" EXIT HUP INT PIPE TERM
 if [[ ! -s $dir/HCLG.fst || $dir/HCLG.fst -ot $dir/HCLGa.fst ]]; then
-  add-self-loops --self-loop-scale=$loopscale --reorder=true $model $dir/HCLGa.fst | \
+  add-self-loops $model $dir/HCLGa.fst | \
     $prepare_grammar_command | \
     fstconvert --fst_type=const > $dir/HCLG.fst.$$ || exit 1;
   mv $dir/HCLG.fst.$$ $dir/HCLG.fst
-  if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then
-    # No point doing this test if transition-scale not 1, as it is bound to fail.
-    fstisstochastic $dir/HCLG.fst || echo "[info]: final HCLG is not stochastic."
-  fi
+  fstisstochastic $dir/HCLG.fst || echo "[info]: final HCLG is not stochastic."
 fi
 
 # note: the empty FST has 66 bytes.  this check is for whether the final FST
diff --git a/egs/wsj/s5/utils/validate_lang.pl b/egs/wsj/s5/utils/validate_lang.pl
index 8dba2a0ca69..88230d428e1 100755
--- a/egs/wsj/s5/utils/validate_lang.pl
+++ b/egs/wsj/s5/utils/validate_lang.pl
@@ -627,19 +627,20 @@ sub check_summation {
   %phones_in_topo_int_hash = ( );
   %phones_in_topo_hash = ( );
   while (<T>) {
-    chomp;
-    next if (m/^<.*>[ ]*$/);
-    foreach $i (split(" ", $_)) {
-      if (defined $phones_in_topo_int_hash{$i}) {
-        $topo_ok = 0;
-        $exit = 1; print "--> ERROR: $lang/topo has phone $i twice\n";
-      }
-      if (!defined $pint2sym{$i}) {
-        $topo_ok = 0;
-        $exit = 1; print "--> ERROR: $lang/topo has phone $i which is not in phones.txt\n";
+    if (m/<ForPhones>/) {
+      my $line = <T>;
+      foreach $phone (split(" ", $line)) {
+        if (defined $phones_in_topo_int_hash{$phone}) {
+          $topo_ok = 0;
+          $exit = 1; print "--> ERROR: $lang/topo has phone $phone twice\n";
+        }
+        if (!defined $pint2sym{$phone}) {
+          $topo_ok = 0;
+          $exit = 1; print "--> ERROR: $lang/topo has phone $phone which is not in phones.txt\n";
+        }
+        $phones_in_topo_int_hash{$phone} = 1;
+        $phones_in_topo_hash{$pint2sym{$phone}} = 1;
       }
-      $phones_in_topo_int_hash{$i} = 1;
-      $phones_in_topo_hash{$pint2sym{$i}} = 1;
     }
   }
   close(T);
@@ -816,8 +817,8 @@ sub check_summation {
 
 # Check validity of L.fst, L_disambig.fst, and word_boundary.int.
 # First we generate a random word/subword sequence. We then compile it into fst and compose it with L.fst/L_disambig.fst.
-# For subword case the last subword of the sequence must be a end-subword 
-# (i.e. the subword can only be at the end of word or is a single word itself) 
+# For subword case the last subword of the sequence must be a end-subword
+# (i.e. the subword can only be at the end of word or is a single word itself)
 # to guarantee the composition would not fail.
 # We then get the corresponging phones sequence and apply a transition matrix on it to get the number of valid boundaries.
 # In word case, the number of valid boundaries should be equal to the number of words.
@@ -883,14 +884,14 @@ sub check_summation {
           $end_subword ++;
         }
       }
-    } 
+    }
 
     # generate the last word (subword)
     $id = int(rand(scalar(keys %wint2sym)));
     if ($subword_check) {
       $subword = $wint2sym{$id};
       $suffix = substr($subword, -$separator_length, $separator_length);
-      # the last subword can not followed by separator  
+      # the last subword can not followed by separator
       while (defined $wdisambig_words_hash{$id} or
            $wint2sym{$id} eq "<s>" or $wint2sym{$id} eq "</s>" or
            $wint2sym{$id} =~ m/^#nonterm/ or $id == 0 or $suffix eq $separator) {
@@ -952,7 +953,7 @@ sub check_summation {
       }
     }
     if (!$exit) {
-      if ($subword_check) { 
+      if ($subword_check) {
         $wlen = $end_subword;
       }
       if ($num_words != $wlen) {
diff --git a/egs/yomdle_fa/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/yomdle_fa/v1/local/chain/run_cnn_e2eali_1b.sh
index 700b57d9fce..90fae4d4015 100755
--- a/egs/yomdle_fa/v1/local/chain/run_cnn_e2eali_1b.sh
+++ b/egs/yomdle_fa/v1/local/chain/run_cnn_e2eali_1b.sh
@@ -102,7 +102,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} $data_dir/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 
@@ -227,7 +227,7 @@ if [ $stage -le 6 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $data_dir/$lang_test \
+    $data_dir/$lang_test \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/yomdle_fa/v1/local/chain/run_flatstart_cnn1a.sh b/egs/yomdle_fa/v1/local/chain/run_flatstart_cnn1a.sh
index bb5352943f6..6adde439b00 100755
--- a/egs/yomdle_fa/v1/local/chain/run_flatstart_cnn1a.sh
+++ b/egs/yomdle_fa/v1/local/chain/run_flatstart_cnn1a.sh
@@ -155,7 +155,7 @@ if [ $stage -le 4 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $data_dir/$lang_test \
+    $data_dir/$lang_test \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/yomdle_fa/v1/run.sh b/egs/yomdle_fa/v1/run.sh
index a7547b1ee69..da75679a8b5 100755
--- a/egs/yomdle_fa/v1/run.sh
+++ b/egs/yomdle_fa/v1/run.sh
@@ -99,7 +99,7 @@ if [ $stage -le 6 ]; then
     echo "$0: Aligning the training data using the e2e chain model..."
     echo "Date: $(date)."
     steps/nnet3/align.sh --nj $nj --cmd "$cmd" \
-        --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \
+        --scale-opts '--acoustic-scale=1.0' \
         $data_dir/train_aug $data_dir/lang $exp_dir/chain/e2e_cnn_1a $exp_dir/chain/e2e_ali_train
 fi
 
diff --git a/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
index 03333f6d229..ad00b8d4774 100755
--- a/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
+++ b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
@@ -98,7 +98,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 fi
@@ -216,7 +216,7 @@ if [ $stage -le 6 ] && $decode_chain; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh
index fd9cdc8921d..3e9197e7e42 100755
--- a/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh
+++ b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh
@@ -95,7 +95,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 fi
diff --git a/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh
index f6b2c1bac42..5fa8d3a0d29 100755
--- a/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh
+++ b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh
@@ -100,7 +100,7 @@ for f in data/$supervised_set/feats.scp \
 done
 
 if [ ! -f $graphdir/HCLG.fst ]; then
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $sup_chain_dir $graphdir
+  utils/mkgraph.sh $lang_decode $sup_chain_dir $graphdir
 fi
 
 # Decode unsupervised data and write lattices in non-compact
@@ -312,7 +312,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $dir $dir/graph
+  utils/mkgraph.sh $lang_decode $dir $dir/graph
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh
index 8185fa2645d..cef080071b1 100755
--- a/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh
+++ b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh
@@ -99,7 +99,7 @@ for f in data/$supervised_set/feats.scp \
 done
 
 if [ ! -f $graphdir/HCLG.fst ]; then
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $sup_chain_dir $graphdir
+  utils/mkgraph.sh $lang_decode $sup_chain_dir $graphdir
 fi
 
 # Decode unsupervised data and write lattices in non-compact
@@ -310,7 +310,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $dir $dir/graph
+  utils/mkgraph.sh $lang_decode $dir $dir/graph
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/yomdle_korean/v1/run_end2end.sh b/egs/yomdle_korean/v1/run_end2end.sh
index 65f5beb4b08..193e6eebff3 100755
--- a/egs/yomdle_korean/v1/run_end2end.sh
+++ b/egs/yomdle_korean/v1/run_end2end.sh
@@ -127,7 +127,7 @@ fi
 if [ $stage -le 7 ]; then
   echo "$(date) stage 7: Aligning the training data using the e2e chain model..."
   steps/nnet3/align.sh --nj $nj --cmd "$cmd" \
-    --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \
+    --scale-opts '--acoustic-scale=1.0' \
     data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train
 fi
 
@@ -152,7 +152,7 @@ if [ $stage -le 10 ] && $decode_e2e; then
   echo "$(date) stage 10: decoding end2end setup..."
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1;
 
   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
@@ -170,7 +170,7 @@ if [ $stage -le 11 ] && $decode_chain; then
   echo "$(date) stage 11: decoding chain alignment setup..."
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     exp/chain/cnn_e2eali_1a/ exp/chain/cnn_e2eali_1a/graph || exit 1;
 
   frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
diff --git a/egs/yomdle_russian/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/yomdle_russian/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
index cd582472993..969f50dc857 100755
--- a/egs/yomdle_russian/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
+++ b/egs/yomdle_russian/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
@@ -90,7 +90,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 fi
diff --git a/egs/yomdle_russian/v1/run_end2end.sh b/egs/yomdle_russian/v1/run_end2end.sh
index 12beebeaa05..03525a22d54 100755
--- a/egs/yomdle_russian/v1/run_end2end.sh
+++ b/egs/yomdle_russian/v1/run_end2end.sh
@@ -127,7 +127,7 @@ fi
 if [ $stage -le 7 ]; then
   echo "$0: $(date) stage 7: Aligning the training data using the e2e chain model..."
   steps/nnet3/align.sh --nj $nj --cmd "$cmd" \
-    --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \
+    --scale-opts '--acoustic-scale=1.0' \
     data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train
 fi
 
@@ -152,7 +152,7 @@ if [ $stage -le 10 ] && $decode_e2e; then
   echo "$0: $(date) stage 10: decoding end2end setup..."
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1;
 
   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
@@ -170,7 +170,7 @@ if [ $stage -le 11 ] && $decode_chain; then
   echo "$0: $(date) stage 11: decoding chain alignment setup..."
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     exp/chain/cnn_e2eali_1a/ exp/chain/cnn_e2eali_1a/graph || exit 1;
 
   frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
diff --git a/egs/yomdle_tamil/v1/local/chain/run_e2e_cnn.sh b/egs/yomdle_tamil/v1/local/chain/run_e2e_cnn.sh
index f553467d4a6..7145dd365a4 100755
--- a/egs/yomdle_tamil/v1/local/chain/run_e2e_cnn.sh
+++ b/egs/yomdle_tamil/v1/local/chain/run_e2e_cnn.sh
@@ -141,7 +141,7 @@ if [ $stage -le 4 ] && $decode_e2e; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
index 03333f6d229..ad00b8d4774 100755
--- a/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
+++ b/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
@@ -98,7 +98,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 fi
@@ -216,7 +216,7 @@ if [ $stage -le 6 ] && $decode_chain; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1b.sh
index fb15ce10dde..a531d966dad 100755
--- a/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1b.sh
+++ b/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1b.sh
@@ -96,7 +96,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} data/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 fi
@@ -216,7 +216,7 @@ if [ $stage -le 6 ] && $decode_chain; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $lang_decode \
+    $lang_decode \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh b/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh
index f6b2c1bac42..5fa8d3a0d29 100755
--- a/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh
+++ b/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh
@@ -100,7 +100,7 @@ for f in data/$supervised_set/feats.scp \
 done
 
 if [ ! -f $graphdir/HCLG.fst ]; then
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $sup_chain_dir $graphdir
+  utils/mkgraph.sh $lang_decode $sup_chain_dir $graphdir
 fi
 
 # Decode unsupervised data and write lattices in non-compact
@@ -312,7 +312,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $dir $dir/graph
+  utils/mkgraph.sh $lang_decode $dir $dir/graph
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh b/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh
index 17d59642b05..dae34d51f20 100755
--- a/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh
+++ b/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh
@@ -99,7 +99,7 @@ for f in data/$supervised_set/feats.scp \
 done
 
 if [ ! -f $graphdir/HCLG.fst ]; then
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $sup_chain_dir $graphdir
+  utils/mkgraph.sh $lang_decode $sup_chain_dir $graphdir
 fi
 
 # Decode unsupervised data and write lattices in non-compact
@@ -308,7 +308,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $dir $dir/graph
+  utils/mkgraph.sh $lang_decode $dir $dir/graph
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/yomdle_tamil/v1/run_end2end.sh b/egs/yomdle_tamil/v1/run_end2end.sh
index e6a8e0a4432..55a4d7bc83d 100755
--- a/egs/yomdle_tamil/v1/run_end2end.sh
+++ b/egs/yomdle_tamil/v1/run_end2end.sh
@@ -155,7 +155,7 @@ if [ $stage -le 8 ]; then
   echo "$(date) stage 8: Aligning the training data using the e2e chain model..."
   steps/nnet3/align.sh --nj $nj --cmd "$cmd" \
     --use-gpu false \
-    --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \
+    --scale-opts '--acoustic-scale=1.0' \
     data/train_aug data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train
 fi
 
diff --git a/egs/yomdle_zh/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/yomdle_zh/v1/local/chain/run_cnn_e2eali_1b.sh
index 0a4e00d7aed..e9f16063484 100755
--- a/egs/yomdle_zh/v1/local/chain/run_cnn_e2eali_1b.sh
+++ b/egs/yomdle_zh/v1/local/chain/run_cnn_e2eali_1b.sh
@@ -101,7 +101,7 @@ if [ $stage -le 2 ]; then
   # use the same num-jobs as the alignments
   steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
                             --acoustic-scale 1.0 \
-                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            \
                             ${train_data_dir} $data_dir/lang $e2echain_model_dir $lat_dir
   echo "" >$lat_dir/splice_opts
 
@@ -228,7 +228,7 @@ if [ $stage -le 6 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $data_dir/$lang_test \
+    $data_dir/$lang_test \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/yomdle_zh/v1/local/chain/run_flatstart_cnn1a.sh b/egs/yomdle_zh/v1/local/chain/run_flatstart_cnn1a.sh
index 88bbd32790c..2823b5f2ada 100755
--- a/egs/yomdle_zh/v1/local/chain/run_flatstart_cnn1a.sh
+++ b/egs/yomdle_zh/v1/local/chain/run_flatstart_cnn1a.sh
@@ -154,7 +154,7 @@ if [ $stage -le 4 ]; then
   # as long as phones.txt was compatible.
 
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 $data_dir/$lang_test \
+    $data_dir/$lang_test \
     $dir $dir/graph || exit 1;
 fi
 
diff --git a/egs/yomdle_zh/v1/run.sh b/egs/yomdle_zh/v1/run.sh
index 128f15694cc..2d649fd192a 100755
--- a/egs/yomdle_zh/v1/run.sh
+++ b/egs/yomdle_zh/v1/run.sh
@@ -99,7 +99,7 @@ if [ $stage -le 6 ]; then
     echo "$0: Aligning the training data using the e2e chain model..."
     echo "Date: $(date)."
     steps/nnet3/align.sh --nj $nj --cmd "$cmd" --use-gpu false \
-        --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \
+        --scale-opts '--acoustic-scale=1.0' \
         $data_dir/train_aug $data_dir/lang $exp_dir/chain/e2e_cnn_1a $exp_dir/chain/e2e_ali_train
 fi
 
diff --git a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh
index 14b9a8d6c8e..02706d98602 100755
--- a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -252,7 +252,7 @@ if [ $stage -le 13 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test_tgsmall/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgsmall \
+    data/lang_test_tgsmall \
     $tree_dir $tree_dir/graph_tgsmall || exit 1;
 fi
 
diff --git a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh
index 28b36243ba3..5372a5862fa 100755
--- a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh
+++ b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh
@@ -260,7 +260,7 @@ if [ $stage -le 13 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test_tgsmall/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang_test_tgsmall \
+    data/lang_test_tgsmall \
     $tree_dir $tree_dir/graph_tgsmall || exit 1;
 fi
 

From f4b8f53741deb3564cd7bf31ec036467a4696f8c Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 10 Jul 2019 17:35:53 -0400
Subject: [PATCH 147/163] [src] Various fixes

---
 src/Makefile                         | 47 ++++++++++++++--------------
 src/bin/Makefile                     |  9 ++----
 src/bin/compile-graph.cc             | 13 ++------
 src/bin/compile-train-graphs-fsts.cc |  7 ++---
 src/bin/compile-train-graphs.cc      |  8 ++---
 src/feat/feature-fbank.cc            |  2 +-
 src/gmmbin/gmm-est.cc                |  7 ++---
 src/gmmbin/gmm-sum-accs.cc           | 10 ++----
 src/hmm/topology.cc                  |  8 ++---
 9 files changed, 43 insertions(+), 68 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 1e128b6e5cc..82102714811 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -134,29 +134,30 @@ bin fstbin gmmbin fgmmbin featbin cudafeatbin nnet3bin chainbin latbin ivectorbi
 
 #2)The libraries have inter-dependencies
 base: base/.depend.mk
-matrix: base
-util: base cudamatrix matrix
-feat: base cudamatrix matrix util gmm transform tree
-tree: base util matrix
-gmm: base util matrix tree
-transform: base util matrix gmm tree
-fstext: base util matrix tree
-hmm: base tree matrix util
-lm: base util matrix fstext
-decoder: base util matrix gmm hmm tree transform lat
-lat: base util hmm tree matrix
-cudamatrix: base util matrix
-nnet3: base util matrix decoder lat gmm hmm tree transform cudamatrix chain fstext
-rnnlm: base util matrix cudamatrix nnet3 lm hmm
-chain: lat hmm tree fstext matrix cudamatrix util base
-ivector: base util matrix transform tree gmm
+cblasext: base
+matrix: base cblasext
+util: base matrix cblasext
+feat: base cudamatrix matrix cblasext util gmm transform tree
+tree: base util matrix cblasext
+gmm: base util matrix cblasext tree
+transform: base util matrix cblasext gmm tree
+fstext: base util matrix cblasext tree
+hmm: base tree matrix cblasext util
+lm: base util matrix cblasext fstext
+decoder: base util matrix cblasext gmm hmm tree transform lat fstext
+lat: base util hmm tree matrix cblasext
+cudamatrix: base util matrix cblasext
+nnet3: base util matrix cblasext decoder lat gmm hmm tree transform cudamatrix chain fstext
+rnnlm: base util matrix cblasext cudamatrix nnet3 lm hmm
+chain: lat hmm tree fstext matrix cblasext cudamatrix util base
+ivector: base util matrix cblasext transform tree gmm
 #3)Dependencies for optional parts of Kaldi
-onlinebin: base cudamatrix matrix util feat tree gmm transform fstext hmm lm decoder lat cudamatrix online
-# python-kaldi-decoding: base cudamatrix matrix util feat tree gmm transform fstext hmm decoder lat online
-cudafeat: base cudamatrix matrix util gmm transform tree feat cudamatrix online2
-cudafeatbin: base cudamatrix matrix util gmm transform tree feat cudamatrix cudafeat online2
-online: decoder gmm transform feat matrix util base lat hmm tree
-online2: decoder gmm transform feat matrix util base lat hmm tree ivector cudamatrix nnet3 chain
-kws: base util hmm tree matrix lat
+onlinebin: base cudamatrix matrix cblasext util feat tree gmm transform fstext hmm lm decoder lat cudamatrix online
+# python-kaldi-decoding: base cudamatrix matrix cblasext util feat tree gmm transform fstext hmm decoder lat online
+cudafeat: base cudamatrix matrix cblasext util gmm transform tree feat cudamatrix online2
+cudafeatbin: base cudamatrix matrix cblasext util gmm transform tree feat cudamatrix cudafeat online2
+online: decoder gmm transform feat matrix cblasext util base lat hmm tree
+online2: decoder gmm transform feat matrix cblasext util base lat hmm tree ivector cudamatrix nnet3 chain
+kws: base util hmm tree matrix cblasext lat
 cudadecoder:  cudamatrix cudafeat online2 nnet3 ivector feat fstext lat chain transform
 cudadecoderbin: cudadecoder cudafeat cudamatrix online2 nnet3 ivector feat fstext lat chain transform
diff --git a/src/bin/Makefile b/src/bin/Makefile
index c088e4da76b..855a43bf350 100644
--- a/src/bin/Makefile
+++ b/src/bin/Makefile
@@ -1,6 +1,5 @@
 
 all:
-	-rm -f arpa2fst
 EXTRA_CXXFLAGS = -Wno-sign-compare
 include ../kaldi.mk
 
@@ -34,10 +33,6 @@ ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../lm/kaldi-lm.a \
           ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
           ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a
 
+TESTFILES =
 
-#LDLIBS += $(CUDA_LDLIBS)
-
-
-# # TESTFILES =
-
-# # include ../makefiles/default_rules.mk
+include ../makefiles/default_rules.mk
diff --git a/src/bin/compile-graph.cc b/src/bin/compile-graph.cc
index 9125c67ffb3..dea332aced0 100644
--- a/src/bin/compile-graph.cc
+++ b/src/bin/compile-graph.cc
@@ -48,19 +48,12 @@ int main(int argc, char *argv[]) {
     ParseOptions po(usage);
 
 
-    BaseFloat transition_scale = 1.0;
-    BaseFloat self_loop_scale = 1.0;  // Caution: the script default is 0.1.
     int32 nonterm_phones_offset = -1;
     std::string disambig_rxfilename;
 
 
     po.Register("read-disambig-syms", &disambig_rxfilename, "File containing "
                 "list of disambiguation symbols in phone symbol table");
-    po.Register("transition-scale", &transition_scale, "Scale of transition "
-                "probabilities (excluding self-loops).");
-    po.Register("self-loop-scale", &self_loop_scale, "Scale of self-loop vs. "
-                "non-self-loop probability mass.  Caution: the default of "
-                "mkgraph.sh is 0.1, but this defaults to 1.0.");
     po.Register("nonterm-phones-offset", &nonterm_phones_offset, "Integer "
                 "value of symbol #nonterm_bos in phones.txt, if present. "
                 "(Only relevant for grammar decoding).");
@@ -141,7 +134,6 @@ int main(int argc, char *argv[]) {
     lg_fst.DeleteStates();
 
     HTransducerConfig h_cfg;
-    h_cfg.transition_scale = transition_scale;
     h_cfg.nonterm_phones_offset = nonterm_phones_offset;
     std::vector<int32> disambig_syms_h; // disambiguation symbols on
                                         // input side of H.
@@ -169,11 +161,12 @@ int main(int argc, char *argv[]) {
     MinimizeEncoded(&hclg_fst);
 
     std::vector<int32> disambig;
-    bool currently_self_loop_free = true;
+    bool currently_self_loop_free = true,
+        use_weights = true;
     AddSelfLoops(trans_model,
                  disambig,
-                 self_loop_scale,
                  currently_self_loop_free,
+                 use_weights,
                  &hclg_fst);
 
     if (nonterm_phones_offset >= 0)
diff --git a/src/bin/compile-train-graphs-fsts.cc b/src/bin/compile-train-graphs-fsts.cc
index 473887538ae..8d0203c0a5e 100644
--- a/src/bin/compile-train-graphs-fsts.cc
+++ b/src/bin/compile-train-graphs-fsts.cc
@@ -52,9 +52,6 @@ int main(int argc, char *argv[]) {
 
     TrainingGraphCompilerOptions gopts;
     int32 batch_size = 250;
-    gopts.transition_scale = 0.0;  // Change the default to 0.0 since we will generally add the
-    // transition probs in the alignment phase (since they change each time)
-    gopts.self_loop_scale = 0.0;  // Ditto for self-loop probs.
     std::string disambig_rxfilename;
     gopts.Register(&po);
 
@@ -63,7 +60,7 @@ int main(int argc, char *argv[]) {
                 "more memory.  E.g. 500");
     po.Register("read-disambig-syms", &disambig_rxfilename, "File containing "
                 "list of disambiguation symbols in phone symbol table");
-    
+
     po.Read(argc, argv);
 
     if (po.NumArgs() != 5) {
@@ -103,7 +100,7 @@ int main(int argc, char *argv[]) {
 
     SequentialTableReader<fst::VectorFstHolder> fst_reader(fsts_rspecifier);
     TableWriter<fst::VectorFstHolder> fst_writer(fsts_wspecifier);
-    
+
     int num_succeed = 0, num_fail = 0;
 
     if (batch_size == 1) {  // We treat batch_size of 1 as a special case in order
diff --git a/src/bin/compile-train-graphs.cc b/src/bin/compile-train-graphs.cc
index a0722c920b4..3e3532fbd98 100644
--- a/src/bin/compile-train-graphs.cc
+++ b/src/bin/compile-train-graphs.cc
@@ -46,9 +46,7 @@ int main(int argc, char *argv[]) {
 
     TrainingGraphCompilerOptions gopts;
     int32 batch_size = 250;
-    gopts.transition_scale = 0.0;  // Change the default to 0.0 since we will generally add the
-    // transition probs in the alignment phase (since they change eacm time)
-    gopts.self_loop_scale = 0.0;  // Ditto for self-loop probs.
+
     std::string disambig_rxfilename;
     gopts.Register(&po);
 
@@ -57,7 +55,7 @@ int main(int argc, char *argv[]) {
                 "more memory.  E.g. 500");
     po.Register("read-disambig-syms", &disambig_rxfilename, "File containing "
                 "list of disambiguation symbols in phone symbol table");
-    
+
     po.Read(argc, argv);
 
     if (po.NumArgs() != 5) {
@@ -85,7 +83,7 @@ int main(int argc, char *argv[]) {
       if (!ReadIntegerVectorSimple(disambig_rxfilename, &disambig_syms))
         KALDI_ERR << "fstcomposecontext: Could not read disambiguation symbols from "
                   << disambig_rxfilename;
-    
+
     TrainingGraphCompiler gc(trans_model, ctx_dep, lex_fst, disambig_syms, gopts);
 
     lex_fst = NULL;  // we gave ownership to gc.
diff --git a/src/feat/feature-fbank.cc b/src/feat/feature-fbank.cc
index 8becf6a8141..28311903b67 100644
--- a/src/feat/feature-fbank.cc
+++ b/src/feat/feature-fbank.cc
@@ -78,7 +78,7 @@ void FbankComputer::Compute(BaseFloat vtln_warp,
                feature->Dim() == this->Dim());
 
 
-  BaseFloat signal_log_energy;
+  BaseFloat signal_log_energy = 0.0;
   if (opts_.use_energy)
     signal_log_energy = Log(std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame),
                                                 opts_.energy_floor));
diff --git a/src/gmmbin/gmm-est.cc b/src/gmmbin/gmm-est.cc
index 023a9f324b0..5bde0923536 100644
--- a/src/gmmbin/gmm-est.cc
+++ b/src/gmmbin/gmm-est.cc
@@ -1,6 +1,7 @@
 // gmmbin/gmm-est.cc
 
 // Copyright 2009-2011  Microsoft Corporation
+//                2019  Johns Hopkins University (author: Daniel Povey)
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -85,12 +86,10 @@ int main(int argc, char *argv[]) {
       am_gmm.Read(ki.Stream(), binary_read);
     }
 
-    Vector<double> transition_accs;
     AccumAmDiagGmm gmm_accs;
     {
       bool binary;
       Input ki(stats_filename, &binary);
-      transition_accs.Read(ki.Stream(), binary);
       gmm_accs.Read(ki.Stream(), binary, true);  // true == add; doesn't matter here.
     }
 
@@ -133,12 +132,10 @@ int main(int argc, char *argv[]) {
       am_gmm.Write(ko.Stream(), binary_write);
     }
 
-    KALDI_LOG << "Written model to " << model_out_filename;
+    KALDI_LOG << "Wrote model to " << model_out_filename;
     return 0;
   } catch(const std::exception &e) {
     std::cerr << e.what() << '\n';
     return -1;
   }
 }
-
-
diff --git a/src/gmmbin/gmm-sum-accs.cc b/src/gmmbin/gmm-sum-accs.cc
index 49146925bab..6d99c4a35c9 100644
--- a/src/gmmbin/gmm-sum-accs.cc
+++ b/src/gmmbin/gmm-sum-accs.cc
@@ -50,16 +50,12 @@ int main(int argc, char *argv[]) {
       std::string stats_in_filename = po.GetArg(i);
       bool binary_read;
       kaldi::Input ki(stats_in_filename, &binary_read);
-      transition_accs.Read(ki.Stream(), binary_read, true /*add read values*/);
       gmm_accs.Read(ki.Stream(), binary_read, true /*add read values*/);
     }
 
     // Write out the accs
-    {
-      kaldi::Output ko(stats_out_filename, binary);
-      transition_accs.Write(ko.Stream(), binary);
-      gmm_accs.Write(ko.Stream(), binary);
-    }
+    WriteKaldiObject(gmm_accs, stats_out_filename, binary);
+
     KALDI_LOG << "Summed " << num_accs << " stats, total count "
               << gmm_accs.TotCount() << ", avg like/frame "
               << (gmm_accs.TotLogLike() / gmm_accs.TotCount());
@@ -70,5 +66,3 @@ int main(int argc, char *argv[]) {
     return -1;
   }
 }
-
-
diff --git a/src/hmm/topology.cc b/src/hmm/topology.cc
index fc2fa87cefc..4a90a0d5414 100644
--- a/src/hmm/topology.cc
+++ b/src/hmm/topology.cc
@@ -334,11 +334,11 @@ void Topology::ComputeDerived() {
     const fst::StdVectorFst &entry = entries_[i];
     std::vector<float> &correction_factors(
         self_loop_correction_factors_[i]);
-    std::vector<int32> &pdf_classes(
+    std::vector<int32> &self_loop_pdf_classes(
         self_loop_pdf_classes_[i]);
     StateId num_states = entry.NumStates();
     correction_factors.resize(num_states);
-    pdf_classes.resize(num_states, -1);
+    self_loop_pdf_classes.resize(num_states, -1);
     for (StateId s = 0; s < num_states; s++) {
       float tot_prob = exp(-entry.Final(s).Value()),
           self_loop_prob = 0.0;
@@ -349,9 +349,9 @@ void Topology::ComputeDerived() {
         tot_prob += this_prob;
         if (arc.nextstate == s) {
           self_loop_prob += this_prob;
-          KALDI_ASSERT(pdf_classes[s] == -1 &&
+          KALDI_ASSERT(self_loop_pdf_classes[s] == -1 &&
                        "State in topology has more than one self-loop");
-          pdf_classes[s] = arc.ilabel;
+          self_loop_pdf_classes[s] = arc.ilabel;
         }
       }
       KALDI_ASSERT(tot_prob > 0 && "Invalid topology");

From d91a02013642da5c489809c8228bc095fa9bce61 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 10 Jul 2019 19:44:15 -0400
Subject: [PATCH 148/163] [src] Add back lattice-add-trans-probs

---
 src/hmm/hmm-utils.cc                  | 58 +++++++++++++++++++
 src/hmm/hmm-utils.h                   | 24 ++++++++
 src/latbin/Makefile                   |  2 +-
 src/latbin/lattice-add-trans-probs.cc | 81 +++++++++++++++++++++++++++
 4 files changed, 164 insertions(+), 1 deletion(-)
 create mode 100644 src/latbin/lattice-add-trans-probs.cc

diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc
index 76470e09bb6..b3aae0f2717 100644
--- a/src/hmm/hmm-utils.cc
+++ b/src/hmm/hmm-utils.cc
@@ -1032,6 +1032,64 @@ bool ConvertPhnxToProns(const std::vector<int32> &phnx,
 }
 
 
+// Graph version: Times() each arc's weight with the cost of its transition-id.
+void AddTransitionProbs(const Transitions &trans_model,
+                        const std::vector<int32> &disambig_syms,  // may be empty
+                        fst::VectorFst<fst::StdArc> *fst) {
+  using namespace fst;
+  KALDI_ASSERT(IsSortedAndUniq(disambig_syms));  // needed by binary_search below.
+  int num_tids = trans_model.NumTransitionIds();
+  for (StateIterator<VectorFst<StdArc> > siter(*fst);
+      !siter.Done();
+      siter.Next()) {
+    for (MutableArcIterator<VectorFst<StdArc> > aiter(fst, siter.Value());
+         !aiter.Done();
+         aiter.Next()) {
+      StdArc arc = aiter.Value();
+      StdArc::Label l = arc.ilabel;
+      if (l >= 1 && l <= num_tids) {  // a transition-id.
+        BaseFloat cost = trans_model.InfoForTransitionId(l).transition_cost;
+        arc.weight = Times(arc.weight, TropicalWeight(cost));
+      } else if (l != 0) {  // not epsilon: must be a disambiguation symbol.
+        if (!std::binary_search(disambig_syms.begin(), disambig_syms.end(),
+                               arc.ilabel))
+          KALDI_ERR << "AddTransitionProbs: invalid symbol " << arc.ilabel
+                    << " on graph input side.";
+      }
+      aiter.SetValue(arc);  // written back even if the arc was not changed.
+    }
+  }
+}
+
+// Lattice version: adds the transition cost to Value1() of each arc's weight.
+void AddTransitionProbs(const Transitions &trans_model,
+                        Lattice *lat) {
+  using namespace fst;
+  int num_tids = trans_model.NumTransitionIds();
+  for (fst::StateIterator<Lattice> siter(*lat);
+       !siter.Done();
+       siter.Next()) {
+    for (MutableArcIterator<Lattice> aiter(lat, siter.Value());
+         !aiter.Done();
+         aiter.Next()) {
+      LatticeArc arc = aiter.Value();
+      LatticeArc::Label l = arc.ilabel;
+      if (l >= 1 && l <= num_tids) {  // a transition-id.
+        BaseFloat cost = trans_model.InfoForTransitionId(l).transition_cost;
+        arc.weight.SetValue1(arc.weight.Value1() + cost);
+      } else if (l != 0) {  // not epsilon: no other input symbol is accepted.
+        KALDI_ERR << "AddTransitionProbs: invalid symbol " << arc.ilabel
+                  << " on lattice input side.";
+      }
+      aiter.SetValue(arc);  // written back even if the arc was not changed.
+    }
+  }
+}
+
+
+
+
+
 void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep,
                                 const Transitions &trans_model,
                                 const std::vector<int32> &phone_window,
diff --git a/src/hmm/hmm-utils.h b/src/hmm/hmm-utils.h
index 2ca54a9dd95..bc9e3eaeaa7 100644
--- a/src/hmm/hmm-utils.h
+++ b/src/hmm/hmm-utils.h
@@ -180,6 +180,30 @@ void AddSelfLoops(const Transitions &trans_model,
                   bool use_weights,
                   fst::VectorFst<fst::StdArc> *fst);
 
+/**
+  * Adds transition probabilities (as costs) to the arc weights of the graph.
+  * Useful if you want to create a graph without transition probs, then possibly
+  * train the model (including the transition probs) but keep the graph fixed,
+  * and add back in the transition probs.  It assumes the fst has transition-ids
+  * on it.  It is not an error if the FST has no states (nothing will be done).
+  * @param trans_model [in] The transition model
+  * @param disambig_syms [in] A sorted and unique list of disambiguation
+  *                       symbols, required if the graph has them on its
+  *                       input, but only used for checks.
+  * @param  fst [in, out] The FST to be modified.
+  */
+void AddTransitionProbs(const Transitions &trans_model,
+                        const std::vector<int32> &disambig_syms,
+                        fst::VectorFst<fst::StdArc> *fst);
+
+/**
+   As the FST version of AddTransitionProbs() above, but operates on a
+   Lattice: the transition cost is added to the graph part of the weight
+   (the first element of the pair). */
+void AddTransitionProbs(const Transitions &trans_model,
+                        Lattice *lat);
+
+
 
 /// Returns a transducer from pdfs plus one (input) to  transition-ids (output).
 /// Currently of use only for testing.
diff --git a/src/latbin/Makefile b/src/latbin/Makefile
index 1ba4f5b5a88..5f686e8ae3e 100644
--- a/src/latbin/Makefile
+++ b/src/latbin/Makefile
@@ -12,7 +12,7 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \
            lattice-determinize lattice-oracle lattice-rmali \
            lattice-compose lattice-boost-ali lattice-copy lattice-to-fst \
            lattice-to-phone-lattice lattice-interp lattice-project \
-           lattice-difference \
+           lattice-add-trans-probs lattice-difference \
            nbest-to-linear nbest-to-lattice lattice-1best linear-to-nbest \
            lattice-mbr-decode lattice-align-words lattice-to-mpe-post \
            lattice-copy-backoff nbest-to-ctm lattice-determinize-pruned \
diff --git a/src/latbin/lattice-add-trans-probs.cc b/src/latbin/lattice-add-trans-probs.cc
new file mode 100644
index 00000000000..bf783ea77e7
--- /dev/null
+++ b/src/latbin/lattice-add-trans-probs.cc
@@ -0,0 +1,81 @@
+// latbin/lattice-add-trans-probs.cc
+
+// Copyright 2009-2011  Microsoft Corporation
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "fstext/fstext-lib.h"
+#include "lat/kaldi-lattice.h"
+#include "lat/lattice-functions.h"
+#include "hmm/transitions.h"
+#include "hmm/hmm-utils.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    typedef kaldi::int32 int32;
+    // Reads lattices, adds the transition cost of each transition-id on the
+    // arcs to the graph part of the lattice weight (see AddTransitionProbs()
+    // in hmm/hmm-utils.h), and writes the result as compact lattices.
+    // Exit status is 0 if at least one lattice was processed, otherwise 1.
+
+    const char *usage =
+        "Add transition probabilities into graph part of lattice scores\n"
+        "\n"
+        "Usage: lattice-add-trans-probs [options] model lattice-rspecifier lattice-wspecifier\n"
+        " e.g.: lattice-add-trans-probs final.mdl ark:in.lats ark:out.lats\n";
+
+    ParseOptions po(usage);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string
+        model_rxfilename = po.GetArg(1),
+        lats_rspecifier = po.GetArg(2),
+        lats_wspecifier = po.GetArg(3);
+
+    int32 n_done = 0;
+
+    Transitions trans_model;
+
+    ReadKaldiObject(model_rxfilename, &trans_model);
+
+    SequentialLatticeReader lattice_reader(lats_rspecifier); // read as
+    // regular lattice.
+    CompactLatticeWriter clat_writer(lats_wspecifier); // write as compact.
+    for (; !lattice_reader.Done(); lattice_reader.Next()) {
+      Lattice lat(lattice_reader.Value());
+      AddTransitionProbs(trans_model, &lat);
+      CompactLattice clat;
+      ConvertLattice(lat, &clat);
+      clat_writer.Write(lattice_reader.Key(), clat);
+      n_done++;  // counted exactly once per lattice (not in the loop header).
+    }
+    KALDI_LOG << "Done adding transition probabilities to " << n_done << " lattices.";
+    return (n_done != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}

From 10400f446d5b5567971c2044fc9399171613095f Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Thu, 11 Jul 2019 18:36:48 -0400
Subject: [PATCH 149/163] [src,scripts] Various fixes related to kaldi10 topo
 changes

---
 egs/wsj/s5/steps/nnet3/chain/gen_topo.pl  | 42 -----------------
 egs/wsj/s5/steps/nnet3/chain/gen_topo.py  | 12 +++--
 egs/wsj/s5/steps/nnet3/chain/gen_topo2.py | 55 -----------------------
 egs/wsj/s5/steps/nnet3/chain/gen_topo3.py | 41 -----------------
 egs/wsj/s5/steps/nnet3/chain/gen_topo4.py | 46 -------------------
 src/hmm/hmm-utils.cc                      | 10 ++---
 src/lat/word-align-lattice.cc             |  4 +-
 7 files changed, 13 insertions(+), 197 deletions(-)
 delete mode 100755 egs/wsj/s5/steps/nnet3/chain/gen_topo.pl
 delete mode 100755 egs/wsj/s5/steps/nnet3/chain/gen_topo2.py
 delete mode 100755 egs/wsj/s5/steps/nnet3/chain/gen_topo3.py
 delete mode 100755 egs/wsj/s5/steps/nnet3/chain/gen_topo4.py

diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo.pl b/egs/wsj/s5/steps/nnet3/chain/gen_topo.pl
deleted file mode 100755
index 32dfa272a97..00000000000
--- a/egs/wsj/s5/steps/nnet3/chain/gen_topo.pl
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/env perl
-
-# Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-# Generate a topology file.  This allows control of the number of states in the
-# non-silence HMMs, and in the silence HMMs.  This is a modified version of
-# 'utils/gen_topo.pl' that generates a different type of topology, one that we
-# believe should be useful in the 'chain' model.  Note: right now it doesn't
-# have any real options, and it treats silence and nonsilence the same.  The
-# intention is that you write different versions of this script, or add options,
-# if you experiment with it.
-
-if (@ARGV != 2) {
-  print STDERR "Usage: utils/gen_topo.pl <colon-separated-nonsilence-phones> <colon-separated-silence-phones>\n";
-  print STDERR "e.g.:  utils/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n";
-  exit (1);
-}
-
-($nonsil_phones, $sil_phones) = @ARGV;
-
-$nonsil_phones =~ s/:/ /g;
-$sil_phones =~ s/:/ /g;
-$nonsil_phones =~ m/^\d[ \d]+$/ || die "$0: bad arguments @ARGV\n";
-$sil_phones =~ m/^\d[ \d]*$/ || die "$0: bad arguments @ARGV\n";
-
-print "<Topology>\n";
-print "<TopologyEntry>\n";
-print "<ForPhones>\n";
-print "$nonsil_phones $sil_phones\n";
-print "</ForPhones>\n";
-# The next two lines may look like a bug, but they are as intended.  State 0 has
-# no self-loop, it happens exactly once.  And it can go either to state 1 (with
-# a self-loop) or to state 2, so we can have zero or more instances of state 1
-# following state 0.
-# We make the transition-probs 0.5 so they normalize, to keep the code happy.
-# In fact, we always set the transition probability scale to 0.0 in the 'chain'
-# code, so they are never used.
-print "<State> 0 <PdfClass> 0 <Transition> 1 0.5 <Transition> 2 0.5 </State>\n";
-print "<State> 1 <PdfClass> 1 <Transition> 1 0.5 <Transition> 2 0.5 </State>\n";
-print "<State> 2 </State>\n";
-print "</TopologyEntry>\n";
-print "</Topology>\n";
diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo.py
index 88def77451b..f587d1b8448 100755
--- a/egs/wsj/s5/steps/nnet3/chain/gen_topo.py
+++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo.py
@@ -32,17 +32,15 @@
 nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ]
 all_phones = silence_phones +  nonsilence_phones
 
+
 print("<Topology>")
 print("<TopologyEntry>")
 print("<ForPhones>")
 print(" ".join([str(x) for x in all_phones]))
 print("</ForPhones>")
-# We make the transition-probs 0.5 so they normalize, to keep the code happy.
-# In fact, we always set the transition probability scale to 0.0 in the 'chain'
-# code, so they are never used.
-# Note: the <ForwardPdfClass> will actually happen on the incoming arc because
-# we always build the graph with "reorder=true".
-print("<State> 0 <ForwardPdfClass> 0 <SelfLoopPdfClass> 1 <Transition> 0 0.5 <Transition> 1 0.5 </State>")
-print("<State> 1 </State>")
+print("0  1  1  0.0")
+print("1  1  2  0.69314718055")
+print("1  0.69314718055")
+print("")
 print("</TopologyEntry>")
 print("</Topology>")
diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo2.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo2.py
deleted file mode 100755
index a33dab666e6..00000000000
--- a/egs/wsj/s5/steps/nnet3/chain/gen_topo2.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-# Generate a topology file.  This allows control of the number of states in the
-# non-silence HMMs, and in the silence HMMs.  This is a modified version of
-# 'utils/gen_topo.pl' that generates a different type of topology, one that we
-# believe should be useful in the 'chain' model.  Note: right now it doesn't
-# have any real options, and it treats silence and nonsilence the same.  The
-# intention is that you write different versions of this script, or add options,
-# if you experiment with it.
-
-from __future__ import print_function
-import argparse
-
-
-parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py "
-                                             "<colon-separated-nonsilence-phones> <colon-separated-silence-phones>"
-                                             "e.g.:  steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n",
-                                 epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage.");
-parser.add_argument("nonsilence_phones", type=str,
-                    help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9");
-parser.add_argument("silence_phones", type=str,
-                    help="List of silence phones as integers, separated by colons, e.g. 1:2:3");
-
-args = parser.parse_args()
-
-silence_phones = [ int(x) for x in args.silence_phones.split(":") ]
-nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ]
-all_phones = silence_phones +  nonsilence_phones
-
-print("<Topology>")
-print("<TopologyEntry>")
-print("<ForPhones>")
-print(" ".join([str(x) for x in all_phones]))
-print("</ForPhones>")
-
-# the pdf-classes are as follows:
-#  pdf-class 0 is in a 1-frame sequence, the initial and final state.
-#  pdf-class 1 is in a sequence with >=3 frames, the 'middle' states.  (important that
-#   it be numbered 1, which is the default list of pdf-classes used in 'cluster-phones').
-#  pdf-class 2 is the initial-state in a sequence with >= 2 frames.
-#  pdf-class 3 is the final-state in a sequence with >= 2 frames.
-# state 0 is nonemitting in this topology.
-
-print("<State> 0 <Transition> 1 0.5 <Transition> 2 0.5 </State>")  # initial nonemitting state.
-print("<State> 1 <PdfClass> 0 <Transition> 5 1.0 </State>")  # 1-frame sequence.
-print("<State> 2 <PdfClass> 2 <Transition> 3 0.5 <Transition> 4 0.5 </State>")  # 2 or more frames
-print("<State> 3 <PdfClass> 1 <Transition> 3 0.5 <Transition> 4 0.5 </State>")  # 3 or more frames
-print("<State> 4 <PdfClass> 3 <Transition> 5 1.0 </State>") # 2 or more frames.
-print("<State> 5 </State>")  # final nonemitting state
-
-print("</TopologyEntry>")
-print("</Topology>")
-
diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo3.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo3.py
deleted file mode 100755
index f43f5046813..00000000000
--- a/egs/wsj/s5/steps/nnet3/chain/gen_topo3.py
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-# Generate a topology file.  This allows control of the number of states in the
-# non-silence HMMs, and in the silence HMMs.  This is a modified version of
-# 'utils/gen_topo.pl' that generates a different type of topology, one that we
-# believe should be useful in the 'chain' model.  Note: right now it doesn't
-# have any real options, and it treats silence and nonsilence the same.  The
-# intention is that you write different versions of this script, or add options,
-# if you experiment with it.
-
-from __future__ import print_function
-import argparse
-
-
-parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py "
-                                             "<colon-separated-nonsilence-phones> <colon-separated-silence-phones>"
-                                             "e.g.:  steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n",
-                                 epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage.");
-parser.add_argument("nonsilence_phones", type=str,
-                    help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9");
-parser.add_argument("silence_phones", type=str,
-                    help="List of silence phones as integers, separated by colons, e.g. 1:2:3");
-
-args = parser.parse_args()
-
-silence_phones = [ int(x) for x in args.silence_phones.split(":") ]
-nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ]
-all_phones = silence_phones +  nonsilence_phones
-
-print("<Topology>")
-print("<TopologyEntry>")
-print("<ForPhones>")
-print(" ".join([str(x) for x in all_phones]))
-print("</ForPhones>")
-print("<State> 0 <PdfClass> 0 <Transition> 0 0.5 <Transition> 1 0.5 </State>")
-print("<State> 1 </State>")
-print("</TopologyEntry>")
-print("</Topology>")
-
diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo4.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo4.py
deleted file mode 100755
index 6d88a6e4449..00000000000
--- a/egs/wsj/s5/steps/nnet3/chain/gen_topo4.py
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-
-# Generate a topology file.  This allows control of the number of states in the
-# non-silence HMMs, and in the silence HMMs.  This is a modified version of
-# 'utils/gen_topo.pl' that generates a different type of topology, one that we
-# believe should be useful in the 'chain' model.  Note: right now it doesn't
-# have any real options, and it treats silence and nonsilence the same.  The
-# intention is that you write different versions of this script, or add options,
-# if you experiment with it.
-
-from __future__ import print_function
-import argparse
-
-
-parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py "
-                                             "<colon-separated-nonsilence-phones> <colon-separated-silence-phones>"
-                                             "e.g.:  steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n",
-                                 epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage.");
-parser.add_argument("nonsilence_phones", type=str,
-                    help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9");
-parser.add_argument("silence_phones", type=str,
-                    help="List of silence phones as integers, separated by colons, e.g. 1:2:3");
-
-args = parser.parse_args()
-
-silence_phones = [ int(x) for x in args.silence_phones.split(":") ]
-nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ]
-all_phones = silence_phones +  nonsilence_phones
-
-print("<Topology>")
-print("<TopologyEntry>")
-print("<ForPhones>")
-print(" ".join([str(x) for x in all_phones]))
-print("</ForPhones>")
-# state 0 is obligatory (occurs once)
-print("<State> 0 <PdfClass> 0 <Transition> 1 0.3333 <Transition> 2 0.3333 <Transition> 3 0.3333 </State> ")
-# state 1 is used only when >2 frames
-print("<State> 1 <PdfClass> 1 <Transition> 1 0.5 <Transition> 2 0.5 </State>")
-# state 2 is used only when >=2 frames (and occurs once)
-print("<State> 2 <PdfClass> 2 <Transition> 3 1.0 </State>")
-print("<State> 3 </State>")  # final nonemitting state
-print("</TopologyEntry>")
-print("</Topology>")
-
diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc
index b3aae0f2717..cc416c8eef2 100644
--- a/src/hmm/hmm-utils.cc
+++ b/src/hmm/hmm-utils.cc
@@ -872,11 +872,11 @@ bool ConvertAlignment(const Transitions &old_trans_model,
                       bool repeat_frames,
                       const std::vector<int32> *phone_map,
                       std::vector<int32> *new_alignment) {
-  if (subsample_factor == 1) {
-    if (repeat_frames) {
-      KALDI_WARN << "repeat_frames being set to true has no effect when "
+  if (subsample_factor == 1 && repeat_frames)
+    KALDI_WARN << "repeat_frames being set to true has no effect when "
         "subsample_factor=1 (its default value)";
-    }
+
+  if (subsample_factor == 1 || !repeat_frames) {
     return ConvertAlignmentInternal(old_trans_model,
                                     new_trans_model,
                                     new_ctx_dep,
@@ -899,7 +899,7 @@ bool ConvertAlignment(const Transitions &old_trans_model,
                                     new_trans_model,
                                     new_ctx_dep,
                                     old_alignment,
-                                    conversion_shift, // conversion_shift
+                                    conversion_shift,
                                     subsample_factor,
                                     phone_map,
                                     &shifted_alignments[conversion_shift]))
diff --git a/src/lat/word-align-lattice.cc b/src/lat/word-align-lattice.cc
index 56514822130..951f96a302a 100644
--- a/src/lat/word-align-lattice.cc
+++ b/src/lat/word-align-lattice.cc
@@ -456,6 +456,8 @@ bool LatticeWordAligner::ComputationState::OutputNormalWordArc(
   // OK, we just consumed the word-initial phone.
   if (i == len)
     return false;
+  prev_info = &tmodel.InfoForTransitionId(transition_ids_[i]);
+  i++;
   // Eat up any word-internal phones.
   while (i < len && wb_info.TypeOfPhone(prev_info->phone) ==
          WordBoundaryInfo::kWordInternalPhone) {
@@ -466,7 +468,7 @@ bool LatticeWordAligner::ComputationState::OutputNormalWordArc(
     return false;
   // Try to find the ending of the next phone, which should be a word-final
   // phone.
-  for (i = 1; i < len; i++) {
+  for (; i < len; i++) {
     const Transitions::TransitionIdInfo *this_info = &tmodel.InfoForTransitionId(
         transition_ids_[i]);
     if (prev_info->is_final && this_info->is_initial) {

From 10663ad3be3f4ef6beda657c7cba0073c43d07e3 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sun, 14 Jul 2019 19:23:23 -0400
Subject: [PATCH 150/163] [src,scripts] Fixes to kaldi10 branch to make things
 work

---
 egs/wsj/s5/steps/nnet3/chain/build_tree.sh |  1 -
 src/bin/compile-questions.cc               | 12 ++++++------
 src/cudadecoderbin/Makefile                |  4 ++--
 src/feat/feature-mfcc.cc                   | 14 ++++++++++++++
 src/feat/feature-mfcc.h                    |  8 +++++++-
 src/hmm/hmm-utils.cc                       |  3 ---
 6 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh
index 757963f13a7..ebb9e24902f 100755
--- a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh
+++ b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh
@@ -23,7 +23,6 @@ context_opts=  # e.g. set this to "--context-width 5 --central-position 2" for q
 cluster_thresh=-1  # for build-tree control final bottom-up clustering of leaves
 frame_subsampling_factor=1
 alignment_subsampling_factor=
-leftmost_questions_truncate=-1  # note: this option is deprecated and has no effect
 tree_stats_opts=
 cluster_phones_opts=
 repeat_frames=false
diff --git a/src/bin/compile-questions.cc b/src/bin/compile-questions.cc
index 1c8565e032d..bf734ac01da 100644
--- a/src/bin/compile-questions.cc
+++ b/src/bin/compile-questions.cc
@@ -130,13 +130,13 @@ int main(int argc, char *argv[]) {
     }
 
     QuestionsForKey pdfclass_opts(num_iters_refine);
-    std::vector<std::vector<int32> > pdfclass_questions(max_num_pdfclasses-1);
-    for (int32 i = 0; i < max_num_pdfclasses - 1; i++)
-      for (int32 j = 0; j <= i; j++)
-        pdfclass_questions[i].push_back(j);
-    // E.g. if max_num_pdfclasses == 3,  pdfclass_questions is now [ [0], [0, 1] ].
+    std::vector<std::vector<int32> > pdfclass_questions(max_num_pdfclasses - 1);
+    for (int32 i = 1; i <= max_num_pdfclasses - 1; i++)
+      for (int32 j = 1; j <= i; j++)
+        pdfclass_questions[i-1].push_back(j);
+    // E.g. if max_num_pdfclasses == 3,  pdfclass_questions is now [ [1], [1, 2] ].
     pdfclass_opts.initial_questions = pdfclass_questions;
-    KALDI_LOG << "Setting questions for hmm-position [hmm-position ranges from 0 to "<< (max_num_pdfclasses-1) <<"]";
+    KALDI_LOG << "Setting questions for pdf-class [pdf-class ranges from 1 to "<< max_num_pdfclasses <<"]";
     qo.SetQuestionsOf(kPdfClass, pdfclass_opts);
 
     WriteKaldiObject(qo, questions_out_filename, binary);
diff --git a/src/cudadecoderbin/Makefile b/src/cudadecoderbin/Makefile
index 276bb0ffc9c..bd9462ef00c 100644
--- a/src/cudadecoderbin/Makefile
+++ b/src/cudadecoderbin/Makefile
@@ -15,12 +15,12 @@ TESTFILES =
 
 ADDLIBS = ../cudadecoder/kaldi-cudadecoder.a  \
 ../online2/kaldi-online2.a ../ivector/kaldi-ivector.a \
-../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a ../nnet2/kaldi-nnet2.a \
+../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a \
 ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \
 ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
 ../feat/kaldi-feat.a ../transform/kaldi-transform.a \
 ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \
-../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a 
+../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a
 
 endif
 
diff --git a/src/feat/feature-mfcc.cc b/src/feat/feature-mfcc.cc
index 5959f952f14..15085788d91 100644
--- a/src/feat/feature-mfcc.cc
+++ b/src/feat/feature-mfcc.cc
@@ -25,6 +25,15 @@
 namespace kaldi {
 
 
+// Compute liftering coefficients (scaling on cepstral coeffs)
+// coeffs are numbered slightly differently from HTK: the zeroth
+// index is C0, which is not affected.
+static void ComputeLifterCoeffs(BaseFloat Q, VectorBase<BaseFloat> *coeffs) {
+  for (int32 i = 0; i < coeffs->Dim(); i++)
+      (*coeffs)(i) = 1.0 + 0.5 * Q * sin (M_PI * i / Q);
+}
+
+
 void MfccComputer::Compute(BaseFloat vtln_warp,
                            VectorBase<BaseFloat> *signal_frame,
                            VectorBase<BaseFloat> *feature) {
@@ -55,6 +64,7 @@ void MfccComputer::Compute(BaseFloat vtln_warp,
   feature->SetZero();  // in case there were NaNs.
   // feature = dct_matrix_ * mel_energies [which now have log]
   feature->AddMatVec(1.0, dct_matrix_, kNoTrans, mel_energies_, 0.0);
+  feature->MulElements(lifter_coeffs_);
 
   if (opts_.use_energy)
     (*feature)(0) = signal_log_energy;
@@ -73,6 +83,10 @@ MfccComputer::MfccComputer(const MfccOptions &opts):
 
   Matrix<BaseFloat> dct_matrix(num_bins, num_bins);
   ComputeDctMatrix(&dct_matrix);
+  lifter_coeffs_.Resize(opts.num_ceps);
+  ComputeLifterCoeffs(opts.cepstral_lifter, &lifter_coeffs_);
+
+
   // Note that we include zeroth dct in either case.  If using the
   // energy we replace this with the energy.  This means a different
   // ordering of features than HTK.
diff --git a/src/feat/feature-mfcc.h b/src/feat/feature-mfcc.h
index 459a3faf10c..9a35405504a 100644
--- a/src/feat/feature-mfcc.h
+++ b/src/feat/feature-mfcc.h
@@ -41,11 +41,17 @@ struct MfccOptions {
   int32 num_ceps;  // e.g. 13: num cepstral coeffs, counting zero.
   bool use_energy;  // if true, use energy; else C0
   BaseFloat energy_floor;
+  // cepstral_lifter controls a scaling factor on the cepstra that helps give
+  // all the MFCC coeffs a similar dynamic range by scaling up the
+  // higher-frequency coefficients.  It's a rather odd formula involving
+  // a sine.   We don't make it configurable.
+  BaseFloat cepstral_lifter;
 
   MfccOptions() : mel_opts(23),
                   num_ceps(13),
                   use_energy(true),
-                  energy_floor(1.0e-10) { }
+                  energy_floor(1.0e-10),
+                  cepstral_lifter(22.0) { }
 
 
   void Register(OptionsItf *opts) {
diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc
index cc416c8eef2..55eec2e2939 100644
--- a/src/hmm/hmm-utils.cc
+++ b/src/hmm/hmm-utils.cc
@@ -1145,9 +1145,6 @@ void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep,
   KALDI_PARANOID_ASSERT(
       trans_model.InfoForTransitionId(symbol_sequence.front()).is_initial &&
       trans_model.InfoForTransitionId(symbol_sequence.back()).is_final);
-  if (symbol_sequence.size() > 1) {
-    KALDI_ASSERT(!trans_model.InfoForTransitionId(symbol_sequence.back()).is_initial);
-  } // TODO: remove the above.
   symbol_sequence.swap(*alignment);
 }
 

From 252fedfd0270f38ce5d747d4136f04e7b5ddfbfb Mon Sep 17 00:00:00 2001
From: Xingyu Na <asr.naxingyu@gmail.com>
Date: Mon, 15 Jul 2019 11:56:07 +0800
Subject: [PATCH 151/163] [build] Add missing SUBDIR to Makefile (#3466)

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 82102714811..182dc3f0a26 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -9,7 +9,7 @@ SUBDIRS = base matrix util feat cudafeat tree gmm transform \
           bin fstbin gmmbin fgmmbin featbin cudafeatbin \
           latbin nnet3 rnnlm chain nnet3bin kwsbin \
           ivector ivectorbin online2 online2bin lmbin chainbin rnnlmbin \
-          cudadecoder cudadecoderbin
+          cudadecoder cudadecoderbin cblasext
 
 
 MEMTESTDIRS = base matrix util feat cudafeat tree gmm transform \

From bf16577a20d2de5391561b7dc34e6d37d64bf43f Mon Sep 17 00:00:00 2001
From: Xingyu Na <asr.naxingyu@gmail.com>
Date: Wed, 17 Jul 2019 10:59:48 +0800
Subject: [PATCH 152/163] [src] restore cuda-compiled to kaldi10 (#3471)

---
 .gitignore                    |  3 ++-
 src/nnet3bin/Makefile         |  3 ++-
 src/nnet3bin/cuda-compiled.cc | 36 +++++++++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+), 2 deletions(-)
 create mode 100644 src/nnet3bin/cuda-compiled.cc

diff --git a/.gitignore b/.gitignore
index 5764bfe22c6..13d8aefe39d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -73,7 +73,8 @@ GSYMS
 /src/kaldi.mk.bak
 
 # /egs/
-/egs/*/*/mfcc
+/egs/*/*/mfcc*
+/egs/*/*/fbank*
 /egs/*/*/plp
 /egs/*/*/exp
 /egs/*/*/data
diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile
index 4c7c63b554e..74d85efce1c 100644
--- a/src/nnet3bin/Makefile
+++ b/src/nnet3bin/Makefile
@@ -19,7 +19,8 @@ BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \
    nnet3-discriminative-subset-egs nnet3-get-egs-simple \
    nnet3-discriminative-compute-from-egs nnet3-latgen-faster-looped \
    nnet3-egs-augment-image nnet3-xvector-get-egs nnet3-xvector-compute \
-   nnet3-latgen-grammar nnet3-compute-batch nnet3-latgen-faster-batch
+   nnet3-latgen-grammar nnet3-compute-batch nnet3-latgen-faster-batch \
+   cuda-compiled
 
 OBJFILES =
 
diff --git a/src/nnet3bin/cuda-compiled.cc b/src/nnet3bin/cuda-compiled.cc
new file mode 100644
index 00000000000..b6de9257657
--- /dev/null
+++ b/src/nnet3bin/cuda-compiled.cc
@@ -0,0 +1,36 @@
+// nnet2bin/cuda-compiled.cc
+
+// Copyright 2014 Johns Hopkins University (author:  Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "cudamatrix/cu-device.h"
+
+int main(int argc, char *argv[]) {
+  const char *usage = "This program returns exit status 0 (success) if the code\n"
+      "was compiled with CUDA support, and 1 otherwise.  To support CUDA, you\n"
+      "must run 'configure' on a machine that has the CUDA compiler 'nvcc'\n"
+      "available.\n";
+  if (argc > 1) {
+    std::cerr << usage << "\n";
+  }
+#if HAVE_CUDA==1
+  return 0;
+#else
+  return 1;
+#endif
+}

From 5a30b71436e5fe75cc1ef41ae7f6fdd7580a2206 Mon Sep 17 00:00:00 2001
From: Xingyu Na <asr.naxingyu@gmail.com>
Date: Fri, 19 Jul 2019 07:21:10 +0800
Subject: [PATCH 153/163] Add feature transform; remove train transition
 (#3474)

---
 .../train/frame_level_objf/acoustic_model.py  |   6 +-
 src/nnet3/Makefile                            |   5 +-
 src/nnet3/get-feature-transform.cc            | 203 ++++++++++++++++++
 src/nnet3/get-feature-transform.h             | 179 +++++++++++++++
 src/nnet3bin/Makefile                         |   2 +-
 src/nnet3bin/cuda-compiled.cc                 |   2 +-
 src/nnet3bin/nnet-get-feature-transform.cc    |  85 ++++++++
 7 files changed, 474 insertions(+), 8 deletions(-)
 create mode 100644 src/nnet3/get-feature-transform.cc
 create mode 100644 src/nnet3/get-feature-transform.h
 create mode 100644 src/nnet3bin/nnet-get-feature-transform.cc

diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py
index 4a39ed9dae6..144bc879e51 100644
--- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py
+++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py
@@ -76,12 +76,10 @@ def prepare_initial_acoustic_model(dir, alidir, run_opts,
         common_train_lib.prepare_initial_network(dir, run_opts,
                                                  srand=srand)
 
-    # Convert to .mdl, train the transitions, set the priors.
+    # Convert to .mdl, set the priors.
     common_lib.execute_command(
         """{command} {dir}/log/init_mdl.log \
-                nnet3-am-init {alidir}/final.mdl {raw_mdl} - \| \
-                nnet3-am-train-transitions - \
-                "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl
+                nnet3-am-init {alidir}/final.mdl {raw_mdl} {dir}/0.mdl
         """.format(command=run_opts.command,
                    dir=dir, alidir=alidir,
                    raw_mdl=(input_model if input_model is not None
diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile
index e474e85378a..a39362b0af0 100644
--- a/src/nnet3/Makefile
+++ b/src/nnet3/Makefile
@@ -31,7 +31,8 @@ OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \
   nnet-compile-looped.o decodable-simple-looped.o \
   decodable-online-looped.o convolution.o \
   nnet-convolutional-component.o attention.o \
-  nnet-attention-component.o nnet-tdnn-component.o nnet-batch-compute.o
+  nnet-attention-component.o nnet-tdnn-component.o nnet-batch-compute.o \
+  get-feature-transform.o
 
 
 LIBNAME = kaldi-nnet3
@@ -41,6 +42,6 @@ ADDLIBS = ../chain/kaldi-chain.a ../cudamatrix/kaldi-cudamatrix.a \
           ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
           ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
-          ../base/kaldi-base.a 
+          ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
diff --git a/src/nnet3/get-feature-transform.cc b/src/nnet3/get-feature-transform.cc
new file mode 100644
index 00000000000..3eef63765fe
--- /dev/null
+++ b/src/nnet3/get-feature-transform.cc
@@ -0,0 +1,203 @@
+// nnet3/get-feature-transform.cc
+
+// Copyright 2009-2011  Jan Silovsky
+//                2013  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "nnet3/get-feature-transform.h"
+
+namespace kaldi {
+
+
+
+void FeatureTransformEstimate::Estimate(const FeatureTransformEstimateOptions &opts,
+                                        Matrix<BaseFloat> *M,
+                                        TpMatrix<BaseFloat> *C) const {
+  double count;
+  Vector<double> total_mean;
+  SpMatrix<double> total_covar, between_covar;
+  GetStats(&total_covar, &between_covar, &total_mean, &count);
+  KALDI_LOG << "Data count is " << count;
+  EstimateInternal(opts, total_covar, between_covar, total_mean, M, C);
+}
+
+// static
+void FeatureTransformEstimate::EstimateInternal(
+    const FeatureTransformEstimateOptions &opts,
+    const SpMatrix<double> &total_covar,
+    const SpMatrix<double> &between_covar,
+    const Vector<double> &total_mean,
+    Matrix<BaseFloat> *M,
+    TpMatrix<BaseFloat> *C) {
+
+  int32 target_dim = opts.dim, dim = total_covar.NumRows();
+  // Interpret zero or negative target_dim as the full dim
+  if (target_dim <= 0)
+    target_dim = dim;
+  // between-class covar is of rank at most C-1
+  KALDI_ASSERT(target_dim <= dim);
+
+  // within-class covariance
+  SpMatrix<double> wc_covar(total_covar);
+  wc_covar.AddSp(-1.0, between_covar);
+  TpMatrix<double> wc_covar_sqrt(dim);
+  try {
+    wc_covar_sqrt.Cholesky(wc_covar);
+    if (C != NULL) {
+      C->Resize(dim);
+      C->CopyFromTp(wc_covar_sqrt);
+    }
+  } catch (...) {
+    BaseFloat smooth = 1.0e-03 * wc_covar.Trace() / wc_covar.NumRows();
+    KALDI_LOG << "Cholesky failed (possibly not +ve definite), so adding " << smooth
+              << " to diagonal and trying again.\n";
+    for (int32 i = 0; i < wc_covar.NumRows(); i++)
+      wc_covar(i, i) += smooth;
+    wc_covar_sqrt.Cholesky(wc_covar);
+  }
+  Matrix<double> wc_covar_sqrt_mat(wc_covar_sqrt);
+  wc_covar_sqrt_mat.Invert();
+
+  SpMatrix<double> tmp_sp(dim);
+  tmp_sp.AddMat2Sp(1.0, wc_covar_sqrt_mat, kNoTrans, between_covar, 0.0);
+  Matrix<double> tmp_mat(tmp_sp);
+  Matrix<double> svd_u(dim, dim), svd_vt(dim, dim);
+  Vector<double> svd_d(dim);
+  tmp_mat.Svd(&svd_d, &svd_u, &svd_vt);
+  SortSvd(&svd_d, &svd_u);
+
+  KALDI_LOG << "LDA singular values are " << svd_d;
+
+  KALDI_LOG << "Sum of all singular values is " << svd_d.Sum();
+  KALDI_LOG << "Sum of selected singular values is " <<
+      SubVector<double>(svd_d, 0, target_dim).Sum();
+
+  Matrix<double> lda_mat(dim, dim);
+  lda_mat.AddMatMat(1.0, svd_u, kTrans, wc_covar_sqrt_mat, kNoTrans, 0.0);
+
+  // finally, copy first target_dim rows to m
+  M->Resize(target_dim, dim);
+  M->CopyFromMat(lda_mat.Range(0, target_dim, 0, dim));
+
+  if (opts.within_class_factor != 1.0) {
+    for (int32 i = 0; i < svd_d.Dim(); i++) {
+      BaseFloat old_var = 1.0 + svd_d(i), // the total variance of that dim..
+          new_var = opts.within_class_factor + svd_d(i), // the variance we want..
+          scale = sqrt(new_var / old_var);
+      if (i < M->NumRows())
+        M->Row(i).Scale(scale);
+    }
+  }
+
+  if (opts.max_singular_value > 0.0) {
+    int32 rows = M->NumRows(), cols = M->NumCols(),
+        min_dim = std::min(rows, cols);
+    Matrix<BaseFloat> U(rows, min_dim), Vt(min_dim, cols);
+    Vector<BaseFloat> s(min_dim);
+    M->Svd(&s, &U, &Vt); // decompose m = U diag(s) Vt.
+    BaseFloat max_s = s.Max();
+    int32 n;
+    s.ApplyCeiling(opts.max_singular_value, &n);
+    if (n > 0) {
+      KALDI_LOG << "Applied ceiling to " << n << " out of " << s.Dim()
+                << " singular values of transform using ceiling "
+                << opts.max_singular_value << ", max is " << max_s;
+      Vt.MulRowsVec(s);
+      // reconstruct m with the modified singular values:
+      M->AddMatMat(1.0, U, kNoTrans, Vt, kNoTrans, 0.0);
+    }
+  }
+
+  if (opts.remove_offset)
+    AddMeanOffset(total_mean, M);
+}
+
+void FeatureTransformEstimateMulti::EstimateTransformPart(
+    const FeatureTransformEstimateOptions &opts,
+    const std::vector<int32> &indexes,
+    const SpMatrix<double> &total_covar,
+    const SpMatrix<double> &between_covar,
+    const Vector<double> &mean,
+    Matrix<BaseFloat> *M) const {
+
+  int32 full_dim = Dim(), proj_dim = indexes.size();
+  Matrix<double> transform(proj_dim, full_dim); // projects from full to projected dim.
+  for (int32 i = 0; i < proj_dim; i++)
+    transform(i, indexes[i]) = 1.0;
+
+  SpMatrix<double> total_covar_proj(proj_dim), between_covar_proj(proj_dim);
+  Vector<double> mean_proj(proj_dim);
+  total_covar_proj.AddMat2Sp(1.0, transform, kNoTrans, total_covar, 0.0);
+  between_covar_proj.AddMat2Sp(1.0, transform, kNoTrans, between_covar, 0.0);
+  mean_proj.AddMatVec(1.0, transform, kNoTrans, mean, 0.0);
+
+  Matrix<BaseFloat> M_proj;
+  FeatureTransformEstimateOptions opts_tmp(opts);
+  opts_tmp.dim = proj_dim;
+  EstimateInternal(opts_tmp, total_covar_proj, between_covar_proj, mean_proj,
+                   &M_proj, NULL);
+  if (M_proj.NumCols() == proj_dim + 1) { // Extend transform to add the extra "1" that we
+                                          // use to handle mean shifts..
+    transform.Resize(proj_dim + 1, full_dim + 1, kCopyData);
+    transform(proj_dim, full_dim) = 1.0;
+  }
+  M->Resize(proj_dim, transform.NumCols());
+  // Produce output..
+  M->AddMatMat(1.0, M_proj, kNoTrans, Matrix<BaseFloat>(transform),
+               kNoTrans, 0.0);
+}
+
+void FeatureTransformEstimateMulti::Estimate(
+    const FeatureTransformEstimateOptions &opts,
+    const std::vector<std::vector<int32> > &indexes,
+    Matrix<BaseFloat> *M) const {
+
+  int32 input_dim = Dim(), output_dim = 0, num_transforms = indexes.size();
+  for (int32 i = 0; i < num_transforms; i++) { // some input-checking.
+    KALDI_ASSERT(indexes[i].size() > 0);
+    std::vector<int32> this_indexes(indexes[i]);
+    std::sort(this_indexes.begin(), this_indexes.end());
+    KALDI_ASSERT(IsSortedAndUniq(this_indexes)); // check for duplicates.
+    KALDI_ASSERT(this_indexes.front() >= 0);
+    KALDI_ASSERT(this_indexes.back() < input_dim);
+    output_dim += this_indexes.size();
+  }
+
+  int32 input_dim_ext = (opts.remove_offset ? input_dim + 1 : input_dim);
+  M->Resize(output_dim, input_dim_ext);
+
+  double count;
+  Vector<double> total_mean;
+  SpMatrix<double> total_covar, between_covar;
+  GetStats(&total_covar, &between_covar, &total_mean, &count);
+
+  int32 cur_output_index = 0;
+  for (int32 i = 0; i < num_transforms; i++) {
+    Matrix<BaseFloat> M_tmp;
+    EstimateTransformPart(opts, indexes[i], total_covar, between_covar,
+                          total_mean, &M_tmp);
+    int32 this_output_dim = indexes[i].size();
+    M->Range(cur_output_index, this_output_dim, 0, M->NumCols()).
+        CopyFromMat(M_tmp);
+    cur_output_index += this_output_dim;
+  }
+
+}
+
+
+}  // End of namespace kaldi
diff --git a/src/nnet3/get-feature-transform.h b/src/nnet3/get-feature-transform.h
new file mode 100644
index 00000000000..d3a52d55552
--- /dev/null
+++ b/src/nnet3/get-feature-transform.h
@@ -0,0 +1,179 @@
+// nnet3/get-feature-transform.h
+
+// Copyright 2009-2011  Jan Silovsky
+//                2013  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_NNET3_GET_FEATURE_TRANSFORM_H_
+#define KALDI_NNET3_GET_FEATURE_TRANSFORM_H_
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "matrix/matrix-lib.h"
+#include "transform/lda-estimate.h"
+
+namespace kaldi {
+
+/**
+   @file
+   This file is modified from transform/lda-estimate.h
+   It contains a class intended to be used in preconditioning
+   data for neural network training.  See the documentation for class
+   FeatureTransformEstimate for more details.
+*/
+
+struct FeatureTransformEstimateOptions {
+  bool remove_offset;
+  int32 dim;
+  BaseFloat within_class_factor;
+  BaseFloat max_singular_value;
+  FeatureTransformEstimateOptions(): remove_offset(true), dim(-1),
+                                     within_class_factor(0.001), max_singular_value(5.0) { }
+
+  void Register(OptionsItf *opts) {
+    opts->Register("remove-offset", &remove_offset, "If true, output an affine "
+                   "transform that makes the projected data mean equal to zero.");
+    opts->Register("dim", &dim, "Dimension to project to with LDA");
+    opts->Register("within-class-factor", &within_class_factor, "If 1.0, do "
+                   "conventional LDA where the within-class variance will be "
+                   "unit in the projected space.  May be set to less than 1.0, "
+                   "which scales the features to have less variance, particularly "
+                   "for dimensions where between-class variance is small. ");
+    opts->Register("max-singular-value", &max_singular_value, "If >0, maximum "
+                   "allowed singular value of final transform (they are floored "
+                   "to this)");
+  }
+};
+
+/**
+     Class for computing a feature transform used for preconditioning of the
+     training data in neural-networks.
+
+     By preconditioning here, all we really mean is an affine transform of the
+     input data-- say if we set up the classification as going from vectors x_i
+     to labels y_i, then this would be a linear transform on X, so we replace
+     x_i with x'_i = A x_i + b.  The statistics we use to obtain this transform
+     are the within-class and between class variance statistics, and the global
+     data mean, that we would use to estimate LDA.  When designing this, we had
+     a few principles in mind:
+        - We want to remove the global mean of the input features (this is
+          well established, I think there is a paper by LeCun explaining why
+          this is a good thing).
+        - We would like the transform to make the training process roughly
+          invariant to linear transformations of the input features, meaning
+          that whatever linear transformation you apply prior to this transform,
+          it should 'undo' it.
+        - We want directions in which there is a lot of between-class variance
+          to be given a higher variance than directions that have mostly
+          within-class variance-- it has been our experience that these
+          'nuisance directions' will interfere with the training if they are
+          given too large a scaling.
+     It is essential to our method that the number of classes is higher than
+     the dimension of the input feature space, which is normal for speech
+     recognition tasks (~5000 > ~250).
+
+     Basically our method is as follows:
+
+       - First subtract the mean.
+       - Get the within-class and between-class stats, as for LDA.
+       - Normalize the space as for LDA, so that the within-class covariance
+         matrix is unit and the between-class covariance matrix is diagonalized
+       - At this stage, if the user asked for dimension reduction then
+         reduce the dimension by taking out dimensions with least between-class
+         variance [note: the current scripts do not do this by default]
+       - Apply a transform that reduces the variance of dimensions
+         with low between-class variance, as we'll describe below.
+       - Finally, do an SVD of the resulting transform, A = U S V^T, apply a
+         maximum to the diagonal elements of the matrix S (e.g. 5.0), and
+         reconstruct A' = U S' V^T; this is the final transform.  The point of
+         this stage is to stop the transform from 'blowing up' any dimensions of
+         the space excessively; this stage was introduced in response to a
+         problem we encountered at one point, and I think normally not very many
+         dimensions of S end up getting capped.
+
+      We need to explain the step that applies the dimension-specific scaling,
+      which we described above as, "Apply a transform that reduces the variance
+      of dimensions with low between-class variance".  For a particular
+      dimension, let the between-class diagonal covariance element be \lambda_i,
+      and the within-class diagonal covariance is 1 at this point (since we
+      have normalized the within-class covariance to unity); hence, the total
+      variance is \lambda_i + 1.
+      Below, "within-class-factor" is a constant that we set by default to
+      0.001.  We scale the i'th dimension of the features by:
+
+         \f$  sqrt( (within-class-factor + \lambda_i) / (1 + \lambda_i) ) \f$
+
+      If \lambda_i >> 1, this scaling factor approaches 1 (we don't need to
+      scale up dimensions with high between-class variance as they already
+      naturally have a higher variance than other dimensions).  As \lambda_i
+      becomes small, this scaling factor approaches sqrt(within-class-factor),
+      so dimensions with very small between-class variance get assigned a small
+      variance equal to within-class-factor, and for dimensions with
+      intermediate between-class variance, they end up with a variance roughly
+      equal to \lambda_i: consider that the variance was originally (1 +
+      \lambda_i), so by scaling the features by approximately sqrt((\lambda_i) /
+      (1 + \lambda_i)), the variance becomes approximately \lambda_i [this is
+      clear after noting that the variance gets scaled by the square of the
+      feature scale].
+ */
+class FeatureTransformEstimate: public LdaEstimate {
+ public:
+  /// Estimates the feature transform matrix M.  Its number of rows is
+  /// opts.dim, or the full feature dimension if opts.dim <= 0 (i.e. no
+  /// dimensionality reduction).  If opts.remove_offset == true, M is output
+  /// with an extra column which corresponds to mean-offset removal (the matrix
+  /// should be multiplied by the feature with a 1 appended to give the correct
+  /// result, as with other Kaldi transforms.)
+  /// "within_cholesky" is a pointer to a TpMatrix that, if non-NULL, will
+  /// be set to the Cholesky factor of the within-class covariance matrix.
+  /// This is used for perturbing features.
+  void Estimate(const FeatureTransformEstimateOptions &opts,
+                Matrix<BaseFloat> *M,
+                TpMatrix<BaseFloat> *within_cholesky) const;
+ protected:
+  static void EstimateInternal(const FeatureTransformEstimateOptions &opts,
+                               const SpMatrix<double> &total_covar,
+                               const SpMatrix<double> &between_covar,
+                               const Vector<double> &mean,
+                               Matrix<BaseFloat> *M,
+                               TpMatrix<BaseFloat> *C);
+};
+
+
+class FeatureTransformEstimateMulti: public FeatureTransformEstimate {
+ public:
+  /// This is as FeatureTransformEstimate, but for use in
+  /// nnet-get-feature-transform-multi.cc, see the usage message
+  /// of that program for a description of what it does.
+  void Estimate(const FeatureTransformEstimateOptions &opts,
+                const std::vector<std::vector<int32> > &indexes,
+                Matrix<BaseFloat> *M) const;
+
+ private:
+  void EstimateTransformPart(const FeatureTransformEstimateOptions &opts,
+                             const std::vector<int32> &indexes,
+                             const SpMatrix<double> &total_covar,
+                             const SpMatrix<double> &between_covar,
+                             const Vector<double> &mean,
+                             Matrix<BaseFloat> *M) const;
+};
+
+
+
+}  // End namespace kaldi
+
+#endif  // KALDI_NNET3_GET_FEATURE_TRANSFORM_H_
diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile
index 74d85efce1c..7212e480dbc 100644
--- a/src/nnet3bin/Makefile
+++ b/src/nnet3bin/Makefile
@@ -20,7 +20,7 @@ BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \
    nnet3-discriminative-compute-from-egs nnet3-latgen-faster-looped \
    nnet3-egs-augment-image nnet3-xvector-get-egs nnet3-xvector-compute \
    nnet3-latgen-grammar nnet3-compute-batch nnet3-latgen-faster-batch \
-   cuda-compiled
+   nnet-get-feature-transform cuda-compiled
 
 OBJFILES =
 
diff --git a/src/nnet3bin/cuda-compiled.cc b/src/nnet3bin/cuda-compiled.cc
index b6de9257657..50a36450412 100644
--- a/src/nnet3bin/cuda-compiled.cc
+++ b/src/nnet3bin/cuda-compiled.cc
@@ -1,4 +1,4 @@
-// nnet2bin/cuda-compiled.cc
+// nnet3bin/cuda-compiled.cc
 
 // Copyright 2014 Johns Hopkins University (author:  Daniel Povey)
 
diff --git a/src/nnet3bin/nnet-get-feature-transform.cc b/src/nnet3bin/nnet-get-feature-transform.cc
new file mode 100644
index 00000000000..43bbaacbe94
--- /dev/null
+++ b/src/nnet3bin/nnet-get-feature-transform.cc
@@ -0,0 +1,85 @@
+// nnet3bin/nnet-get-feature-transform.cc
+
+// Copyright 2013  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "nnet3/get-feature-transform.h"
+
+int main(int argc, char *argv[]) {
+  using namespace kaldi;
+  typedef kaldi::int32 int32;
+  try {
+    const char *usage =
+        "Get feature-projection transform using stats obtained with acc-lda.\n"
+        "See comments in the code of nnet3/get-feature-transform.h for more\n"
+        "information.\n"
+        "\n"
+        "Usage:  nnet-get-feature-transform [options] <matrix-out> <lda-acc-1> <lda-acc-2> ...\n";
+
+    bool binary = true;
+    FeatureTransformEstimateOptions opts;
+    std::string write_cholesky;
+    std::string write_within_covar;
+    ParseOptions po(usage);
+    po.Register("binary", &binary, "Write outputs in binary mode.");
+    po.Register("write-cholesky", &write_cholesky, "If supplied, write to this "
+                "wxfilename the Cholesky factor of the within-class covariance. "
+                "Can be used for perturbing features.  E.g. "
+                "--write-cholesky=exp/nnet5/cholesky.tpmat");
+    po.Register("write-within-covar", &write_within_covar, "If supplied, write "
+                "to this wxfilename the within-class covariance (as a symmetric "
+                "matrix). E.g. --write-within-covar=exp/nnet5/within_covar.mat");
+    opts.Register(&po);
+    po.Read(argc, argv);
+
+    if (po.NumArgs() < 2) {  // need <matrix-out> plus at least one stats file.
+      po.PrintUsage();
+      exit(1);
+    }
+    // Read every <lda-acc-N> file into fte, passing add=true to each Read().
+    FeatureTransformEstimate fte;
+    std::string projection_wxfilename = po.GetArg(1);
+    for (int32 i = 2; i <= po.NumArgs(); i++) {
+      bool binary_in, add = true;
+      Input ki(po.GetArg(i), &binary_in);
+      fte.Read(ki.Stream(), binary_in, add);
+    }
+
+    // Only request the Cholesky factor if an output derived from it is wanted.
+    const bool need_cholesky =
+        !write_cholesky.empty() || !write_within_covar.empty();
+    Matrix<BaseFloat> mat;
+    TpMatrix<BaseFloat> cholesky;
+    fte.Estimate(opts, &mat, need_cholesky ? &cholesky : NULL);
+    WriteKaldiObject(mat, projection_wxfilename, binary);
+    if (!write_cholesky.empty())
+      WriteKaldiObject(cholesky, write_cholesky, binary);
+    if (!write_within_covar.empty()) {
+      // Rebuild the within-class covariance from its Cholesky factor.
+      SpMatrix<BaseFloat> within_var(cholesky.NumRows());
+      within_var.AddTp2(1.0, cholesky, kNoTrans, 0.0);
+      WriteKaldiObject(within_var, write_within_covar, binary);
+    }
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}

From db6b23d9ce7f2b769f5694d96332663baec6b0f5 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Thu, 18 Jul 2019 19:32:23 -0400
Subject: [PATCH 154/163] [src] Fixes RE unusual topologies

---
 src/hmm/hmm-utils.cc    | 16 +++++++++-------
 src/hmm/transitions.cc  | 25 ++++++++++++++-----------
 src/tree/context-dep.cc |  9 +++++++++
 3 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc
index 55eec2e2939..7acf8e8068c 100644
--- a/src/hmm/hmm-utils.cc
+++ b/src/hmm/hmm-utils.cc
@@ -373,7 +373,7 @@ class TidToSelfLoopMapper {
 public:
   // Function object used in MakePrecedingInputSymbolsSameClass and.
   // It maps a transition-ids t to the transition-id on the self-loop
-  // of the destination-state of t (or -1 if there is no self-loop).
+  // of the destination-state of t (or 0 if there is no self-loop).
   //
   // If currently_self_loop_free == true, it also checks that there are no
   // self-loops in the graph (i.e. in the labels it is called with).  This is
@@ -385,8 +385,8 @@ class TidToSelfLoopMapper {
   // equivalence class on labels that's relevant to what the self-loop will be
   // on the following state.
   TidToSelfLoopMapper(const Transitions &trans_model,
-                    const std::vector<int32> &disambig_syms,
-                    bool currently_self_loop_free):
+                      const std::vector<int32> &disambig_syms,
+                      bool currently_self_loop_free):
       trans_model_(trans_model),
       disambig_syms_(disambig_syms),
       currently_self_loop_free_(currently_self_loop_free) { }
@@ -397,7 +397,7 @@ class TidToSelfLoopMapper {
         KALDI_ERR << "AddSelfLoops: graph already has self-loops.";
       return trans_model_.InfoForTransitionId(tid).self_loop_transition_id;
     } else if (tid == fst::kNoLabel) {
-      return -1;  // actually kNoLabel is -1.
+      return 0;
     } else {  // 0 or (presumably) disambiguation symbol.  Map to zero
       int32 big_number = fst::kNontermBigNumber;  // 1000000
       if (tid != 0 && tid < big_number) {
@@ -459,8 +459,10 @@ void AddSelfLoops(const Transitions &trans_model,
 
   StateId num_states = fst->NumStates();
   // self_loop_transition_id gives the transition-id of the self-loop of this
-  // state, or zero or -1 or -2 if it doesn't require a self-loop.
-  std::vector<int32> self_loop_transition_id(num_states, -2);
+  // state, or zero if it doesn't require a self-loop.
+  // -1 is where we don't know the self-loop transition id (if any)
+  // for this state yet.
+  std::vector<int32> self_loop_transition_id(num_states, -1);
 
   for (StateId s = 0; s < num_states; s++) {
     for (MutableArcIterator<VectorFst<Arc> > aiter(fst, s);
@@ -468,7 +470,7 @@ void AddSelfLoops(const Transitions &trans_model,
          aiter.Next()) {
       const Arc &arc = aiter.Value();
       int32 next_state_self_loop_transition_id = f(arc.ilabel);
-      if (self_loop_transition_id[arc.nextstate] == -2) {
+      if (self_loop_transition_id[arc.nextstate] == -1) {
         // Note: next_state_self_loop_transition_id could be
         self_loop_transition_id[arc.nextstate] =
             next_state_self_loop_transition_id;
diff --git a/src/hmm/transitions.cc b/src/hmm/transitions.cc
index 440912b37dc..7319fe0063a 100644
--- a/src/hmm/transitions.cc
+++ b/src/hmm/transitions.cc
@@ -44,11 +44,6 @@ void Transitions::ComputeInfo(const ContextDependencyInterface &ctx_dep) {
   const std::vector<int32> &phones = topo_.GetPhones();
   KALDI_ASSERT(!phones.empty());
 
-  // pdf_info is a set of lists indexed by phone. Each list is indexed by
-  // (pdf-class, self-loop pdf-class) of each arc of that phone, and the element
-  // is a list of possible (pdf, self-loop pdf) pairs that that (pdf-class, self-loop pdf-class)
-  // pair generates.
-  std::vector<std::vector<std::vector<std::pair<int32, int32> > > > pdf_info;
   // pdf_class_pairs is a set of lists indexed by phone. Each list stores
   // all unique (pdf-class, self-loop pdf-class) pairs that that phone
   // can have (on its arcs).
@@ -79,7 +74,7 @@ void Transitions::ComputeInfo(const ContextDependencyInterface &ctx_dep) {
       }
     }
 
-    std::map<std::pair<int32, int32>, std::vector<std::pair<int32, int32> > > &phone_to_arc_list(
+    std::map<std::pair<int32, int32>, std::vector<std::pair<int32, int32> > > &this_to_arc_list(
         to_arc_list[phone]);
     for (StateId state = 0; state < num_states; ++state) {
       for (fst::ArcIterator<fst::StdVectorFst> aiter(entry, state);
@@ -87,17 +82,25 @@ void Transitions::ComputeInfo(const ContextDependencyInterface &ctx_dep) {
         const fst::StdArc &arc(aiter.Value());
         int32 forward_pdf_class = arc.ilabel,
             self_loop_pdf_class = state_to_self_loop_pdf_class[arc.nextstate];
-        auto state_arc_pair = std::make_pair(state, aiter.Position());
+        auto state_arc_pair = std::make_pair(state, int32(aiter.Position()));
         auto pdf_class_pair = std::make_pair(forward_pdf_class, self_loop_pdf_class);
-        phone_to_arc_list[pdf_class_pair].push_back(state_arc_pair);
+        this_to_arc_list[pdf_class_pair].push_back(state_arc_pair);
       }
     }
-    for (auto const &pdf_class_to_arc: phone_to_arc_list)
+    for (auto const &pdf_class_to_arc: this_to_arc_list) {
       pdf_class_pairs[phone].push_back(pdf_class_to_arc.first);
+    }
   }
+  // pdf_info will be a set of lists indexed by phone. Each list is indexed by
+  // the same index as we index into pdf_class_pairs[phone], and the element is
+  // a list of possible (pdf, self-loop pdf) pairs that that (pdf-class,
+  // self-loop pdf-class) pair generates.
+  std::vector<std::vector<std::vector<std::pair<int32, int32> > > > pdf_info;
+
   ctx_dep.GetPdfInfo(phones, pdf_class_pairs, &pdf_info);
 
-  info_.push_back(TransitionIdInfo());  // transition-id is 1-based.
+  info_.push_back(TransitionIdInfo());  // transition-id is 1-based, add a
+                                        // dummy for element zero.
 
   for (int32 i = 0; i < phones.size(); i++) {
     int32 phone = phones[i];
@@ -140,7 +143,7 @@ void Transitions::ComputeDerived() {
     transition.is_final = (entry.Final(arc.nextstate) != fst::StdFst::Weight::Zero());
     transition.transition_cost = arc.weight.Value();
     if (transition.self_loop_pdf_id == -1)
-      transition.self_loop_transition_id = -1;
+      transition.self_loop_transition_id = 0;
     else {
       // Find the self-loop of the destination state:
       int32 arc_index = -1;
diff --git a/src/tree/context-dep.cc b/src/tree/context-dep.cc
index 9d224abfbe7..c9943c0bb1f 100644
--- a/src/tree/context-dep.cc
+++ b/src/tree/context-dep.cc
@@ -199,6 +199,15 @@ void ContextDependency::EnumeratePairs(
   to_pdf_->MultiMap(vec, &forward_pdfs);
   SortAndUniq(&forward_pdfs);
 
+  if (self_loop_pdf_class <= 0) {
+    // Invalid pdf-class because there was no self-loop.  Return pairs
+    // where the self-loop pdf-id is -1.
+    for (int32 forward_pdf: forward_pdfs) {
+      pairs->insert(std::pair<int32,int32>(forward_pdf, -1));
+    }
+    return;
+  }
+
   // get list of possible self-loop pdfs
   vec.clear();
   for (size_t i = 0; i < N_; i++)

From 1cbb69128bedeef005654725f380d9ec40c51634 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Thu, 18 Jul 2019 16:34:02 -0700
Subject: [PATCH 155/163] [src] Fixes RE unusual topologies (#3478)

---
 src/hmm/hmm-utils.cc    | 16 +++++++++-------
 src/hmm/transitions.cc  | 25 ++++++++++++++-----------
 src/tree/context-dep.cc |  9 +++++++++
 3 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc
index 55eec2e2939..7acf8e8068c 100644
--- a/src/hmm/hmm-utils.cc
+++ b/src/hmm/hmm-utils.cc
@@ -373,7 +373,7 @@ class TidToSelfLoopMapper {
 public:
   // Function object used in MakePrecedingInputSymbolsSameClass and.
   // It maps a transition-ids t to the transition-id on the self-loop
-  // of the destination-state of t (or -1 if there is no self-loop).
+  // of the destination-state of t (or 0 if there is no self-loop).
   //
   // If currently_self_loop_free == true, it also checks that there are no
   // self-loops in the graph (i.e. in the labels it is called with).  This is
@@ -385,8 +385,8 @@ class TidToSelfLoopMapper {
   // equivalence class on labels that's relevant to what the self-loop will be
   // on the following state.
   TidToSelfLoopMapper(const Transitions &trans_model,
-                    const std::vector<int32> &disambig_syms,
-                    bool currently_self_loop_free):
+                      const std::vector<int32> &disambig_syms,
+                      bool currently_self_loop_free):
       trans_model_(trans_model),
       disambig_syms_(disambig_syms),
       currently_self_loop_free_(currently_self_loop_free) { }
@@ -397,7 +397,7 @@ class TidToSelfLoopMapper {
         KALDI_ERR << "AddSelfLoops: graph already has self-loops.";
       return trans_model_.InfoForTransitionId(tid).self_loop_transition_id;
     } else if (tid == fst::kNoLabel) {
-      return -1;  // actually kNoLabel is -1.
+      return 0;
     } else {  // 0 or (presumably) disambiguation symbol.  Map to zero
       int32 big_number = fst::kNontermBigNumber;  // 1000000
       if (tid != 0 && tid < big_number) {
@@ -459,8 +459,10 @@ void AddSelfLoops(const Transitions &trans_model,
 
   StateId num_states = fst->NumStates();
   // self_loop_transition_id gives the transition-id of the self-loop of this
-  // state, or zero or -1 or -2 if it doesn't require a self-loop.
-  std::vector<int32> self_loop_transition_id(num_states, -2);
+  // state, or zero if it doesn't require a self-loop.
+  // -1 is where we don't know the self-loop transition id (if any)
+  // for this state yet.
+  std::vector<int32> self_loop_transition_id(num_states, -1);
 
   for (StateId s = 0; s < num_states; s++) {
     for (MutableArcIterator<VectorFst<Arc> > aiter(fst, s);
@@ -468,7 +470,7 @@ void AddSelfLoops(const Transitions &trans_model,
          aiter.Next()) {
       const Arc &arc = aiter.Value();
       int32 next_state_self_loop_transition_id = f(arc.ilabel);
-      if (self_loop_transition_id[arc.nextstate] == -2) {
+      if (self_loop_transition_id[arc.nextstate] == -1) {
         // Note: next_state_self_loop_transition_id could be
         self_loop_transition_id[arc.nextstate] =
             next_state_self_loop_transition_id;
diff --git a/src/hmm/transitions.cc b/src/hmm/transitions.cc
index 440912b37dc..7319fe0063a 100644
--- a/src/hmm/transitions.cc
+++ b/src/hmm/transitions.cc
@@ -44,11 +44,6 @@ void Transitions::ComputeInfo(const ContextDependencyInterface &ctx_dep) {
   const std::vector<int32> &phones = topo_.GetPhones();
   KALDI_ASSERT(!phones.empty());
 
-  // pdf_info is a set of lists indexed by phone. Each list is indexed by
-  // (pdf-class, self-loop pdf-class) of each arc of that phone, and the element
-  // is a list of possible (pdf, self-loop pdf) pairs that that (pdf-class, self-loop pdf-class)
-  // pair generates.
-  std::vector<std::vector<std::vector<std::pair<int32, int32> > > > pdf_info;
   // pdf_class_pairs is a set of lists indexed by phone. Each list stores
   // all unique (pdf-class, self-loop pdf-class) pairs that that phone
   // can have (on its arcs).
@@ -79,7 +74,7 @@ void Transitions::ComputeInfo(const ContextDependencyInterface &ctx_dep) {
       }
     }
 
-    std::map<std::pair<int32, int32>, std::vector<std::pair<int32, int32> > > &phone_to_arc_list(
+    std::map<std::pair<int32, int32>, std::vector<std::pair<int32, int32> > > &this_to_arc_list(
         to_arc_list[phone]);
     for (StateId state = 0; state < num_states; ++state) {
       for (fst::ArcIterator<fst::StdVectorFst> aiter(entry, state);
@@ -87,17 +82,25 @@ void Transitions::ComputeInfo(const ContextDependencyInterface &ctx_dep) {
         const fst::StdArc &arc(aiter.Value());
         int32 forward_pdf_class = arc.ilabel,
             self_loop_pdf_class = state_to_self_loop_pdf_class[arc.nextstate];
-        auto state_arc_pair = std::make_pair(state, aiter.Position());
+        auto state_arc_pair = std::make_pair(state, int32(aiter.Position()));
         auto pdf_class_pair = std::make_pair(forward_pdf_class, self_loop_pdf_class);
-        phone_to_arc_list[pdf_class_pair].push_back(state_arc_pair);
+        this_to_arc_list[pdf_class_pair].push_back(state_arc_pair);
       }
     }
-    for (auto const &pdf_class_to_arc: phone_to_arc_list)
+    for (auto const &pdf_class_to_arc: this_to_arc_list) {
       pdf_class_pairs[phone].push_back(pdf_class_to_arc.first);
+    }
   }
+  // pdf_info will be a set of lists indexed by phone. Each list is indexed by
+  // the same index as we index into pdf_class_pairs[phone], and the element is
+  // a list of possible (pdf, self-loop pdf) pairs that that (pdf-class,
+  // self-loop pdf-class) pair generates.
+  std::vector<std::vector<std::vector<std::pair<int32, int32> > > > pdf_info;
+
   ctx_dep.GetPdfInfo(phones, pdf_class_pairs, &pdf_info);
 
-  info_.push_back(TransitionIdInfo());  // transition-id is 1-based.
+  info_.push_back(TransitionIdInfo());  // transition-id is 1-based, add a
+                                        // dummy for element zero.
 
   for (int32 i = 0; i < phones.size(); i++) {
     int32 phone = phones[i];
@@ -140,7 +143,7 @@ void Transitions::ComputeDerived() {
     transition.is_final = (entry.Final(arc.nextstate) != fst::StdFst::Weight::Zero());
     transition.transition_cost = arc.weight.Value();
     if (transition.self_loop_pdf_id == -1)
-      transition.self_loop_transition_id = -1;
+      transition.self_loop_transition_id = 0;
     else {
       // Find the self-loop of the destination state:
       int32 arc_index = -1;
diff --git a/src/tree/context-dep.cc b/src/tree/context-dep.cc
index 9d224abfbe7..c9943c0bb1f 100644
--- a/src/tree/context-dep.cc
+++ b/src/tree/context-dep.cc
@@ -199,6 +199,15 @@ void ContextDependency::EnumeratePairs(
   to_pdf_->MultiMap(vec, &forward_pdfs);
   SortAndUniq(&forward_pdfs);
 
+  if (self_loop_pdf_class <= 0) {
+    // Invalid pdf-class because there was no self-loop.  Return pairs
+    // where the self-loop pdf-id is -1.
+    for (int32 forward_pdf: forward_pdfs) {
+      pairs->insert(std::pair<int32,int32>(forward_pdf, -1));
+    }
+    return;
+  }
+
   // get list of possible self-loop pdfs
   vec.clear();
   for (size_t i = 0; i < N_; i++)

From 6ba25b5ab700d74c1e01146af935ec246f17cb61 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Fri, 19 Jul 2019 15:30:59 -0700
Subject: [PATCH 156/163] [src] Fixes RE unusual topologies (#3481)

---
 egs/wsj/s5/steps/nnet3/chain/gen_topo5.py | 22 ++++++++++------------
 src/fstext/fstext-utils.h                 |  3 ++-
 src/hmm/hmm-utils.cc                      |  5 +++--
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo5.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo5.py
index 1583966b58c..9df502545a5 100755
--- a/egs/wsj/s5/steps/nnet3/chain/gen_topo5.py
+++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo5.py
@@ -2,6 +2,9 @@
 
 # Copyright 2012  Johns Hopkins University (author: Daniel Povey)
 
+# This script was modified around 11.11.2016, when the code was extended to
+# support having a different pdf-class on the self loop.
+
 # Generate a topology file.  This allows control of the number of states in the
 # non-silence HMMs, and in the silence HMMs.  This is a modified version of
 # 'utils/gen_topo.pl' that generates a different type of topology, one that we
@@ -29,22 +32,17 @@
 nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ]
 all_phones = silence_phones +  nonsilence_phones
 
+
 print("<Topology>")
 print("<TopologyEntry>")
 print("<ForPhones>")
 print(" ".join([str(x) for x in all_phones]))
 print("</ForPhones>")
-# state 0 is nonemitting
-print("<State> 0 <Transition> 1 0.5 <Transition> 2 0.5 </State>")
-# state 1 is for when we traverse it in 1 state
-print("<State> 1 <PdfClass> 0 <Transition> 4 1.0 </State>")
-# state 2 is for when we traverse it in >1 state, for the first state.
-print("<State> 2 <PdfClass> 2 <Transition> 3 1.0 </State>")
-# state 3 is for the self-loop.  Use pdf-class 1 here so that the default
-# phone-class clustering (which uses only pdf-class 1 by default) gets only
-# stats from longer phones.
-print("<State> 3 <PdfClass> 1 <Transition> 3 0.5 <Transition> 4 0.5 </State>")
-print("<State> 4 </State>")
+print("0  1  1  0.69314718055")
+print("0  2  3  0.69314718055")
+print("1  1  2  0.69314718055")
+print("1  0.69314718055")
+print("2  0.0")
+print("")
 print("</TopologyEntry>")
 print("</Topology>")
-
diff --git a/src/fstext/fstext-utils.h b/src/fstext/fstext-utils.h
index 25c4a53c633..b220dd59c62 100644
--- a/src/fstext/fstext-utils.h
+++ b/src/fstext/fstext-utils.h
@@ -259,7 +259,8 @@ void MakePrecedingInputSymbolsSame(bool start_is_epsilon, MutableFst<Arc> *fst);
 
 
 /// As MakePrecedingInputSymbolsSame, but takes a functor object that maps
-/// labels to (int32) classes
+/// labels to (int32) classes.  Caution: it must not map kNoLabel (-1)
+/// to the same value as any real symbol.
 template<class Arc, class F>
 void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst<Arc> *fst, const F &f);
 
diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc
index 7acf8e8068c..7bd6070f151 100644
--- a/src/hmm/hmm-utils.cc
+++ b/src/hmm/hmm-utils.cc
@@ -381,7 +381,8 @@ class TidToSelfLoopMapper {
 
   // This maps valid transition-ids to transition states, and maps all other
   // symbols (i.e. epsilon symbols, disambig symbols, and symbols with values
-  // over 100000/kNontermBigNumber) to zero.  Its point is to provide an
+  // over 100000/kNontermBigNumber) to zero.  (and -1 == kNoLabel to -1).
+  // Its purpose is to provide an
   // equivalence class on labels that's relevant to what the self-loop will be
   // on the following state.
   TidToSelfLoopMapper(const Transitions &trans_model,
@@ -397,7 +398,7 @@ class TidToSelfLoopMapper {
         KALDI_ERR << "AddSelfLoops: graph already has self-loops.";
       return trans_model_.InfoForTransitionId(tid).self_loop_transition_id;
     } else if (tid == fst::kNoLabel) {
-      return 0;
+      return -1;
     } else {  // 0 or (presumably) disambiguation symbol.  Map to zero
       int32 big_number = fst::kNontermBigNumber;  // 1000000
       if (tid != 0 && tid < big_number) {

From 7a535037e4734aa8b22a5a2b4cc64d56cb72cafb Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Mon, 22 Jul 2019 10:49:31 -0700
Subject: [PATCH 157/163] [src] Fixes RE unusual topologies (#3480)

* [src] Fixes RE unusual topologies

* [src] Some fixes RE unusual topologies.

From 3a1e523bac258b06d0263b52bdd1ac37d7d6f2d8 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Tue, 3 Sep 2019 18:00:02 +0800
Subject: [PATCH 158/163] Kaldi10 feature-changes +  attention/transformer
 scripts (#3562)

* [src] Fixes RE unusual topologies

* [src] Various feature-extraction changes/simplifications, made while syncing with kaldi10feat python code

* Some preliminary work on attention, saving a checkpoint

* [src] Rewrite to nnet-computation-graph code to fix graph-building bug

* [scripts] Add attention example

* [egs] Add attention example

* [scripts] Add use-relu option in attention.py, not used currently.
---
 .../s5/local/chain/run_att.sh                 |   1 +
 .../s5/local/chain/tuning/run_att_1a.sh       | 300 ++++++++++
 .../s5/steps/libs/nnet3/xconfig/attention.py  | 281 +++++++++
 egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py |   3 +-
 src/bin/cuda-gpu-available.cc                 |   6 +-
 src/cblasext/Makefile                         |   3 +-
 src/cudafeat/feature-mfcc-cuda.cu             |  11 +-
 src/feat/Makefile                             |   2 +-
 src/feat/feature-fbank-test.cc                |  44 --
 src/feat/feature-fbank.cc                     |  32 +-
 src/feat/feature-fbank.h                      |  16 +-
 src/feat/feature-functions.cc                 |  14 +-
 src/feat/feature-mfcc-test.cc                 |  33 -
 src/feat/feature-mfcc.cc                      |  30 +-
 src/feat/feature-mfcc.h                       |   8 +-
 src/feat/feature-sdc-test.cc                  |  23 +-
 src/feat/feature-spectrogram.cc               |  82 ---
 src/feat/feature-spectrogram.h                | 117 ----
 src/feat/feature-window.cc                    |   7 +-
 src/feat/feature-window.h                     |  32 +-
 src/feat/online-feature-test.cc               |   8 +-
 src/matrix/matrix-functions.cc                | 566 +-----------------
 src/matrix/matrix-functions.h                 |  73 +--
 src/matrix/matrix-lib-test.cc                 | 143 +----
 src/nnet3/nnet-attention-component.cc         |   8 +-
 src/nnet3/nnet-attention-component.h          |  10 +-
 src/nnet3/nnet-computation-graph.cc           | 310 +++++-----
 src/nnet3/nnet-computation-graph.h            |  91 +--
 tools/Makefile                                |  11 +-
 tools/extras/install_openblas.sh              |  26 +-
 30 files changed, 904 insertions(+), 1387 deletions(-)
 create mode 120000 egs/mini_librispeech/s5/local/chain/run_att.sh
 create mode 100755 egs/mini_librispeech/s5/local/chain/tuning/run_att_1a.sh
 delete mode 100644 src/feat/feature-spectrogram.cc
 delete mode 100644 src/feat/feature-spectrogram.h

diff --git a/egs/mini_librispeech/s5/local/chain/run_att.sh b/egs/mini_librispeech/s5/local/chain/run_att.sh
new file mode 120000
index 00000000000..bf5d5a0c0f1
--- /dev/null
+++ b/egs/mini_librispeech/s5/local/chain/run_att.sh
@@ -0,0 +1 @@
+tuning/run_att_1a.sh
\ No newline at end of file
diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_att_1a.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_att_1a.sh
new file mode 100755
index 00000000000..2238e66f041
--- /dev/null
+++ b/egs/mini_librispeech/s5/local/chain/tuning/run_att_1a.sh
@@ -0,0 +1,300 @@
+#!/bin/bash
+
+# run_att_1a.sh is similar to run_tdnn_1h.sh but with some TDNN layers replaced
+# with attention layers.
+
+
+# Note: below, att1a and att1a2 are two different runs of the same script.
+#
+# local/chain/compare_wer.sh exp/chain/tdnn1h_sp exp/chain/att1a_sp exp/chain/att1a2_sp
+# System                tdnn1h_sp  att1a_sp att1a2_sp
+#WER dev_clean_2 (tgsmall)      12.27     12.16     12.65
+#WER dev_clean_2 (tglarge)       8.61      8.68      8.94
+# Final train prob        -0.0462   -0.0434   -0.0425
+# Final valid prob        -0.0814   -0.0807   -0.0814
+# Final train prob (xent)   -1.1354   -1.0721   -1.0647
+# Final valid prob (xent)   -1.3680   -1.3254   -1.3263
+# Num-params                 5210944   4193064   4193064
+
+
+# steps/info/chain_dir_info.pl exp/chain/tdnn1h_sp exp/chain/att1a_sp
+# exp/chain/tdnn1h_sp: num-iters=34 nj=2..5 num-params=5.2M dim=40+100->2336 combine=-0.049->-0.047 (over 3) xent:train/valid[21,33,final]=(-1.36,-1.16,-1.14/-1.57,-1.40,-1.37) logprob:train/valid[21,33,final]=(-0.061,-0.051,-0.046/-0.094,-0.089,-0.081)
+# exp/chain/att1a_sp: num-iters=34 nj=2..5 num-params=4.2M dim=40+100->2336 combine=-0.046->-0.044 (over 4) xent:train/valid[21,33,final]=(-1.30,-1.10,-1.07/-1.53,-1.38,-1.33) logprob:train/valid[21,33,final]=(-0.057,-0.049,-0.043/-0.091,-0.087,-0.081)
+# exp/chain/att1a2_sp: num-iters=34 nj=2..5 num-params=4.2M dim=40+100->2336 combine=-0.046->-0.044 (over 3) xent:train/valid[21,33,final]=(-1.30,-1.08,-1.06/-1.53,-1.36,-1.33) logprob:train/valid[21,33,final]=(-0.056,-0.048,-0.043/-0.091,-0.085,-0.081)
+
+
+# Set -e here so that we catch if any executable fails immediately
+set -euo pipefail
+
+# First the options that are passed through to run_ivector_common.sh
+# (some of which are also used in this script directly).
+stage=0
+decode_nj=10
+train_set=train_clean_5
+test_sets=dev_clean_2
+gmm=tri3b
+nnet3_affix=
+
+# The rest are configs specific to this script.  Most of the parameters
+# are just hardcoded at this level, in the commands below.
+affix=1a   # affix for the directory name
+tree_affix=
+train_stage=-10
+get_egs_stage=-10
+decode_iter=
+
+# training options
+# training chunk-options
+chunk_width=140,100,160
+dropout_schedule='0,0@0.20,0.3@0.50,0'
+common_egs_dir=
+xent_regularize=0.1
+
+# training options
+srand=0
+remove_egs=true
+reporting_email=
+
+#decode options
+test_online_decoding=true  # if true, it will run the last decoding stage.
+
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 11" if you have already
+# run those things.
+local/nnet3/run_ivector_common.sh --stage $stage \
+                                  --train-set $train_set \
+                                  --gmm $gmm \
+                                  --nnet3-affix "$nnet3_affix" || exit 1;
+
+# Problem: We have removed the "train_" prefix of our training set in
+# the alignment directory names! Bad!
+gmm_dir=exp/$gmm
+ali_dir=exp/${gmm}_ali_${train_set}_sp
+tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix}
+lang=data/lang_chain
+lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats
+dir=exp/chain${nnet3_affix}/att${affix}_sp
+train_data_dir=data/${train_set}_sp_hires
+lores_train_data_dir=data/${train_set}_sp
+train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
+
+for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
+    $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+if [ $stage -le 10 ]; then
+  echo "$0: creating lang directory $lang with chain-type topology"
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  if [ -d $lang ]; then
+    if [ $lang/L.fst -nt data/lang/L.fst ]; then
+      echo "$0: $lang already exists, not overwriting it; continuing"
+    else
+      echo "$0: $lang already exists and seems to be older than data/lang..."
+      echo " ... not sure what to do.  Exiting."
+      exit 1;
+    fi
+  else
+    cp -r data/lang $lang
+    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+    # Use our special topology... note that later on may have to tune this
+    # topology.
+    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+  fi
+fi
+
+if [ $stage -le 11 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \
+    data/lang $gmm_dir $lat_dir
+  rm $lat_dir/fsts.*.gz # save space
+fi
+
+if [ $stage -le 12 ]; then
+  # Build a tree using our new topology.  We know we have alignments for the
+  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
+  # those.  The num-leaves is always somewhat less than the num-leaves from
+  # the GMM baseline.
+   if [ -f $tree_dir/final.mdl ]; then
+     echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+     exit 1;
+  fi
+  steps/nnet3/chain/build_tree.sh \
+    --frame-subsampling-factor 3 \
+    --context-opts "--context-width=2 --central-position=1" \
+    --cmd "$train_cmd" 3500 ${lores_train_data_dir} \
+    $lang $ali_dir $tree_dir
+fi
+
+
+if [ $stage -le 13 ]; then
+  mkdir -p $dir
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
+
+  tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true"
+
+  tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66"
+  linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0"
+  prefinal_opts="l2-regularize=0.03"
+  output_opts="l2-regularize=0.015"
+  attention_opts="l2-regularize=0.01 bottleneck-dim=96 num-heads=4 value-dim=50 key-dim=50 time-stride=3 num-left-inputs=4 num-right-inputs=2 bypass-scale=0.66"
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768
+  tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1
+  tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1
+  tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1
+  tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0
+  tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  attention-block name=att7  $attention_opts
+  tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  attention-block name=att9  $attention_opts
+  tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  linear-component name=prefinal-l dim=192 $linear_opts
+
+  ## adding the layers for chain branch
+  prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768
+  output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
+
+  # adding the layers for xent branch
+  prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+
+if [ $stage -le 14 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage=$train_stage \
+    --cmd="$decode_cmd" \
+    --feat.online-ivector-dir=$train_ivector_dir \
+    --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient=0.1 \
+    --chain.l2-regularize=0.0 \
+    --chain.apply-deriv-weights=false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --trainer.dropout-schedule $dropout_schedule \
+    --trainer.add-option="--optimization.memory-compression-level=2" \
+    --trainer.srand=$srand \
+    --trainer.max-param-change=2.0 \
+    --trainer.num-epochs=20 \
+    --trainer.frames-per-iter=3000000 \
+    --trainer.optimization.num-jobs-initial=2 \
+    --trainer.optimization.num-jobs-final=5 \
+    --trainer.optimization.initial-effective-lrate=0.002 \
+    --trainer.optimization.final-effective-lrate=0.0002 \
+    --trainer.num-chunk-per-minibatch=128,64 \
+    --egs.chunk-width=$chunk_width \
+    --egs.dir="$common_egs_dir" \
+    --egs.opts="--frames-overlap-per-eg 0" \
+    --cleanup.remove-egs=$remove_egs \
+    --use-gpu=true \
+    --reporting.email="$reporting_email" \
+    --feat-dir=$train_data_dir \
+    --tree-dir=$tree_dir \
+    --lat-dir=$lat_dir \
+    --dir=$dir  || exit 1;
+fi
+
+if [ $stage -le 15 ]; then
+  # Note: it's not important to give mkgraph.sh the lang directory with the
+  # matched topology (since it gets the topology file from the model).
+  utils/mkgraph.sh \
+    data/lang_test_tgsmall \
+    $tree_dir $tree_dir/graph_tgsmall || exit 1;
+fi
+
+if [ $stage -le 16 ]; then
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  rm $dir/.error 2>/dev/null || true
+
+  for data in $test_sets; do
+    (
+      nspk=$(wc -l <data/${data}_hires/spk2utt)
+      steps/nnet3/decode.sh \
+          --acwt 1.0 --post-decode-acwt 10.0 \
+          --frames-per-chunk $frames_per_chunk \
+          --nj $nspk --cmd "$decode_cmd"  --num-threads 4 \
+          --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \
+          $tree_dir/graph_tgsmall data/${data}_hires ${dir}/decode_tgsmall_${data} || exit 1
+      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+        data/lang_test_{tgsmall,tglarge} \
+       data/${data}_hires ${dir}/decode_{tgsmall,tglarge}_${data} || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+# Not testing the 'looped' decoding separately, because for
+# TDNN systems it would give exactly the same results as the
+# normal decoding.
+
+if $test_online_decoding && [ $stage -le 17 ]; then
+  # note: if the features change (e.g. you add pitch features), you will have to
+  # change the options of the following command line.
+  steps/online/nnet3/prepare_online_decoding.sh \
+    --mfcc-config conf/mfcc_hires.conf \
+    $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online
+
+  rm $dir/.error 2>/dev/null || true
+
+  for data in $test_sets; do
+    (
+      nspk=$(wc -l <data/${data}_hires/spk2utt)
+      # note: we just give it "data/${data}" as it only uses the wav.scp, the
+      # feature type does not matter.
+      steps/online/nnet3/decode.sh \
+        --acwt 1.0 --post-decode-acwt 10.0 \
+        --nj $nspk --cmd "$decode_cmd" \
+        $tree_dir/graph_tgsmall data/${data} ${dir}_online/decode_tgsmall_${data} || exit 1
+      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+        data/lang_test_{tgsmall,tglarge} \
+       data/${data}_hires ${dir}_online/decode_{tgsmall,tglarge}_${data} || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+
+exit 0;
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/attention.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/attention.py
index db4cb392f10..21874ad6923 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/attention.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/attention.py
@@ -247,3 +247,284 @@ def _add_components(self, input_desc, input_dim, nonlinearities):
             configs.append(line)
             cur_node = '{0}.{1}'.format(self.name, nonlinearity)
         return configs
+
+
+# This class is for parsing lines like
+#  'attention-block dim=768  bottleneck-dim=128 num-heads=8 value-dim=50 key-dim=50 time-stride=3 num-left-inputs=30 num-right-inputs=10 bypass-scale=0.66'
+#
+#  It is a little like a TDNNF-layer, but with attention in the middle and no
+#  ReLU.  Note: as of now, there is no nonlinearity other than what comes from
+#  the attention component itself (it has a softmax).  Imagine the input and
+#  output dim of the layer is largish, like 768.
+#
+#  So we go, 768 --(linear with orthonormal)--> 128 --(affine, then NormalizeComponent)--> attention-input-dim  --(attention)--> (50+context-dim)*8  \
+#            --(linear with orthonormal)--> 128 --(linear)--> 768 --> NormalizeComponent, then add residual connection from original 768-dim input.
+#
+#  ... where attention-input-dim equals num-heads * (value-dim + 2*key-dim + context-dim)
+#  and context-dim = (num-left-inputs + 1 + num-right-inputs);
+#     in this case it's 8 * (50 + 2*50 + (30+10+1)) = 8 * 191 = 1528.
+#
+#
+# Parameters of the class, and their defaults:
+#   input='[-1]'               [Descriptor giving the input of the layer.]
+#   bottleneck-dim=-1              [bottleneck dimension, e.g. 128.]
+#   num-heads=-1               [Number of attention heads, e.g. 8]
+#   value-dim=-1               [Dimension of values (the things which get weighted-averaged
+#                               and then output).  E.g. 50]
+#   key-dim=-1                 [Dimension of the keys, e.g. 50.  Affects the query
+#                               dimension, but that's larger by context_dim,
+#                               where context_dim == num-left-inputs+1+num-right-inputs.
+#                               That's for the encoding of the position of the input frame.]
+#   dim=-1                     [Dimension of the output of this layer (after the bottleneck;
+#                               e.g. 768).  Defaults to the dimension of the input.]
+#   time-stride=1              [Time stride, dictates the spacing of the inputs to this
+#                               layer.  E.g. might be 3 in typical TDNN-F setups.]
+#   num-left-inputs=-1         [Number of inputs to the left that we use.  Must be specified.]
+#   num-right-inputs=-1         [Number of inputs to the right that we use.  Must be specified.]
+#   num-left-inputs-required: -1   [This affects the left/right context that the network will
+#                                have, i.e. how many frames of input it will insist on having.
+#                                It affects the behavior at chunk boundaries; larger will tend
+#                                to be slower but more accurate.  Note: the default of -1 means:
+#                                use the same as num-left-inputs].
+#   num-right-inputs-required: -1  [See comment for num-left-inputs-required]
+#   output-context:  True        [If true, the softmax weights will be an additional
+#                                output of the attention heads.]
+#   key-scale: 0.0               [If >0.0, becomes a scaling factor on the keys.  Otherwise, we
+#                                 use the default value of 1.0 / sqrt(key-dim).]
+#
+#
+#  bypass-scale : 0.66          [Scale on the input in the residual connection.]
+#  target-rms:   1.0            [target-rms of the final NormalizeComponent, i.e. scaling on its output]
+#
+#  Extra configs that are passed into the affine and linear components:
+#   learning-rate-factor=1.0   [This can be used to make the affine component
+#                               train faster or slower].
+#   max-change=0.75    [maximum change per iteration, per component]
+#   l2-regularize=0.0  [l2 regularization constant for linear and affine components.]
+#
+#  use-relu=False      [If true, add relu]
+#
+#   Documentation for the rest of the parameters (related to the
+#   attention component) can be found in nnet-attention-component.h
+
+
+class XconfigAttentionBlock(XconfigLayerBase):
+    def __init__(self, first_token, key_to_value, prev_names = None):
+        # This class only handles the 'attention-block' layer type, so verify
+        # the first token before delegating to the base-class constructor.
+        assert first_token == 'attention-block'
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        # note: self.config['input'] is a descriptor, '[-1]' means output
+        # the most recent layer.
+        self.config = { 'input':'[-1]',
+                        'dim': -1,
+                        'bottleneck-dim': -1,
+                        'num-heads': -1,
+                        'value-dim': -1,
+                        'key-dim': -1,
+                        'dim': -1,
+                        'time-stride': 1,
+                        'num-left-inputs': -1,
+                        'num-right-inputs': -1,
+                        'learning-rate-factor': 1.0,
+                        'max-change' : 0.75,
+                        'ng-affine-options' : '',
+                        'l2-regularize': 0.0,
+                        'num-left-inputs-required': -1,
+                        'num-right-inputs-required': -1,
+                        'output-context': True,
+                        'target-rms': 1.0,
+                        'key-scale': 0.0,
+                        'bypass-scale': 0.66,
+                        'use-relu': False}
+
+
+    def check_configs(self):
+        for x in [ 'bottleneck-dim', 'num-heads', 'value-dim', 'key-dim' ]:
+            if self.config[x] <= 0:
+                raise RuntimeError("Expected {} to be positive, got {}".format(x, self.config[x]))
+        for x in ['num-left-inputs', 'num-right-inputs' ]:
+            if self.config[x] < 0:
+                raise RuntimeError("Expected {} to be nonnegative, got {}".format(x, self.config[x]))
+        # Not checking everything here.
+        if self.config['learning-rate-factor'] <= 0.0:
+            raise RuntimeError("learning-rate-factor has invalid value {0}"
+                               .format(self.config['learning-rate-factor']))
+        if self.config['key-scale'] == 0.0:
+            self.config['key-scale'] = 1.0 / math.sqrt(self.config['key-dim'])
+
+    def output_name(self, auxiliary_output=None):
+        # at a later stage we might want to expose even the pre-nonlinearity
+        # vectors
+        return '{0}.noop'.format(self.name)
+
+    def attention_input_dim(self):
+        context_dim = (self.config['num-left-inputs'] +
+                       self.config['num-right-inputs'] + 1)
+        num_heads = self.config['num-heads']
+        key_dim = self.config['key-dim']
+        value_dim = self.config['value-dim']
+        query_dim = key_dim + context_dim;
+        return num_heads * (key_dim + value_dim + query_dim)
+
+    def attention_output_dim(self):
+        context_dim = (self.config['num-left-inputs'] +
+                       self.config['num-right-inputs'] + 1)
+        num_heads = self.config['num-heads']
+        value_dim = self.config['value-dim']
+        return (num_heads *
+                (value_dim +
+                 (context_dim if self.config['output-context'] else 0)))
+
+    def output_dim(self, auxiliary_output = None):
+        dim = self.config['dim']
+        if dim > 0:
+            return dim
+        else:
+            return self.descriptors['input']['dim']
+
+    def get_full_config(self):
+        ans = []
+        config_lines = self._generate_config()
+
+        for line in config_lines:
+            for config_name in ['ref', 'final']:
+                # we do not support user specified matrices in this layer
+                # so 'ref' and 'final' configs are the same.
+                ans.append((config_name, line))
+        return ans
+
+
+    def _generate_config(self):
+        # by 'descriptor_final_string' we mean a string that can appear in
+        # config-files, i.e. it contains the 'final' names of nodes.
+        input_desc = self.descriptors['input']['final-string']
+        input_dim = self.descriptors['input']['dim']
+        output_dim = self.config['dim']
+        if output_dim <= 0:
+            output_dim = input_dim
+        bottleneck_dim = self.config['bottleneck-dim']
+        attention_input_dim = self.attention_input_dim()
+        attention_output_dim = self.attention_output_dim()
+        target_rms = self.config['target-rms']
+        max_change = self.config['max-change']
+        l2_regularize = self.config['l2-regularize']
+        learning_rate_factor=self.config['learning-rate-factor']
+
+        learning_rate_option=('learning-rate-factor={0}'.format(learning_rate_factor)
+                              if learning_rate_factor != 1.0 else '')
+        l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize)
+                                if l2_regularize != 0.0 else '')
+
+        common_options=("{lroption} {l2option} max-change={max_change} "
+                        "".format(lroption = learning_rate_option,
+                                  l2option = l2_regularize_option,
+                                  max_change = max_change))
+
+
+        configs = []
+
+
+        # The first linear component
+        line = ('component name={0}.linear1 type=LinearComponent '
+                'input-dim={1} output-dim={2} '
+                '{3} orthonormal-constraint=-1 '
+                ''.format(self.name, input_dim, bottleneck_dim,
+                          common_options))
+
+        configs.append(line)
+        line = ('component-node name={0}.linear1 component={0}.linear1 input={1} '
+                ''.format(self.name, input_desc))
+        configs.append(line)
+
+        # The first affine component
+        line = ('component name={0}.affine1 type=NaturalGradientAffineComponent '
+                'input-dim={1} output-dim={2} '
+                '{3}'.format(self.name, bottleneck_dim, attention_input_dim,
+                             common_options))
+        configs.append(line)
+        line = ('component-node name={0}.affine1 component={0}.affine1 input={0}.linear1'
+                ''.format(self.name, input_desc))
+        configs.append(line)
+
+
+        line = ('component name={0}.layernorm1 type=NormalizeComponent dim={1} '
+                ' '.format(self.name, attention_input_dim))
+        configs.append(line)
+        line = ('component-node name={0}.layernorm1 component={0}.layernorm1 '
+                'input={0}.affine1 '.format(self.name))
+        configs.append(line)
+        cur_name='layernorm1'
+
+        # The attention component
+        line = ('component name={name}.attention type=RestrictedAttentionComponent '
+                'value-dim={v} key-dim={k} num-left-inputs={nl} '
+                'num-right-inputs={nr} num-left-inputs-required={nlr}'
+                ' num-right-inputs-required={nrr} output-context={oc}'
+                ' time-stride={ts} num-heads={nh} key-scale={ks}'
+                ''.format(name=self.name,
+                          v=self.config['value-dim'], k=self.config['key-dim'],
+                          nl=self.config['num-left-inputs'],
+                          nr=self.config['num-right-inputs'],
+                          nlr=self.config['num-left-inputs-required'],
+                          nrr=self.config['num-right-inputs-required'],
+                          oc=self.config['output-context'],
+                          ts=self.config['time-stride'],
+                          nh=self.config['num-heads'],
+                          ks=self.config['key-scale']))
+        configs.append(line)
+        line = ('component-node name={0}.attention component={0}.attention input={0}.{1}'
+                ''.format(self.name, cur_name))
+        configs.append(line)
+
+        # The second linear component
+        line = ('component name={0}.linear2 type=LinearComponent '
+                'input-dim={1} output-dim={2} orthonormal-constraint=-1 '
+                '{3}'.format(self.name, attention_output_dim, bottleneck_dim,
+                             common_options))
+        configs.append(line)
+        line = ('component-node name={0}.linear2 component={0}.linear2 '
+                'input={0}.attention '.format(self.name))
+        configs.append(line)
+
+        # The third linear component
+        line = ('component name={0}.linear3 type=LinearComponent '
+                'input-dim={1} output-dim={2} '
+                '{3}'.format(self.name, bottleneck_dim, output_dim,
+                             common_options))
+        configs.append(line)
+        line = ('component-node name={0}.linear3 component={0}.linear3 '
+                'input={0}.linear2 '.format(self.name))
+        configs.append(line)
+
+
+        if self.config['use-relu']:
+            line = ('component name={0}.relu type=RectifiedLinearComponent dim={1} '
+                    ''.format(self.name, output_dim))
+            configs.append(line)
+            line = ('component-node name={0}.relu component={0}.relu '
+                    'input={0}.linear3 '.format(self.name))
+            configs.append(line)
+            cur_name = 'relu'
+        else:
+            cur_name = 'linear3'
+
+
+        line = ('component name={0}.layernorm2 type=NormalizeComponent dim={1} '
+                'target-rms={2} '.format(self.name, output_dim, target_rms))
+        configs.append(line)
+        line = ('component-node name={0}.layernorm2 component={0}.layernorm2 '
+                'input={0}.{1} '.format(self.name, cur_name))
+        configs.append(line)
+
+
+        line = ('component name={0}.noop type=NoOpComponent dim={1}'.format(
+            self.name, output_dim))
+        configs.append(line)
+        line = ('component-node name={name}.noop component={name}.noop input=Sum(Scale({b}, {i}), {name}.layernorm2)'
+                ''.format(name=self.name, b=self.config['bypass-scale'], i=input_desc))
+        configs.append(line)
+
+        return configs
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
index b540423e3cd..fa42b5baf0c 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
@@ -84,7 +84,8 @@
         'scale-component':  xlayers.XconfigPerElementScaleComponent,
         'dim-range-component': xlayers.XconfigDimRangeComponent,
         'offset-component':  xlayers.XconfigPerElementOffsetComponent,
-        'combine-feature-maps-layer': xlayers.XconfigCombineFeatureMapsLayer
+        'combine-feature-maps-layer': xlayers.XconfigCombineFeatureMapsLayer,
+        'attention-block': xlayers.XconfigAttentionBlock
 }
 
 # Turn a config line and a list of previous layers into
diff --git a/src/bin/cuda-gpu-available.cc b/src/bin/cuda-gpu-available.cc
index 923ed8280cf..67063fc0f96 100644
--- a/src/bin/cuda-gpu-available.cc
+++ b/src/bin/cuda-gpu-available.cc
@@ -23,6 +23,7 @@
 #endif
 
 #include "base/kaldi-common.h"
+#include "util/parse-options.h"
 #include "cudamatrix/cu-device.h"
 #include "cudamatrix/cu-matrix.h"
 
@@ -50,8 +51,9 @@ int main(int argc, char *argv[]) try {
         "exit-code: 0 = success, 1 = compiled without GPU support, -1 = error\n"
         "\n"
         "Usage:  cuda-gpu-available\n";
-  // Remove unused variable warning
-  (void) usage;
+
+  ParseOptions po(usage);
+  po.Read(argc, argv);
 
   char hostname[100] = "UNKNOWN-HOSTNAME";
 #if !defined(_MSC_VER) && !defined(__CYGWIN__)
diff --git a/src/cblasext/Makefile b/src/cblasext/Makefile
index a4f6cb320f7..a3d684cdee7 100644
--- a/src/cblasext/Makefile
+++ b/src/cblasext/Makefile
@@ -10,7 +10,7 @@ include ../kaldi.mk
 
 # you can uncomment matrix-lib-speed-test if you want to do the speed tests.
 
-TESTFILES = cblasext-test
+TESTFILES =
 
 OBJFILES = cblas-extensions.o
 
@@ -19,4 +19,3 @@ LIBNAME = kaldi-cblasext
 ADDLIBS = ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
-
diff --git a/src/cudafeat/feature-mfcc-cuda.cu b/src/cudafeat/feature-mfcc-cuda.cu
index 84b9241a6b0..351f60fe094 100644
--- a/src/cudafeat/feature-mfcc-cuda.cu
+++ b/src/cudafeat/feature-mfcc-cuda.cu
@@ -120,8 +120,7 @@ __global__ void mel_banks_compute_kernel(int32_t num_frames, float energy_floor,
 }
 
 __global__ void process_window_kernel(
-    int frame_length, bool remove_dc_offset,
-    const float *windowing, float *windows,
+    int frame_length, const float *windowing, float *windows,
     int32_t ldw) {
   // Specialize WarpReduce for type float
   typedef cub::BlockReduce<float, CU1DBLOCK> BlockReduce;
@@ -144,10 +143,6 @@ __global__ void process_window_kernel(
     wdot += wval * wval;
 
     float windowing_mul = 1;
-    if (remove_dc_offset == false) {
-      // we are done here so set windowing multiplication on write.
-      windowing_mul = windowing[idx];
-    }
     // write dithered output
     window[idx] = wval * windowing_mul;
   }
@@ -155,7 +150,7 @@ __global__ void process_window_kernel(
   // CAUTION (dp): when various configs were removed I tried to simplify this code
   // by removing things that weren't supported.  Its structure may not make sense
   // any more even if I did that correctly.
-  if (remove_dc_offset) {
+  { // This block computes and applies the DC offset.
     // we will recompute this below
     wdot = 0.0f;
     // use cub to reduce
@@ -357,7 +352,7 @@ void CudaMfcc::ProcessWindows(int num_frames,
   KALDI_ASSERT(fft_num_frames % fft_size_ == 0);
 
   process_window_kernel<<<num_frames, CU1DBLOCK>>>(
-      frame_length_, opts.remove_dc_offset,
+      frame_length_,
       window_function_.Data(),
       cu_windows_.Data(), cu_windows_.Stride());
 
diff --git a/src/feat/Makefile b/src/feat/Makefile
index ce8a75ba289..4396caaf409 100644
--- a/src/feat/Makefile
+++ b/src/feat/Makefile
@@ -4,7 +4,7 @@ all:
 
 include ../kaldi.mk
 
-TESTFILES = feature-mfcc-test feature-fbank-test \
+TESTFILES = feature-mfcc-test \
          feature-functions-test pitch-functions-test feature-sdc-test \
          resample-test online-feature-test signal-test wave-reader-test
 
diff --git a/src/feat/feature-fbank-test.cc b/src/feat/feature-fbank-test.cc
index 39fc5a2906d..9298b47eba4 100644
--- a/src/feat/feature-fbank-test.cc
+++ b/src/feat/feature-fbank-test.cc
@@ -29,48 +29,6 @@ using namespace kaldi;
 
 
 
-/**
- */
-static void UnitTestSimple() {
-  std::cout << "=== UnitTestSimple() ===\n";
-
-  Vector<BaseFloat> v(100000);
-  Matrix<BaseFloat> m;
-
-  // init with noise
-  for (int32 i = 0; i < v.Dim(); i++) {
-    v(i) = (abs( i * 433024253 ) % 65535) - (65535 / 2);
-  }
-
-  std::cout << "<<<=== Just make sure it runs... Nothing is compared\n";
-  // the parametrization object
-  FbankOptions op;
-  // trying to have same opts as baseline.
-  op.frame_opts.window_type = "rectangular";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
-  op.mel_opts.low_freq = 0.0;
-  op.use_energy = true;
-
-  Fbank fbank(op);
-  // use default parameters
-
-  // compute fbanks.
-  fbank.Compute(v, 1.0, &m);
-
-  // possibly dump
-  //   std::cout << "== Output features == \n" << m;
-  std::cout << "Test passed :)\n\n";
-}
-
-
-
-static void UnitTestFeat() {
-  UnitTestSimple();
-}
-
-
-
 
 int main() {
   try {
@@ -83,5 +41,3 @@ int main() {
     return 1;
   }
 }
-
-
diff --git a/src/feat/feature-fbank.cc b/src/feat/feature-fbank.cc
index 28311903b67..df10712f956 100644
--- a/src/feat/feature-fbank.cc
+++ b/src/feat/feature-fbank.cc
@@ -24,13 +24,9 @@
 namespace kaldi {
 
 FbankComputer::FbankComputer(const FbankOptions &opts):
-    opts_(opts), srfft_(NULL) {
+    opts_(opts),
+    srfft_(new SplitRadixRealFft<BaseFloat>(opts.frame_opts.PaddedWindowSize())) {
   KALDI_ASSERT(opts.energy_floor > 0.0 && "Nonzero energy floor is required.");
-
-  int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
-  if ((padded_window_size & (padded_window_size-1)) == 0)  // Is a power of two...
-    srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);
-
   // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
   // [note: this call caches it.]
   GetMelBanks(1.0);
@@ -38,13 +34,12 @@ FbankComputer::FbankComputer(const FbankOptions &opts):
 
 FbankComputer::FbankComputer(const FbankComputer &other):
     opts_(other.opts_),
-    mel_banks_(other.mel_banks_), srfft_(NULL) {
+    mel_banks_(other.mel_banks_),
+    srfft_(new SplitRadixRealFft<BaseFloat>(*(other.srfft_))) {
   for (std::map<BaseFloat, MelBanks*>::iterator iter = mel_banks_.begin();
       iter != mel_banks_.end();
       ++iter)
     iter->second = new MelBanks(*(iter->second));
-  if (other.srfft_)
-    srfft_ = new SplitRadixRealFft<BaseFloat>(*(other.srfft_));
 }
 
 FbankComputer::~FbankComputer() {
@@ -80,19 +75,25 @@ void FbankComputer::Compute(BaseFloat vtln_warp,
 
   BaseFloat signal_log_energy = 0.0;
   if (opts_.use_energy)
-    signal_log_energy = Log(std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame),
-                                                opts_.energy_floor));
+    signal_log_energy = Log(std::max<BaseFloat>(
+        VecVec(*signal_frame, *signal_frame),
+        opts_.energy_floor * opts_.frame_opts.WindowSize()));
 
-  if (srfft_ != NULL)  // Compute FFT using split-radix algorithm.
-    srfft_->Compute(signal_frame->Data(), true);
-  else  // An alternative algorithm that works for non-powers-of-two.
-    RealFft(signal_frame, true);
+  // Compute FFT using split-radix algorithm.
+  srfft_->Compute(signal_frame->Data(), true);
 
   // Convert the FFT into a power spectrum.
   ComputePowerSpectrum(signal_frame);
   SubVector<BaseFloat> power_spectrum(*signal_frame, 0,
                                       signal_frame->Dim() / 2 + 1);
 
+  // The energy_floor has the scale for the energy of a single sample, and the
+  // FFT has a higher dynamic range (it's not the orthogonal FFT)... the sqrt
+  // expression is to correct for that.
+  BaseFloat floor = opts_.energy_floor *
+                    std::sqrt(BaseFloat(opts_.frame_opts.WindowSize()));
+  power_spectrum.ApplyFloor(floor);
+
   int32 mel_offset = (opts_.use_energy ? 1 : 0);
   SubVector<BaseFloat> mel_energies(*feature,
                                     mel_offset,
@@ -101,7 +102,6 @@ void FbankComputer::Compute(BaseFloat vtln_warp,
   // Sum with mel fiterbanks over the power spectrum
   mel_banks.Compute(power_spectrum, &mel_energies);
 
-  mel_energies.ApplyFloor(opts_.energy_floor);
   mel_energies.ApplyLog();  // take the log.
 
   // Copy energy as first value
diff --git a/src/feat/feature-fbank.h b/src/feat/feature-fbank.h
index 04421c506b6..665a087fcaa 100644
--- a/src/feat/feature-fbank.h
+++ b/src/feat/feature-fbank.h
@@ -42,14 +42,15 @@ struct FbankOptions {
   FrameExtractionOptions frame_opts;
   MelBanksOptions mel_opts;
   bool use_energy;  // append an extra dimension with energy to the filter banks
-  BaseFloat energy_floor;  // Floor on energy, to avoid log(0.0).  The floor of
-                           // 1e-10 may be interpreted as (approximately)
-                           // 0.1 * 2**-30.  The smallest nonzero value in a 16-bit
-                           // waveform would be 1^-15, and 1^-30 is its square.
+  BaseFloat energy_floor;  // Floor on energy, to avoid log(0.0).  It is
+                           // multiplied by sqrt(window-length-in-samples) and
+                           // applied per FFT bin.  The default of 1.0e-09 is
+                           // approximately (1.0/32768.0)^2, i.e. the energy of
+                           // a sample value of +-1 in a 16-bit recording.
 
   FbankOptions(): mel_opts(23),
                   use_energy(false),
-                  energy_floor(1.0e-10) { }
+                  energy_floor(1.0e-09) { }
 
   void Register(OptionsItf *opts) {
     frame_opts.Register(opts);
@@ -58,8 +59,9 @@ struct FbankOptions {
                    "Add an extra dimension with energy to the filterbank "
                    "output.");
     opts->Register("energy-floor", &energy_floor,
-                   "Floor on energy (absolute, not relative) in filterbank "
-                   "computation.");
+                   "Floor on energy expressed as a squared-signal value per "
+                   "frame.  The default value represents about +-1 in int16 "
+                   "representation.");
   }
 };
 
diff --git a/src/feat/feature-functions.cc b/src/feat/feature-functions.cc
index 4ae2550c364..672227f619e 100644
--- a/src/feat/feature-functions.cc
+++ b/src/feat/feature-functions.cc
@@ -29,13 +29,8 @@ namespace kaldi {
 void ComputePowerSpectrum(VectorBase<BaseFloat> *waveform) {
   int32 dim = waveform->Dim();
 
-  // no, letting it be non-power-of-two for now.
-  // KALDI_ASSERT(dim > 0 && (dim & (dim-1) == 0));  // make sure a power of two.. actually my FFT code
-  // does not require this (dan) but this is better in case we use different code [dan].
-
-  // RealFft(waveform, true);  // true == forward (not inverse) FFT; makes no difference here,
-  // as we just want power spectrum.
-
+  // Make sure the dimension is a positive power of two.
+  KALDI_ASSERT(dim > 0 && ((dim & (dim-1)) == 0));
   // now we have in waveform, first half of complex spectrum
   // it's stored as [real0, realN/2-1, real1, im1, real2, im2, ...]
   int32 half_dim = dim/2;
@@ -46,8 +41,9 @@ void ComputePowerSpectrum(VectorBase<BaseFloat> *waveform) {
     (*waveform)(i) = real*real + im*im;
   }
   (*waveform)(0) = first_energy;
-  (*waveform)(half_dim) = last_energy;  // Will actually never be used, and anyway
-  // if the signal has been bandlimited sensibly this should be zero.
+  (*waveform)(half_dim) = last_energy;
+  // Will actually never be used, and anyway if the signal has been bandlimited
+  // sensibly this should be zero.
 }
 
 
diff --git a/src/feat/feature-mfcc-test.cc b/src/feat/feature-mfcc-test.cc
index 43a9b14dea6..280e2155c86 100644
--- a/src/feat/feature-mfcc-test.cc
+++ b/src/feat/feature-mfcc-test.cc
@@ -72,38 +72,6 @@ static void UnitTestReadWave() {
 
 
 
-/**
- */
-static void UnitTestSimple() {
-  std::cout << "=== UnitTestSimple() ===\n";
-
-  Vector<BaseFloat> v(100000);
-  Matrix<BaseFloat> m;
-
-  // init with noise
-  for (int32 i = 0; i < v.Dim(); i++) {
-    v(i) = (abs( i * 433024253 ) % 65535) - (65535 / 2);
-  }
-
-  std::cout << "<<<=== Just make sure it runs... Nothing is compared\n";
-  // the parametrization object
-  MfccOptions op;
-  // trying to have same opts as baseline.
-  op.frame_opts.window_type = "rectangular";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
-  op.mel_opts.low_freq = 0.0;
-
-  Mfcc mfcc(op);
-  // use default parameters
-
-  // compute mfccs.
-  mfcc.Compute(v, 1.0, &m);
-
-  // possibly dump
-  //   std::cout << "== Output features == \n" << m;
-  std::cout << "Test passed :)\n\n";
-}
 
 
 void UnitTestVtln() {
@@ -145,7 +113,6 @@ void UnitTestVtln() {
 static void UnitTestFeat() {
   UnitTestVtln();
   UnitTestReadWave();
-  UnitTestSimple();
   std::cout << "Tests succeeded.\n";
 }
 
diff --git a/src/feat/feature-mfcc.cc b/src/feat/feature-mfcc.cc
index 15085788d91..79e02ca5db2 100644
--- a/src/feat/feature-mfcc.cc
+++ b/src/feat/feature-mfcc.cc
@@ -42,23 +42,26 @@ void MfccComputer::Compute(BaseFloat vtln_warp,
 
   BaseFloat signal_log_energy;
   if (opts_.use_energy)
-    signal_log_energy = Log(std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame),
-                                                opts_.energy_floor));
+    signal_log_energy = Log(std::max<BaseFloat>(
+        VecVec(*signal_frame, *signal_frame),
+        opts_.energy_floor * opts_.frame_opts.WindowSize()));
   const MelBanks &mel_banks = *(GetMelBanks(vtln_warp));
 
-  if (srfft_ != NULL)  // Compute FFT using the split-radix algorithm.
-    srfft_->Compute(signal_frame->Data(), true);
-  else  // An alternative algorithm that works for non-powers-of-two.
-    RealFft(signal_frame, true);
+  srfft_->Compute(signal_frame->Data(), true);
 
   // Convert the FFT into a power spectrum.
   ComputePowerSpectrum(signal_frame);
   SubVector<BaseFloat> power_spectrum(*signal_frame, 0,
                                       signal_frame->Dim() / 2 + 1);
 
-  mel_banks.Compute(power_spectrum, &mel_energies_);
+  // The energy_floor has the scale for the energy of a single sample, and the
+  // FFT has a higher dynamic range (it's not the orthogonal FFT)... the sqrt
+  // expression is to correct for that.
+  BaseFloat floor = opts_.energy_floor *
+                    std::sqrt(BaseFloat(opts_.frame_opts.WindowSize()));
+  power_spectrum.ApplyFloor(floor);
 
-  mel_energies_.ApplyFloor(opts_.energy_floor);
+  mel_banks.Compute(power_spectrum, &mel_energies_);
   mel_energies_.ApplyLog();
 
   feature->SetZero();  // in case there were NaNs.
@@ -71,7 +74,8 @@ void MfccComputer::Compute(BaseFloat vtln_warp,
 }
 
 MfccComputer::MfccComputer(const MfccOptions &opts):
-    opts_(opts), srfft_(NULL),
+    opts_(opts),
+    srfft_(new SplitRadixRealFft<BaseFloat>(opts.frame_opts.PaddedWindowSize())),
     mel_energies_(opts.mel_opts.num_bins) {
 
   int32 num_bins = opts.mel_opts.num_bins;
@@ -94,10 +98,6 @@ MfccComputer::MfccComputer(const MfccOptions &opts):
   dct_matrix_.Resize(opts.num_ceps, num_bins);
   dct_matrix_.CopyFromMat(dct_rows);  // subset of rows.
 
-  int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
-  if ((padded_window_size & (padded_window_size-1)) == 0)  // Is a power of two...
-    srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);
-
   // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
   // [note: this call caches it.]
   GetMelBanks(1.0);
@@ -107,13 +107,11 @@ MfccComputer::MfccComputer(const MfccComputer &other):
     opts_(other.opts_), lifter_coeffs_(other.lifter_coeffs_),
     dct_matrix_(other.dct_matrix_),
     mel_banks_(other.mel_banks_),
-    srfft_(NULL),
+    srfft_(new SplitRadixRealFft<BaseFloat>(*(other.srfft_))),
     mel_energies_(other.mel_energies_.Dim(), kUndefined) {
   for (std::map<BaseFloat, MelBanks*>::iterator iter = mel_banks_.begin();
        iter != mel_banks_.end(); ++iter)
     iter->second = new MelBanks(*(iter->second));
-  if (other.srfft_ != NULL)
-    srfft_ = new SplitRadixRealFft<BaseFloat>(*(other.srfft_));
 }
 
 
diff --git a/src/feat/feature-mfcc.h b/src/feat/feature-mfcc.h
index 9a35405504a..993d0dc777e 100644
--- a/src/feat/feature-mfcc.h
+++ b/src/feat/feature-mfcc.h
@@ -40,7 +40,11 @@ struct MfccOptions {
   MelBanksOptions mel_opts;
   int32 num_ceps;  // e.g. 13: num cepstral coeffs, counting zero.
   bool use_energy;  // if true, use energy; else C0
-  BaseFloat energy_floor;
+  BaseFloat energy_floor;  // Floor on energy, to avoid log(0.0).  It is
+                           // multiplied by sqrt(window-length-in-samples) and
+                           // applied per FFT bin.  The default of 1.0e-09 is
+                           // approximately (1.0/32768.0)^2, i.e. the energy of
+                           // a sample value of +-1 in a 16-bit recording.
   // cepstral_lifter controls a scaling factor on the cepstra that helps give
   // all the MFCC coeffs a similar dynamic range by scaling up the
   // higher-frequency coefficients.  It's a rather odd formula involving
@@ -50,7 +54,7 @@ struct MfccOptions {
   MfccOptions() : mel_opts(23),
                   num_ceps(13),
                   use_energy(true),
-                  energy_floor(1.0e-10),
+                  energy_floor(1.0e-09),
                   cepstral_lifter(22.0) { }
 
 
diff --git a/src/feat/feature-sdc-test.cc b/src/feat/feature-sdc-test.cc
index 722c1dda41d..42370ce4715 100644
--- a/src/feat/feature-sdc-test.cc
+++ b/src/feat/feature-sdc-test.cc
@@ -45,7 +45,7 @@ static void UnitTestCompareWithDeltaFeatures(Matrix<BaseFloat> &raw_features, in
   int32 dd_num_rows = deltas_features.NumRows();
   int32 sdc_num_rows = shifted_deltas_features.NumRows();
   int32 num_features = raw_features.NumCols();
- 
+
   // Number of rows will be equal, but not
   // columns, in general.
   KALDI_ASSERT(dd_num_rows == sdc_num_rows);
@@ -60,7 +60,7 @@ static void UnitTestCompareWithDeltaFeatures(Matrix<BaseFloat> &raw_features, in
   }
 }
 
-static void UnitTestParams(Matrix<BaseFloat> &raw_features, int32 window, 
+static void UnitTestParams(Matrix<BaseFloat> &raw_features, int32 window,
                            int32 shift, int32 n_blocks) {
   std::cout << "=== UnitTestSDCParams() ===\n";
   ShiftedDeltaFeaturesOptions shifted_deltas_opts;
@@ -78,8 +78,8 @@ static void UnitTestParams(Matrix<BaseFloat> &raw_features, int32 window,
   int32 sdc_num_cols = shifted_deltas_features.NumCols();
 
   KALDI_ASSERT(sdc_num_cols == raw_num_cols * (n_blocks  + 1));
-  
-  /* For every coefficient in the raw feature vector a 
+
+  /* For every coefficient in the raw feature vector a
      delta is calculated and appended to the new feature vector,
      as is done normally in a delta-deltas computation.
      In addition, n_blocks delta in advance are also appended.
@@ -89,7 +89,7 @@ static void UnitTestParams(Matrix<BaseFloat> &raw_features, int32 window,
      mapping from these additional deltas to where they would
      appear in a delta-deltas computation and verfies these
      values' equality. */
-  for (int32 i = 0; i < sdc_num_rows; i++) { 
+  for (int32 i = 0; i < sdc_num_rows; i++) {
     for (int32 j = 2 * raw_num_cols; j < sdc_num_cols; j += raw_num_cols) {
       for (int32 k = 0; k < raw_num_cols; k++) {
         int32 row = i + (j/raw_num_cols - 1) * shift;
@@ -103,7 +103,7 @@ static void UnitTestParams(Matrix<BaseFloat> &raw_features, int32 window,
   }
 }
 
-static void UnitTestEndEffects(Matrix<BaseFloat> &raw_features, int32 window, 
+static void UnitTestEndEffects(Matrix<BaseFloat> &raw_features, int32 window,
                                int32 shift, int32 n_blocks) {
   std::cout << "=== UnitTestSDCEndEffects() ===\n";
   ShiftedDeltaFeaturesOptions shifted_deltas_opts;
@@ -118,7 +118,7 @@ static void UnitTestEndEffects(Matrix<BaseFloat> &raw_features, int32 window,
   int32 raw_num_cols = raw_features.NumCols();
   int32 sdc_num_rows = shifted_deltas_features.NumRows();
   int32 sdc_num_cols = shifted_deltas_features.NumCols();
-  
+
   // If the entire window is out-of-bounds the delta should be zero.
   for (int32 i = sdc_num_rows - n_blocks + 1; i < sdc_num_rows; i++) {
     for (int32 j = 2 * raw_num_cols; j < sdc_num_cols; j += raw_num_cols) {
@@ -126,7 +126,7 @@ static void UnitTestEndEffects(Matrix<BaseFloat> &raw_features, int32 window,
         if (i + (j/raw_num_cols - 1) * shift - window/2 > sdc_num_rows)
           KALDI_ASSERT(shifted_deltas_features(i, j + k) <= 0.00001);
       }
-    } 
+    }
   }
 }
 
@@ -137,11 +137,7 @@ int main() {
   KALDI_ASSERT(wave.Data().NumRows() == 1);
   SubVector<BaseFloat> waveform(wave.Data(), 0);
 
-  // mfcc with default configuration...
   MfccOptions op;
-  op.frame_opts.window_type = "hamming";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
   op.mel_opts.low_freq = 0.0;
   op.use_energy = false;
   Mfcc mfcc(op);
@@ -163,6 +159,5 @@ int main() {
     static_cast<void>(e);
     return 1;
   }
-  
-}
 
+}
diff --git a/src/feat/feature-spectrogram.cc b/src/feat/feature-spectrogram.cc
deleted file mode 100644
index d2daa7aa829..00000000000
--- a/src/feat/feature-spectrogram.cc
+++ /dev/null
@@ -1,82 +0,0 @@
-// feat/feature-spectrogram.cc
-
-// Copyright 2009-2012  Karel Vesely
-// Copyright 2012  Navdeep Jaitly
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "feat/feature-spectrogram.h"
-
-
-namespace kaldi {
-
-SpectrogramComputer::SpectrogramComputer(const SpectrogramOptions &opts)
-    : opts_(opts), srfft_(NULL) {
-  if (opts.energy_floor > 0.0)
-    log_energy_floor_ = Log(opts.energy_floor);
-
-  int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
-  if ((padded_window_size & (padded_window_size-1)) == 0)  // Is a power of two
-    srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);
-}
-
-SpectrogramComputer::SpectrogramComputer(const SpectrogramComputer &other):
-    opts_(other.opts_), log_energy_floor_(other.log_energy_floor_), srfft_(NULL) {
-  if (other.srfft_ != NULL)
-    srfft_ = new SplitRadixRealFft<BaseFloat>(*other.srfft_);
-}
-
-SpectrogramComputer::~SpectrogramComputer() {
-  delete srfft_;
-}
-
-void SpectrogramComputer::Compute(BaseFloat signal_log_energy,
-                                  BaseFloat vtln_warp,
-                                  VectorBase<BaseFloat> *signal_frame,
-                                  VectorBase<BaseFloat> *feature) {
-  KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() &&
-               feature->Dim() == this->Dim());
-
-
-  // Compute energy after window function (not the raw one)
-  if (!opts_.raw_energy)
-    signal_log_energy = Log(std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame),
-                                     std::numeric_limits<float>::epsilon()));
-
-  if (srfft_ != NULL)  // Compute FFT using split-radix algorithm.
-    srfft_->Compute(signal_frame->Data(), true);
-  else  // An alternative algorithm that works for non-powers-of-two
-    RealFft(signal_frame, true);
-
-  // Convert the FFT into a power spectrum.
-  ComputePowerSpectrum(signal_frame);
-  SubVector<BaseFloat> power_spectrum(*signal_frame,
-                                      0, signal_frame->Dim() / 2 + 1);
-
-  power_spectrum.ApplyFloor(std::numeric_limits<float>::epsilon());
-  power_spectrum.ApplyLog();
-
-  feature->CopyFromVec(power_spectrum);
-
-  if (opts_.energy_floor > 0.0 && signal_log_energy < log_energy_floor_)
-    signal_log_energy = log_energy_floor_;
-  // The zeroth spectrogram component is always set to the signal energy,
-  // instead of the square of the constant component of the signal.
-  (*feature)(0) = signal_log_energy;
-}
-
-}  // namespace kaldi
diff --git a/src/feat/feature-spectrogram.h b/src/feat/feature-spectrogram.h
deleted file mode 100644
index 9aeb68c8df8..00000000000
--- a/src/feat/feature-spectrogram.h
+++ /dev/null
@@ -1,117 +0,0 @@
-// feat/feature-spectrogram.h
-
-// Copyright 2009-2012  Karel Vesely
-// Copyright 2012  Navdeep Jaitly
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_FEAT_FEATURE_SPECTROGRAM_H_
-#define KALDI_FEAT_FEATURE_SPECTROGRAM_H_
-
-
-#include <string>
-
-#include "feat/feature-common.h"
-#include "feat/feature-functions.h"
-#include "feat/feature-window.h"
-
-namespace kaldi {
-/// @addtogroup  feat FeatureExtraction
-/// @{
-
-
-/// SpectrogramOptions contains basic options for computing spectrogram
-/// features.
-struct SpectrogramOptions {
-  FrameExtractionOptions frame_opts;
-  BaseFloat energy_floor;
-  bool raw_energy;  // If true, compute energy before preemphasis and windowing
-
-  SpectrogramOptions() :
-    energy_floor(0.0),
-    raw_energy(true) {}
-
-  void Register(OptionsItf *opts) {
-    frame_opts.Register(opts);
-    opts->Register("energy-floor", &energy_floor,
-                   "Floor on energy (absolute, not relative) in Spectrogram "
-                   "computation.  Caution: this floor is applied to the zeroth "
-                   "component, representing the total signal energy.  The "
-                   "floor on the individual spectrogram elements is fixed at "
-                   "std::numeric_limits<float>::epsilon().");
-    opts->Register("raw-energy", &raw_energy,
-                   "If true, compute energy before preemphasis and windowing");
-  }
-};
-
-/// Class for computing spectrogram features.
-class SpectrogramComputer {
- public:
-  typedef SpectrogramOptions Options;
-  explicit SpectrogramComputer(const SpectrogramOptions &opts);
-  SpectrogramComputer(const SpectrogramComputer &other);
-
-  const FrameExtractionOptions& GetFrameOptions() const {
-    return opts_.frame_opts;
-  }
-
-  int32 Dim() const { return opts_.frame_opts.PaddedWindowSize() / 2 + 1; }
-
-  bool NeedRawLogEnergy() { return opts_.raw_energy; }
-
-
-  /**
-     Function that computes one frame of spectrogram features from
-     one frame of signal.
-
-     @param [in] signal_raw_log_energy The log-energy of the frame of the signal
-         prior to windowing and pre-emphasis, or
-         log(numeric_limits<float>::min()), whichever is greater.  Must be
-         ignored by this function if this class returns false from
-         this->NeedsRawLogEnergy().
-     @param [in] vtln_warp  This is ignored by this function, it's only
-         needed for interface compatibility.
-     @param [in] signal_frame  One frame of the signal,
-       as extracted using the function ExtractWindow() using the options
-       returned by this->GetFrameOptions().  The function will use the
-       vector as a workspace, which is why it's a non-const pointer.
-     @param [out] feature  Pointer to a vector of size this->Dim(), to which
-         the computed feature will be written.
-  */
-  void Compute(BaseFloat signal_log_energy,
-               BaseFloat vtln_warp,
-               VectorBase<BaseFloat> *signal_frame,
-               VectorBase<BaseFloat> *feature);
-
-  ~SpectrogramComputer();
-
- private:
-  SpectrogramOptions opts_;
-  BaseFloat log_energy_floor_;
-  SplitRadixRealFft<BaseFloat> *srfft_;
-
-  // Disallow assignment.
-  SpectrogramComputer &operator=(const SpectrogramComputer &other);
-};
-
-typedef OfflineFeatureTpl<SpectrogramComputer> Spectrogram;
-
-
-/// @} End of "addtogroup feat"
-}  // namespace kaldi
-
-
-#endif  // KALDI_FEAT_FEATURE_SPECTROGRAM_H_
diff --git a/src/feat/feature-window.cc b/src/feat/feature-window.cc
index 1d1ab381826..cd7b1a26326 100644
--- a/src/feat/feature-window.cc
+++ b/src/feat/feature-window.cc
@@ -101,8 +101,10 @@ void ProcessWindow(const FrameExtractionOptions &opts,
   int32 frame_length = opts.WindowSize();
   KALDI_ASSERT(window->Dim() == frame_length);
 
-  if (opts.remove_dc_offset)
-    window->Add(-window->Sum() / frame_length);
+
+  /*  DC-offset removal used to be controlled by the --remove-dc-offset option;
+      that option has been removed and the removal is now unconditional.  */
+  window->Add(-window->Sum() / frame_length);
 
   window->MulElements(window_function);
 }
@@ -160,6 +162,7 @@ void ExtractWindow(int64 sample_offset,
   SubVector<BaseFloat> frame(*window, 0, frame_length);
 
   ProcessWindow(opts, window_function, &frame);
+
 }
 
 }  // namespace kaldi
diff --git a/src/feat/feature-window.h b/src/feat/feature-window.h
index a212705d42e..979a6cac249 100644
--- a/src/feat/feature-window.h
+++ b/src/feat/feature-window.h
@@ -36,13 +36,11 @@ struct FrameExtractionOptions {
   BaseFloat samp_freq;
   BaseFloat frame_shift_ms;  // in milliseconds.
   BaseFloat frame_length_ms;  // in milliseconds.
-  bool remove_dc_offset;  // Subtract mean of wave before FFT.
   std::string window_type;  // e.g. Hamming window
   // May be "hamming", "rectangular", "povey", "hanning", "blackman"
   // "povey" is a window I made to be similar to Hamming but to go to zero at the
   // edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85)
   // I just don't think the Hamming window makes sense as a windowing function.
-  bool round_to_power_of_two;
   BaseFloat blackman_coeff;
   bool allow_downsample;
   bool allow_upsample;
@@ -51,9 +49,7 @@ struct FrameExtractionOptions {
       samp_freq(16000),
       frame_shift_ms(10.0),
       frame_length_ms(25.0),
-      remove_dc_offset(true),
       window_type("povey"),
-      round_to_power_of_two(true),
       blackman_coeff(0.42),
       allow_downsample(false),
       allow_upsample(false),
@@ -65,16 +61,11 @@ struct FrameExtractionOptions {
                    "if specified there)");
     opts->Register("frame-length", &frame_length_ms, "Frame length in milliseconds");
     opts->Register("frame-shift", &frame_shift_ms, "Frame shift in milliseconds");
-    opts->Register("remove-dc-offset", &remove_dc_offset,
-                   "Subtract mean from waveform on each frame");
     opts->Register("window-type", &window_type, "Type of window "
                    "(\"hamming\"|\"hanning\"|\"povey\"|\"rectangular\""
                    "|\"blackmann\")");
     opts->Register("blackman-coeff", &blackman_coeff,
                    "Constant coefficient for generalized Blackman window.");
-    opts->Register("round-to-power-of-two", &round_to_power_of_two,
-                   "If true, round window size to power of two by zero-padding "
-                   "input to FFT.");
     opts->Register("allow-downsample", &allow_downsample,
                    "If true, allow the input waveform to have a higher frequency than "
                    "the specified --sample-frequency (and we'll downsample).");
@@ -93,8 +84,7 @@ struct FrameExtractionOptions {
     return static_cast<int32>(samp_freq * 0.001 * frame_length_ms);
   }
   int32 PaddedWindowSize() const {
-    return (round_to_power_of_two ? RoundUpToNearestPowerOfTwo(WindowSize()) :
-                                    WindowSize());
+    return RoundUpToNearestPowerOfTwo(WindowSize());
   }
 };
 
@@ -116,8 +106,7 @@ void InitFeatureWindowFunction(
 
       @param [in] flush   True if we are asserting that this number of samples is
              'all there is', false if we expecting more data to possibly come
-             in.  This only makes a difference to the answer if opts.snips_edges
-             == false.  For offline feature extraction you always want flush ==
+             in.   For offline feature extraction you always want flush ==
              true.  In an online-decoding context, once you know (or decide) that
              no more data is coming in, you'd call it with flush == true at the
              end to flush out any remaining data.
@@ -127,19 +116,22 @@ int32 NumFrames(int64 num_samples,
                 bool flush = true);
 
 /*
-   This function returns the index of the first sample of the frame indexed
-   'frame'.  If snip-edges=true, it just returns frame * opts.WindowShift(); if
-   snip-edges=false, the formula is a little more complicated and the result may
-   be negative.
+   This function returns the sample-index of the first sample of the frame
+   indexed 'frame'.
+      @param [in]   frame   Frame index; must be >= 0.
+      @param [in]   opts    Options class, used for the window width and
+                            frame shift.
+      @return               Returns the sample index of the first sample of
+                            this frame.  Note: this may be negative if
+                            `frame` is close to zero.  The calling code
+                            will handle this by reflecting the signal at
+                            the boundary.
 */
 int64 FirstSampleOfFrame(int32 frame,
                          const FrameExtractionOptions &opts);
 
 
 
-void Dither(VectorBase<BaseFloat> *waveform, BaseFloat dither_value);
-
-
 /**
   This function does all the windowing steps after actually extracting the
   windowed signal: depeding on the configuration, it dc offset removal and
diff --git a/src/feat/online-feature-test.cc b/src/feat/online-feature-test.cc
index fbdb9c4f11f..3e7834d6423 100644
--- a/src/feat/online-feature-test.cc
+++ b/src/feat/online-feature-test.cc
@@ -153,8 +153,7 @@ void TestOnlineMfcc() {
   // the parametrization object
   MfccOptions op;
   op.frame_opts.window_type = "hamming";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
+
   op.frame_opts.samp_freq = wave.SampFreq();
   op.mel_opts.low_freq = 0.0;
   op.use_energy = false;  // C0 not energy.
@@ -200,8 +199,7 @@ void TestOnlineTransform() {
   // build online feature interface, take OnlineMfcc as an example
   MfccOptions op;
   op.frame_opts.window_type = "hamming";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
+
   op.frame_opts.samp_freq = wave.SampFreq();
   op.mel_opts.low_freq = 0.0;
   op.use_energy = false;  // C0 not energy.
@@ -240,8 +238,6 @@ void TestOnlineAppendFeature() {
   // the parametrization object for 1st stream mfcc feature
   MfccOptions mfcc_op;
   mfcc_op.frame_opts.window_type = "hamming";
-  mfcc_op.frame_opts.remove_dc_offset = false;
-  mfcc_op.frame_opts.round_to_power_of_two = true;
   mfcc_op.frame_opts.samp_freq = wave.SampFreq();
   mfcc_op.mel_opts.low_freq = 0.0;
   mfcc_op.use_energy = false;  // C0 not energy.
diff --git a/src/matrix/matrix-functions.cc b/src/matrix/matrix-functions.cc
index 496c09f5344..c10b0bb6842 100644
--- a/src/matrix/matrix-functions.cc
+++ b/src/matrix/matrix-functions.cc
@@ -17,577 +17,13 @@
 // MERCHANTABLITY OR NON-INFRINGEMENT.
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
-//
-// (*) incorporates, with permission, FFT code from his book
-// "Signal Processing with Lapped Transforms", Artech, 1992.
+
 
 #include "matrix/matrix-functions.h"
 #include "matrix/sp-matrix.h"
 
 namespace kaldi {
 
-template<typename Real> void ComplexFt (const VectorBase<Real> &in,
-                                     VectorBase<Real> *out, bool forward) {
-  int exp_sign = (forward ? -1 : 1);
-  KALDI_ASSERT(out != NULL);
-  KALDI_ASSERT(in.Dim() == out->Dim());
-  KALDI_ASSERT(in.Dim() % 2 == 0);
-  int twoN = in.Dim(), N = twoN / 2;
-  const Real *data_in = in.Data();
-  Real *data_out = out->Data();
-
-  Real exp1N_re, exp1N_im;  //  forward -> exp(-2pi / N), backward -> exp(2pi / N).
-  Real fraction = exp_sign * M_2PI / static_cast<Real>(N);  // forward -> -2pi/N, backward->-2pi/N
-  ComplexImExp(fraction, &exp1N_re, &exp1N_im);
-
-  Real expm_re = 1.0, expm_im = 0.0;  // forward -> exp(-2pi m / N).
-
-  for (int two_m = 0; two_m < twoN; two_m+=2) {  // For each output component.
-    Real expmn_re = 1.0, expmn_im = 0.0;  // forward -> exp(-2pi m n / N).
-    Real sum_re = 0.0, sum_im = 0.0;  // complex output for index m (the sum expression)
-    for (int two_n = 0; two_n < twoN; two_n+=2) {
-      ComplexAddProduct(data_in[two_n], data_in[two_n+1],
-                        expmn_re, expmn_im,
-                        &sum_re, &sum_im);
-      ComplexMul(expm_re, expm_im, &expmn_re, &expmn_im);
-    }
-    data_out[two_m] = sum_re;
-    data_out[two_m + 1] = sum_im;
-
-
-    if (two_m % 10 == 0) {  // occasionally renew "expm" from scratch to avoid
-      // loss of precision.
-      int nextm = 1 + two_m/2;
-      Real fraction_mult = fraction * nextm;
-      ComplexImExp(fraction_mult, &expm_re, &expm_im);
-    } else {
-      ComplexMul(exp1N_re, exp1N_im, &expm_re, &expm_im);
-    }
-  }
-}
-
-template
-void ComplexFt (const VectorBase<float> &in,
-                VectorBase<float> *out, bool forward);
-template
-void ComplexFt (const VectorBase<double> &in,
-                VectorBase<double> *out, bool forward);
-
-
-#define KALDI_COMPLEXFFT_BLOCKSIZE 8192
-// This #define affects how we recurse in ComplexFftRecursive.
-// We assume that memory-caching happens on a scale at
-// least as small as this.
-
-
-//! ComplexFftRecursive is a recursive function that computes the
-//! complex FFT of size N.  The "nffts" arguments specifies how many
-//! separate FFTs to compute in parallel (we assume the data for
-//! each one is consecutive in memory).  The "forward argument"
-//! specifies whether to do the FFT (true) or IFFT (false), although
-//! note that we do not include the factor of 1/N (the user should
-//! do this if required.  The iterators factor_begin and factor_end
-//! point to the beginning and end (i.e. one past the last element)
-//! of an array of small factors of N (typically prime factors).
-//! See the comments below this code for the detailed equations
-//! of the recursion.
-
-
-template<typename Real>
-void ComplexFftRecursive (Real *data, int nffts, int N,
-                          const int *factor_begin,
-                          const int *factor_end, bool forward,
-                          Vector<Real> *tmp_vec) {
-  if (factor_begin == factor_end) {
-    KALDI_ASSERT(N == 1);
-    return;
-  }
-
-  {  // an optimization: compute in smaller blocks.
-    // this block of code could be removed and it would still work.
-    MatrixIndexT size_perblock = N * 2 * sizeof(Real);
-    if (nffts > 1 && size_perblock*nffts > KALDI_COMPLEXFFT_BLOCKSIZE) {  // can break it up...
-      // Break up into multiple blocks.  This is an optimization.  We make
-      // no progress on the FFT when we do this.
-      int block_skip = KALDI_COMPLEXFFT_BLOCKSIZE / size_perblock;  // n blocks per call
-      if (block_skip == 0) block_skip = 1;
-      if (block_skip < nffts) {
-        int blocks_left = nffts;
-        while (blocks_left > 0) {
-          int skip_now = std::min(blocks_left, block_skip);
-          ComplexFftRecursive(data, skip_now, N, factor_begin, factor_end, forward, tmp_vec);
-          blocks_left -= skip_now;
-          data += skip_now * N*2;
-        }
-        return;
-      } // else do the actual algorithm.
-    } // else do the actual algorithm.
-  }
-
-  int P = *factor_begin;
-  KALDI_ASSERT(P > 1);
-  int Q = N / P;
-
-
-  if (P > 1 && Q > 1) {  // Do the rearrangement.   C.f. eq. (8) below.  Transform
-    // (a) to (b).
-    Real *data_thisblock = data;
-    if (tmp_vec->Dim() < (MatrixIndexT)N) tmp_vec->Resize(N);
-    Real *data_tmp = tmp_vec->Data();
-    for (int thisfft = 0; thisfft < nffts; thisfft++, data_thisblock+=N*2) {
-      for (int offset = 0; offset < 2; offset++) {  // 0 == real, 1 == im.
-        for (int p = 0; p < P; p++) {
-          for (int q = 0; q < Q; q++) {
-            int aidx = q*P + p, bidx = p*Q + q;
-            data_tmp[bidx] = data_thisblock[2*aidx+offset];
-          }
-        }
-        for (int n = 0;n < P*Q;n++) data_thisblock[2*n+offset] = data_tmp[n];
-      }
-    }
-  }
-
-  {  // Recurse.
-    ComplexFftRecursive(data, nffts*P, Q, factor_begin+1, factor_end, forward, tmp_vec);
-  }
-
-  int exp_sign = (forward ? -1 : 1);
-  Real rootN_re, rootN_im;  // Nth root of unity.
-  ComplexImExp(static_cast<Real>(exp_sign * M_2PI / N), &rootN_re, &rootN_im);
-
-  Real rootP_re, rootP_im;  // Pth root of unity.
-  ComplexImExp(static_cast<Real>(exp_sign * M_2PI / P), &rootP_re, &rootP_im);
-
-  {  // Do the multiplication
-    // could avoid a bunch of complex multiplies by moving the loop over data_thisblock
-    // inside.
-    if (tmp_vec->Dim() < (MatrixIndexT)(P*2)) tmp_vec->Resize(P*2);
-    Real *temp_a = tmp_vec->Data();
-
-    Real *data_thisblock = data, *data_end = data+(N*2*nffts);
-    for (; data_thisblock != data_end; data_thisblock += N*2) {  // for each separate fft.
-      Real qd_re = 1.0, qd_im = 0.0;  // 1^(q'/N)
-      for (int qd = 0; qd < Q; qd++) {
-        Real pdQ_qd_re = qd_re, pdQ_qd_im = qd_im;  // 1^((p'Q+q') / N) == 1^((p'/P) + (q'/N))
-                                              // Initialize to q'/N, corresponding to p' == 0.
-        for (int pd = 0; pd < P; pd++) {  // pd == p'
-          {  // This is the p = 0 case of the loop below [an optimization].
-            temp_a[pd*2] = data_thisblock[qd*2];
-            temp_a[pd*2 + 1] = data_thisblock[qd*2 + 1];
-          }
-          {  // This is the p = 1 case of the loop below [an optimization]
-            // **** MOST OF THE TIME (>60% I think) gets spent here. ***
-            ComplexAddProduct(pdQ_qd_re, pdQ_qd_im,
-                              data_thisblock[(qd+Q)*2], data_thisblock[(qd+Q)*2 + 1],
-                              &(temp_a[pd*2]), &(temp_a[pd*2 + 1]));
-          }
-          if (P > 2) {
-            Real p_pdQ_qd_re = pdQ_qd_re, p_pdQ_qd_im = pdQ_qd_im;  // 1^(p(p'Q+q')/N)
-            for (int p = 2; p < P; p++) {
-              ComplexMul(pdQ_qd_re, pdQ_qd_im, &p_pdQ_qd_re, &p_pdQ_qd_im);  // p_pdQ_qd *= pdQ_qd.
-              int data_idx = p*Q + qd;
-              ComplexAddProduct(p_pdQ_qd_re, p_pdQ_qd_im,
-                                data_thisblock[data_idx*2], data_thisblock[data_idx*2 + 1],
-                                &(temp_a[pd*2]), &(temp_a[pd*2 + 1]));
-            }
-          }
-          if (pd != P-1)
-            ComplexMul(rootP_re, rootP_im, &pdQ_qd_re, &pdQ_qd_im);  // pdQ_qd *= (rootP == 1^{1/P})
-          // (using 1/P == Q/N)
-        }
-        for (int pd = 0; pd < P; pd++) {
-          data_thisblock[(pd*Q + qd)*2] = temp_a[pd*2];
-          data_thisblock[(pd*Q + qd)*2 + 1] = temp_a[pd*2 + 1];
-        }
-        ComplexMul(rootN_re, rootN_im, &qd_re, &qd_im);  // qd *= rootN.
-      }
-    }
-  }
-}
-
-/* Equations for ComplexFftRecursive.
-   We consider here one of the "nffts" separate ffts; it's just a question of
-   doing them all in parallel.  We also write all equations in terms of
-   complex math (the conversion to real arithmetic is not hard, and anyway
-   takes place inside function calls).
-
-
-   Let the input (i.e. "data" at start) be a_n, n = 0..N-1, and
-   the output (Fourier transform) be d_k, k = 0..N-1.  We use these letters because
-   there will be two intermediate variables b and c.
-   We want to compute:
-
-     d_k = \sum_n a_n 1^(kn/N)                                             (1)
-
-   where we use 1^x as shorthand for exp(-2pi x) for the forward algorithm
-   and exp(2pi x) for the backward one.
-
-   We factorize N = P Q (P small, Q usually large).
-   With p = 0..P-1 and q = 0..Q-1, and also p'=0..P-1 and q'=0..P-1, we let:
-
-    k == p'Q + q'                                                           (2)
-    n == qP + p                                                             (3)
-
-   That is, we let p, q, p', q' range over these indices and observe that this way we
-   can cover all n, k.  Expanding (1) using (2) and (3), we can write:
-
-      d_k = \sum_{p, q}  a_n 1^((p'Q+q')(qP+p)/N)
-          = \sum_{p, q}  a_n 1^(p'pQ/N) 1^(q'qP/N) 1^(q'p/N)                 (4)
-
-   using 1^(PQ/N) = 1 to get rid of the terms with PQ in them.  Rearranging (4),
-
-     d_k =  \sum_p 1^(p'pQ/N) 1^(q'p/N)  \sum_q 1^(q'qP/N) a_n              (5)
-
-   The point here is to separate the index q.  Now we can expand out the remaining
-   instances of k and n using (2) and (3):
-
-     d_(p'Q+q') =  \sum_p 1^(p'pQ/N) 1^(q'p/N)  \sum_q 1^(q'qP/N) a_(qP+p)   (6)
-
-   The expression \sum_q varies with the indices p and q'.  Let us define
-
-         C_{p, q'} =  \sum_q 1^(q'qP/N) a_(qP+p)                            (7)
-
-   Here, C_{p, q'}, viewed as a sequence in q', is just the DFT of the points
-   a_(qP+p) for q = 1..Q-1.  These points are not consecutive in memory though,
-   they jump by P each time.  Let us define b as a rearranged version of a,
-   so that
-
-         b_(pQ+q) = a_(qP+p)                                                  (8)
-
-   How to do this rearrangement in place?  In
-
-   We can rearrange (7) to be written in terms of the b's, using (8), so that
-
-         C_{p, q'} =  \sum_q 1^(q'q (P/N)) b_(pQ+q)                            (9)
-
-   Here, the sequence of C_{p, q'} over q'=0..Q-1, is just the DFT of the sequence
-   of b_(pQ) .. b_(p(Q+1)-1).  Let's arrange the C_{p, q'} in a single array in
-   memory in the same way as the b's, i.e. we define
-         c_(pQ+q') == C_{p, q'}.                                                (10)
-   Note that we could have written (10) with q in place of q', as there is only
-   one index of type q present, but q' is just a more natural variable name to use
-   since we use q' elsewhere to subscript c and C.
-
-   Rewriting (9), we have:
-         c_(pQ+q')  = \sum_q 1^(q'q (P/N)) b_(pQ+q)                            (11)
-    which is the DFT computed by the recursive call to this function [after computing
-    the b's by rearranging the a's].  From the c's we want to compute the d's.
-    Taking (6), substituting in the sum (7), and using (10) to write it as an array,
-    we have:
-         d_(p'Q+q') =  \sum_p 1^(p'pQ/N) 1^(q'p/N)  c_(pQ+q')                   (12)
-    This sum is independent for different values of q'.  Note that d overwrites c
-    in memory.  We compute this in  a direct way, using a little array of size P to
-    store the computed d values for one value of q' (we reuse the array for each value
-    of q').
-
-    So the overall picture is this:
-    We get a call to compute DFT on size N.
-
-    - If N == 1 we return (nothing to do).
-    - We factor N = P Q (typically, P is small).
-    - Using (8), we rearrange the data in memory so that we have b not a in memory
-       (this is the block "do the rearrangement").
-       The pseudocode for this is as follows.  For simplicity we use a temporary array.
-
-          for p = 0..P-1
-             for q = 0..Q-1
-                bidx = pQ + q
-                aidx = qP + p
-                tmp[bidx] = data[aidx].
-             end
-          end
-          data <-- tmp
-        else
-
-        endif
-
-
-        The reason this accomplishes (8) is that we want pQ+q and qP+p to be swapped
-        over for each p, q, and the "if m > n" is a convenient way of ensuring that
-        this swapping happens only once (otherwise it would happen twice, since pQ+q
-        and qP+p both range over the entire set of numbers 0..N-1).
-
-    - We do the DFT on the smaller block size to compute c from b (this eq eq. (11)).
-      Note that this is actually multiple DFTs, one for each value of p, but this
-      goes to the "nffts" argument of the function call, which we have ignored up to now.
-
-    -We compute eq. (12) via a loop, as follows
-         allocate temporary array e of size P.
-         For q' = 0..Q-1:
-            for p' = 0..P-1:
-               set sum to zero [this will go in e[p']]
-               for p = p..P-1:
-                  sum += 1^(p'pQ/N) 1^(q'p/N)  c_(pQ+q')
-               end
-               e[p'] = sum
-            end
-            for p' = 0..P-1:
-               d_(p'Q+q') = e[p']
-            end
-         end
-         delete temporary array e
-
-*/
-
-// This is the outer-layer calling code for ComplexFftRecursive.
-// It factorizes the dimension and then calls the FFT routine.
-template<typename Real> void ComplexFft(VectorBase<Real> *v, bool forward, Vector<Real> *tmp_in) {
-  KALDI_ASSERT(v != NULL);
-
-  if (v->Dim()<=1) return;
-  KALDI_ASSERT(v->Dim() % 2 == 0);  // complex input.
-  int N = v->Dim() / 2;
-  std::vector<int> factors;
-  Factorize(N, &factors);
-  int *factor_beg = NULL;
-  if (factors.size() > 0)
-    factor_beg = &(factors[0]);
-  Vector<Real> tmp;  // allocated in ComplexFftRecursive.
-  ComplexFftRecursive(v->Data(), 1, N, factor_beg, factor_beg+factors.size(), forward, (tmp_in?tmp_in:&tmp));
-}
-
-//! Inefficient version of Fourier transform, for testing purposes.
-template<typename Real> void RealFftInefficient (VectorBase<Real> *v, bool forward) {
-  KALDI_ASSERT(v != NULL);
-  MatrixIndexT N = v->Dim();
-  KALDI_ASSERT(N%2 == 0);
-  if (N == 0) return;
-  Vector<Real> vtmp(N*2);  // store as complex.
-  if (forward) {
-    for (MatrixIndexT i = 0; i < N; i++)  vtmp(i*2) = (*v)(i);
-    ComplexFft(&vtmp, forward);  // this is already tested so we can use this.
-    v->CopyFromVec( vtmp.Range(0, N) );
-    (*v)(1) = vtmp(N);  // Copy the N/2'th fourier component, which is real,
-    // to the imaginary part of the 1st complex output.
-  } else {
-    // reverse the transformation above to get the complex spectrum.
-    vtmp(0) = (*v)(0);  // copy F_0 which is real
-    vtmp(N) = (*v)(1);  // copy F_{N/2} which is real
-    for (MatrixIndexT i = 1; i < N/2; i++) {
-      // Copy i'th to i'th fourier component
-      vtmp(2*i) = (*v)(2*i);
-      vtmp(2*i+1) = (*v)(2*i+1);
-      // Copy i'th to N-i'th, conjugated.
-      vtmp(2*(N-i)) = (*v)(2*i);
-      vtmp(2*(N-i)+1) = -(*v)(2*i+1);
-    }
-    ComplexFft(&vtmp, forward);  // actually backward since forward == false
-    // Copy back real part.  Complex part should be zero.
-    for (MatrixIndexT i = 0; i < N; i++)
-      (*v)(i) = vtmp(i*2);
-  }
-}
-
-template void RealFftInefficient (VectorBase<float> *v, bool forward);
-template void RealFftInefficient (VectorBase<double> *v, bool forward);
-
-template
-void ComplexFft(VectorBase<float> *v, bool forward, Vector<float> *tmp_in);
-template
-void ComplexFft(VectorBase<double> *v, bool forward, Vector<double> *tmp_in);
-
-
-// See the long comment below for the math behind this.
-template<typename Real> void RealFft (VectorBase<Real> *v, bool forward) {
-  KALDI_ASSERT(v != NULL);
-  MatrixIndexT N = v->Dim(), N2 = N/2;
-  KALDI_ASSERT(N%2 == 0);
-  if (N == 0) return;
-
-  if (forward) ComplexFft(v, true);
-
-  Real *data = v->Data();
-  Real rootN_re, rootN_im;  // exp(-2pi/N), forward; exp(2pi/N), backward
-  int forward_sign = forward ? -1 : 1;
-  ComplexImExp(static_cast<Real>(M_2PI/N *forward_sign), &rootN_re, &rootN_im);
-  Real kN_re = -forward_sign, kN_im = 0.0;  // exp(-2pik/N), forward; exp(-2pik/N), backward
-  // kN starts out as 1.0 for forward algorithm but -1.0 for backward.
-  for (MatrixIndexT k = 1; 2*k <= N2; k++) {
-    ComplexMul(rootN_re, rootN_im, &kN_re, &kN_im);
-
-    Real Ck_re, Ck_im, Dk_re, Dk_im;
-    // C_k = 1/2 (B_k + B_{N/2 - k}^*) :
-    Ck_re = 0.5 * (data[2*k] + data[N - 2*k]);
-    Ck_im = 0.5 * (data[2*k + 1] - data[N - 2*k + 1]);
-    // re(D_k)= 1/2 (im(B_k) + im(B_{N/2-k})):
-    Dk_re = 0.5 * (data[2*k + 1] + data[N - 2*k + 1]);
-    // im(D_k) = -1/2 (re(B_k) - re(B_{N/2-k}))
-    Dk_im =-0.5 * (data[2*k] - data[N - 2*k]);
-    // A_k = C_k + 1^(k/N) D_k:
-    data[2*k] = Ck_re;  // A_k <-- C_k
-    data[2*k+1] = Ck_im;
-    // now A_k += D_k 1^(k/N)
-    ComplexAddProduct(Dk_re, Dk_im, kN_re, kN_im, &(data[2*k]), &(data[2*k+1]));
-
-    MatrixIndexT kdash = N2 - k;
-    if (kdash != k) {
-      // Next we handle the index k' = N/2 - k.  This is necessary
-      // to do now, to avoid invalidating data that we will later need.
-      // The quantities C_{k'} and D_{k'} are just the conjugates of C_k
-      // and D_k, so the equations are simple modifications of the above,
-      // replacing Ck_im and Dk_im with their negatives.
-      data[2*kdash] = Ck_re;  // A_k' <-- C_k'
-      data[2*kdash+1] = -Ck_im;
-      // now A_k' += D_k' 1^(k'/N)
-      // We use 1^(k'/N) = 1^((N/2 - k) / N) = 1^(1/2) 1^(-k/N) = -1 * (1^(k/N))^*
-      // so it's the same as 1^(k/N) but with the real part negated.
-      ComplexAddProduct(Dk_re, -Dk_im, -kN_re, kN_im, &(data[2*kdash]), &(data[2*kdash+1]));
-    }
-  }
-
-  {  // Now handle k = 0.
-    // In simple terms: after the complex fft, data[0] becomes the sum of real
-    // parts input[0], input[2]... and data[1] becomes the sum of imaginary
-    // pats input[1], input[3]...
-    // "zeroth" [A_0] is just the sum of input[0]+input[1]+input[2]..
-    // and "n2th" [A_{N/2}] is input[0]-input[1]+input[2]... .
-    Real zeroth = data[0] + data[1],
-        n2th = data[0] - data[1];
-    data[0] = zeroth;
-    data[1] = n2th;
-    if (!forward) {
-      data[0] /= 2;
-      data[1] /= 2;
-    }
-  }
-
-  if (!forward) {
-    ComplexFft(v, false);
-    v->Scale(2.0);  // This is so we get a factor of N increase, rather than N/2 which we would
-    // otherwise get from [ComplexFft, forward] + [ComplexFft, backward] in dimension N/2.
-    // It's for consistency with our normal FFT convensions.
-  }
-}
-
-template void RealFft (VectorBase<float> *v, bool forward);
-template void RealFft (VectorBase<double> *v, bool forward);
-
-/* Notes for real FFTs.
-   We are using the same convention as above, 1^x to mean exp(-2\pi x) for the forward transform.
-   Actually, in a slight abuse of notation, we use this meaning for 1^x in both the forward and
-   backward cases because it's more convenient in this section.
-
-   Suppose we have real data a[0...N-1], with N even, and want to compute its Fourier transform.
-   We can make do with the first N/2 points of the transform, since the remaining ones are complex
-   conjugates of the first.  We want to compute:
-       for k = 0...N/2-1,
-       A_k = \sum_{n = 0}^{N-1}  a_n 1^(kn/N)                 (1)
-
-   We treat a[0..N-1] as a complex sequence of length N/2, i.e. a sequence b[0..N/2 - 1].
-   Viewed as sequences of length N/2, we have:
-       b = c + i d,
-   where c = a_0, a_2 ... and d = a_1, a_3 ...
-
-   We can recover the length-N/2 Fourier transforms of c and d by doing FT on b and
-   then doing the equations below.  Derivation is marked by (*) in a comment below (search
-   for it).  Let B, C, D be the FTs.
-   We have
-       C_k = 1/2 (B_k + B_{N/2 - k}^*)                                 (z0)
-       D_k =-1/2i (B_k - B_{N/2 - k}^*)                                (z1)
-so: re(D_k)= 1/2 (im(B_k) + im(B_{N/2-k}))                             (z2)
-    im(D_k) = -1/2 (re(B_k) - re(B_{N/2-k}))                             (z3)
-
-    To recover the FT A from C and D, we write, rearranging (1):
-
-       A_k = \sum_{n = 0, 2, ..., N-2} a_n 1^(kn/N)
-            +\sum_{n = 1, 3, ..., N-1} a_n 1^(kn/N)
-           = \sum_{n = 0, 1, ..., N/2-1} a_n 1^(2kn/N)  + a_{n+1} 1^(2kn/N) 1^(k/N)
-           = \sum_{n = 0, 1, ..., N/2-1} c_n 1^(2kn/N)  + d_n  1^(2kn/N) 1^(k/N)
-       A_k =  C_k + 1^(k/N) D_k                                              (a0)
-
-    This equation is valid for k = 0...N/2-1, which is the range of the sequences B_k and
-    C_k.  We don't use is for k = 0, which is a special case considered below.  For
-    1 < k < N/2, it's convenient to consider the pair k, k', where k' = N/2 - k.
-    Remember that C_k' = C_k^ *and D_k' = D_k^* [where * is conjugation].  Also,
-    1^(N/2 / N) = -1.  So we have:
-       A_k' = C_k^* - 1^(k/N) D_k^*                                          (a0b)
-    We do (a0) and (a0b) together.
-
-
-
-    By symmetry this gives us the Fourier components for N/2+1, ... N, if we want
-    them.  However, it doesn't give us the value for exactly k = N/2.  For k = 0 and k = N/2, it
-    is easiest to argue directly about the meaning of the A_k, B_k and C_k in terms of
-    sums of points.
-       A_0 and A_{N/2} are both real, with A_0=\sum_n a_n, and A_1 an alternating sum
-       A_1 = a_0 - a_1 + a_2 ...
-     It's easy to show that
-              A_0 = B_0 + C_0            (a1)
-              A_{N/2} = B_0 - C_0.       (a2)
-     Since B_0 and C_0 are both real, B_0 is the real coefficient of D_0 and C_0 is the
-     imaginary coefficient.
-
-     *REVERSING THE PROCESS*
-
-     Next we want to reverse this process.  We just need to work out C_k and D_k from the
-     sequence A_k.  Then we do the inverse complex fft and we get back where we started.
-     For 0 and N/2, working from (a1) and (a2) above, we can see that:
-          B_0 = 1/2 (A_0 + A_{N/2})                                       (y0)
-          C_0 = 1/2 (A_0 + A_{N/2})                                       (y1)
-     and we use
-         D_0 = B_0 + i C_0
-     to get the 1st complex coefficient of D.  This is exactly the same as the forward process
-     except with an extra factor of 1/2.
-
-     Consider equations (a0) and (a0b).  We want to work out C_k and D_k from A_k and A_k'.  Remember
-     k' = N/2 - k.
-
-     Write down
-         A_k     =  C_k + 1^(k/N) D_k        (copying a0)
-         A_k'^* =   C_k - 1^(k/N) D_k       (conjugate of a0b)
-      So
-             C_k =            0.5 (A_k + A_k'^*)                    (p0)
-             D_k = 1^(-k/N) . 0.5 (A_k - A_k'^*)                    (p1)
-      Next, we want to compute B_k and B_k' from C_k and D_k.  C.f. (z0)..(z3), and remember
-      that k' = N/2-k.  We can see
-      that
-              B_k  = C_k + i D_k                                    (p2)
-              B_k' = C_k - i D_k                                    (p3)
-
-     We would like to make the equations (p0) ... (p3) look like the forward equations (z0), (z1),
-     (a0) and (a0b) so we can reuse the code.  Define E_k = -i 1^(k/N) D_k.  Then write down (p0)..(p3).
-     We have
-             C_k  =            0.5 (A_k + A_k'^*)                    (p0')
-             E_k  =       -0.5 i   (A_k - A_k'^*)                    (p1')
-             B_k  =    C_k - 1^(-k/N) E_k                            (p2')
-             B_k' =    C_k + 1^(-k/N) E_k                            (p3')
-     So these are exactly the same as (z0), (z1), (a0), (a0b) except replacing 1^(k/N) with
-     -1^(-k/N) .  Remember that we defined 1^x above to be exp(-2pi x/N), so the signs here
-     might be opposite to what you see in the code.
-
-     MODIFICATION: we need to take care of a factor of two.  The complex FFT we implemented
-     does not divide by N in the reverse case.  So upon inversion we get larger by N/2.
-     However, this is not consistent with normal FFT conventions where you get a factor of N.
-     For this reason we multiply by two after the process described above.
-
-*/
-
-
-/*
-   (*) [this token is referred to in a comment above].
-
-   Notes for separating 2 real transforms from one complex one.  Note that the
-   letters here (A, B, C and N) are all distinct from the same letters used in the
-   place where this comment is used.
-   Suppose we
-   have two sequences a_n and b_n, n = 0..N-1.  We combine them into a complex
-   number,
-      c_n = a_n + i b_n.
-   Then we take the fourier transform to get
-      C_k = \sum_{n = 0}^{N-1} c_n 1^(n/N) .
-   Then we use symmetry.  Define A_k and B_k as the DFTs of a and b.
-   We use A_k = A_{N-k}^*, and B_k = B_{N-k}^*, since a and b are real.  Using
-      C_k     = A_k    +  i B_k,
-      C_{N-k} = A_k^*  +  i B_k^*
-              = A_k^*  -  (i B_k)^*
-   So:
-      A_k     = 1/2  (C_k + C_{N-k}^*)
-    i B_k     = 1/2  (C_k - C_{N-k}^*)
-->    B_k     =-1/2i (C_k - C_{N-k}^*)
-->  re(B_k)   = 1/2 (im(C_k) + im(C_{N-k}))
-    im(B_k)   =-1/2 (re(C_k) - re(C_{N-k}))
-
- */
 
 template<typename Real> void ComputeDctMatrix(Matrix<Real> *M) {
   //KALDI_ASSERT(M->NumRows() == M->NumCols());
diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h
index ca50ddda7c8..8f83a4fdd71 100644
--- a/src/matrix/matrix-functions.h
+++ b/src/matrix/matrix-functions.h
@@ -34,85 +34,18 @@ namespace kaldi {
 /// @addtogroup matrix_funcs_misc
 /// @{
 
-/** The function ComplexFft does an Fft on the vector argument v.
-   v is a vector of even dimension, interpreted for both input
-   and output as a vector of complex numbers i.e.
-   \f[ v = ( re_0, im_0, re_1, im_1, ... )    \f]
-
-   If "forward == true" this routine does the Discrete Fourier Transform
-   (DFT), i.e.:
-   \f[   vout[m] \leftarrow \sum_{n = 0}^{N-1} vin[i] exp( -2pi m n / N )  \f]
-
-   If "backward" it does the Inverse Discrete Fourier Transform (IDFT)
-   *WITHOUT THE FACTOR 1/N*,
-   i.e.:
-   \f[   vout[m] <-- \sum_{n = 0}^{N-1} vin[i] exp(  2pi m n / N )   \f]
-   [note the sign difference on the 2 pi for the backward one.]
-
-   Note that this is the definition of the FT given in most texts, but
-   it differs from the Numerical Recipes version in which the forward
-   and backward algorithms are flipped.
-
-   Note that you would have to multiply by 1/N after the IDFT to get
-   back to where you started from.  We don't do this because
-   in some contexts, the transform is made symmetric by multiplying
-   by sqrt(N) in both passes.   The user can do this by themselves.
-
-   See also SplitRadixComplexFft, declared in srfft.h, which is more efficient
-   but only works if the length of the input is a power of 2.
- */
-template<typename Real> void ComplexFft (VectorBase<Real> *v, bool forward, Vector<Real> *tmp_work = NULL);
-
-/// ComplexFt is the same as ComplexFft but it implements the Fourier
-/// transform in an inefficient way.  It is mainly included for testing purposes.
-/// See comment for ComplexFft to describe the input and outputs and what it does.
-template<typename Real> void ComplexFt (const VectorBase<Real> &in,
-                                     VectorBase<Real> *out, bool forward);
-
-/// RealFft is a fourier transform of real inputs.  Internally it uses
-/// ComplexFft.  The input dimension N must be even.  If forward == true,
-/// it transforms from a sequence of N real points to its complex fourier
-/// transform; otherwise it goes in the reverse direction.  If you call it
-/// in the forward and then reverse direction and multiply by 1.0/N, you
-/// will get back the original data.
-/// The interpretation of the complex-FFT data is as follows: the array
-/// is a sequence of complex numbers C_n of length N/2 with (real, im) format,
-/// i.e. [real0, real_{N/2}, real1, im1, real2, im2, real3, im3, ...].
-/// See also SplitRadixRealFft, declared in srfft.h, which is more efficient
-/// but only works if the length of the input is a power of 2.
-
-template<typename Real> void RealFft (VectorBase<Real> *v, bool forward);
-
-
-/// RealFt has the same input and output format as RealFft above, but it is
-/// an inefficient implementation included for testing purposes.
-template<typename Real> void RealFftInefficient (VectorBase<Real> *v, bool forward);
-
-/// ComputeDctMatrix computes a matrix corresponding to the DCT, such that
-/// M * v equals the DCT of vector v.  M must be square at input.
-/// This is the type = III DCT with normalization, corresponding to the
-/// following equations, where x is the signal and X is the DCT:
-/// X_0 = 1/sqrt(2*N) \sum_{n = 0}^{N-1} x_n
-/// X_k = 1/sqrt(N) \sum_{n = 0}^{N-1} x_n cos( \pi/N (n + 1/2) k )
-/// This matrix's transpose is its own inverse, so transposing this
-/// matrix will give the inverse DCT.
-/// Caution: the type III DCT is generally known as the "inverse DCT" (with the
-/// type II being the actual DCT), so this function is somewhatd mis-named.  It
-/// was probably done this way for HTK compatibility.  We don't change it
-/// because it was this way from the start and changing it would affect the
-/// feature generation.
 
 template<typename Real> void ComputeDctMatrix(Matrix<Real> *M);
 
 
 /// ComplexMul implements, inline, the complex multiplication b *= a.
 template<typename Real> inline void ComplexMul(const Real &a_re, const Real &a_im,
-                                            Real *b_re, Real *b_im);
+                                               Real *b_re, Real *b_im);
 
 /// ComplexMul implements, inline, the complex operation c += (a * b).
 template<typename Real> inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
-                                                   const Real &b_re, const Real &b_im,
-                                                   Real *c_re, Real *c_im);
+                                                      const Real &b_re, const Real &b_im,
+                                                      Real *c_re, Real *c_im);
 
 
 /// ComplexImExp implements a <-- exp(i x), inline.
diff --git a/src/matrix/matrix-lib-test.cc b/src/matrix/matrix-lib-test.cc
index afc0340c310..754fa836c97 100644
--- a/src/matrix/matrix-lib-test.cc
+++ b/src/matrix/matrix-lib-test.cc
@@ -3392,20 +3392,6 @@ template<typename Real> static void UnitTestTrace() {
 }
 
 
-template<typename Real> static void UnitTestComplexFt() {
-
-  // Make sure it inverts properly.
-  for (MatrixIndexT d = 0; d < 10; d++) {
-    MatrixIndexT N = Rand() % 100, twoN = 2*N;
-    Vector<Real> v(twoN), w(twoN), x(twoN);
-    v.SetRandn();
-    ComplexFt(v, &w, true);
-    ComplexFt(w, &x, false);
-    if (N>0) x.Scale(1.0/static_cast<Real>(N));
-    AssertEqual(v, x);
-  }
-}
-
 template<typename Real> static void UnitTestDct() {
 
   // Check that DCT matrix is orthogonal (i.e. M^T M = I);
@@ -3419,35 +3405,6 @@ template<typename Real> static void UnitTestDct() {
   }
 
 }
-template<typename Real> static void UnitTestComplexFft() {
-
-  // Make sure it inverts properly.
-  for (MatrixIndexT N_ = 0; N_ < 100; N_+=3) {
-    MatrixIndexT N = N_;
-    if (N>=95) {
-      N = ( Rand() % 150);
-      N = N*N;  // big number.
-    }
-
-    MatrixIndexT twoN = 2*N;
-    Vector<Real> v(twoN), w_base(twoN), w_alg(twoN), x_base(twoN), x_alg(twoN);
-
-    v.SetRandn();
-
-    if (N< 100) ComplexFt(v, &w_base, true);
-    w_alg.CopyFromVec(v);
-    ComplexFft(&w_alg, true);
-    if (N< 100) AssertEqual(w_base, w_alg, 0.01*N);
-
-    if (N< 100) ComplexFt(w_base, &x_base, false);
-    x_alg.CopyFromVec(w_alg);
-    ComplexFft(&x_alg, false);
-
-    if (N< 100) AssertEqual(x_base, x_alg, 0.01*N);
-    x_alg.Scale(1.0/N);
-    AssertEqual(v, x_alg, 0.001*N);
-  }
-}
 
 
 template<typename Real> static void UnitTestSplitRadixComplexFft() {
@@ -3461,11 +3418,10 @@ template<typename Real> static void UnitTestSplitRadixComplexFft() {
     std::vector<Real> temp_buffer;
     SplitRadixComplexFft<Real> srfft(N), srfft2(srfft);
     for (MatrixIndexT p = 0; p < 3; p++) {
-      Vector<Real> v(twoN), w_base(twoN), w_alg(twoN), x_base(twoN), x_alg(twoN);
+      Vector<Real> v(twoN), w_alg(twoN), x_alg(twoN);
 
       v.SetRandn();
 
-      if (N< 100) ComplexFt(v, &w_base, true);
       w_alg.CopyFromVec(v);
 
       if (Rand() % 2 == 0)
@@ -3473,13 +3429,9 @@ template<typename Real> static void UnitTestSplitRadixComplexFft() {
       else
         srfft2.Compute(w_alg.Data(), true, &temp_buffer);
 
-      if (N< 100) AssertEqual(w_base, w_alg, 0.01*N);
-
-      if (N< 100) ComplexFt(w_base, &x_base, false);
       x_alg.CopyFromVec(w_alg);
       srfft.Compute(x_alg.Data(), false);
 
-      if (N< 100) AssertEqual(x_base, x_alg, 0.01*N);
       x_alg.Scale(1.0/N);
       AssertEqual(v, x_alg, 0.001*N);
     }
@@ -3556,38 +3508,6 @@ template<typename Real> static void UnitTestAddVecToCols() {
   }
 }
 
-template<typename Real> static void UnitTestComplexFft2() {
-
-  // Make sure it inverts properly.
-  for (MatrixIndexT pos = 0; pos < 10; pos++) {
-    for (MatrixIndexT N_ = 2; N_ < 15; N_+=2) {
-      if ( pos < N_) {
-        MatrixIndexT N = N_;
-        Vector<Real> v(N), vorig(N), v2(N);
-        v(pos)  = 1.0;
-        vorig.CopyFromVec(v);
-        // KALDI_LOG << "Original v:\n" << v;
-        ComplexFft(&v, true);
-        // KALDI_LOG << "one fft:\n" << v;
-        ComplexFt(vorig, &v2, true);
-        // KALDI_LOG << "one fft[baseline]:\n" << v2;
-        if (!ApproxEqual(v, v2) ) {
-          ComplexFft(&vorig, true);
-          KALDI_ASSERT(0);
-        }
-        ComplexFft(&v, false);
-        // KALDI_LOG << "one more:\n" << v;
-        v.Scale(1.0/(N/2));
-        if (!ApproxEqual(v, vorig)) {
-          ComplexFft(&vorig, true);
-          KALDI_ASSERT(0);
-        }// AssertEqual(v, vorig);
-      }
-    }
-  }
-}
-
-
 template<typename Real> static void UnitTestSplitRadixComplexFft2() {
 
   // Make sure it inverts properly.
@@ -3608,34 +3528,6 @@ template<typename Real> static void UnitTestSplitRadixComplexFft2() {
 }
 
 
-template<typename Real> static void UnitTestRealFft() {
-
-  // First, test RealFftInefficient.
-  for (MatrixIndexT N_ = 2; N_ < 100; N_ += 6) {
-    MatrixIndexT N = N_;
-    if (N >90) N *= Rand() % 60;
-    Vector<Real> v(N), w(N), x(N), y(N);
-    v.SetRandn();
-    w.CopyFromVec(v);
-    RealFftInefficient(&w, true);
-    y.CopyFromVec(v);
-    RealFft(&y, true);  // test efficient one.
-    // KALDI_LOG <<"v = "<<v;
-    // KALDI_LOG << "Inefficient real fft of v is: "<< w;
-    // KALDI_LOG << "Efficient real fft of v is: "<< y;
-    AssertEqual(w, y, 0.01*N);
-    x.CopyFromVec(w);
-    RealFftInefficient(&x, false);
-    RealFft(&y, false);
-    // KALDI_LOG << "Inefficient real fft of v twice is: "<< x;
-    if (N != 0) x.Scale(1.0/N);
-    if (N != 0) y.Scale(1.0/N);
-    AssertEqual(v, x, 0.001*N);
-    AssertEqual(v, y, 0.001*N);  // ?
-  }
-}
-
-
 template<typename Real> static void UnitTestSplitRadixRealFft() {
 
   for (MatrixIndexT p = 0; p < 30; p++) {
@@ -3645,46 +3537,21 @@ template<typename Real> static void UnitTestSplitRadixRealFft() {
     SplitRadixRealFft<Real> srfft(N), srfft2(srfft);
     std::vector<Real> temp_buffer;
     for (MatrixIndexT q = 0; q < 3; q++) {
-      Vector<Real> v(N), w(N), x(N), y(N);
+      Vector<Real> v(N), y(N);
       v.SetRandn();
-      w.CopyFromVec(v);
-      RealFftInefficient(&w, true);
       y.CopyFromVec(v);
       if (Rand() % 2 == 0)
         srfft.Compute(y.Data(), true);
       else
         srfft2.Compute(y.Data(), true, &temp_buffer);
 
-      // KALDI_LOG <<"v = "<<v;
-      // KALDI_LOG << "Inefficient real fft of v is: "<< w;
-      // KALDI_LOG << "Efficient real fft of v is: "<< y;
-      AssertEqual(w, y, 0.01*N);
-      x.CopyFromVec(w);
-      RealFftInefficient(&x, false);
       srfft.Compute(y.Data(), false);
-      // KALDI_LOG << "Inefficient real fft of v twice is: "<< x;
-      x.Scale(1.0/N);
       y.Scale(1.0/N);
-      AssertEqual(v, x, 0.001*N);
-      AssertEqual(v, y, 0.001*N);  // ?
+      AssertEqual(v, y, 0.001*N);
     }
   }
 }
 
-
-
-template<typename Real> static void UnitTestRealFftSpeed() {
-
-  // First, test RealFftInefficient.
-  KALDI_LOG << "starting. ";
-  MatrixIndexT sz = 512;  // fairly typical size.
-  for (MatrixIndexT i = 0; i < 3000; i++) {
-    if (i % 1000 == 0) KALDI_LOG << "done 1000 [ == ten seconds of speech]";
-    Vector<Real> v(sz);
-    RealFft(&v, true);
-  }
-}
-
 template<typename Real> static void UnitTestSplitRadixRealFftSpeed() {
   KALDI_LOG << "starting. ";
   MatrixIndexT sz = 512;  // fairly typical size.
@@ -4614,14 +4481,10 @@ template<typename Real> static void MatrixUnitTest(bool full_test) {
   // commenting these out for now-- they test the speed, but take a while.
   // UnitTestSplitRadixRealFftSpeed<Real>();
   // UnitTestRealFftSpeed<Real>();   // won't exit!/
-  UnitTestComplexFt<Real>();
   KALDI_LOG << " Point B";
-  UnitTestComplexFft2<Real>();
-  UnitTestComplexFft<Real>();
   UnitTestSplitRadixComplexFft<Real>();
   UnitTestSplitRadixComplexFft2<Real>();
   UnitTestDct<Real>();
-  UnitTestRealFft<Real>();
   KALDI_LOG << " Point C";
   UnitTestSplitRadixRealFft<Real>();
   UnitTestSvd<Real>();
diff --git a/src/nnet3/nnet-attention-component.cc b/src/nnet3/nnet-attention-component.cc
index 58e662af774..6e154786f22 100644
--- a/src/nnet3/nnet-attention-component.cc
+++ b/src/nnet3/nnet-attention-component.cc
@@ -545,7 +545,7 @@ bool RestrictedAttentionComponent::IsComputable(
       } else {
         // This input index is not available.
         int32 offset = (t - output_index.t) / time_stride_;
-        if (offset >= num_left_inputs_required_ &&
+        if (offset >= -num_left_inputs_required_ &&
             offset <= num_right_inputs_required_) {
           used_inputs->clear();
           return false;
@@ -555,9 +555,9 @@ bool RestrictedAttentionComponent::IsComputable(
     // All required time-offsets of the output were computable. -> return true.
     return true;
   } else {
-    int32 t = output_index.t,
-        first_time_required = t - (time_stride_ * num_left_inputs_required_),
-        last_time_required = t + (time_stride_ * num_right_inputs_required_);
+    int32 output_t = output_index.t,
+        first_time_required = output_t - (time_stride_ * num_left_inputs_required_),
+        last_time_required = output_t + (time_stride_ * num_right_inputs_required_);
     for (int32 t = first_time_required;
          t <= last_time_required;
          t += time_stride_) {
diff --git a/src/nnet3/nnet-attention-component.h b/src/nnet3/nnet-attention-component.h
index 6072fcd681e..c280e827817 100644
--- a/src/nnet3/nnet-attention-component.h
+++ b/src/nnet3/nnet-attention-component.h
@@ -96,7 +96,9 @@ namespace nnet3 {
                       if you set this, online (looped) decoding will not work
                       correctly.  It might be wiser just to reduce num-right-inputs
                       if you care about real-time decoding.
-     key-scale        Scale on the keys (but not the added context).  Defaults to 1.0 /
+     key-scale        Scale on the keys (but not the added context, by which we
+                      mean the encoding of the position of the input frame relative
+                      to the current frame).  Defaults to 1.0 /
                       sqrt(key-dim), like the 1/sqrt(d_k) value in the
                       "Attention is all you need" paper.  This helps prevent saturation
                       of the softmax.
@@ -169,8 +171,10 @@ class RestrictedAttentionComponent: public Component {
                                const Index &output_index,
                                std::vector<Index> *desired_indexes) const;
 
-  // This function returns true if at least one of the input indexes used to
-  // compute this output index is computable.
+  // This function returns true if all of the required input indexes used to
+  // compute the output were computable (depends on num_left_inputs_required_
+  // and num_right_inputs_required_).  If used_inputs is non-NULL, all of the
+  // input indexes (required or not) that were available are written to it.
   virtual bool IsComputable(const MiscComputationInfo &misc_info,
                             const Index &output_index,
                             const IndexSet &input_index_set,
diff --git a/src/nnet3/nnet-computation-graph.cc b/src/nnet3/nnet-computation-graph.cc
index 9c84115d406..ea30b004092 100644
--- a/src/nnet3/nnet-computation-graph.cc
+++ b/src/nnet3/nnet-computation-graph.cc
@@ -135,7 +135,9 @@ void ComputationGraphBuilder::ExplainWhyNotComputable(
     int32 first_cindex_id) const {
   int32 max_lines_print = 100;
   std::deque<int32> cindexes_to_explain;
+  std::vector<bool> added_to_queue(graph_->cindexes.size(), false);
   cindexes_to_explain.push_back(first_cindex_id);
+  added_to_queue[first_cindex_id] = true;
   KALDI_ASSERT(graph_->cindexes.size() == graph_->dependencies.size());
   std::ostringstream os;
   os << "*** cindex ";
@@ -148,18 +150,17 @@ void ComputationGraphBuilder::ExplainWhyNotComputable(
     cindexes_to_explain.pop_front();
     KALDI_ASSERT(static_cast<size_t>(cindex_id) < graph_->cindexes.size());
     PrintCindexId(os, cindex_id);
-    os << " is " << static_cast<ComputableInfo>(
-        computable_info_[cindex_id]) << ", dependencies: ";
+    os << " is " << cindex_info_[cindex_id].computable << ", dependencies: ";
     const std::vector<int32> dependencies = graph_->dependencies[cindex_id];
     std::vector<int32>::const_iterator iter = dependencies.begin(),
         end = dependencies.end();
     for (; iter != end; iter++) {
       int32 dep_cindex_id = *iter;
       PrintCindexId(os, dep_cindex_id);
-      ComputableInfo status = static_cast<ComputableInfo>(
-          computable_info_[cindex_id]);
-      if (status != kComputable) {
-        os << '[' << status << ']';
+      const CindexInfo &dep_info = cindex_info_[dep_cindex_id];
+      os << '[' << dep_info.computable << ']';
+      if (dep_info.computable != kComputable && !added_to_queue[dep_cindex_id]) {
+        added_to_queue[dep_cindex_id] = true;
         cindexes_to_explain.push_back(dep_cindex_id);
       }
       if (iter+2 != end)
@@ -223,16 +224,19 @@ void ComputationGraph::Print(std::ostream &os,
 
 
 // inline
-void ComputationGraphBuilder::AddCindexId(int32 cindex_id,
-                                          bool is_input,
-                                          bool is_output) {
+void ComputationGraphBuilder::AddCindexId(int32 cindex_id) {
   // If this cindex_id has just now been added to the graph, the following
   // assert should succeed.
-  KALDI_PARANOID_ASSERT(cindex_id == computable_queued_.size() &&
-                        cindex_id == computable_info_.size() &&
-                        cindex_id == depend_on_this_.size() &&
-                        cindex_id == usable_count_.size());
+  KALDI_PARANOID_ASSERT(cindex_id == depend_on_this_.size() &&
+                        cindex_id == cindex_info_.size());
+  depend_on_this_.push_back(std::vector<int32>());
+  cindex_info_.push_back(CindexInfo());
+}
+/*
+  CindexInfo &info = cindex_info_.back();
   if (is_input) {
+    info.computable = k
+
     computable_info_.push_back(kComputable);
     computable_queued_.push_back(false);
   } else {
@@ -242,9 +246,9 @@ void ComputationGraphBuilder::AddCindexId(int32 cindex_id,
     computable_queued_.push_back(false);
     next_queue_.push_back(cindex_id);
   }
-  depend_on_this_.push_back(std::vector<int32>());
+  depend_on_this_.
   usable_count_.push_back(is_output ? 1 : 0);
-}
+  }*/
 
 
 void ComputationGraphBuilder::AddInputs() {
@@ -263,7 +267,8 @@ void ComputationGraphBuilder::AddInputs() {
       bool is_input = true, is_new;
       int32 cindex_id = graph_->GetCindexId(cindex, is_input, &is_new);
       KALDI_ASSERT(is_new && "Input index seems to be listed more than once");
-      AddCindexId(cindex_id, true, false);
+      AddCindexId(cindex_id);
+      cindex_info_.back().computable = kComputable;
       num_added++;
     }
   }
@@ -282,7 +287,10 @@ void ComputationGraphBuilder::AddOutputs() {
       bool is_input = false, is_new;
       int32 cindex_id = graph_->GetCindexId(cindex, is_input, &is_new);
       KALDI_ASSERT(is_new && "Output index seems to be listed more than once");
-      AddCindexId(cindex_id, false, true);
+      AddCindexId(cindex_id);
+      cindex_info_.back().usable_count = 1;
+      cindex_info_.back().queued = true;
+      next_queue_.push_back(cindex_id);
       num_added++;
     }
   }
@@ -290,17 +298,15 @@ void ComputationGraphBuilder::AddOutputs() {
     KALDI_ERR << "Cannot process computation request with no outputs";
   }
   current_distance_ = 0;
-  // the calls to AddCindexId in this function will have added to next_queue_.
   KALDI_ASSERT(current_queue_.empty());
   current_queue_.swap(next_queue_);
 }
 
 bool ComputationGraphBuilder::AllOutputsAreComputable() const {
-  char is_computable_char = static_cast<char>(kComputable);
-  std::vector<char>::const_iterator iter = computable_info_.begin(),
-      end = computable_info_.end();
+  auto iter = cindex_info_.begin(),
+      end = cindex_info_.end();
   for (int32 cindex_id = 0; iter != end; ++iter, ++cindex_id) {
-    if (*iter != is_computable_char) {  // is not computable.
+    if (iter->computable != kComputable) {
       int32 network_node = graph_->cindexes[cindex_id].first;
       if (nnet_.IsOutputNode(network_node))
         return false;
@@ -318,8 +324,6 @@ std::ostream& operator << (std::ostream &os,
       break;
     case ComputationGraphBuilder::kNotComputable: os << "kNotComputable";
       break;
-    case ComputationGraphBuilder::kWillNotCompute: os << "kWillNotCompute";
-      break;
     default: os << "[invalid enum value]"; break;
   }
   return os;
@@ -335,10 +339,9 @@ void ComputationGraphBuilder::ExplainWhyAllOutputsNotComputable() const {
       end = graph_->cindexes.end();
   for (int32 cindex_id = 0; iter != end; ++iter,++cindex_id) {
     int32 network_node = iter->first;
-    ComputableInfo c = static_cast<ComputableInfo>(computable_info_[cindex_id]);
     if (nnet_.IsOutputNode(network_node)) {
       num_outputs_total++;
-      if (c != kComputable)
+      if (cindex_info_[cindex_id].computable != kComputable)
         outputs_not_computable.push_back(cindex_id);
     }
   }
@@ -362,16 +365,17 @@ void ComputationGraphBuilder::ExplainWhyAllOutputsNotComputable() const {
 // which are actually used in computing it.  It also clears the dependencies
 // of those cindexes that are not computable.
 void ComputationGraphBuilder::PruneDependencies(int32 cindex_id) {
-  ComputableInfo c = static_cast<ComputableInfo>(computable_info_[cindex_id]);
-  // by the time this is called, there should be no cindexes with unknown state.
-  KALDI_ASSERT(c != kUnknown);
-  if (c == kNotComputable || c == kWillNotCompute) {
-    // if something is not computable, there is no point
+  const CindexInfo &info = cindex_info_[cindex_id];
+  // by the time this is called, there should be no cindexes with unknown state
+  // that are usable.
+  KALDI_ASSERT(!(info.computable == kUnknown && info.usable_count != 0));
+  if (info.computable == kNotComputable || info.usable_count == 0) {
+    // if something is not computable or is not usable, there is no point
     // keeping around its dependencies.
     graph_->dependencies[cindex_id].clear();
     return;
   }
-  KALDI_ASSERT(c == kComputable);
+  KALDI_ASSERT(info.computable == kComputable);
   const Cindex &cindex = graph_->cindexes[cindex_id];
   int32 node_id = cindex.first;
   const Index &index = cindex.second;
@@ -385,7 +389,7 @@ void ComputationGraphBuilder::PruneDependencies(int32 cindex_id) {
     case kDescriptor: {
       const Descriptor &desc = node.descriptor;
       bool dont_care = false;  // there should be no kUnknown, and we check this
-      CindexSet cindex_set(*graph_, computable_info_, dont_care);
+      CindexSet cindex_set(*graph_, cindex_info_, dont_care);
       std::vector<Cindex> used_cindexes;
       bool ans = desc.IsComputable(index, cindex_set, &used_cindexes);
       // If the next assert fails it could be a failure in the assumption that
@@ -410,7 +414,7 @@ void ComputationGraphBuilder::PruneDependencies(int32 cindex_id) {
       // In the line below, node_id - 1 is the index of the component-input
       // node-- the descriptor at the input to the component.  We are interested
       // in the set of inputs to the component that are computable.
-      IndexSet index_set(*graph_, computable_info_, node_id - 1, dont_care);
+      IndexSet index_set(*graph_, cindex_info_, node_id - 1, dont_care);
       std::vector<Index> used_indexes;
       bool ans = c->IsComputable(request_->misc_info, index, index_set,
                                  &used_indexes);
@@ -476,12 +480,10 @@ void ComputationGraphBuilder::Compute(const ComputationRequest &request) {
     // only check rarely if we're running at low verbose level.
     if (GetVerboseLevel() >= 3 || RandInt(1,  (current_distance_ + 1)) == 1)
       Check(cur_segment_start);
-    // TODO: come up with a scheme to delay when we call
-    // UpdateAllComputableInfo().
-    UpdateAllComputableInfo();
     if (current_queue_.empty()) // we're done.
       break;
   }
+  KALDI_VLOG(6) << "current_distance = " << current_distance_; // TEMP
   if (current_distance_ == max_distance)
     KALDI_ERR << "Loop detected while building computation graph (bad "
               << "network topology?)";
@@ -507,7 +509,8 @@ void ComputationGraphBuilder::Check(int32 start_cindex_id) const {
         KALDI_ASSERT(std::count(dep.begin(), dep.end(), cindex_id) == 1);
       }
     }
-    { // check dependencies.
+    if (cindex_info_[cindex_id].dependencies_computed) {
+      // check dependencies.
       std::vector<int32> dependencies = graph_->dependencies[cindex_id];
       int32 size = dependencies.size();
       std::sort(dependencies.begin(), dependencies.end());
@@ -525,49 +528,40 @@ void ComputationGraphBuilder::Check(int32 start_cindex_id) const {
     {
       // check usable_count_
       int32 node_index = graph_->cindexes[cindex_id].first;
-      int32 usable_count = usable_count_[cindex_id],
+      int32 usable_count = cindex_info_[cindex_id].usable_count,
           usable_count_recomputed = nnet_.IsOutputNode(node_index) ? 1 : 0;
       std::vector<int32> depend_on_this = depend_on_this_[cindex_id];
       int32 size = depend_on_this.size();
       for (size_t j = 0; j < size; j++) {
         int32 other_cindex_id = depend_on_this[j];
-        if (usable_count_[other_cindex_id] != 0 &&
-            computable_info_[other_cindex_id] != kNotComputable)
+        if (cindex_info_[other_cindex_id].usable_count != 0 &&
+            cindex_info_[other_cindex_id].computable != kNotComputable)
           usable_count_recomputed++;
       }
       KALDI_ASSERT(usable_count == usable_count_recomputed);
     }
-    // check computable_info_.  note: this will not be accurate
-    // if the cindex_id is still queued to have dependencies added
-    // (in cur_queue_ or next_queue_).
-    if (computable_queue_.empty()) {
+    // check `computable`.
+    if (cindex_info_[cindex_id].dependencies_computed) {
       ComputationGraphBuilder::ComputableInfo c =
           ComputeComputableInfo(cindex_id);
-      // the status doesn't have to be correct if it's kWillNotCompute,
-      // because these are cindex-ids that we chose not to compute
-      // because we determined they would not be useful, and
-      // ComputeComputableInfo() will never return this value.
-      if (c != computable_info_[cindex_id] &&
-          computable_info_[cindex_id] != kWillNotCompute) {
-        int32 count_cur = std::count(current_queue_.begin(),
-                                     current_queue_.end(), cindex_id),
-            count_next = std::count(next_queue_.begin(),
-                                    next_queue_.end(), cindex_id);
-        // if it wasn't queued, then something is wrong.
-        if (count_cur + count_next == 0)
-          KALDI_ERR << "Mismatch in computable status";
+      // the status doesn't have to match if the stored info is kUnknown.
+      if (c != cindex_info_[cindex_id].computable &&
+          cindex_info_[cindex_id].computable != kUnknown) {
+        KALDI_ERR << "Mismatch in computable status";
       }
     }
-    // check computable_queued_.
+    // check `queued`.
     // note, the following checks might be a bit slow.
-    if (computable_queued_[cindex_id]) {
-      KALDI_ASSERT(std::count(computable_queue_.begin(),
-                              computable_queue_.end(),
-                              cindex_id) == 1);
-    } else {
-      KALDI_ASSERT(std::count(computable_queue_.begin(),
-                              computable_queue_.end(),
-                              cindex_id) == 0);
+    if (RandInt(0, cindex_id) == 0) {
+      if (cindex_info_[cindex_id].queued) {
+        KALDI_ASSERT(std::count(current_queue_.begin(),
+                                current_queue_.end(),
+                                cindex_id) == 1);
+      } else {
+        KALDI_ASSERT(std::count(current_queue_.begin(),
+                                current_queue_.end(),
+                                cindex_id) == 0);
+      }
     }
   }
 }
@@ -595,13 +589,13 @@ void ComputationGraphBuilder::Prune() {
   std::vector<bool> keep(num_cindex_ids - start_cindex_id, false);
   for (int32 c = start_cindex_id; c < num_cindex_ids; c++) {
     if (required[c - start_cindex_id] || graph_->is_input[c]) {
-      KALDI_ASSERT(computable_info_[c] == kComputable &&
+      KALDI_ASSERT(cindex_info_[c].computable == kComputable &&
                    "You are calling Prune when not everything is computable.");
       keep[c - start_cindex_id] = true;
     }
   }
   graph_->Renumber(start_cindex_id, keep);
-  // We also need to renumber computable_info_ and usable_count_, which
+  // We also need to renumber cindex_info_, which
   // graph_->Renumber doesn't do for us, but we can make some shortcuts.  We set
   // all computable_info_ to kComputable because actually it all was kComputable
   // (we checked when deciding what to keep); and we set the usable_count_ to 1
@@ -613,21 +607,18 @@ void ComputationGraphBuilder::Prune() {
   // segments" is not critical.  [this information only gets used if we process
   // additional segments as part of the compilation of an online computation.]
   int32 new_num_cindex_ids = graph_->cindexes.size();
-  computable_info_.resize(start_cindex_id);
-  computable_info_.resize(new_num_cindex_ids, (char)kComputable);
-  usable_count_.resize(start_cindex_id);
-  usable_count_.resize(new_num_cindex_ids, 1);
+  cindex_info_.resize(start_cindex_id);
+  cindex_info_.resize(new_num_cindex_ids);
+  for (int32 i = start_cindex_id; i < new_num_cindex_ids; i++) {
+    cindex_info_[i].computable = kComputable;
+    cindex_info_[i].usable_count = 1;
+  }
   // depend_on_this_ is a vector of vectors-- keeping track of the reverse of
   // the dependencies-- and I believe we won't be needing this information any
   // more past this point.
   depend_on_this_.resize(start_cindex_id);
   depend_on_this_.resize(new_num_cindex_ids);
-  // computable_queued_ also shouldn't be queried past this point, but
-  // I believe they should all be false at this point anyway (note that
-  // we assert below that computable_queue_ is empty).
-  computable_queued_.resize(new_num_cindex_ids);
 
-  KALDI_ASSERT(computable_queue_.empty());
   graph_->segment_ends.push_back(new_num_cindex_ids);
 }
 
@@ -636,7 +627,6 @@ void ComputationGraphBuilder::AddDependencies(int32 cindex_id) {
   if (static_cast<int32>(graph_->dependencies.size()) <= cindex_id) {
     graph_->dependencies.resize(2 * cindex_id + 1);
   }
-
   Cindex cindex = graph_->cindexes[cindex_id];
 
   // find the dependencies of this cindex.
@@ -696,8 +686,11 @@ void ComputationGraphBuilder::AddDependencies(int32 cindex_id) {
     int32 dep_cindex_id = graph_->GetCindexId(input_cindexes[i],
                                               is_input, &is_new);
     this_dep[i] = dep_cindex_id;
-    if (is_new)
-      AddCindexId(dep_cindex_id, false, false);
+    if (is_new) {
+      AddCindexId(dep_cindex_id);
+      cindex_info_.back().queued = true;
+      next_queue_.push_back(dep_cindex_id);
+    }
     // we will keep dependent's usable_count_ up to date below
   }
   // remove duplicates of dependencies.
@@ -720,15 +713,6 @@ void ComputationGraphBuilder::AddDependencies(int32 cindex_id) {
     depend_on_this_[dep_cindex_id].push_back(cindex_id);
     IncrementUsableCount(dep_cindex_id);
   }
-
-  // Now that we've added the dependencies, we can put this into
-  // the computable_queue_ to assess whether it's computable
-  KALDI_ASSERT(computable_info_[cindex_id] == kUnknown &&
-               !computable_queued_[cindex_id]);
-  // we think it'll be faster in the next line to do push_front instead of
-  // push_back; either one would be correct.
-  computable_queue_.push_front(cindex_id);
-  computable_queued_[cindex_id] = true;
 }
 
 
@@ -743,18 +727,19 @@ ComputationGraphBuilder::ComputeComputableInfo(int32 cindex_id)
     case kDescriptor: {
       const Descriptor &desc = node.descriptor;
       {
-        CindexSet cindex_set(*graph_, computable_info_, false);
+        CindexSet cindex_set(*graph_, cindex_info_, false);
         if (desc.IsComputable(index, cindex_set, NULL)) {
-          // it's computable even without counting kUnknown inputs as computable
-          // [treat_unknown_as_computable = false] -> definitely computable.
+          // it's computable even without counting kUnknown inputs as
+          // computable [treat_unknown_as_computable = false] ->
+          // definitely computable.
           return kComputable;
         }
       }
-      CindexSet cindex_set2(*graph_, computable_info_, true);
+      CindexSet cindex_set2(*graph_, cindex_info_, true);
       if (!desc.IsComputable(index, cindex_set2, NULL)) {
-        // it's not computable even when counting kUnknown inputs as
-        // computable [treat_unknown_as_computable = true] -> definitely not
-        // computable.
+        // it's not computable even when counting kUnknown
+        // inputs as computable [treat_unknown_as_computable = true] ->
+        // definitely not computable.
         return kNotComputable;
       }
       return kUnknown;
@@ -763,14 +748,15 @@ ComputationGraphBuilder::ComputeComputableInfo(int32 cindex_id)
       const Component *c = nnet_.GetComponent(node.u.component_index);
       const int32 input_node_id = node_id - 1;
       {
-        IndexSet index_set(*graph_, computable_info_, input_node_id, false);
+        IndexSet index_set(*graph_, cindex_info_, input_node_id, false);
         if (c->IsComputable(request_->misc_info, index, index_set, NULL)) {
-          // it's computable even without counting kUnknown inputs as computable
-          // [treat_unknown_as_computable = false] -> definitely computable.
+          // it's computable even without counting kUnknown
+          // inputs as computable [treat_unknown_as_computable = false] ->
+          // definitely computable.
           return kComputable;
         }
       }
-      IndexSet index_set2(*graph_, computable_info_, input_node_id, true);
+      IndexSet index_set2(*graph_, cindex_info_, input_node_id, true);
       if (!c->IsComputable(request_->misc_info, index, index_set2, NULL)) {
         // it's not computable even when counting kUnknown inputs as computable
         // [treat_unknown_as_computable = true] -> definitely not computable.
@@ -782,7 +768,7 @@ ComputationGraphBuilder::ComputeComputableInfo(int32 cindex_id)
       Cindex input_cindex(node.u.node_index, index);
       int32 input_cindex_id = graph_->GetCindexId(input_cindex);
       if (input_cindex_id != -1)
-        return ComputableInfo(computable_info_[input_cindex_id]);
+        return cindex_info_[input_cindex_id].computable;
       else
         return kUnknown;
     }
@@ -802,7 +788,7 @@ void ComputationGraphBuilder::GetComputableInfo(
     std::vector<std::vector<bool> > *computable) const {
   KALDI_ASSERT(!graph_->cindexes.empty() &&
                "You need to call this after Compute()!");
-  KALDI_ASSERT(!computable_info_.empty() &&
+  KALDI_ASSERT(!cindex_info_.empty() &&
                "You need to call this before Prune()!");
   computable->clear();
   computable->resize(request_->outputs.size());
@@ -817,7 +803,7 @@ void ComputationGraphBuilder::GetComputableInfo(
       Cindex cindex(n, output.indexes[j]);
       int32 cindex_id = graph_->GetCindexId(cindex);
       KALDI_ASSERT(cindex_id != -1);
-      this_vec[j] = (computable_info_[cindex_id] == kComputable);
+      this_vec[j] = (cindex_info_[cindex_id].computable == kComputable);
     }
   }
 }
@@ -826,11 +812,18 @@ void ComputationGraphBuilder::GetComputableInfo(
 void ComputationGraphBuilder::UpdateComputableInfo(int32 cindex_id) {
   // if the current computable_info_ for cindex_id value is not kUnknown, this
   // cindex_id should not have been in the queue.
-  KALDI_ASSERT(static_cast<size_t>(cindex_id) < computable_info_.size());
-  char &output = computable_info_[cindex_id];
+  KALDI_ASSERT(static_cast<size_t>(cindex_id) < cindex_info_.size());
+
+  // We don't need to update the computable info of this node since it's
+  // not usable (i.e. not currently reachable from any node that is not
+  // kNotComputable).
+  if (cindex_info_[cindex_id].usable_count == 0)
+    return;
+
+  ComputableInfo &output = cindex_info_[cindex_id].computable;
   KALDI_ASSERT(output == kUnknown);
 
-  output = static_cast<char>(ComputeComputableInfo(cindex_id));
+  output = ComputeComputableInfo(cindex_id);
 
   if (output != kUnknown) {
     // The computable status of cindexes that depend on this cindex and whose
@@ -840,15 +833,15 @@ void ComputationGraphBuilder::UpdateComputableInfo(int32 cindex_id) {
         end = depend_on_this_[cindex_id].end();
     for (; iter != end; ++iter) {
       int32 other_cindex_id = *iter;
-      if (computable_info_[other_cindex_id] == kUnknown &&
-          !computable_queued_[other_cindex_id]) {
-        computable_queue_.push_back(other_cindex_id);
-        computable_queued_[other_cindex_id] = true;
+      if (cindex_info_[other_cindex_id].computable == kUnknown &&
+          !cindex_info_[other_cindex_id].queued) {
+        cindex_info_[other_cindex_id].queued = true;
+        next_queue_.push_back(other_cindex_id);
       }
     }
-    if (output == kNotComputable && usable_count_[cindex_id] != 0) {
+    if (output == kNotComputable && cindex_info_[cindex_id].usable_count != 0) {
       // If we have just changed the computable state from kUnknown to
-      // kNotComputable, then given the way the usable_count_ is defined (see
+      // kNotComputable, then given the way the usable_count is defined (see
       // the declaration), this means that we must decrement the
       // usable_count_ of all cindex_ids that we depend on.
       std::vector<int32>::const_iterator
@@ -862,37 +855,12 @@ void ComputationGraphBuilder::UpdateComputableInfo(int32 cindex_id) {
   }
 }
 
-void ComputationGraphBuilder::SetAsWillNotCompute(int32 cindex_id) {
-  KALDI_ASSERT(usable_count_[cindex_id] == 0);
-  computable_info_[cindex_id] = kWillNotCompute;
-  std::vector<int32>::const_iterator iter = depend_on_this_[cindex_id].begin(),
-      end = depend_on_this_[cindex_id].end();
-  for (; iter != end; ++iter) {
-    int32 other_cindex_id = *iter;
-    if (computable_info_[other_cindex_id] == kUnknown &&
-        !computable_queued_[other_cindex_id]) {
-      computable_queue_.push_back(other_cindex_id);
-      computable_queued_[other_cindex_id] = true;
-    }
-  }
-}
-
-
-void ComputationGraphBuilder::UpdateAllComputableInfo() {
-  while (!computable_queue_.empty()) {
-    int32 cindex_id = computable_queue_.front();
-    computable_queue_.pop_front();
-    computable_queued_[cindex_id] = false;
-    UpdateComputableInfo(cindex_id);
-  }
-}
-
 
 void ComputationGraphBuilder::IncrementUsableCount(int32 cindex_id) {
-  KALDI_PARANOID_ASSERT(static_cast<size_t>(cindex_id)<usable_count_.size());
-  // the next line post-increments the reachable count.
-  if (usable_count_[cindex_id]++ == 0 &&
-      computable_info_[cindex_id] != kNotComputable) {
+  KALDI_PARANOID_ASSERT(static_cast<size_t>(cindex_id)<cindex_info_.size());
+  CindexInfo &info = cindex_info_[cindex_id];
+  if (info.usable_count++ == 0 &&  // post-increment: tests the old count.
+      info.computable != kNotComputable) {
     std::vector<int32>::const_iterator
         iter = graph_->dependencies[cindex_id].begin(),
         end = graph_->dependencies[cindex_id].end();
@@ -900,15 +868,22 @@ void ComputationGraphBuilder::IncrementUsableCount(int32 cindex_id) {
       int32 dep_cindex_id = *iter;
       IncrementUsableCount(dep_cindex_id);
     }
+    if (info.computable == kUnknown &&
+        !info.queued) {
+      // It's become usable, so make sure it's queued to process whether it is
+      // computable or not.
+      info.queued = true;
+      next_queue_.push_back(cindex_id);
+    }
   }
 }
 
 
 void ComputationGraphBuilder::DecrementUsableCount(int32 cindex_id) {
-  KALDI_PARANOID_ASSERT(static_cast<size_t>(cindex_id)<usable_count_.size());
-  KALDI_PARANOID_ASSERT(usable_count_[cindex_id] > 0);
-  if (--usable_count_[cindex_id] == 0 &&
-      computable_info_[cindex_id] != kNotComputable) {
+  KALDI_PARANOID_ASSERT(static_cast<size_t>(cindex_id)<cindex_info_.size());
+  KALDI_PARANOID_ASSERT(cindex_info_[cindex_id].usable_count > 0);
+  if (--cindex_info_[cindex_id].usable_count == 0 &&  // just became unusable.
+      cindex_info_[cindex_id].computable != kNotComputable) {
     std::vector<int32>::const_iterator
         iter = graph_->dependencies[cindex_id].begin(),
         end = graph_->dependencies[cindex_id].end();
@@ -924,11 +899,19 @@ void ComputationGraphBuilder::BuildGraphOneIter() {
   while (!current_queue_.empty()) {
     int32 cindex_id = current_queue_.back();
     current_queue_.pop_back();
-    KALDI_ASSERT(computable_info_[cindex_id] == kUnknown);
-    if (usable_count_[cindex_id] == 0)
-      SetAsWillNotCompute(cindex_id);
-    else
+    cindex_info_[cindex_id].queued = false;
+    if (!cindex_info_[cindex_id].dependencies_computed &&
+        cindex_info_[cindex_id].usable_count != 0) {
+      cindex_info_[cindex_id].dependencies_computed = true;
       AddDependencies(cindex_id);
+      // Add to the queue so we can check whether it's computable.
+      if (!cindex_info_[cindex_id].queued) {
+        cindex_info_[cindex_id].queued = true;
+        next_queue_.push_back(cindex_id);
+      }
+    } else if (cindex_info_[cindex_id].computable == kUnknown) {
+      UpdateComputableInfo(cindex_id);
+    }
   }
   current_queue_.swap(next_queue_);  // now next_queue_ will be empty.
   current_distance_++;
@@ -940,7 +923,7 @@ void ComputationGraphBuilder::ComputeRequiredArray(
 
   int32 num_cindex_ids = graph_->cindexes.size();
   KALDI_ASSERT(num_cindex_ids >= start_cindex_id);
-  KALDI_ASSERT(computable_info_.size() == num_cindex_ids);
+  KALDI_ASSERT(cindex_info_.size() == num_cindex_ids);
   required->clear();
   required->resize(num_cindex_ids - start_cindex_id, false);
 
@@ -976,7 +959,7 @@ void ComputationGraphBuilder::ComputeRequiredArray(
   // usable_count_ == 0; this would indicate a bug somewhere.
   for (int32 c = start_cindex_id; c < num_cindex_ids; c++)
     KALDI_ASSERT(!((*required)[c - start_cindex_id] &&
-                   (usable_count_[c] == 0)));
+                   (cindex_info_[c].usable_count == 0)));
 
 }
 
@@ -1483,12 +1466,12 @@ void ComputeComputationPhases(
 }
 
 CindexSet::CindexSet(const ComputationGraph &graph):
-    graph_(graph), is_computable_(NULL) { }
+    graph_(graph), info_(NULL) { }
 
 CindexSet::CindexSet(const ComputationGraph &graph,
-                     const std::vector<char> &is_computable,
+                     const std::vector<ComputationGraphBuilder::CindexInfo> &info,
                      bool treat_unknown_as_computable):
-    graph_(graph), is_computable_(&is_computable),
+    graph_(graph), info_(&info),
     treat_unknown_as_computable_(treat_unknown_as_computable) { }
 
 
@@ -1497,26 +1480,23 @@ bool CindexSet::operator () (const Cindex &cindex) const {
   if (cindex_id == -1) {
     return false;
   } else {
-    if (is_computable_ == NULL) {
+    if (info_ == NULL) {
       return true;
     } else {
       ComputationGraphBuilder::ComputableInfo
-          c = static_cast<ComputationGraphBuilder::ComputableInfo>(
-              ((*is_computable_)[cindex_id]));
-      if (treat_unknown_as_computable_)
-        return (c == ComputationGraphBuilder::kComputable ||
-                c == ComputationGraphBuilder::kUnknown);
-      else
-        return (c == ComputationGraphBuilder::kComputable);
+          c = (*info_)[cindex_id].computable;
+      return (c == ComputationGraphBuilder::kComputable ||
+              (c == ComputationGraphBuilder::kUnknown &&
+               treat_unknown_as_computable_));
     }
   }
 }
 
 IndexSet::IndexSet(const ComputationGraph &graph,
-                   const std::vector<char> &is_computable,
+                   const std::vector<ComputationGraphBuilder::CindexInfo> &info,
                    int32 node_id,
                    bool treat_unknown_as_computable):
-    graph_(graph), is_computable_(is_computable), node_id_(node_id),
+    graph_(graph), info_(info), node_id_(node_id),
     treat_unknown_as_computable_(treat_unknown_as_computable) { }
 
 bool IndexSet::operator () (const Index &index) const {
@@ -1524,9 +1504,7 @@ bool IndexSet::operator () (const Index &index) const {
   if (cindex_id == -1) {
     return false;
   } else {
-    ComputationGraphBuilder::ComputableInfo
-        c = static_cast<ComputationGraphBuilder::ComputableInfo>(
-            is_computable_[cindex_id]);
+    ComputationGraphBuilder::ComputableInfo c = info_[cindex_id].computable;
     if (treat_unknown_as_computable_)
       return (c == ComputationGraphBuilder::kComputable ||
               c == ComputationGraphBuilder::kUnknown);
diff --git a/src/nnet3/nnet-computation-graph.h b/src/nnet3/nnet-computation-graph.h
index c0662756502..c310444545a 100644
--- a/src/nnet3/nnet-computation-graph.h
+++ b/src/nnet3/nnet-computation-graph.h
@@ -58,7 +58,7 @@ struct ComputationGraph {
   /// particular cindex_id directly depends on to compute it.  No repeats will
   /// be present.  Note, some of these dependencies may be optional
   /// dependencies; in early stages of compilation this will contain all
-  /// "desired" inputs and later we will prune the dependencies contain just
+  /// "desired" inputs and later we will prune the dependencies to contain just
   /// those that are used (which will vary depending on availability).
   std::vector<std::vector<int32> > dependencies;
 
@@ -157,7 +157,7 @@ class ComputationGraphBuilder {
   // dependencies of a particular cindex_id we realize that we won't be able to
   // use this cindex_id (i.e. it may be computable but it's not used) because
   // its usable_count is zero, and in those cases we change the status to
-  // kWillNotCompute even though the cindex-id may be computable- for most
+  // kWillNotCompute even though the cindex-id may be computable.  For most
   // purposes this status is treated the same as kNotComputable.
   enum ComputableInfo {
     kUnknown = 0,
@@ -165,6 +165,30 @@ class ComputationGraphBuilder {
     kNotComputable = 2,
     kWillNotCompute = 3
   };
+
+  struct CindexInfo {
+    ComputableInfo computable;  // kUnknown, kComputable, kNotComputable
+    int32 usable_count;   // The usable_count of a cindex_id i is 1 if i is a
+    // requested output; otherwise it is the number of other cindex_ids j such
+    // that cindex_info_[j].computable is not kNotComputable AND
+    // cindex_info_[j].usable_count > 0 AND i is in graph->dependencies[j].  A
+    // cindex_id is termed "usable" (meaning it could potentially participate in
+    // the computation of the output) if its usable_count is > 0.  This quantity
+    // is designed to be easy to keep updated as we add cindex_ids.
+
+    // True if in current_queue_ or next_queue_.
+    bool queued;
+
+    // True if we have created the cindexes that this cindex depends on.
+    bool dependencies_computed;
+
+    CindexInfo(const CindexInfo &other) = default;
+    CindexInfo(): computable(kUnknown),
+                  usable_count(0),
+                  queued(false),
+                  dependencies_computed(false) { }
+  };
+
  private:
   // This function, called from ExplainWhyNotComputable(), prints to "os"
   // a human-readable form of a given cindex_id, that looks like
@@ -189,12 +213,9 @@ class ComputationGraphBuilder {
   // the output.
   void BuildGraphOneIter();
 
-  // make sure the "computable_info_" array is up to date.
-  void UpdateAllComputableInfo();
-
-  // (called from UpdateAllComputableInfo); make sure the computable_info for
-  // cindex_id is up to date.  As a side effect this may also update the
-  // usable_count_ array.
+  // (called from BuildGraphOneIter()); make sure the computable_info for
+  // cindex_id is up to date.  Has side effects: may update usable_count
+  // values and add things to next_queue_.
   void UpdateComputableInfo(int32 cindex_id);
 
   // (called from BuildGraphOneIter()), this function sets the cindex_id to
@@ -207,11 +228,8 @@ class ComputationGraphBuilder {
   ComputableInfo ComputeComputableInfo(int32 cindex_id) const;
 
   // To be called when this cindex_id has just been newly added to graph_, this
-  // function adds various initial variables associated with it, to *this.
-  // is_input should be set to true if this cindex-id is being added as an input
-  // (from request_.inputs), and is_output should be set to true if this
-  // cindex-id is being added as an output (from request_.outputs).
-  inline void AddCindexId(int32 cindex_id, bool is_input, bool is_output);
+  // function adds a couple of default variables associated with it, to *this.
+  inline void AddCindexId(int32 cindex_id);
 
   // Add cindex_ids that this cindex_id depends on.
   void AddDependencies(int32 cindex_id);
@@ -256,36 +274,18 @@ class ComputationGraphBuilder {
   // for each cindex_id, which other cindex_ids depend on it.
   std::vector<std::vector<int32> > depend_on_this_;
 
-  // this vector, indexed by cindex_id, contains our information about whether
-  // each cindex_id is computable; it's ComputableInfo, cast to char.
-  std::vector<char> computable_info_;
-
-  // this is a queue of cindex_ids that we need to re-compute whether they are
-  // computable or not (because either they are new and haven't had dependencies
-  // added, or their dependencies' computable status has changed since we last
-  // computed their computable_ value).
-  std::deque<int32> computable_queue_;
-  // this vector tells us whether a cindex_id is in computable_queued_; it
-  // stops us from adding things twice.
-  std::vector<bool> computable_queued_;
-
-  // usable_count_[i] for a cindex_id i is defined as 1 if i is a requested
-  // output, and otherwise as the number of other cindex_ids j such that
-  // computable_info_[j] is not kNotComputable AND usable_count_[j] > 0 AND i is
-  // a member of graph->dependencies[j].  A cindex_id is termed "usable"
-  // (meaning it could potentially participate in the computation of the output)
-  // if its usable_count_ is > 0.  This quantity is designed to be easy to keep
-  // updated as we add cindex_ids.
-  std::vector<int32> usable_count_;
+
+  // this vector is indexed by cindex_id; it holds the per-cindex CindexInfo.
+  std::vector<CindexInfo> cindex_info_;
 
   // current_distance_ >= 0 is the distance to the output, of the cindex_ids in
   // current_queue_.
   int32 current_distance_;
-  // the cindex_ids in current_queue_ are at distance "current_distance" to the
-  // output and have not yet had their dependencies processed.
+  // the cindex_ids in current_queue_ are at no more than distance
+  // "current_distance" to the output
   std::vector<int32> current_queue_;
-  // the cindex_ids in next_queue_ are at distance current_distance + 1 to the
-  // output and have not yet had their dependencies processed.
+  // the cindex_ids in next_queue_ are at no more than distance current_distance
+  // + 1 to the output
   std::vector<int32> next_queue_;
 };
 
@@ -305,14 +305,15 @@ class CindexSet {
 
   /// with this constructor, represents the set of all Cindexes that exist in
   /// the graph and which are computable.  If treat_unknown_as_computable is
-  /// true then we consider kComputable and kUnknown to be computable, else we
-  /// consider just nodes that are kComputable to be computable.
+  /// true then we consider both kComputable and kUnknown to be computable;
+  /// else we consider just nodes that are kComputable to be computable.
+  /// Only the `computable` member of each element of `info` is inspected.
   CindexSet(const ComputationGraph &graph,
-            const std::vector<char> &is_computable,
+            const std::vector<ComputationGraphBuilder::CindexInfo> &info,
             bool treat_unknown_as_computable);
  private:
   const ComputationGraph &graph_;
-  const std::vector<char> *is_computable_;
+  const std::vector<ComputationGraphBuilder::CindexInfo> *info_;
   bool treat_unknown_as_computable_;
 };
 
@@ -327,14 +328,14 @@ class IndexSet {
   /// (node_id, x) which is computable exists in this graph.  If
   /// treat_unknown_as_computable is true then we consider kComputable and kUnknown
   /// to be computable, else we consider just nodes that are kComputable to be
-  /// computable.
+  /// computable.  The `info` input is only needed for its `computable` member.
   IndexSet(const ComputationGraph &graph,
-           const std::vector<char> &computable_info,
+           const std::vector<ComputationGraphBuilder::CindexInfo> &info,
            int32 node_id,
            bool treat_unknown_as_computable);
  private:
   const ComputationGraph &graph_;
-  const std::vector<char> &is_computable_;
+  const std::vector<ComputationGraphBuilder::CindexInfo> &info_;
   int32 node_id_;
   bool treat_unknown_as_computable_;
 };
diff --git a/tools/Makefile b/tools/Makefile
index e690df3da88..73fb43e84f2 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -9,7 +9,6 @@ CC ?= gcc        # used for sph2pipe
 # e.g. g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3.
 OPENFST_VERSION ?= 1.6.7
 CUB_VERSION ?= 1.8.0
-OPENBLAS_VERSION ?= 0.3.5
 
 # Default features configured for OpenFST; can be overridden in the make command line.
 OPENFST_CONFIGURE ?= --enable-static --enable-shared --enable-far --enable-ngram-fsts
@@ -138,13 +137,7 @@ cub:
 	rm -f cub
 	ln -s cub-$(CUB_VERSION) cub
 
-# OpenBLAS is not compiled by default. Run 'make -j openblas' in this directory to build.
+# OpenBLAS is not compiled by default. Run 'make openblas' in this directory to build.
 .PHONY: openblas
 openblas:
-	@-rm -rf OpenBLAS xianyi-OpenBLAS-*
-	wget -t3 -nv -O- $$( \
-            wget -qO- 'https://api.github.com/repos/xianyi/OpenBLAS/releases/tags/v$(OPENBLAS_VERSION)' | \
-            python -c 'import sys,json;print(json.load(sys.stdin)["tarball_url"])') | \
-	  tar xzf -
-	mv xianyi-OpenBLAS-* OpenBLAS
-	$(MAKE) PREFIX=$$(pwd)/OpenBLAS/install USE_THREAD=0 -C OpenBLAS all install
+	extras/install_openblas.sh
diff --git a/tools/extras/install_openblas.sh b/tools/extras/install_openblas.sh
index 90afe8e9de4..224a9eade69 100755
--- a/tools/extras/install_openblas.sh
+++ b/tools/extras/install_openblas.sh
@@ -1,5 +1,25 @@
 #!/bin/bash
 
-# OpenBLAS is downloaded and built by tools/Makefile, but not automatically by
-# its default 'all' target.
-make -j openblas
+
+OPENBLAS_VERSION=0.3.5
+
+set -e
+
+if ! command -v gfortran >/dev/null 2>&1; then
+  echo "$0: gfortran is not installed.  Please install it, e.g. by:"
+  echo " apt-get install gfortran"
+  echo "(if on Debian or Ubuntu), or:"
+  echo " yum install gcc-gfortran"
+  echo "(if on RedHat/CentOS).  On a Mac, if brew is installed, it's:"
+  echo " brew install gfortran"
+  exit 1
+fi
+
+
+rm -rf xianyi-OpenBLAS-* OpenBLAS
+
+wget -t3 -nv -O- $(wget -qO- "https://api.github.com/repos/xianyi/OpenBLAS/releases/tags/v${OPENBLAS_VERSION}" | python -c 'import sys,json;print(json.load(sys.stdin)["tarball_url"])') | tar xzf -
+
+mv xianyi-OpenBLAS-* OpenBLAS
+
+make PREFIX=$(pwd)/OpenBLAS/install USE_THREAD=0 -C OpenBLAS all install

From d844498375fa08d3427f871b03aa70b1917a2a0b Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Mon, 30 Sep 2019 21:30:20 +0300
Subject: [PATCH 159/163] Properly merge online2bin dir from master (previously
 accidentally lost those changes)

---
 src/online2bin/apply-cmvn-online.cc           |  2 -
 .../online2-tcp-nnet3-decode-faster.cc        | 56 +++++++++++++++++--
 .../online2-wav-nnet3-latgen-faster.cc        | 14 ++++-
 .../online2-wav-nnet3-latgen-grammar.cc       | 12 +++-
 4 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/src/online2bin/apply-cmvn-online.cc b/src/online2bin/apply-cmvn-online.cc
index 2745df5d4e6..06157d0fcdf 100644
--- a/src/online2bin/apply-cmvn-online.cc
+++ b/src/online2bin/apply-cmvn-online.cc
@@ -127,8 +127,6 @@ int main(int argc, char *argv[]) {
         num_done++;
         tot_t += feats.NumRows();
         feature_writer.Write(utt, normalized_feats);
-        
-        num_done++;
       }
     }
     
diff --git a/src/online2bin/online2-tcp-nnet3-decode-faster.cc b/src/online2bin/online2-tcp-nnet3-decode-faster.cc
index 7e99bc9a840..44d8a8d818d 100644
--- a/src/online2bin/online2-tcp-nnet3-decode-faster.cc
+++ b/src/online2bin/online2-tcp-nnet3-decode-faster.cc
@@ -85,6 +85,20 @@ std::string LatticeToString(const Lattice &lat, const fst::SymbolTable &word_sym
   return msg.str();
 }
 
+std::string GetTimeString(int32 t_beg, int32 t_end, BaseFloat time_unit) {
+  char buffer[100];
+  double t_beg2 = t_beg * time_unit;
+  double t_end2 = t_end * time_unit;
+  snprintf(buffer, 100, "%.2f %.2f", t_beg2, t_end2);
+  return std::string(buffer);
+}
+
+int32 GetLatticeTimeSpan(const Lattice& lat) {
+  std::vector<int32> times;
+  LatticeStateTimes(lat, &times);
+  return times.back();
+}
+
 std::string LatticeToString(const CompactLattice &clat, const fst::SymbolTable &word_syms) {
   if (clat.NumStates() == 0) {
     KALDI_WARN << "Empty lattice.";
@@ -132,6 +146,7 @@ int main(int argc, char *argv[]) {
     BaseFloat samp_freq = 16000.0;
     int port_num = 5050;
     int read_timeout = 3;
+    bool produce_time = false;
 
     po.Register("samp-freq", &samp_freq,
                 "Sampling frequency of the input signal (coded as 16-bit slinear).");
@@ -145,6 +160,8 @@ int main(int argc, char *argv[]) {
                 "Number of seconds of timout for TCP audio data to appear on the stream. Use -1 for blocking.");
     po.Register("port-num", &port_num,
                 "Port number the server will listen on.");
+    po.Register("produce-time", &produce_time,
+                "Prepend begin/end times between endpoints (e.g. '5.46 6.81 <text_output>', in seconds)");
 
     feature_opts.Register(&po);
     decodable_opts.Register(&po);
@@ -164,6 +181,9 @@ int main(int argc, char *argv[]) {
 
     OnlineNnet2FeaturePipelineInfo feature_info(feature_opts);
 
+    BaseFloat frame_shift = feature_info.FrameShiftInSeconds();
+    int32 frame_subsampling = decodable_opts.frame_subsampling_factor;
+
     KALDI_VLOG(1) << "Loading AM...";
 
     Transitions trans_model;
@@ -239,6 +259,15 @@ int main(int argc, char *argv[]) {
               CompactLattice lat;
               decoder.GetLattice(true, &lat);
               std::string msg = LatticeToString(lat, *word_syms);
+
+              // get time-span from previous endpoint to end of audio,
+              if (produce_time) {
+                int32 t_beg = frame_offset - decoder.NumFramesDecoded();
+                int32 t_end = frame_offset;
+                msg = GetTimeString(t_beg, t_end, frame_shift * frame_subsampling) + " " + msg;
+              }
+
+              KALDI_VLOG(1) << "EndOfAudio, sending message: " << msg;
               server.WriteLn(msg);
             } else
               server.Write("\n");
@@ -254,9 +283,9 @@ int main(int argc, char *argv[]) {
               feature_pipeline.IvectorFeature() != NULL) {
             silence_weighting.ComputeCurrentTraceback(decoder.Decoder());
             silence_weighting.GetDeltaWeights(feature_pipeline.NumFramesReady(),
+                                              frame_offset * decodable_opts.frame_subsampling_factor,
                                               &delta_weights);
-            feature_pipeline.UpdateFrameWeights(delta_weights,
-                                                frame_offset * decodable_opts.frame_subsampling_factor);
+            feature_pipeline.UpdateFrameWeights(delta_weights);
           }
 
           decoder.AdvanceDecoding();
@@ -265,7 +294,17 @@ int main(int argc, char *argv[]) {
             if (decoder.NumFramesDecoded() > 0) {
               Lattice lat;
               decoder.GetBestPath(false, &lat);
+              TopSort(&lat); // for LatticeStateTimes(),
               std::string msg = LatticeToString(lat, *word_syms);
+
+              // get time-span after previous endpoint,
+              if (produce_time) {
+                int32 t_beg = frame_offset;
+                int32 t_end = frame_offset + GetLatticeTimeSpan(lat);
+                msg = GetTimeString(t_beg, t_end, frame_shift * frame_subsampling) + " " + msg;
+              }
+
+              KALDI_VLOG(1) << "Temporary transcript: " << msg;
               server.WriteLn(msg, "\r");
             }
             check_count += check_period;
@@ -277,8 +316,17 @@ int main(int argc, char *argv[]) {
             CompactLattice lat;
             decoder.GetLattice(true, &lat);
             std::string msg = LatticeToString(lat, *word_syms);
+
+            // get time-span between endpoints,
+            if (produce_time) {
+              int32 t_beg = frame_offset - decoder.NumFramesDecoded();
+              int32 t_end = frame_offset;
+              msg = GetTimeString(t_beg, t_end, frame_shift * frame_subsampling) + " " + msg;
+            }
+
+            KALDI_VLOG(1) << "Endpoint, sending message: " << msg;
             server.WriteLn(msg);
-            break;
+            break; // while (true)
           }
         }
       }
@@ -383,7 +431,7 @@ bool TcpServer::ReadChunk(size_t len) {
       KALDI_WARN << "Socket timeout! Disconnecting...";
       break;
     }
-    if (client_set_[0].revents != POLLIN) {
+    if (poll_ret < 0) {
       KALDI_WARN << "Socket error! Disconnecting...";
       break;
     }
diff --git a/src/online2bin/online2-wav-nnet3-latgen-faster.cc b/src/online2bin/online2-wav-nnet3-latgen-faster.cc
index 571bd988e0b..cf6f8923ba9 100644
--- a/src/online2bin/online2-wav-nnet3-latgen-faster.cc
+++ b/src/online2bin/online2-wav-nnet3-latgen-faster.cc
@@ -147,7 +147,6 @@ int main(int argc, char *argv[]) {
         clat_wspecifier = po.GetArg(5);
 
     OnlineNnet2FeaturePipelineInfo feature_info(feature_opts);
-
     if (!online) {
       feature_info.ivector_extractor_info.use_most_recent_ivector = true;
       feature_info.ivector_extractor_info.greedy_ivector_extractor = true;
@@ -155,6 +154,14 @@ int main(int argc, char *argv[]) {
     }
 
     Transitions trans_model;
+
+    // Global CMVN stats (left empty if no rxfilename was configured); used
+    // below to initialize the per-speaker OnlineCmvnState.
+    Matrix<double> global_cmvn_stats;
+    if (feature_info.global_cmvn_stats_rxfilename != "")
+      ReadKaldiObject(feature_info.global_cmvn_stats_rxfilename,
+                      &global_cmvn_stats);
+
     nnet3::AmNnetSimple am_nnet;
     {
       bool binary;
@@ -194,8 +201,11 @@ int main(int argc, char *argv[]) {
     for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
       std::string spk = spk2utt_reader.Key();
       const std::vector<std::string> &uttlist = spk2utt_reader.Value();
+
       OnlineIvectorExtractorAdaptationState adaptation_state(
           feature_info.ivector_extractor_info);
+      OnlineCmvnState cmvn_state(global_cmvn_stats);
+
       for (size_t i = 0; i < uttlist.size(); i++) {
         std::string utt = uttlist[i];
         if (!wav_reader.HasKey(utt)) {
@@ -210,6 +220,7 @@ int main(int argc, char *argv[]) {
 
         OnlineNnet2FeaturePipeline feature_pipeline(feature_info);
         feature_pipeline.SetAdaptationState(adaptation_state);
+        feature_pipeline.SetCmvnState(cmvn_state);
 
         OnlineSilenceWeighting silence_weighting(
             trans_model,
@@ -276,6 +287,7 @@ int main(int argc, char *argv[]) {
         // In an application you might avoid updating the adaptation state if
         // you felt the utterance had low confidence.  See lat/confidence.h
         feature_pipeline.GetAdaptationState(&adaptation_state);
+        feature_pipeline.GetCmvnState(&cmvn_state);
 
         // we want to output the lattice with un-scaled acoustics.
         BaseFloat inv_acoustic_scale =
diff --git a/src/online2bin/online2-wav-nnet3-latgen-grammar.cc b/src/online2bin/online2-wav-nnet3-latgen-grammar.cc
index d68e36ceef0..2085f025471 100644
--- a/src/online2bin/online2-wav-nnet3-latgen-grammar.cc
+++ b/src/online2bin/online2-wav-nnet3-latgen-grammar.cc
@@ -150,13 +150,18 @@ int main(int argc, char *argv[]) {
         clat_wspecifier = po.GetArg(5);
 
     OnlineNnet2FeaturePipelineInfo feature_info(feature_opts);
-
     if (!online) {
       feature_info.ivector_extractor_info.use_most_recent_ivector = true;
       feature_info.ivector_extractor_info.greedy_ivector_extractor = true;
       chunk_length_secs = -1.0;
     }
 
+
+    Matrix<double> global_cmvn_stats;
+    if (feature_info.global_cmvn_stats_rxfilename != "")
+      ReadKaldiObject(feature_info.global_cmvn_stats_rxfilename,
+                      &global_cmvn_stats);
+
     Transitions trans_model;
     nnet3::AmNnetSimple am_nnet;
     {
@@ -198,8 +203,11 @@ int main(int argc, char *argv[]) {
     for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
       std::string spk = spk2utt_reader.Key();
       const std::vector<std::string> &uttlist = spk2utt_reader.Value();
+
       OnlineIvectorExtractorAdaptationState adaptation_state(
           feature_info.ivector_extractor_info);
+      OnlineCmvnState cmvn_state(global_cmvn_stats);
+
       for (size_t i = 0; i < uttlist.size(); i++) {
         std::string utt = uttlist[i];
         if (!wav_reader.HasKey(utt)) {
@@ -214,6 +222,7 @@ int main(int argc, char *argv[]) {
 
         OnlineNnet2FeaturePipeline feature_pipeline(feature_info);
         feature_pipeline.SetAdaptationState(adaptation_state);
+        feature_pipeline.SetCmvnState(cmvn_state);
 
         OnlineSilenceWeighting silence_weighting(
             trans_model,
@@ -281,6 +290,7 @@ int main(int argc, char *argv[]) {
         // In an application you might avoid updating the adaptation state if
         // you felt the utterance had low confidence.  See lat/confidence.h
         feature_pipeline.GetAdaptationState(&adaptation_state);
+        feature_pipeline.GetCmvnState(&cmvn_state);
 
         // we want to output the lattice with un-scaled acoustics.
         BaseFloat inv_acoustic_scale =

From 85cbf754a00d3ce891c873c46e6e96201982c301 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Tue, 1 Oct 2019 15:42:23 +0300
Subject: [PATCH 160/163] Add link kaldi->src, will eventually move the dir to
 be named 'kaldi'

---
 kaldi | 1 +
 1 file changed, 1 insertion(+)
 create mode 120000 kaldi

diff --git a/kaldi b/kaldi
new file mode 120000
index 00000000000..e8310385c56
--- /dev/null
+++ b/kaldi
@@ -0,0 +1 @@
+src
\ No newline at end of file

From 312e68701b591d074cef5ba5e458ebc1526cdb80 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 2 Oct 2019 11:14:11 +0300
Subject: [PATCH 161/163] Various kaldi10 fixes after merge

---
 src/bin/post-to-phone-post.cc        |  6 +++---
 src/configure                        |  9 ++++++---
 src/cudafeat/feature-spectral-cuda.h |  9 ++++-----
 src/feat/mel-computations.h          | 29 ----------------------------
 src/tensor/tensor-common.h           | 11 +++++++----
 5 files changed, 20 insertions(+), 44 deletions(-)

diff --git a/src/bin/post-to-phone-post.cc b/src/bin/post-to-phone-post.cc
index b09e8ab0537..cf97c631243 100644
--- a/src/bin/post-to-phone-post.cc
+++ b/src/bin/post-to-phone-post.cc
@@ -98,11 +98,11 @@ int main(int argc, char *argv[]) {
 
       for (int32 i = 1; i <= num_tids; i++) {
         BaseFloat count = transition_counts(i);
-        int32 phone = trans_model.TransitionIdToPhone(i),
-            pdf_id = trans_model.TransitionIdToPdf(i);
+        const Transitions::TransitionIdInfo
+            &info = trans_model.InfoForTransitionId(i);
         // Relying on C++11 value-initialization thingies that should make the
         // map's elements default to zero.
-        pdf_to_phones[pdf_id][phone] += count;
+        pdf_to_phones[info.pdf_id][info.phone] += count;
       }
 
       for (int32 i = 0; i < num_pdfs; i++) {
diff --git a/src/configure b/src/configure
index e6ffdf337af..c727948962e 100755
--- a/src/configure
+++ b/src/configure
@@ -502,8 +502,8 @@ function configure_cuda {
     echo CUDA = true >> kaldi.mk
     echo CUDATKDIR = $CUDATKDIR >> kaldi.mk
     echo "CUDA_ARCH = $CUDA_ARCH" >> kaldi.mk
-    
-    
+
+
     echo >> kaldi.mk
 
     # 64bit/32bit? We do not support cross compilation with CUDA so, use direct
@@ -524,7 +524,7 @@ WARNING: CUDA will not be used!
          CUDA is not supported with 32-bit builds."
       exit 1;
     fi
-    
+
     #add cusolver flags for newer toolkits
     if [ "$CUSOLVER" == "true" ]; then
       echo "CUDA_LDLIBS += -lcusolver" >> kaldi.mk
@@ -1346,6 +1346,9 @@ if [ -n "$ENV_CXXFLAGS" ]; then echo "CXXFLAGS += $ENV_CXXFLAGS" >> kaldi.mk; fi
 if [ -n "$ENV_LDFLAGS" ]; then echo "LDFLAGS += $ENV_LDFLAGS" >> kaldi.mk; fi
 if [ -n "$ENV_LDLIBS" ]; then echo "LDLIBS += $ENV_LDLIBS" >> kaldi.mk; fi
 
+echo "# The following makes it possible to include as kaldi/foo/bar.h" >> kaldi.mk
+echo "CXXFLAGS += -I ../.." >> kaldi.mk
+
 # We check for slow exp implementation just before we exit. This check uses
 # and possibly modifies the kaldi.mk file that we just generated.
 check_for_slow_expf;
diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h
index 8683372098c..ba9f8ebea0f 100644
--- a/src/cudafeat/feature-spectral-cuda.h
+++ b/src/cudafeat/feature-spectral-cuda.h
@@ -22,7 +22,6 @@
 #include <cufft.h>
 #endif
 
-#include "cudafeat/feature-window-cuda.h"
 #include "cudamatrix/cu-matrix.h"
 #include "cudamatrix/cu-vector.h"
 #include "feat/feature-fbank.h"
@@ -38,8 +37,8 @@ struct CudaSpectralFeatureOptions {
   SpectralFeatureType feature_type;
   CudaSpectralFeatureOptions(MfccOptions opts_in)
       : mfcc_opts(opts_in),
-        use_log_fbank(true), 
-	use_power(true), 
+        use_log_fbank(true),
+	use_power(true),
 	use_dct(true),
         feature_type(MFCC) {}
   CudaSpectralFeatureOptions(FbankOptions opts){
@@ -75,13 +74,13 @@ class CudaSpectralFeatures : public MfccComputer {
   ~CudaSpectralFeatures();
   CudaSpectralFeatureOptions cumfcc_opts_;
   int32 Dim()
-  // The dimension of the output is different for MFCC and Fbank. 
+  // The dimension of the output is different for MFCC and Fbank.
   // This returns the appropriate value depending on the feature
   // extraction algorithm
   {
     if (cumfcc_opts_.feature_type == MFCC) return MfccComputer::Dim();
     //If we're running fbank, we need to set the dimension right
-    else return cumfcc_opts_.mfcc_opts.mel_opts.num_bins + 
+    else return cumfcc_opts_.mfcc_opts.mel_opts.num_bins +
 	        (cumfcc_opts_.mfcc_opts.use_energy ? 1 : 0);
   }
 
diff --git a/src/feat/mel-computations.h b/src/feat/mel-computations.h
index 9c1e5d2acb2..6822debc242 100644
--- a/src/feat/mel-computations.h
+++ b/src/feat/mel-computations.h
@@ -133,35 +133,6 @@ class MelBanks {
 };
 
 
-<<<<<<< HEAD
-=======
-// Compute liftering coefficients (scaling on cepstral coeffs)
-// coeffs are numbered slightly differently from HTK: the zeroth
-// index is C0, which is not affected.
-void ComputeLifterCoeffs(BaseFloat Q, VectorBase<BaseFloat> *coeffs);
-
-
-// Durbin's recursion - converts autocorrelation coefficients to the LPC
-// pTmp - temporal place [n]
-// pAC - autocorrelation coefficients [n + 1]
-// pLP - linear prediction coefficients [n] (predicted_sn = sum_1^P{a[i-1] * s[n-i]}})
-//       F(z) = 1 / (1 - A(z)), 1 is not stored in the denominator
-// Returns log energy of residual (I think)
-BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp);
-
-// Compute LP coefficients from autocorrelation coefficients.
-// Returns log energy of residual (I think)
-BaseFloat ComputeLpc(const VectorBase<BaseFloat> &autocorr_in,
-                     Vector<BaseFloat> *lpc_out);
-
-void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst);
-
-
-
-void GetEqualLoudnessVector(const MelBanks &mel_banks,
-                            Vector<BaseFloat> *ans);
->>>>>>> upstream/master
-
 /// @} End of "addtogroup feat"
 }  // namespace kaldi
 
diff --git a/src/tensor/tensor-common.h b/src/tensor/tensor-common.h
index 24252d61585..bd2884a8a72 100644
--- a/src/tensor/tensor-common.h
+++ b/src/tensor/tensor-common.h
@@ -23,6 +23,7 @@
 #include <cstdint>
 #include <vector>
 #include <string>
+#include "kaldi/base/kaldi-error.h"
 
 /**
    This is some notes on plans for kaldi10 tensor stuff, nothing is fully fleshed out.
@@ -59,6 +60,8 @@ struct Device {
 };
 
 
+// TODO: use NumPy data-type enum, for greater compatibility.
+
 enum DataType {
   // We will of course later extend this with many more types, including
   // integer types and half-precision floats.
@@ -82,21 +85,21 @@ enum DataType {
 
 inline int32 SizeOf(DataType dtype) {
   switch(dtype) {
-    case 0: return 4;
+    case kFloatDtype: return 4;
     case 1: return 8;
     case 2: KALDI_ERR << "Invalid data-type " << int32(dtype); return 0;
   }
 }
 
 
-/// Enumeration that says what strides we should choose when allocating
-/// A Tensor.
+/// Enumeration that says what strides we should choose when
+/// allocating a Tensor that is a copy of a provided Tensor.
 enum StridePolicy {
   kKeepStrideOrder,  // Means: keep the size-ordering of the strides from the
                      // source Tensor (but the chosen strides will all be
                      // positive even of some of the source Tensor's strides
                      // were negative).
-  kNormalized    // Means: strides for dimensions that are != 1 are ordered from
+  kNormalized,    // Means: strides for dimensions that are != 1 are ordered from
                  // greatest to smallest as in a "C" array in the public
                  // numbering, or smallest to greatest in the private numbering.
                  // Per our policy, any dimension that is 1 will be given a zero stride.

From 4b8bab02edf52d361550cac5186b3068099b8f07 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 2 Oct 2019 11:24:13 +0300
Subject: [PATCH 162/163] [src] Clarification in comment

---
 src/fstext/fstext-utils.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fstext/fstext-utils.h b/src/fstext/fstext-utils.h
index ca6a48d7fc3..950c0c87c60 100644
--- a/src/fstext/fstext-utils.h
+++ b/src/fstext/fstext-utils.h
@@ -259,8 +259,8 @@ void MakePrecedingInputSymbolsSame(bool start_is_epsilon, MutableFst<Arc> *fst);
 
 
 /// As MakePrecedingInputSymbolsSame, but takes a functor object that maps
-/// labels to (int32) classes.  Caution: it must not map kNoLabel (-1)
-/// to the same value as any real symbol.
+/// labels to (int32) classes.  Caution: it must not map kNoLabel (-1) to the
+/// same value as any real symbol (it should generally map -1 to -1).
 template<class Arc, class F>
 void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst<Arc> *fst, const F &f);
 

From b3222753b3455c11cc4315936ddb4a731fec3952 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 11 Dec 2019 21:19:17 +0800
Subject: [PATCH 163/163] Fix some comments

---
 src/tensor/pattern.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/tensor/pattern.h b/src/tensor/pattern.h
index a38ef16b2da..d1d3cca9411 100644
--- a/src/tensor/pattern.h
+++ b/src/tensor/pattern.h
@@ -136,7 +136,7 @@ namespace tensor {
                      in the relevant storage region; we will assume that it is obvious
                      from the context which storage region.   See also: "Storage region"
 
-    Disjoint Patterns:  When we speak of disjoint Patterns we mean that
+    Disjoint Patterns:  When we speak of Patterns being disjoint we mean that
                     their memory-index-sets are disjoint; see memory-index-set.
 
     Eaxis-index / extended axis-index:
@@ -153,7 +153,7 @@ namespace tensor {
                       `dims = [5 1]`, we can index that Tensor with an index-tuple
                       that:
                        (1) may have nonzero index values in any axis for which
-                          with dim=1, so `index_tuple = [4 100]` would be a valid
+                          dim=1, so `index_tuple = [4 100]` would be a valid
                           index for this Tensor in extended indexing.
                        (2) may have more elements than the Tensor's num-axes; the
                          Tensor is implicitly extended with extra axes on the left
@@ -188,11 +188,11 @@ namespace tensor {
                       multiplied by the Pattern's stride for that axis.
 
     Index-tuple-set of a Pattern: The index-tuple-set I(p) of a Pattern p is the
-                      set of valid index-tuples assuming we are not allowing extended
+                      set of valid index-tuples minus those that require extended
                       indexing.  For example, for a Tensor with `dims = [2]`, the
-                      set of valid index-tuples would be `{ (0), (1) }`; for
-                      a Tensor with `dims = [2 2]` the set of valid index-tuples
-                      is `{ (0,0), (0,1), (1,0), (1,1) }`.
+                      index-tuple-set is `{ (0), (1) }`; for
+                      a Tensor with `dims = [1 2]` the index-tuple-set
+                      is `{ (0,0), (0,1) }`.
 
     Index-tuple-set of a Pattern-tuple:  The index-tuple-set I(P, Q) of a Pattern-tuple
                       (P, Q) is the index-tuple-set that you would obtain for a